diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 0be1d5df616..8f3e24242eb 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -714,10 +714,13 @@ Status ExecutorImpl::Initialize(const Graph& graph) {
         used_outputs[e->src_output()] = true;
       }
     }
+    int i = 0;
     for (bool used_output : used_outputs) {
       if (!used_output) {
         metrics::RecordUnusedOutput(n->type_string());
+        item->kernel->set_output_required(i, false);
       }
+      ++i;
     }
   }
 
@@ -2093,9 +2096,9 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
   for (int i = 0; i < item.num_outputs; ++i) {
     const TensorValue val = ctx->release_output(i);
     if (val.tensor == nullptr) {
-      // Unless it's a Switch or a Recv, the node must produce a
-      // tensor value at i-th output.
-      if (!item.is_recv_or_switch) {
+      // Unless it's a Switch or a Recv, or the executor has marked the output
+      // as not required, the node must produce a tensor value at i-th output.
+      if (!(item.is_recv_or_switch || !item.kernel->output_required(i))) {
         s.Update(errors::Internal("Missing ", i, "-th output from ",
                                   FormatNodeDefForError(item.kernel->def())));
       }
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 38c56eb3b1c..e6fbdd1c113 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -105,7 +105,8 @@ OpKernel::OpKernel(OpKernelConstruction* context, bool is_deferred)
       type_string_view_(props_->node_def.op()),
       graph_def_version_(context->graph_def_version()),
       is_deferred_(is_deferred),
-      cost_estimate_(OpKernel::kInitialCostEstimateCycles) {
+      cost_estimate_(OpKernel::kInitialCostEstimateCycles),
+      outputs_required_(context->num_outputs(), true) {
   OP_REQUIRES_OK(context,
                  NameRangesForNode(props_->node_def, *props_->op_def,
                                    &input_name_map_, &output_name_map_));
@@ -133,7 +134,8 @@ OpKernel::OpKernel(OpKernelConstruction* context, NodeDef&& custom_def,
       type_string_view_(props_->node_def.op()),
       graph_def_version_(context->graph_def_version()),
       is_deferred_(is_deferred),
-      cost_estimate_(OpKernel::kInitialCostEstimateCycles) {
+      cost_estimate_(OpKernel::kInitialCostEstimateCycles),
+      outputs_required_(context->num_outputs(), true) {
   OP_REQUIRES_OK(context,
                  NameRangesForNode(props_->node_def, *props_->op_def,
                                    &input_name_map_, &output_name_map_));
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 9e22321b42c..8f339e878b9 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -156,6 +156,18 @@ class OpKernel {
   // Returns a pointer to the tensor stored inside constant ops.
   virtual const Tensor* const_tensor() const { return nullptr; }
 
+  // Returns true if this kernel must produce its ith output.
+  // REQUIRES: 0 <= i < num_outputs().
+  bool output_required(int i) const { return outputs_required_[i]; }
+
+  // Hints whether or not the ith output must be produced when running the
+  // kernel. By default, all outputs are required. The kernel implementation
+  // may ignore the hint.
+  // REQUIRES: 0 <= i < num_outputs().
+  void set_output_required(int i, bool is_required) {
+    outputs_required_[i] = is_required;
+  }
+
   // Updates the dynamic cost estimate, which is used to determine whether this
   // op is expensive. The new cost estimate is a weighted average of the old
   // cost estimate and the latest cost.
@@ -223,6 +235,7 @@ class OpKernel {
   const bool is_deferred_;
   bool expensive_;
   std::atomic_uint_fast64_t cost_estimate_;
+  std::vector<bool> outputs_required_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
 };
@@ -941,10 +954,8 @@ class OpKernelContext {
   // should call allocate_output(index, ...), set_output(index, ...),
   // set_output_ref(index, ...), or set the status to a non-ok value.
   // If it returns false, it may output, but is not required to do so.
-  // TODO(mrry): Convert this to return Status, and implement a string
-  // name version.
   bool output_required(int index) const {
-    return true;  // TODO(josh11b): implement
+    return op_kernel().output_required(index);
   }
 
   // Allocation of tensors during kernel execution inside the Compute
diff --git a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
index a97c5cb47a2..8de93cf9b30 100644
--- a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+++ b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
@@ -78,16 +78,23 @@ class SparseFillEmptyRowsOp : public OpKernel {
     const int64 N = indices_t.shape().dim_size(0);
     const int64 dense_rows = dense_shape(0);
 
-    Tensor* empty_row_indicator_t;
-    OP_REQUIRES_OK(context, context->allocate_output(kEmptyRowIndicatorOutput,
-                                                     TensorShape({dense_rows}),
-                                                     &empty_row_indicator_t));
-    auto empty_row_indicator = empty_row_indicator_t->vec<bool>();
-    Tensor* reverse_index_map_t;
-    OP_REQUIRES_OK(context, context->allocate_output(kReverseIndexMapOutput,
-                                                     TensorShape({N}),
-                                                     &reverse_index_map_t));
-    auto reverse_index_map = reverse_index_map_t->vec<int64>();
+    bool* empty_row_indicator = nullptr;
+    if (context->output_required(kEmptyRowIndicatorOutput)) {
+      Tensor* empty_row_indicator_t = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(kEmptyRowIndicatorOutput,
+                                              TensorShape({dense_rows}),
+                                              &empty_row_indicator_t));
+      empty_row_indicator = empty_row_indicator_t->vec<bool>().data();
+    }
+    int64* reverse_index_map = nullptr;
+    if (context->output_required(kReverseIndexMapOutput)) {
+      Tensor* reverse_index_map_t = nullptr;
+      OP_REQUIRES_OK(context, context->allocate_output(kReverseIndexMapOutput,
+                                                       TensorShape({N}),
+                                                       &reverse_index_map_t));
+      reverse_index_map = reverse_index_map_t->vec<int64>().data();
+    }
 
     int rank = indices_t.shape().dim_size(1);
 
@@ -122,8 +129,11 @@ class SparseFillEmptyRowsOp : public OpKernel {
     bool all_rows_full = true;
     for (int row = 0; row < dense_rows; ++row) {
       // csr_offset here describes the number of elements in this dense row
-      empty_row_indicator(row) = (csr_offset[row] == 0);
-      all_rows_full = all_rows_full & !empty_row_indicator(row);
+      bool row_empty = (csr_offset[row] == 0);
+      if (empty_row_indicator) {
+        empty_row_indicator[row] = row_empty;
+      }
+      all_rows_full = all_rows_full & !row_empty;
       // In filled version, each row has at least one element.
       csr_offset[row] = std::max(csr_offset[row], int64{1});
       // Update csr_offset to represent the number of elements up to and
@@ -140,8 +150,10 @@ class SparseFillEmptyRowsOp : public OpKernel {
     if (all_rows_full) {
       context->set_output(kOutputIndicesOutput, indices_t);
       context->set_output(kOutputValuesOutput, values_t);
-      for (int64 i = 0; i < N; ++i) {
-        reverse_index_map(i) = i;
+      if (reverse_index_map) {
+        for (int64 i = 0; i < N; ++i) {
+          reverse_index_map[i] = i;
+        }
       }
     } else {
       Tensor* output_indices_t;
@@ -169,7 +181,9 @@ class SparseFillEmptyRowsOp : public OpKernel {
         std::copy_n(&indices(i, 0), rank, &output_indices(output_i, 0));
         output_values(output_i) = values(i);
         // We'll need this reverse index map to backprop correctly.
-        reverse_index_map(i) = output_i;
+        if (reverse_index_map) {
+          reverse_index_map[i] = output_i;
+        }
       }
 
       // Fill in values for rows that are missing