From c5ef52c5f0c698b76133eae0aa93d83fa7ab9f79 Mon Sep 17 00:00:00 2001
From: Daniel Nguyen <nguyendaniel@google.com>
Date: Fri, 31 Jul 2020 23:43:44 +0000
Subject: [PATCH 1/3] added draft of function

---
 tensorflow/c/kernels.cc | 26 +++++++++++++++++++++++++-
 tensorflow/c/kernels.h  |  5 +++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 20a6c5117cf..0fa1c83cac2 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -279,4 +279,28 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index,
     return nullptr;
   }
   return tf_tensor;
-}
\ No newline at end of file
+}
+
+void TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context, 
+    int* candidate_input_indices, int num_input_indices, int output_index, 
+    int64_t* output_dims, int output_num_dims, TF_Tensor** output, 
+    int* forwarded_input, TF_Status* status) { 
+  TF_SetStatus(status, TF_OK, ""); 
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);
+  tensorflow::gtl::ArraySlice<int> input_indices_array(candidate_input_indices, 
+                                                       num_input_indices); 
+  tensorflow::gtl::ArraySlice<tensorflow::int64> output_dimarray(
+      reinterpret_cast<tensorflow::int64*>(output_dims), output_num_dims);
+  tensorflow::Tensor output_tensor;  
+  tensorflow::Status s = TF_TensorToTensor(*output, &output_tensor); 
+  if (!s.ok()) { 
+    ::tensorflow::Set_TF_Status_from_Status(status, s);
+    return;
+  }
+  tensorflow::Tensor* output_tensor_pointer = &output_tensor;
+  tensorflow::Status forward_input_status = cc_ctx->
+      forward_input_or_allocate_output(input_indices_array, output_index, 
+      tensorflow::TensorShape(output_dimarray), &output_tensor_pointer, 
+      forwarded_input); 
+    ::tensorflow::Set_TF_Status_from_Status(status, s);
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index c7138a39c73..22424ddc096 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -199,6 +199,11 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context,
                                             int64_t* dims, int num_dims,
                                             size_t len, TF_Status* status);
 
+TF_CAPI_EXPORT void TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context,
+    int* candidate_input_indices, int num_input_indices, int output_index, 
+    int64_t* output_dims, int output_num_dims, TF_Tensor** output, 
+    int* forwarded_input, TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif

From 0a79e7111037c4bb793964708acc27f4e7cc12ee Mon Sep 17 00:00:00 2001
From: Daniel Nguyen <nguyendaniel@google.com>
Date: Mon, 10 Aug 2020 16:53:45 +0000
Subject: [PATCH 2/3] finished implementation and passes tests

---
 tensorflow/c/kernels.cc      | 26 ++++++++-------
 tensorflow/c/kernels.h       | 15 ++++++---
 tensorflow/c/kernels_test.cc | 64 ++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 0fa1c83cac2..86d88943f9a 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -281,26 +281,30 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index,
   return tf_tensor;
 }
 
-void TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context, 
+TF_Tensor* TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context, 
     int* candidate_input_indices, int num_input_indices, int output_index, 
-    int64_t* output_dims, int output_num_dims, TF_Tensor** output, 
-    int* forwarded_input, TF_Status* status) { 
+    int64_t* output_dims, int output_num_dims, int* forwarded_input, 
+    TF_Status* status) { 
   TF_SetStatus(status, TF_OK, ""); 
   auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);
   tensorflow::gtl::ArraySlice<int> input_indices_array(candidate_input_indices, 
                                                        num_input_indices); 
   tensorflow::gtl::ArraySlice<tensorflow::int64> output_dimarray(
       reinterpret_cast<tensorflow::int64*>(output_dims), output_num_dims);
-  tensorflow::Tensor output_tensor;  
-  tensorflow::Status s = TF_TensorToTensor(*output, &output_tensor); 
-  if (!s.ok()) { 
-    ::tensorflow::Set_TF_Status_from_Status(status, s);
-    return;
-  }
-  tensorflow::Tensor* output_tensor_pointer = &output_tensor;
-  tensorflow::Status forward_input_status = cc_ctx->
+  tensorflow::Tensor* output_tensor_pointer;
+  tensorflow::Status s = cc_ctx->
       forward_input_or_allocate_output(input_indices_array, output_index, 
       tensorflow::TensorShape(output_dimarray), &output_tensor_pointer, 
       forwarded_input); 
+  if (!s.ok()) { 
     ::tensorflow::Set_TF_Status_from_Status(status, s);
+    return nullptr;  
+  } 
+  TF_Tensor* tf_tensor_output = TF_TensorFromTensor(
+      *output_tensor_pointer, &s); 
+  if (!s.ok()) { 
+    ::tensorflow::Set_TF_Status_from_Status(status, s);
+    return nullptr;  
+  }
+  return tf_tensor_output;
 }
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index 22424ddc096..f9aae309df8 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -199,10 +199,17 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context,
                                             int64_t* dims, int num_dims,
                                             size_t len, TF_Status* status);
 
-TF_CAPI_EXPORT void TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context,
-    int* candidate_input_indices, int num_input_indices, int output_index, 
-    int64_t* output_dims, int output_num_dims, TF_Tensor** output, 
-    int* forwarded_input, TF_Status* status);
+// Tries to forward one of the inputs given in input_indices to
+// output[output_index]. If none of the given inputs can be forwarded, calls
+// allocate_output() to allocate a new output buffer. The index of the
+// forwarded input will be assign to output argument forwarded_input (if it's
+// not nullptr). If no inputs are forwarded, forwarded_input will be assigned
+// -1.
+
+TF_CAPI_EXPORT TF_Tensor* TF_ForwardInputOrAllocateOutput(
+    TF_OpKernelContext* context, int* candidate_input_indices, 
+    int num_input_indices, int output_index, int64_t* output_dims, 
+    int output_num_dims, int* forwarded_input, TF_Status* status);
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index 3c8ac934428..1ff461e0f03 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -474,4 +474,68 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) {
   EXPECT_EQ("Tensor<type: float shape: [2,3] values: [1 2 3][4 5 6]>",
             output->DebugString(100));
 }
+
+TEST_F(DeviceKernelOpTest, TestForwardInputOrAllocateOutput) {
+  const char* node_name = "TestForwardInputOrAllocateOutputKernel";
+  const char* op_name = "BazOp";
+  const char* device_name = "FakeDeviceName";
+
+  REGISTER_OP(op_name)
+      .Input("input1: float")
+      .Input("input2: float")
+      .Output("output1: float")
+      .Attr("SomeDataTypeAttr: type");;
+
+  // A kernel whose Compute function that forwards one input to output
+  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
+    TF_Status* s = TF_NewStatus();
+    int candidate_input_indices[1] = {0}; 
+    int forwarded_input; 
+    int64_t output_dims[1] = {};
+    TF_Tensor* output = TF_ForwardInputOrAllocateOutput(ctx, 
+        candidate_input_indices, 1, 0, output_dims, 0, &forwarded_input, s); 
+    EXPECT_EQ(TF_OK, TF_GetCode(s));
+    EXPECT_EQ(forwarded_input, 0);
+    EXPECT_EQ(TF_FLOAT, TF_TensorType(output));
+    EXPECT_EQ(0, TF_NumDims(output));
+    TF_DeleteStatus(s);
+  };
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr,
+                                                  my_compute_func, nullptr);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(node_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_DeleteStatus(status);
+  }
+
+  {
+    OpKernelContext::Params p;
+    DummyDevice dummy_device(nullptr);
+    p.device = &dummy_device;
+    AllocatorAttributes alloc_attrs; 
+    p.output_attr_array = &alloc_attrs;
+
+    Tensor t(static_cast<float>(123));
+
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    // GetFakeKernel requires a NodeDef with two inputs 
+    inputs.emplace_back(&t);
+    inputs.emplace_back();
+    p.inputs = &inputs;
+
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, node_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+
+    p.op_kernel = kernel.get();
+    OpKernelContext ctx(&p);
+    kernel->Compute(&ctx);
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<float>()());
+  }
+}
 }  // namespace tensorflow

From aa88605eae286960f52d1dc3fdee06238221d6d2 Mon Sep 17 00:00:00 2001
From: Daniel Nguyen <nguyendaniel@google.com>
Date: Tue, 11 Aug 2020 18:18:25 +0000
Subject: [PATCH 3/3] clean up only

---
 tensorflow/c/kernels.cc      | 13 ++++++++-----
 tensorflow/c/kernels.h       |  3 +--
 tensorflow/c/kernels_test.cc | 10 ++++++----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 86d88943f9a..a3d4e6a90f6 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -282,13 +282,16 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index,
 }
 
 TF_Tensor* TF_ForwardInputOrAllocateOutput(TF_OpKernelContext* context, 
-    int* candidate_input_indices, int num_input_indices, int output_index, 
-    int64_t* output_dims, int output_num_dims, int* forwarded_input, 
-    TF_Status* status) { 
+    int* candidate_input_indices, int num_candidate_input_indices, 
+    int output_index, int64_t* output_dims, int output_num_dims, 
+    int* forwarded_input, TF_Status* status) { 
   TF_SetStatus(status, TF_OK, ""); 
   auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);
-  tensorflow::gtl::ArraySlice<int> input_indices_array(candidate_input_indices, 
-                                                       num_input_indices); 
+
+  static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
+                "64-bit int types should match in size");
+  tensorflow::gtl::ArraySlice<int> input_indices_array(candidate_input_indices,
+      num_candidate_input_indices);
   tensorflow::gtl::ArraySlice<tensorflow::int64> output_dimarray(
       reinterpret_cast<tensorflow::int64*>(output_dims), output_num_dims);
   tensorflow::Tensor* output_tensor_pointer;
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index f9aae309df8..fe388b98dbd 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -205,10 +205,9 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context,
-// forwarded input will be assign to output argument forwarded_input (if it's
+// forwarded input will be assigned to output argument forwarded_input (if it's
 // not nullptr). If no inputs are forwarded, forwarded_input will be assigned
 // -1.
-
 TF_CAPI_EXPORT TF_Tensor* TF_ForwardInputOrAllocateOutput(
     TF_OpKernelContext* context, int* candidate_input_indices, 
-    int num_input_indices, int output_index, int64_t* output_dims, 
+    int num_candidate_input_indices, int output_index, int64_t* output_dims, 
     int output_num_dims, int* forwarded_input, TF_Status* status);
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index 1ff461e0f03..e48e2bc4bb8 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -486,17 +486,21 @@ TEST_F(DeviceKernelOpTest, TestForwardInputOrAllocateOutput) {
       .Output("output1: float")
-      .Attr("SomeDataTypeAttr: type");;
+      .Attr("SomeDataTypeAttr: type");
 
-  // A kernel whose Compute function that forwards one input to output
+  // A kernel whose Compute function forwards a scalar input to the output
   auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
     TF_Status* s = TF_NewStatus();
     int candidate_input_indices[1] = {0}; 
     int forwarded_input; 
     int64_t output_dims[1] = {};
-    TF_Tensor* output = TF_ForwardInputOrAllocateOutput(ctx, 
-        candidate_input_indices, 1, 0, output_dims, 0, &forwarded_input, s); 
+    TF_Tensor* output = TF_ForwardInputOrAllocateOutput(/*context=*/ctx,
+        candidate_input_indices, /*num_candidate_input_indices=*/1,
+        /*output_index=*/0, output_dims, /*output_num_dims=*/0,
+        &forwarded_input, /*status=*/s);
     EXPECT_EQ(TF_OK, TF_GetCode(s));
     EXPECT_EQ(forwarded_input, 0);
     EXPECT_EQ(TF_FLOAT, TF_TensorType(output));
     EXPECT_EQ(0, TF_NumDims(output));
     TF_DeleteStatus(s);
+    // Release the TF_Tensor handle returned above so the test does not leak it.
+    TF_DeleteTensor(output);
   };
@@ -518,7 +522,7 @@ TEST_F(DeviceKernelOpTest, TestForwardInputOrAllocateOutput) {
     AllocatorAttributes alloc_attrs; 
     p.output_attr_array = &alloc_attrs;
 
-    Tensor t(static_cast<float>(123));
+    Tensor t(123.0f);
 
     gtl::InlinedVector<TensorValue, 4> inputs;
     // GetFakeKernel requires a NodeDef with two inputs