Merge pull request #41713 from dnguyen28061:allocate_temp
PiperOrigin-RevId: 326305159 Change-Id: Ic95dcb86e7bc58ced1666c7adcbddb9cc99dd2f4
commit efa82dd9a7
tensorflow/c/BUILD

@@ -23,6 +23,7 @@ filegroup(
    srcs = [
        "c_api.h",
        "c_api_experimental.h",
        "c_api_macros.h",
        "tensor_interface.h",
        "tf_attrtype.h",
        "tf_datatype.h",
@@ -61,6 +62,7 @@ filegroup(
    name = "pywrap_required_hdrs",
    srcs = [
        "c_api_internal.h",
        "c_api_macros.h",
        "conversion_macros.h",
        "python_api.h",
        "tensor_interface.h",
@@ -79,6 +81,7 @@ tf_cuda_library(
    hdrs = [
        "c_api.h",
        "c_api_internal.h",
        "c_api_macros.h",
        "tf_datatype.h",
        "tf_tensor.h",
        "tf_tstring.h",
@@ -310,6 +313,7 @@ cc_library(
    hdrs = ["tf_tensor.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":c_api_macros",
        ":tensor_interface",
        ":tf_datatype",
        ":tf_status",
@@ -336,6 +340,7 @@ tf_cuda_library(
    ],
    visibility = ["//tensorflow:internal"],
    deps = [
        ":c_api_macros",
        ":tensor_interface",
        ":tf_datatype",
        ":tf_status",
tensorflow/c/c_api_macros.h

@@ -30,4 +30,17 @@ limitations under the License.
#endif  // _WIN32
#endif  // SWIG

// TF_Bool is the C API typedef for unsigned char, while TF_BOOL is
// the datatype for boolean tensors.
#ifndef TF_Bool
#define TF_Bool unsigned char
#endif  // TF_Bool

// Macro used to calculate struct size for maintaining ABI stability across
// different struct implementations.
#ifndef TF_OFFSET_OF_END
#define TF_OFFSET_OF_END(TYPE, MEMBER) \
  (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
#endif  // TF_OFFSET_OF_END

#endif  // TENSORFLOW_C_C_API_MACROS_H_
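To make the struct-size convention concrete, here is a minimal, self-contained sketch (not part of the patch) of how TF_OFFSET_OF_END is meant to be used: a caller fills struct_size with the size up to and including the last field it knows about, so fields appended in later versions do not change the value older binaries pass. The ExampleAttrs struct and EXAMPLE_ATTRS_STRUCT_SIZE macro below are hypothetical stand-ins for the TF_AllocatorAttributes pattern added elsewhere in this change.

#include <stddef.h>  // offsetof
#include <stdio.h>

// Same definition as the macro added above.
#define TF_OFFSET_OF_END(TYPE, MEMBER) \
  (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

// Hypothetical versioned struct mirroring the TF_AllocatorAttributes pattern.
typedef struct ExampleAttrs {
  size_t struct_size;     // set by the caller to the size it was compiled with
  unsigned char on_host;  // v1 field
  // int future_field;    // a later version would append new fields here
} ExampleAttrs;

#define EXAMPLE_ATTRS_STRUCT_SIZE TF_OFFSET_OF_END(ExampleAttrs, on_host)

int main() {
  ExampleAttrs attrs;
  attrs.struct_size = EXAMPLE_ATTRS_STRUCT_SIZE;  // size through on_host only
  attrs.on_host = 1;
  // An implementation can compare struct_size against the sizes it knows
  // about to decide which fields are safe to read.
  printf("struct_size = %zu\n", attrs.struct_size);
  return 0;
}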
tensorflow/c/kernels.cc

@@ -261,7 +261,6 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index,
                             size_t len, TF_Status* status) {
  TF_SetStatus(status, TF_OK, "");
  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);

  static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
                "64-bit int types should match in size");
  tensorflow::gtl::ArraySlice<tensorflow::int64> dimarray(
@@ -279,4 +278,43 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index,
    return nullptr;
  }
  return tf_tensor;
}
}

TF_Tensor* TF_AllocateTemp(TF_OpKernelContext* context, TF_DataType dtype,
                           int64_t* dims, int num_dims,
                           TF_AllocatorAttributes* attributes,
                           TF_Status* status) {
  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);
  TF_SetStatus(status, TF_OK, "");
  static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
                "64-bit int types should match in size");
  tensorflow::gtl::ArraySlice<tensorflow::int64> dimarray(
      reinterpret_cast<tensorflow::int64*>(dims), num_dims);
  if (attributes && !attributes->struct_size) {
    TF_SetStatus(
        status, TF_INVALID_ARGUMENT,
        "TF_AllocatorAttributes struct "
        "size member must be set to TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE");
    return nullptr;
  }
  tensorflow::AllocatorAttributes allocator_attr;
  if (attributes && attributes->on_host) {
    allocator_attr.set_on_host(true);
  }
  tensorflow::Status s;
  tensorflow::Tensor tensor;
  s = cc_ctx->allocate_temp(static_cast<tensorflow::DataType>(dtype),
                            tensorflow::TensorShape(dimarray), &tensor,
                            allocator_attr);
  if (!s.ok()) {
    ::tensorflow::Set_TF_Status_from_Status(status, s);
    return nullptr;
  }
  TF_Tensor* tf_tensor;
  tf_tensor = TF_TensorFromTensor(tensor, &s);
  if (!s.ok()) {
    ::tensorflow::Set_TF_Status_from_Status(status, s);
    return nullptr;
  }
  return tf_tensor;
}
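The implementation above reports failures through the TF_Status out-parameter and returns nullptr, and it rejects a TF_AllocatorAttributes whose struct_size was left at zero. Below is a hedged caller-side sketch of that contract, not taken from the patch: the helper name AllocateHostScratch is hypothetical, and the TF_OpKernelContext is assumed to be the one passed into a kernel's compute callback.

#include "tensorflow/c/kernels.h"
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_tensor.h"

// Hypothetical helper: allocate a 1-D float scratch tensor in host memory,
// returning nullptr if the allocation fails.
static TF_Tensor* AllocateHostScratch(TF_OpKernelContext* ctx, int64_t width) {
  TF_AllocatorAttributes attrs;
  // Leaving struct_size at 0 would make TF_AllocateTemp fail with
  // TF_INVALID_ARGUMENT, so it must be set before the call.
  attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;
  attrs.on_host = 1;  // request host (CPU-visible) memory

  TF_Status* status = TF_NewStatus();
  int64_t dims[1] = {width};
  TF_Tensor* scratch =
      TF_AllocateTemp(ctx, TF_FLOAT, dims, /*num_dims=*/1, &attrs, status);
  if (TF_GetCode(status) != TF_OK) {
    scratch = nullptr;  // on failure the API already returned nullptr
  }
  TF_DeleteStatus(status);
  return scratch;  // caller owns the tensor and must TF_DeleteTensor() it
}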
tensorflow/c/kernels.h

@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_tensor.h"

// Macro to control visibility of exported symbols in the shared library (.so,
// .dylib, .dll).
@@ -199,6 +200,15 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context,
                                            int64_t* dims, int num_dims,
                                            size_t len, TF_Status* status);

// Allocates a temporary Tensor of the specified type and shape. The
// Tensor must not be used after kernel construction is
// complete.
//
// num_dims must equal the size of array dims
TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTemp(
    TF_OpKernelContext* context, TF_DataType dtype, int64_t* dims, int num_dims,
    TF_AllocatorAttributes* alloc_attrs, TF_Status* status);

#ifdef __cplusplus
} /* end extern "C" */
#endif
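For orientation, here is a hedged sketch of how a kernel's compute callback might use the new entry point end to end, modeled on the tests added later in this change: set up allocator attributes, allocate the temporary, fill it, and forward it as output 0. The function name ExampleCompute is a placeholder, and the plain memcpy assumes on_host = 1.

#include <string.h>

#include "tensorflow/c/kernels.h"
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_tensor.h"

// Placeholder compute callback for a kernel registered through the C API.
static void ExampleCompute(void* kernel, TF_OpKernelContext* ctx) {
  TF_Status* s = TF_NewStatus();

  TF_AllocatorAttributes alloc_attrs;
  alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;
  alloc_attrs.on_host = 1;  // host memory, so the memcpy below is valid

  int64_t dim = 3;
  TF_Tensor* tmp = TF_AllocateTemp(/*context=*/ctx, /*dtype=*/TF_FLOAT,
                                   /*dims=*/&dim, /*num_dims=*/1,
                                   /*alloc_attrs=*/&alloc_attrs, s);
  if (TF_GetCode(s) == TF_OK) {
    float values[3] = {1.0f, 2.0f, 3.0f};
    memcpy(TF_TensorData(tmp), values, sizeof(values));
    TF_SetOutput(ctx, 0, tmp, s);  // hand the filled tensor to output slot 0
    TF_DeleteTensor(tmp);  // safe: the tests delete the temp after TF_SetOutput
  }
  TF_DeleteStatus(s);
}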
tensorflow/c/kernels_test.cc

@@ -368,6 +368,16 @@ class DeviceKernelOpTest : public OpsTestBase {
#endif
};

// Validates that the tensor has shape and type corresponding to
// dims and dtype.
void validate_tensor(TF_Tensor* tensor, int64_t* dims, int64_t num_dims,
                     TF_DataType dtype);

// Copies data of length tensor_size_bytes from values to tensor.
template <typename T>
void set_tensor_data(TF_Tensor* tensor, T* values, size_t tensor_size_bytes,
                     TF_OpKernelContext* ctx);

REGISTER_OP("AllocateOutputOp1").Output("output1: float");

TEST_F(DeviceKernelOpTest, TestAllocateOutputSizeOne) {
@@ -379,22 +389,11 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSizeOne) {
    TF_Tensor* output = TF_AllocateOutput(
        /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim,
        /*num_dims=*/1, /*len=*/tensor_size_bytes, s);
    EXPECT_EQ(TF_OK, TF_GetCode(s));
    EXPECT_EQ(TF_FLOAT, TF_TensorType(output));
    EXPECT_EQ(1, TF_NumDims(output));
    EXPECT_EQ(1, TF_Dim(output, 0));
    validate_tensor(output, &dim, 1, TF_FLOAT);

    // Set output to 3
    float* data = reinterpret_cast<float*>(TF_TensorData(output));
    float value = 3.0f;
#if GOOGLE_CUDA
    OpKernelContext* cc_ctx = reinterpret_cast<OpKernelContext*>(ctx);
    cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, &value,
                                                  tensor_size_bytes);
#else
    *data = value;
#endif

    float values[1] = {3.0f};
    set_tensor_data<float>(output, values, tensor_size_bytes, ctx);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };
@@ -417,12 +416,8 @@ TEST_F(DeviceKernelOpTest, TestAllocateEmptyOutput) {
    TF_Tensor* output = TF_AllocateOutput(
        /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim,
        /*num_dims=*/1, /*len=*/0, s);

    EXPECT_EQ(TF_OK, TF_GetCode(s));
    EXPECT_EQ(TF_FLOAT, TF_TensorType(output));
    EXPECT_EQ(1, TF_NumDims(output));
    EXPECT_EQ(0, TF_Dim(output, 0));

    validate_tensor(output, &dim, 1, TF_FLOAT);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };
@@ -442,27 +437,16 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) {
    TF_Status* s = TF_NewStatus();
    // Allocate 2x3 output
    int64_t dim[2] = {2, 3};
    size_t tensor_size_bytes = 6 * TF_DataTypeSize(TF_FLOAT);
    size_t tensor_size_bytes = TF_DataTypeSize(TF_FLOAT) * 6;
    TF_Tensor* output = TF_AllocateOutput(
        /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/dim,
        /*num_dims=*/2, /*len=*/tensor_size_bytes, s);
    EXPECT_EQ(TF_OK, TF_GetCode(s));
    EXPECT_EQ(TF_FLOAT, TF_TensorType(output));
    EXPECT_EQ(2, TF_NumDims(output));
    EXPECT_EQ(2, TF_Dim(output, 0));
    EXPECT_EQ(3, TF_Dim(output, 1));
    validate_tensor(output, dim, 2, TF_FLOAT);

    // Set output to [1 2 3 4 5 6]
    void* data = TF_TensorData(output);
    float value[6] = {1, 2, 3, 4, 5, 6};
#if GOOGLE_CUDA
    OpKernelContext* cc_ctx = reinterpret_cast<OpKernelContext*>(ctx);
    cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, value,
                                                  tensor_size_bytes);
#else
    memcpy(data, value, tensor_size_bytes);
#endif

    float values[6] = {1, 2, 3, 4, 5, 6};
    set_tensor_data<float>(output, values, tensor_size_bytes, ctx);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };
@@ -474,4 +458,132 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) {
  EXPECT_EQ("Tensor<type: float shape: [2,3] values: [1 2 3][4 5 6]>",
            output->DebugString(100));
}

REGISTER_OP("AllocateTempOp1").Output("output1: float");

TEST_F(DeviceKernelOpTest, TestAllocateTempSizeOne) {
  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
    // Allocate scalar TF_Tensor
    TF_Status* s = TF_NewStatus();
    int64_t dim = 1;
    TF_AllocatorAttributes alloc_attrs;
    alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;
#if GOOGLE_CUDA
    alloc_attrs.on_host = 0;
#else
    alloc_attrs.on_host = 1;
#endif
    TF_Tensor* output = TF_AllocateTemp(
        /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/&dim,
        /*num_dims=*/1, /*allocator_attributes*/ &alloc_attrs, s);
    size_t tensor_size_bytes = TF_DataTypeSize(TF_FLOAT);
    EXPECT_EQ(TF_OK, TF_GetCode(s));
    validate_tensor(output, &dim, 1, TF_FLOAT);

    // Set TF_Tensor value to 3
    float values[1] = {3.0f};
    set_tensor_data<float>(output, values, tensor_size_bytes, ctx);
    TF_SetOutput(ctx, 0, output, s);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };

  SetupOp("AllocateTempOp1", "AllocateTemp1", my_compute_func);

  TF_ASSERT_OK(RunOpKernel());
  Tensor* output = GetOutput(0);
  EXPECT_EQ("Tensor<type: float shape: [1] values: 3>",
            output->DebugString(100));
}

REGISTER_OP("AllocateTempOp0").Output("output1: float");

TEST_F(DeviceKernelOpTest, TestAllocateTempEmpty) {
  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
    TF_Status* s = TF_NewStatus();
    // Allocate empty TF_Tensor
    int64_t dim = 0;
    TF_AllocatorAttributes alloc_attrs;
    alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;
#if GOOGLE_CUDA
    alloc_attrs.on_host = 0;
#else
    alloc_attrs.on_host = 1;
#endif
    TF_Tensor* output = TF_AllocateTemp(
        /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/&dim,
        /*num_dims=*/1, /*allocator_attributes*/ &alloc_attrs, s);
    EXPECT_EQ(TF_OK, TF_GetCode(s));
    validate_tensor(output, &dim, 1, TF_FLOAT);
    TF_SetOutput(ctx, 0, output, s);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };

  SetupOp("AllocateTempOp0", "AllocateTemp0", my_compute_func);

  TF_ASSERT_OK(RunOpKernel());
  Tensor* output = GetOutput(0);
  EXPECT_EQ("Tensor<type: float shape: [0] values: >",
            output->DebugString(100));
}

REGISTER_OP("AllocateTempOp2x3").Output("output1: float");

TEST_F(DeviceKernelOpTest, TestAllocateTempSize2x3) {
  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
    TF_Status* s = TF_NewStatus();
    size_t tensor_size_bytes = 6 * TF_DataTypeSize(TF_FLOAT);
    // Allocate 2x3 TF_Tensor
    int64_t dim[2] = {2, 3};
    TF_AllocatorAttributes alloc_attrs;
    alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;
#if GOOGLE_CUDA
    alloc_attrs.on_host = 0;
#else
    alloc_attrs.on_host = 1;
#endif
    TF_Tensor* output = TF_AllocateTemp(
        /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/dim,
        /*num_dims=*/2, /*allocator_attributes*/ &alloc_attrs, s);
    EXPECT_EQ(TF_OK, TF_GetCode(s));
    validate_tensor(output, dim, 2, TF_FLOAT);

    // Set TF_Tensor values to [1 2 3 4 5 6]
    float values[6] = {1, 2, 3, 4, 5, 6};
    set_tensor_data<float>(output, values, tensor_size_bytes, ctx);
    TF_SetOutput(ctx, 0, output, s);
    TF_DeleteStatus(s);
    TF_DeleteTensor(output);
  };

  SetupOp("AllocateTempOp2x3", "AllocateTempOp2x3", my_compute_func);

  TF_ASSERT_OK(RunOpKernel());
  Tensor* output = GetOutput(0);
  EXPECT_EQ("Tensor<type: float shape: [2,3] values: [1 2 3][4 5 6]>",
            output->DebugString(100));
}

void validate_tensor(TF_Tensor* tensor, int64_t* dims, int64_t num_dims,
                     TF_DataType dtype) {
  EXPECT_EQ(TF_FLOAT, TF_TensorType(tensor));
  EXPECT_EQ(num_dims, TF_NumDims(tensor));
  for (int i = 0; i < num_dims; ++i) {
    EXPECT_EQ(dims[i], TF_Dim(tensor, i));
  }
}

template <typename T>
void set_tensor_data(TF_Tensor* tensor, T* values, size_t tensor_size_bytes,
                     TF_OpKernelContext* ctx) {
  T* data = reinterpret_cast<T*>(TF_TensorData(tensor));
#if GOOGLE_CUDA
  OpKernelContext* cc_ctx = reinterpret_cast<OpKernelContext*>(ctx);
  cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, values,
                                                tensor_size_bytes);
#else
  memcpy(data, values, tensor_size_bytes);
#endif
}
}  // namespace tensorflow
tensorflow/c/tf_tensor.h

@@ -19,6 +19,7 @@ limitations under the License.
#include <stdbool.h>
#include <stdint.h>

#include "tensorflow/c/c_api_macros.h"
#include "tensorflow/c/tf_datatype.h"
#include "tensorflow/c/tf_status.h"
@@ -45,6 +46,16 @@ limitations under the License.
extern "C" {
#endif

// Allocator Attributes used for tensor allocation.
typedef struct TF_AllocatorAttributes {
  size_t struct_size;
  // Set boolean to 1 for CPU allocation, else 0.
  TF_Bool on_host;
} TF_AllocatorAttributes;

#define TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE \
  TF_OFFSET_OF_END(TF_AllocatorAttributes, on_host)

// --------------------------------------------------------------------------
// TF_Tensor holds a multi-dimensional array of elements of a single data type.
// For all types other than TF_STRING, the data buffer stores elements
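As a usage note, and only a sketch rather than part of the patch: the tests in this change initialize the struct by setting struct_size first and then choosing on_host per build, since a GPU kernel generally wants device memory while a CPU build must allocate on the host. A small helper along those lines might look like the following; the name DefaultTempAttrs is hypothetical.

#include "tensorflow/c/tf_tensor.h"

// Hypothetical helper mirroring the initialization pattern in kernels_test.cc.
static TF_AllocatorAttributes DefaultTempAttrs() {
  TF_AllocatorAttributes attrs;
  attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE;  // required
#if GOOGLE_CUDA
  attrs.on_host = 0;  // allocate in device memory for GPU builds
#else
  attrs.on_host = 1;  // allocate on the host for CPU builds
#endif
  return attrs;
}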