Add a function to DeviceBase to deep-copy a tensor on the same device.

This change adds a deep-copy function for CPU and GPU devices, accessible through the DeviceBase class; other devices are not supported for now. The new functionality is intended for TensorFlow internal runtime use only.

PiperOrigin-RevId: 243263305
parent 8fd880bca0
commit d1db9860a2
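For orientation, here is a minimal sketch of how internal runtime code might call the new API, closely mirroring the tests added below; the `device`, `device_context`, and tensor setup are assumed for illustration and are not part of this change:

// Assumes `device` overrides CopyTensorInSameDevice (e.g. a ThreadPoolDevice
// or BaseGPUDevice) and that `output` was pre-allocated by the caller.
Tensor input(DT_FLOAT, TensorShape({4}));
Tensor output(DT_FLOAT, TensorShape({4}));
Notification note;
device->CopyTensorInSameDevice(&input, &output, device_context,
                               [&note](const Status& s) {
                                 TF_CHECK_OK(s);
                                 note.Notify();
                               });
// `done` may run on a device-specific thread, so block until it fires.
note.WaitForNotification();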
tensorflow/core/BUILD
@@ -3900,6 +3900,7 @@ tf_cc_tests(
         "common_runtime/pending_counts_test.cc",
         "common_runtime/placer_test.cc",
         "common_runtime/session_test.cc",
+        "common_runtime/threadpool_device_test.cc",
         "example/feature_util_test.cc",
         "framework/allocator_test.cc",
         "framework/attr_value_util_test.cc",
tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -745,6 +745,14 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   }
 }
 
+void BaseGPUDevice::CopyTensorInSameDevice(const Tensor* input_tensor,
+                                           Tensor* output_tensor,
+                                           const DeviceContext* device_context,
+                                           StatusCallback done) {
+  GPUUtil::CopyGPUTensorToSameGPU(static_cast<Device*>(this), device_context,
+                                  input_tensor, output_tensor, std::move(done));
+}
+
 namespace {
 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
  public:
tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -90,6 +90,10 @@ class BaseGPUDevice : public LocalDevice {
                              const AllocatorAttributes alloc_attrs,
                              Tensor* tensor) override;
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
+                              const DeviceContext* device_context,
+                              StatusCallback done) override;
+
   // The caller owns the returned device.
   PerOpGpuDevice* MakeGpuDevice() override;
 
tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -85,6 +85,36 @@ class GPUDeviceTest : public ::testing::Test {
     }
     return options;
   }
 
+  void InitCPUTensor(Tensor* cpu_tensor, int num_elements, float value) {
+    auto tensor = cpu_tensor->tensor<float, 1>();
+    for (int i = 0; i < num_elements; ++i) {
+      tensor(i) = value;
+    }
+  }
+
+  void CopyCPUToGPU(Tensor* cpu_tensor, Tensor* gpu_tensor, Device* device,
+                    DeviceContext* device_context) {
+    Notification note;
+    device_context->CopyCPUTensorToDevice(cpu_tensor, device, gpu_tensor,
+                                          [&note](const Status& s) {
+                                            TF_ASSERT_OK(s);
+                                            note.Notify();
+                                          });
+    note.WaitForNotification();
+  }
+
+  void CopyGPUToCPU(Tensor* gpu_tensor, Tensor* cpu_tensor, Device* device,
+                    DeviceContext* device_context) {
+    Notification note;
+    device_context->CopyDeviceTensorToCPU(gpu_tensor, /*tensor_name=*/"",
+                                          device, cpu_tensor,
+                                          [&note](const Status& s) {
+                                            TF_ASSERT_OK(s);
+                                            note.Notify();
+                                          });
+    note.WaitForNotification();
+  }
 };
 
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
@@ -277,6 +307,45 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   allocator->DeallocateRaw(ptr);
 }
 
+TEST_F(GPUDeviceTest, CopyTensorInSameDevice) {
+  SessionOptions opts = MakeSessionOptions("0");
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_ASSERT_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  Device* device = devices[0].get();
+  auto* device_info = device->tensorflow_gpu_device_info();
+  CHECK(device_info);
+  DeviceContext* device_context = device_info->default_context;
+  Allocator* allocator = device->GetAllocator(AllocatorAttributes());
+
+  constexpr int kNumElements = 4;
+  Tensor input_tensor(allocator, DT_FLOAT, TensorShape({kNumElements}));
+  Tensor output_tensor(allocator, DT_FLOAT, TensorShape({kNumElements}));
+  Tensor cpu_tensor(cpu_allocator(), DT_FLOAT, TensorShape({kNumElements}));
+  // Initialize input as {1, 1, 1, 1} and output as {0, 0, 0, 0}. After copy,
+  // both should become {1, 1, 1, 1}.
+  InitCPUTensor(&cpu_tensor, kNumElements, 0);
+  CopyCPUToGPU(&cpu_tensor, &output_tensor, device, device_context);
+  InitCPUTensor(&cpu_tensor, kNumElements, 1);
+  CopyCPUToGPU(&cpu_tensor, &input_tensor, device, device_context);
+  Notification note;
+  device->CopyTensorInSameDevice(&input_tensor, &output_tensor, device_context,
+                                 [&note](const Status& s) {
+                                   TF_ASSERT_OK(s);
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+
+  Tensor output_cpu_tensor(cpu_allocator(), DT_FLOAT,
+                           TensorShape({kNumElements}));
+  CopyGPUToCPU(&output_tensor, &output_cpu_tensor, device, device_context);
+  auto input = cpu_tensor.tensor<float, 1>();
+  auto output = output_cpu_tensor.tensor<float, 1>();
+  for (int i = 0; i < kNumElements; ++i) {
+    EXPECT_EQ(input(i), output(i)) << " for index " << i;
+  }
+}
+
 class GPUKernelTrackerTest : public ::testing::Test {
  protected:
   void SetUp() {
tensorflow/core/common_runtime/renamed_device.h
@@ -94,6 +94,13 @@ class RenamedDevice : public Device {
     return underlying_->MakeTensorFromProto(tensor_proto, alloc_attrs, tensor);
   }
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
+                              const DeviceContext* device_context,
+                              StatusCallback done) override {
+    underlying_->CopyTensorInSameDevice(input_tensor, output_tensor,
+                                        device_context, std::move(done));
+  }
+
   // Below are virtual methods defined on Device
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override {
tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
@@ -69,6 +70,19 @@ class SingleThreadedCpuDevice : public Device {
     return Status::OK();
   }
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
+                              const DeviceContext*,
+                              StatusCallback done) override {
+    if (input_tensor->NumElements() != output_tensor->NumElements()) {
+      done(errors::Internal(
+          "SingleThreadedCPU->SingleThreadedCPU copy shape mismatch: input=",
+          input_tensor->shape(), ", output=", output_tensor->shape()));
+      return;
+    }
+    tensor::DeepCopy(*input_tensor, output_tensor);
+    done(Status::OK());
+  }
+
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     return cpu_allocator();
   }
tensorflow/core/common_runtime/threadpool_device.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -102,6 +103,19 @@ Status ThreadPoolDevice::MakeTensorFromProto(
                            ProtoDebugString(tensor_proto));
 }
 
+void ThreadPoolDevice::CopyTensorInSameDevice(
+    const Tensor* input_tensor, Tensor* output_tensor,
+    const DeviceContext* device_context, StatusCallback done) {
+  if (input_tensor->NumElements() != output_tensor->NumElements()) {
+    done(errors::Internal(
+        "CPU->CPU copy shape mismatch: input=", input_tensor->shape(),
+        ", output=", output_tensor->shape()));
+    return;
+  }
+  tensor::DeepCopy(*input_tensor, output_tensor);
+  done(Status::OK());
+}
+
 #ifdef INTEL_MKL
 namespace {
 class MklCPUAllocatorFactory : public AllocatorFactory {
tensorflow/core/common_runtime/threadpool_device.h
@@ -38,6 +38,9 @@ class ThreadPoolDevice : public LocalDevice {
   Status MakeTensorFromProto(const TensorProto& tensor_proto,
                              const AllocatorAttributes alloc_attrs,
                              Tensor* tensor) override;
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
+                              const DeviceContext* device_context,
+                              StatusCallback done) override;
 
   Status Sync() override { return Status::OK(); }
 
tensorflow/core/common_runtime/threadpool_device_test.cc (new file, 72 lines)
@@ -0,0 +1,72 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+const int kDimSize = 2;
+
+void InitTensor(Tensor* tensor, float value) {
+  auto eigen_tensor = tensor->tensor<float, kDimSize>();
+  for (int i = 0; i < kDimSize; ++i) {
+    for (int j = 0; j < kDimSize; ++j) {
+      eigen_tensor(i, j) = value;
+    }
+  }
+}
+
+bool Equal(const Tensor& tensor1, const Tensor& tensor2) {
+  auto eigen_tensor1 = tensor1.tensor<float, kDimSize>();
+  auto eigen_tensor2 = tensor2.tensor<float, kDimSize>();
+  for (int i = 0; i < kDimSize; ++i) {
+    for (int j = 0; j < kDimSize; ++j) {
+      if (eigen_tensor1(i, j) != eigen_tensor2(i, j)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+TEST(ThreadPoolDeviceTest, CopyTensor) {
+  Tensor input(DT_FLOAT, TensorShape({kDimSize, kDimSize}));
+  Tensor output(DT_FLOAT, TensorShape({kDimSize, kDimSize}));
+  InitTensor(&input, 1);
+  InitTensor(&output, 0);
+  ASSERT_FALSE(Equal(input, output));
+
+  ThreadPoolDevice device(SessionOptions(), "/device:CPU:0", Bytes(256),
+                          DeviceLocality(), cpu_allocator());
+  DeviceContext* device_context = new DeviceContext;
+  Notification note;
+  device.CopyTensorInSameDevice(&input, &output, device_context,
+                                [&note](const Status& s) {
+                                  TF_ASSERT_OK(s);
+                                  note.Notify();
+                                });
+  note.WaitForNotification();
+  ASSERT_TRUE(Equal(input, output));
+
+  device_context->Unref();
+}
+
+}  // namespace
+}  // namespace tensorflow
tensorflow/core/framework/device_base.h
@@ -255,6 +255,22 @@ class DeviceBase {
   // be tagging deallocated memory chunks using the same counter.
   virtual uint64 SafeAllocFrontier() { return 0; }
 
+  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
+  // device. This function assumes that `output_tensor` has already been
+  // allocated with a buffer that is large enough to hold `input_tensor`'s data.
+  // Calls `done` from a device-specific thread after copy is finished, which
+  // may be the same as calling thread.
+  //
+  // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy
+  // is discouraged and should not be used in OpKernels.
+  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
+                                      Tensor* output_tensor,
+                                      const DeviceContext* device_context,
+                                      StatusCallback done) {
+    done(errors::Internal("Device ", name(), " does not implement ",
+                          "CopyTensorInSameDevice"));
+  }
+
  protected:
   // Does not take ownership.
   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
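As the default implementation above shows, a device that does not override the new method reports failure through the callback rather than crashing. A caller on an unsupported device would observe it roughly like this (a sketch; `some_device` and the surrounding setup are hypothetical):

some_device->CopyTensorInSameDevice(
    &input, &output, device_context, [](const Status& s) {
      // For devices without an override, s is errors::Internal(
      // "Device ... does not implement CopyTensorInSameDevice").
      if (!s.ok()) LOG(ERROR) << s;
    });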
tensorflow/core/framework/tensor_util.cc
@@ -31,24 +31,28 @@ namespace tensor {
 
 Tensor DeepCopy(const Tensor& other) {
   Tensor tmp = Tensor(other.dtype(), other.shape());
-  if (DataTypeCanUseMemcpy(other.dtype())) {
-    if (other.NumElements() > 0) {
-      StringPiece other_data = other.tensor_data();
+  DeepCopy(other, &tmp);
+  return tmp;
+}
+
+void DeepCopy(const Tensor& input, Tensor* output) {
+  if (DataTypeCanUseMemcpy(input.dtype())) {
+    if (input.NumElements() > 0) {
+      StringPiece input_data = input.tensor_data();
 
       // We use StringPiece as a convenient map over the tensor buffer,
       // but we cast the type to get to the underlying buffer to do the
       // copy.
-      StringPiece tmp_data = tmp.tensor_data();
-      memcpy(const_cast<char*>(tmp_data.data()), other_data.data(),
-             other_data.size());
+      StringPiece output_data = output->tensor_data();
+      memcpy(const_cast<char*>(output_data.data()), input_data.data(),
+             input_data.size());
     }
-  } else if (other.dtype() == DT_STRING) {
-    tmp.unaligned_flat<string>() = other.unaligned_flat<string>();
+  } else if (input.dtype() == DT_STRING) {
+    output->unaligned_flat<string>() = input.unaligned_flat<string>();
   } else {
-    CHECK_EQ(DT_VARIANT, other.dtype());
-    tmp.unaligned_flat<Variant>() = other.unaligned_flat<Variant>();
+    CHECK_EQ(DT_VARIANT, input.dtype());
+    output->unaligned_flat<Variant>() = input.unaligned_flat<Variant>();
   }
-  return tmp;
 }
 
 Status Concat(const gtl::ArraySlice<Tensor>& tensors, Tensor* result) {
tensorflow/core/framework/tensor_util.h
@@ -38,6 +38,10 @@ namespace tensor {
 // 'other' is not appropriately memory-aligned.
 Tensor DeepCopy(const Tensor& other);
 
+// Deep copies input to output. This function is similar to above, but assumes
+// that the memory for the output has already been allocated.
+void DeepCopy(const Tensor& input, Tensor* output);
+
 // Concatenates 'tensors' into a single tensor, along their 0th dimension.
 //
 // REQUIRES: All members of 'tensors' must have the same data type parameter.
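A short usage sketch contrasting the two tensor::DeepCopy overloads (the variable names here are illustrative only):

Tensor src(DT_FLOAT, TensorShape({2, 2}));
src.flat<float>().setConstant(1.0f);

// Original overload: allocates and returns a fresh copy.
Tensor copy1 = tensor::DeepCopy(src);

// New overload: fills an output tensor the caller has already allocated.
Tensor copy2(DT_FLOAT, TensorShape({2, 2}));
tensor::DeepCopy(src, &copy2);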