From ef523e77cad1356fe50d1269a5f0d819edb02984 Mon Sep 17 00:00:00 2001
From: Alexandre Passos
Date: Tue, 19 Dec 2017 16:51:25 -0800
Subject: [PATCH] TFE copy across gpus

PiperOrigin-RevId: 179622259
---
 tensorflow/c/eager/BUILD         |  6 ++-
 tensorflow/c/eager/c_api.cc      | 64 ++++++++++++--------------
 tensorflow/c/eager/c_api_test.cc | 58 +++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 41 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d533758e360..53df884e7ca 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -33,7 +33,7 @@ tf_cuda_library(
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
         ],
-    }),
+    }) + ["//tensorflow/core:gpu_runtime"],
 )
 
 tf_cuda_library(
@@ -55,6 +55,10 @@ tf_cuda_library(
 tf_cuda_cc_test(
     name = "c_api_test",
     srcs = ["c_api_test.cc"],
+    tags = [
+        "guitar",
+        "multi_gpu",
+    ],
     deps = [
         ":c_api",
         "//tensorflow/core:lib",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 706c89536db..beffa191d16 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -167,18 +168,6 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   if (is_same_device) {
     return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
   }
-  const bool src_cpu = IsCPU(srcd);
-  if (src_cpu == dst_cpu) {
-    TF_SetStatus(
-        status, TF_INVALID_ARGUMENT,
-        tensorflow::strings::StrCat(
-            "TFE_TensorHandleCopyToDevice requires either the source "
-            "TFE_TensorHandle be on or the destination device be on CPU "
-            "or be the same (they are ",
-            DeviceName(srcd), " and ", DeviceName(dstd), " in this call)")
-            .c_str());
-    return nullptr;
-  }
   tensorflow::Tensor* src = &(h->t);
   if (!dst_cpu && !tensorflow::DataTypeCanUseMemcpy(src->dtype())) {
     TF_SetStatus(
@@ -189,26 +178,19 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
             .c_str());
     return nullptr;
   }
-  if (src_cpu) {
-    tensorflow::Tensor dst(
-        dstd->GetAllocator(tensorflow::AllocatorAttributes()), src->dtype(),
-        src->shape());
-    if (src->shape().num_elements() == 0) {
-      return new TFE_TensorHandle(dst, dstd);
-    }
-    tensorflow::Notification n;
-    dstd->tensorflow_gpu_device_info()->default_context->CopyCPUTensorToDevice(
-        src, dstd, &dst, [status, &n](const tensorflow::Status& s) {
-          status->status = s;
-          n.Notify();
-        });
-    n.WaitForNotification();
-    return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, dstd)
-                                         : nullptr;
+  tensorflow::Tensor dst(dstd->GetAllocator(tensorflow::AllocatorAttributes()),
+                         src->dtype(), src->shape());
+  if (src->shape().num_elements() == 0) {
+    return new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd);
+  }
+  tensorflow::DeviceContext* src_device_context = nullptr;
+  if (!IsCPU(srcd)) {
+    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+  }
+  tensorflow::DeviceContext* dst_device_context = nullptr;
+  if (!dst_cpu) {
+    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
   }
-  CHECK(dst_cpu);
-  tensorflow::Tensor dst(src->dtype(), src->shape());
-  tensorflow::Notification n;
   // TODO(ashankar): The Sync() call below may be more aggressive than
   // necessary. It is based on knowledge of implementation details - that
   // GPU devices are implemented using 3 streams - one for host->device copies,
@@ -217,16 +199,18 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   // but more than necessary (since it waits for operations that might have
   // nothing to do with this tensor to complete).
   status->status = srcd->Sync();
-  if (!status->status.ok()) return nullptr;
-  srcd->tensorflow_gpu_device_info()->default_context->CopyDeviceTensorToCPU(
-      src, "IGNORE_MY_TENSOR_NAME", srcd, &dst,
-      [status, &n](const tensorflow::Status& s) {
-        status->status = s;
-        n.Notify();
-      });
+  tensorflow::Notification n;
+  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
+                                 srcd, dstd, tensorflow::AllocatorAttributes(),
+                                 tensorflow::AllocatorAttributes(), src, &dst,
+                                 [status, &n](const tensorflow::Status& s) {
+                                   status->status = s;
+                                   n.Notify();
+                                 });
   n.WaitForNotification();
-  return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, nullptr)
-                                       : nullptr;
+  return (TF_GetCode(status) == TF_OK)
+             ? new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd)
+             : nullptr;
 }
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3fe0b7efa11..c5ec0cfc31d 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -216,6 +216,64 @@ TEST(CAPI, TensorHandleCopyBetweenDevices) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
+TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  const int num_devices = TF_DeviceListCount(devices);
+
+  const char* kCPUDevice = "CPU:0";
+  if (num_devices < 3) {
+    TF_DeleteDeviceList(devices);
+    TF_DeleteTensor(t);
+    TFE_DeleteTensorHandle(hcpu);
+    TFE_DeleteContext(ctx, status.get());
+    return;
+  }
+  const string gpu_1_name(TF_DeviceListName(devices, 1, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  const string gpu_2_name(TF_DeviceListName(devices, 2, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_TensorHandle* hdevice =
+      TFE_TensorHandleCopyToDevice(hcpu, ctx, gpu_1_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+
+  TFE_TensorHandle* hdevice2 = TFE_TensorHandleCopyToDevice(
+      hdevice, ctx, gpu_2_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice);
+  // Copy back to CPU
+  TFE_TensorHandle* hcopy =
+      TFE_TensorHandleCopyToDevice(hdevice2, ctx, kCPUDevice, status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice2);
+
+  // Ensure that the contents are the same!
+  TF_Tensor* tcopy = TFE_TensorHandleResolve(hcopy, status.get());
+  TFE_DeleteTensorHandle(hcopy);
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  EXPECT_EQ(TF_TensorByteSize(t), TF_TensorByteSize(tcopy));
+  EXPECT_EQ(
+      0, memcmp(TF_TensorData(t), TF_TensorData(tcopy), TF_TensorByteSize(t)));
+  TF_DeleteTensor(tcopy);
+
+  TF_DeleteDeviceList(devices);
+  TF_DeleteTensor(t);
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_DeleteContext(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+}
+
 TEST(CAPI, TensorHandleSilentCopy) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
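
Usage note (not part of the patch): the sketch below shows one way the new device-to-device path could be driven through the eager C API, mirroring the test above. It is a minimal sketch, not code from this change: the CopyAcrossTwoGpus() name is hypothetical, the short device strings "GPU:0", "GPU:1", and "CPU:0" are assumed to resolve the same way they do in the existing tests, TestMatrixTensorHandle() is the helper defined in c_api_test.cc, and error checking is elided.

#include "tensorflow/c/eager/c_api.h"

// Minimal sketch (hypothetical helper): route a small CPU tensor through
// GPU:0 and GPU:1 and back, exercising the GPU-to-GPU copy enabled above.
// Assumes at least two visible GPUs and the TestMatrixTensorHandle() helper
// from c_api_test.cc; status checks are omitted for brevity.
void CopyAcrossTwoGpus() {
  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);

  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
  // Host -> device copy.
  TFE_TensorHandle* hgpu0 =
      TFE_TensorHandleCopyToDevice(hcpu, ctx, "GPU:0", status);
  // Device -> device copy: rejected before this change, now routed through
  // tensorflow::CopyTensor::ViaDMA inside TFE_TensorHandleCopyToDevice.
  TFE_TensorHandle* hgpu1 =
      TFE_TensorHandleCopyToDevice(hgpu0, ctx, "GPU:1", status);
  // Device -> host copy so the contents can be inspected.
  TFE_TensorHandle* hback =
      TFE_TensorHandleCopyToDevice(hgpu1, ctx, "CPU:0", status);
  TF_Tensor* t = TFE_TensorHandleResolve(hback, status);

  TF_DeleteTensor(t);
  TFE_DeleteTensorHandle(hback);
  TFE_DeleteTensorHandle(hgpu1);
  TFE_DeleteTensorHandle(hgpu0);
  TFE_DeleteTensorHandle(hcpu);
  TFE_DeleteContext(ctx, status);
  TF_DeleteStatus(status);
}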