From ef523e77cad1356fe50d1269a5f0d819edb02984 Mon Sep 17 00:00:00 2001
From: Alexandre Passos
Date: Tue, 19 Dec 2017 16:51:25 -0800
Subject: [PATCH] TFE copy across gpus

PiperOrigin-RevId: 179622259
---
 tensorflow/c/eager/BUILD         |  6 ++-
 tensorflow/c/eager/c_api.cc      | 64 ++++++++++++--------------
 tensorflow/c/eager/c_api_test.cc | 58 +++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 41 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d533758e360..53df884e7ca 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -33,7 +33,7 @@ tf_cuda_library(
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
         ],
-    }),
+    }) + ["//tensorflow/core:gpu_runtime"],
 )
 
 tf_cuda_library(
@@ -55,6 +55,10 @@ tf_cuda_library(
 tf_cuda_cc_test(
     name = "c_api_test",
     srcs = ["c_api_test.cc"],
+    tags = [
+        "guitar",
+        "multi_gpu",
+    ],
     deps = [
         ":c_api",
         "//tensorflow/core:lib",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 706c89536db..beffa191d16 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -167,18 +168,6 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   if (is_same_device) {
     return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
   }
-  const bool src_cpu = IsCPU(srcd);
-  if (src_cpu == dst_cpu) {
-    TF_SetStatus(
-        status, TF_INVALID_ARGUMENT,
-        tensorflow::strings::StrCat(
-            "TFE_TensorHandleCopyToDevice requires either the source "
-            "TFE_TensorHandle be on or the destination device be on CPU "
-            "or be the same (they are ",
-            DeviceName(srcd), " and ", DeviceName(dstd), " in this call)")
-            .c_str());
-    return nullptr;
-  }
   tensorflow::Tensor* src = &(h->t);
   if (!dst_cpu && !tensorflow::DataTypeCanUseMemcpy(src->dtype())) {
     TF_SetStatus(
@@ -189,26 +178,19 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
             .c_str());
     return nullptr;
   }
-  if (src_cpu) {
-    tensorflow::Tensor dst(
-        dstd->GetAllocator(tensorflow::AllocatorAttributes()), src->dtype(),
-        src->shape());
-    if (src->shape().num_elements() == 0) {
-      return new TFE_TensorHandle(dst, dstd);
-    }
-    tensorflow::Notification n;
-    dstd->tensorflow_gpu_device_info()->default_context->CopyCPUTensorToDevice(
-        src, dstd, &dst, [status, &n](const tensorflow::Status& s) {
-          status->status = s;
-          n.Notify();
-        });
-    n.WaitForNotification();
-    return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, dstd)
-                                         : nullptr;
+  tensorflow::Tensor dst(dstd->GetAllocator(tensorflow::AllocatorAttributes()),
+                         src->dtype(), src->shape());
+  if (src->shape().num_elements() == 0) {
+    return new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd);
+  }
+  tensorflow::DeviceContext* src_device_context = nullptr;
+  if (!IsCPU(srcd)) {
+    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+  }
+  tensorflow::DeviceContext* dst_device_context = nullptr;
+  if (!dst_cpu) {
+    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
   }
-  CHECK(dst_cpu);
-  tensorflow::Tensor dst(src->dtype(), src->shape());
-  tensorflow::Notification n;
   // TODO(ashankar): The Sync() call below may be more aggressive than
   // necessary. It is based on knowledge of implementation details - that
   // GPU devices are implemented using 3 streams - one for host->device copies,
@@ -217,16 +199,18 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   // but more than necessary (since it waits for operations that might have
   // nothing to do with this tensor to complete).
   status->status = srcd->Sync();
-  if (!status->status.ok()) return nullptr;
-  srcd->tensorflow_gpu_device_info()->default_context->CopyDeviceTensorToCPU(
-      src, "IGNORE_MY_TENSOR_NAME", srcd, &dst,
-      [status, &n](const tensorflow::Status& s) {
-        status->status = s;
-        n.Notify();
-      });
+  tensorflow::Notification n;
+  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
+                                 srcd, dstd, tensorflow::AllocatorAttributes(),
+                                 tensorflow::AllocatorAttributes(), src, &dst,
+                                 [status, &n](const tensorflow::Status& s) {
+                                   status->status = s;
+                                   n.Notify();
+                                 });
   n.WaitForNotification();
-  return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, nullptr)
-                                       : nullptr;
+  return (TF_GetCode(status) == TF_OK)
+             ? new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd)
+             : nullptr;
 }
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3fe0b7efa11..c5ec0cfc31d 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -216,6 +216,64 @@ TEST(CAPI, TensorHandleCopyBetweenDevices) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
+TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  const int num_devices = TF_DeviceListCount(devices);
+
+  const char* kCPUDevice = "CPU:0";
+  if (num_devices < 3) {
+    TF_DeleteDeviceList(devices);
+    TF_DeleteTensor(t);
+    TFE_DeleteTensorHandle(hcpu);
+    TFE_DeleteContext(ctx, status.get());
+    return;
+  }
+  const string gpu_1_name(TF_DeviceListName(devices, 1, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  const string gpu_2_name(TF_DeviceListName(devices, 2, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_TensorHandle* hdevice =
+      TFE_TensorHandleCopyToDevice(hcpu, ctx, gpu_1_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+
+  TFE_TensorHandle* hdevice2 = TFE_TensorHandleCopyToDevice(
+      hdevice, ctx, gpu_2_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice);
+  // Copy back to CPU
+  TFE_TensorHandle* hcopy =
+      TFE_TensorHandleCopyToDevice(hdevice2, ctx, kCPUDevice, status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice2);
+
+  // Ensure that the contents are the same!
+  TF_Tensor* tcopy = TFE_TensorHandleResolve(hcopy, status.get());
+  TFE_DeleteTensorHandle(hcopy);
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  EXPECT_EQ(TF_TensorByteSize(t), TF_TensorByteSize(tcopy));
+  EXPECT_EQ(
+      0, memcmp(TF_TensorData(t), TF_TensorData(tcopy), TF_TensorByteSize(t)));
+  TF_DeleteTensor(tcopy);
+
+  TF_DeleteDeviceList(devices);
+  TF_DeleteTensor(t);
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_DeleteContext(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+}
+
 TEST(CAPI, TensorHandleSilentCopy) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
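
Usage note (not part of the patch): the sketch below shows one way the new device-to-device path could be driven through the eager C API, mirroring the test above. It is a minimal sketch, not code from this change: the CopyAcrossTwoGpus() name is hypothetical, the short device strings "GPU:0", "GPU:1", and "CPU:0" are assumed to resolve the same way they do in the existing tests, TestMatrixTensorHandle() is the helper defined in c_api_test.cc, and error checking is elided.

#include "tensorflow/c/eager/c_api.h"

// Minimal sketch (hypothetical helper): route a small CPU tensor through
// GPU:0 and GPU:1 and back, exercising the GPU-to-GPU copy enabled above.
// Assumes at least two visible GPUs and the TestMatrixTensorHandle() helper
// from c_api_test.cc; status checks are omitted for brevity.
void CopyAcrossTwoGpus() {
  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);

  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
  // Host -> device copy.
  TFE_TensorHandle* hgpu0 =
      TFE_TensorHandleCopyToDevice(hcpu, ctx, "GPU:0", status);
  // Device -> device copy: rejected before this change, now routed through
  // tensorflow::CopyTensor::ViaDMA inside TFE_TensorHandleCopyToDevice.
  TFE_TensorHandle* hgpu1 =
      TFE_TensorHandleCopyToDevice(hgpu0, ctx, "GPU:1", status);
  // Device -> host copy so the contents can be inspected.
  TFE_TensorHandle* hback =
      TFE_TensorHandleCopyToDevice(hgpu1, ctx, "CPU:0", status);
  TF_Tensor* t = TFE_TensorHandleResolve(hback, status);

  TF_DeleteTensor(t);
  TFE_DeleteTensorHandle(hback);
  TFE_DeleteTensorHandle(hgpu1);
  TFE_DeleteTensorHandle(hgpu0);
  TFE_DeleteTensorHandle(hcpu);
  TFE_DeleteContext(ctx, status);
  TF_DeleteStatus(status);
}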