ConvertToEagerTensor now allocates on GPU if one is available and no device is specified

This allows getting rid of redundant H->D transfers. For example,
tf.gather(tf.constant([42.0]), 0) would previously allocate both [42.0]
and 0 on the CPU, and then transfer both to the GPU to compute Gather.
This could potentially hurt ops with inputs pinned to host memory, e.g. Range.

PiperOrigin-RevId: 275252442
Change-Id: I7d85d3314b9701e7b9df76acea12c2fcfdf2960e
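A minimal sketch of the placement change described above, assuming eager mode on a machine with at least one GPU and no enclosing device scope; the printed device strings are illustrative:

import tensorflow as tf

# With the change, an unpinned constant may be allocated directly on the
# GPU, so Gather no longer needs a host-to-device copy of its inputs.
params = tf.constant([42.0])   # previously CPU; now may land on GPU:0
indices = tf.constant(0)       # int32 scalar: kept in host memory regardless
out = tf.gather(params, indices)
print(params.device)           # e.g. ".../device:GPU:0"
print(out.device)              # Gather runs where its input already lives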
parent 2542bbea51
commit 17c4db8be1
tensorflow/python
@@ -1051,17 +1051,16 @@ class SendRecvTest(test_util.TensorFlowTestCase):
     configure_virtual_cpus()
 
   def testBasic(self):
-    with ops.device(self.cpu_device):
-      t0 = constant_op.constant(1.0)
-      t1 = constant_op.constant(2.0)
-      self._send(t0, 't0', self.cpu_device)
-      self._send(t1, 't1', self.cpu_device)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't0', self.cpu_device),
-          1.0)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't1', self.cpu_device),
-          2.0)
+    t0 = constant_op.constant(1.0)
+    t1 = constant_op.constant(2.0)
+    self._send(t0, 't0', self.cpu_device)
+    self._send(t1, 't1', self.cpu_device)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't0', self.cpu_device),
+        1.0)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't1', self.cpu_device),
+        2.0)
 
   @test_util.run_gpu_only
   def testLocalCrossDevice(self):
@@ -268,26 +268,6 @@ TFE_TensorHandle* PySeqToTFE_TensorHandle(PyObject* value, DataType dtype) {
   return new TFE_TensorHandle(handle);
 }
 
-const char* MaybeUpdateDevice(TFE_Context* ctx, DataType dtype,
-                              const char* device_name) {
-  if (!(device_name == nullptr || strcmp(device_name, "") == 0) ||
-      dtype == DT_INVALID || DataTypeAlwaysOnHost(dtype)) {
-    return device_name;
-  }
-
-  // Approximately follow the logic of SelectDevice and
-  // ColocationGraph::FilterSupportedDevices. Unlike the latter, though,
-  // here we do not sort by device name to avoid allocating a temporary.
-  const auto& devices = *(ctx->context->devices());
-  const auto first_local_gpu =
-      std::find_if(devices.begin(), devices.end(), [](const Device* dev) {
-        return dev->IsLocal() && dev->device_type() == DEVICE_GPU;
-      });
-  return first_local_gpu == devices.end()
-             ? nullptr
-             : strdup((*first_local_gpu)->name().c_str());
-}
-
 TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
                                                PyObject* value,
                                                tensorflow::DataType dtype,
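The helper removed above returns an explicitly requested device untouched and otherwise scans for the first local GPU. A rough Python analogue of that selection logic, assuming the tf.config API; the function name is hypothetical:

import tensorflow as tf

def maybe_update_device(device_name, dtype):
  # Keep an explicitly requested device, and never auto-place dtypes that
  # must live in host memory (mirrors the DataTypeAlwaysOnHost check).
  if device_name:
    return device_name
  if dtype in (tf.string, tf.resource):
    return None
  # Pick the first local GPU, if any (mirrors the std::find_if above).
  gpus = tf.config.list_logical_devices('GPU')
  return gpus[0].name if gpus else None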
@@ -369,9 +349,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
     }
   }
 
-  device_name =
-      MaybeUpdateDevice(ctx, static_cast<DataType>(handle_dtype), device_name);
-
   // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host
   // memory. We approximate the same behavior for eager execution - keeping
   // int32 tensors in host memory.
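The int32 caveat in the comment above is observable from Python; the output below is illustrative and assumes a GPU machine:

import tensorflow as tf

f = tf.constant(1.0)       # float32: gets a real GPU allocation
i = tf.constant(1)         # int32: kept in host memory by most GPU kernels
print(f.backing_device)    # e.g. ".../device:GPU:0"
print(i.backing_device)    # e.g. ".../device:CPU:0"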
@@ -418,10 +395,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
 TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
                                        DataType dtype,
                                        const char* device_name) {
-  // The device is updated twice: before the conversion using the
-  // desired dtype (if given), and after, using the effective dtype.
-  device_name = MaybeUpdateDevice(ctx, dtype, device_name);
-
   // Reduce the overhead of allocation/transfer-to-device for scalars by
   // caching the corresponding handles. Note that currently only Python
   // scalars are cached.
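The comment removed above explains why placement was resolved twice: the requested dtype and the dtype actually inferred from the Python value can differ, and the placement decision depends on the dtype. A small illustration:

import tensorflow as tf

a = tf.constant(1)                    # dtype inferred as int32 -> host memory
b = tf.constant(1, dtype=tf.float32)  # explicit float32 -> eligible for GPU
print(a.dtype, b.dtype)               # int32 float32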
@@ -433,14 +406,7 @@ TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
     handle = ConvertToEagerTensorUncached(ctx, value, dtype, device_name);
     if (handle == nullptr) return nullptr;
     if (!PyFloat_Check(value) || std::isfinite(PyFloat_AS_DOUBLE(value))) {
-      // ConvertToEagerTensorUncached might have decided to allocate a
-      // tensor on a different device (e.g. due to TF_INT32 hack). Resolve
-      // device name from handle.
-      const auto* dev = handle->handle->op_device();
-      static constexpr char kEmpty[] = "";
-      cache->Insert(value, dtype,
-                    dev == nullptr ? kEmpty : strdup(dev->name().c_str()),
-                    handle);
+      cache->Insert(value, dtype, device_name, handle);
     }
     return handle;
   } else {
@@ -94,8 +94,6 @@ struct TFE_TensorHandleCache {
   // Not guarded by a mutex because the code is only used while the
   // GIL is held.
   absl::flat_hash_map<Key, tensorflow::TensorHandle*> cache;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(TFE_TensorHandleCache);
 };
 
 }  // namespace tensorflow
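For context, TFE_TensorHandleCache maps a (value, dtype, device) key to a tensor handle and relies on the GIL instead of a mutex. A minimal Python sketch of that keying scheme; all names here are hypothetical, not the library's API:

_handle_cache = {}  # (type, value, dtype, device) -> handle; GIL-guarded

def cached_scalar_handle(value, dtype, device_name, convert_fn):
  # Include the Python type in the key: True == 1 == 1.0 compare equal in
  # Python but must not share a cached handle.
  key = (type(value), value, dtype, device_name)
  handle = _handle_cache.get(key)
  if handle is None:
    handle = convert_fn(value, dtype, device_name)
    _handle_cache[key] = handle
  return handle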
@@ -374,9 +374,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
 
   def test_numpyIsView(self):
     t = constant_op.constant([0.0])
-    if not t.device.endswith("CPU:0"):
-      self.skipTest(".numpy() only returns a view on CPU")
-
     t._numpy()[0] = 42.0
     self.assertAllClose(t, constant_op.constant([42.0]))
 
@@ -68,8 +68,7 @@ class ConfigTest(test.TestCase, parameterized.TestCase):
     context.ensure_initialized()
 
     def copy_tensor(dtype=dtypes.int32):
-      with ops.device('CPU:0'):
-        cpu_tensor = constant_op.constant(1, dtype=dtype)
+      cpu_tensor = constant_op.constant(1, dtype=dtype)
       gpu_tensor = cpu_tensor.gpu()
       self.assertAllEqual(cpu_tensor + gpu_tensor, 2.0)
 
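The test above exercises an explicit host-to-device copy via EagerTensor.gpu(), which was current at the time of this commit. A hedged equivalent that also works in later TF releases, assuming a GPU is present:

import tensorflow as tf

cpu_t = tf.constant(1, dtype=tf.int32)
with tf.device('GPU:0'):
  gpu_t = tf.identity(cpu_t)     # copies the tensor to the GPU
print((cpu_t + gpu_t).numpy())   # mixed-device add returns 2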