ConvertToEagerTensor now allocates on GPU if one is available and no device is specified

This allows us to get rid of redundant H->D (host-to-device) transfers. For example,

  tf.gather(tf.constant([42.0]), 0)

would previously allocate both [42.0] and 0 on CPU, and then transfer
both to GPU to compute Gather.

This could potentially hurt ops with inputs pinned to host memory, e.g. Range.
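
For instance, the scalar start/limit inputs of

  tf.range(tf.constant(0), tf.constant(10))

are expected in host memory by the Range kernel, so allocating them on
GPU first would add D->H transfers (illustrative example, not taken from
this change).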

PiperOrigin-RevId: 275252442
Change-Id: I7d85d3314b9701e7b9df76acea12c2fcfdf2960e
Sergei Lebedev 2019-10-17 07:18:04 -07:00 committed by TensorFlower Gardener
parent 2542bbea51
commit 17c4db8be1
5 changed files with 12 additions and 53 deletions


@@ -1051,17 +1051,16 @@ class SendRecvTest(test_util.TensorFlowTestCase):
     configure_virtual_cpus()
 
   def testBasic(self):
-    with ops.device(self.cpu_device):
-      t0 = constant_op.constant(1.0)
-      t1 = constant_op.constant(2.0)
-      self._send(t0, 't0', self.cpu_device)
-      self._send(t1, 't1', self.cpu_device)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't0', self.cpu_device),
-          1.0)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't1', self.cpu_device),
-          2.0)
+    t0 = constant_op.constant(1.0)
+    t1 = constant_op.constant(2.0)
+    self._send(t0, 't0', self.cpu_device)
+    self._send(t1, 't1', self.cpu_device)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't0', self.cpu_device),
+        1.0)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't1', self.cpu_device),
+        2.0)
 
   @test_util.run_gpu_only
   def testLocalCrossDevice(self):


@@ -268,26 +268,6 @@ TFE_TensorHandle* PySeqToTFE_TensorHandle(PyObject* value, DataType dtype) {
   return new TFE_TensorHandle(handle);
 }
 
-const char* MaybeUpdateDevice(TFE_Context* ctx, DataType dtype,
-                              const char* device_name) {
-  if (!(device_name == nullptr || strcmp(device_name, "") == 0) ||
-      dtype == DT_INVALID || DataTypeAlwaysOnHost(dtype)) {
-    return device_name;
-  }
-
-  // Approximately follow the logic of SelectDevice and
-  // ColocationGraph::FilterSupportedDevices. Unlike the latter, though,
-  // here we do not sort by device name to avoid allocating a temporary.
-  const auto& devices = *(ctx->context->devices());
-  const auto first_local_gpu =
-      std::find_if(devices.begin(), devices.end(), [](const Device* dev) {
-        return dev->IsLocal() && dev->device_type() == DEVICE_GPU;
-      });
-  return first_local_gpu == devices.end()
-             ? nullptr
-             : strdup((*first_local_gpu)->name().c_str());
-}
-
 TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
                                                PyObject* value,
                                                tensorflow::DataType dtype,
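
Aside: MaybeUpdateDevice above only kicks in when no device was requested
and the dtype is not always-on-host; it returns the name of the first
local GPU, if any. A quick, hypothetical way to check where an unplaced
constant lands from Python:

  import tensorflow as tf

  t = tf.constant([42.0])
  print(t.device)  # full device name, e.g. '/job:localhost/replica:0/task:0/device:CPU:0'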
@@ -369,9 +349,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
     }
   }
 
-  device_name =
-      MaybeUpdateDevice(ctx, static_cast<DataType>(handle_dtype), device_name);
-
   // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host
   // memory. We approximate the same behavior for eager execution - keeping
   // int32 tensors in host memory.
@@ -418,10 +395,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
 TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
                                        DataType dtype,
                                        const char* device_name) {
-  // The device is updated twice: before the conversion using the
-  // desired dtype (if given), and after, using the effective dtype.
-  device_name = MaybeUpdateDevice(ctx, dtype, device_name);
-
   // Reduce the overhead of allocation/transfer-to-device for scalars by
   // caching the corresponding handles. Note that currently only Python
   // scalars are cached.
@@ -433,14 +406,7 @@ TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
     handle = ConvertToEagerTensorUncached(ctx, value, dtype, device_name);
     if (handle == nullptr) return nullptr;
     if (!PyFloat_Check(value) || std::isfinite(PyFloat_AS_DOUBLE(value))) {
-      // ConvertToEagerTensorUncached might have decided to allocate a
-      // tensor on a different device (e.g. due to TF_INT32 hack). Resolve
-      // device name from handle.
-      const auto* dev = handle->handle->op_device();
-      static constexpr char kEmpty[] = "";
-      cache->Insert(value, dtype,
-                    dev == nullptr ? kEmpty : strdup(dev->name().c_str()),
-                    handle);
+      cache->Insert(value, dtype, device_name, handle);
     }
     return handle;
   } else {
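
Aside: the branch above memoizes handles for Python scalars, keyed by
value, dtype, and device; non-finite floats are skipped because NaN
compares unequal to itself and would never hit the cache. A minimal
Python sketch of the same idea (hypothetical names, not TensorFlow's
actual API):

  import math
  import tensorflow as tf

  _scalar_cache = {}

  def cached_constant(value, dtype, device):
    # Reuse one handle per (value, dtype, device); skip non-finite
    # floats, mirroring the isfinite() guard in the C++ code.
    if isinstance(value, float) and not math.isfinite(value):
      return tf.constant(value, dtype=dtype)
    key = (value, dtype, device)
    if key not in _scalar_cache:
      with tf.device(device):
        _scalar_cache[key] = tf.constant(value, dtype=dtype)
    return _scalar_cache[key]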


@@ -94,8 +94,6 @@ struct TFE_TensorHandleCache {
   // Not guarded by a mutex because the code is only used while the
   // GIL is held.
   absl::flat_hash_map<Key, tensorflow::TensorHandle*> cache;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(TFE_TensorHandleCache);
 };
 
 }  // namespace tensorflow


@@ -374,9 +374,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def test_numpyIsView(self):
     t = constant_op.constant([0.0])
-    if not t.device.endswith("CPU:0"):
-      self.skipTest(".numpy() only returns a view on CPU")
-
     t._numpy()[0] = 42.0
     self.assertAllClose(t, constant_op.constant([42.0]))


@@ -68,8 +68,7 @@ class ConfigTest(test.TestCase, parameterized.TestCase):
     context.ensure_initialized()
 
     def copy_tensor(dtype=dtypes.int32):
-      with ops.device('CPU:0'):
-        cpu_tensor = constant_op.constant(1, dtype=dtype)
+      cpu_tensor = constant_op.constant(1, dtype=dtype)
       gpu_tensor = cpu_tensor.gpu()
       self.assertAllEqual(cpu_tensor + gpu_tensor, 2.0)