ConvertToEagerTensor now allocates on GPU if one is available and no device is specified

This allows getting rid of redundant H->D transfers. For example,
tf.gather(tf.constant([42.0]), 0) would previously allocate both [42.0]
and 0 on the CPU, and then transfer both to the GPU to compute Gather.
This could potentially hurt ops with inputs pinned to host memory, e.g. Range.

PiperOrigin-RevId: 275252442
Change-Id: I7d85d3314b9701e7b9df76acea12c2fcfdf2960e
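A minimal sketch of the placement change described above, assuming eager mode on a machine with at least one GPU and no enclosing device scope; the printed device strings are illustrative:

import tensorflow as tf

# With the change, an unpinned constant may be allocated directly on the
# GPU, so Gather no longer needs a host-to-device copy of its inputs.
params = tf.constant([42.0])   # previously CPU; now may land on GPU:0
indices = tf.constant(0)       # int32 scalar: kept in host memory regardless
out = tf.gather(params, indices)
print(params.device)           # e.g. ".../device:GPU:0"
print(out.device)              # Gather runs where its input already lives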
parent 2542bbea51
commit 17c4db8be1
tensorflow/python
@@ -1051,17 +1051,16 @@ class SendRecvTest(test_util.TensorFlowTestCase):
     configure_virtual_cpus()
 
   def testBasic(self):
-    with ops.device(self.cpu_device):
-      t0 = constant_op.constant(1.0)
-      t1 = constant_op.constant(2.0)
-      self._send(t0, 't0', self.cpu_device)
-      self._send(t1, 't1', self.cpu_device)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't0', self.cpu_device),
-          1.0)
-      self.assertAllEqual(
-          self._recv(dtypes.float32, 't1', self.cpu_device),
-          2.0)
+    t0 = constant_op.constant(1.0)
+    t1 = constant_op.constant(2.0)
+    self._send(t0, 't0', self.cpu_device)
+    self._send(t1, 't1', self.cpu_device)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't0', self.cpu_device),
+        1.0)
+    self.assertAllEqual(
+        self._recv(dtypes.float32, 't1', self.cpu_device),
+        2.0)
 
   @test_util.run_gpu_only
   def testLocalCrossDevice(self):
@@ -268,26 +268,6 @@ TFE_TensorHandle* PySeqToTFE_TensorHandle(PyObject* value, DataType dtype) {
   return new TFE_TensorHandle(handle);
 }
 
-const char* MaybeUpdateDevice(TFE_Context* ctx, DataType dtype,
-                              const char* device_name) {
-  if (!(device_name == nullptr || strcmp(device_name, "") == 0) ||
-      dtype == DT_INVALID || DataTypeAlwaysOnHost(dtype)) {
-    return device_name;
-  }
-
-  // Approximately follow the logic of SelectDevice and
-  // ColocationGraph::FilterSupportedDevices. Unlike the latter, though,
-  // here we do not sort by device name to avoid allocating a temporary.
-  const auto& devices = *(ctx->context->devices());
-  const auto first_local_gpu =
-      std::find_if(devices.begin(), devices.end(), [](const Device* dev) {
-        return dev->IsLocal() && dev->device_type() == DEVICE_GPU;
-      });
-  return first_local_gpu == devices.end()
-             ? nullptr
-             : strdup((*first_local_gpu)->name().c_str());
-}
-
 TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
                                                PyObject* value,
                                                tensorflow::DataType dtype,
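The helper removed above returns an explicitly requested device untouched and otherwise scans for the first local GPU. A rough Python analogue of that selection logic, assuming the tf.config API; the function name is hypothetical:

import tensorflow as tf

def maybe_update_device(device_name, dtype):
  # Keep an explicitly requested device, and never auto-place dtypes that
  # must live in host memory (mirrors the DataTypeAlwaysOnHost check).
  if device_name:
    return device_name
  if dtype in (tf.string, tf.resource):
    return None
  # Pick the first local GPU, if any (mirrors the std::find_if above).
  gpus = tf.config.list_logical_devices('GPU')
  return gpus[0].name if gpus else None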
@@ -369,9 +349,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
     }
   }
 
-  device_name =
-      MaybeUpdateDevice(ctx, static_cast<DataType>(handle_dtype), device_name);
-
   // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host
   // memory. We approximate the same behavior for eager execution - keeping
   // int32 tensors in host memory.
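The int32 caveat in the comment above is observable from Python; the output below is illustrative and assumes a GPU machine:

import tensorflow as tf

f = tf.constant(1.0)       # float32: gets a real GPU allocation
i = tf.constant(1)         # int32: kept in host memory by most GPU kernels
print(f.backing_device)    # e.g. ".../device:GPU:0"
print(i.backing_device)    # e.g. ".../device:CPU:0"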
@@ -418,10 +395,6 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx,
 TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
                                        DataType dtype,
                                        const char* device_name) {
-  // The device is updated twice: before the conversion using the
-  // desired dtype (if given), and after, using the effective dtype.
-  device_name = MaybeUpdateDevice(ctx, dtype, device_name);
-
   // Reduce the overhead of allocation/transfer-to-device for scalars by
   // caching the corresponding handles. Note that currently only Python
   // scalars are cached.
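The comment removed above explains why placement was resolved twice: the requested dtype and the dtype actually inferred from the Python value can differ, and the placement decision depends on the dtype. A small illustration:

import tensorflow as tf

a = tf.constant(1)                    # dtype inferred as int32 -> host memory
b = tf.constant(1, dtype=tf.float32)  # explicit float32 -> eligible for GPU
print(a.dtype, b.dtype)               # int32 float32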
@@ -433,14 +406,7 @@ TFE_TensorHandle* ConvertToEagerTensor(TFE_Context* ctx, PyObject* value,
     handle = ConvertToEagerTensorUncached(ctx, value, dtype, device_name);
     if (handle == nullptr) return nullptr;
     if (!PyFloat_Check(value) || std::isfinite(PyFloat_AS_DOUBLE(value))) {
-      // ConvertToEagerTensorUncached might have decided to allocate a
-      // tensor on a different device (e.g. due to TF_INT32 hack). Resolve
-      // device name from handle.
-      const auto* dev = handle->handle->op_device();
-      static constexpr char kEmpty[] = "";
-      cache->Insert(value, dtype,
-                    dev == nullptr ? kEmpty : strdup(dev->name().c_str()),
-                    handle);
+      cache->Insert(value, dtype, device_name, handle);
     }
     return handle;
   } else {
@@ -94,8 +94,6 @@ struct TFE_TensorHandleCache {
   // Not guarded by a mutex because the code is only used while the
   // GIL is held.
   absl::flat_hash_map<Key, tensorflow::TensorHandle*> cache;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(TFE_TensorHandleCache);
 };
 
 }  // namespace tensorflow
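For context, TFE_TensorHandleCache maps a (value, dtype, device) key to a tensor handle and relies on the GIL instead of a mutex. A minimal Python sketch of that keying scheme; all names here are hypothetical, not the library's API:

_handle_cache = {}  # (type, value, dtype, device) -> handle; GIL-guarded

def cached_scalar_handle(value, dtype, device_name, convert_fn):
  # Include the Python type in the key: True == 1 == 1.0 compare equal in
  # Python but must not share a cached handle.
  key = (type(value), value, dtype, device_name)
  handle = _handle_cache.get(key)
  if handle is None:
    handle = convert_fn(value, dtype, device_name)
    _handle_cache[key] = handle
  return handle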
@@ -374,9 +374,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
 
   def test_numpyIsView(self):
     t = constant_op.constant([0.0])
-    if not t.device.endswith("CPU:0"):
-      self.skipTest(".numpy() only returns a view on CPU")
-
     t._numpy()[0] = 42.0
     self.assertAllClose(t, constant_op.constant([42.0]))
 
@@ -68,8 +68,7 @@ class ConfigTest(test.TestCase, parameterized.TestCase):
     context.ensure_initialized()
 
     def copy_tensor(dtype=dtypes.int32):
-      with ops.device('CPU:0'):
-        cpu_tensor = constant_op.constant(1, dtype=dtype)
+      cpu_tensor = constant_op.constant(1, dtype=dtype)
       gpu_tensor = cpu_tensor.gpu()
       self.assertAllEqual(cpu_tensor + gpu_tensor, 2.0)
 
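The test above exercises an explicit host-to-device copy via EagerTensor.gpu(), which was current at the time of this commit. A hedged equivalent that also works in later TF releases, assuming a GPU is present:

import tensorflow as tf

cpu_t = tf.constant(1, dtype=tf.int32)
with tf.device('GPU:0'):
  gpu_t = tf.identity(cpu_t)     # copies the tensor to the GPU
print((cpu_t + gpu_t).numpy())   # mixed-device add returns 2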