Merge pull request #42615 from drebain:drebain_patch

PiperOrigin-RevId: 328985012
Change-Id: I0b4c461f07e8a838df05bfa329efdb1a8f1293f1
TensorFlower Gardener 2020-08-28 12:19:18 -07:00
commit 16895e59b8
3 changed files with 29 additions and 0 deletions

tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc

@@ -48,6 +48,14 @@ limitations under the License.
 #include "tensorflow/core/util/gpu_kernel_helper.h"
 #include "tensorflow/core/util/transform_output_iterator.h"

+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+using stream_executor::cuda::ScopedActivateExecutorContext;
+#elif TENSORFLOW_USE_ROCM
+#include "tensorflow/core/platform/rocm.h"
+using stream_executor::rocm::ScopedActivateExecutorContext;
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {

 typedef Eigen::GpuDevice GPUDevice;
@@ -302,6 +310,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     TensorReference partition_ref(partition_count);
     auto wrapped_callback = [this, c, &data, &partitions, indices_out,
                              partition_ref, cpu_tensor, done]() {
+      auto stream = c->op_device_context()->stream();
+      ScopedActivateExecutorContext scoped_activation{stream->parent()};
+
       OpOutputList outputs;
       this->AllocateOutputs(c, &data, &partitions, &cpu_tensor, &outputs, done);
       if (!c->status().ok()) {
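The substance of the fix is the two lines added inside `wrapped_callback`: the callback appears to run on a host thread once the device-to-host copy of the partition counts completes, and that thread is not guaranteed to have the right GPU context active, so `ScopedActivateExecutorContext` pins the context of the device that owns `stream` before `AllocateOutputs` touches GPU memory. A minimal sketch of the user-visible failure this addresses (a hypothetical standalone repro of issue #42500 using the public `tf.*` API; everything outside the diff is an assumption):

```python
# Hypothetical repro sketch for issue #42500: before this fix,
# tf.dynamic_partition could fail on any GPU other than GPU:0 because the
# op's async host callback ran without that device's context active.
import numpy as np
import tensorflow as tf

for device in tf.config.list_logical_devices("GPU"):
  with tf.device(device.name):
    data = tf.constant(np.zeros((1000,)))
    partitions = tf.constant(np.arange(1000, dtype=np.int32) % 10)
    parts = tf.dynamic_partition(data, partitions, 10)  # 10 pieces of 100
    print(device.name, [tuple(p.shape) for p in parts])
```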

tensorflow/python/kernel_tests/BUILD

@@ -2048,6 +2048,9 @@ cuda_py_test(
     name = "dynamic_partition_op_test",
     size = "medium",
     srcs = ["dynamic_partition_op_test.py"],
+    tags = [
+        "multi_and_single_gpu",
+    ],
     tfrt_enabled = True,
     deps = [
         "//tensorflow/python:array_ops",

tensorflow/python/kernel_tests/dynamic_partition_op_test.py

@@ -23,8 +23,10 @@ import unittest
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin

+from tensorflow.python.framework import config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
@@ -346,6 +348,19 @@ class DynamicPartitionTest(test.TestCase):
     res = self.evaluate(partitioned)
     self.assertEqual(res[-1].shape[0], 192)

+  # see https://github.com/tensorflow/tensorflow/issues/42500
+  def testMultiGPU(self):
+    device_list = config.list_logical_devices("GPU")
+    results = []
+    for device in device_list:
+      with ops.device(device.name):
+        data = constant_op.constant(np.zeros((1000,)))
+        partitions = constant_op.constant(np.arange(1000, dtype=np.int32) % 10)
+        result = data_flow_ops.dynamic_partition(data, partitions, 10)
+        results.append(self.evaluate(result))
+    if device_list:
+      self.assertAllEqual(results, np.zeros((len(device_list), 10, 100)))
+

 if __name__ == "__main__":
   test.main()
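As a sanity check on the shape the test asserts: `np.arange(1000) % 10` assigns exactly 100 of the 1000 elements to each of the 10 partition ids, so each device produces 10 partitions of shape `(100,)`, and stacking the per-device results gives `(len(device_list), 10, 100)`. A quick standalone verification in plain NumPy (no TensorFlow required):

```python
import numpy as np

# Mirror the test's inputs: 1000 elements partitioned by index mod 10.
partitions = np.arange(1000, dtype=np.int32) % 10
counts = np.bincount(partitions, minlength=10)

# Every partition id 0..9 receives exactly 100 elements, so each of the
# 10 outputs of dynamic_partition has shape (100,).
assert counts.shape == (10,) and (counts == 100).all()
```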