[XLA:GPU] Add missing synchronization when multistreaming is enabled.

Substreams must wait for the main stream; otherwise their operations might execute before their inputs (e.g., the computation's parameters) are ready.

PiperOrigin-RevId: 299575884
Change-Id: Ibf97e18c2416b89ef03d5487807745c185365e12
Author: Peter Hawkins (committed by TensorFlower Gardener)
Date: 2020-03-07 11:30:19 -08:00
Commit: 928a8730a3 (parent: d8ee1d31d5)
7 changed files with 140 additions and 6 deletions
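The fix relies on StreamExecutor's cross-stream dependencies. As a rough, illustrative sketch of the pattern (this helper is not part of the change; it only assumes the stream_executor C++ API of this era):

#include "tensorflow/stream_executor/stream.h"

namespace se = stream_executor;

// Illustrative helper, not part of this commit: order a borrowed substream
// after everything already enqueued on the main stream.
void OrderSubstreamAfterMain(se::Stream* main_stream, se::Stream* sub_stream) {
  // Without this wait, kernels enqueued on `sub_stream` may start before the
  // parameter transfers enqueued on `main_stream` have completed.
  sub_stream->ThenWaitFor(main_stream);
}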

@@ -394,3 +394,25 @@ pybind_extension(
        "//conditions:default": [],
    }),
)

tf_cc_test(
    name = "gpu_multistream_test",
    srcs = ["gpu_multistream_test.cc"],
    tags = [
        # TODO(phawkins): figure out TF test infra such that this only runs under GPU.
        "no_oss",
        "requires-gpu-nvidia",
    ],
    deps = [
        ":local_client",
        ":nvidia_gpu_device",
        "//tensorflow/compiler/xla:test",
        "//tensorflow/compiler/xla/client:executable_build_options",
        "//tensorflow/compiler/xla/client:xla_builder",
        "//tensorflow/compiler/xla/service:gpu_plugin",
        "//tensorflow/compiler/xla/tests:literal_test_util",
        "//tensorflow/core:lib",
        "//tensorflow/core:test_main",
        "//tensorflow/core/platform:random",
    ],
)

@@ -0,0 +1,98 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/python/local_client.h"
#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/core/platform/random.h"
namespace xla {
namespace {

// Regression test that verifies that substreams of a multistream GPU
// computation wait for the inputs to be produced before executing.
TEST(GpuMultiStream, Basics) {
  TF_ASSERT_OK_AND_ASSIGN(
      std::shared_ptr<PyLocalClient> client,
      GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(),
                         /*distributed_client=*/nullptr, /*node_id=*/0));
  std::shared_ptr<Device> device = client->local_devices().at(0);

  int n = 1024;
  Shape shape = ShapeUtil::MakeShape(S32, {n});
  std::vector<int32> inputs(n);
  std::vector<int32> expected_outputs(n);

  XlaBuilder builder("acomputation");
  auto p0 = Parameter(&builder, 0, shape, "param");
  auto p1 = Parameter(&builder, 1, shape, "param");
  Tuple(&builder, {Neg(p0), Neg(p1)});
  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());

  ExecutableBuildOptions build_options;
  build_options.mutable_debug_options()->set_xla_gpu_disable_multi_streaming(
      false);
  build_options.mutable_debug_options()->set_xla_gpu_use_random_streams(true);
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PyLocalExecutable> executable,
      PyLocalExecutable::CompileForDevices(computation, {}, &build_options,
                                           client, {{device}}));

  int64 dummy_size = 1 << 20;
  std::vector<int32> dummy_inputs(dummy_size);
  Shape dummy_shape = ShapeUtil::MakeShape(S32, {dummy_size});

  for (int i = 0; i < 100; ++i) {
    for (int i = 0; i < n; ++i) {
      inputs[i] = tensorflow::random::New64();
      expected_outputs[i] = -inputs[i];
    }
    // Transfer a large dummy buffer, behind which the inputs to the computation
    // must wait.
    TF_ASSERT_OK_AND_ASSIGN(
        auto dummy_buffer,
        PyLocalBuffer::FromHostBuffer(
            dummy_inputs.data(), dummy_shape, /*force_copy=*/false,
            /*buffer_reference=*/nullptr, client, device));
    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer0,
                            PyLocalBuffer::FromHostBuffer(
                                inputs.data(), shape, /*force_copy=*/false,
                                /*buffer_reference=*/nullptr, client, device));
    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer1,
                            PyLocalBuffer::FromHostBuffer(
                                inputs.data(), shape, /*force_copy=*/false,
                                /*buffer_reference=*/nullptr, client, device));
    // The execution may be enqueued before the transfers complete, requiring
    // adequate device-side synchronization.
    TF_ASSERT_OK_AND_ASSIGN(
        auto out_buffer,
        executable->Execute({in_buffer0.get(), in_buffer1.get()}));
    TF_ASSERT_OK_AND_ASSIGN(auto out_buffers, out_buffer->DestructureTuple());
    TF_ASSERT_OK_AND_ASSIGN(auto out_literal, out_buffers[0]->ToLiteral());
    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
    TF_ASSERT_OK_AND_ASSIGN(out_literal, out_buffers[1]->ToLiteral());
    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
  }
}

}  // namespace
}  // namespace xla

@@ -156,6 +156,7 @@ cc_library(
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service:hlo",
        "//tensorflow/compiler/xla/service:hlo_reachability",
        "//tensorflow/core/platform:random",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/memory",

@@ -161,6 +161,9 @@ Status GpuExecutable::ExecuteThunks(
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
    // Require substreams to wait for the main stream, otherwise substreams may
    // execute before the program is scheduled to start on the main stream.
    sub_streams.back()->ThenWaitFor(main_stream);
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
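This hunk covers the fork side of the synchronization. At the other end of ExecuteThunks the main stream also has to rejoin the substreams before the outputs are treated as ready; that code is not touched by this diff, but the pattern looks roughly like the sketch below (looping over the borrowed substreams is an assumption here, not a quote from the file):

    // Sketch only, not part of this hunk: before handing results back, make
    // the main stream wait for every borrowed substream.
    for (auto& sub_stream : sub_streams) {
      main_stream->ThenWaitFor(sub_stream.get());
    }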

@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_reachability.h"
#include "tensorflow/core/platform/random.h"

namespace xla {
namespace gpu {

@@ -72,13 +73,17 @@ int ComputeStreamToAssign(
     return kInvalidStreamNum;
   }

-  if (hlo.GetModule()
-          ->config()
-          .debug_options()
-          .xla_gpu_disable_multi_streaming()) {
+  const auto& debug_options = hlo.GetModule()->config().debug_options();
+  if (debug_options.xla_gpu_disable_multi_streaming()) {
     return 0;
   }

+  if (debug_options.xla_gpu_use_random_streams()) {
+    // Debug feature: make random stream assignments to try to uncover
+    // concurrency bugs.
+    return tensorflow::random::New64() % 100;
+  }
+
   if (!(IsCublasGemm(hlo) || IsMatrixMultiplication(hlo))) {
     // If `hlo` is not implemented as a GEMM, keep it close to its operands to
     // avoid excessive synchronization.
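Since tensorflow::random::New64() % 100 can return any stream number up to 99, the stream-assignment bookkeeping (not shown in this diff) has to grow its stream count to cover whatever number comes back. A minimal, self-contained sketch of that bookkeeping; apart from the AssignStreamToHlo/StreamNumberForHlo names visible in the header hunk below, everything here is an assumption:

#include <algorithm>
#include <unordered_map>

// Toy model of a stream-assignment table; the real StreamAssignment class is
// only partially visible in this diff, so this is illustrative only.
class ToyStreamAssignment {
 public:
  void AssignStreamToHlo(const void* hlo, int stream_num) {
    hlo_to_stream_number_[hlo] = stream_num;
    // Growing the count here is what keeps a random assignment safe: the
    // executable presumably borrows enough substreams to cover it later.
    stream_count_ = std::max(stream_count_, stream_num + 1);
  }
  int StreamNumberForHlo(const void* hlo) const {
    return hlo_to_stream_number_.at(hlo);
  }
  int stream_count() const { return stream_count_; }

 private:
  std::unordered_map<const void*, int> hlo_to_stream_number_;
  int stream_count_ = 1;  // At least the main stream.
};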

@@ -30,7 +30,7 @@ class StreamAssignment {
   int StreamNumberForHlo(const HloInstruction& hlo) const;
   bool HasStreamAssigned(const HloInstruction& hlo) const;
   // `hlo` needs to outlive this StreamAssignment object.
-  void AssignStreamToHlo(const HloInstruction* hlo, int stream_no);
+  void AssignStreamToHlo(const HloInstruction* hlo, int stream_num);

  private:
   int stream_count_ = 1;  // At least the main stream.

@@ -69,6 +69,10 @@ message DebugOptions {
   // Disable multi-streaming in the GPU backend.
   bool xla_gpu_disable_multi_streaming = 63;

+  // Debugging feature: if enabled, the GPU backend will assign HLO operators
+  // to randomly chosen streams. This is intended to trigger concurrency bugs.
+  bool xla_gpu_use_random_streams = 134;
+
   // If true, in LLVM-based backends, emit !alias.scope metadata in
   // generated IR.
   bool xla_llvm_enable_alias_scope_metadata = 70;

@@ -260,7 +264,8 @@ message DebugOptions {
   // Guarantee run-to-run determinism from reductions on XLA:GPU.
   bool xla_gpu_deterministic_reductions = 130;

-  // Next id: 134
+  // Next id: 135

   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
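To exercise the new option outside the regression test, a client can set it through the executable build options, mirroring what gpu_multistream_test.cc above does. A minimal sketch (whether a matching --xla_gpu_use_random_streams command-line flag is also registered is not shown in this diff):

#include "tensorflow/compiler/xla/client/executable_build_options.h"

// Sketch mirroring the new test: build options that keep multi-streaming on
// and randomize stream assignment to shake out concurrency bugs.
xla::ExecutableBuildOptions MakeRandomStreamBuildOptions() {
  xla::ExecutableBuildOptions build_options;
  auto* debug_options = build_options.mutable_debug_options();
  debug_options->set_xla_gpu_disable_multi_streaming(false);
  debug_options->set_xla_gpu_use_random_streams(true);
  return build_options;
}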