[XLA:GPU] Add missing synchronization when multistreaming is enabled.

Substreams must wait for the main stream; otherwise their operations might execute before their inputs (e.g., the computation's parameters) are ready.

PiperOrigin-RevId: 299575884
Change-Id: Ibf97e18c2416b89ef03d5487807745c185365e12
Author: Peter Hawkins (committed by TensorFlower Gardener)
Date: 2020-03-07 11:30:19 -08:00
Commit: 928a8730a3 (parent: d8ee1d31d5)
7 changed files with 140 additions and 6 deletions
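The fix relies on StreamExecutor's cross-stream dependencies. As a rough, illustrative sketch of the pattern (this helper is not part of the change; it only assumes the stream_executor C++ API of this era):

#include "tensorflow/stream_executor/stream.h"

namespace se = stream_executor;

// Illustrative helper, not part of this commit: order a borrowed substream
// after everything already enqueued on the main stream.
void OrderSubstreamAfterMain(se::Stream* main_stream, se::Stream* sub_stream) {
  // Without this wait, kernels enqueued on `sub_stream` may start before the
  // parameter transfers enqueued on `main_stream` have completed.
  sub_stream->ThenWaitFor(main_stream);
}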

@@ -394,3 +394,25 @@ pybind_extension(
        "//conditions:default": [],
    }),
)

tf_cc_test(
    name = "gpu_multistream_test",
    srcs = ["gpu_multistream_test.cc"],
    tags = [
        # TODO(phawkins): figure out TF test infra such that this only runs under GPU.
        "no_oss",
        "requires-gpu-nvidia",
    ],
    deps = [
        ":local_client",
        ":nvidia_gpu_device",
        "//tensorflow/compiler/xla:test",
        "//tensorflow/compiler/xla/client:executable_build_options",
        "//tensorflow/compiler/xla/client:xla_builder",
        "//tensorflow/compiler/xla/service:gpu_plugin",
        "//tensorflow/compiler/xla/tests:literal_test_util",
        "//tensorflow/core:lib",
        "//tensorflow/core:test_main",
        "//tensorflow/core/platform:random",
    ],
)

@@ -0,0 +1,98 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/python/local_client.h"
#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/core/platform/random.h"
namespace xla {
namespace {

// Regression test that verifies that substreams of a multistream GPU
// computation wait for the inputs to be produced before executing.
TEST(GpuMultiStream, Basics) {
  TF_ASSERT_OK_AND_ASSIGN(
      std::shared_ptr<PyLocalClient> client,
      GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(),
                         /*distributed_client=*/nullptr, /*node_id=*/0));
  std::shared_ptr<Device> device = client->local_devices().at(0);

  int n = 1024;
  Shape shape = ShapeUtil::MakeShape(S32, {n});
  std::vector<int32> inputs(n);
  std::vector<int32> expected_outputs(n);

  XlaBuilder builder("acomputation");
  auto p0 = Parameter(&builder, 0, shape, "param");
  auto p1 = Parameter(&builder, 1, shape, "param");
  Tuple(&builder, {Neg(p0), Neg(p1)});
  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());

  ExecutableBuildOptions build_options;
  build_options.mutable_debug_options()->set_xla_gpu_disable_multi_streaming(
      false);
  build_options.mutable_debug_options()->set_xla_gpu_use_random_streams(true);
  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<PyLocalExecutable> executable,
      PyLocalExecutable::CompileForDevices(computation, {}, &build_options,
                                           client, {{device}}));

  int64 dummy_size = 1 << 20;
  std::vector<int32> dummy_inputs(dummy_size);
  Shape dummy_shape = ShapeUtil::MakeShape(S32, {dummy_size});

  for (int i = 0; i < 100; ++i) {
    for (int i = 0; i < n; ++i) {
      inputs[i] = tensorflow::random::New64();
      expected_outputs[i] = -inputs[i];
    }
    // Transfer a large dummy buffer, behind which the inputs to the computation
    // must wait.
    TF_ASSERT_OK_AND_ASSIGN(
        auto dummy_buffer,
        PyLocalBuffer::FromHostBuffer(
            dummy_inputs.data(), dummy_shape, /*force_copy=*/false,
            /*buffer_reference=*/nullptr, client, device));
    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer0,
                            PyLocalBuffer::FromHostBuffer(
                                inputs.data(), shape, /*force_copy=*/false,
                                /*buffer_reference=*/nullptr, client, device));
    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer1,
                            PyLocalBuffer::FromHostBuffer(
                                inputs.data(), shape, /*force_copy=*/false,
                                /*buffer_reference=*/nullptr, client, device));
    // The execution may be enqueued before the transfers complete, requiring
    // adequate device-side synchronization.
    TF_ASSERT_OK_AND_ASSIGN(
        auto out_buffer,
        executable->Execute({in_buffer0.get(), in_buffer1.get()}));
    TF_ASSERT_OK_AND_ASSIGN(auto out_buffers, out_buffer->DestructureTuple());
    TF_ASSERT_OK_AND_ASSIGN(auto out_literal, out_buffers[0]->ToLiteral());
    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
    TF_ASSERT_OK_AND_ASSIGN(out_literal, out_buffers[1]->ToLiteral());
    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
  }
}

}  // namespace
}  // namespace xla

@@ -156,6 +156,7 @@ cc_library(
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service:hlo",
        "//tensorflow/compiler/xla/service:hlo_reachability",
        "//tensorflow/core/platform:random",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/memory",

@@ -161,6 +161,9 @@ Status GpuExecutable::ExecuteThunks(
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
    // Require substreams to wait for the main stream, otherwise substreams may
    // execute before the program is scheduled to start on the main stream.
    sub_streams.back()->ThenWaitFor(main_stream);
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
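This hunk covers the fork side of the synchronization. At the other end of ExecuteThunks the main stream also has to rejoin the substreams before the outputs are treated as ready; that code is not touched by this diff, but the pattern looks roughly like the sketch below (looping over the borrowed substreams is an assumption here, not a quote from the file):

    // Sketch only, not part of this hunk: before handing results back, make
    // the main stream wait for every borrowed substream.
    for (auto& sub_stream : sub_streams) {
      main_stream->ThenWaitFor(sub_stream.get());
    }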

@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_reachability.h"
#include "tensorflow/core/platform/random.h"

namespace xla {
namespace gpu {

@@ -72,13 +73,17 @@ int ComputeStreamToAssign(
     return kInvalidStreamNum;
   }

-  if (hlo.GetModule()
-          ->config()
-          .debug_options()
-          .xla_gpu_disable_multi_streaming()) {
+  const auto& debug_options = hlo.GetModule()->config().debug_options();
+  if (debug_options.xla_gpu_disable_multi_streaming()) {
     return 0;
   }

+  if (debug_options.xla_gpu_use_random_streams()) {
+    // Debug feature: make random stream assignments to try to uncover
+    // concurrency bugs.
+    return tensorflow::random::New64() % 100;
+  }
+
   if (!(IsCublasGemm(hlo) || IsMatrixMultiplication(hlo))) {
     // If `hlo` is not implemented as a GEMM, keep it close to its operands to
     // avoid excessive synchronization.
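Since tensorflow::random::New64() % 100 can return any stream number up to 99, the stream-assignment bookkeeping (not shown in this diff) has to grow its stream count to cover whatever number comes back. A minimal, self-contained sketch of that bookkeeping; apart from the AssignStreamToHlo/StreamNumberForHlo names visible in the header hunk below, everything here is an assumption:

#include <algorithm>
#include <unordered_map>

// Toy model of a stream-assignment table; the real StreamAssignment class is
// only partially visible in this diff, so this is illustrative only.
class ToyStreamAssignment {
 public:
  void AssignStreamToHlo(const void* hlo, int stream_num) {
    hlo_to_stream_number_[hlo] = stream_num;
    // Growing the count here is what keeps a random assignment safe: the
    // executable presumably borrows enough substreams to cover it later.
    stream_count_ = std::max(stream_count_, stream_num + 1);
  }
  int StreamNumberForHlo(const void* hlo) const {
    return hlo_to_stream_number_.at(hlo);
  }
  int stream_count() const { return stream_count_; }

 private:
  std::unordered_map<const void*, int> hlo_to_stream_number_;
  int stream_count_ = 1;  // At least the main stream.
};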

@@ -30,7 +30,7 @@ class StreamAssignment {
   int StreamNumberForHlo(const HloInstruction& hlo) const;
   bool HasStreamAssigned(const HloInstruction& hlo) const;
   // `hlo` needs to outlive this StreamAssignment object.
-  void AssignStreamToHlo(const HloInstruction* hlo, int stream_no);
+  void AssignStreamToHlo(const HloInstruction* hlo, int stream_num);

  private:
   int stream_count_ = 1;  // At least the main stream.

@@ -69,6 +69,10 @@ message DebugOptions {
   // Disable multi-streaming in the GPU backend.
   bool xla_gpu_disable_multi_streaming = 63;

+  // Debugging feature: if enabled, the GPU backend will assign HLO operators
+  // to randomly chosen streams. This is intended to trigger concurrency bugs.
+  bool xla_gpu_use_random_streams = 134;
+
   // If true, in LLVM-based backends, emit !alias.scope metadata in
   // generated IR.
   bool xla_llvm_enable_alias_scope_metadata = 70;

@@ -260,7 +264,8 @@ message DebugOptions {
   // Guarantee run-to-run determinism from reductions on XLA:GPU.
   bool xla_gpu_deterministic_reductions = 130;

-  // Next id: 134
+  // Next id: 135

   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
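To exercise the new option outside the regression test, a client can set it through the executable build options, mirroring what gpu_multistream_test.cc above does. A minimal sketch (whether a matching --xla_gpu_use_random_streams command-line flag is also registered is not shown in this diff):

#include "tensorflow/compiler/xla/client/executable_build_options.h"

// Sketch mirroring the new test: build options that keep multi-streaming on
// and randomize stream assignment to shake out concurrency bugs.
xla::ExecutableBuildOptions MakeRandomStreamBuildOptions() {
  xla::ExecutableBuildOptions build_options;
  auto* debug_options = build_options.mutable_debug_options();
  debug_options->set_xla_gpu_disable_multi_streaming(false);
  debug_options->set_xla_gpu_use_random_streams(true);
  return build_options;
}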