[XLA:GPU] Add missing synchronization when multistreaming is enabled.
Substreams must wait for the main stream, otherwise their operations might execute before, e.g., the parameters are ready.

PiperOrigin-RevId: 299575884
Change-Id: Ibf97e18c2416b89ef03d5487807745c185365e12

Parent commit: d8ee1d31d5
This commit: 928a8730a3
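For context, the heart of the fix (see the GpuExecutable::ExecuteThunks hunk below) is a device-side dependency: each borrowed substream first waits for the main stream, so work later enqueued on a substream cannot begin until everything already enqueued on the main stream, such as the parameter transfers, has executed. The following is a minimal sketch of that pattern using StreamExecutor-style streams; the helper name and scaffolding are illustrative only and not part of this change.

#include "absl/types/span.h"
#include "tensorflow/stream_executor/stream.h"

namespace se = stream_executor;

// Sketch only: record, on the device, that each substream depends on all work
// already enqueued on the main stream. ThenWaitFor inserts the dependency on
// the GPU; it does not block the host thread.
void WaitForMainStream(se::Stream* main_stream,
                       absl::Span<se::Stream* const> sub_streams) {
  for (se::Stream* sub : sub_streams) {
    sub->ThenWaitFor(main_stream);
  }
}

Without this wait, stream assignment (including the new xla_gpu_use_random_streams debug option) may enqueue an operation on a substream before its inputs are ready on the main stream, which is exactly the situation the regression test below provokes.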
@@ -394,3 +394,25 @@ pybind_extension(
         "//conditions:default": [],
     }),
 )
+
+tf_cc_test(
+    name = "gpu_multistream_test",
+    srcs = ["gpu_multistream_test.cc"],
+    tags = [
+        # TODO(phawkins): figure out TF test infra such that this only runs under GPU.
+        "no_oss",
+        "requires-gpu-nvidia",
+    ],
+    deps = [
+        ":local_client",
+        ":nvidia_gpu_device",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:executable_build_options",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/platform:random",
+    ],
+)
tensorflow/compiler/xla/python/gpu_multistream_test.cc (new file, 98 lines)
@@ -0,0 +1,98 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/python/local_client.h"
+#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/platform/random.h"
+
+namespace xla {
+namespace {
+
+// Regression test that verifies that substreams of a multistream GPU
+// computation wait for the inputs to be produced before executing.
+TEST(GpuMultiStream, Basics) {
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::shared_ptr<PyLocalClient> client,
+      GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(),
+                         /*distributed_client=*/nullptr, /*node_id=*/0));
+
+  std::shared_ptr<Device> device = client->local_devices().at(0);
+
+  int n = 1024;
+  Shape shape = ShapeUtil::MakeShape(S32, {n});
+  std::vector<int32> inputs(n);
+  std::vector<int32> expected_outputs(n);
+
+  XlaBuilder builder("acomputation");
+  auto p0 = Parameter(&builder, 0, shape, "param");
+  auto p1 = Parameter(&builder, 1, shape, "param");
+  Tuple(&builder, {Neg(p0), Neg(p1)});
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());
+
+  ExecutableBuildOptions build_options;
+  build_options.mutable_debug_options()->set_xla_gpu_disable_multi_streaming(
+      false);
+  build_options.mutable_debug_options()->set_xla_gpu_use_random_streams(true);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<PyLocalExecutable> executable,
+      PyLocalExecutable::CompileForDevices(computation, {}, &build_options,
+                                           client, {{device}}));
+
+  int64 dummy_size = 1 << 20;
+  std::vector<int32> dummy_inputs(dummy_size);
+  Shape dummy_shape = ShapeUtil::MakeShape(S32, {dummy_size});
+
+  for (int i = 0; i < 100; ++i) {
+    for (int i = 0; i < n; ++i) {
+      inputs[i] = tensorflow::random::New64();
+      expected_outputs[i] = -inputs[i];
+    }
+    // Transfer a large dummy buffer, behind which the inputs to the computation
+    // must wait.
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto dummy_buffer,
+        PyLocalBuffer::FromHostBuffer(
+            dummy_inputs.data(), dummy_shape, /*force_copy=*/false,
+            /*buffer_reference=*/nullptr, client, device));
+    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer0,
+                            PyLocalBuffer::FromHostBuffer(
+                                inputs.data(), shape, /*force_copy=*/false,
+                                /*buffer_reference=*/nullptr, client, device));
+    TF_ASSERT_OK_AND_ASSIGN(auto in_buffer1,
+                            PyLocalBuffer::FromHostBuffer(
+                                inputs.data(), shape, /*force_copy=*/false,
+                                /*buffer_reference=*/nullptr, client, device));
+    // The execution may be enqueued before the transfers complete, requiring
+    // adequate device-side synchronization.
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto out_buffer,
+        executable->Execute({in_buffer0.get(), in_buffer1.get()}));
+
+    TF_ASSERT_OK_AND_ASSIGN(auto out_buffers, out_buffer->DestructureTuple());
+
+    TF_ASSERT_OK_AND_ASSIGN(auto out_literal, out_buffers[0]->ToLiteral());
+    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
+    TF_ASSERT_OK_AND_ASSIGN(out_literal, out_buffers[1]->ToLiteral());
+    LiteralTestUtil::ExpectR1Equal<int32>(expected_outputs, *out_literal);
+  }
+}
+
+}  // namespace
+}  // namespace xla
@@ -156,6 +156,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
+        "//tensorflow/core/platform:random",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -161,6 +161,9 @@ Status GpuExecutable::ExecuteThunks(
     sub_streams.emplace_back();
     TF_ASSIGN_OR_RETURN(sub_streams.back(),
                         run_options->BorrowStream(executor->device_ordinal()));
+    // Require substreams to wait for the main stream, otherwise substreams may
+    // execute before the program is scheduled to start on the main stream.
+    sub_streams.back()->ThenWaitFor(main_stream);
   }
 
   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
+#include "tensorflow/core/platform/random.h"
 
 namespace xla {
 namespace gpu {
@@ -72,13 +73,17 @@ int ComputeStreamToAssign(
     return kInvalidStreamNum;
   }
 
-  if (hlo.GetModule()
-          ->config()
-          .debug_options()
-          .xla_gpu_disable_multi_streaming()) {
+  const auto& debug_options = hlo.GetModule()->config().debug_options();
+  if (debug_options.xla_gpu_disable_multi_streaming()) {
     return 0;
   }
 
+  if (debug_options.xla_gpu_use_random_streams()) {
+    // Debug feature: make random stream assignments to try to uncover
+    // concurrency bugs.
+    return tensorflow::random::New64() % 100;
+  }
+
   if (!(IsCublasGemm(hlo) || IsMatrixMultiplication(hlo))) {
     // If `hlo` is not implemented as a GEMM, keep it close to its operands to
     // avoid excessive synchronization.
@@ -30,7 +30,7 @@ class StreamAssignment {
   int StreamNumberForHlo(const HloInstruction& hlo) const;
   bool HasStreamAssigned(const HloInstruction& hlo) const;
   // `hlo` needs to outlive this StreamAssignment object.
-  void AssignStreamToHlo(const HloInstruction* hlo, int stream_no);
+  void AssignStreamToHlo(const HloInstruction* hlo, int stream_num);
 
  private:
   int stream_count_ = 1;  // At least the main stream.
@@ -69,6 +69,10 @@ message DebugOptions {
   // Disable multi-streaming in the GPU backend.
   bool xla_gpu_disable_multi_streaming = 63;
 
+  // Debugging feature: if enabled, the GPU backend will assign HLO operators to
+  // randomly chosen streams. This is intended to trigger concurrency bugs.
+  bool xla_gpu_use_random_streams = 134;
+
   // If true, in LLVM-based backends, emit !alias.scope metadata in
   // generated IR.
   bool xla_llvm_enable_alias_scope_metadata = 70;
@@ -260,7 +264,8 @@ message DebugOptions {
 
   // Guarantee run-to-run determinism from reductions on XLA:GPU.
   bool xla_gpu_deterministic_reductions = 130;
-  // Next id: 134
 
+  // Next id: 135
+
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.