[XLA/GPU] Use SequentialHloOrdering for single stream modules.

Also simplify tests in gpu_hlo_schedule_test.

PiperOrigin-RevId: 358025620
Change-Id: I1de2f85c91765e967865c6ea90bc629627146559
parent 7716b719e5
commit 2c332fbc4b
@@ -520,9 +520,9 @@ GpuCompiler::RunHloPassesAndBufferAssignement(
 
   std::unique_ptr<StreamAssignment> stream_assignment =
       AssignStreams(*hlo_module);
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuHloSchedule> hlo_schedule,
-                      GpuHloSchedule::Build(hlo_module.get(),
-                                            *stream_assignment, pointer_size_));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<GpuHloSchedule> hlo_schedule,
+      GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size_));
 
   auto buffer_size_bytes_function =
       [this](const BufferValue& buffer_value) -> int64 {
@@ -565,7 +565,7 @@ static Status CompileModuleToLlvmIrImpl(
       AssignStreams(*hlo_module);
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<GpuHloSchedule> hlo_schedule,
-      GpuHloSchedule::Build(hlo_module, *stream_assignment, pointer_size));
+      GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size));
 
   auto buffer_size_bytes_function =
       [pointer_size](const BufferValue& buffer_value) -> int64 {
@@ -190,29 +190,30 @@ GpuHloSchedule::GpuHloSchedule() {}
 
 /* static */
 StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
-    HloModule* module, const StreamAssignment& stream_assignment,
+    const HloModule& module, const StreamAssignment& stream_assignment,
     int64 pointer_size) {
   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
 
   // Initialize thunk_launch_order_, the total order of thunk launches.
-  HloComputation* entry_computation = module->entry_computation();
+  HloComputation* entry_computation = module.entry_computation();
   if (stream_assignment.StreamCount() == 1) {
+    // All kernels are launched on a single stream, so there's no loss of
+    // concurrency by optimizing for minimal memory usage.
     TF_ASSIGN_OR_RETURN(
-        HloSchedule sequences,
-        ScheduleModule(module, [pointer_size](const BufferValue& buffer) {
-          return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
-        }));
-    schedule->thunk_launch_order_ =
-        sequences.sequence(entry_computation).instructions();
-    schedule->hlo_ordering_ =
-        absl::make_unique<SequentialHloOrdering>(sequences);
+        HloInstructionSequence sequence,
+        ScheduleComputation(
+            entry_computation, [pointer_size](const BufferValue& buffer) {
+              return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
+            }));
+    schedule->thunk_launch_order_ = sequence.instructions();
   } else {
     // BFS tends to increase concurrency, but also increases memory usage.
     BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_);
-    schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
-        module, stream_assignment, schedule->thunk_launch_order_);
   }
 
+  schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
+      &module, stream_assignment, schedule->thunk_launch_order_);
+
   return std::move(schedule);
 }
 
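The if/else above picks the thunk launch order: on a single stream the schedule is chosen to minimize live memory, while with several streams a breadth-first order is preferred because it tends to place independent instructions next to each other, letting them overlap across streams at the cost of longer buffer lifetimes. As a rough, standalone illustration of the breadth-first idea (the graph, names, and representation are invented for this sketch; it is not the actual BFSLaunchOrder implementation), a BFS-style launch order over a tiny dependency graph can be computed like this:

// Toy sketch only: a breadth-first launch order over a tiny dependency graph.
// The graph, names, and representation are invented for illustration; this is
// not XLA code.
#include <iostream>
#include <map>
#include <queue>
#include <string>
#include <vector>

int main() {
  // producer -> consumers, loosely mirroring dot1 = x*y and dot2 = dot1*z.
  std::map<std::string, std::vector<std::string>> users = {
      {"x", {"dot1"}}, {"y", {"dot1"}}, {"z", {"dot2"}}, {"dot1", {"dot2"}}};
  // Number of operands not yet scheduled for each non-parameter instruction.
  std::map<std::string, int> pending = {{"dot1", 2}, {"dot2", 2}};

  // Start from the parameters (no operands) and expand breadth-first, so that
  // independent instructions (here x, y, z) end up adjacent in the order and
  // can be launched back to back on different streams.
  std::queue<std::string> ready;
  for (const char* param : {"x", "y", "z"}) ready.push(param);

  std::vector<std::string> launch_order;
  while (!ready.empty()) {
    std::string node = ready.front();
    ready.pop();
    launch_order.push_back(node);
    for (const std::string& user : users[node]) {
      if (--pending[user] == 0) ready.push(user);  // all operands scheduled
    }
  }

  for (const std::string& node : launch_order) std::cout << node << " ";
  std::cout << "\n";  // prints: x y z dot1 dot2
}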
@@ -41,7 +41,7 @@ class GpuHloSchedule {
   // Constructs an GpuHloSchedule for the given module, based on the given
   // stream assignment.
   static StatusOr<std::unique_ptr<GpuHloSchedule>> Build(
-      HloModule* module, const StreamAssignment& stream_assignment,
+      const HloModule& module, const StreamAssignment& stream_assignment,
       int64 pointer_size);
 
   // Returns the total order of thunk launches, represented in terms of HLO
@@ -39,7 +39,7 @@ class GpuHloScheduleTest : public HloTestBase {
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
 
   static std::unique_ptr<GpuHloSchedule> BuildGpuHloSchedule(
-      HloModule* module, const StreamAssignment& streams) {
+      const HloModule& module, const StreamAssignment& streams) {
     return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8)
         .ConsumeValueOrDie();
   }
@@ -86,7 +86,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   EXPECT_EQ(streams->StreamNumberForHlo(*dot1),
             streams->StreamNumberForHlo(*dot2));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
             HloVec({dot1, dot2}));
@@ -94,10 +94,32 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   // Parameters x,y,z are mutually unordered, while dot1 and dot2 are
   // transitively ordered by operands.
   auto order = schedule->ConsumeHloOrdering();
-  EXPECT_TRUE(order->ExecutesBefore(z, y));
-  EXPECT_TRUE(order->ExecutesBefore(y, x));
   EXPECT_TRUE(order->ExecutesBefore(x, dot1));
+  EXPECT_TRUE(order->ExecutesBefore(x, dot2));
+  EXPECT_TRUE(order->ExecutesBefore(y, dot1));
+  EXPECT_TRUE(order->ExecutesBefore(y, dot2));
+  EXPECT_TRUE(order->ExecutesBefore(z, dot2));
   EXPECT_TRUE(order->ExecutesBefore(dot1, dot2));
+
+  EXPECT_FALSE(order->ExecutesBefore(x, x));
+  EXPECT_FALSE(order->ExecutesBefore(x, y));
+  EXPECT_FALSE(order->ExecutesBefore(x, z));
+  EXPECT_FALSE(order->ExecutesBefore(y, x));
+  EXPECT_FALSE(order->ExecutesBefore(y, y));
+  EXPECT_FALSE(order->ExecutesBefore(y, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, x));
+  EXPECT_FALSE(order->ExecutesBefore(z, y));
+  EXPECT_FALSE(order->ExecutesBefore(z, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, x));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, y));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, z));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, x));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, y));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, z));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
 }
 
 // Test of a single stream, where data dependencies do not fully determine the
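The assertions above exercise two different notions of ExecutesBefore: ordering that follows only from (transitive) operand dependencies, under which the parameters x, y, z stay mutually unordered, versus ordering read off one total launch sequence, under which every pair becomes comparable. The following standalone sketch (toy types, names, and graph; not XLA's HloOrdering API, and ignoring stream assignment, which the next test covers) contrasts the two queries:

// Toy sketch only: two ways to answer "does a execute before b?".
// Everything here (types, names, graph) is invented for illustration and is
// not XLA's HloOrdering API.
#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Sequential flavour: a executes before b iff a appears earlier in one total
// launch order.
bool SequentialExecutesBefore(const std::vector<std::string>& order,
                              const std::string& a, const std::string& b) {
  auto pos_a = std::find(order.begin(), order.end(), a);
  auto pos_b = std::find(order.begin(), order.end(), b);
  return pos_a != order.end() && pos_b != order.end() && pos_a < pos_b;
}

// Dependency flavour: a executes before b only if b transitively consumes a.
bool DependencyExecutesBefore(
    const std::vector<std::pair<std::string, std::string>>& edges,
    const std::string& a, const std::string& b) {
  std::vector<std::string> stack = {a};
  std::vector<std::string> seen;
  while (!stack.empty()) {
    std::string node = stack.back();
    stack.pop_back();
    for (const auto& edge : edges) {
      if (edge.first != node) continue;
      if (edge.second == b) return true;
      if (std::find(seen.begin(), seen.end(), edge.second) == seen.end()) {
        seen.push_back(edge.second);
        stack.push_back(edge.second);
      }
    }
  }
  return false;
}

int main() {
  // dot1 = x*y and dot2 = dot1*z, as in the SequentialMatMul test.
  std::vector<std::pair<std::string, std::string>> edges = {
      {"x", "dot1"}, {"y", "dot1"}, {"dot1", "dot2"}, {"z", "dot2"}};
  std::vector<std::string> launch_order = {"x", "y", "z", "dot1", "dot2"};

  // Under a total order the parameters become comparable...
  std::cout << SequentialExecutesBefore(launch_order, "x", "y");  // 1
  // ...while under pure data dependencies they stay unordered, matching the
  // EXPECT_FALSE checks in the test.
  std::cout << DependencyExecutesBefore(edges, "x", "y");     // 0
  std::cout << DependencyExecutesBefore(edges, "x", "dot2")   // 1
            << "\n";
}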
@@ -126,7 +148,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   EXPECT_EQ(streams->StreamNumberForHlo(*add1),
             streams->StreamNumberForHlo(*add3));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
             HloVec({add1, add2, add3}));
@@ -134,11 +156,43 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are
   // transitively ordered by operands.
   auto order = schedule->ConsumeHloOrdering();
-  EXPECT_TRUE(order->ExecutesBefore(y, z));
-  EXPECT_TRUE(order->ExecutesBefore(z, x));
   EXPECT_TRUE(order->ExecutesBefore(x, add1));
-  EXPECT_TRUE(order->ExecutesBefore(add1, add2));
+  EXPECT_TRUE(order->ExecutesBefore(x, add2));
+  EXPECT_TRUE(order->ExecutesBefore(x, add3));
+  EXPECT_TRUE(order->ExecutesBefore(y, add1));
+  EXPECT_TRUE(order->ExecutesBefore(y, add2));
+  EXPECT_TRUE(order->ExecutesBefore(y, add3));
+  EXPECT_TRUE(order->ExecutesBefore(z, add2));
+  EXPECT_TRUE(order->ExecutesBefore(z, add3));
+  EXPECT_TRUE(order->ExecutesBefore(add1, add3));
   EXPECT_TRUE(order->ExecutesBefore(add2, add3));
+  // The HLO graph does not define an ordering for add1 and add2, but their
+  // assignment onto the same stream does define an ordering.
+  if (order->ExecutesBefore(add1, add2)) {
+    EXPECT_FALSE(order->ExecutesBefore(add2, add1));
+  } else {
+    EXPECT_TRUE(order->ExecutesBefore(add2, add1));
+    EXPECT_FALSE(order->ExecutesBefore(add1, add2));
+  }
+
+  EXPECT_FALSE(order->ExecutesBefore(x, x));
+  EXPECT_FALSE(order->ExecutesBefore(x, y));
+  EXPECT_FALSE(order->ExecutesBefore(x, z));
+  EXPECT_FALSE(order->ExecutesBefore(y, x));
+  EXPECT_FALSE(order->ExecutesBefore(y, y));
+  EXPECT_FALSE(order->ExecutesBefore(y, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, x));
+  EXPECT_FALSE(order->ExecutesBefore(z, y));
+  EXPECT_FALSE(order->ExecutesBefore(z, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, add1));
+  EXPECT_FALSE(order->ExecutesBefore(add1, x));
+  EXPECT_FALSE(order->ExecutesBefore(add1, y));
+  EXPECT_FALSE(order->ExecutesBefore(add1, z));
+  EXPECT_FALSE(order->ExecutesBefore(add1, add1));
+  EXPECT_FALSE(order->ExecutesBefore(add2, x));
+  EXPECT_FALSE(order->ExecutesBefore(add2, y));
+  EXPECT_FALSE(order->ExecutesBefore(add2, z));
+  EXPECT_FALSE(order->ExecutesBefore(add2, add2));
 }
 
 // Test of two streams.
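As the comment in the test above notes, add1 and add2 have no data dependency on each other, yet running them on the same stream in a fixed launch order still makes one execute before the other. A minimal standalone sketch of that same-stream rule (invented names, a hard-coded launch order, and toy types; not XLA's GpuHloOrdering):

// Toy sketch only: how a stream assignment plus a launch order can sequence
// two instructions that have no data dependency. Invented names and types;
// not XLA's GpuHloOrdering.
#include <iostream>
#include <map>
#include <string>
#include <vector>

// True if a and b run on the same stream and a is launched earlier.
bool SameStreamExecutesBefore(const std::map<std::string, int>& stream_of,
                              const std::vector<std::string>& launch_order,
                              const std::string& a, const std::string& b) {
  if (stream_of.at(a) != stream_of.at(b)) return false;
  int pos_a = -1, pos_b = -1;
  for (int i = 0; i < static_cast<int>(launch_order.size()); ++i) {
    if (launch_order[i] == a) pos_a = i;
    if (launch_order[i] == b) pos_b = i;
  }
  return pos_a >= 0 && pos_b >= 0 && pos_a < pos_b;
}

int main() {
  // add1 = x+y and add2 = y+z do not depend on each other, but a single-stream
  // assignment and the thunk launch order still put add1 ahead of add2 here.
  std::map<std::string, int> stream_of = {{"add1", 0}, {"add2", 0}, {"add3", 0}};
  std::vector<std::string> launch_order = {"add1", "add2", "add3"};

  std::cout << SameStreamExecutesBefore(stream_of, launch_order, "add1", "add2")  // 1
            << SameStreamExecutesBefore(stream_of, launch_order, "add2", "add1")  // 0
            << "\n";
}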
@@ -162,7 +216,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
   EXPECT_NE(streams->StreamNumberForHlo(*dot1),
             streams->StreamNumberForHlo(*dot2));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y});
   EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) ||
@@ -254,7 +308,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) {
 
   // We don't check the thunk launch order, since there are many valid total
   // orders, and it's annoying to express.
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
 
   auto order = schedule->ConsumeHloOrdering();
   const HloVec all_params(
@@ -3829,20 +3829,6 @@ Status CheckConditionalBuffersShareAllocation(
   return Status::OK();
 }
 
-Status AcceptMaybeOrdered(HloComputation* computation,
-                          IrEmitterUnnested* emitter,
-                          const BufferAssignment& buffer_assignment) {
-  const auto& debug_options = computation->parent()->config().debug_options();
-  if (debug_options.xla_gpu_disable_multi_streaming()) {
-    const HloInstructionSequence* sequence =
-        buffer_assignment.hlo_ordering().SequentialOrder(*computation);
-    // Always expect a sequential ordering for single-stream programs.
-    TF_RET_CHECK(sequence);
-    return computation->AcceptOrdered(emitter, sequence->instructions());
-  }
-  return computation->Accept(emitter);
-}
-
 }  // namespace
 
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildWhileThunk(
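The deleted AcceptMaybeOrdered helper above chose between emitting a computation along an explicitly supplied sequential order (AcceptOrdered) and letting the visitor derive its own traversal (Accept). As a loose standalone analogy only (invented toy types and functions, not the IrEmitter or HloComputation API), the two emission styles look like this:

// Loose analogy only: emit nodes either by following a caller-supplied
// sequence or by walking operand dependencies. Invented toy types; this is
// not the IrEmitter or HloComputation API.
#include <iostream>
#include <map>
#include <string>
#include <vector>

void Emit(const std::string& node) { std::cout << "emit " << node << "\n"; }

// Post-order walk from a root: operands are emitted before their users.
void EmitDependencyOrder(
    const std::map<std::string, std::vector<std::string>>& operands,
    const std::string& node, std::vector<std::string>* emitted) {
  for (const std::string& done : *emitted) {
    if (done == node) return;  // already emitted
  }
  auto it = operands.find(node);
  if (it != operands.end()) {
    for (const std::string& operand : it->second) {
      EmitDependencyOrder(operands, operand, emitted);
    }
  }
  emitted->push_back(node);
  Emit(node);
}

int main() {
  std::map<std::string, std::vector<std::string>> operands = {{"add", {"x", "y"}}};

  // With a sequential schedule in hand, simply follow it.
  for (const char* node : {"x", "y", "add"}) Emit(node);

  // Without one, derive an order from the dependencies instead.
  std::vector<std::string> emitted;
  EmitDependencyOrder(operands, "add", &emitted);
}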
@@ -3856,18 +3842,14 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildWhileThunk(
   TF_ASSIGN_OR_RETURN(auto ir_emitter_condition,
                       IrEmitterUnnested::Create(hlo_module_config_, condition,
                                                 ir_emitter_context_));
-  TF_RETURN_IF_ERROR(
-      AcceptMaybeOrdered(condition, ir_emitter_condition.get(),
-                         ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(condition->Accept(ir_emitter_condition.get()));
 
   // Generate thunk sequence for while 'body'.
   HloComputation* body = hlo->while_body();
   TF_ASSIGN_OR_RETURN(
       auto ir_emitter_body,
       IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_));
-  TF_RETURN_IF_ERROR(AcceptMaybeOrdered(
-      body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get()));
 
   const auto* index_map = ir_emitter_context_->profile_index_map();
   absl::optional<size_t> condition_profile_index, body_profile_index;
@@ -3895,8 +3877,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildForThunk(
   TF_ASSIGN_OR_RETURN(
       auto ir_emitter_body,
       IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_));
-  TF_RETURN_IF_ERROR(AcceptMaybeOrdered(
-      body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get()));
 
   const auto* index_map = ir_emitter_context_->profile_index_map();
   absl::optional<size_t> body_profile_index;
@@ -3933,8 +3914,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildConditionalThunk(
         auto ir_emitter,
         IrEmitterUnnested::Create(hlo_module_config_, branch_computation,
                                   ir_emitter_context_));
-    TF_CHECK_OK(AcceptMaybeOrdered(branch_computation, ir_emitter.get(),
-                                   ir_emitter_context_->buffer_assignment()));
+    TF_CHECK_OK(branch_computation->Accept(ir_emitter.get()));
     branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence()));
 
     absl::optional<size_t> profile_index;