Internal tests cleanup.
PiperOrigin-RevId: 339456235
Change-Id: Ia960a93771ef371256dc10078a39421ca1faeb14
commit fbac0a99f7
parent 08af2ba27c
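
The hunks below all apply the same mechanical migration: benchmarks that took an explicit iteration count and called tensorflow::testing::StartTiming()/StopTiming() are rewritten against the ::testing::benchmark::State API. As a minimal sketch of the post-migration shape (the BM_Example name, its `size` argument, the included header, and the Arg(128) registration are illustrative assumptions, not part of this change):

#include <cstdint>
#include <vector>

#include "tensorflow/core/platform/test_benchmark.h"

void BM_Example(::testing::benchmark::State& state) {
  // Benchmark arguments now come from the state instead of extra function
  // parameters.
  const int size = state.range(0);

  // One-time setup stays outside the loop; the old StopTiming() guard is no
  // longer needed because timing only runs inside the loop below.
  std::vector<int> data(size, 1);

  // The timer starts at the first iteration and stops after the last one,
  // replacing tensorflow::testing::StartTiming()/StopTiming().
  for (auto s : state) {
    state.PauseTiming();  // exclude per-iteration setup from the measurement
    std::vector<int> copy = data;
    state.ResumeTiming();
    tensorflow::testing::DoNotOptimize(copy);
  }

  // Throughput is reported through the state rather than
  // tensorflow::testing::BytesProcessed().
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size *
                          sizeof(int));
}
BENCHMARK(BM_Example)->Arg(128);

Benchmarks that previously took extra int parameters (num_whiles, depth, fan_out, num_tuple_elements, ...) now read them via state.range(0) and state.range(1), and per-benchmark options move onto the registration, as in BENCHMARK(BM_ParallelFusion)->UseRealTime() below.
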
@@ -2100,10 +2100,14 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody() {
   return builder.Build();
 }
 
-void BM_SequentialWhiles(int num_iters, int num_whiles) {
+void BM_SequentialWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a chain of sequential while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  // Timer starts automatically at the first iteration of this loop
+  // and ends after the last one.
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2131,19 +2135,22 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // The entry computation should have three copies, and each body has one.
     ASSERT_EQ(CountCopies(module), 3 + num_whiles);
+    state.ResumeTiming();
   }
 }
 
-void BM_ParallelWhiles(int num_iters, int num_whiles) {
+void BM_ParallelWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a fan-out of parallel while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2182,9 +2189,9 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // Each body receives of copy of two of the parameters (the corresponding
     // elements in the body are modified), and there is one copy in each body.
@@ -2209,14 +2216,15 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
   return builder.Build();
 }
 
-void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
-  tensorflow::testing::StopTiming();
+void BM_ManyElementTuple(::testing::benchmark::State& state) {
+  const int num_tuple_inputs = state.range(0);
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsFromFlags());
   CopyInsertion copy_insertion;
   const Shape element_shape = ShapeUtil::MakeShape(F32, {});
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
     HloModule module("BM_ManyElementTuple", config);
     for (int j = 0; j < num_tuple_inputs; ++j) {
@@ -2234,9 +2242,8 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
     builder.AddInstruction(HloInstruction::CreateGetTupleElement(
         ShapeUtil::MakeShape(F32, {}), xla_while, 0));
     module.AddEntryComputation(builder.Build());
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
   }
 }
 
@@ -2545,8 +2545,7 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
 
 // Reducing many numbers should be fast because it doesn't create
 // intermediate Literals; the microbenchmark should finish in < 1 msec.
-void BM_ReducePrecisely(int num_iters) {
-  tensorflow::testing::StopTiming();
+void BM_ReducePrecisely(::testing::benchmark::State& state) {
   HloComputation::Builder b("BM_ReducePrecisely");
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsFromFlags());
@@ -2574,10 +2573,11 @@ void BM_ReducePrecisely(int num_iters) {
                                    /*dimensions_to_reduce=*/{0}, add_func));
   module.AddEntryComputation(b.Build());
 
-  HloEvaluator hlo_eval;
-  tensorflow::testing::StartTiming();
-  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
-  tensorflow::testing::StopTiming();
+  // Benchmark loop
+  for (auto s : state) {
+    HloEvaluator hlo_eval;
+    hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  }
 }
 
 BENCHMARK(BM_ReducePrecisely);
@@ -173,8 +173,10 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) {
 
 // Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
 // (cardinality of each non-leaf node's children).
-void BM_TakeSubTree(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_TakeSubTree(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   TestAllocator allocator;
   xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
@@ -183,13 +185,11 @@ void BM_TakeSubTree(int iters, int depth, int fan_out) {
   }
   xla::ScopedShapedBuffer shaped_buffer(shape, /*allocator=*/&allocator,
                                         /*device_ordinal=*/0);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     // Extract a buffer from approximately the middle of the first level of the
     // tree.
     (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release();
   }
-  tensorflow::testing::StopTiming();
 }
 
 BENCHMARK(BM_TakeSubTree)
@@ -535,94 +535,100 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) {
   }));
 }
 
-void BM_Construct(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Construct(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(shape);
   }
 }
 
-void BM_ConstructUnowned(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ConstructUnowned(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(&shape);
   }
 }
 
-void BM_Copy(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Copy(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = shape_tree;
     tensorflow::testing::DoNotOptimize(copy);
   }
 }
 
-void BM_Move(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Move(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = std::move(shape_tree);
     shape_tree = std::move(copy);
   }
 }
 
-void BM_ForEach(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ForEach(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) {
       tensorflow::testing::DoNotOptimize(index);
     });
   }
 }
 
-void BM_Iterate(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Iterate(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     for (auto& iter : shape_tree) {
       tensorflow::testing::DoNotOptimize(iter.second);
     }
@@ -824,9 +824,8 @@ XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
   ComputeAndCompare(&b, {});
 }
 
-void BM_ParallelFusion(int num_iters) {
+void BM_ParallelFusion(::testing::benchmark::State& state) {
   // Simple element-wise computation to benchmark parallel task partitioning.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -915,17 +914,16 @@ void BM_ParallelFusion(int num_iters) {
   const int64 total_bytes = param0_dim0 * param0_dim0 +
                             param1_dim0 * param1_dim0 +
                             param2_dim0 * param2_dim0;
-  tensorflow::testing::BytesProcessed(static_cast<int64>(num_iters) *
-                                      total_bytes * sizeof(float));
-  tensorflow::testing::UseRealTime();
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+
+  for (auto s : state) {
     auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * total_bytes *
+                          sizeof(float));
 }
 
-BENCHMARK(BM_ParallelFusion);
+BENCHMARK(BM_ParallelFusion)->UseRealTime();
 
 }  // namespace
 }  // namespace xla
@@ -750,9 +750,7 @@ XLA_TEST_F(HloTestBase, AddOfDUS) {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0}));
 }
 
-void BM_DynamicSlice(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_DynamicSlice(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -817,8 +815,7 @@ void BM_DynamicSlice(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
@@ -946,9 +946,7 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
 
 // Benchmark that measures the overhead of the LocalClient API when running a
 // trivial computation
-void BM_LocalClientOverhead(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_LocalClientOverhead(benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -990,8 +988,7 @@ void BM_LocalClientOverhead(int num_iters) {
     ASSERT_IS_OK(result);
   }
 
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
@@ -357,8 +357,8 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferDeviceToHostBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
-    tensorflow::testing::StopTiming();
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     SetUp();
 
     std::vector<Literal> tuple_elements;
@@ -370,13 +370,11 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                            device_buffer));
-    tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
      TF_ASSERT_OK_AND_ASSIGN(
          Literal result,
          transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer));
     }
-    tensorflow::testing::StopTiming();
     TearDown();
   }
 
@@ -388,7 +386,8 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferHostToDeviceBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     tensorflow::testing::StopTiming();
     SetUp();
 
@@ -400,7 +399,7 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
     Literal literal = LiteralUtil::MakeTupleOwned(std::move(tuple_elements));
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
      TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                             device_buffer));
     }
@@ -411,16 +410,20 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   void TestBody() override {}
 };
 
-void BM_TransferDeviceToHost(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferDeviceToHost(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferDeviceToHostBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
-void BM_TransferHostToDevice(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferHostToDevice(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferHostToDeviceBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
 BENCHMARK(BM_TransferHostToDevice)
@@ -1259,9 +1259,8 @@ XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) {
   ComputeAndCompareR0<int32>(&builder, 2, {});
 }
 
-void BM_WhileLoop(int num_iters) {
+void BM_WhileLoop(::testing::benchmark::State& state) {
   // Benchmark a simple kernel to measure while loop overheads.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -1330,8 +1329,7 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result =
         executable->Run(absl::Span<const ShapedBuffer* const>(), options);
     ASSERT_TRUE(result.ok());
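
The registration lines for the two-argument benchmarks (BM_TakeSubTree, BM_Construct, BM_TransferHostToDevice, ...) fall outside the hunks shown above. As a hedged sketch of how such state-based benchmarks are usually wired up, with purely illustrative argument values:

// Illustrative only: the argument values and ArgPair chains below are
// assumptions, not taken from this commit. state.range(0)/state.range(1)
// in the benchmark bodies read these values back.
BENCHMARK(BM_TakeSubTree)->ArgPair(/*depth=*/1, /*fan_out=*/8)->ArgPair(2, 8);
BENCHMARK(BM_TransferHostToDevice)
    ->ArgPair(/*num_tuple_elements=*/1, /*array_size=*/256)
    ->ArgPair(16, 256);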