From 127c98b9e3ed3b79284c5f16046071aa4ac2e7cb Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 2 Jan 2018 10:37:44 -0800 Subject: [PATCH] [XLA] Fix a bug in --xla_hlo_profile and add a test that would have caught it I'm pretty sure I introduced the bug when refactoring HloExecutionProfile some time back. Also change the bytes-per-cycle field to use a floating point format when its value is less than 1, to avoid rounding down to 0 (common on CPU). PiperOrigin-RevId: 180563380 --- .../compiler/xla/service/cpu/cpu_compiler.cc | 1 + .../compiler/xla/service/gpu/gpu_compiler.cc | 1 + .../service/human_readable_profile_builder.cc | 8 +- tensorflow/compiler/xla/tests/BUILD | 17 ++ .../xla/tests/xla_hlo_profile_test.cc | 214 ++++++++++++++++++ 5 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 50872131225..ba208d72490 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -508,6 +508,7 @@ StatusOr> CpuCompiler::RunBackend( }; HloCostAnalysis cost_analysis(shape_size_bytes); + TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); hlo_profile_printer = CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 68d7b2d5c2f..234f06fe2d7 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -483,6 +483,7 @@ StatusOr> GpuCompiler::RunBackend( if (module->config().hlo_profiling_enabled()) { HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); + TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); profile_index_map = MakeUnique(*module); profile_printer = CreateHloProfilePrinter(*profile_index_map, cost_analysis); diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index b7c40fdeeb1..13e4557317f 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -25,6 +25,7 @@ namespace xla { using tensorflow::strings::Appendf; using tensorflow::strings::HumanReadableElapsedTime; using tensorflow::strings::HumanReadableNumBytes; +using tensorflow::strings::Printf; using tensorflow::strings::StrAppend; string HumanReadableProfileBuilder::ToString() const { @@ -43,7 +44,12 @@ string HumanReadableProfileBuilder::ToString() const { } else { bytes_per_sec = HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)); - bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles); + if (op.bytes_accessed > op.cycles) { + bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles); + } else { + bytes_per_cycle = + Printf("%.3fB", static_cast(op.bytes_accessed) / op.cycles); + } } double cycles_percent = 0; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 45689976b0d..9ae4526d78c 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -345,6 +345,23 @@ xla_test( ], ) +xla_test( + name = "xla_hlo_profile_test", + srcs = ["xla_hlo_profile_test.cc"], + deps = [ + "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core:lib", + "//tensorflow/core:regexp_internal", + "//tensorflow/core:test", + ], +) + xla_test( name = "axpy_simple_test", srcs = ["axpy_simple_test.cc"], diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc new file mode 100644 index 00000000000..7da5d4667b2 --- /dev/null +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -0,0 +1,214 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/service/platform_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/regexp.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { +namespace se = ::perftools::gputools; + +class HloProfileTest : public ClientLibraryTestBase {}; + +struct ParsedProfileOutputLine { + int64 cycles; + double usec; + string flops; + string trops; + string bytes_per_sec; + string bytes_per_cycle; + string name; +}; + +StatusOr ParseProfileOutputLine(const string& line, + bool expect_flops, + bool expect_trops) { + string separator = "[^:]*:: +"; + string match_cycles = "(\\d+) cycles"; + string match_usecs = "([0-9.]+) usec"; + string match_flops = expect_flops ? "([0-9.TGMk]+)FLOP/s" : "()"; + string match_trops = expect_trops ? "([0-9.TGMk]+)TROP/s" : "()"; + string match_bytes_per_sec = "([0-9.TGMKi]+)B/s"; + string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle"; + string regexp_pattern = tensorflow::strings::StrCat( + " +", match_cycles, separator, match_usecs, separator, match_flops, + separator, match_trops, separator, match_bytes_per_sec, separator, + match_bytes_per_cycle, separator, "(.*)"); + + RE2 pattern(regexp_pattern); + ParsedProfileOutputLine parsed_line; + bool matched = RE2::FullMatch( + line, pattern, &parsed_line.cycles, &parsed_line.usec, &parsed_line.flops, + &parsed_line.trops, &parsed_line.bytes_per_sec, + &parsed_line.bytes_per_cycle, &parsed_line.name); + if (!matched) { + return tensorflow::errors::InvalidArgument( + "Input did not match regexp. Input: ", line, + ", Regexp: ", regexp_pattern); + } + + return parsed_line; +} + +// Returns void so that we can ASSERT. +void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, + const Computation& computation, + const Shape& lhs_arg_shape, + const Shape& rhs_arg_shape) { + LocalService* service = ClientLibrary::GetXlaService(client->platform()); + Backend* backend = service->mutable_backend(); + se::StreamExecutor* executor = backend->default_stream_executor(); + DeviceMemoryAllocator* allocator = backend->memory_allocator(); + auto* transfer_manager = backend->transfer_manager(); + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr lhs_arg, + transfer_manager->AllocateScopedShapedBuffer( + lhs_arg_shape, allocator, backend->default_device_ordinal())); + TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( + executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg)); + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr rhs_arg, + transfer_manager->AllocateScopedShapedBuffer( + rhs_arg_shape, allocator, backend->default_device_ordinal())); + TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( + executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg)); + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr local_executable, + client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape}, + ExecutableBuildOptions())); + + Executable* executable = local_executable->executable(); + HloExecutionProfile hlo_execution_profile( + &executable->hlo_profile_printer(), &executable->hlo_profile_index_map()); + + TF_ASSERT_OK_AND_ASSIGN( + Backend::StreamPtr stream_ptr, + backend->BorrowStream(backend->default_device_ordinal())); + ExecutableRunOptions exec_run_options; + exec_run_options.set_stream(stream_ptr.get()); + exec_run_options.set_allocator(backend->memory_allocator()); + exec_run_options.set_intra_op_thread_pool( + backend->eigen_intra_op_thread_pool_device()); + ServiceExecutableRunOptions run_options( + exec_run_options, /*borrow_stream=*/nullptr, + backend->eigen_intra_op_thread_pool()); + TF_ASSERT_OK_AND_ASSIGN( + auto execution_result, + executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()}, + &hlo_execution_profile)); + (void)execution_result; + + *profile_output = + hlo_execution_profile.ToString(executor->GetDeviceDescription()); +} + +// TODO(b/71364943): This test exposes a bug in the parallel CPU backend. +XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { + const int64 m = 256, k = 256, n = 256; + Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k}); + Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k}); + + TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform, + PlatformUtil::GetDefaultPlatform()); + TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, + ClientLibrary::GetOrCreateLocalClient(platform)); + + ComputationBuilder builder(client, TestName()); + auto result = builder.Tanh(builder.Dot( + builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), + builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs"))); + + TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); + + string profile_output; + ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape, + rhs_shape); + + std::vector profile_output_lines = + tensorflow::str_util::Split(profile_output, '\n'); + + TF_ASSERT_OK_AND_ASSIGN( + ParsedProfileOutputLine total_profile, + ParseProfileOutputLine(profile_output_lines[1], /*expect_flops=*/true, + /*expect_trops=*/true)); + + TF_ASSERT_OK_AND_ASSIGN( + ParsedProfileOutputLine dot_profile, + ParseProfileOutputLine(profile_output_lines[2], /*expect_flops=*/true, + /*expect_trops=*/false)); + + TF_ASSERT_OK_AND_ASSIGN( + ParsedProfileOutputLine tanh_profile, + ParseProfileOutputLine(profile_output_lines[3], /*expect_flops=*/false, + /*expect_trops=*/true)); + + EXPECT_GT(total_profile.cycles, 0); + EXPECT_GT(total_profile.cycles, dot_profile.cycles); + EXPECT_GT(total_profile.cycles, tanh_profile.cycles); +} +} // namespace +} // namespace xla + +static std::pair AddXlaHloProfileFlag(int argc, char** argv) { + // Intentional "leak". + char** new_argv = new char*[argc + 2]; + for (int i = 0; i < argc; i++) { + new_argv[i] = argv[i]; + } + + // We do it this way (as opposed to piping in a modified DebugOptions + // instance) for better end-to-end integration testing. + new_argv[argc] = strdup("--xla_hlo_profile"); + + // Fusion can change the Hlo instructions that show up in the final Hlo + // executable, so block it here. + new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion"); + return {argc + 2, new_argv}; +} + +GTEST_API_ int main(int argc, char** argv) { + std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv); + + auto usage = tensorflow::Flags::Usage(argv[0], flag_list); + if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) { + LOG(ERROR) << "\n" << usage; + return 2; + } + + testing::InitGoogleTest(&argc, argv); + if (argc > 1) { + LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; + return 2; + } + return RUN_ALL_TESTS(); +}