[XLA] Fix a bug in --xla_hlo_profile and add a test that would have caught it
I'm pretty sure I introduced the bug when refactoring HloExecutionProfile some time back. Also change the bytes-per-cycle field to use a floating point format when its value is less than 1, to avoid rounding down to 0 (common on CPU).

PiperOrigin-RevId: 180563380
commit 127c98b9e3 (parent 135db42bc2)
tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -508,6 +508,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     };
 
     HloCostAnalysis cost_analysis(shape_size_bytes);
+    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     hlo_profile_printer =
         CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
   }
tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
         CreateHloProfilePrinter(*profile_index_map, cost_analysis);
tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -25,6 +25,7 @@ namespace xla {
 using tensorflow::strings::Appendf;
 using tensorflow::strings::HumanReadableElapsedTime;
 using tensorflow::strings::HumanReadableNumBytes;
+using tensorflow::strings::Printf;
 using tensorflow::strings::StrAppend;
 
 string HumanReadableProfileBuilder::ToString() const {
@@ -43,7 +44,12 @@ string HumanReadableProfileBuilder::ToString() const {
     } else {
       bytes_per_sec =
           HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles));
-      bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      if (op.bytes_accessed > op.cycles) {
+        bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      } else {
+        bytes_per_cycle =
+            Printf("%.3fB", static_cast<float>(op.bytes_accessed) / op.cycles);
+      }
     }
 
     double cycles_percent = 0;
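To illustrate the bytes-per-cycle change above: with integer division, op.bytes_accessed / op.cycles truncates to 0 whenever an op touches fewer bytes than it takes cycles (common on CPU), so the old code printed "0B/cycle". Below is a minimal standalone sketch of the before/after formatting; it is not part of the commit, uses plain printf instead of the tensorflow::strings helpers, and the byte/cycle counts are made up for the example.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical op: 64 bytes accessed over 256 cycles (true ratio 0.25).
  const int64_t bytes_accessed = 64;
  const int64_t cycles = 256;

  // Old behavior: integer division truncates the ratio to 0, so the
  // profile field reads "0B/cycle".
  std::printf("old: %lldB/cycle\n",
              static_cast<long long>(bytes_accessed / cycles));

  // New behavior when the ratio is not above 1: format it as a float,
  // which prints "0.250B/cycle" here and keeps the information.
  std::printf("new: %.3fB/cycle\n",
              static_cast<float>(bytes_accessed) / cycles);
  return 0;
}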
tensorflow/compiler/xla/tests/BUILD
@@ -345,6 +345,23 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "xla_hlo_profile_test",
+    srcs = ["xla_hlo_profile_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
|
214
tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
Normal file
214
tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
Normal file
@ -0,0 +1,214 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <memory>
#include <vector>

#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/compiler/xla/tests/test_utils.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"

namespace xla {
namespace {
namespace se = ::perftools::gputools;

class HloProfileTest : public ClientLibraryTestBase {};

struct ParsedProfileOutputLine {
  int64 cycles;
  double usec;
  string flops;
  string trops;
  string bytes_per_sec;
  string bytes_per_cycle;
  string name;
};

StatusOr<ParsedProfileOutputLine> ParseProfileOutputLine(const string& line,
                                                         bool expect_flops,
                                                         bool expect_trops) {
  string separator = "[^:]*:: +";
  string match_cycles = "(\\d+) cycles";
  string match_usecs = "([0-9.]+) usec";
  string match_flops = expect_flops ? "([0-9.TGMk]+)FLOP/s" : "(<none>)";
  string match_trops = expect_trops ? "([0-9.TGMk]+)TROP/s" : "(<none>)";
  string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
  string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
  string regexp_pattern = tensorflow::strings::StrCat(
      " +", match_cycles, separator, match_usecs, separator, match_flops,
      separator, match_trops, separator, match_bytes_per_sec, separator,
      match_bytes_per_cycle, separator, "(.*)");

  RE2 pattern(regexp_pattern);
  ParsedProfileOutputLine parsed_line;
  bool matched = RE2::FullMatch(
      line, pattern, &parsed_line.cycles, &parsed_line.usec, &parsed_line.flops,
      &parsed_line.trops, &parsed_line.bytes_per_sec,
      &parsed_line.bytes_per_cycle, &parsed_line.name);
  if (!matched) {
    return tensorflow::errors::InvalidArgument(
        "Input did not match regexp. Input: ", line,
        ", Regexp: ", regexp_pattern);
  }

  return parsed_line;
}

// Returns void so that we can ASSERT.
void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
                            const Computation& computation,
                            const Shape& lhs_arg_shape,
                            const Shape& rhs_arg_shape) {
  LocalService* service = ClientLibrary::GetXlaService(client->platform());
  Backend* backend = service->mutable_backend();
  se::StreamExecutor* executor = backend->default_stream_executor();
  DeviceMemoryAllocator* allocator = backend->memory_allocator();
  auto* transfer_manager = backend->transfer_manager();

  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
      transfer_manager->AllocateScopedShapedBuffer(
          lhs_arg_shape, allocator, backend->default_device_ordinal()));
  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));

  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
      transfer_manager->AllocateScopedShapedBuffer(
          rhs_arg_shape, allocator, backend->default_device_ordinal()));
  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));

  TF_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<LocalExecutable> local_executable,
      client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
                      ExecutableBuildOptions()));

  Executable* executable = local_executable->executable();
  HloExecutionProfile hlo_execution_profile(
      &executable->hlo_profile_printer(), &executable->hlo_profile_index_map());

  TF_ASSERT_OK_AND_ASSIGN(
      Backend::StreamPtr stream_ptr,
      backend->BorrowStream(backend->default_device_ordinal()));
  ExecutableRunOptions exec_run_options;
  exec_run_options.set_stream(stream_ptr.get());
  exec_run_options.set_allocator(backend->memory_allocator());
  exec_run_options.set_intra_op_thread_pool(
      backend->eigen_intra_op_thread_pool_device());
  ServiceExecutableRunOptions run_options(
      exec_run_options, /*borrow_stream=*/nullptr,
      backend->eigen_intra_op_thread_pool());
  TF_ASSERT_OK_AND_ASSIGN(
      auto execution_result,
      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
                                  &hlo_execution_profile));
  (void)execution_result;

  *profile_output =
      hlo_execution_profile.ToString(executor->GetDeviceDescription());
}

// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
  const int64 m = 256, k = 256, n = 256;
  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
  Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});

  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
                          PlatformUtil::GetDefaultPlatform());
  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
                          ClientLibrary::GetOrCreateLocalClient(platform));

  ComputationBuilder builder(client, TestName());
  auto result = builder.Tanh(builder.Dot(
      builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
      builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));

  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());

  string profile_output;
  ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
                         rhs_shape);

  std::vector<string> profile_output_lines =
      tensorflow::str_util::Split(profile_output, '\n');

  TF_ASSERT_OK_AND_ASSIGN(
      ParsedProfileOutputLine total_profile,
      ParseProfileOutputLine(profile_output_lines[1], /*expect_flops=*/true,
                             /*expect_trops=*/true));

  TF_ASSERT_OK_AND_ASSIGN(
      ParsedProfileOutputLine dot_profile,
      ParseProfileOutputLine(profile_output_lines[2], /*expect_flops=*/true,
                             /*expect_trops=*/false));

  TF_ASSERT_OK_AND_ASSIGN(
      ParsedProfileOutputLine tanh_profile,
      ParseProfileOutputLine(profile_output_lines[3], /*expect_flops=*/false,
                             /*expect_trops=*/true));

  EXPECT_GT(total_profile.cycles, 0);
  EXPECT_GT(total_profile.cycles, dot_profile.cycles);
  EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
}
}  // namespace
}  // namespace xla

static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
  // Intentional "leak".
  char** new_argv = new char*[argc + 2];
  for (int i = 0; i < argc; i++) {
    new_argv[i] = argv[i];
  }

  // We do it this way (as opposed to piping in a modified DebugOptions
  // instance) for better end-to-end integration testing.
  new_argv[argc] = strdup("--xla_hlo_profile");

  // Fusion can change the Hlo instructions that show up in the final Hlo
  // executable, so block it here.
  new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion");
  return {argc + 2, new_argv};
}

GTEST_API_ int main(int argc, char** argv) {
  std::vector<tensorflow::Flag> flag_list;
  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
  std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);

  auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
  if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
    LOG(ERROR) << "\n" << usage;
    return 2;
  }

  testing::InitGoogleTest(&argc, argv);
  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return 2;
  }
  return RUN_ALL_TESTS();
}