From 127c98b9e3ed3b79284c5f16046071aa4ac2e7cb Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 2 Jan 2018 10:37:44 -0800
Subject: [PATCH] [XLA] Fix a bug in --xla_hlo_profile and add a test that
 would have caught it

I'm pretty sure I introduced the bug when refactoring HloExecutionProfile some
time back.

Also change the bytes-per-cycle field to use a floating point format when its
value is less than 1, to avoid rounding down to 0 (common on CPU).

PiperOrigin-RevId: 180563380
---
 .../compiler/xla/service/cpu/cpu_compiler.cc  |   1 +
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   1 +
 .../service/human_readable_profile_builder.cc |   8 +-
 tensorflow/compiler/xla/tests/BUILD           |  17 ++
 .../xla/tests/xla_hlo_profile_test.cc         | 214 ++++++++++++++++++
 5 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 50872131225..ba208d72490 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -508,6 +508,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     };
 
     HloCostAnalysis cost_analysis(shape_size_bytes);
+    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     hlo_profile_printer =
         CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 68d7b2d5c2f..234f06fe2d7 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
         CreateHloProfilePrinter(*profile_index_map, cost_analysis);
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index b7c40fdeeb1..13e4557317f 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -25,6 +25,7 @@ namespace xla {
 using tensorflow::strings::Appendf;
 using tensorflow::strings::HumanReadableElapsedTime;
 using tensorflow::strings::HumanReadableNumBytes;
+using tensorflow::strings::Printf;
 using tensorflow::strings::StrAppend;
 
 string HumanReadableProfileBuilder::ToString() const {
@@ -43,7 +44,12 @@ string HumanReadableProfileBuilder::ToString() const {
     } else {
       bytes_per_sec =
           HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles));
-      bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      if (op.bytes_accessed > op.cycles) {
+        bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      } else {
+        bytes_per_cycle =
+            Printf("%.3fB", static_cast<float>(op.bytes_accessed) / op.cycles);
+      }
     }
 
     double cycles_percent = 0;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 45689976b0d..9ae4526d78c 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -345,6 +345,23 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "xla_hlo_profile_test",
+    srcs = ["xla_hlo_profile_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
new file mode 100644
index 00000000000..7da5d4667b2
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -0,0 +1,214 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+namespace se = ::perftools::gputools;
+
+class HloProfileTest : public ClientLibraryTestBase {};
+
+struct ParsedProfileOutputLine {
+  int64 cycles;
+  double usec;
+  string flops;
+  string trops;
+  string bytes_per_sec;
+  string bytes_per_cycle;
+  string name;
+};
+
+StatusOr<ParsedProfileOutputLine> ParseProfileOutputLine(const string& line,
+                                                         bool expect_flops,
+                                                         bool expect_trops) {
+  string separator = "[^:]*:: +";
+  string match_cycles = "(\\d+) cycles";
+  string match_usecs = "([0-9.]+) usec";
+  string match_flops = expect_flops ? "([0-9.TGMk]+)FLOP/s" : "(<none>)";
+  string match_trops = expect_trops ? "([0-9.TGMk]+)TROP/s" : "(<none>)";
+  string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
+  string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+  string regexp_pattern = tensorflow::strings::StrCat(
+      " +", match_cycles, separator, match_usecs, separator, match_flops,
+      separator, match_trops, separator, match_bytes_per_sec, separator,
+      match_bytes_per_cycle, separator, "(.*)");
+
+  RE2 pattern(regexp_pattern);
+  ParsedProfileOutputLine parsed_line;
+  bool matched = RE2::FullMatch(
+      line, pattern, &parsed_line.cycles, &parsed_line.usec, &parsed_line.flops,
+      &parsed_line.trops, &parsed_line.bytes_per_sec,
+      &parsed_line.bytes_per_cycle, &parsed_line.name);
+  if (!matched) {
+    return tensorflow::errors::InvalidArgument(
+        "Input did not match regexp.  Input: ", line,
+        ", Regexp: ", regexp_pattern);
+  }
+
+  return parsed_line;
+}
+
+// Returns void so that we can ASSERT.
+void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
+                            const Computation& computation,
+                            const Shape& lhs_arg_shape,
+                            const Shape& rhs_arg_shape) {
+  LocalService* service = ClientLibrary::GetXlaService(client->platform());
+  Backend* backend = service->mutable_backend();
+  se::StreamExecutor* executor = backend->default_stream_executor();
+  DeviceMemoryAllocator* allocator = backend->memory_allocator();
+  auto* transfer_manager = backend->transfer_manager();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
+      transfer_manager->AllocateScopedShapedBuffer(
+          lhs_arg_shape, allocator, backend->default_device_ordinal()));
+  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
+      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
+      transfer_manager->AllocateScopedShapedBuffer(
+          rhs_arg_shape, allocator, backend->default_device_ordinal()));
+  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
+      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LocalExecutable> local_executable,
+      client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
+                      ExecutableBuildOptions()));
+
+  Executable* executable = local_executable->executable();
+  HloExecutionProfile hlo_execution_profile(
+      &executable->hlo_profile_printer(), &executable->hlo_profile_index_map());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      Backend::StreamPtr stream_ptr,
+      backend->BorrowStream(backend->default_device_ordinal()));
+  ExecutableRunOptions exec_run_options;
+  exec_run_options.set_stream(stream_ptr.get());
+  exec_run_options.set_allocator(backend->memory_allocator());
+  exec_run_options.set_intra_op_thread_pool(
+      backend->eigen_intra_op_thread_pool_device());
+  ServiceExecutableRunOptions run_options(
+      exec_run_options, /*borrow_stream=*/nullptr,
+      backend->eigen_intra_op_thread_pool());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto execution_result,
+      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
+                                  &hlo_execution_profile));
+  (void)execution_result;
+
+  *profile_output =
+      hlo_execution_profile.ToString(executor->GetDeviceDescription());
+}
+
+// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
+XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
+  const int64 m = 256, k = 256, n = 256;
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+
+  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
+                          PlatformUtil::GetDefaultPlatform());
+  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
+                          ClientLibrary::GetOrCreateLocalClient(platform));
+
+  ComputationBuilder builder(client, TestName());
+  auto result = builder.Tanh(builder.Dot(
+      builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+
+  string profile_output;
+  ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
+                         rhs_shape);
+
+  std::vector<string> profile_output_lines =
+      tensorflow::str_util::Split(profile_output, '\n');
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ParsedProfileOutputLine total_profile,
+      ParseProfileOutputLine(profile_output_lines[1], /*expect_flops=*/true,
+                             /*expect_trops=*/true));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ParsedProfileOutputLine dot_profile,
+      ParseProfileOutputLine(profile_output_lines[2], /*expect_flops=*/true,
+                             /*expect_trops=*/false));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ParsedProfileOutputLine tanh_profile,
+      ParseProfileOutputLine(profile_output_lines[3], /*expect_flops=*/false,
+                             /*expect_trops=*/true));
+
+  EXPECT_GT(total_profile.cycles, 0);
+  EXPECT_GT(total_profile.cycles, dot_profile.cycles);
+  EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
+}
+}  // namespace
+}  // namespace xla
+
+static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
+  // Intentional "leak".
+  char** new_argv = new char*[argc + 2];
+  for (int i = 0; i < argc; i++) {
+    new_argv[i] = argv[i];
+  }
+
+  // We do it this way (as opposed to piping in a modified DebugOptions
+  // instance) for better end-to-end integration testing.
+  new_argv[argc] = strdup("--xla_hlo_profile");
+
+  // Fusion can change the Hlo instructions that show up in the final Hlo
+  // executable, so block it here.
+  new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion");
+  return {argc + 2, new_argv};
+}
+
+GTEST_API_ int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
+
+  auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+  return RUN_ALL_TESTS();
+}