From b562be27f705034d46e9920ebbcdd293485a7305 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Fri, 1 Mar 2019 14:28:03 -0800 Subject: [PATCH] Move autotuning.proto to tensorflow/core/protobuf. This is in preparation of the next patch, to tensorflow::Logger::LogProto() for TF convolutions. This patch also has a functional change, that is to not log an AutotuneResult if the profile result is not valid. Usually they are just unsupported configurations. Ideally we'd log the exact error message of unsupported configs, but today StreamExecutor doesn't propagate it anyway. PiperOrigin-RevId: 236378741 --- tensorflow/compiler/xla/protobuf_util.h | 15 ----- tensorflow/compiler/xla/service/gpu/BUILD | 14 +++-- .../gpu/cudnn_conv_algorithm_picker.cc | 60 ++++++++++--------- .../service/gpu/cudnn_conv_algorithm_picker.h | 4 +- .../xla/service/gpu/gpu_autotuning.proto | 13 ++++ tensorflow/core/BUILD | 1 + .../gpu => core/protobuf}/autotuning.proto | 24 +++----- tensorflow/core/util/proto/BUILD | 2 + tensorflow/core/util/proto/proto_utils.h | 16 +++++ 9 files changed, 83 insertions(+), 66 deletions(-) create mode 100644 tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto rename tensorflow/{compiler/xla/service/gpu => core/protobuf}/autotuning.proto (80%) diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h index 4a88a48f285..e20a7e95a63 100644 --- a/tensorflow/compiler/xla/protobuf_util.h +++ b/tensorflow/compiler/xla/protobuf_util.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ -#include "google/protobuf/duration.pb.h" #include "absl/time/time.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -45,20 +44,6 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message, // dirpath along as-is. 
void RegisterDirectoryExpander(const std::function& expander); -// Converts an absl::Duration to a google::protobuf::Duration. -inline google::protobuf::Duration ToDurationProto(absl::Duration duration) { - google::protobuf::Duration proto; - proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration)); - proto.set_nanos( - absl::IDivDuration(duration, absl::Nanoseconds(1), &duration)); - return proto; -} - -// Converts a google::protobuf::Duration to an absl::Duration. -inline absl::Duration FromDurationProto(google::protobuf::Duration proto) { - return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos()); -} - } // namespace protobuf_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 3bc0daf9e70..3e4aefa55d3 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -440,15 +440,14 @@ cc_library( srcs = ["cudnn_conv_algorithm_picker.cc"], hdrs = ["cudnn_conv_algorithm_picker.h"], deps = [ - ":autotuning_proto", ":backend_configs", ":buffer_comparator", ":cudnn_conv_runner", + ":gpu_autotuning_proto", ":gpu_executable", ":ir_emission_utils", ":scratch_allocator", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", @@ -456,7 +455,9 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "//tensorflow/core:logger", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/util/proto:proto_utils", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", @@ -776,6 +777,7 @@ cc_library( hdrs = ["gpu_transfer_manager.h"], deps = [ ":gpu_compiler", + ":infeed_manager", ":outfeed_manager", 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -788,7 +790,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", - "//tensorflow/compiler/xla/service/gpu:infeed_manager", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", @@ -1137,8 +1138,8 @@ tf_cc_test( srcs = ["cudnn_fused_conv_rewriter_test.cc"], tags = tf_cuda_tests_tags(), deps = [ + ":ir_emission_utils", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", @@ -1183,10 +1184,11 @@ tf_cc_test( ) xla_proto_library( - name = "autotuning_proto", - srcs = ["autotuning.proto"], + name = "gpu_autotuning_proto", + srcs = ["gpu_autotuning.proto"], deps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/core:protos_all_cc", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 0c4980f6549..02eb191cf58 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -14,21 +14,23 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" +#include "google/protobuf/any.pb.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/time/time.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logger.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/proto/proto_utils.h" namespace xla { namespace gpu { @@ -37,6 +39,7 @@ namespace { using absl::optional; using se::DeviceMemoryBase; using se::dnn::AlgorithmDesc; +using tensorflow::AutotuneResult; std::vector GetAlgorithms(CudnnConvKind kind, se::StreamExecutor* stream_exec) { @@ -94,8 +97,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) { return tensorflow::mutex_lock{it->second}; } -xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { - xla::gpu::CudnnVersion cudnn_version; +tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { + tensorflow::CudnnVersion cudnn_version; if (auto* dnn = stream_executor->AsDnn()) { StatusOr version_or = dnn->GetVersion(); if (version_or.ok()) { @@ -108,9 +111,9 @@ xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { return cudnn_version; } -xla::gpu::ComputeCapability 
GetComputeCapability( +tensorflow::ComputeCapability GetComputeCapability( se::StreamExecutor* stream_executor) { - xla::gpu::ComputeCapability cc; + tensorflow::ComputeCapability cc; int cc_major, cc_minor; stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor); @@ -243,25 +246,23 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer, &scratch_allocator, &stream, options); + if (!launch_status.ok()) { + continue; + } + + if (!profile_result.is_valid()) { + continue; + } + profile_results.emplace_back(); AutotuneResult& result = profile_results.back(); result.mutable_conv()->set_algorithm(alg.algo_id()); result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled()); - if (!launch_status.ok()) { - result.set_error_string(launch_status.error_message()); - continue; - } - - if (!profile_result.is_valid()) { - result.set_error_string("Invalid profile result"); - continue; - } - int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); result.mutable_success()->set_scratch_bytes(scratch_bytes_used); *result.mutable_success()->mutable_run_time() = - protobuf_util::ToDurationProto( + tensorflow::proto_utils::ToDurationProto( absl::Milliseconds(profile_result.elapsed_time_in_ms())); const bool crash_on_checking_failure = @@ -308,10 +309,14 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( // Log the autotuning result. 
{ - AutotuneLog log; - *log.mutable_instr()->mutable_instruction() = instr->ToProto(); - for (const auto* op : instr->operands()) { - *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto(); + tensorflow::AutotuningLog log; + { + ConvInstructionLog instr_log; + *instr_log.mutable_instruction() = instr->ToProto(); + for (const auto* op : instr->operands()) { + *instr_log.add_operand_shapes() = op->shape().ToProto(); + } + log.mutable_instr()->PackFrom(instr_log); } for (const auto& profile : profile_results) { *log.add_results() = profile; @@ -330,13 +335,14 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( // The successful one should have a smaller key, since we are doing // min_element. If they are both unsuccessful, keep the earlier one in // the vector by comparing pointers. - return std::make_tuple( - !lhs.has_success(), - protobuf_util::FromDurationProto(lhs.success().run_time()), - &lhs) < std::make_tuple(!rhs.has_success(), - protobuf_util::FromDurationProto( - rhs.success().run_time()), - &rhs); + return std::make_tuple(!lhs.has_success(), + tensorflow::proto_utils::FromDurationProto( + lhs.success().run_time()), + &lhs) < + std::make_tuple(!rhs.has_success(), + tensorflow::proto_utils::FromDurationProto( + rhs.success().run_time()), + &rhs); }); if (best_result != profile_results_end && best_result->has_success()) { diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 2e34ba96723..6ab9c7a9ece 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -20,12 +20,12 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/protobuf/autotuning.pb.h" namespace xla { namespace gpu { @@ -50,7 +50,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass { private: StatusOr RunOnComputation(HloComputation* computation); StatusOr RunOnInstruction(HloInstruction* instr); - StatusOr PickBestAlgorithm( + StatusOr PickBestAlgorithm( const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto new file mode 100644 index 00000000000..ec4f6e9c913 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto @@ -0,0 +1,13 @@ +// This is used for convolution logging. 
Also see +// tensorflow/core/protobuf/autotuning.proto +syntax = "proto3"; + +package xla.gpu; + +import "tensorflow/compiler/xla/service/hlo.proto"; +import "tensorflow/compiler/xla/xla_data.proto"; + +message ConvInstructionLog { + xla.HloInstructionProto instruction = 1; + repeated xla.ShapeProto operand_shapes = 2; +} diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8f5de683220..9114013251a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -233,6 +233,7 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS ADDITIONAL_CORE_PROTO_SRCS = [ "example/example_parser_configuration.proto", "protobuf/trackable_object_graph.proto", + "protobuf/autotuning.proto", "protobuf/control_flow.proto", # TODO(ebrevdo): Re-enable once CriticalSection is in core. # "protobuf/critical_section.proto", diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/core/protobuf/autotuning.proto similarity index 80% rename from tensorflow/compiler/xla/service/gpu/autotuning.proto rename to tensorflow/core/protobuf/autotuning.proto index b4a08963b4f..29e4d00a85f 100644 --- a/tensorflow/compiler/xla/service/gpu/autotuning.proto +++ b/tensorflow/core/protobuf/autotuning.proto @@ -1,15 +1,14 @@ -// This file defines protos that store the results of autotuning XLA:GPU +// This file defines protos that store the results of autotuning various // operations. // // They are in proto format because we want to log them structured. They offer // tremendous statistical, testing, and debugging value.
syntax = "proto3"; -package xla.gpu; +package tensorflow; +import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; -import "tensorflow/compiler/xla/xla_data.proto"; -import "tensorflow/compiler/xla/service/hlo.proto"; message CudnnVersion { int32 major = 1; @@ -63,19 +62,12 @@ message AutotuneResult { } } -message AutotuneLog { - message Instruction { - xla.HloInstructionProto instruction = 1; - repeated xla.ShapeProto operand_shapes = 2; - } - - oneof instr_oneof { - Instruction instr = 1; - } +message AutotuningLog { + google.protobuf.Any instr = 1; // Records all auto-tuning results per algorithm. - repeated AutotuneResult results = 3; + repeated AutotuneResult results = 2; - CudnnVersion cudnn_version = 4; - ComputeCapability compute_capability = 5; + CudnnVersion cudnn_version = 3; + ComputeCapability compute_capability = 4; } diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD index b990f0a7491..890bd837025 100644 --- a/tensorflow/core/util/proto/BUILD +++ b/tensorflow/core/util/proto/BUILD @@ -70,6 +70,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:platform_base", "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@protobuf_archive//:protobuf_headers", ], ) diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h index 9451e317a13..ba45f8a5b0e 100644 --- a/tensorflow/core/util/proto/proto_utils.h +++ b/tensorflow/core/util/proto/proto_utils.h @@ -16,7 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_ #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_ +#include "google/protobuf/duration.pb.h" #include "absl/strings/string_view.h" +#include "absl/time/time.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/protobuf.h" @@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector { const int index_offset_; }; +// Converts an absl::Duration to a google::protobuf::Duration. +inline google::protobuf::Duration ToDurationProto(absl::Duration duration) { + google::protobuf::Duration proto; + proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration)); + proto.set_nanos( + absl::IDivDuration(duration, absl::Nanoseconds(1), &duration)); + return proto; +} + +// Converts a google::protobuf::Duration to an absl::Duration. +inline absl::Duration FromDurationProto(google::protobuf::Duration proto) { + return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos()); +} + } // namespace proto_utils } // namespace tensorflow