diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h
index 4a88a48f285..e20a7e95a63 100644
--- a/tensorflow/compiler/xla/protobuf_util.h
+++ b/tensorflow/compiler/xla/protobuf_util.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 
-#include "google/protobuf/duration.pb.h"
 #include "absl/time/time.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -45,20 +44,6 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
 // dirpath along as-is.
 void RegisterDirectoryExpander(const std::function<string(string)>& expander);
 
-// Converts an absl::Duration to a google::protobuf::Duration.
-inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
-  google::protobuf::Duration proto;
-  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
-  proto.set_nanos(
-      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
-  return proto;
-}
-
-// Converts a google::protobuf::Duration to an absl::Duration.
-inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
-  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
-}
-
 }  // namespace protobuf_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 3bc0daf9e70..3e4aefa55d3 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -440,15 +440,14 @@ cc_library(
     srcs = ["cudnn_conv_algorithm_picker.cc"],
     hdrs = ["cudnn_conv_algorithm_picker.h"],
     deps = [
-        ":autotuning_proto",
         ":backend_configs",
         ":buffer_comparator",
         ":cudnn_conv_runner",
+        ":gpu_autotuning_proto",
        ":gpu_executable",
         ":ir_emission_utils",
         ":scratch_allocator",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
@@ -456,7 +455,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "//tensorflow/core:logger",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/util/proto:proto_utils",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
@@ -776,6 +777,7 @@ cc_library(
     hdrs = ["gpu_transfer_manager.h"],
     deps = [
         ":gpu_compiler",
+        ":infeed_manager",
         ":outfeed_manager",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -788,7 +790,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
-        "//tensorflow/compiler/xla/service/gpu:infeed_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/memory",
@@ -1137,8 +1138,8 @@ tf_cc_test(
     srcs = ["cudnn_fused_conv_rewriter_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
+        ":ir_emission_utils",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
@@ -1183,10 +1184,11 @@ tf_cc_test(
 )
 
 xla_proto_library(
-    name = "autotuning_proto",
-    srcs = ["autotuning.proto"],
+    name = "gpu_autotuning_proto",
+    srcs = ["gpu_autotuning.proto"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 0c4980f6549..02eb191cf58 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -14,21 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
+#include "google/protobuf/any.pb.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 
 namespace xla {
 namespace gpu {
@@ -37,6 +39,7 @@ namespace {
 using absl::optional;
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmDesc;
+using tensorflow::AutotuneResult;
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          se::StreamExecutor* stream_exec) {
@@ -94,8 +97,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
   return tensorflow::mutex_lock{it->second};
 }
 
-xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
-  xla::gpu::CudnnVersion cudnn_version;
+tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  tensorflow::CudnnVersion cudnn_version;
   if (auto* dnn = stream_executor->AsDnn()) {
     StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
     if (version_or.ok()) {
@@ -108,9 +111,9 @@ xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
   return cudnn_version;
 }
 
-xla::gpu::ComputeCapability GetComputeCapability(
+tensorflow::ComputeCapability GetComputeCapability(
     se::StreamExecutor* stream_executor) {
-  xla::gpu::ComputeCapability cc;
+  tensorflow::ComputeCapability cc;
   int cc_major, cc_minor;
   stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                   &cc_minor);
@@ -243,25 +246,23 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                      &scratch_allocator, &stream, options);
 
+    if (!launch_status.ok()) {
+      continue;
+    }
+
+    if (!profile_result.is_valid()) {
+      continue;
+    }
+
     profile_results.emplace_back();
     AutotuneResult& result = profile_results.back();
     result.mutable_conv()->set_algorithm(alg.algo_id());
     result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
 
-    if (!launch_status.ok()) {
-      result.set_error_string(launch_status.error_message());
-      continue;
-    }
-
-    if (!profile_result.is_valid()) {
-      result.set_error_string("Invalid profile result");
-      continue;
-    }
-
     int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
     result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
     *result.mutable_success()->mutable_run_time() =
-        protobuf_util::ToDurationProto(
+        tensorflow::proto_utils::ToDurationProto(
            absl::Milliseconds(profile_result.elapsed_time_in_ms()));
 
     const bool crash_on_checking_failure =
@@ -308,10 +309,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
 
   // Log the autotuning result.
   {
-    AutotuneLog log;
-    *log.mutable_instr()->mutable_instruction() = instr->ToProto();
-    for (const auto* op : instr->operands()) {
-      *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto();
+    tensorflow::AutotuningLog log;
+    {
+      ConvInstructionLog instr_log;
+      *instr_log.mutable_instruction() = instr->ToProto();
+      for (const auto* op : instr->operands()) {
+        *instr_log.add_operand_shapes() = op->shape().ToProto();
+      }
+      log.mutable_instr()->PackFrom(instr_log);
     }
     for (const auto& profile : profile_results) {
       *log.add_results() = profile;
@@ -330,13 +335,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
         // The successful one should have a smaller key, since we are doing
         // min_element. If they are both unsuccessful, keep the earlier one in
         // the vector by comparing pointers.
-        return std::make_tuple(
-                   !lhs.has_success(),
-                   protobuf_util::FromDurationProto(lhs.success().run_time()),
-                   &lhs) < std::make_tuple(!rhs.has_success(),
-                                           protobuf_util::FromDurationProto(
-                                               rhs.success().run_time()),
-                                           &rhs);
+        return std::make_tuple(!lhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   lhs.success().run_time()),
+                               &lhs) <
+               std::make_tuple(!rhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   rhs.success().run_time()),
+                               &rhs);
       });
 
   if (best_result != profile_results_end && best_result->has_success()) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 2e34ba96723..6ab9c7a9ece 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -20,12 +20,12 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -50,7 +50,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
  private:
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(
+  StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
       const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;  // never null
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
new file mode 100644
index 00000000000..ec4f6e9c913
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
@@ -0,0 +1,13 @@
+// This is used for convolution logging. Also see
+// tensorflow/core/protobuf/autotuning.proto
+syntax = "proto3";
+
+package xla.gpu;
+
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+message ConvInstructionLog {
+  xla.HloInstructionProto instruction = 1;
+  repeated xla.ShapeProto operand_shapes = 2;
+}
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8f5de683220..9114013251a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -233,6 +233,7 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
     "protobuf/trackable_object_graph.proto",
+    "protobuf/autotuning.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/core/protobuf/autotuning.proto
similarity index 80%
rename from tensorflow/compiler/xla/service/gpu/autotuning.proto
rename to tensorflow/core/protobuf/autotuning.proto
index b4a08963b4f..29e4d00a85f 100644
--- a/tensorflow/compiler/xla/service/gpu/autotuning.proto
+++ b/tensorflow/core/protobuf/autotuning.proto
@@ -1,15 +1,14 @@
-// This file defines protos that store the results of autotuning XLA:GPU
+// This file defines protos that store the results of autotuning various
 // operations.
 //
 // They are in proto format because we want to log them structured. They offer
 // tremendous statistical, testing, and debugging value.
 syntax = "proto3";
 
-package xla.gpu;
+package tensorflow;
 
+import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
-import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
 
 message CudnnVersion {
   int32 major = 1;
@@ -63,19 +62,12 @@ message AutotuneResult {
   }
 }
 
-message AutotuneLog {
-  message Instruction {
-    xla.HloInstructionProto instruction = 1;
-    repeated xla.ShapeProto operand_shapes = 2;
-  }
-
-  oneof instr_oneof {
-    Instruction instr = 1;
-  }
+message AutotuningLog {
+  google.protobuf.Any instr = 1;
 
   // Records all auto-tuning results per algorithm.
-  repeated AutotuneResult results = 3;
+  repeated AutotuneResult results = 2;
 
-  CudnnVersion cudnn_version = 4;
-  ComputeCapability compute_capability = 5;
+  CudnnVersion cudnn_version = 3;
+  ComputeCapability compute_capability = 4;
 }
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index b990f0a7491..890bd837025 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -70,6 +70,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:platform_base",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index 9451e317a13..ba45f8a5b0e 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
+#include "google/protobuf/duration.pb.h"
 #include "absl/strings/string_view.h"
+#include "absl/time/time.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
   const int index_offset_;
 };
 
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
 }  // namespace proto_utils
 }  // namespace tensorflow
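
Reviewer note: a minimal sketch (not part of the patch) of how the relocated pieces fit together, assuming the generated headers from the targets above. The MakeExampleLog/ReadExampleLog helpers, the algorithm id 0, and the 1.25 ms run time are made-up illustrations; PackFrom/UnpackTo are the standard google::protobuf::Any API that the new logging code relies on.

#include "google/protobuf/any.pb.h"
#include "absl/time/time.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"

// Build a log entry the way cudnn_conv_algorithm_picker.cc now does: the
// XLA-specific ConvInstructionLog travels inside a google.protobuf.Any, so
// tensorflow::AutotuningLog itself has no XLA proto dependency.
tensorflow::AutotuningLog MakeExampleLog(
    const xla::gpu::ConvInstructionLog& instr_log) {
  tensorflow::AutotuningLog log;
  log.mutable_instr()->PackFrom(instr_log);

  tensorflow::AutotuneResult& result = *log.add_results();
  result.mutable_conv()->set_algorithm(/*algo_id=*/0);  // illustrative value
  result.mutable_conv()->set_tensor_ops_enabled(false);
  *result.mutable_success()->mutable_run_time() =
      tensorflow::proto_utils::ToDurationProto(absl::Milliseconds(1.25));
  return log;
}

// Consume the log: unpack the Any back into the XLA proto and convert the
// run time back to an absl::Duration with the relocated helper.
void ReadExampleLog(const tensorflow::AutotuningLog& log) {
  xla::gpu::ConvInstructionLog instr_log;
  if (log.instr().UnpackTo(&instr_log)) {
    // instr_log.instruction() and instr_log.operand_shapes() are usable here.
  }
  for (const tensorflow::AutotuneResult& result : log.results()) {
    if (result.has_success()) {
      absl::Duration run_time = tensorflow::proto_utils::FromDurationProto(
          result.success().run_time());
      (void)run_time;  // e.g. feed into the min_element comparison above
    }
  }
}

Routing the XLA-specific ConvInstructionLog through an Any is what lets autotuning.proto live in tensorflow/core without importing hlo.proto or xla_data.proto.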