diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h
index 4a88a48f285..e20a7e95a63 100644
--- a/tensorflow/compiler/xla/protobuf_util.h
+++ b/tensorflow/compiler/xla/protobuf_util.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 
-#include "google/protobuf/duration.pb.h"
 #include "absl/time/time.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -45,20 +44,6 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
 // dirpath along as-is.
 void RegisterDirectoryExpander(const std::function<string(string)>& expander);
 
-// Converts an absl::Duration to a google::protobuf::Duration.
-inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
-  google::protobuf::Duration proto;
-  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
-  proto.set_nanos(
-      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
-  return proto;
-}
-
-// Converts a google::protobuf::Duration to an absl::Duration.
-inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
-  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
-}
-
 }  // namespace protobuf_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 3bc0daf9e70..3e4aefa55d3 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -440,15 +440,14 @@ cc_library(
     srcs = ["cudnn_conv_algorithm_picker.cc"],
     hdrs = ["cudnn_conv_algorithm_picker.h"],
     deps = [
-        ":autotuning_proto",
         ":backend_configs",
         ":buffer_comparator",
         ":cudnn_conv_runner",
+        ":gpu_autotuning_proto",
        ":gpu_executable",
         ":ir_emission_utils",
         ":scratch_allocator",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
@@ -456,7 +455,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "//tensorflow/core:logger",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/util/proto:proto_utils",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
@@ -776,6 +777,7 @@ cc_library(
     hdrs = ["gpu_transfer_manager.h"],
     deps = [
         ":gpu_compiler",
+        ":infeed_manager",
         ":outfeed_manager",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -788,7 +790,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
-        "//tensorflow/compiler/xla/service/gpu:infeed_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/memory",
@@ -1137,8 +1138,8 @@ tf_cc_test(
     srcs = ["cudnn_fused_conv_rewriter_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
+        ":ir_emission_utils",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
@@ -1183,10 +1184,11 @@ tf_cc_test(
 )
 
 xla_proto_library(
-    name = "autotuning_proto",
-    srcs = ["autotuning.proto"],
+    name = "gpu_autotuning_proto",
+    srcs = ["gpu_autotuning.proto"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 0c4980f6549..02eb191cf58 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -14,21 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
+#include "google/protobuf/any.pb.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 
 namespace xla {
 namespace gpu {
@@ -37,6 +39,7 @@ namespace {
 using absl::optional;
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmDesc;
+using tensorflow::AutotuneResult;
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          se::StreamExecutor* stream_exec) {
@@ -94,8 +97,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
   return tensorflow::mutex_lock{it->second};
 }
 
-xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
-  xla::gpu::CudnnVersion cudnn_version;
+tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  tensorflow::CudnnVersion cudnn_version;
   if (auto* dnn = stream_executor->AsDnn()) {
     StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
     if (version_or.ok()) {
@@ -108,9 +111,9 @@ xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
   return cudnn_version;
 }
 
-xla::gpu::ComputeCapability GetComputeCapability(
+tensorflow::ComputeCapability GetComputeCapability(
     se::StreamExecutor* stream_executor) {
-  xla::gpu::ComputeCapability cc;
+  tensorflow::ComputeCapability cc;
   int cc_major, cc_minor;
   stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                   &cc_minor);
@@ -243,25 +246,23 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                      &scratch_allocator, &stream, options);
 
+    if (!launch_status.ok()) {
+      continue;
+    }
+
+    if (!profile_result.is_valid()) {
+      continue;
+    }
+
     profile_results.emplace_back();
     AutotuneResult& result = profile_results.back();
     result.mutable_conv()->set_algorithm(alg.algo_id());
     result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
 
-    if (!launch_status.ok()) {
-      result.set_error_string(launch_status.error_message());
-      continue;
-    }
-
-    if (!profile_result.is_valid()) {
-      result.set_error_string("Invalid profile result");
-      continue;
-    }
-
     int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
     result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
     *result.mutable_success()->mutable_run_time() =
-        protobuf_util::ToDurationProto(
+        tensorflow::proto_utils::ToDurationProto(
            absl::Milliseconds(profile_result.elapsed_time_in_ms()));
 
     const bool crash_on_checking_failure =
@@ -308,10 +309,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
 
   // Log the autotuning result.
   {
-    AutotuneLog log;
-    *log.mutable_instr()->mutable_instruction() = instr->ToProto();
-    for (const auto* op : instr->operands()) {
-      *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto();
+    tensorflow::AutotuningLog log;
+    {
+      ConvInstructionLog instr_log;
+      *instr_log.mutable_instruction() = instr->ToProto();
+      for (const auto* op : instr->operands()) {
+        *instr_log.add_operand_shapes() = op->shape().ToProto();
+      }
+      log.mutable_instr()->PackFrom(instr_log);
     }
     for (const auto& profile : profile_results) {
       *log.add_results() = profile;
@@ -330,13 +335,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
         // The successful one should have a smaller key, since we are doing
         // min_element. If they are both unsuccessful, keep the earlier one in
         // the vector by comparing pointers.
-        return std::make_tuple(
-                   !lhs.has_success(),
-                   protobuf_util::FromDurationProto(lhs.success().run_time()),
-                   &lhs) < std::make_tuple(!rhs.has_success(),
-                                           protobuf_util::FromDurationProto(
-                                               rhs.success().run_time()),
-                                           &rhs);
+        return std::make_tuple(!lhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   lhs.success().run_time()),
+                               &lhs) <
+               std::make_tuple(!rhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   rhs.success().run_time()),
+                               &rhs);
       });
 
   if (best_result != profile_results_end && best_result->has_success()) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 2e34ba96723..6ab9c7a9ece 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -20,12 +20,12 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -50,7 +50,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
  private:
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(
+  StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
       const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;  // never null
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
new file mode 100644
index 00000000000..ec4f6e9c913
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
@@ -0,0 +1,13 @@
+// This is used for convolution logging. Also see
+// tensorflow/core/protobuf/autotuning.proto
+syntax = "proto3";
+
+package xla.gpu;
+
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+message ConvInstructionLog {
+  xla.HloInstructionProto instruction = 1;
+  repeated xla.ShapeProto operand_shapes = 2;
+}
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8f5de683220..9114013251a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -233,6 +233,7 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
     "protobuf/trackable_object_graph.proto",
+    "protobuf/autotuning.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/core/protobuf/autotuning.proto
similarity index 80%
rename from tensorflow/compiler/xla/service/gpu/autotuning.proto
rename to tensorflow/core/protobuf/autotuning.proto
index b4a08963b4f..29e4d00a85f 100644
--- a/tensorflow/compiler/xla/service/gpu/autotuning.proto
+++ b/tensorflow/core/protobuf/autotuning.proto
@@ -1,15 +1,14 @@
-// This file defines protos that store the results of autotuning XLA:GPU
+// This file defines protos that store the results of autotuning various
 // operations.
 //
 // They are in proto format because we want to log them structured. They offer
 // tremendous statistical, testing, and debugging value.
 syntax = "proto3";
 
-package xla.gpu;
+package tensorflow;
 
+import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
-import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
 
 message CudnnVersion {
   int32 major = 1;
@@ -63,19 +62,12 @@ message AutotuneResult {
   }
 }
 
-message AutotuneLog {
-  message Instruction {
-    xla.HloInstructionProto instruction = 1;
-    repeated xla.ShapeProto operand_shapes = 2;
-  }
-
-  oneof instr_oneof {
-    Instruction instr = 1;
-  }
+message AutotuningLog {
+  google.protobuf.Any instr = 1;
 
   // Records all auto-tuning results per algorithm.
-  repeated AutotuneResult results = 3;
+  repeated AutotuneResult results = 2;
 
-  CudnnVersion cudnn_version = 4;
-  ComputeCapability compute_capability = 5;
+  CudnnVersion cudnn_version = 3;
+  ComputeCapability compute_capability = 4;
 }
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index b990f0a7491..890bd837025 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -70,6 +70,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:platform_base",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index 9451e317a13..ba45f8a5b0e 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
+#include "google/protobuf/duration.pb.h"
 #include "absl/strings/string_view.h"
+#include "absl/time/time.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
   const int index_offset_;
 };
 
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
 }  // namespace proto_utils
 }  // namespace tensorflow
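
Reviewer note: a minimal sketch (not part of the patch) of how the relocated pieces fit together, assuming the generated headers from the targets above. The MakeExampleLog/ReadExampleLog helpers, the algorithm id 0, and the 1.25 ms run time are made-up illustrations; PackFrom/UnpackTo are the standard google::protobuf::Any API that the new logging code relies on.

#include "google/protobuf/any.pb.h"
#include "absl/time/time.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"

// Build a log entry the way cudnn_conv_algorithm_picker.cc now does: the
// XLA-specific ConvInstructionLog travels inside a google.protobuf.Any, so
// tensorflow::AutotuningLog itself has no XLA proto dependency.
tensorflow::AutotuningLog MakeExampleLog(
    const xla::gpu::ConvInstructionLog& instr_log) {
  tensorflow::AutotuningLog log;
  log.mutable_instr()->PackFrom(instr_log);

  tensorflow::AutotuneResult& result = *log.add_results();
  result.mutable_conv()->set_algorithm(/*algo_id=*/0);  // illustrative value
  result.mutable_conv()->set_tensor_ops_enabled(false);
  *result.mutable_success()->mutable_run_time() =
      tensorflow::proto_utils::ToDurationProto(absl::Milliseconds(1.25));
  return log;
}

// Consume the log: unpack the Any back into the XLA proto and convert the
// run time back to an absl::Duration with the relocated helper.
void ReadExampleLog(const tensorflow::AutotuningLog& log) {
  xla::gpu::ConvInstructionLog instr_log;
  if (log.instr().UnpackTo(&instr_log)) {
    // instr_log.instruction() and instr_log.operand_shapes() are usable here.
  }
  for (const tensorflow::AutotuneResult& result : log.results()) {
    if (result.has_success()) {
      absl::Duration run_time = tensorflow::proto_utils::FromDurationProto(
          result.success().run_time());
      (void)run_time;  // e.g. feed into the min_element comparison above
    }
  }
}

Routing the XLA-specific ConvInstructionLog through an Any is what lets autotuning.proto live in tensorflow/core without importing hlo.proto or xla_data.proto.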