From b562be27f705034d46e9920ebbcdd293485a7305 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Fri, 1 Mar 2019 14:28:03 -0800 Subject: [PATCH] Move autotuning.proto to tensorflow/core/protobuf. This is in preparation of the next patch, to tensorflow::Logger::LogProto() for TF convolutions. This patch also has a functional change, that is to not log an AutotuneResult if the profile result is not valid. Usually they are just unsupported configurations. Ideally we'd log the exact error message of unsupported configs, but today StreamExecutor doesn't propagate it anyway. PiperOrigin-RevId: 236378741 --- tensorflow/compiler/xla/protobuf_util.h | 15 ----- tensorflow/compiler/xla/service/gpu/BUILD | 14 +++-- .../gpu/cudnn_conv_algorithm_picker.cc | 60 ++++++++++--------- .../service/gpu/cudnn_conv_algorithm_picker.h | 4 +- .../xla/service/gpu/gpu_autotuning.proto | 13 ++++ tensorflow/core/BUILD | 1 + .../gpu => core/protobuf}/autotuning.proto | 24 +++----- tensorflow/core/util/proto/BUILD | 2 + tensorflow/core/util/proto/proto_utils.h | 16 +++++ 9 files changed, 83 insertions(+), 66 deletions(-) create mode 100644 tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto rename tensorflow/{compiler/xla/service/gpu => core/protobuf}/autotuning.proto (80%) diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h index 4a88a48f285..e20a7e95a63 100644 --- a/tensorflow/compiler/xla/protobuf_util.h +++ b/tensorflow/compiler/xla/protobuf_util.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ -#include "google/protobuf/duration.pb.h" #include "absl/time/time.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -45,20 +44,6 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message, // dirpath along as-is. 
void RegisterDirectoryExpander(const std::function& expander); -// Converts an absl::Duration to a google::protobuf::Duration. -inline google::protobuf::Duration ToDurationProto(absl::Duration duration) { - google::protobuf::Duration proto; - proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration)); - proto.set_nanos( - absl::IDivDuration(duration, absl::Nanoseconds(1), &duration)); - return proto; -} - -// Converts a google::protobuf::Duration to an absl::Duration. -inline absl::Duration FromDurationProto(google::protobuf::Duration proto) { - return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos()); -} - } // namespace protobuf_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 3bc0daf9e70..3e4aefa55d3 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -440,15 +440,14 @@ cc_library( srcs = ["cudnn_conv_algorithm_picker.cc"], hdrs = ["cudnn_conv_algorithm_picker.h"], deps = [ - ":autotuning_proto", ":backend_configs", ":buffer_comparator", ":cudnn_conv_runner", + ":gpu_autotuning_proto", ":gpu_executable", ":ir_emission_utils", ":scratch_allocator", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", @@ -456,7 +455,9 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "//tensorflow/core:logger", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/util/proto:proto_utils", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", @@ -776,6 +777,7 @@ cc_library( hdrs = ["gpu_transfer_manager.h"], deps = [ ":gpu_compiler", + ":infeed_manager", ":outfeed_manager", 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -788,7 +790,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", - "//tensorflow/compiler/xla/service/gpu:infeed_manager", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", @@ -1137,8 +1138,8 @@ tf_cc_test( srcs = ["cudnn_fused_conv_rewriter_test.cc"], tags = tf_cuda_tests_tags(), deps = [ + ":ir_emission_utils", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", @@ -1183,10 +1184,11 @@ tf_cc_test( ) xla_proto_library( - name = "autotuning_proto", - srcs = ["autotuning.proto"], + name = "gpu_autotuning_proto", + srcs = ["gpu_autotuning.proto"], deps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/core:protos_all_cc", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 0c4980f6549..02eb191cf58 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -14,21 +14,23 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" +#include "google/protobuf/any.pb.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/time/time.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logger.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/proto/proto_utils.h" namespace xla { namespace gpu { @@ -37,6 +39,7 @@ namespace { using absl::optional; using se::DeviceMemoryBase; using se::dnn::AlgorithmDesc; +using tensorflow::AutotuneResult; std::vector GetAlgorithms(CudnnConvKind kind, se::StreamExecutor* stream_exec) { @@ -94,8 +97,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) { return tensorflow::mutex_lock{it->second}; } -xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { - xla::gpu::CudnnVersion cudnn_version; +tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { + tensorflow::CudnnVersion cudnn_version; if (auto* dnn = stream_executor->AsDnn()) { StatusOr version_or = dnn->GetVersion(); if (version_or.ok()) { @@ -108,9 +111,9 @@ xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) { return cudnn_version; } -xla::gpu::ComputeCapability 
GetComputeCapability( +tensorflow::ComputeCapability GetComputeCapability( se::StreamExecutor* stream_executor) { - xla::gpu::ComputeCapability cc; + tensorflow::ComputeCapability cc; int cc_major, cc_minor; stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major, &cc_minor); @@ -243,25 +246,23 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer, &scratch_allocator, &stream, options); + if (!launch_status.ok()) { + continue; + } + + if (!profile_result.is_valid()) { + continue; + } + profile_results.emplace_back(); AutotuneResult& result = profile_results.back(); result.mutable_conv()->set_algorithm(alg.algo_id()); result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled()); - if (!launch_status.ok()) { - result.set_error_string(launch_status.error_message()); - continue; - } - - if (!profile_result.is_valid()) { - result.set_error_string("Invalid profile result"); - continue; - } - int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); result.mutable_success()->set_scratch_bytes(scratch_bytes_used); *result.mutable_success()->mutable_run_time() = - protobuf_util::ToDurationProto( + tensorflow::proto_utils::ToDurationProto( absl::Milliseconds(profile_result.elapsed_time_in_ms())); const bool crash_on_checking_failure = @@ -308,10 +309,14 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( // Log the autotuning result. 
{ - AutotuneLog log; - *log.mutable_instr()->mutable_instruction() = instr->ToProto(); - for (const auto* op : instr->operands()) { - *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto(); + tensorflow::AutotuningLog log; + { + ConvInstructionLog instr_log; + *instr_log.mutable_instruction() = instr->ToProto(); + for (const auto* op : instr->operands()) { + *instr_log.add_operand_shapes() = op->shape().ToProto(); + } + log.mutable_instr()->PackFrom(instr_log); } for (const auto& profile : profile_results) { *log.add_results() = profile; @@ -330,13 +335,14 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithm( // The successful one should have a smaller key, since we are doing // min_element. If they are both unsuccessful, keep the earlier one in // the vector by comparing pointers. - return std::make_tuple( - !lhs.has_success(), - protobuf_util::FromDurationProto(lhs.success().run_time()), - &lhs) < std::make_tuple(!rhs.has_success(), - protobuf_util::FromDurationProto( - rhs.success().run_time()), - &rhs); + return std::make_tuple(!lhs.has_success(), + tensorflow::proto_utils::FromDurationProto( + lhs.success().run_time()), + &lhs) < + std::make_tuple(!rhs.has_success(), + tensorflow::proto_utils::FromDurationProto( + rhs.success().run_time()), + &rhs); }); if (best_result != profile_results_end && best_result->has_success()) { diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 2e34ba96723..6ab9c7a9ece 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -20,12 +20,12 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/protobuf/autotuning.pb.h" namespace xla { namespace gpu { @@ -50,7 +50,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass { private: StatusOr RunOnComputation(HloComputation* computation); StatusOr RunOnInstruction(HloInstruction* instr); - StatusOr PickBestAlgorithm( + StatusOr PickBestAlgorithm( const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto new file mode 100644 index 00000000000..ec4f6e9c913 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto @@ -0,0 +1,13 @@ +// This is used for convolution logging. 
Also see +// tensorflow/core/protobuf/autotuning.proto +syntax = "proto3"; + +package xla.gpu; + +import "tensorflow/compiler/xla/service/hlo.proto"; +import "tensorflow/compiler/xla/xla_data.proto"; + +message ConvInstructionLog { + xla.HloInstructionProto instruction = 1; + repeated xla.ShapeProto operand_shapes = 2; +} diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8f5de683220..9114013251a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -233,6 +233,7 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS ADDITIONAL_CORE_PROTO_SRCS = [ "example/example_parser_configuration.proto", "protobuf/trackable_object_graph.proto", + "protobuf/autotuning.proto", "protobuf/control_flow.proto", # TODO(ebrevdo): Re-enable once CriticalSection is in core. # "protobuf/critical_section.proto", diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/core/protobuf/autotuning.proto similarity index 80% rename from tensorflow/compiler/xla/service/gpu/autotuning.proto rename to tensorflow/core/protobuf/autotuning.proto index b4a08963b4f..29e4d00a85f 100644 --- a/tensorflow/compiler/xla/service/gpu/autotuning.proto +++ b/tensorflow/core/protobuf/autotuning.proto @@ -1,15 +1,14 @@ -// This file defines protos that store the results of autotuning XLA:GPU +// This file defines protos that store the results of autotuning various // operations. // // They are in proto format because we want to log them structured. They offer // tremendous statistical, testing, and debugging value.
syntax = "proto3"; -package xla.gpu; +package tensorflow; +import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; -import "tensorflow/compiler/xla/xla_data.proto"; -import "tensorflow/compiler/xla/service/hlo.proto"; message CudnnVersion { int32 major = 1; @@ -63,19 +62,12 @@ message AutotuneResult { } } -message AutotuneLog { - message Instruction { - xla.HloInstructionProto instruction = 1; - repeated xla.ShapeProto operand_shapes = 2; - } - - oneof instr_oneof { - Instruction instr = 1; - } +message AutotuningLog { + google.protobuf.Any instr = 1; // Records all auto-tuning results per algorithm. - repeated AutotuneResult results = 3; + repeated AutotuneResult results = 2; - CudnnVersion cudnn_version = 4; - ComputeCapability compute_capability = 5; + CudnnVersion cudnn_version = 3; + ComputeCapability compute_capability = 4; } diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD index b990f0a7491..890bd837025 100644 --- a/tensorflow/core/util/proto/BUILD +++ b/tensorflow/core/util/proto/BUILD @@ -70,6 +70,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:platform_base", "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + "@protobuf_archive//:protobuf_headers", ], ) diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h index 9451e317a13..ba45f8a5b0e 100644 --- a/tensorflow/core/util/proto/proto_utils.h +++ b/tensorflow/core/util/proto/proto_utils.h @@ -16,7 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_ #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_ +#include "google/protobuf/duration.pb.h" #include "absl/strings/string_view.h" +#include "absl/time/time.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/protobuf.h" @@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector { const int index_offset_; }; +// Converts an absl::Duration to a google::protobuf::Duration. +inline google::protobuf::Duration ToDurationProto(absl::Duration duration) { + google::protobuf::Duration proto; + proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration)); + proto.set_nanos( + absl::IDivDuration(duration, absl::Nanoseconds(1), &duration)); + return proto; +} + +// Converts a google::protobuf::Duration to an absl::Duration. +inline absl::Duration FromDurationProto(google::protobuf::Duration proto) { + return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos()); +} + } // namespace proto_utils } // namespace tensorflow