Move autotuning.proto to tensorflow/core/protobuf. This is in preparation for the next patch, which will log TF convolution autotuning results via tensorflow::Logger::LogProto().
This patch also makes one functional change: an AutotuneResult is no longer logged when the profile result is invalid. Invalid results usually just mean unsupported configurations. Ideally we would log the exact error message for each unsupported config, but StreamExecutor does not currently propagate it. PiperOrigin-RevId: 236378741
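For illustration, the per-algorithm loop in cudnn_conv_algorithm_picker.cc now bails out before recording anything. A rough sketch, paraphrased from the diff below (not a standalone program; alg, launch_status, and profile_result are the loop's locals):

    // Sketch of the new loop body: failed or invalid runs are skipped
    // entirely instead of being recorded with an error string.
    if (!launch_status.ok()) {
      continue;  // Failed launches are no longer logged.
    }
    if (!profile_result.is_valid()) {
      continue;  // Usually an unsupported configuration; also not logged.
    }
    profile_results.emplace_back();
    AutotuneResult& result = profile_results.back();
    result.mutable_conv()->set_algorithm(alg.algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());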
This commit is contained in:
parent 4a07787d7d
commit b562be27f7
tensorflow/compiler/xla/protobuf_util.h
@@ -16,7 +16,6 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
#define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_

#include "google/protobuf/duration.pb.h"
#include "absl/time/time.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
@@ -45,20 +44,6 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
// dirpath along as-is.
void RegisterDirectoryExpander(const std::function<string(string)>& expander);

// Converts an absl::Duration to a google::protobuf::Duration.
inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
  google::protobuf::Duration proto;
  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
  proto.set_nanos(
      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
  return proto;
}

// Converts a google::protobuf::Duration to an absl::Duration.
inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
}

}  // namespace protobuf_util
}  // namespace xla

tensorflow/compiler/xla/service/gpu/BUILD
@@ -440,15 +440,14 @@ cc_library(
    srcs = ["cudnn_conv_algorithm_picker.cc"],
    hdrs = ["cudnn_conv_algorithm_picker.h"],
    deps = [
        ":autotuning_proto",
        ":backend_configs",
        ":buffer_comparator",
        ":cudnn_conv_runner",
        ":gpu_autotuning_proto",
        ":gpu_executable",
        ":ir_emission_utils",
        ":scratch_allocator",
        "//tensorflow/compiler/xla:literal_util",
        "//tensorflow/compiler/xla:protobuf_util",
        "//tensorflow/compiler/xla/service:compiler",
        "//tensorflow/compiler/xla/service:device_memory_allocator",
        "//tensorflow/compiler/xla/service:hlo",
@@ -456,7 +455,9 @@ cc_library(
        "//tensorflow/compiler/xla/service:hlo_pass",
        "//tensorflow/core:lib",
        "//tensorflow/core:logger",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:stream_executor_no_cuda",
        "//tensorflow/core/util/proto:proto_utils",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/time",
@@ -776,6 +777,7 @@ cc_library(
    hdrs = ["gpu_transfer_manager.h"],
    deps = [
        ":gpu_compiler",
        ":infeed_manager",
        ":outfeed_manager",
        "//tensorflow/compiler/xla:literal",
        "//tensorflow/compiler/xla:literal_util",
@@ -788,7 +790,6 @@ cc_library(
        "//tensorflow/compiler/xla:xla_data_proto",
        "//tensorflow/compiler/xla/service:generic_transfer_manager",
        "//tensorflow/compiler/xla/service:transfer_manager",
        "//tensorflow/compiler/xla/service/gpu:infeed_manager",
        "//tensorflow/core:lib",
        "//tensorflow/core:stream_executor_no_cuda",
        "@com_google_absl//absl/memory",
@@ -1137,8 +1138,8 @@ tf_cc_test(
    srcs = ["cudnn_fused_conv_rewriter_test.cc"],
    tags = tf_cuda_tests_tags(),
    deps = [
        ":ir_emission_utils",
        "//tensorflow/compiler/xla/service:hlo_parser",
        "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
        "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
        "//tensorflow/compiler/xla/tests:hlo_test_base",
        "//tensorflow/core:test",
@@ -1183,10 +1184,11 @@ tf_cc_test(
)

xla_proto_library(
    name = "autotuning_proto",
    srcs = ["autotuning.proto"],
    name = "gpu_autotuning_proto",
    srcs = ["gpu_autotuning.proto"],
    deps = [
        "//tensorflow/compiler/xla:xla_data_proto",
        "//tensorflow/compiler/xla/service:hlo_proto",
        "//tensorflow/core:protos_all_cc",
    ],
)
tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -14,21 +14,23 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
#include "google/protobuf/any.pb.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/protobuf_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/util/proto/proto_utils.h"

namespace xla {
namespace gpu {
@@ -37,6 +39,7 @@ namespace {
using absl::optional;
using se::DeviceMemoryBase;
using se::dnn::AlgorithmDesc;
using tensorflow::AutotuneResult;

std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                         se::StreamExecutor* stream_exec) {
@@ -94,8 +97,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
  return tensorflow::mutex_lock{it->second};
}

xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  xla::gpu::CudnnVersion cudnn_version;
tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  tensorflow::CudnnVersion cudnn_version;
  if (auto* dnn = stream_executor->AsDnn()) {
    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
    if (version_or.ok()) {
@@ -108,9 +111,9 @@ xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  return cudnn_version;
}

xla::gpu::ComputeCapability GetComputeCapability(
tensorflow::ComputeCapability GetComputeCapability(
    se::StreamExecutor* stream_executor) {
  xla::gpu::ComputeCapability cc;
  tensorflow::ComputeCapability cc;
  int cc_major, cc_minor;
  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                  &cc_minor);
@@ -243,25 +246,23 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
        RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                     &scratch_allocator, &stream, options);

    if (!launch_status.ok()) {
      continue;
    }

    if (!profile_result.is_valid()) {
      continue;
    }

    profile_results.emplace_back();
    AutotuneResult& result = profile_results.back();
    result.mutable_conv()->set_algorithm(alg.algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

    if (!launch_status.ok()) {
      result.set_error_string(launch_status.error_message());
      continue;
    }

    if (!profile_result.is_valid()) {
      result.set_error_string("Invalid profile result");
      continue;
    }

    int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
    result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
    *result.mutable_success()->mutable_run_time() =
        protobuf_util::ToDurationProto(
        tensorflow::proto_utils::ToDurationProto(
            absl::Milliseconds(profile_result.elapsed_time_in_ms()));

    const bool crash_on_checking_failure =
@@ -308,10 +309,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(

  // Log the autotuning result.
  {
    AutotuneLog log;
    *log.mutable_instr()->mutable_instruction() = instr->ToProto();
    for (const auto* op : instr->operands()) {
      *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto();
    tensorflow::AutotuningLog log;
    {
      ConvInstructionLog instr_log;
      *instr_log.mutable_instruction() = instr->ToProto();
      for (const auto* op : instr->operands()) {
        *instr_log.add_operand_shapes() = op->shape().ToProto();
      }
      log.mutable_instr()->PackFrom(instr_log);
    }
    for (const auto& profile : profile_results) {
      *log.add_results() = profile;
@@ -330,13 +335,14 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
        // The successful one should have a smaller key, since we are doing
        // min_element. If they are both unsuccessful, keep the earlier one in
        // the vector by comparing pointers.
        return std::make_tuple(
                   !lhs.has_success(),
                   protobuf_util::FromDurationProto(lhs.success().run_time()),
                   &lhs) < std::make_tuple(!rhs.has_success(),
                                           protobuf_util::FromDurationProto(
                                               rhs.success().run_time()),
                                           &rhs);
        return std::make_tuple(!lhs.has_success(),
                               tensorflow::proto_utils::FromDurationProto(
                                   lhs.success().run_time()),
                               &lhs) <
               std::make_tuple(!rhs.has_success(),
                               tensorflow::proto_utils::FromDurationProto(
                                   rhs.success().run_time()),
                               &rhs);
      });

  if (best_result != profile_results_end && best_result->has_success()) {
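The tuple trick in the comparator above keys each result on (!has_success, run_time, pointer), so std::tuple's lexicographic operator< puts successful results first, faster ones next, and falls back to address order for stability. A self-contained toy version (illustrative only, simplified from the real comparator):

    #include <algorithm>
    #include <tuple>
    #include <vector>

    struct Result {
      bool has_success;
      double run_time_ms;
    };

    int main() {
      std::vector<Result> results = {{false, 0.0}, {true, 2.5}, {true, 1.75}};
      auto best = std::min_element(
          results.begin(), results.end(),
          [](const Result& lhs, const Result& rhs) {
            // false < true, so successful results (!has_success == false)
            // sort first; ties fall through to run time, then to address.
            return std::make_tuple(!lhs.has_success, lhs.run_time_ms, &lhs) <
                   std::make_tuple(!rhs.has_success, rhs.run_time_ms, &rhs);
          });
      // best now points to {true, 1.75}.
    }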
tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -20,12 +20,12 @@ limitations under the License.
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h"
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"

namespace xla {
namespace gpu {
@@ -50,7 +50,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
 private:
  StatusOr<bool> RunOnComputation(HloComputation* computation);
  StatusOr<bool> RunOnInstruction(HloInstruction* instr);
  StatusOr<AutotuneResult> PickBestAlgorithm(
  StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
      const HloCustomCallInstruction* instr);

  se::StreamExecutor* stream_exec_;  // never null
tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto (new file, 13 lines)
@@ -0,0 +1,13 @@
// This is used for convolution logging. Also see
// tensorflow/core/protobuf/autotuning.proto.
syntax = "proto3";

package xla.gpu;

import "tensorflow/compiler/xla/service/hlo.proto";
import "tensorflow/compiler/xla/xla_data.proto";

message ConvInstructionLog {
  xla.HloInstructionProto instruction = 1;
  repeated xla.ShapeProto operand_shapes = 2;
}
tensorflow/core/BUILD
@@ -233,6 +233,7 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
ADDITIONAL_CORE_PROTO_SRCS = [
    "example/example_parser_configuration.proto",
    "protobuf/trackable_object_graph.proto",
    "protobuf/autotuning.proto",
    "protobuf/control_flow.proto",
    # TODO(ebrevdo): Re-enable once CriticalSection is in core.
    # "protobuf/critical_section.proto",
tensorflow/compiler/xla/service/gpu/autotuning.proto → tensorflow/core/protobuf/autotuning.proto
@@ -1,15 +1,14 @@
// This file defines protos that store the results of autotuning XLA:GPU
// This file defines protos that store the results of autotuning various
// operations.
//
// They are in proto format because we want to log them structured. They offer
// tremendous statistical, testing, and debugging value.
syntax = "proto3";

package xla.gpu;
package tensorflow;

import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "tensorflow/compiler/xla/xla_data.proto";
import "tensorflow/compiler/xla/service/hlo.proto";

message CudnnVersion {
  int32 major = 1;
@@ -63,19 +62,12 @@ message AutotuneResult {
  }
}

message AutotuneLog {
  message Instruction {
    xla.HloInstructionProto instruction = 1;
    repeated xla.ShapeProto operand_shapes = 2;
  }

  oneof instr_oneof {
    Instruction instr = 1;
  }
message AutotuningLog {
  google.protobuf.Any instr = 1;

  // Records all auto-tuning results per algorithm.
  repeated AutotuneResult results = 3;
  repeated AutotuneResult results = 2;

  CudnnVersion cudnn_version = 4;
  ComputeCapability compute_capability = 5;
  CudnnVersion cudnn_version = 3;
  ComputeCapability compute_capability = 4;
}
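Since AutotuningLog now carries the instruction as a google.protobuf.Any instead of a dedicated oneof, producers pack whatever per-backend message they have and consumers unpack by type. A minimal C++ sketch of both sides (hypothetical standalone helpers; the PackFrom call mirrors the one in cudnn_conv_algorithm_picker.cc above):

    #include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
    #include "tensorflow/core/protobuf/autotuning.pb.h"

    // Producer side: wrap a ConvInstructionLog in the Any field.
    tensorflow::AutotuningLog MakeConvLog(
        const xla::gpu::ConvInstructionLog& instr_log) {
      tensorflow::AutotuningLog log;
      log.mutable_instr()->PackFrom(instr_log);
      return log;
    }

    // Consumer side: UnpackTo() returns false if the Any holds a different
    // type, so readers that don't know ConvInstructionLog simply skip it.
    bool ReadConvLog(const tensorflow::AutotuningLog& log,
                     xla::gpu::ConvInstructionLog* instr_log) {
      return log.instr().UnpackTo(instr_log);
    }

Renumbering the remaining fields (results, cudnn_version, compute_capability) is presumably safe here only because the message also changed name and package, so no serialized AutotuneLog data can be mistaken for the new AutotuningLog.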
tensorflow/core/util/proto/BUILD
@@ -70,6 +70,8 @@ cc_library(
        "//tensorflow/core:lib",
        "//tensorflow/core:platform_base",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/time",
        "@protobuf_archive//:protobuf_headers",
    ],
)

tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
#define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_

#include "google/protobuf/duration.pb.h"
#include "absl/strings/string_view.h"
#include "absl/time/time.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/protobuf.h"
@@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
  const int index_offset_;
};

// Converts an absl::Duration to a google::protobuf::Duration.
inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
  google::protobuf::Duration proto;
  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
  proto.set_nanos(
      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
  return proto;
}

// Converts a google::protobuf::Duration to an absl::Duration.
inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
}

}  // namespace proto_utils
}  // namespace tensorflow
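As a quick illustration of the helpers that moved here, the following sketch (my example, not part of the patch) round-trips the kind of value the algorithm picker stores:

    #include "absl/time/time.h"
    #include "tensorflow/core/platform/logging.h"
    #include "tensorflow/core/util/proto/proto_utils.h"

    void DurationRoundTripExample() {
      // e.g. a cudnn profile time, as stored by the algorithm picker above.
      absl::Duration elapsed = absl::Milliseconds(1.5);
      google::protobuf::Duration proto =
          tensorflow::proto_utils::ToDurationProto(elapsed);
      // proto.seconds() == 0, proto.nanos() == 1500000.
      absl::Duration back = tensorflow::proto_utils::FromDurationProto(proto);
      CHECK_EQ(back, elapsed);  // Lossless at nanosecond granularity.
    }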