From 401ab92175eec5a3c762aa2686120ed6078942e1 Mon Sep 17 00:00:00 2001 From: George Karpenkov <cheshire@google.com> Date: Mon, 8 Apr 2019 14:22:51 -0700 Subject: [PATCH] Add an auto-tuning process-global "database" to XLA convolution algorithm picker. As a side effect, adds hashing support to Shape. PiperOrigin-RevId: 242538365 --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/layout.h | 12 +++ tensorflow/compiler/xla/service/gpu/BUILD | 3 + .../gpu/cudnn_conv_algorithm_picker.cc | 97 ++++++++++++++++--- .../service/gpu/cudnn_conv_algorithm_picker.h | 3 + .../xla/service/gpu/cudnn_conv_runner.cc | 1 - tensorflow/compiler/xla/shape.h | 6 ++ tensorflow/compiler/xla/shape_test.cc | 8 ++ 8 files changed, 114 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 26117ae6bd0..1e5d170d6ce 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -287,6 +287,7 @@ tf_cc_test( ":xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:test_main", + "@com_google_absl//absl/hash:hash_testing", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h index 63b2a566535..4721c9fcaa1 100644 --- a/tensorflow/compiler/xla/layout.h +++ b/tensorflow/compiler/xla/layout.h @@ -69,6 +69,11 @@ class Tile { // combined with the next minor dimension before tiling is applied. static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min(); + template <typename H> + friend H AbslHashValue(H h, const Tile& t) { + return H::combine(std::move(h), t.dimensions_); + } + private: // The bounds of the tile. std::vector<int64> dimensions_; @@ -212,6 +217,13 @@ class Layout { element_size_in_bits_ = 0; } + template <typename H> + friend H AbslHashValue(H h, const Layout& l) { + return H::combine(std::move(h), l.format_, l.minor_to_major_, + l.max_sparse_elements_, l.tiles_, + l.element_size_in_bits_); + } + private: // The format of this layout. 
Format format_ = INVALID_FORMAT; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index dc45f73efb7..f425d5f96ac 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -468,6 +468,7 @@ cc_library( ":ir_emission_utils", ":redzone_allocator", "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", @@ -478,7 +479,9 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:logger", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/kernels:conv_ops", "//tensorflow/core/util/proto:proto_utils", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 46e8c416c63..1f33e6ab0de 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" + #include "google/protobuf/any.pb.h" +#include "absl/algorithm/container.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/time/time.h" @@ -27,6 +29,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logger.h" @@ -179,33 +183,88 @@ bool CheckRedzones(const RedzoneAllocator& allocator, se::Stream* stream, return false; } +using ConvCacheKey = + std::tuple<se::StreamExecutor*, std::string, std::string, Shape, + std::vector<Shape>, std::string, std::string, int64>; + +struct ConvCacheStats { + int64 cache_hits = 0; + int64 cache_misses = 0; + + void LogStats() { + VLOG(1) << "Cache hits: " << cache_hits; + VLOG(1) << "Cache misses: " << cache_misses; + } +}; + +StatusOr<ConvCacheKey> AutotuneCacheKeyfromInstruction( + const HloCustomCallInstruction* conv, se::StreamExecutor* se) { + TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, + conv->backend_config<CudnnConvBackendConfig>()); + std::vector<Shape> operand_shapes; + absl::c_transform(conv->operands(), std::back_inserter(operand_shapes), + [&](const HloInstruction* op) { return op->shape(); }); + + return std::make_tuple( + se, backend_config.SerializeAsString(), conv->custom_call_target(), + conv->shape(), std::move(operand_shapes), + conv->window().SerializeAsString(), + conv->convolution_dimension_numbers().SerializeAsString(), + conv->feature_group_count()); +} + +tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED); +auto& autotune_cache GUARDED_BY(autotune_cache_lock) = + *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>(); +auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) = + *new ConvCacheStats(); } // anonymous namespace -// We could have caching here so that we don't redo this work for two identical -// convolutions. Unfortunately our cache key would have to be a tuple -// containing the protos passed to this function, and we have no utility for -// hashing protos. 
We could write our own hash functions, but they'd silently -// break if we ever added a field to one of the protos. Perhaps we could hack -// using the binary-encoded proto as the hash key, on the assumption that two -// protos being binary-equal is a sufficient, if not necessary, condition for -// proper equality. But that would still leave us open to having unnecessary -// cache misses and doing extra work. Overall, caching doesn't seem worth the -// trouble, but we may want to revisit this if we ever find a model where -// caching would speed up compilation a lot. StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm( const HloCustomCallInstruction* instr) { - XLA_SCOPED_LOGGING_TIMER(absl::StrCat( - "CudnnConvAlgorithmPicker::PickBestAlgorithm for ", instr->ToString())); - - const Shape& result_shape = instr->shape().tuple_shapes(0); - // Don't run this function concurrently on the same GPU. // // This is a bit of a hack and doesn't protect us against arbitrary concurrent // use of a GPU, but it's sufficient to let us compile two HLO modules // concurrently and then run them sequentially. + // + // Putting the lock in here rather than in PickBestAlgorithmNoCache lets us + // avoid ever doing duplicate work. If we have a cache miss, only one thread + // will run PickBestAlgorithmImpl for a particular device. tensorflow::mutex_lock lock = LockGpu(stream_exec_); + // We cache the autotuning results to avoid doing the duplicate work, + // which can greatly improve both stability (deterministic numeric results + // within a process for a given input) and performance (2x speedup on some + // models). 
+ TF_ASSIGN_OR_RETURN(ConvCacheKey key, + AutotuneCacheKeyfromInstruction(instr, stream_exec_)); + { + tensorflow::mutex_lock lock(autotune_cache_lock); + auto it = autotune_cache.find(key); + if (it != autotune_cache.end()) { + autotune_cache_stats.cache_hits++; + return it->second; + } + autotune_cache_stats.cache_misses++; + } + + StatusOr<AutotuneResult> result_or = PickBestAlgorithmNoCache(instr); + if (result_or.ok()) { + tensorflow::mutex_lock lock(autotune_cache_lock); + CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second); + } + return result_or; +} + +StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( + const HloCustomCallInstruction* instr) { + XLA_SCOPED_LOGGING_TIMER( + absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithmImpl for ", + instr->ToString())); + + const Shape& result_shape = instr->shape().tuple_shapes(0); + // Make sure any previous activity on this executor is done. We don't want to // interfere with programs that are still running on the GPU. if (!stream_exec_->SynchronizeAllActivity()) { @@ -543,6 +602,12 @@ StatusOr<bool> CudnnConvAlgorithmPicker::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); changed |= result; } + + { + tensorflow::mutex_lock lock(autotune_cache_lock); + autotune_cache_stats.LogStats(); + } + + return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 6ab9c7a9ece..ee3a5fee870 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/kernels/conv_ops_fused_impl.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/protobuf/autotuning.pb.h" @@ -52,6 +53,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass { StatusOr<bool> RunOnInstruction(HloInstruction* instr); StatusOr<AutotuneResult> PickBestAlgorithm( const HloCustomCallInstruction* instr); + StatusOr<AutotuneResult> PickBestAlgorithmNoCache( + const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null DeviceMemoryAllocator* allocator_; // may be null diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index b628f27f4b2..cd0198e2cb9 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -364,7 +364,6 @@ StatusOr<CudnnConvParams> GetCudnnConvParams( params.output_buf = operand_buffers[1]; break; case CudnnConvKind::kForwardActivation: { - params.kind = CudnnConvKind::kForwardActivation; params.input_shape = &lhs_shape; params.filter_shape = &rhs_shape; params.output_shape = &conv_result_shape; diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h index 0b8530dd929..a31bf0f8683 100644 --- a/tensorflow/compiler/xla/shape.h +++ b/tensorflow/compiler/xla/shape.h @@ -200,6 +200,12 @@ class Shape { bool operator==(const Shape& other) const { return Equal()(*this, other); } bool operator!=(const Shape& other) const { return !(*this == other); } + template <typename H> + friend H AbslHashValue(H h, const Shape& s) { + return H::combine(std::move(h), s.element_type_, s.dimensions_, + s.dynamic_dimensions_, s.tuple_shapes_, s.layout_); + } + private: // The element type of this shape (tuple, array, etc). 
PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID; diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc index dbdafcc0a1f..aa6c7d10989 100644 --- a/tensorflow/compiler/xla/shape_test.cc +++ b/tensorflow/compiler/xla/shape_test.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/shape.h" #include <numeric> + +#include "absl/hash/hash_testing.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -210,5 +212,11 @@ TEST_F(ShapeTest, ProgramShapeToString) { prog.ToString()); } +TEST_F(ShapeTest, SupportsAbslHash) { + EXPECT_TRUE(absl::VerifyTypeImplementsAbslHashCorrectly( + {opaque_, token_, scalar_, scalar_with_tile_, matrix_, matrix2_, tuple_, + nested_tuple_, dyanmic_matrix_})); +} + } // namespace } // namespace xla