From 401ab92175eec5a3c762aa2686120ed6078942e1 Mon Sep 17 00:00:00 2001 From: George Karpenkov <cheshire@google.com> Date: Mon, 8 Apr 2019 14:22:51 -0700 Subject: [PATCH] Add an auto-tuning process-global "database" to XLA convolution algorithm picker. As a side effect, adds hashing support to Shape. PiperOrigin-RevId: 242538365 --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/layout.h | 12 +++ tensorflow/compiler/xla/service/gpu/BUILD | 3 + .../gpu/cudnn_conv_algorithm_picker.cc | 97 ++++++++++++++++--- .../service/gpu/cudnn_conv_algorithm_picker.h | 3 + .../xla/service/gpu/cudnn_conv_runner.cc | 1 - tensorflow/compiler/xla/shape.h | 6 ++ tensorflow/compiler/xla/shape_test.cc | 8 ++ 8 files changed, 114 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 26117ae6bd0..1e5d170d6ce 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -287,6 +287,7 @@ tf_cc_test( ":xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:test_main", + "@com_google_absl//absl/hash:hash_testing", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h index 63b2a566535..4721c9fcaa1 100644 --- a/tensorflow/compiler/xla/layout.h +++ b/tensorflow/compiler/xla/layout.h @@ -69,6 +69,11 @@ class Tile { // combined with the next minor dimension before tiling is applied. static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min(); + template <typename H> + friend H AbslHashValue(H h, const Tile& t) { + return H::combine(std::move(h), t.dimensions_); + } + private: // The bounds of the tile. std::vector<int64> dimensions_; @@ -212,6 +217,13 @@ class Layout { element_size_in_bits_ = 0; } + template <typename H> + friend H AbslHashValue(H h, const Layout& l) { + return H::combine(std::move(h), l.format_, l.minor_to_major_, + l.max_sparse_elements_, l.tiles_, + l.element_size_in_bits_); + } + private: // The format of this layout. 
Format format_ = INVALID_FORMAT; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index dc45f73efb7..f425d5f96ac 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -468,6 +468,7 @@ cc_library( ":ir_emission_utils", ":redzone_allocator", "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", @@ -478,7 +479,9 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:logger", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/kernels:conv_ops", "//tensorflow/core/util/proto:proto_utils", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 46e8c416c63..1f33e6ab0de 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" + #include "google/protobuf/any.pb.h" +#include "absl/algorithm/container.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/time/time.h" @@ -27,6 +29,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logger.h" @@ -179,33 +183,88 @@ bool CheckRedzones(const RedzoneAllocator& allocator, se::Stream* stream, return false; } +using ConvCacheKey = + std::tuple<se::StreamExecutor*, std::string, std::string, Shape, + std::vector<Shape>, std::string, std::string, int64>; + +struct ConvCacheStats { + int64 cache_hits = 0; + int64 cache_misses = 0; + + void LogStats() { + VLOG(1) << "Cache hits: " << cache_hits; + VLOG(1) << "Cache misses: " << cache_misses; + } +}; + +StatusOr<ConvCacheKey> AutotuneCacheKeyfromInstruction( + const HloCustomCallInstruction* conv, se::StreamExecutor* se) { + TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, + conv->backend_config<CudnnConvBackendConfig>()); + std::vector<Shape> operand_shapes; + absl::c_transform(conv->operands(), std::back_inserter(operand_shapes), + [&](const HloInstruction* op) { return op->shape(); }); + + return std::make_tuple( + se, backend_config.SerializeAsString(), conv->custom_call_target(), + conv->shape(), std::move(operand_shapes), + conv->window().SerializeAsString(), + conv->convolution_dimension_numbers().SerializeAsString(), + conv->feature_group_count()); +} + +tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED); +auto& autotune_cache GUARDED_BY(autotune_cache_lock) = + *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>(); +auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) = + *new ConvCacheStats(); } // anonymous namespace -// We could have caching here so that we don't redo this work for two identical -// convolutions. Unfortunately our cache key would have to be a tuple -// containing the protos passed to this function, and we have no utility for -// hashing protos. 
We could write our own hash functions, but they'd silently -// break if we ever added a field to one of the protos. Perhaps we could hack -// using the binary-encoded proto as the hash key, on the assumption that two -// protos being binary-equal is a sufficient, if not necessary, condition for -// proper equality. But that would still leave us open to having unnecessary -// cache misses and doing extra work. Overall, caching doesn't seem worth the -// trouble, but we may want to revisit this if we ever find a model where -// caching would speed up compilation a lot. StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm( const HloCustomCallInstruction* instr) { - XLA_SCOPED_LOGGING_TIMER(absl::StrCat( - "CudnnConvAlgorithmPicker::PickBestAlgorithm for ", instr->ToString())); - - const Shape& result_shape = instr->shape().tuple_shapes(0); - // Don't run this function concurrently on the same GPU. // // This is a bit of a hack and doesn't protect us against arbitrary concurrent // use of a GPU, but it's sufficient to let us compile two HLO modules // concurrently and then run them sequentially. + // + // Putting the lock in here rather than in PickBestAlgorithmNoCache lets us + // avoid ever doing duplicate work. If we have a cache miss, only one thread + // will run PickBestAlgorithmImpl for a particular device. tensorflow::mutex_lock lock = LockGpu(stream_exec_); + // We cache the autotuning results to avoid doing the duplicate work, + // which can greatly improve both stability (deterministic numeric results + // within a process for a given input) and performance (2x speedup on some + // models). 
+ TF_ASSIGN_OR_RETURN(ConvCacheKey key, + AutotuneCacheKeyfromInstruction(instr, stream_exec_)); + { + tensorflow::mutex_lock lock(autotune_cache_lock); + auto it = autotune_cache.find(key); + if (it != autotune_cache.end()) { + autotune_cache_stats.cache_hits++; + return it->second; + } + autotune_cache_stats.cache_misses++; + } + + StatusOr<AutotuneResult> result_or = PickBestAlgorithmNoCache(instr); + if (result_or.ok()) { + tensorflow::mutex_lock lock(autotune_cache_lock); + CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second); + } + return result_or; +} + +StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( + const HloCustomCallInstruction* instr) { + XLA_SCOPED_LOGGING_TIMER( + absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithmImpl for ", + instr->ToString())); + + const Shape& result_shape = instr->shape().tuple_shapes(0); + // Make sure any previous activity on this executor is done. We don't want to // interfere with programs that are still running on the GPU. if (!stream_exec_->SynchronizeAllActivity()) { @@ -543,6 +602,12 @@ StatusOr<bool> CudnnConvAlgorithmPicker::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); changed |= result; } + + { + tensorflow::mutex_lock lock(autotune_cache_lock); + autotune_cache_stats.LogStats(); + } + + return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 6ab9c7a9ece..ee3a5fee870 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/kernels/conv_ops_fused_impl.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/protobuf/autotuning.pb.h" @@ -52,6 +53,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass { StatusOr<bool> RunOnInstruction(HloInstruction* instr); StatusOr<AutotuneResult> PickBestAlgorithm( const HloCustomCallInstruction* instr); + StatusOr<AutotuneResult> PickBestAlgorithmNoCache( + const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null DeviceMemoryAllocator* allocator_; // may be null diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index b628f27f4b2..cd0198e2cb9 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -364,7 +364,6 @@ StatusOr<CudnnConvParams> GetCudnnConvParams( params.output_buf = operand_buffers[1]; break; case CudnnConvKind::kForwardActivation: { - params.kind = CudnnConvKind::kForwardActivation; params.input_shape = &lhs_shape; params.filter_shape = &rhs_shape; params.output_shape = &conv_result_shape; diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h index 0b8530dd929..a31bf0f8683 100644 --- a/tensorflow/compiler/xla/shape.h +++ b/tensorflow/compiler/xla/shape.h @@ -200,6 +200,12 @@ class Shape { bool operator==(const Shape& other) const { return Equal()(*this, other); } bool operator!=(const Shape& other) const { return !(*this == other); } + template <typename H> + friend H AbslHashValue(H h, const Shape& s) { + return H::combine(std::move(h), s.element_type_, s.dimensions_, + s.dynamic_dimensions_, s.tuple_shapes_, s.layout_); + } + private: // The element type of this shape (tuple, array, etc). 
PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID; diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc index dbdafcc0a1f..aa6c7d10989 100644 --- a/tensorflow/compiler/xla/shape_test.cc +++ b/tensorflow/compiler/xla/shape_test.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/shape.h" #include <numeric> + +#include "absl/hash/hash_testing.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -210,5 +212,11 @@ TEST_F(ShapeTest, ProgramShapeToString) { prog.ToString()); } +TEST_F(ShapeTest, SupportsAbslHash) { + EXPECT_TRUE(absl::VerifyTypeImplementsAbslHashCorrectly( + {opaque_, token_, scalar_, scalar_with_tile_, matrix_, matrix2_, tuple_, + nested_tuple_, dyanmic_matrix_})); +} + } // namespace } // namespace xla