Add a process-global auto-tuning "database" to the XLA convolution algorithm picker.
As a side effect, this adds hashing support to Shape. PiperOrigin-RevId: 242538365
This commit is contained in:
parent
9bdf3e4018
commit
401ab92175
@ -287,6 +287,7 @@ tf_cc_test(
|
|||||||
":xla_data_proto",
|
":xla_data_proto",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:test_main",
|
"//tensorflow/core:test_main",
|
||||||
|
"@com_google_absl//absl/hash:hash_testing",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -69,6 +69,11 @@ class Tile {
|
|||||||
// combined with the next minor dimension before tiling is applied.
|
// combined with the next minor dimension before tiling is applied.
|
||||||
static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min();
|
static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min();
|
||||||
|
|
||||||
|
// absl::Hash extension point for Tile: folds the tile's `dimensions_` (the
// bounds of the tile) into the hash state `h` and returns the updated state.
template <typename H>
|
||||||
|
friend H AbslHashValue(H h, const Tile& t) {
|
||||||
|
return H::combine(std::move(h), t.dimensions_);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The bounds of the tile.
|
// The bounds of the tile.
|
||||||
std::vector<int64> dimensions_;
|
std::vector<int64> dimensions_;
|
||||||
@ -212,6 +217,13 @@ class Layout {
|
|||||||
element_size_in_bits_ = 0;
|
element_size_in_bits_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// absl::Hash extension point for Layout: combines format_, minor_to_major_,
// max_sparse_elements_, tiles_ and element_size_in_bits_ into the hash state
// `h` and returns the updated state.
// NOTE(review): the hashed field set should stay in step with whatever
// Layout's equality compares; that comparison is not visible here -- confirm.
template <typename H>
|
||||||
|
friend H AbslHashValue(H h, const Layout& l) {
|
||||||
|
return H::combine(std::move(h), l.format_, l.minor_to_major_,
|
||||||
|
l.max_sparse_elements_, l.tiles_,
|
||||||
|
l.element_size_in_bits_);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The format of this layout.
|
// The format of this layout.
|
||||||
Format format_ = INVALID_FORMAT;
|
Format format_ = INVALID_FORMAT;
|
||||||
|
@ -468,6 +468,7 @@ cc_library(
|
|||||||
":ir_emission_utils",
|
":ir_emission_utils",
|
||||||
":redzone_allocator",
|
":redzone_allocator",
|
||||||
"//tensorflow/compiler/xla:literal_util",
|
"//tensorflow/compiler/xla:literal_util",
|
||||||
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:compiler",
|
"//tensorflow/compiler/xla/service:compiler",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
||||||
@ -478,7 +479,9 @@ cc_library(
|
|||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:logger",
|
"//tensorflow/core:logger",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
|
"//tensorflow/core/kernels:conv_ops",
|
||||||
"//tensorflow/core/util/proto:proto_utils",
|
"//tensorflow/core/util/proto:proto_utils",
|
||||||
|
"@com_google_absl//absl/algorithm:container",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
"@com_google_absl//absl/strings:str_format",
|
"@com_google_absl//absl/strings:str_format",
|
||||||
"@com_google_absl//absl/time",
|
"@com_google_absl//absl/time",
|
||||||
|
@ -14,7 +14,9 @@ limitations under the License.
|
|||||||
==============================================================================*/
|
==============================================================================*/
|
||||||
|
|
||||||
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
|
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
|
||||||
|
|
||||||
#include "google/protobuf/any.pb.h"
|
#include "google/protobuf/any.pb.h"
|
||||||
|
#include "absl/algorithm/container.h"
|
||||||
#include "absl/strings/str_cat.h"
|
#include "absl/strings/str_cat.h"
|
||||||
#include "absl/strings/str_format.h"
|
#include "absl/strings/str_format.h"
|
||||||
#include "absl/time/time.h"
|
#include "absl/time/time.h"
|
||||||
@ -27,6 +29,8 @@ limitations under the License.
|
|||||||
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
|
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
|
||||||
#include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"
|
#include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
|
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
|
||||||
|
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
|
||||||
|
#include "tensorflow/compiler/xla/status_macros.h"
|
||||||
#include "tensorflow/compiler/xla/util.h"
|
#include "tensorflow/compiler/xla/util.h"
|
||||||
#include "tensorflow/core/lib/strings/numbers.h"
|
#include "tensorflow/core/lib/strings/numbers.h"
|
||||||
#include "tensorflow/core/platform/logger.h"
|
#include "tensorflow/core/platform/logger.h"
|
||||||
@ -179,33 +183,88 @@ bool CheckRedzones(const RedzoneAllocator& allocator, se::Stream* stream,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Key type of the autotune cache. As filled in by
// AutotuneCacheKeyfromInstruction, the tuple fields are, in order:
//   0. the StreamExecutor the convolution is tuned on,
//   1. the serialized CudnnConvBackendConfig,
//   2. the custom-call target string,
//   3. the shape of the custom call,
//   4. the shapes of its operands,
//   5. the serialized window,
//   6. the serialized convolution dimension numbers,
//   7. the feature group count.
using ConvCacheKey =
|
||||||
|
std::tuple<se::StreamExecutor*, std::string, std::string, Shape,
|
||||||
|
std::vector<Shape>, std::string, std::string, int64>;
|
||||||
|
|
||||||
|
// Hit/miss counters for the autotune cache. The struct does no locking of its
// own; the visible users access it while holding autotune_cache_lock.
struct ConvCacheStats {
|
||||||
|
// Number of lookups that found an existing cache entry.
int64 cache_hits = 0;
|
||||||
|
// Number of lookups that did not find an entry.
int64 cache_misses = 0;
|
||||||
|
|
||||||
|
// Writes both counters to the log at VLOG level 1.
void LogStats() {
|
||||||
|
VLOG(1) << "Cache hits: " << cache_hits;
|
||||||
|
VLOG(1) << "Cache misses: " << cache_misses;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Builds the autotune cache key for the convolution custom call `conv` when it
// is tuned on the stream executor `se`.
//
// The key is made of `se`, the custom-call target, the call's own shape, the
// shapes of all operands, the feature group count, and the serialized bytes of
// the backend config, window and convolution dimension numbers (the protos
// are stored via SerializeAsString()).
//
// Returns the key, or the error produced by
// conv->backend_config<CudnnConvBackendConfig>() if that call fails.
StatusOr<ConvCacheKey> AutotuneCacheKeyfromInstruction(
|
||||||
|
const HloCustomCallInstruction* conv, se::StreamExecutor* se) {
|
||||||
|
TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
|
||||||
|
conv->backend_config<CudnnConvBackendConfig>());
|
||||||
|
// Collect the shape of every operand, in operand order.
std::vector<Shape> operand_shapes;
|
||||||
|
absl::c_transform(conv->operands(), std::back_inserter(operand_shapes),
|
||||||
|
[&](const HloInstruction* op) { return op->shape(); });
|
||||||
|
|
||||||
|
// Field order here must match the ConvCacheKey tuple declaration.
return std::make_tuple(
|
||||||
|
se, backend_config.SerializeAsString(), conv->custom_call_target(),
|
||||||
|
conv->shape(), std::move(operand_shapes),
|
||||||
|
conv->window().SerializeAsString(),
|
||||||
|
conv->convolution_dimension_numbers().SerializeAsString(),
|
||||||
|
conv->feature_group_count());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mutex that both cache globals below are annotated as GUARDED_BY.
tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED);
|
||||||
|
// Map from ConvCacheKey to the AutotuneResult stored for that key. The map is
// allocated with `new` and bound to a reference; no code visible here deletes
// it. NOTE(review): presumably leaked on purpose so it lives for the whole
// process -- confirm.
auto& autotune_cache GUARDED_BY(autotune_cache_lock) =
|
||||||
|
*new absl::flat_hash_map<ConvCacheKey, AutotuneResult>();
|
||||||
|
// Hit/miss counters for autotune_cache, allocated the same way.
auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) =
|
||||||
|
*new ConvCacheStats();
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
// We could have caching here so that we don't redo this work for two identical
|
|
||||||
// convolutions. Unfortunately our cache key would have to be a tuple
|
|
||||||
// containing the protos passed to this function, and we have no utility for
|
|
||||||
// hashing protos. We could write our own hash functions, but they'd silently
|
|
||||||
// break if we ever added a field to one of the protos. Perhaps we could hack
|
|
||||||
// using the binary-encoded proto as the hash key, on the assumption that two
|
|
||||||
// protos being binary-equal is a sufficient, if not necessary, condition for
|
|
||||||
// proper equality. But that would still leave us open to having unnecessary
|
|
||||||
// cache misses and doing extra work. Overall, caching doesn't seem worth the
|
|
||||||
// trouble, but we may want to revisit this if we ever find a model where
|
|
||||||
// caching would speed up compilation a lot.
|
|
||||||
StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
|
StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
|
||||||
const HloCustomCallInstruction* instr) {
|
const HloCustomCallInstruction* instr) {
|
||||||
XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
|
|
||||||
"CudnnConvAlgorithmPicker::PickBestAlgorithm for ", instr->ToString()));
|
|
||||||
|
|
||||||
const Shape& result_shape = instr->shape().tuple_shapes(0);
|
|
||||||
|
|
||||||
// Don't run this function concurrently on the same GPU.
|
// Don't run this function concurrently on the same GPU.
|
||||||
//
|
//
|
||||||
// This is a bit of a hack and doesn't protect us against arbitrary concurrent
|
// This is a bit of a hack and doesn't protect us against arbitrary concurrent
|
||||||
// use of a GPU, but it's sufficient to let us compile two HLO modules
|
// use of a GPU, but it's sufficient to let us compile two HLO modules
|
||||||
// concurrently and then run them sequentially.
|
// concurrently and then run them sequentially.
|
||||||
|
//
|
||||||
|
// Putting the lock in here rather than in PickBestAlgorithmNoCache lets us
|
||||||
|
// avoid ever doing duplicate work. If we have a cache miss, only one thread
|
||||||
|
// will run PickBestAlgorithmNoCache for a particular device.
|
||||||
tensorflow::mutex_lock lock = LockGpu(stream_exec_);
|
tensorflow::mutex_lock lock = LockGpu(stream_exec_);
|
||||||
|
|
||||||
|
// We cache the autotuning results to avoid doing duplicate work,
|
||||||
|
// which can greatly improve both stability (deterministic numeric results
|
||||||
|
// within a process for a given input) and performance (2x speedup on some
|
||||||
|
// models).
|
||||||
|
TF_ASSIGN_OR_RETURN(ConvCacheKey key,
|
||||||
|
AutotuneCacheKeyfromInstruction(instr, stream_exec_));
|
||||||
|
{
|
||||||
|
tensorflow::mutex_lock lock(autotune_cache_lock);
|
||||||
|
auto it = autotune_cache.find(key);
|
||||||
|
if (it != autotune_cache.end()) {
|
||||||
|
autotune_cache_stats.cache_hits++;
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
autotune_cache_stats.cache_misses++;
|
||||||
|
}
|
||||||
|
|
||||||
|
StatusOr<AutotuneResult> result_or = PickBestAlgorithmNoCache(instr);
|
||||||
|
if (result_or.ok()) {
|
||||||
|
tensorflow::mutex_lock lock(autotune_cache_lock);
|
||||||
|
CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second);
|
||||||
|
}
|
||||||
|
return result_or;
|
||||||
|
}
|
||||||
|
|
||||||
|
StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache(
|
||||||
|
const HloCustomCallInstruction* instr) {
|
||||||
|
XLA_SCOPED_LOGGING_TIMER(
|
||||||
|
absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithmImpl for ",
|
||||||
|
instr->ToString()));
|
||||||
|
|
||||||
|
const Shape& result_shape = instr->shape().tuple_shapes(0);
|
||||||
|
|
||||||
// Make sure any previous activity on this executor is done. We don't want to
|
// Make sure any previous activity on this executor is done. We don't want to
|
||||||
// interfere with programs that are still running on the GPU.
|
// interfere with programs that are still running on the GPU.
|
||||||
if (!stream_exec_->SynchronizeAllActivity()) {
|
if (!stream_exec_->SynchronizeAllActivity()) {
|
||||||
@ -543,6 +602,12 @@ StatusOr<bool> CudnnConvAlgorithmPicker::Run(HloModule* module) {
|
|||||||
TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
|
TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
|
||||||
changed |= result;
|
changed |= result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
tensorflow::mutex_lock lock(autotune_cache_lock);
|
||||||
|
autotune_cache_stats.LogStats();
|
||||||
|
}
|
||||||
|
|
||||||
return changed;
|
return changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
|
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_module.h"
|
#include "tensorflow/compiler/xla/service/hlo_module.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
|
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
|
||||||
|
#include "tensorflow/core/kernels/conv_ops_fused_impl.h"
|
||||||
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
||||||
#include "tensorflow/core/protobuf/autotuning.pb.h"
|
#include "tensorflow/core/protobuf/autotuning.pb.h"
|
||||||
|
|
||||||
@ -52,6 +53,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
|
|||||||
StatusOr<bool> RunOnInstruction(HloInstruction* instr);
|
StatusOr<bool> RunOnInstruction(HloInstruction* instr);
|
||||||
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
|
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
|
||||||
const HloCustomCallInstruction* instr);
|
const HloCustomCallInstruction* instr);
|
||||||
|
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCache(
|
||||||
|
const HloCustomCallInstruction* instr);
|
||||||
|
|
||||||
se::StreamExecutor* stream_exec_; // never null
|
se::StreamExecutor* stream_exec_; // never null
|
||||||
DeviceMemoryAllocator* allocator_; // may be null
|
DeviceMemoryAllocator* allocator_; // may be null
|
||||||
|
@ -364,7 +364,6 @@ StatusOr<CudnnConvParams> GetCudnnConvParams(
|
|||||||
params.output_buf = operand_buffers[1];
|
params.output_buf = operand_buffers[1];
|
||||||
break;
|
break;
|
||||||
case CudnnConvKind::kForwardActivation: {
|
case CudnnConvKind::kForwardActivation: {
|
||||||
params.kind = CudnnConvKind::kForwardActivation;
|
|
||||||
params.input_shape = &lhs_shape;
|
params.input_shape = &lhs_shape;
|
||||||
params.filter_shape = &rhs_shape;
|
params.filter_shape = &rhs_shape;
|
||||||
params.output_shape = &conv_result_shape;
|
params.output_shape = &conv_result_shape;
|
||||||
|
@ -200,6 +200,12 @@ class Shape {
|
|||||||
bool operator==(const Shape& other) const { return Equal()(*this, other); }
|
bool operator==(const Shape& other) const { return Equal()(*this, other); }
|
||||||
bool operator!=(const Shape& other) const { return !(*this == other); }
|
bool operator!=(const Shape& other) const { return !(*this == other); }
|
||||||
|
|
||||||
|
// absl::Hash extension point for Shape: combines element_type_, dimensions_,
// dynamic_dimensions_, tuple_shapes_ and layout_ into the hash state `h` and
// returns the updated state. Lets Shape be used as (part of) a key in
// absl hash containers.
// NOTE(review): operator== delegates to Equal(), whose compared fields are not
// visible here; equal shapes must hash equally, so confirm the two agree.
template <typename H>
|
||||||
|
friend H AbslHashValue(H h, const Shape& s) {
|
||||||
|
return H::combine(std::move(h), s.element_type_, s.dimensions_,
|
||||||
|
s.dynamic_dimensions_, s.tuple_shapes_, s.layout_);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The element type of this shape (tuple, array, etc).
|
// The element type of this shape (tuple, array, etc).
|
||||||
PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
|
PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
|
||||||
|
@ -16,6 +16,8 @@ limitations under the License.
|
|||||||
#include "tensorflow/compiler/xla/shape.h"
|
#include "tensorflow/compiler/xla/shape.h"
|
||||||
|
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
|
|
||||||
|
#include "absl/hash/hash_testing.h"
|
||||||
#include "absl/strings/str_cat.h"
|
#include "absl/strings/str_cat.h"
|
||||||
#include "absl/strings/str_join.h"
|
#include "absl/strings/str_join.h"
|
||||||
#include "tensorflow/compiler/xla/layout_util.h"
|
#include "tensorflow/compiler/xla/layout_util.h"
|
||||||
@ -210,5 +212,11 @@ TEST_F(ShapeTest, ProgramShapeToString) {
|
|||||||
prog.ToString());
|
prog.ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Runs absl::VerifyTypeImplementsAbslHashCorrectly over the fixture shapes
// listed below and expects it to report success.
// NOTE(review): `dyanmic_matrix_` looks like a misspelling of "dynamic"; the
// fixture member is declared outside this view, so the name is left as is.
TEST_F(ShapeTest, SupportsAbslHash) {
|
||||||
|
EXPECT_TRUE(absl::VerifyTypeImplementsAbslHashCorrectly(
|
||||||
|
{opaque_, token_, scalar_, scalar_with_tile_, matrix_, matrix2_, tuple_,
|
||||||
|
nested_tuple_, dyanmic_matrix_}));
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace xla
|
} // namespace xla
|
||||||
|
Loading…
Reference in New Issue
Block a user