Addressing review feedback
parent b21f969731
commit 98e4579b39
@@ -20,7 +20,6 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/time/time.h"
 #include "absl/types/optional.h"
-#include "google/protobuf/any.pb.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
@@ -306,9 +305,9 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
   // have diverged. Secifically, we need to make sure redzone allocator related
   // utilities are not used in ROCm routine
   if (stream_exec_->platform_kind() == se::PlatformKind::kROCm) {
-    result_or = PickBestAlgorithmNoCacheRocm(*instr, allocator, stream);
+    result_or = PickBestAlgorithmNoCacheRocm(instr, allocator, stream);
   } else if (stream_exec_->platform_kind() == se::PlatformKind::kCuda) {
-    result_or = PickBestAlgorithmNoCacheCuda(*instr, allocator, stream);
+    result_or = PickBestAlgorithmNoCacheCuda(instr, allocator, stream);
   }

   if (result_or.ok()) {
@@ -320,13 +319,13 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(

 StatusOr<tensorflow::AutotuneResult>
 GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
-    const HloCustomCallInstruction& instr, se::DeviceMemoryAllocator* allocator,
+    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
     se::Stream* stream) {
   // Right now Redzone allocator is available in Cuda target only
   XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
-      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr.ToString()));
+      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

-  const Shape& result_shape = instr.shape().tuple_shapes(0);
+  const Shape& result_shape = instr->shape().tuple_shapes(0);
   const auto device_ordinal = stream_exec_->device_ordinal();

   int64 rng_state = 0;
@@ -337,13 +336,13 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
     InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer);
   };

-  const HloModuleConfig& hlo_module_config = instr.GetModule()->config();
+  const HloModuleConfig& hlo_module_config = instr->GetModule()->config();

   // Allocate space for the input, filter, and output of the convolution.
   se::RedzoneAllocator input_output_allocator(
       stream, allocator, PtxOptsFromConfig(hlo_module_config));
   std::vector<se::DeviceMemoryBase> operand_buffers;
-  for (const auto* operand : instr.operands()) {
+  for (const auto* operand : instr->operands()) {
     TF_ASSIGN_OR_RETURN(auto buffer,
                         input_output_allocator.AllocateBytes(
                             ShapeUtil::ByteSizeOf(operand->shape())));
@@ -356,7 +355,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
   initialize_buffer(result_buffer, result_shape);

   TF_ASSIGN_OR_RETURN(auto backend_config,
-                      instr.backend_config<CudnnConvBackendConfig>());
+                      instr->backend_config<CudnnConvBackendConfig>());

   optional<BufferComparator> comparator;
   // Use the first algorithm that's supported as reference. There isn't a
@@ -365,17 +364,17 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
   se::DeviceMemoryBase reference_result_buffer;
   AlgorithmDesc first_algorithm;

-  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(&instr));
+  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
   std::vector<AutotuneResult> profile_results;

   const DebugOptions& debug_options =
-      instr.GetModule()->config().debug_options();
+      instr->GetModule()->config().debug_options();

   const bool crash_on_checking_failure =
       debug_options.xla_gpu_crash_on_verification_failures();

   const auto canonical_hlo =
-      std::get<1>(AutotuneCacheKeyfromInstruction(&instr, stream_exec_));
+      std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_));

   string blas_version;
   if (auto* blas = stream_exec_->AsBlas()) {
@@ -395,7 +394,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(

     if (absl::c_linear_search(blacklisted_algos, alg)) {
       LOG(INFO) << "Omitted potentially buggy algorithm "
-                << AlgorithmToString(alg) << " for conv " << instr.ToString();
+                << AlgorithmToString(alg) << " for conv " << instr->ToString();
       continue;
     }

@@ -403,7 +402,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
         stream, allocator, PtxOptsFromConfig(hlo_module_config));
     se::dnn::ProfileResult profile_result;
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
-            << instr.ToString();
+            << instr->ToString();

     // Use assignment instead of brace-list to make GCC 4.9 happy.
     RunConvOptions options;
@@ -435,11 +434,11 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
     // Check for writes to redzones.
     TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear,
                         CheckRedzones(input_output_allocator, stream,
-                                      "input/output", &instr, &result));
+                                      "input/output", instr, &result));

     TF_ASSIGN_OR_RETURN(
         bool scratch_allocator_redzone_clear,
-        CheckRedzones(scratch_allocator, stream, "scratch", &instr, &result));
+        CheckRedzones(scratch_allocator, stream, "scratch", instr, &result));

     if (!input_output_allocator_redzone_clear ||
         !scratch_allocator_redzone_clear) {
@@ -470,7 +469,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
      if (!compare_result.ok()) {
        LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
                   << " against " << AlgorithmToString(alg) << " for "
-                  << instr.ToString() << ": " << compare_result.status();
+                  << instr->ToString() << ": " << compare_result.status();
        if (compare_result.status().code() ==
            tensorflow::error::RESOURCE_EXHAUSTED) {
          // Possibly OOM. Propatate the error.
@@ -481,11 +480,12 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
        LOG(ERROR)
            << "Results mismatch between different convolution algorithms. "
               "This is likely a bug/unexpected loss of precision in cudnn.\n"
-           << instr.ToString() << " for " << AlgorithmToString(first_algorithm)
-           << " vs " << AlgorithmToString(alg);
+           << instr->ToString() << " for "
+           << AlgorithmToString(first_algorithm) << " vs "
+           << AlgorithmToString(alg);
        PrintPlatformInfo(stream);
        VLOG(1) << "Full module on failure: \n"
-               << instr.GetModule()->ToString();
+               << instr->GetModule()->ToString();
        auto* fail = result.mutable_failure();
        fail->set_kind(AutotuneResult::WRONG_RESULT);
        fail->set_buffer_address(
@@ -512,11 +512,11 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
   tensorflow::AutotuningLog log;
   {
     ConvInstructionLog instr_log;
-    *instr_log.mutable_instruction() = instr.ToProto();
-    for (int i = 0; i < instr.operand_count(); i++) {
-      *instr_log.add_operand_shapes() = instr.operand(i)->shape().ToProto();
+    *instr_log.mutable_instruction() = instr->ToProto();
+    for (int i = 0; i < instr->operand_count(); i++) {
+      *instr_log.add_operand_shapes() = instr->operand(i)->shape().ToProto();
       instr_log.add_operand_addresses(
-          reinterpret_cast<uint64>((operand_buffers)[i].opaque()));
+          reinterpret_cast<uint64>(operand_buffers[i].opaque()));
     }
     instr_log.set_result_address(
         reinterpret_cast<uint64>(result_buffer.opaque()));
@@ -582,15 +582,15 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
   return InternalError(
       "All algorithms tried for convolution %s failed. Falling back to "
       "default algorithm.",
-      instr.ToString());
+      instr->ToString());
 }

 StatusOr<tensorflow::AutotuneResult>
 GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
-    const HloCustomCallInstruction& instr, se::DeviceMemoryAllocator* allocator,
+    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
     se::Stream* stream) {
   XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
-      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr.ToString()));
+      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

   const auto device_ordinal = stream_exec_->device_ordinal();
   std::vector<se::DeviceMemoryBase> operand_buffers;
@@ -607,7 +607,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
   // Allocate space for the input, filter, and output of the convolution. We
   // use a ScratchAllocator for this instead of calling allocator_ directly so
   // that our allocations don't leak.
-  for (const auto* operand : instr.operands()) {
+  for (const auto* operand : instr->operands()) {
     TF_ASSIGN_OR_RETURN(auto buffer,
                         input_output_allocator.AllocateBytes(
                             ShapeUtil::ByteSizeOf(operand->shape())));
@@ -618,12 +618,12 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
   TF_ASSIGN_OR_RETURN(
       auto result_buffer,
       input_output_allocator.AllocateBytes(
-          ShapeUtil::ByteSizeOf(instr.shape().tuple_shapes(0))));
+          ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
   initialize_buffer(result_buffer);

   ScratchAllocator scratch_allocator(device_ordinal, allocator);
   se::dnn::ProfileResult profile_result;
-  VLOG(3) << "Auto-tuning for " << instr.ToString();
+  VLOG(3) << "Auto-tuning for " << instr->ToString();
   RunConvOptions options;
   options.profile_result = &profile_result;

@@ -632,8 +632,8 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
   options.algo_override = se::dnn::AlgorithmDesc();

   bool launch_ok =
-      RunCudnnConv(&instr, absl::MakeSpan(operand_buffers), result_buffer,
+      RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
                    &scratch_allocator, stream, options)
           .ok();

   AutotuneResult best_result;
@@ -653,7 +653,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
   return InternalError(
       "All algorithms tried for convolution %s failed. Falling back to "
       "default algorithm.",
-      instr.ToString());
+      instr->ToString());
 }

 StatusOr<bool> GpuConvAlgorithmPicker::RunOnInstruction(HloInstruction* instr) {

@@ -54,11 +54,11 @@ class GpuConvAlgorithmPicker : public HloModulePass {
       const HloCustomCallInstruction* instr);

   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheCuda(
-      const HloCustomCallInstruction& instr,
+      const HloCustomCallInstruction* instr,
       se::DeviceMemoryAllocator* allocator, se::Stream* stream);

   StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheRocm(
-      const HloCustomCallInstruction& instr,
+      const HloCustomCallInstruction* instr,
       se::DeviceMemoryAllocator* allocator, se::Stream* stream);

   se::StreamExecutor* stream_exec_; // never null
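
For reference, a minimal standalone sketch (not part of this commit; the type and function names below are illustrative stand-ins, not the real XLA API) of the by-reference to by-pointer parameter change the diff applies throughout: call sites stop dereferencing with *instr, and member access switches from "." to "->".

#include <iostream>
#include <string>

// Illustrative stand-in for HloCustomCallInstruction; not the real XLA type.
struct Instruction {
  std::string ToString() const { return "custom-call.1"; }
};

// Old shape: take the instruction by const reference; members use '.'.
void PickByReference(const Instruction& instr) {
  std::cout << "picking for " << instr.ToString() << "\n";
}

// New shape: take the instruction by pointer, matching the sibling helpers
// that already accept a const pointer; members use '->'.
void PickByPointer(const Instruction* instr) {
  std::cout << "picking for " << instr->ToString() << "\n";
}

int main() {
  Instruction conv;
  const Instruction* instr = &conv;
  PickByReference(*instr);  // old call site: dereference before passing
  PickByPointer(instr);     // new call site: pass the pointer straight through
  return 0;
}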