Addressing review feedback

jerryyin 2019-09-10 16:37:16 +00:00
parent b21f969731
commit 98e4579b39
2 changed files with 35 additions and 35 deletions

View File

@@ -20,7 +20,6 @@ limitations under the License.
#include "absl/strings/str_format.h"
#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "google/protobuf/any.pb.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
@@ -306,9 +305,9 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
// have diverged. Specifically, we need to make sure redzone allocator related
// utilities are not used in ROCm routine
if (stream_exec_->platform_kind() == se::PlatformKind::kROCm) {
-result_or = PickBestAlgorithmNoCacheRocm(*instr, allocator, stream);
+result_or = PickBestAlgorithmNoCacheRocm(instr, allocator, stream);
} else if (stream_exec_->platform_kind() == se::PlatformKind::kCuda) {
-result_or = PickBestAlgorithmNoCacheCuda(*instr, allocator, stream);
+result_or = PickBestAlgorithmNoCacheCuda(instr, allocator, stream);
}
if (result_or.ok()) {
@@ -320,13 +319,13 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
const HloCustomCallInstruction& instr, se::DeviceMemoryAllocator* allocator,
const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
se::Stream* stream) {
// Right now Redzone allocator is available in Cuda target only
XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
"GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr.ToString()));
"GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));
const Shape& result_shape = instr.shape().tuple_shapes(0);
const Shape& result_shape = instr->shape().tuple_shapes(0);
const auto device_ordinal = stream_exec_->device_ordinal();
int64 rng_state = 0;
@@ -337,13 +336,13 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer);
};
const HloModuleConfig& hlo_module_config = instr.GetModule()->config();
const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
// Allocate space for the input, filter, and output of the convolution.
se::RedzoneAllocator input_output_allocator(
stream, allocator, PtxOptsFromConfig(hlo_module_config));
std::vector<se::DeviceMemoryBase> operand_buffers;
-for (const auto* operand : instr.operands()) {
+for (const auto* operand : instr->operands()) {
TF_ASSIGN_OR_RETURN(auto buffer,
input_output_allocator.AllocateBytes(
ShapeUtil::ByteSizeOf(operand->shape())));
@@ -356,7 +355,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
initialize_buffer(result_buffer, result_shape);
TF_ASSIGN_OR_RETURN(auto backend_config,
-instr.backend_config<CudnnConvBackendConfig>());
+instr->backend_config<CudnnConvBackendConfig>());
optional<BufferComparator> comparator;
// Use the first algorithm that's supported as reference. There isn't a
@@ -365,17 +364,17 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
se::DeviceMemoryBase reference_result_buffer;
AlgorithmDesc first_algorithm;
TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(&instr));
TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
std::vector<AutotuneResult> profile_results;
const DebugOptions& debug_options =
-instr.GetModule()->config().debug_options();
+instr->GetModule()->config().debug_options();
const bool crash_on_checking_failure =
debug_options.xla_gpu_crash_on_verification_failures();
const auto canonical_hlo =
std::get<1>(AutotuneCacheKeyfromInstruction(&instr, stream_exec_));
std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_));
string blas_version;
if (auto* blas = stream_exec_->AsBlas()) {
@@ -395,7 +394,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
if (absl::c_linear_search(blacklisted_algos, alg)) {
LOG(INFO) << "Omitted potentially buggy algorithm "
<< AlgorithmToString(alg) << " for conv " << instr.ToString();
<< AlgorithmToString(alg) << " for conv " << instr->ToString();
continue;
}
@@ -403,7 +402,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
stream, allocator, PtxOptsFromConfig(hlo_module_config));
se::dnn::ProfileResult profile_result;
VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
<< instr.ToString();
<< instr->ToString();
// Use assignment instead of brace-list to make GCC 4.9 happy.
RunConvOptions options;
@@ -435,11 +434,11 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
// Check for writes to redzones.
TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear,
CheckRedzones(input_output_allocator, stream,
"input/output", &instr, &result));
"input/output", instr, &result));
TF_ASSIGN_OR_RETURN(
bool scratch_allocator_redzone_clear,
-CheckRedzones(scratch_allocator, stream, "scratch", &instr, &result));
+CheckRedzones(scratch_allocator, stream, "scratch", instr, &result));
if (!input_output_allocator_redzone_clear ||
!scratch_allocator_redzone_clear) {
@@ -470,7 +469,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
if (!compare_result.ok()) {
LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
<< " against " << AlgorithmToString(alg) << " for "
<< instr.ToString() << ": " << compare_result.status();
<< instr->ToString() << ": " << compare_result.status();
if (compare_result.status().code() ==
tensorflow::error::RESOURCE_EXHAUSTED) {
// Possibly OOM. Propagate the error.
@@ -481,11 +480,12 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
LOG(ERROR)
<< "Results mismatch between different convolution algorithms. "
"This is likely a bug/unexpected loss of precision in cudnn.\n"
<< instr.ToString() << " for " << AlgorithmToString(first_algorithm)
<< " vs " << AlgorithmToString(alg);
<< instr->ToString() << " for "
<< AlgorithmToString(first_algorithm) << " vs "
<< AlgorithmToString(alg);
PrintPlatformInfo(stream);
VLOG(1) << "Full module on failure: \n"
<< instr.GetModule()->ToString();
<< instr->GetModule()->ToString();
auto* fail = result.mutable_failure();
fail->set_kind(AutotuneResult::WRONG_RESULT);
fail->set_buffer_address(
@@ -512,11 +512,11 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
tensorflow::AutotuningLog log;
{
ConvInstructionLog instr_log;
-*instr_log.mutable_instruction() = instr.ToProto();
-for (int i = 0; i < instr.operand_count(); i++) {
-*instr_log.add_operand_shapes() = instr.operand(i)->shape().ToProto();
+*instr_log.mutable_instruction() = instr->ToProto();
+for (int i = 0; i < instr->operand_count(); i++) {
+*instr_log.add_operand_shapes() = instr->operand(i)->shape().ToProto();
instr_log.add_operand_addresses(
reinterpret_cast<uint64>((operand_buffers)[i].opaque()));
reinterpret_cast<uint64>(operand_buffers[i].opaque()));
}
instr_log.set_result_address(
reinterpret_cast<uint64>(result_buffer.opaque()));
@@ -582,15 +582,15 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
return InternalError(
"All algorithms tried for convolution %s failed. Falling back to "
"default algorithm.",
-instr.ToString());
+instr->ToString());
}
StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
const HloCustomCallInstruction& instr, se::DeviceMemoryAllocator* allocator,
const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
se::Stream* stream) {
XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
"GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr.ToString()));
"GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));
const auto device_ordinal = stream_exec_->device_ordinal();
std::vector<se::DeviceMemoryBase> operand_buffers;
@@ -607,7 +607,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
// Allocate space for the input, filter, and output of the convolution. We
// use a ScratchAllocator for this instead of calling allocator_ directly so
// that our allocations don't leak.
-for (const auto* operand : instr.operands()) {
+for (const auto* operand : instr->operands()) {
TF_ASSIGN_OR_RETURN(auto buffer,
input_output_allocator.AllocateBytes(
ShapeUtil::ByteSizeOf(operand->shape())));
@@ -618,12 +618,12 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
TF_ASSIGN_OR_RETURN(
auto result_buffer,
input_output_allocator.AllocateBytes(
-ShapeUtil::ByteSizeOf(instr.shape().tuple_shapes(0))));
+ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
initialize_buffer(result_buffer);
ScratchAllocator scratch_allocator(device_ordinal, allocator);
se::dnn::ProfileResult profile_result;
VLOG(3) << "Auto-tuning for " << instr.ToString();
VLOG(3) << "Auto-tuning for " << instr->ToString();
RunConvOptions options;
options.profile_result = &profile_result;
@@ -632,7 +632,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
options.algo_override = se::dnn::AlgorithmDesc();
bool launch_ok =
RunCudnnConv(&instr, absl::MakeSpan(operand_buffers), result_buffer,
RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
&scratch_allocator, stream, options)
.ok();
@@ -653,7 +653,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
return InternalError(
"All algorithms tried for convolution %s failed. Falling back to "
"default algorithm.",
-instr.ToString());
+instr->ToString());
}
StatusOr<bool> GpuConvAlgorithmPicker::RunOnInstruction(HloInstruction* instr) {

View File

@@ -54,11 +54,11 @@ class GpuConvAlgorithmPicker : public HloModulePass {
const HloCustomCallInstruction* instr);
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheCuda(
const HloCustomCallInstruction& instr,
const HloCustomCallInstruction* instr,
se::DeviceMemoryAllocator* allocator, se::Stream* stream);
StatusOr<tensorflow::AutotuneResult> PickBestAlgorithmNoCacheRocm(
const HloCustomCallInstruction& instr,
const HloCustomCallInstruction* instr,
se::DeviceMemoryAllocator* allocator, se::Stream* stream);
se::StreamExecutor* stream_exec_; // never null