[XLA:GPU] Add --xla_gpu_deterministic_ops to force run-to-run determinism.

Fail if there is no deterministic lowering for HLO ops in the module. The env variable `TF_DETERMINISTIC_OPS` is related, but serves a somewhat different use case.
First, it is best-effort in the sense that deterministic lowerings will be preferred if they exist. However, Scatter with non-unique indexes and SelectAndScatter do not have a deterministic implementation at present, and this is silently ignored. In contrast, --xla_gpu_deterministic_ops will cleanly fail compilation for these ops.
Second, the `TF_DETERMINISTIC_OPS` env variable is not convenient for compiler frontends other than TensorFlow.

PiperOrigin-RevId: 347672477
Change-Id: I62c3093784560394b0aa173747afd350a299ed64
This commit is contained in:
Thomas Joerg 2020-12-15 12:45:39 -08:00 committed by TensorFlower Gardener
parent 5f08955b75
commit bb4a8a8b49
6 changed files with 37 additions and 3 deletions

View File

@ -600,6 +600,11 @@ static void AllocateFlags() {
flag_values->xla_gpu_force_compilation_parallelism(),
"Overrides normal multi-threaded compilation settting to use this many "
"threads. Setting to 0 (the default value) means no enforcement."));
flag_objects->push_back(tensorflow::Flag(
"xla_gpu_deterministic_ops",
bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_ops),
flag_values->xla_gpu_deterministic_ops(),
"Guarantees run-to-run determinism on GPU."));
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
}

View File

@ -421,7 +421,8 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
if (RequireDeterminism() ||
hlo_module->config().debug_options().xla_gpu_deterministic_reductions()) {
hlo_module->config().debug_options().xla_gpu_deterministic_reductions() ||
hlo_module->config().debug_options().xla_gpu_deterministic_ops()) {
pipeline.AddPass<HloPassFix<GpuTreeReductionRewriter>>();
}

View File

@ -636,7 +636,8 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
}
auto selected_result = filtered_results.begin();
if (!RequireCudnnDeterminism()) {
if (!RequireCudnnDeterminism() &&
!hlo_module_config.debug_options().xla_gpu_deterministic_ops()) {
selected_result = absl::c_min_element(
filtered_results,
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {

View File

@ -110,6 +110,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#if GOOGLE_CUDA
@ -1636,6 +1637,17 @@ Status IrEmitterUnnested::HandleGetTupleElement(HloInstruction*) {
return Status::OK();
}
// Returns OK unless --xla_gpu_deterministic_ops is set, in which case an
// Unimplemented error naming `op_name` is returned. Callers invoke this for
// ops that only have nondeterministic lowerings, so that compilation fails
// cleanly instead of silently producing run-to-run-varying results.
Status IrEmitterUnnested::AssertNonDeterminismIsOkay(const string& op_name) {
  const bool determinism_required =
      hlo_module_config_.debug_options().xla_gpu_deterministic_ops();
  if (!determinism_required) {
    // Nondeterministic lowerings are acceptable.
    return Status::OK();
  }
  return Unimplemented(
      "HLO instruction %s does not have a deterministic implementation, "
      "but run-to-run determinism is required by "
      "--xla_gpu_deterministic_ops.",
      op_name);
}
Status IrEmitterUnnested::HandleSelectAndScatter(
HloInstruction* select_and_scatter) {
const Window& window = select_and_scatter->window();
@ -1651,6 +1663,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
"Dilation for SelectAndScatter not implemented on GPU.");
}
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(select_and_scatter->name()));
TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(select_and_scatter));
return EmitSelectAndScatterFromMlir(input);
}
@ -1936,6 +1950,9 @@ Status IrEmitterUnnested::HandleRngGetAndUpdateState(
}
// Emits a Scatter op via the MLIR path. Scatter with potentially duplicate
// indices has no deterministic XLA:GPU implementation (see
// --xla_gpu_deterministic_ops), so in that case we first check that
// nondeterminism is acceptable.
Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
  const bool indices_unique = scatter->unique_indices();
  if (!indices_unique) {
    // Non-unique indices => nondeterministic lowering; fail if determinism
    // was requested.
    TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(scatter->name()));
  }
  TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(scatter));
  return EmitScatterFromMlir(input);
}
@ -2037,6 +2054,9 @@ Status IrEmitterUnnested::EmitScatter(
Status IrEmitterUnnested::EmitScatter(const ScatterDescriptor& desc,
Thunk* thunk) {
if (!desc.unique_indices) {
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(desc.name));
}
auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
std::vector<llvm::Value*> raw_window_multidim;
std::vector<llvm::Value*> input_scatter_multidim;

View File

@ -708,6 +708,8 @@ class IrEmitterUnnested : public IrEmitter,
Thunk::ThunkInfo GetThunkInfo(const HloInstruction* hlo) const override;
Status AssertNonDeterminismIsOkay(const string& op_name);
// The thunk sequence this IrEmitter generates for the input computation.
ThunkSequence thunk_sequence_;

View File

@ -306,7 +306,12 @@ message DebugOptions {
// threads. Setting to 0 (the default value) means no enforcement.
int32 xla_gpu_force_compilation_parallelism = 147;
// Next id: 148
// Guarantees run-to-run determinism. At present, the HLO ops Scatter and
// SelectAndScatter do not have deterministic XLA:GPU implementations.
// Compilation errors out if these ops are encountered.
bool xla_gpu_deterministic_ops = 148;
// Next id: 149
// Extra options to pass to the compilation backend (e.g. LLVM); specific
// interpretation of these values is left to the backend.