[XLA:GPU] Add --xla_gpu_deterministic_ops to force run-to-run determinism.
Fail if there is no deterministic lowering for HLO ops in the module. The env variable `TF_DETERMINISTIC_OPS` is related, but serves a somewhat different use case. First, it is best-effort in the sense that deterministic lowerings will be preferred if they exist. However, Scatter with non-unique indices and SelectAndScatter do not have a deterministic implementation at present, a limitation that is silently ignored. In contrast, --xla_gpu_deterministic_ops will cleanly fail compilation for these ops. Second, the `TF_DETERMINISTIC_OPS` env variable is not convenient for compiler frontends other than TensorFlow. PiperOrigin-RevId: 347672477 Change-Id: I62c3093784560394b0aa173747afd350a299ed64
This commit is contained in:
parent
5f08955b75
commit
bb4a8a8b49
@ -600,6 +600,11 @@ static void AllocateFlags() {
|
||||
flag_values->xla_gpu_force_compilation_parallelism(),
|
||||
"Overrides normal multi-threaded compilation settting to use this many "
|
||||
"threads. Setting to 0 (the default value) means no enforcement."));
|
||||
flag_objects->push_back(tensorflow::Flag(
|
||||
"xla_gpu_deterministic_ops",
|
||||
bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_ops),
|
||||
flag_values->xla_gpu_deterministic_ops(),
|
||||
"Guarantees run-to-run determinism on GPU."));
|
||||
|
||||
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
|
||||
}
|
||||
|
@ -421,7 +421,8 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
|
||||
pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
|
||||
|
||||
if (RequireDeterminism() ||
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_reductions()) {
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_reductions() ||
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_ops()) {
|
||||
pipeline.AddPass<HloPassFix<GpuTreeReductionRewriter>>();
|
||||
}
|
||||
|
||||
|
@ -636,7 +636,8 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
||||
}
|
||||
|
||||
auto selected_result = filtered_results.begin();
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
if (!RequireCudnnDeterminism() &&
|
||||
!hlo_module_config.debug_options().xla_gpu_deterministic_ops()) {
|
||||
selected_result = absl::c_min_element(
|
||||
filtered_results,
|
||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
|
@ -110,6 +110,7 @@ limitations under the License.
|
||||
#include "tensorflow/compiler/xla/xla_data.pb.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/platform/errors.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
@ -1636,6 +1637,17 @@ Status IrEmitterUnnested::HandleGetTupleElement(HloInstruction*) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::AssertNonDeterminismIsOkay(const string& op_name) {
|
||||
if (hlo_module_config_.debug_options().xla_gpu_deterministic_ops()) {
|
||||
return Unimplemented(
|
||||
"HLO instruction %s does not have a deterministic implementation, "
|
||||
"but run-to-run determinism is required by "
|
||||
"--xla_gpu_deterministic_ops.",
|
||||
op_name);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::HandleSelectAndScatter(
|
||||
HloInstruction* select_and_scatter) {
|
||||
const Window& window = select_and_scatter->window();
|
||||
@ -1651,6 +1663,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
|
||||
"Dilation for SelectAndScatter not implemented on GPU.");
|
||||
}
|
||||
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(select_and_scatter->name()));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(select_and_scatter));
|
||||
return EmitSelectAndScatterFromMlir(input);
|
||||
}
|
||||
@ -1936,6 +1950,9 @@ Status IrEmitterUnnested::HandleRngGetAndUpdateState(
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
|
||||
if (!scatter->unique_indices()) {
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(scatter->name()));
|
||||
}
|
||||
TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(scatter));
|
||||
return EmitScatterFromMlir(input);
|
||||
}
|
||||
@ -2037,6 +2054,9 @@ Status IrEmitterUnnested::EmitScatter(
|
||||
|
||||
Status IrEmitterUnnested::EmitScatter(const ScatterDescriptor& desc,
|
||||
Thunk* thunk) {
|
||||
if (!desc.unique_indices) {
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(desc.name));
|
||||
}
|
||||
auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
|
||||
std::vector<llvm::Value*> raw_window_multidim;
|
||||
std::vector<llvm::Value*> input_scatter_multidim;
|
||||
|
@ -708,6 +708,8 @@ class IrEmitterUnnested : public IrEmitter,
|
||||
|
||||
Thunk::ThunkInfo GetThunkInfo(const HloInstruction* hlo) const override;
|
||||
|
||||
Status AssertNonDeterminismIsOkay(const string& op_name);
|
||||
|
||||
// The thunk sequence this IrEmitter generates for the input computation.
|
||||
ThunkSequence thunk_sequence_;
|
||||
|
||||
|
@ -306,7 +306,12 @@ message DebugOptions {
|
||||
// threads. Setting to 0 (the default value) means no enforcement.
|
||||
int32 xla_gpu_force_compilation_parallelism = 147;
|
||||
|
||||
// Next id: 148
|
||||
// Guarantees run-to-run determinism. At present, the HLO ops Scatter and
|
||||
// SelectAndScatter do not have deterministic XLA:GPU implementations.
|
||||
// Compilation errors out if these ops are encountered.
|
||||
bool xla_gpu_deterministic_ops = 148;
|
||||
|
||||
// Next id: 149
|
||||
|
||||
// Extra options to pass to the compilation backend (e.g. LLVM); specific
|
||||
// interpretation of these values is left to the backend.
|
||||
|
Loading…
Reference in New Issue
Block a user