[XLA:GPU] Add --xla_gpu_deterministic_ops to force run-to-run determinism.
Fail if there is no deterministic lowering for HLO ops in the module. The env variable `TF_DETERMINISTIC_OPS` is related, but serves a somewhat different use case. First, it is best-effort in the sense that deterministic lowerings will be preferred if they exist. However, Scatter with non-unique indices and SelectAndScatter do not have a deterministic implementation at present, a limitation that is silently ignored. In contrast, --xla_gpu_deterministic_ops will cleanly fail compilation for these ops. Second, the `TF_DETERMINISTIC_OPS` env variable is not convenient for compiler frontends other than TensorFlow. PiperOrigin-RevId: 347672477 Change-Id: I62c3093784560394b0aa173747afd350a299ed64
This commit is contained in:
parent
5f08955b75
commit
bb4a8a8b49
@ -600,6 +600,11 @@ static void AllocateFlags() {
|
||||
flag_values->xla_gpu_force_compilation_parallelism(),
|
||||
"Overrides normal multi-threaded compilation settting to use this many "
|
||||
"threads. Setting to 0 (the default value) means no enforcement."));
|
||||
flag_objects->push_back(tensorflow::Flag(
|
||||
"xla_gpu_deterministic_ops",
|
||||
bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_ops),
|
||||
flag_values->xla_gpu_deterministic_ops(),
|
||||
"Guarantees run-to-run determinism on GPU."));
|
||||
|
||||
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
|
||||
}
|
||||
|
@ -421,7 +421,8 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
|
||||
pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
|
||||
|
||||
if (RequireDeterminism() ||
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_reductions()) {
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_reductions() ||
|
||||
hlo_module->config().debug_options().xla_gpu_deterministic_ops()) {
|
||||
pipeline.AddPass<HloPassFix<GpuTreeReductionRewriter>>();
|
||||
}
|
||||
|
||||
|
@ -636,7 +636,8 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
||||
}
|
||||
|
||||
auto selected_result = filtered_results.begin();
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
if (!RequireCudnnDeterminism() &&
|
||||
!hlo_module_config.debug_options().xla_gpu_deterministic_ops()) {
|
||||
selected_result = absl::c_min_element(
|
||||
filtered_results,
|
||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
|
@ -110,6 +110,7 @@ limitations under the License.
|
||||
#include "tensorflow/compiler/xla/xla_data.pb.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/platform/errors.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
@ -1636,6 +1637,17 @@ Status IrEmitterUnnested::HandleGetTupleElement(HloInstruction*) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::AssertNonDeterminismIsOkay(const string& op_name) {
|
||||
if (hlo_module_config_.debug_options().xla_gpu_deterministic_ops()) {
|
||||
return Unimplemented(
|
||||
"HLO instruction %s does not have a deterministic implementation, "
|
||||
"but run-to-run determinism is required by "
|
||||
"--xla_gpu_deterministic_ops.",
|
||||
op_name);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::HandleSelectAndScatter(
|
||||
HloInstruction* select_and_scatter) {
|
||||
const Window& window = select_and_scatter->window();
|
||||
@ -1651,6 +1663,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
|
||||
"Dilation for SelectAndScatter not implemented on GPU.");
|
||||
}
|
||||
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(select_and_scatter->name()));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(select_and_scatter));
|
||||
return EmitSelectAndScatterFromMlir(input);
|
||||
}
|
||||
@ -1936,6 +1950,9 @@ Status IrEmitterUnnested::HandleRngGetAndUpdateState(
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
|
||||
if (!scatter->unique_indices()) {
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(scatter->name()));
|
||||
}
|
||||
TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(scatter));
|
||||
return EmitScatterFromMlir(input);
|
||||
}
|
||||
@ -2037,6 +2054,9 @@ Status IrEmitterUnnested::EmitScatter(
|
||||
|
||||
Status IrEmitterUnnested::EmitScatter(const ScatterDescriptor& desc,
|
||||
Thunk* thunk) {
|
||||
if (!desc.unique_indices) {
|
||||
TF_RETURN_IF_ERROR(AssertNonDeterminismIsOkay(desc.name));
|
||||
}
|
||||
auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
|
||||
std::vector<llvm::Value*> raw_window_multidim;
|
||||
std::vector<llvm::Value*> input_scatter_multidim;
|
||||
|
@ -708,6 +708,8 @@ class IrEmitterUnnested : public IrEmitter,
|
||||
|
||||
Thunk::ThunkInfo GetThunkInfo(const HloInstruction* hlo) const override;
|
||||
|
||||
Status AssertNonDeterminismIsOkay(const string& op_name);
|
||||
|
||||
// The thunk sequence this IrEmitter generates for the input computation.
|
||||
ThunkSequence thunk_sequence_;
|
||||
|
||||
|
@ -306,7 +306,12 @@ message DebugOptions {
|
||||
// threads. Setting to 0 (the default value) means no enforcement.
|
||||
int32 xla_gpu_force_compilation_parallelism = 147;
|
||||
|
||||
// Next id: 148
|
||||
// Guarantees run-to-run determinism. At present, the HLO ops Scatter and
|
||||
// SelectAndScatter do not have deterministic XLA:GPU implementations.
|
||||
// Compilation errors out if these ops are encountered.
|
||||
bool xla_gpu_deterministic_ops = 148;
|
||||
|
||||
// Next id: 149
|
||||
|
||||
// Extra options to pass to the compilation backend (e.g. LLVM); specific
|
||||
// interpretation of these values is left to the backend.
|
||||
|
Loading…
Reference in New Issue
Block a user