From 27903a397838abaa6833541872f2ca60114725b9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 9 Jul 2019 08:20:32 -0700 Subject: [PATCH 1/6] [XLA] Add a Scatter option to skip atomic operations. The default keeps backward compatibility with the behavior from before this option existed. --- tensorflow/compiler/xla/client/xla_builder.cc | 8 +++++--- tensorflow/compiler/xla/client/xla_builder.h | 6 +++--- tensorflow/compiler/xla/python/xla_client.py | 5 +++-- .../xla/service/gpu/ir_emitter_unnested.cc | 19 ++++++++++++++----- .../xla/service/gpu/ir_emitter_unnested.h | 8 ++++++-- tensorflow/compiler/xla/service/hlo.proto | 7 ++++++- .../compiler/xla/service/hlo_instruction.cc | 6 +++--- .../compiler/xla/service/hlo_instruction.h | 7 ++++++- .../compiler/xla/service/hlo_instructions.cc | 14 ++++++++++---- .../compiler/xla/service/hlo_instructions.h | 4 +++- tensorflow/compiler/xla/service/hlo_parser.cc | 5 ++++- 11 files changed, 63 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index dccdec22fb9..148a6f65c30 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1734,11 +1734,13 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; instr.set_indices_are_sorted(indices_are_sorted); + instr.set_use_atomic(use_atomic); + TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, GetShape(scatter_indices)); @@ -3378,10 +3380,10 @@ XlaOp Gather(const XlaOp input, const XlaOp start_indices, XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices, const XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return input.builder()->Scatter(input, scatter_indices, updates, update_computation, dimension_numbers, - indices_are_sorted); + indices_are_sorted, use_atomic); } void Send(const XlaOp operand, const ChannelHandle& handle) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 5c28e8b5150..30553183531 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -592,7 +592,7 @@ class XlaBuilder { XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false); + bool indices_are_sorted = false, bool use_atomic = true); void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, @@ -1010,7 +1010,7 @@ class XlaBuilder { friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); friend void Send(XlaOp operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& shape, const ChannelHandle& handle); @@ -1869,7 +1869,7 @@ XlaOp Gather(XlaOp input, XlaOp
start_indices, XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false); + bool indices_are_sorted = false, bool use_atomic = true); // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 63a9ea37692..b2014c08951 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1544,11 +1544,12 @@ class ComputationBuilder(object): updates, update_computation, dimension_numbers, - indices_are_sorted=False): + indices_are_sorted=False, + use_atomic=True): """Enqueues a Scatter operation onto the computation.""" return ops.Scatter(a, scatter_indices, updates, update_computation.computation, dimension_numbers, - indices_are_sorted) + indices_are_sorted, use_atomic) def Fft(self, operand, fft_type, fft_lengths): """Enqueues a FFT operation onto the computation.""" diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 0541367bc71..423b27cc2cd 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -403,7 +403,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { /*scatter_indices_gen=*/ scatter_fused_emitter.GetGenerator(root->operand(1)), /*updates_gen=*/ - scatter_fused_emitter.GetGenerator(root->operand(2)))); + scatter_fused_emitter.GetGenerator(root->operand(2)), + root->use_atomic())); } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), fusion)); @@ -840,7 +841,8 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { [=](const IrArray::Index& index) { return GetIrArray(*updates, *scatter) .EmitReadArrayElement(index, &b_, "update"); - })); + }, + scatter->use_atomic())); // Elide the sequential thunk if there's no copy. if (thunks.size() == 1) { @@ -856,7 +858,8 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { Status IrEmitterUnnested::EmitScatter( Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, - const llvm_ir::ElementGenerator& updates_gen) { + const llvm_ir::ElementGenerator& updates_gen, + bool use_atomic) { const HloInstruction* operand = scatter->operand(0); const HloInstruction* scatter_indices = scatter->operand(1); const HloInstruction* updates = scatter->operand(2); @@ -965,8 +968,14 @@ Status IrEmitterUnnested::EmitScatter( updates->shape().element_type(), module_)); TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index)); Store(input_ir_value, input_address); - return EmitAtomicOperationForNestedComputation( - *scatter->to_apply(), output_address, input_address); + + if (scatter->use_atomic()) { + return EmitAtomicOperationForNestedComputation( + *scatter->to_apply(), output_address, input_address); + } else { + return EmitCallToNestedComputation( + *scatter->to_apply(), {output_address, input_address}, output_address); + } }; // Launch a kernel that reads every element in the updates tensor. 
We could diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 4fcc5dedb67..a00b80b07e8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -226,10 +226,14 @@ class IrEmitterUnnested : public IrEmitter, // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is - // expected to have the operand values in it already. + // expected to have the operand values in it already. If use_atomic + // is true, we will use an atomic update. Using false for use_atomic + // is safe only when it is guaranteed that there is no duplicate + // indices. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, - const llvm_ir::ElementGenerator& updates_gen); + const llvm_ir::ElementGenerator& updates_gen, + bool use_atomic); // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel // for the hlo instruction. diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 61e562c7eda..84abe7ec0ab 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 69 +// Next ID: 70 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -237,6 +237,11 @@ message HloInstructionProto { // Frontend attributes to pass to the XLA backend. xla.FrontendAttributes frontend_attributes = 68; + + // Specifies if the scatter should use atomic operation or not. If + // there is duplicate index, then it should be true to compute the + // right answer. + bool use_atomic = 69; } // Serialization of HloComputation. 
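The hlo.proto comment above states the correctness constraint behind the new flag: when two updates target the same output element, the combiner must be applied atomically or one update can be lost. The following host-side C++ sketch is an editorial illustration only, not part of the patch; the 3-element operand, the duplicated index 0, and the values are made up, and the add combiner simply matches the scatter-add used elsewhere in this series:

#include <cstddef>
#include <iostream>
#include <vector>

// Editorial sketch: scatter-add on the host, showing why duplicate scatter
// indices require the atomic update path on the GPU.
int main() {
  std::vector<int> operand = {1, 2, 3};
  std::vector<int> indices = {0, 0};   // index 0 appears twice
  std::vector<int> updates = {10, 20};

  // Serialized (atomic-equivalent) read-modify-write: both updates land.
  std::vector<int> atomic_result = operand;
  for (std::size_t i = 0; i < indices.size(); ++i) {
    atomic_result[indices[i]] += updates[i];  // 1 + 10 + 20 = 31
  }

  // One possible interleaving without atomics: two threads read the original
  // value before either has written, so the first addition is lost.
  std::vector<int> racy_result = operand;
  int read_a = racy_result[0];           // thread A reads 1
  int read_b = racy_result[0];           // thread B reads 1
  racy_result[0] = read_a + updates[0];  // thread A writes 11
  racy_result[0] = read_b + updates[1];  // thread B writes 21; A's add is lost

  std::cout << "atomic: " << atomic_result[0]   // 31
            << ", racy: " << racy_result[0]     // 21
            << std::endl;
  return 0;
}

When every scattered-to index is distinct, the two paths agree, which is why skipping the atomics is only offered as an explicit opt-in.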
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index dabd7ab2836..ccd9041873a 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -565,7 +565,7 @@ StatusOr> HloInstruction::CreateFromProto( proto.scatter_dimension_numbers()); instruction = CreateScatter(shape, operands(0), operands(1), operands(2), computations(0), *scatter_dimension_numbers, - proto.indices_are_sorted()); + proto.indices_are_sorted(), proto.use_atomic()); break; } case HloOpcode::kIota: @@ -1393,10 +1393,10 @@ bool HloInstruction::HasSideEffect() const { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return absl::make_unique( shape, operand, scatter_indices, updates, update_computation, - scatter_dim_numbers, indices_are_sorted); + scatter_dim_numbers, indices_are_sorted, use_atomic); } /* static */ std::unique_ptr HloInstruction::CreateDomain( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 3119b52e377..f28184e7e78 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -801,7 +801,7 @@ class HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -1629,6 +1629,11 @@ class HloInstruction { LOG(FATAL) << "Unimplemented method."; } + // Returns the use_atomic field. + virtual bool use_atomic() const { + LOG(FATAL) << "Unimplemented method."; + } + // Returns data on the dimension numbers used for a convolution operation, // which may be a kConvolution instruction or a kCustomCall that implements a // convolution. 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 183967941bf..e54eb81ed1f 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -2493,9 +2493,10 @@ HloScatterInstruction::HloScatterInstruction( const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted) + const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted, + bool use_atomic) : HloInstruction(HloOpcode::kScatter, shape), - indices_are_sorted_(indices_are_sorted) { + indices_are_sorted_(indices_are_sorted), use_atomic_(use_atomic) { AppendOperand(operand); AppendOperand(scatter_indices); AppendOperand(updates); @@ -2550,6 +2551,7 @@ HloInstructionProto HloScatterInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); proto.set_indices_are_sorted(indices_are_sorted()); + proto.set_use_atomic(use_atomic()); return proto; } @@ -2560,6 +2562,9 @@ std::vector HloScatterInstruction::ExtraAttributesToStringImpl( if (indices_are_sorted()) { attrs.push_back("indices_are_sorted=true"); } + if (!use_atomic()) { + attrs.push_back("use_atomic=false"); + } return attrs; } @@ -2572,7 +2577,8 @@ bool HloScatterInstruction::IdenticalSlowPath( scatter_dimension_numbers(), casted_other.scatter_dimension_numbers()) && eq_computations(to_apply(), casted_other.to_apply()) && - indices_are_sorted() == casted_other.indices_are_sorted(); + indices_are_sorted() == casted_other.indices_are_sorted() && + use_atomic() == casted_other.use_atomic(); } std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( @@ -2581,7 +2587,7 @@ std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 3); return absl::make_unique( shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), - scatter_dimension_numbers(), indices_are_sorted()); + scatter_dimension_numbers(), indices_are_sorted(), use_atomic()); } HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 0de050108b7..cec090172ee 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1453,7 +1453,7 @@ class HloScatterInstruction : public HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); const ScatterDimensionNumbers& scatter_dimension_numbers() const { CHECK(scatter_dimension_numbers_ != nullptr); return *scatter_dimension_numbers_; @@ -1462,6 +1462,7 @@ class HloScatterInstruction : public HloInstruction { void set_indices_are_sorted(bool indices_are_sorted) { indices_are_sorted_ = indices_are_sorted; } + bool use_atomic() const override { return use_atomic_; } // Returns a serialized representation of this instruction. 
HloInstructionProto ToProto() const override; @@ -1489,6 +1490,7 @@ class HloScatterInstruction : public HloInstruction { std::unique_ptr scatter_dimension_numbers_; bool indices_are_sorted_; + bool use_atomic_; }; class HloIotaInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index c96bfb15187..b5eb0694275 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -1726,6 +1726,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional indices_are_sorted = false; attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, &indices_are_sorted}; + optional use_atomic = true; + attrs["use_atomic"] = {/*required=*/false, AttrTy::kBool, + &use_atomic}; if (!ParseOperands(&operands, /*expected_size=*/3) || !ParseAttributes(attrs)) { @@ -1742,7 +1745,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateScatter( shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], /*updates=*/operands[2], *update_computation, dim_numbers, - indices_are_sorted.value())); + indices_are_sorted.value(), use_atomic.value())); break; } case HloOpcode::kDomain: { From 7d22d959e0bc3dc126ea0a431ebcb7690d48ac9d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 11 Jul 2019 08:39:40 -0700 Subject: [PATCH 2/6] Add a test for Scatter without atomics. --- tensorflow/compiler/xla/tests/scatter_test.cc | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc index 05073de9c90..1b8f58f858a 100644 --- a/tensorflow/compiler/xla/tests/scatter_test.cc +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -225,6 +225,36 @@ ENTRY main { RunTest(hlo_text, &operand, &scatter_indices, &updates); } +XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_NoAtomic) { + const string hlo_text = R"( +HloModule TensorFlowScatter_Add + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + use_atomic=false +} +)"; + Literal operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); + Literal updates = LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, &operand, &scatter_indices, &updates); +} + XLA_TEST_F(ScatterTest, TensorFlowScatter_Mul) { const string hlo_text = R"( HloModule TensorFlowScatter_Mul From 5fea6eecde8dde2e122ad29217c3fff33e89ded2 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 12 Jul 2019 06:20:56 -0700 Subject: [PATCH 3/6] Update comment --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index a00b80b07e8..8308e97ec99 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -228,8 
+228,10 @@ class IrEmitterUnnested : public IrEmitter, // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is // expected to have the operand values in it already. If use_atomic // is true, we will use an atomic update. Using false for use_atomic - // is safe only when it is guaranteed that there is no duplicate + // is safe only when it is guaranteed that there are no duplicate // indices. + // When using use_atomi=false, it is the caller responsibility to + // ensure there is overlap. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, From c77c165dcd169ce8a02e2f86a0612b859a205000 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 06:36:32 -0700 Subject: [PATCH 4/6] Add documentation and tests --- .../compiler/xla/g3doc/operation_semantics.md | 7 +++++++ .../xla/service/hlo_instruction_test.cc | 3 ++- .../compiler/xla/service/hlo_parser_test.cc | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 1f2790e98bb..bb6db42e168 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1382,6 +1382,10 @@ For a more intuitive description, see the "Informal Description" section below. | `indices_are_sorted` | `bool` | Whether the indices are | : : : guaranteed to be sorted by : : : : the caller. : +| `use_atomic` | `bool` | Whether to use atomic | +: : : operation for the update. To : +: : : use only when the the caller : +: : : guarante no duplicate indices : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1450,6 +1454,9 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. +If `use_atomic` is set to false then XLA will not use atomic operation. This +is only safe when there is no duplicate indices. 
+ ### Informal Description and Examples Informally, every index `Out` in the output array corresponds to an element `E` diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 0a50ed04af7..d3ef5ec83c8 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -1529,7 +1529,8 @@ TEST_F(HloInstructionTest, StringifyScatter) { /*inserted_window_dims=*/{}, /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), - /*indices_are_sorted=*/false)); + /*indices_are_sorted=*/false, + /*use_atomic=*/true)); module->AddEntryComputation(builder.Build()); EXPECT_EQ( diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index c913784cd13..0f06af69ff5 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -934,6 +934,25 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, indices_are_sorted=true, to_apply=%add_F32.v3 } +)" +}, +{ +"AtomicScatter", +R"(HloModule StringifyAtomicScatter + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7,5], updates: f32[10,9,8,7,30,29,28,27,26]) -> f32[50,49,48,47,46] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, use_atomic=false, to_apply=%add_F32.v3 +} + )" }, { From 2a99d7e898a5a96e258a4c323c1166da6ed4b0a9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 10:01:42 -0700 Subject: [PATCH 5/6] change the new parameter name. 
--- tensorflow/compiler/xla/client/xla_builder.cc | 8 ++++---- tensorflow/compiler/xla/client/xla_builder.h | 6 +++--- .../compiler/xla/g3doc/operation_semantics.md | 13 +++++++------ tensorflow/compiler/xla/python/xla_client.py | 4 ++-- .../xla/service/gpu/ir_emitter_unnested.cc | 8 ++++---- .../compiler/xla/service/gpu/ir_emitter_unnested.h | 10 +++++----- tensorflow/compiler/xla/service/hlo.proto | 7 +++---- tensorflow/compiler/xla/service/hlo_instruction.cc | 6 +++--- tensorflow/compiler/xla/service/hlo_instruction.h | 6 +++--- .../compiler/xla/service/hlo_instruction_test.cc | 2 +- .../compiler/xla/service/hlo_instructions.cc | 14 +++++++------- tensorflow/compiler/xla/service/hlo_instructions.h | 6 +++--- tensorflow/compiler/xla/service/hlo_parser.cc | 8 ++++---- tensorflow/compiler/xla/service/hlo_parser_test.cc | 6 +++--- tensorflow/compiler/xla/tests/scatter_test.cc | 4 ++-- 15 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 148a6f65c30..d0c80084473 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1734,12 +1734,12 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; instr.set_indices_are_sorted(indices_are_sorted); - instr.set_use_atomic(use_atomic); + instr.set_unique_indices(unique_indices); TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, @@ -3380,10 +3380,10 @@ XlaOp Gather(const XlaOp input, const XlaOp start_indices, XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices, const XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return input.builder()->Scatter(input, scatter_indices, updates, update_computation, dimension_numbers, - indices_are_sorted, use_atomic); + indices_are_sorted, unique_indices); } void Send(const XlaOp operand, const ChannelHandle& handle) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 30553183531..187cd261833 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -592,7 +592,7 @@ class XlaBuilder { XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false, bool use_atomic = true); + bool indices_are_sorted = false, bool unique_indices = false); void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, @@ -1010,7 +1010,7 @@ class XlaBuilder { friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, bool unique_indices); friend void Send(XlaOp operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& 
shape, const ChannelHandle& handle); @@ -1869,7 +1869,7 @@ XlaOp Gather(XlaOp input, XlaOp start_indices, XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false, bool use_atomic = true); + bool indices_are_sorted = false, bool unique_indices = false); // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index bb6db42e168..355e5fec73a 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1382,10 +1382,9 @@ For a more intuitive description, see the "Informal Description" section below. | `indices_are_sorted` | `bool` | Whether the indices are | : : : guaranteed to be sorted by : : : : the caller. : -| `use_atomic` | `bool` | Whether to use atomic | -: : : operation for the update. To : -: : : use only when the the caller : -: : : guarante no duplicate indices : +| `unique_indices` | `bool` | Whether the indices are | +: : : guaranteed to be unique by : +: : : the caller : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1454,8 +1453,10 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. -If `use_atomic` is set to false then XLA will not use atomic operation. This -is only safe when there is no duplicate indices. +If `unique_indices` is set to true then XLA can assume that all +element scattered to are unique. So XLA could use non-atomic +operation. If they are not, then the semantics is implementation +defined. 
### Informal Description and Examples diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index b2014c08951..7abd2f7429d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1545,11 +1545,11 @@ class ComputationBuilder(object): update_computation, dimension_numbers, indices_are_sorted=False, - use_atomic=True): + unique_indices=False): """Enqueues a Scatter operation onto the computation.""" return ops.Scatter(a, scatter_indices, updates, update_computation.computation, dimension_numbers, - indices_are_sorted, use_atomic) + indices_are_sorted, unique_indices) def Fft(self, operand, fft_type, fft_lengths): """Enqueues a FFT operation onto the computation.""" diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 423b27cc2cd..e9cf69df882 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -404,7 +404,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { scatter_fused_emitter.GetGenerator(root->operand(1)), /*updates_gen=*/ scatter_fused_emitter.GetGenerator(root->operand(2)), - root->use_atomic())); + root->unique_indices())); } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), fusion)); @@ -842,7 +842,7 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { return GetIrArray(*updates, *scatter) .EmitReadArrayElement(index, &b_, "update"); }, - scatter->use_atomic())); + scatter->unique_indices())); // Elide the sequential thunk if there's no copy. if (thunks.size() == 1) { @@ -859,7 +859,7 @@ Status IrEmitterUnnested::EmitScatter( Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, - bool use_atomic) { + bool unique_indices) { const HloInstruction* operand = scatter->operand(0); const HloInstruction* scatter_indices = scatter->operand(1); const HloInstruction* updates = scatter->operand(2); @@ -969,7 +969,7 @@ Status IrEmitterUnnested::EmitScatter( TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index)); Store(input_ir_value, input_address); - if (scatter->use_atomic()) { + if (!scatter->unique_indices()) { return EmitAtomicOperationForNestedComputation( *scatter->to_apply(), output_address, input_address); } else { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 8308e97ec99..9c69545cb9f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -226,16 +226,16 @@ class IrEmitterUnnested : public IrEmitter, // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is - // expected to have the operand values in it already. If use_atomic - // is true, we will use an atomic update. Using false for use_atomic + // expected to have the operand values in it already. If unique_indices + // is false, we will use an atomic update. Using true for unique_indices // is safe only when it is guaranteed that there are no duplicate // indices. - // When using use_atomi=false, it is the caller responsibility to - // ensure there is overlap.
+ // When using unique_indices=true, it is the caller responsibility to + // ensure there is no overlap. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, - bool use_atomic); + bool unique_indices); // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel // for the hlo instruction. diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 84abe7ec0ab..286562d0226 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -238,10 +238,9 @@ message HloInstructionProto { // Frontend attributes to pass to the XLA backend. xla.FrontendAttributes frontend_attributes = 68; - // Specifies if the scatter should use atomic operation or not. If - // there is duplicate index, then it should be true to compute the - // right answer. - bool use_atomic = 69; + // Specifies if all elements updated are guaranteed to be unique by + // the caller. + bool unique_indices = 69; } // Serialization of HloComputation. diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index ccd9041873a..52330af56a8 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -565,7 +565,7 @@ StatusOr> HloInstruction::CreateFromProto( proto.scatter_dimension_numbers()); instruction = CreateScatter(shape, operands(0), operands(1), operands(2), computations(0), *scatter_dimension_numbers, - proto.indices_are_sorted(), proto.use_atomic()); + proto.indices_are_sorted(), proto.unique_indices()); break; } case HloOpcode::kIota: @@ -1393,10 +1393,10 @@ bool HloInstruction::HasSideEffect() const { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return absl::make_unique( shape, operand, scatter_indices, updates, update_computation, - scatter_dim_numbers, indices_are_sorted, use_atomic); + scatter_dim_numbers, indices_are_sorted, unique_indices); } /* static */ std::unique_ptr HloInstruction::CreateDomain( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index f28184e7e78..23267ecab5e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -801,7 +801,7 @@ class HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, bool unique_indices); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -1629,8 +1629,8 @@ class HloInstruction { LOG(FATAL) << "Unimplemented method."; } - // Returns the use_atomic field. - virtual bool use_atomic() const { + // Returns the unique_indices field. 
+ virtual bool unique_indices() const { LOG(FATAL) << "Unimplemented method."; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index d3ef5ec83c8..71dde2764fb 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -1530,7 +1530,7 @@ TEST_F(HloInstructionTest, StringifyScatter) { /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), /*indices_are_sorted=*/false, - /*use_atomic=*/true)); + /*unique_indices=*/false)); module->AddEntryComputation(builder.Build()); EXPECT_EQ( diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index e54eb81ed1f..3c5ecbe06e2 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -2494,9 +2494,9 @@ HloScatterInstruction::HloScatterInstruction( HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted, - bool use_atomic) + bool unique_indices) : HloInstruction(HloOpcode::kScatter, shape), - indices_are_sorted_(indices_are_sorted), use_atomic_(use_atomic) { + indices_are_sorted_(indices_are_sorted), unique_indices_(unique_indices) { AppendOperand(operand); AppendOperand(scatter_indices); AppendOperand(updates); @@ -2551,7 +2551,7 @@ HloInstructionProto HloScatterInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); proto.set_indices_are_sorted(indices_are_sorted()); - proto.set_use_atomic(use_atomic()); + proto.set_unique_indices(unique_indices()); return proto; } @@ -2562,8 +2562,8 @@ std::vector HloScatterInstruction::ExtraAttributesToStringImpl( if (indices_are_sorted()) { attrs.push_back("indices_are_sorted=true"); } - if (!use_atomic()) { - attrs.push_back("use_atomic=false"); + if (unique_indices()) { + attrs.push_back("unique_indices=true"); } return attrs; } @@ -2578,7 +2578,7 @@ bool HloScatterInstruction::IdenticalSlowPath( casted_other.scatter_dimension_numbers()) && eq_computations(to_apply(), casted_other.to_apply()) && indices_are_sorted() == casted_other.indices_are_sorted() && - use_atomic() == casted_other.use_atomic(); + unique_indices() == casted_other.unique_indices(); } std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( @@ -2587,7 +2587,7 @@ std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 3); return absl::make_unique( shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), - scatter_dimension_numbers(), indices_are_sorted(), use_atomic()); + scatter_dimension_numbers(), indices_are_sorted(), unique_indices()); } HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index cec090172ee..59f2392866b 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1453,7 +1453,7 @@ class HloScatterInstruction : public HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, 
bool unique_indices); const ScatterDimensionNumbers& scatter_dimension_numbers() const { CHECK(scatter_dimension_numbers_ != nullptr); return *scatter_dimension_numbers_; @@ -1462,7 +1462,7 @@ class HloScatterInstruction : public HloInstruction { void set_indices_are_sorted(bool indices_are_sorted) { indices_are_sorted_ = indices_are_sorted; } - bool use_atomic() const override { return use_atomic_; } + bool unique_indices() const override { return unique_indices_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1490,7 +1490,7 @@ class HloScatterInstruction : public HloInstruction { std::unique_ptr scatter_dimension_numbers_; bool indices_are_sorted_; - bool use_atomic_; + bool unique_indices_; }; class HloIotaInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index b5eb0694275..6716c4f1744 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -1726,9 +1726,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional indices_are_sorted = false; attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, &indices_are_sorted}; - optional use_atomic = true; - attrs["use_atomic"] = {/*required=*/false, AttrTy::kBool, - &use_atomic}; + optional unique_indices = false; + attrs["unique_indices"] = {/*required=*/false, AttrTy::kBool, + &unique_indices}; if (!ParseOperands(&operands, /*expected_size=*/3) || !ParseAttributes(attrs)) { @@ -1745,7 +1745,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateScatter( shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], /*updates=*/operands[2], *update_computation, dim_numbers, - indices_are_sorted.value(), use_atomic.value())); + indices_are_sorted.value(), unique_indices.value())); break; } case HloOpcode::kDomain: { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 0f06af69ff5..a2c8c61bee5 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -937,8 +937,8 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 )" }, { -"AtomicScatter", -R"(HloModule StringifyAtomicScatter +"UniqueIndicesScatter", +R"(HloModule StringifyUniqueIndicesScatter %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { %lhs = f32[] parameter(0) @@ -950,7 +950,7 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) - ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, use_atomic=false, to_apply=%add_F32.v3 + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, 
scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, unique_indices=true, to_apply=%add_F32.v3 } )" diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc index 1b8f58f858a..0fdd176e8ef 100644 --- a/tensorflow/compiler/xla/tests/scatter_test.cc +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -225,7 +225,7 @@ ENTRY main { RunTest(hlo_text, &operand, &scatter_indices, &updates); } -XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_NoAtomic) { +XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_UniqueIndices) { const string hlo_text = R"( HloModule TensorFlowScatter_Add @@ -245,7 +245,7 @@ ENTRY main { inserted_window_dims={0}, scatter_dims_to_operand_dims={0}, index_vector_dim=1, - use_atomic=false + unique_indices=true } )"; Literal operand = From c08122ad6a7993054bfbc6557d99f2833a916f4a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 12:47:41 -0700 Subject: [PATCH 6/6] Fix doc typo and add a test. --- .../compiler/xla/g3doc/operation_semantics.md | 5 ++-- .../xla/service/gpu/ir_emitter_unnested.h | 2 +- .../xla/service/gpu/tests/gpu_atomic_test.cc | 27 +++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 355e5fec73a..a1772a0d1e1 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1384,7 +1384,7 @@ For a more intuitive description, see the "Informal Description" section below. : : : the caller. : | `unique_indices` | `bool` | Whether the indices are | : : : guaranteed to be unique by : -: : : the caller : +: : : the caller. : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1454,7 +1455,8 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. If `unique_indices` is set to true then XLA can assume that all element scattered to are unique. So XLA could use non-atomic -operation. If they are not, then the semantics is implementation +operations. If `unique_indices` is set to true and the indices being +scattered to are not unique then the semantics is implementation defined. ### Informal Description and Examples diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 9c69545cb9f..0f52aecda23 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -230,7 +230,7 @@ class IrEmitterUnnested : public IrEmitter, // is false, we will use an atomic update. Using true for unique_indices // is safe only when it is guaranteed that there are no duplicate // indices. - // When using unique_indices=true, it is the caller responsibility to + // When using unique_indices=true, it is the caller's responsibility to // ensure there is no overlap.
Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc index 6b18c4c6371..a54c0e5ae44 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc @@ -53,6 +53,33 @@ CHECK: store atomic{{.*}}unordered, align 4 )"); } +TEST_F(GpuAtomicTest, TestStoreNoAtomic) { + const char* hlo_string = R"( + HloModule TensorFlowScatterV1 + + update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) + } + + ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, unique_indices=true + } +)"; + + CompileAndVerifyIr(hlo_string, R"( +CHECK-NOT: store atomic{{.*}}unordered, align 4 +)"); +} + } // namespace } // namespace gpu } // namespace xla
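For reference, the sketch below shows how a client could opt into the non-atomic path through the final API introduced by this series. It is an editorial illustration rather than part of the patches: the free functions Parameter, Add and Scatter, the ScatterDimensionNumbers fields, and the unique_indices parameter follow the signatures visible in the diffs above, while the helper function name, the shapes, and the use of ConsumeValueOrDie are assumptions made for the example (the dimension numbers mirror the TensorFlowScatter_Add_UniqueIndices test).

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

namespace xla {

// Editorial sketch: scatter-add two rows into a 3x3 operand while promising
// the row indices are unique, so the GPU backend may emit non-atomic updates.
XlaOp BuildScatterWithUniqueIndices(XlaBuilder* builder) {
  // Update computation: add the incoming update to the existing element.
  XlaBuilder add_builder("add_s32");
  XlaOp lhs = Parameter(&add_builder, 0, ShapeUtil::MakeShape(S32, {}), "lhs");
  XlaOp rhs = Parameter(&add_builder, 1, ShapeUtil::MakeShape(S32, {}), "rhs");
  Add(lhs, rhs);
  XlaComputation add_computation = add_builder.Build().ConsumeValueOrDie();

  XlaOp operand =
      Parameter(builder, 0, ShapeUtil::MakeShape(S32, {3, 3}), "operand");
  XlaOp indices =
      Parameter(builder, 1, ShapeUtil::MakeShape(S32, {2}), "indices");
  XlaOp updates =
      Parameter(builder, 2, ShapeUtil::MakeShape(S32, {2, 3}), "updates");

  // Same dimension numbers as the new scatter test in this series.
  ScatterDimensionNumbers dnums;
  dnums.add_update_window_dims(1);
  dnums.add_inserted_window_dims(0);
  dnums.add_scatter_dims_to_operand_dims(0);
  dnums.set_index_vector_dim(1);

  // The caller guarantees the indices are unique; if they are not, the
  // result is implementation defined, as the operation semantics above state.
  return Scatter(operand, indices, updates, add_computation, dnums,
                 /*indices_are_sorted=*/false, /*unique_indices=*/true);
}

}  // namespace xla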