From 27903a397838abaa6833541872f2ca60114725b9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 9 Jul 2019 08:20:32 -0700 Subject: [PATCH 1/6] [XLA] Add a Scatter option to skip atomic operations. The default keeps backward compatibility with the behavior from before this option existed. --- tensorflow/compiler/xla/client/xla_builder.cc | 8 +++++--- tensorflow/compiler/xla/client/xla_builder.h | 6 +++--- tensorflow/compiler/xla/python/xla_client.py | 5 +++-- .../xla/service/gpu/ir_emitter_unnested.cc | 19 ++++++++++++++----- .../xla/service/gpu/ir_emitter_unnested.h | 8 ++++++-- tensorflow/compiler/xla/service/hlo.proto | 7 ++++++- .../compiler/xla/service/hlo_instruction.cc | 6 +++--- .../compiler/xla/service/hlo_instruction.h | 7 ++++++- .../compiler/xla/service/hlo_instructions.cc | 14 ++++++++++---- .../compiler/xla/service/hlo_instructions.h | 4 +++- tensorflow/compiler/xla/service/hlo_parser.cc | 5 ++++- 11 files changed, 63 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index dccdec22fb9..148a6f65c30 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1734,11 +1734,13 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; instr.set_indices_are_sorted(indices_are_sorted); + instr.set_use_atomic(use_atomic); + TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, GetShape(scatter_indices)); @@ -3378,10 +3380,10 @@ XlaOp Gather(const XlaOp input, const XlaOp start_indices, XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices, const XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return input.builder()->Scatter(input, scatter_indices, updates, update_computation, dimension_numbers, - indices_are_sorted); + indices_are_sorted, use_atomic); } void Send(const XlaOp operand, const ChannelHandle& handle) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 5c28e8b5150..30553183531 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -592,7 +592,7 @@ class XlaBuilder { XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false); + bool indices_are_sorted = false, bool use_atomic = true); void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, @@ -1010,7 +1010,7 @@ class XlaBuilder { friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); friend void Send(XlaOp operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& shape, const ChannelHandle& handle); @@ -1869,7 +1869,7 @@ XlaOp Gather(XlaOp input, XlaOp
start_indices, XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false); + bool indices_are_sorted = false, bool use_atomic = true); // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 63a9ea37692..b2014c08951 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1544,11 +1544,12 @@ class ComputationBuilder(object): updates, update_computation, dimension_numbers, - indices_are_sorted=False): + indices_are_sorted=False, + use_atomic=True): """Enqueues a Scatter operation onto the computation.""" return ops.Scatter(a, scatter_indices, updates, update_computation.computation, dimension_numbers, - indices_are_sorted) + indices_are_sorted, use_atomic) def Fft(self, operand, fft_type, fft_lengths): """Enqueues a FFT operation onto the computation.""" diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 0541367bc71..423b27cc2cd 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -403,7 +403,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { /*scatter_indices_gen=*/ scatter_fused_emitter.GetGenerator(root->operand(1)), /*updates_gen=*/ - scatter_fused_emitter.GetGenerator(root->operand(2)))); + scatter_fused_emitter.GetGenerator(root->operand(2)), + root->use_atomic())); } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), fusion)); @@ -840,7 +841,8 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { [=](const IrArray::Index& index) { return GetIrArray(*updates, *scatter) .EmitReadArrayElement(index, &b_, "update"); - })); + }, + scatter->use_atomic())); // Elide the sequential thunk if there's no copy. if (thunks.size() == 1) { @@ -856,7 +858,8 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { Status IrEmitterUnnested::EmitScatter( Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, - const llvm_ir::ElementGenerator& updates_gen) { + const llvm_ir::ElementGenerator& updates_gen, + bool use_atomic) { const HloInstruction* operand = scatter->operand(0); const HloInstruction* scatter_indices = scatter->operand(1); const HloInstruction* updates = scatter->operand(2); @@ -965,8 +968,14 @@ Status IrEmitterUnnested::EmitScatter( updates->shape().element_type(), module_)); TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index)); Store(input_ir_value, input_address); - return EmitAtomicOperationForNestedComputation( - *scatter->to_apply(), output_address, input_address); + + if (scatter->use_atomic()) { + return EmitAtomicOperationForNestedComputation( + *scatter->to_apply(), output_address, input_address); + } else { + return EmitCallToNestedComputation( + *scatter->to_apply(), {output_address, input_address}, output_address); + } }; // Launch a kernel that reads every element in the updates tensor. 
We could diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 4fcc5dedb67..a00b80b07e8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -226,10 +226,14 @@ class IrEmitterUnnested : public IrEmitter, // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is - // expected to have the operand values in it already. + // expected to have the operand values in it already. If use_atomic + // is true, we will use an atomic update. Using false for use_atomic + // is safe only when it is guaranteed that there is no duplicate + // indices. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, - const llvm_ir::ElementGenerator& updates_gen); + const llvm_ir::ElementGenerator& updates_gen, + bool use_atomic); // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel // for the hlo instruction. diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 61e562c7eda..84abe7ec0ab 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 69 +// Next ID: 70 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -237,6 +237,11 @@ message HloInstructionProto { // Frontend attributes to pass to the XLA backend. xla.FrontendAttributes frontend_attributes = 68; + + // Specifies if the scatter should use atomic operation or not. If + // there is duplicate index, then it should be true to compute the + // right answer. + bool use_atomic = 69; } // Serialization of HloComputation. 
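The hlo.proto comment above states the correctness constraint behind the new flag: when two updates target the same output element, the combiner must be applied atomically or one update can be lost. The following host-side C++ sketch is an editorial illustration only, not part of the patch; the 3-element operand, the duplicated index 0, and the values are made up, and the add combiner simply matches the scatter-add used elsewhere in this series:

#include <cstddef>
#include <iostream>
#include <vector>

// Editorial sketch: scatter-add on the host, showing why duplicate scatter
// indices require the atomic update path on the GPU.
int main() {
  std::vector<int> operand = {1, 2, 3};
  std::vector<int> indices = {0, 0};   // index 0 appears twice
  std::vector<int> updates = {10, 20};

  // Serialized (atomic-equivalent) read-modify-write: both updates land.
  std::vector<int> atomic_result = operand;
  for (std::size_t i = 0; i < indices.size(); ++i) {
    atomic_result[indices[i]] += updates[i];  // 1 + 10 + 20 = 31
  }

  // One possible interleaving without atomics: two threads read the original
  // value before either has written, so the first addition is lost.
  std::vector<int> racy_result = operand;
  int read_a = racy_result[0];           // thread A reads 1
  int read_b = racy_result[0];           // thread B reads 1
  racy_result[0] = read_a + updates[0];  // thread A writes 11
  racy_result[0] = read_b + updates[1];  // thread B writes 21; A's add is lost

  std::cout << "atomic: " << atomic_result[0]   // 31
            << ", racy: " << racy_result[0]     // 21
            << std::endl;
  return 0;
}

When every scattered-to index is distinct, the two paths agree, which is why skipping the atomics is only offered as an explicit opt-in.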
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index dabd7ab2836..ccd9041873a 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -565,7 +565,7 @@ StatusOr> HloInstruction::CreateFromProto( proto.scatter_dimension_numbers()); instruction = CreateScatter(shape, operands(0), operands(1), operands(2), computations(0), *scatter_dimension_numbers, - proto.indices_are_sorted()); + proto.indices_are_sorted(), proto.use_atomic()); break; } case HloOpcode::kIota: @@ -1393,10 +1393,10 @@ bool HloInstruction::HasSideEffect() const { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted) { + bool indices_are_sorted, bool use_atomic) { return absl::make_unique( shape, operand, scatter_indices, updates, update_computation, - scatter_dim_numbers, indices_are_sorted); + scatter_dim_numbers, indices_are_sorted, use_atomic); } /* static */ std::unique_ptr HloInstruction::CreateDomain( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 3119b52e377..f28184e7e78 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -801,7 +801,7 @@ class HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -1629,6 +1629,11 @@ class HloInstruction { LOG(FATAL) << "Unimplemented method."; } + // Returns the use_atomic field. + virtual bool use_atomic() const { + LOG(FATAL) << "Unimplemented method."; + } + // Returns data on the dimension numbers used for a convolution operation, // which may be a kConvolution instruction or a kCustomCall that implements a // convolution. 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 183967941bf..e54eb81ed1f 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -2493,9 +2493,10 @@ HloScatterInstruction::HloScatterInstruction( const Shape& shape, HloInstruction* operand, HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, - const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted) + const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted, + bool use_atomic) : HloInstruction(HloOpcode::kScatter, shape), - indices_are_sorted_(indices_are_sorted) { + indices_are_sorted_(indices_are_sorted), use_atomic_(use_atomic) { AppendOperand(operand); AppendOperand(scatter_indices); AppendOperand(updates); @@ -2550,6 +2551,7 @@ HloInstructionProto HloScatterInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); proto.set_indices_are_sorted(indices_are_sorted()); + proto.set_use_atomic(use_atomic()); return proto; } @@ -2560,6 +2562,9 @@ std::vector HloScatterInstruction::ExtraAttributesToStringImpl( if (indices_are_sorted()) { attrs.push_back("indices_are_sorted=true"); } + if (!use_atomic()) { + attrs.push_back("use_atomic=false"); + } return attrs; } @@ -2572,7 +2577,8 @@ bool HloScatterInstruction::IdenticalSlowPath( scatter_dimension_numbers(), casted_other.scatter_dimension_numbers()) && eq_computations(to_apply(), casted_other.to_apply()) && - indices_are_sorted() == casted_other.indices_are_sorted(); + indices_are_sorted() == casted_other.indices_are_sorted() && + use_atomic() == casted_other.use_atomic(); } std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( @@ -2581,7 +2587,7 @@ std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 3); return absl::make_unique( shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), - scatter_dimension_numbers(), indices_are_sorted()); + scatter_dimension_numbers(), indices_are_sorted(), use_atomic()); } HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 0de050108b7..cec090172ee 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1453,7 +1453,7 @@ class HloScatterInstruction : public HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted); + bool indices_are_sorted, bool use_atomic); const ScatterDimensionNumbers& scatter_dimension_numbers() const { CHECK(scatter_dimension_numbers_ != nullptr); return *scatter_dimension_numbers_; @@ -1462,6 +1462,7 @@ class HloScatterInstruction : public HloInstruction { void set_indices_are_sorted(bool indices_are_sorted) { indices_are_sorted_ = indices_are_sorted; } + bool use_atomic() const override { return use_atomic_; } // Returns a serialized representation of this instruction. 
HloInstructionProto ToProto() const override; @@ -1489,6 +1490,7 @@ class HloScatterInstruction : public HloInstruction { std::unique_ptr scatter_dimension_numbers_; bool indices_are_sorted_; + bool use_atomic_; }; class HloIotaInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index c96bfb15187..b5eb0694275 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -1726,6 +1726,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional indices_are_sorted = false; attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, &indices_are_sorted}; + optional use_atomic = true; + attrs["use_atomic"] = {/*required=*/false, AttrTy::kBool, + &use_atomic}; if (!ParseOperands(&operands, /*expected_size=*/3) || !ParseAttributes(attrs)) { @@ -1742,7 +1745,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateScatter( shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], /*updates=*/operands[2], *update_computation, dim_numbers, - indices_are_sorted.value())); + indices_are_sorted.value(), use_atomic.value())); break; } case HloOpcode::kDomain: { From 7d22d959e0bc3dc126ea0a431ebcb7690d48ac9d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 11 Jul 2019 08:39:40 -0700 Subject: [PATCH 2/6] Add a test for Scatter without atomics. --- tensorflow/compiler/xla/tests/scatter_test.cc | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc index 05073de9c90..1b8f58f858a 100644 --- a/tensorflow/compiler/xla/tests/scatter_test.cc +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -225,6 +225,36 @@ ENTRY main { RunTest(hlo_text, &operand, &scatter_indices, &updates); } +XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_NoAtomic) { + const string hlo_text = R"( +HloModule TensorFlowScatter_Add + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + use_atomic=false +} +)"; + Literal operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); + Literal updates = LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, &operand, &scatter_indices, &updates); +} + XLA_TEST_F(ScatterTest, TensorFlowScatter_Mul) { const string hlo_text = R"( HloModule TensorFlowScatter_Mul From 5fea6eecde8dde2e122ad29217c3fff33e89ded2 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 12 Jul 2019 06:20:56 -0700 Subject: [PATCH 3/6] Update comment --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index a00b80b07e8..8308e97ec99 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -228,8 
+228,10 @@ class IrEmitterUnnested : public IrEmitter, // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is // expected to have the operand values in it already. If use_atomic // is true, we will use an atomic update. Using false for use_atomic - // is safe only when it is guaranteed that there is no duplicate + // is safe only when it is guaranteed that there are no duplicate // indices. + // When using use_atomi=false, it is the caller responsibility to + // ensure there is overlap. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, From c77c165dcd169ce8a02e2f86a0612b859a205000 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 06:36:32 -0700 Subject: [PATCH 4/6] Add documentation and tests --- .../compiler/xla/g3doc/operation_semantics.md | 7 +++++++ .../xla/service/hlo_instruction_test.cc | 3 ++- .../compiler/xla/service/hlo_parser_test.cc | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 1f2790e98bb..bb6db42e168 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1382,6 +1382,10 @@ For a more intuitive description, see the "Informal Description" section below. | `indices_are_sorted` | `bool` | Whether the indices are | : : : guaranteed to be sorted by : : : : the caller. : +| `use_atomic` | `bool` | Whether to use atomic | +: : : operation for the update. To : +: : : use only when the the caller : +: : : guarante no duplicate indices : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1450,6 +1454,9 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. +If `use_atomic` is set to false then XLA will not use atomic operation. This +is only safe when there is no duplicate indices. 
+ ### Informal Description and Examples Informally, every index `Out` in the output array corresponds to an element `E` diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 0a50ed04af7..d3ef5ec83c8 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -1529,7 +1529,8 @@ TEST_F(HloInstructionTest, StringifyScatter) { /*inserted_window_dims=*/{}, /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), - /*indices_are_sorted=*/false)); + /*indices_are_sorted=*/false, + /*use_atomic=*/true)); module->AddEntryComputation(builder.Build()); EXPECT_EQ( diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index c913784cd13..0f06af69ff5 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -934,6 +934,25 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, indices_are_sorted=true, to_apply=%add_F32.v3 } +)" +}, +{ +"AtomicScatter", +R"(HloModule StringifyAtomicScatter + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7,5], updates: f32[10,9,8,7,30,29,28,27,26]) -> f32[50,49,48,47,46] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, use_atomic=false, to_apply=%add_F32.v3 +} + )" }, { From 2a99d7e898a5a96e258a4c323c1166da6ed4b0a9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 10:01:42 -0700 Subject: [PATCH 5/6] change the new parameter name. 
--- tensorflow/compiler/xla/client/xla_builder.cc | 8 ++++---- tensorflow/compiler/xla/client/xla_builder.h | 6 +++--- .../compiler/xla/g3doc/operation_semantics.md | 13 +++++++------ tensorflow/compiler/xla/python/xla_client.py | 4 ++-- .../xla/service/gpu/ir_emitter_unnested.cc | 8 ++++---- .../compiler/xla/service/gpu/ir_emitter_unnested.h | 10 +++++----- tensorflow/compiler/xla/service/hlo.proto | 7 +++---- tensorflow/compiler/xla/service/hlo_instruction.cc | 6 +++--- tensorflow/compiler/xla/service/hlo_instruction.h | 6 +++--- .../compiler/xla/service/hlo_instruction_test.cc | 2 +- .../compiler/xla/service/hlo_instructions.cc | 14 +++++++------- tensorflow/compiler/xla/service/hlo_instructions.h | 6 +++--- tensorflow/compiler/xla/service/hlo_parser.cc | 8 ++++---- tensorflow/compiler/xla/service/hlo_parser_test.cc | 6 +++--- tensorflow/compiler/xla/tests/scatter_test.cc | 4 ++-- 15 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 148a6f65c30..d0c80084473 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1734,12 +1734,12 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; instr.set_indices_are_sorted(indices_are_sorted); - instr.set_use_atomic(use_atomic); + instr.set_unique_indices(unique_indices); TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, @@ -3380,10 +3380,10 @@ XlaOp Gather(const XlaOp input, const XlaOp start_indices, XlaOp Scatter(const XlaOp input, const XlaOp scatter_indices, const XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return input.builder()->Scatter(input, scatter_indices, updates, update_computation, dimension_numbers, - indices_are_sorted, use_atomic); + indices_are_sorted, unique_indices); } void Send(const XlaOp operand, const ChannelHandle& handle) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 30553183531..187cd261833 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -592,7 +592,7 @@ class XlaBuilder { XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false, bool use_atomic = true); + bool indices_are_sorted = false, bool unique_indices = false); void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, @@ -1010,7 +1010,7 @@ class XlaBuilder { friend XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, bool unique_indices); friend void Send(XlaOp operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& 
shape, const ChannelHandle& handle); @@ -1869,7 +1869,7 @@ XlaOp Gather(XlaOp input, XlaOp start_indices, XlaOp Scatter(XlaOp input, XlaOp scatter_indices, XlaOp updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers, - bool indices_are_sorted = false, bool use_atomic = true); + bool indices_are_sorted = false, bool unique_indices = false); // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index bb6db42e168..355e5fec73a 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1382,10 +1382,9 @@ For a more intuitive description, see the "Informal Description" section below. | `indices_are_sorted` | `bool` | Whether the indices are | : : : guaranteed to be sorted by : : : : the caller. : -| `use_atomic` | `bool` | Whether to use atomic | -: : : operation for the update. To : -: : : use only when the the caller : -: : : guarante no duplicate indices : +| `unique_indices` | `bool` | Whether the indices are | +: : : guaranteed to be unique by : +: : : the caller : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1454,8 +1453,10 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. -If `use_atomic` is set to false then XLA will not use atomic operation. This -is only safe when there is no duplicate indices. +If `unique_indices` is set to true then XLA can assume that all +element scattered to are unique. So XLA could use non-atomic +operation. If they are not, then the semantics is implementation +defined. 
### Informal Description and Examples diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index b2014c08951..7abd2f7429d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1545,11 +1545,11 @@ class ComputationBuilder(object): update_computation, dimension_numbers, indices_are_sorted=False, - use_atomic=True): + unique_indices=False): """Enqueues a Scatter operation onto the computation.""" return ops.Scatter(a, scatter_indices, updates, update_computation.computation, dimension_numbers, - indices_are_sorted, use_atomic) + indices_are_sorted, unique_indices) def Fft(self, operand, fft_type, fft_lengths): """Enqueues a FFT operation onto the computation.""" diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 423b27cc2cd..e9cf69df882 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -404,7 +404,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { scatter_fused_emitter.GetGenerator(root->operand(1)), /*updates_gen=*/ scatter_fused_emitter.GetGenerator(root->operand(2)), - root->use_atomic())); + root->unique_indices())); } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), fusion)); @@ -842,7 +842,7 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) { return GetIrArray(*updates, *scatter) .EmitReadArrayElement(index, &b_, "update"); }, - scatter->use_atomic())); + scatter->unique_indices())); // Elide the sequential thunk if there's no copy. if (thunks.size() == 1) { @@ -859,7 +859,7 @@ Status IrEmitterUnnested::EmitScatter( Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, - bool use_atomic) { + bool unique_indices) { const HloInstruction* operand = scatter->operand(0); const HloInstruction* scatter_indices = scatter->operand(1); const HloInstruction* updates = scatter->operand(2); @@ -969,7 +969,7 @@ Status IrEmitterUnnested::EmitScatter( TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index)); Store(input_ir_value, input_address); - if (scatter->use_atomic()) { + if (!scatter->unique_indices()) { return EmitAtomicOperationForNestedComputation( *scatter->to_apply(), output_address, input_address); } else { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 8308e97ec99..9c69545cb9f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -226,16 +226,16 @@ class IrEmitterUnnested : public IrEmitter, // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is - // expected to have the operand values in it already. If use_atomic - // is true, we will use an atomic update. Using false for use_atomic + // expected to have the operand values in it already. If unique_indices + // is false, we will use an atomic update. Using true for unique_indices // is safe only when it is guaranteed that there are no duplicate // indices. - // When using use_atomi=false, it is the caller responsibility to - // ensure there is overlap.
+ // When using unique_indices=true, it is the caller responsibility to + // ensure there is no overlap. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen, - bool use_atomic); + bool unique_indices); // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel // for the hlo instruction. diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 84abe7ec0ab..286562d0226 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -238,10 +238,9 @@ message HloInstructionProto { // Frontend attributes to pass to the XLA backend. xla.FrontendAttributes frontend_attributes = 68; - // Specifies if the scatter should use atomic operation or not. If - // there is duplicate index, then it should be true to compute the - // right answer. - bool use_atomic = 69; + // Specifies if all elements updated are guaranteed to be unique by + // the caller. + bool unique_indices = 69; } // Serialization of HloComputation. diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index ccd9041873a..52330af56a8 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -565,7 +565,7 @@ StatusOr> HloInstruction::CreateFromProto( proto.scatter_dimension_numbers()); instruction = CreateScatter(shape, operands(0), operands(1), operands(2), computations(0), *scatter_dimension_numbers, - proto.indices_are_sorted(), proto.use_atomic()); + proto.indices_are_sorted(), proto.unique_indices()); break; } case HloOpcode::kIota: @@ -1393,10 +1393,10 @@ bool HloInstruction::HasSideEffect() const { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic) { + bool indices_are_sorted, bool unique_indices) { return absl::make_unique( shape, operand, scatter_indices, updates, update_computation, - scatter_dim_numbers, indices_are_sorted, use_atomic); + scatter_dim_numbers, indices_are_sorted, unique_indices); } /* static */ std::unique_ptr HloInstruction::CreateDomain( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index f28184e7e78..23267ecab5e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -801,7 +801,7 @@ class HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, bool unique_indices); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -1629,8 +1629,8 @@ class HloInstruction { LOG(FATAL) << "Unimplemented method."; } - // Returns the use_atomic field. - virtual bool use_atomic() const { + // Returns the unique_indices field. 
+ virtual bool unique_indices() const { LOG(FATAL) << "Unimplemented method."; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index d3ef5ec83c8..71dde2764fb 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -1530,7 +1530,7 @@ TEST_F(HloInstructionTest, StringifyScatter) { /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), /*indices_are_sorted=*/false, - /*use_atomic=*/true)); + /*unique_indices=*/false)); module->AddEntryComputation(builder.Build()); EXPECT_EQ( diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index e54eb81ed1f..3c5ecbe06e2 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -2494,9 +2494,9 @@ HloScatterInstruction::HloScatterInstruction( HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, bool indices_are_sorted, - bool use_atomic) + bool unique_indices) : HloInstruction(HloOpcode::kScatter, shape), - indices_are_sorted_(indices_are_sorted), use_atomic_(use_atomic) { + indices_are_sorted_(indices_are_sorted), unique_indices_(unique_indices) { AppendOperand(operand); AppendOperand(scatter_indices); AppendOperand(updates); @@ -2551,7 +2551,7 @@ HloInstructionProto HloScatterInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); proto.set_indices_are_sorted(indices_are_sorted()); - proto.set_use_atomic(use_atomic()); + proto.set_unique_indices(unique_indices()); return proto; } @@ -2562,8 +2562,8 @@ std::vector HloScatterInstruction::ExtraAttributesToStringImpl( if (indices_are_sorted()) { attrs.push_back("indices_are_sorted=true"); } - if (!use_atomic()) { - attrs.push_back("use_atomic=false"); + if (unique_indices()) { + attrs.push_back("unique_indices=true"); } return attrs; } @@ -2578,7 +2578,7 @@ bool HloScatterInstruction::IdenticalSlowPath( casted_other.scatter_dimension_numbers()) && eq_computations(to_apply(), casted_other.to_apply()) && indices_are_sorted() == casted_other.indices_are_sorted() && - use_atomic() == casted_other.use_atomic(); + unique_indices() == casted_other.unique_indices(); } std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( @@ -2587,7 +2587,7 @@ std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( CHECK_EQ(new_operands.size(), 3); return absl::make_unique( shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), - scatter_dimension_numbers(), indices_are_sorted(), use_atomic()); + scatter_dimension_numbers(), indices_are_sorted(), unique_indices()); } HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index cec090172ee..59f2392866b 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1453,7 +1453,7 @@ class HloScatterInstruction : public HloInstruction { HloInstruction* scatter_indices, HloInstruction* updates, HloComputation* update_computation, const ScatterDimensionNumbers& scatter_dim_numbers, - bool indices_are_sorted, bool use_atomic); + bool indices_are_sorted, 
bool unique_indices); const ScatterDimensionNumbers& scatter_dimension_numbers() const { CHECK(scatter_dimension_numbers_ != nullptr); return *scatter_dimension_numbers_; @@ -1462,7 +1462,7 @@ class HloScatterInstruction : public HloInstruction { void set_indices_are_sorted(bool indices_are_sorted) { indices_are_sorted_ = indices_are_sorted; } - bool use_atomic() const override { return use_atomic_; } + bool unique_indices() const override { return unique_indices_; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1490,7 +1490,7 @@ class HloScatterInstruction : public HloInstruction { std::unique_ptr scatter_dimension_numbers_; bool indices_are_sorted_; - bool use_atomic_; + bool unique_indices_; }; class HloIotaInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index b5eb0694275..6716c4f1744 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -1726,9 +1726,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, optional indices_are_sorted = false; attrs["indices_are_sorted"] = {/*required=*/false, AttrTy::kBool, &indices_are_sorted}; - optional use_atomic = true; - attrs["use_atomic"] = {/*required=*/false, AttrTy::kBool, - &use_atomic}; + optional unique_indices = false; + attrs["unique_indices"] = {/*required=*/false, AttrTy::kBool, + &unique_indices}; if (!ParseOperands(&operands, /*expected_size=*/3) || !ParseAttributes(attrs)) { @@ -1745,7 +1745,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateScatter( shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], /*updates=*/operands[2], *update_computation, dim_numbers, - indices_are_sorted.value(), use_atomic.value())); + indices_are_sorted.value(), unique_indices.value())); break; } case HloOpcode::kDomain: { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 0f06af69ff5..a2c8c61bee5 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -937,8 +937,8 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 )" }, { -"AtomicScatter", -R"(HloModule StringifyAtomicScatter +"UniqueIndicesScatter", +R"(HloModule StringifyUniqueIndicesScatter %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { %lhs = f32[] parameter(0) @@ -950,7 +950,7 @@ ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7 %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) - ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, use_atomic=false, to_apply=%add_F32.v3 + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, 
scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, unique_indices=true, to_apply=%add_F32.v3 } )" diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc index 1b8f58f858a..0fdd176e8ef 100644 --- a/tensorflow/compiler/xla/tests/scatter_test.cc +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -225,7 +225,7 @@ ENTRY main { RunTest(hlo_text, &operand, &scatter_indices, &updates); } -XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_NoAtomic) { +XLA_TEST_F(ScatterTest, TensorFlowScatter_Add_UniqueIndices) { const string hlo_text = R"( HloModule TensorFlowScatter_Add @@ -245,7 +245,7 @@ ENTRY main { inserted_window_dims={0}, scatter_dims_to_operand_dims={0}, index_vector_dim=1, - use_atomic=false + unique_indices=true } )"; Literal operand = From c08122ad6a7993054bfbc6557d99f2833a916f4a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 23 Aug 2019 12:47:41 -0700 Subject: [PATCH 6/6] Fix doc typo and add a test. --- .../compiler/xla/g3doc/operation_semantics.md | 5 ++-- .../xla/service/gpu/ir_emitter_unnested.h | 2 +- .../xla/service/gpu/tests/gpu_atomic_test.cc | 27 +++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 355e5fec73a..a1772a0d1e1 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1384,7 +1384,7 @@ For a more intuitive description, see the "Informal Description" section below. : : : the caller. : | `unique_indices` | `bool` | Whether the indices are | : : : guaranteed to be unique by : -: : : the caller : +: : : the caller. : For convenience, we label dimensions in the output array not in `offset_dims` as `batch_dims`. @@ -1454,7 +1455,8 @@ If `indices_are_sorted` is set to true then XLA can assume that `start_indices` are sorted (in ascending `start_index_map` order) by the user. If they are not then the semantics is implementation defined. If `unique_indices` is set to true then XLA can assume that all element scattered to are unique. So XLA could use non-atomic -operation. If they are not, then the semantics is implementation +operations. If `unique_indices` is set to true and the indices being +scattered to are not unique then the semantics is implementation defined. ### Informal Description and Examples diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 9c69545cb9f..0f52aecda23 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -230,7 +230,7 @@ class IrEmitterUnnested : public IrEmitter, // is false, we will use an atomic update. Using true for unique_indices // is safe only when it is guaranteed that there are no duplicate // indices. - // When using unique_indices=true, it is the caller responsibility to + // When using unique_indices=true, it is the caller's responsibility to // ensure there is no overlap.
Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc index 6b18c4c6371..a54c0e5ae44 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc @@ -53,6 +53,33 @@ CHECK: store atomic{{.*}}unordered, align 4 )"); } +TEST_F(GpuAtomicTest, TestStoreNoAtomic) { + const char* hlo_string = R"( + HloModule TensorFlowScatterV1 + + update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) + } + + ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, unique_indices=true + } +)"; + + CompileAndVerifyIr(hlo_string, R"( +CHECK-NOT: store atomic{{.*}}unordered, align 4 +)"); +} + } // namespace } // namespace gpu } // namespace xla
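For reference, the sketch below shows how a client could opt into the non-atomic path through the final API introduced by this series. It is an editorial illustration rather than part of the patches: the free functions Parameter, Add and Scatter, the ScatterDimensionNumbers fields, and the unique_indices parameter follow the signatures visible in the diffs above, while the helper function name, the shapes, and the use of ConsumeValueOrDie are assumptions made for the example (the dimension numbers mirror the TensorFlowScatter_Add_UniqueIndices test).

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

namespace xla {

// Editorial sketch: scatter-add two rows into a 3x3 operand while promising
// the row indices are unique, so the GPU backend may emit non-atomic updates.
XlaOp BuildScatterWithUniqueIndices(XlaBuilder* builder) {
  // Update computation: add the incoming update to the existing element.
  XlaBuilder add_builder("add_s32");
  XlaOp lhs = Parameter(&add_builder, 0, ShapeUtil::MakeShape(S32, {}), "lhs");
  XlaOp rhs = Parameter(&add_builder, 1, ShapeUtil::MakeShape(S32, {}), "rhs");
  Add(lhs, rhs);
  XlaComputation add_computation = add_builder.Build().ConsumeValueOrDie();

  XlaOp operand =
      Parameter(builder, 0, ShapeUtil::MakeShape(S32, {3, 3}), "operand");
  XlaOp indices =
      Parameter(builder, 1, ShapeUtil::MakeShape(S32, {2}), "indices");
  XlaOp updates =
      Parameter(builder, 2, ShapeUtil::MakeShape(S32, {2, 3}), "updates");

  // Same dimension numbers as the new scatter test in this series.
  ScatterDimensionNumbers dnums;
  dnums.add_update_window_dims(1);
  dnums.add_inserted_window_dims(0);
  dnums.add_scatter_dims_to_operand_dims(0);
  dnums.set_index_vector_dim(1);

  // The caller guarantees the indices are unique; if they are not, the
  // result is implementation defined, as the operation semantics above state.
  return Scatter(operand, indices, updates, add_computation, dnums,
                 /*indices_are_sorted=*/false, /*unique_indices=*/true);
}

}  // namespace xla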