diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 4b8d20f53e5..f7ad847cc93 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -1222,57 +1222,4 @@ bool IsWhileBody(const HloComputation* computation, } } // namespace - -/* static */ StatusOr CopyInsertion::AddCopiesForBufferAssignment( - HloModule* module) { - std::unique_ptr call_graph = CallGraph::Build(module); - TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow, - HloDataflowAnalysis::Run(*module)); - - bool changed = false; - - // If a buffer live out of a computation is a constant, a parameter, or not - // defined in the computation, then copy it to account for the limited - // computation-scoped analysis in buffer assignment. An exception to this rule - // is the while body which is handled properly without copies. - for (HloComputation* computation : module->computations()) { - if (computation == module->entry_computation() || - IsWhileBody(computation, *call_graph)) { - continue; - } - - HloInstruction* root = computation->root_instruction(); - ShapeTree indices_to_copy(root->shape(), /*init_value=*/false); - bool copy_root = false; - for (const auto& pair : dataflow->GetInstructionValueSet(root)) { - const ShapeIndex& index = pair.first; - const HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - HloInstruction* def = value->defining_instruction(); - if (def->parent() != computation || - def->opcode() == HloOpcode::kConstant || - def->opcode() == HloOpcode::kParameter) { - *indices_to_copy.mutable_element(index) = true; - copy_root = true; - } - } - } - if (copy_root) { - TF_ASSIGN_OR_RETURN( - HloInstruction * root_copy, - computation->DeepCopyInstruction(root, &indices_to_copy)); - computation->set_root_instruction(root_copy); - changed = true; - } - } - - TupleSimplifier tuple_simplifier; - HloDCE dce; - TF_ASSIGN_OR_RETURN(bool tuple_simplifier_changed, - tuple_simplifier.Run(module)); - TF_ASSIGN_OR_RETURN(bool dce_changed, dce.Run(module)); - - return changed || tuple_simplifier_changed || dce_changed; -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h index 988b93b557f..2438578d6ae 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.h +++ b/tensorflow/compiler/xla/service/copy_insertion.h @@ -60,17 +60,6 @@ class CopyInsertion : public HloModulePass { // (copies were inserted). StatusOr Run(HloModule* module) override; - // The CPU and GPU backend need additional copies added due to deficiencies in - // buffer assignment. Specifically, copies are needed for constants live-out - // of computations, and for values which are live-in and live-out of the same - // computation. These copies are needed because buffer-assignment uses a - // computation-scoped analyis (TuplePointsToAnalysis) and has limited - // visibility across computation boundaries. This method adds these necessary - // copies. Returns whether the module was modified. - // - // TODO(b/62548313): Remove this when buffer assignment is module-scoped. - static StatusOr AddCopiesForBufferAssignment(HloModule* module); - // Try to remove as many copies from the module as possible without // introducing live range interference. Only copy instructions that are // eligible for copy elision are considered for removal. diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 227d8ffb1a0..8862fa373f2 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -76,7 +76,6 @@ cc_library( ":compiler_functor", ":buffer_info_util", ":conv_canonicalization", - ":cpu_copy_insertion", ":cpu_executable", ":cpu_hlo_support_checker", ":cpu_instruction_fusion", @@ -92,6 +91,7 @@ cc_library( "@com_google_absl//absl/strings", ":target_machine_features", "@com_google_absl//absl/types:span", + "//tensorflow/compiler/xla/service:copy_insertion", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:map_inliner", @@ -956,18 +956,6 @@ cc_library( ], ) -cc_library( - name = "cpu_copy_insertion", - srcs = ["cpu_copy_insertion.cc"], - hdrs = ["cpu_copy_insertion.h"], - deps = [ - "//tensorflow/compiler/xla/service:copy_insertion", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:lib", - ], -) - cc_library( name = "vector_support_library", srcs = ["vector_support_library.cc"], @@ -986,26 +974,6 @@ cc_library( ], ) -tf_cc_test( - name = "cpu_copy_insertion_test", - srcs = ["cpu_copy_insertion_test.cc"], - deps = [ - ":cpu_copy_insertion", - "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_graph_dumper", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:test", - ], -) - cc_library( name = "cpu_hlo_support_checker", srcs = ["cpu_hlo_support_checker.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 3bccf0a2aec..af64fda0847 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -56,10 +56,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include "tensorflow/compiler/xla/service/conditional_to_select.h" #include "tensorflow/compiler/xla/service/convolution_group_converter.h" +#include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h" #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h" -#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h" #include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h" #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" @@ -402,7 +402,7 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn( // interfering with the rewrites. pipeline.AddPass(); pipeline.AddPass(); - pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); return pipeline.Run(module).status(); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc deleted file mode 100644 index baaacd2ecc9..00000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" - -#include -#include -#include - -#include "tensorflow/compiler/xla/service/copy_insertion.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/logging.h" - -namespace xla { - -StatusOr CpuCopyInsertion::Run(HloModule* module) { - CopyInsertion generic_copy_insertion; - - TF_ASSIGN_OR_RETURN(bool generic_changed, generic_copy_insertion.Run(module)); - - // The CPU backend needs additional copies added due to deficiencies in - // buffer assignment. - TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed, - CopyInsertion::AddCopiesForBufferAssignment(module)); - - return generic_changed || buffer_assignment_changed; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h deleted file mode 100644 index 076235f8874..00000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ - -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" - -namespace xla { - -// Besides the modifications made by the generic xla::CopyInsertion, this -// CPU-specific copy insertion pass also adds copies to values live out of -// computations satisfying certain conditions (defined by constant or parameter, -// etc). This is necessary because of deficiencies of buffer -// assignment. Specifically, buffer assignment is computation-scoped and does -// not recognized aliasing between arguments and outputs of computations. -// -// TODO(b/62548313): Remove this when buffer assignment is smarter -// (module-scoped). -class CpuCopyInsertion : public HloModulePass { - public: - absl::string_view name() const override { return "copy-insertion"; } - - StatusOr Run(HloModule* module) override; -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc deleted file mode 100644 index c085f85fb73..00000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" - -#include "tensorflow/compiler/xla/debug_options_flags.h" -#include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test_benchmark.h" - -namespace xla { -namespace { - -namespace op = xla::testing::opcode_matchers; - -int64 CountCopies(const HloComputation& computation) { - int64 count = 0; - for (const auto& instruction : computation.instructions()) { - if (instruction->opcode() == HloOpcode::kCopy) { - count++; - } - } - return count; -} - -int64 CountCopies(const HloModule& module) { - int64 count = 0; - for (const auto& computation : module.computations()) { - count += CountCopies(*computation); - } - return count; -} - -class CpuCopyInsertionTest : public HloTestBase { - protected: - void InsertCopies(HloModule* module) { - CpuCopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(module).status()); - } - - const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); -}; - -TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) { - // Test a while body and condition which are each simply a constant (root of - // computation is a constant). Each constant should be copied. - auto module = CreateNewVerifiedModule(); - auto builder = HloComputation::Builder(TestName()); - auto param_0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); - - auto body_builder = HloComputation::Builder("body"); - body_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - body_builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0))); - HloComputation* body = module->AddEmbeddedComputation(body_builder.Build()); - - auto cond_builder = HloComputation::Builder("condition"); - cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - cond_builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); - HloComputation* condition = - module->AddEmbeddedComputation(cond_builder.Build()); - - auto xla_while = builder.AddInstruction( - HloInstruction::CreateWhile(scalar_shape_, condition, body, param_0)); - - module->AddEntryComputation(builder.Build()); - - InsertCopies(module.get()); - - EXPECT_EQ(CountCopies(*module), 3); - - EXPECT_THAT(xla_while->operand(0), op::Copy(op::Parameter())); - EXPECT_THAT(body->root_instruction(), op::Copy(op::Constant())); - EXPECT_THAT(condition->root_instruction(), op::Copy(op::Constant())); -} - -TEST_F(CpuCopyInsertionTest, TupleCall) { - // Test a kCall instruction which calls a computation which produces a three - // element tuple: one is a constant, one is a parameter, and one is produced - // in the computation. The constant and parameter should be copied. - auto module = CreateNewVerifiedModule(); - auto builder = HloComputation::Builder(TestName()); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param_0")); - const Shape tuple_shape = - ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_, scalar_shape_}); - - auto sub_builder = HloComputation::Builder("subcomputation"); - auto sub_param = sub_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - auto constant = sub_builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0))); - auto add = sub_builder.AddInstruction(HloInstruction::CreateBinary( - scalar_shape_, HloOpcode::kAdd, sub_param, constant)); - sub_builder.AddInstruction( - HloInstruction::CreateTuple({sub_param, constant, add})); - HloComputation* subcomputation = - module->AddEmbeddedComputation(sub_builder.Build()); - - builder.AddInstruction( - HloInstruction::CreateCall(tuple_shape, {param}, subcomputation)); - - module->AddEntryComputation(builder.Build()); - - InsertCopies(module.get()); - - EXPECT_EQ(CountCopies(*subcomputation), 2); - EXPECT_THAT(subcomputation->root_instruction(), - op::Tuple(op::Copy(op::Parameter()), op::Copy(op::Constant()), - op::Add())); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc index e2ab00ce41c..6853e64e9f6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc @@ -54,12 +54,7 @@ StatusOr GpuCopyInsertion::Run(HloModule* module) { } } - // The GPU backend needs additional copies added due to deficiencies in - // buffer assignment. - TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed, - CopyInsertion::AddCopiesForBufferAssignment(module)); - - return changed || buffer_assignment_changed; + return changed; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 6e414bd7a4d..f1e555064c7 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -51,7 +51,8 @@ void HloToIrBindings::EmitBasePointersForHlos( absl::flat_hash_set already_bound_for_this_function; auto arg_iter = function->arg_begin(); for (const HloInstruction* io_hlo : io_hlos) { - CHECK(!absl::c_count(non_io_hlos, io_hlo)) + CHECK(io_hlo == io_hlo->parent()->root_instruction() || + !absl::c_count(non_io_hlos, io_hlo)) << "IO HLOs and non-IO HLOs should be disjoint"; if (!already_bound_for_this_function.contains(io_hlo)) { if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index fe568f65df8..f90e2716869 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -177,13 +177,6 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( llvm::Value* source_address) { CHECK_EQ(2, computation.num_parameters()); - if (computation.instruction_count() != 3) { - // We special-case only computations with one computing instruction for now. - // Such computation has exactly three instructions given it has two - // parameters. - return false; - } - HloOpcode root_opcode = computation.root_instruction()->opcode(); PrimitiveType element_type = computation.root_instruction()->shape().element_type(); @@ -191,12 +184,11 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( element_type == S64 || element_type == U64; llvm::Value* source = Load(source_address, "source"); - // kCopy of RHS -> atomic store. - if (root_opcode == HloOpcode::kCopy && + // Just passing along RHS -> atomic store. + if (computation.instruction_count() == 2 && + root_opcode == HloOpcode::kParameter && (element_type == F32 || is_atomic_integral) && - computation.root_instruction()->operand(0)->opcode() == - HloOpcode::kParameter && - computation.root_instruction()->operand(0)->parameter_number() == 1) { + computation.root_instruction()->parameter_number() == 1) { llvm::StoreInst* store = Store(source, output_address); store->setAtomic(llvm::AtomicOrdering::Unordered); // Derive a minimum alignment from the type. The optimizer can increase it @@ -205,6 +197,13 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( return true; } + if (computation.instruction_count() != 3) { + // We special-case only computations with one computing instruction for now. + // Such computation has exactly three instructions given it has two + // parameters. + return false; + } + if (root_opcode == HloOpcode::kAdd) { // NVPTX supports atomicAdd on F32 and integer types. if (element_type == F32) {