diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index a52057ff2cd..e39ee46f9cb 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -35,6 +35,7 @@ cc_library(
     srcs = ["cpu_transfer_manager.cc"],
     hdrs = ["cpu_transfer_manager.h"],
     deps = [
+        ":cpu_runtime",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -45,7 +46,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
-        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
@@ -1013,3 +1013,19 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
+
+tf_cc_test(
+    name = "vectorized_reduce_with_no_vector_registers_test",
+    size = "small",
+    srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"],
+    deps = [
+        ":cpu_compiler",
+        ":cpu_transfer_manager",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@llvm//:core",
+        "@llvm//:support",
+        "@llvm//:target",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ceaeacbea2a..f0d7461e5e7 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1739,6 +1739,16 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     return false;
   }
 
+  int vector_register_size_in_elements =
+      target_machine_features_.vector_register_byte_size(
+          *compute_function_->function()) /
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
+  if (vector_register_size_in_elements == 0) {
+    // Either we don't know the vector register width for the target or the
+    // vector register is smaller than the size of the primitive type.
+    return false;
+  }
+
   int vectorization_factor_in_bytes =
       target_machine_features_.vectorization_factor_in_bytes();
 
diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
new file mode 100644
index 00000000000..2918c886f08
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+// Fixture for checking that the XLA CPU backend can ahead-of-time compile a
+// reduction for a target that reports no vector registers.
+class CodegenReduceOnArchWithNoVectorRegisters : public HloTestBase {};
+
+// Returns the vector register size, in bytes, that
+// cpu::LLVMTargetMachineFeatures reports for `triple` (default CPU, no extra
+// features). Returns an error if LLVM cannot look up the target or cannot
+// create a target machine for it.
+StatusOr<unsigned> GetTargetVectorRegisterByteSize(std::string triple) {
+  // Unfortunately we need a lot of boilerplate to get to an
+  // llvm::TargetMachine.
+
+  std::string error;
+  const llvm::Target* target =
+      llvm::TargetRegistry::lookupTarget(triple, error);
+  if (target == nullptr) {
+    return InternalError("TargetRegistry::lookupTarget failed: %s", error);
+  }
+
+  // vector_register_byte_size takes a function argument, so build a throwaway
+  // `void test()` function to pass to it.
+  llvm::LLVMContext context;
+  std::unique_ptr<llvm::Function> function =
+      absl::WrapUnique(llvm::Function::Create(
+          llvm::FunctionType::get(llvm::Type::getVoidTy(context), {}),
+          llvm::GlobalValue::ExternalLinkage, "test"));
+
+  std::unique_ptr<llvm::TargetMachine> target_machine =
+      absl::WrapUnique(target->createTargetMachine(
+          /*TT=*/triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{},
+          /*RM=*/llvm::None));
+  if (target_machine == nullptr) {
+    // createTargetMachine returns null when no TargetMachine implementation is
+    // registered for the target; report that instead of dereferencing null.
+    return InternalError("createTargetMachine failed for triple %s", triple);
+  }
+  cpu::LLVMTargetMachineFeatures target_machine_features(target_machine.get());
+  return target_machine_features.vector_register_byte_size(*function);
+}
+
+TEST_F(CodegenReduceOnArchWithNoVectorRegisters, Test) {
+  absl::string_view text = R"(
+HloModule Reduce
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY main {
+  input = f32[1000,1000] parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[1000] reduce(input, constant), dimensions={0}, to_apply=add
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(text));
+  cpu::CpuCompiler cpu_compiler;
+  auto module_group = absl::make_unique<HloModuleGroup>("group");
+  module_group->push_back(std::move(hlo_module));
+
+  // Check that the GetTargetVectorRegisterByteSize is itself working.
+  TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64,
+                          GetTargetVectorRegisterByteSize("x86_64-pc-linux"));
+  ASSERT_EQ(vector_register_byte_size_for_x86_64, 16);
+
+  std::string triple = "i686-none-android";
+
+  TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size,
+                          GetTargetVectorRegisterByteSize(triple));
+
+  // This test is supposed to check whether the XLA CPU vectorized reduction
+  // codegen works correctly for architectures that do not have vector
+  // registers. So first ASSERT that `triple` is actually a target with no
+  // vector registers, as otherwise the test isn't actually testing anything
+  // interesting.
+
+  ASSERT_EQ(vector_register_byte_size, 0);
+
+  cpu::CpuAotCompilationOptions aot_compilation_options(
+      /*triple=*/triple, /*cpu_name=*/"", /*features=*/"",
+      /*entry_point_name=*/"main",
+      cpu::CpuAotCompilationOptions::RelocationModel::BigPic);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::unique_ptr<AotCompilationResult>> aot_compilation_result,
+      cpu_compiler.CompileAheadOfTime(std::move(module_group),
+                                      aot_compilation_options));
+  EXPECT_EQ(aot_compilation_result.size(), 1);
+}
+}  // namespace
+}  // namespace xla