Avoid dividing by zero in the vectorized reduce emitter

Before this we'd divide by zero if vector register size in elements is 0.  Bail
out of the vectorized reduction emission instead.

PiperOrigin-RevId: 261012234
This commit is contained in:
Sanjoy Das 2019-07-31 15:36:34 -07:00 committed by TensorFlower Gardener
parent 689bb37fa3
commit e2ac3e26c2
3 changed files with 133 additions and 1 deletions

View File

@ -35,6 +35,7 @@ cc_library(
srcs = ["cpu_transfer_manager.cc"],
hdrs = ["cpu_transfer_manager.h"],
deps = [
":cpu_runtime",
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
@ -45,7 +46,6 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:generic_transfer_manager",
"//tensorflow/compiler/xla/service:transfer_manager",
"//tensorflow/compiler/xla/service/cpu:cpu_runtime",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor",
@ -1013,3 +1013,19 @@ tf_cc_test(
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
)
# Regression test: vectorized reduce codegen must bail out gracefully
# (rather than divide by zero) on targets whose vector register width is
# reported as 0 bytes.
tf_cc_test(
name = "vectorized_reduce_with_no_vector_registers_test",
size = "small",
srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"],
deps = [
":cpu_compiler",
":cpu_transfer_manager",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"@llvm//:core",
"@llvm//:support",
"@llvm//:target",
],
)

View File

@ -1739,6 +1739,16 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
return false;
}
int vector_register_size_in_elements =
target_machine_features_.vector_register_byte_size(
*compute_function_->function()) /
ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
if (vector_register_size_in_elements == 0) {
// Either we don't know the vector register width for the target or the
// vector register is smaller than the size of the primitive type.
return false;
}
int vectorization_factor_in_bytes =
target_machine_features_.vectorization_factor_in_bytes();

View File

@ -0,0 +1,106 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
namespace xla {
namespace {
class CodegenReduceOnArchWithNoVectorRegisters : public HloTestBase {};
// Returns the vector register width in bytes that XLA:CPU's
// LLVMTargetMachineFeatures reports for the given target `triple`, or an
// internal error if LLVM does not recognize the triple.
StatusOr<unsigned> GetTargetVectorRegisterByteSize(std::string triple) {
  // A fair amount of boilerplate is required just to materialize an
  // llvm::TargetMachine for an arbitrary triple.
  std::string lookup_error;
  const llvm::Target* llvm_target =
      llvm::TargetRegistry::lookupTarget(triple, lookup_error);
  if (llvm_target == nullptr) {
    return InternalError("TargetRegistry::lookupTarget failed: %s",
                         lookup_error);
  }

  // vector_register_byte_size() is keyed off an llvm::Function, so create a
  // trivial void() placeholder to query with.
  llvm::LLVMContext llvm_context;
  llvm::FunctionType* void_fn_type =
      llvm::FunctionType::get(llvm::Type::getVoidTy(llvm_context), {});
  std::unique_ptr<llvm::Function> placeholder_fn = absl::WrapUnique(
      llvm::Function::Create(void_fn_type, llvm::GlobalValue::ExternalLinkage,
                             "test"));

  std::unique_ptr<llvm::TargetMachine> machine =
      absl::WrapUnique(llvm_target->createTargetMachine(
          /*TT=*/triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{},
          /*RM=*/llvm::None));
  cpu::LLVMTargetMachineFeatures machine_features(machine.get());
  return machine_features.vector_register_byte_size(*placeholder_fn);
}
// End-to-end regression test: ahead-of-time compiling an HLO reduce for a
// target with zero-width vector registers must succeed instead of dividing
// by zero inside the vectorized reduce emitter.
TEST_F(CodegenReduceOnArchWithNoVectorRegisters, Test) {
// A simple f32 column reduction with an add computation.
absl::string_view text = R"(
HloModule Reduce
add {
lhs = f32[] parameter(0)
rhs = f32[] parameter(1)
ROOT add = f32[] add(lhs, rhs)
}
ENTRY main {
input = f32[1000,1000] parameter(0)
constant = f32[] constant(0)
ROOT reduce = f32[1000] reduce(input, constant), dimensions={0}, to_apply=add
}
)";
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
ParseAndReturnVerifiedModule(text));
cpu::CpuCompiler cpu_compiler;
auto module_group = absl::make_unique<HloModuleGroup>("group");
module_group->push_back(std::move(hlo_module));
// Check that the GetTargetVectorRegisterByteSize is itself working.
// x86_64 is expected to report 16 bytes (128-bit vector registers).
TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64,
GetTargetVectorRegisterByteSize("x86_64-pc-linux"));
ASSERT_EQ(vector_register_byte_size_for_x86_64, 16);
std::string triple = "i686-none-android";
TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size,
GetTargetVectorRegisterByteSize(triple));
// This test is supposed to check whether the XLA CPU vectorized reduction
// codegen works correctly for architectures that do not have vector
// registers. So first ASSERT that `triple` is actually a target with no
// vector registers, as otherwise the test isn't actually testing anything
// interesting.
ASSERT_EQ(vector_register_byte_size, 0);
// AOT-compile for the vector-register-less target; before the fix this
// crashed with a division by zero in EmitVectorizedReduce.
cpu::CpuAotCompilationOptions aot_compilation_options(
/*triple=*/triple, /*cpu_name=*/"", /*features=*/"",
/*entry_point_name=*/"main",
cpu::CpuAotCompilationOptions::RelocationModel::BigPic);
TF_ASSERT_OK_AND_ASSIGN(
std::vector<std::unique_ptr<AotCompilationResult>> aot_compilation_result,
cpu_compiler.CompileAheadOfTime(std::move(module_group),
aot_compilation_options));
EXPECT_EQ(aot_compilation_result.size(), 1);
}
} // namespace
} // namespace xla