diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index a52057ff2cd..e39ee46f9cb 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -35,6 +35,7 @@ cc_library(
     srcs = ["cpu_transfer_manager.cc"],
     hdrs = ["cpu_transfer_manager.h"],
     deps = [
+        ":cpu_runtime",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -45,7 +46,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
-        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
@@ -1013,3 +1013,19 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
+
+tf_cc_test(
+    name = "vectorized_reduce_with_no_vector_registers_test",  # AOT-compiles a reduce for a target that reports no vector registers.
+    size = "small",
+    srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"],
+    deps = [
+        ":cpu_compiler",
+        ":cpu_transfer_manager",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@llvm//:core",
+        "@llvm//:support",
+        "@llvm//:target",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ceaeacbea2a..f0d7461e5e7 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1739,6 +1739,16 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     return false;
   }
 
+  int vector_register_size_in_elements =
+      target_machine_features_.vector_register_byte_size(
+          *compute_function_->function()) /
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
+  if (vector_register_size_in_elements == 0) {
+    // Either we don't know the vector register width for the target or the
+    // vector register is smaller than the size of the primitive type.
+    return false;
+  }
+
   int vectorization_factor_in_bytes =
       target_machine_features_.vectorization_factor_in_bytes();
 
diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
new file mode 100644
index 00000000000..2918c886f08
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
@@ -0,0 +1,106 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+// Fixture for the test in this file; adds nothing on top of HloTestBase.
+class CodegenReduceOnArchWithNoVectorRegisters : public HloTestBase {};
+// Returns the vector register byte size XLA:CPU reports for `triple` with the
+// default CPU and features, or an error if LLVM cannot set up that target.
+StatusOr<unsigned> GetTargetVectorRegisterByteSize(std::string triple) {
+  std::string error;
+  const llvm::Target* target =
+      llvm::TargetRegistry::lookupTarget(triple, error);
+  if (target == nullptr) {
+    return InternalError("TargetRegistry::lookupTarget failed: %s", error);
+  }
+  llvm::LLVMContext context;
+  std::unique_ptr<llvm::Function> function =
+      absl::WrapUnique(llvm::Function::Create(
+          llvm::FunctionType::get(llvm::Type::getVoidTy(context), {}),
+          llvm::GlobalValue::ExternalLinkage, "test"));
+  std::unique_ptr<llvm::TargetMachine> target_machine =
+      absl::WrapUnique(target->createTargetMachine(
+          /*TT=*/triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{},
+          /*RM=*/llvm::None));
+  if (target_machine == nullptr) {
+    return InternalError("createTargetMachine failed for triple: %s", triple);
+  }
+  cpu::LLVMTargetMachineFeatures target_machine_features(target_machine.get());
+  return target_machine_features.vector_register_byte_size(*function);
+}
+
+// AOT-compiles a reduce for a target that reports no vector registers and
+// expects compilation to succeed with exactly one compilation result.
+TEST_F(CodegenReduceOnArchWithNoVectorRegisters, Test) {
+  absl::string_view text = R"(
+HloModule Reduce
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY main {
+  input = f32[1000,1000] parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[1000] reduce(input, constant), dimensions={0}, to_apply=add
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(text));
+  cpu::CpuCompiler cpu_compiler;
+  auto module_group = absl::make_unique<HloModuleGroup>("group");
+  module_group->push_back(std::move(hlo_module));
+
+  // Check that GetTargetVectorRegisterByteSize is itself working.
+  TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64,
+                          GetTargetVectorRegisterByteSize("x86_64-pc-linux"));
+  ASSERT_EQ(vector_register_byte_size_for_x86_64, 16u);
+
+  std::string triple = "i686-none-android";
+  TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size,
+                          GetTargetVectorRegisterByteSize(triple));
+
+  // This test is supposed to check whether the XLA CPU vectorized reduction
+  // codegen works correctly for architectures that do not have vector
+  // registers.  So first ASSERT that `triple` is actually a target with no
+  // vector registers, as otherwise the test isn't actually testing anything
+  // interesting.
+  ASSERT_EQ(vector_register_byte_size, 0u);
+
+  cpu::CpuAotCompilationOptions aot_compilation_options(
+      /*triple=*/triple, /*cpu_name=*/"", /*features=*/"",
+      /*entry_point_name=*/"main",
+      cpu::CpuAotCompilationOptions::RelocationModel::BigPic);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::unique_ptr<AotCompilationResult>> aot_compilation_result,
+      cpu_compiler.CompileAheadOfTime(std::move(module_group),
+                                      aot_compilation_options));
+  EXPECT_EQ(aot_compilation_result.size(), 1u);
+}
+}  // namespace
+}  // namespace xla