Add int64 axis support for reduction ops. (#13891)
* Add int64 axis support for reduction ops.

  This is a follow-up to PR 13863. PR 13863 fixed the crash that occurred when an int64 axis was passed to reduction ops such as reduce_sum and reduce_max, but it did not add real int64 handling; it only prevented the crash. This change adds support for an int64 axis in the reduction ops.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add int64 axis support for mean, prod, and sum.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add int64 axis support for min and max.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add int64 axis support for reduce_all and reduce_any.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases for int64 axis support of reduce_any and reduce_all.

  Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
parent 17096081ee
commit 1c1dad105a
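The user-visible effect is that a reduction op now accepts its axis ("reduction_indices", attribute Tidx) as either an int32 or an int64 tensor. A minimal usage sketch, mirroring the testAxesType cases added in the diff below (assumes a TensorFlow 1.x graph-mode session; the variable names are illustrative only):

    import tensorflow as tf

    for axis_dtype in (tf.int32, tf.int64):
      # Both index dtypes are accepted for the axis after this change.
      axis = tf.constant(0, dtype=axis_dtype)
      total = tf.reduce_sum([1, 2, 3], axis)
      largest = tf.reduce_max([1, 2, 3], axis)
      with tf.Session() as sess:
        print(sess.run([total, largest]))  # -> [6, 3] for both axis dtypes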
@@ -22,7 +22,13 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, Eigen::internal::AndReducer>);
+    ReductionOp<CPUDevice, bool, int32, Eigen::internal::AndReducer>);
+REGISTER_KERNEL_BUILDER(
+    Name("All")
+        .TypeConstraint<int64>("Tidx")
+        .Device(DEVICE_CPU)
+        .HostMemory("reduction_indices"),
+    ReductionOp<CPUDevice, bool, int64, Eigen::internal::AndReducer>);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
@@ -30,7 +36,13 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("Tidx")
         .Device(DEVICE_GPU)
        .HostMemory("reduction_indices"),
-    ReductionOp<GPUDevice, bool, Eigen::internal::AndReducer>);
+    ReductionOp<GPUDevice, bool, int32, Eigen::internal::AndReducer>);
+REGISTER_KERNEL_BUILDER(
+    Name("All")
+        .TypeConstraint<int64>("Tidx")
+        .Device(DEVICE_GPU)
+        .HostMemory("reduction_indices"),
+    ReductionOp<GPUDevice, bool, int64, Eigen::internal::AndReducer>);
 #endif
 
 } // namespace tensorflow
@@ -22,7 +22,13 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, Eigen::internal::OrReducer>);
+    ReductionOp<CPUDevice, bool, int32, Eigen::internal::OrReducer>);
+REGISTER_KERNEL_BUILDER(
+    Name("Any")
+        .TypeConstraint<int64>("Tidx")
+        .Device(DEVICE_CPU)
+        .HostMemory("reduction_indices"),
+    ReductionOp<CPUDevice, bool, int64, Eigen::internal::OrReducer>);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
@@ -30,7 +36,13 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("Tidx")
         .Device(DEVICE_GPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<GPUDevice, bool, Eigen::internal::OrReducer>);
+    ReductionOp<GPUDevice, bool, int32, Eigen::internal::OrReducer>);
+REGISTER_KERNEL_BUILDER(
+    Name("Any")
+        .TypeConstraint<int64>("Tidx")
+        .Device(DEVICE_GPU)
+        .HostMemory("reduction_indices"),
+    ReductionOp<GPUDevice, bool, int64, Eigen::internal::OrReducer>);
 #endif
 
 } // namespace tensorflow
@@ -57,13 +57,12 @@ gtl::InlinedVector<int32, 8> ReductionHelper::permutation() {
   return perm;
 }
 
-Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis,
-                                 const bool keep_dims) {
-  // bitmap[i] indicates whether to reduce data along i-th axis.
-  gtl::InlinedVector<bool, 4> bitmap(data.dims(), false);
-  auto axis_vec = axis.flat<int32>();
+template <typename Tperm>
+Status SimplifyHelper(const Tensor& data, const Tensor& axis,
+                      gtl::InlinedVector<bool, 4>& bitmap) {
+  auto axis_vec = axis.flat<Tperm>();
   for (int64 i = 0; i < axis.NumElements(); ++i) {
-    int32 index = axis_vec(i);
+    Tperm index = axis_vec(i);
     if (index < -data.dims() || index >= data.dims()) {
       return errors::InvalidArgument("Invalid reduction dimension (", index,
                                      " for input with ", data.dims(),
@@ -72,7 +71,18 @@ Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis,
     index = (index + data.dims()) % data.dims();
     bitmap[index] = true;
   }
+  return Status::OK();
+}
+
+Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis,
+                                 const bool keep_dims) {
+  // bitmap[i] indicates whether to reduce data along i-th axis.
+  gtl::InlinedVector<bool, 4> bitmap(data.dims(), false);
+  if (axis.dtype() == DT_INT32) {
+    TF_RETURN_IF_ERROR(SimplifyHelper<int32>(data, axis, bitmap));
+  } else {
+    TF_RETURN_IF_ERROR(SimplifyHelper<int64>(data, axis, bitmap));
+  }
   // Output tensor's dim sizes.
   out_shape_.clear();
   for (int i = 0; i < data.dims(); ++i) {
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -68,9 +69,11 @@ struct ConstantsBase {
   const Eigen::IndexList<Eigen::type2index<1>> kOne;
   const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo;
 };
-template<> struct Constants<CPUDevice> : ConstantsBase{};
+template <>
+struct Constants<CPUDevice> : ConstantsBase {};
 #ifdef TENSORFLOW_USE_SYCL
-template<> struct Constants<SYCLDevice> : ConstantsBase{};
+template <>
+struct Constants<SYCLDevice> : ConstantsBase {};
 #endif // TENSORFLOW_USE_SYCL
 #endif // EIGEN_HAS_INDEX_LIST
 
@@ -131,12 +134,13 @@ class ReductionHelper {
 
 // For operations where the output is a reduction function along some
 // dimensions of the input.
-template <typename Device, class T, typename Reducer>
+template <typename Device, class T, typename Tperm, typename Reducer>
 class ReductionOp : public OpKernel {
  public:
   explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     const DataType dt = DataTypeToEnum<T>::v();
-    OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
+    const DataType pt = DataTypeToEnum<Tperm>::v();
+    OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, pt}, {dt}));
 
     OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
   }
@@ -266,8 +270,7 @@ struct ReduceFunctorBase {
   }
 
   template <typename OUT_T>
-  static void FillIdentity(const Device& d, OUT_T out,
-                           const Reducer& reducer) {
+  static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer) {
     FillIdentityEigenImpl(d, out, reducer);
   }
 };
@@ -23,7 +23,13 @@ namespace tensorflow {
           .Device(DEVICE_CPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx"), \
-      ReductionOp<CPUDevice, type, Eigen::internal::MaxReducer<type>>);
+      ReductionOp<CPUDevice, type, int32, Eigen::internal::MaxReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Max") \
+          .Device(DEVICE_CPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx"), \
+      ReductionOp<CPUDevice, type, int64, Eigen::internal::MaxReducer<type>>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
@@ -36,7 +42,14 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<GPUDevice, type, Eigen::internal::MaxReducer<type>>);
+      ReductionOp<GPUDevice, type, int32, Eigen::internal::MaxReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Max") \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<GPUDevice, type, int64, Eigen::internal::MaxReducer<type>>);
 REGISTER_GPU_KERNELS(float);
 REGISTER_GPU_KERNELS(double);
 REGISTER_GPU_KERNELS(int64);
@@ -52,7 +65,16 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("output")
         .TypeConstraint<int32>("T")
         .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::MaxReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MaxReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Max")
+        .Device(DEVICE_GPU)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MaxReducer<int32>>);
 
 #undef REGISTER_GPU_KERNELS
 
@@ -60,13 +82,20 @@ REGISTER_KERNEL_BUILDER(
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Max") \
+  REGISTER_KERNEL_BUILDER(Name("Max") \
           .Device(DEVICE_SYCL) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
+      ReductionOp<SYCLDevice, type, int32, \
+          Eigen::internal::MaxReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Max") \
+          .Device(DEVICE_SYCL) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, int64, \
+          Eigen::internal::MaxReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
 
@@ -78,7 +107,16 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("output")
         .TypeConstraint<int32>("T")
         .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::MaxReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MaxReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Max")
+        .Device(DEVICE_SYCL)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MaxReducer<int32>>);
 #undef REGISTER_SYCL_KERNELS
 #endif // TENSORFLOW_USE_SYCL
 
@@ -18,25 +18,38 @@ limitations under the License.
 namespace tensorflow {
 
 #define REGISTER_CPU_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Mean") \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
           .Device(DEVICE_CPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx"), \
-      ReductionOp<CPUDevice, type, Eigen::internal::MeanReducer<type>>);
+      ReductionOp<CPUDevice, type, int32, \
+          Eigen::internal::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
+          .Device(DEVICE_CPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx"), \
+      ReductionOp<CPUDevice, type, int64, \
+          Eigen::internal::MeanReducer<type>>);
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
 
 #define REGISTER_GPU_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Mean") \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
           .Device(DEVICE_GPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<GPUDevice, type, Eigen::internal::MeanReducer<type>>);
+      ReductionOp<GPUDevice, type, int32, \
+          Eigen::internal::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<GPUDevice, type, int64, \
+          Eigen::internal::MeanReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
@@ -46,13 +59,20 @@ TF_CALL_complex128(REGISTER_GPU_KERNELS);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Mean") \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
           .Device(DEVICE_SYCL) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<SYCLDevice, type, Eigen::internal::MeanReducer<type>>);
+      ReductionOp<SYCLDevice, type, int32, \
+          Eigen::internal::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Mean") \
+          .Device(DEVICE_SYCL) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, int64, \
+          Eigen::internal::MeanReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
 #undef REGISTER_SYCL_KERNELS
@@ -23,7 +23,13 @@ namespace tensorflow {
           .Device(DEVICE_CPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx"), \
-      ReductionOp<CPUDevice, type, Eigen::internal::MinReducer<type>>);
+      ReductionOp<CPUDevice, type, int32, Eigen::internal::MinReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Min") \
+          .Device(DEVICE_CPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx"), \
+      ReductionOp<CPUDevice, type, int64, Eigen::internal::MinReducer<type>>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
@@ -36,7 +42,14 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<GPUDevice, type, Eigen::internal::MinReducer<type>>);
+      ReductionOp<GPUDevice, type, int32, Eigen::internal::MinReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Min") \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<GPUDevice, type, int64, Eigen::internal::MinReducer<type>>);
 REGISTER_GPU_KERNELS(float);
 REGISTER_GPU_KERNELS(double);
 
@@ -51,7 +64,16 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("output")
         .TypeConstraint<int32>("T")
         .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::MinReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MinReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Min")
+        .Device(DEVICE_GPU)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MinReducer<int32>>);
 
 #undef REGISTER_GPU_KERNELS
 
@@ -59,13 +81,20 @@ REGISTER_KERNEL_BUILDER(
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Min") \
+  REGISTER_KERNEL_BUILDER(Name("Min") \
           .Device(DEVICE_SYCL) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<SYCLDevice, type, Eigen::internal::MinReducer<type>>);
+      ReductionOp<SYCLDevice, type, int32, \
+          Eigen::internal::MinReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Min") \
+          .Device(DEVICE_SYCL) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, int64, \
+          Eigen::internal::MinReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
 
@@ -77,7 +106,16 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("output")
         .TypeConstraint<int32>("T")
         .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::MinReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MinReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Min")
+        .Device(DEVICE_SYCL)
+        .HostMemory("reduction_indices")
+        .HostMemory("input")
+        .HostMemory("output")
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MinReducer<int32>>);
 #undef REGISTER_SYCL_KERNELS
 #endif // TENSORFLOW_USE_SYCL
 
@@ -18,25 +18,38 @@ limitations under the License.
 namespace tensorflow {
 
 #define REGISTER_CPU_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Prod") \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
           .Device(DEVICE_CPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx"), \
-      ReductionOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>);
+      ReductionOp<CPUDevice, type, int32, \
+          Eigen::internal::ProdReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
+          .Device(DEVICE_CPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx"), \
+      ReductionOp<CPUDevice, type, int64, \
+          Eigen::internal::ProdReducer<type>>);
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
 
 #define REGISTER_GPU_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Prod") \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
           .Device(DEVICE_GPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>);
+      ReductionOp<GPUDevice, type, int32, \
+          Eigen::internal::ProdReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<GPUDevice, type, int64, \
+          Eigen::internal::ProdReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_int32(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
@@ -47,13 +60,20 @@ TF_CALL_complex128(REGISTER_GPU_KERNELS);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Prod") \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
           .Device(DEVICE_SYCL) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<SYCLDevice, type, Eigen::internal::ProdReducer<type>>);
+      ReductionOp<SYCLDevice, type, int32, \
+          Eigen::internal::ProdReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Prod") \
+          .Device(DEVICE_SYCL) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, int64, \
+          Eigen::internal::ProdReducer<type>>);
 REGISTER_SYCL_KERNELS(int32);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
@@ -23,7 +23,13 @@ namespace tensorflow {
           .Device(DEVICE_CPU) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx"), \
-      ReductionOp<CPUDevice, type, Eigen::internal::SumReducer<type>>);
+      ReductionOp<CPUDevice, type, int32, Eigen::internal::SumReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Sum") \
+          .Device(DEVICE_CPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx"), \
+      ReductionOp<CPUDevice, type, int64, Eigen::internal::SumReducer<type>>);
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
@@ -36,7 +42,14 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<GPUDevice, type, Eigen::internal::SumReducer<type>>);
+      ReductionOp<GPUDevice, type, int32, Eigen::internal::SumReducer<type>>); \
+  REGISTER_KERNEL_BUILDER( \
+      Name("Sum") \
+          .Device(DEVICE_GPU) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<GPUDevice, type, int64, Eigen::internal::SumReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
@@ -53,19 +66,35 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("input")
         .HostMemory("output")
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::SumReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::SumReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Sum")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx")
+        .HostMemory("input")
+        .HostMemory("output")
+        .HostMemory("reduction_indices"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::SumReducer<int32>>);
 
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(type) \
-  REGISTER_KERNEL_BUILDER( \
-      Name("Sum") \
+  REGISTER_KERNEL_BUILDER(Name("Sum") \
           .Device(DEVICE_SYCL) \
           .TypeConstraint<type>("T") \
           .TypeConstraint<int32>("Tidx") \
           .HostMemory("reduction_indices"), \
-      ReductionOp<SYCLDevice, type, Eigen::internal::SumReducer<type>>);
+      ReductionOp<SYCLDevice, type, int32, \
+          Eigen::internal::SumReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("Sum") \
+          .Device(DEVICE_SYCL) \
+          .TypeConstraint<type>("T") \
+          .TypeConstraint<int64>("Tidx") \
+          .HostMemory("reduction_indices"), \
+      ReductionOp<SYCLDevice, type, int64, \
+          Eigen::internal::SumReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
 
@@ -77,7 +106,16 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("input")
         .HostMemory("output")
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, int32, Eigen::internal::SumReducer<int32>>);
+    ReductionOp<CPUDevice, int32, int32, Eigen::internal::SumReducer<int32>>);
+REGISTER_KERNEL_BUILDER(
+    Name("Sum")
+        .Device(DEVICE_SYCL)
+        .TypeConstraint<int32>("T")
+        .TypeConstraint<int64>("Tidx")
+        .HostMemory("input")
+        .HostMemory("output")
+        .HostMemory("reduction_indices"),
+    ReductionOp<CPUDevice, int32, int64, Eigen::internal::SumReducer<int32>>);
 #undef REGISTER_SYCL_KERNELS
 #endif // TENSORFLOW_USE_SYCL
 
@@ -163,6 +163,13 @@ class SumReductionTest(BaseReductionTest):
       reduction_axes = tuple(reduction_axes)
     return np.sum(x, axis=reduction_axes, keepdims=keep_dims)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_sum([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, 0)
+
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -193,6 +200,7 @@ class SumReductionTest(BaseReductionTest):
       tf_out_mean = sess.run(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
+
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
@@ -369,6 +377,13 @@ class MeanReductionTest(BaseReductionTest):
       return np_sum // count
     return np_sum / count
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, 0)
+
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -435,6 +450,13 @@ class ProdReductionTest(BaseReductionTest):
      reduction_axes = tuple(reduction_axes)
     return np.prod(x, axis=reduction_axes, keepdims=keep_dims)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_prod([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, 0)
+
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -531,6 +553,13 @@ class MinReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True)
     self._compare(x, reduction_axes, True, use_gpu=False)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_min([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, 0)
+
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
      for special_value_x in [-np.inf, np.inf]:
@@ -637,6 +666,13 @@ class MaxReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True)
     self._compare(x, reduction_axes, True, use_gpu=False)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_max([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, 0)
+
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -757,6 +793,14 @@ class AllReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True)
     self._compare(x, reduction_axes, True, use_gpu=False)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_all([True, True],
+                                constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, True)
+
   def testAll3D(self):
     # Create a 3D array of bools and reduce across all possible
     # dimensions
@@ -798,6 +842,14 @@ class AnyReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True)
     self._compare(x, reduction_axes, True, use_gpu=False)
 
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.test_session(use_gpu=True) as sess:
+        v = math_ops.reduce_any([True, True],
+                                constant_op.constant(0, dtype=dtype))
+        tf_v = sess.run(v)
+        self.assertAllEqual(tf_v, True)
+
   def testAll3D(self):
     # Create a 3D array of bools and reduce across all possible
     # dimensions