From 4e7ce793d996dfed173bd46e90f489df9464a540 Mon Sep 17 00:00:00 2001
From: Gaurav Jain
Date: Mon, 8 Jun 2020 00:34:46 -0700
Subject: [PATCH] Merge various kernel registrations with macros

We add the TF_CALL_COMPLEX_TYPES macro and update related kernel
registrations to use the more compact macros rather than individual
dtype listings. This should be a no-op and should give better
visibility into the dtype coverage of many of our kernels.

PiperOrigin-RevId: 315224662
Change-Id: I14aad07711a407fa632a94d891238a48ae89bcab
---
 tensorflow/core/framework/register_types.h | 21 ++--
 tensorflow/core/kernels/aggregate_ops.cc | 5 +-
 .../core/kernels/aggregate_ops_gpu.cu.cc | 5 +-
 .../core/kernels/batch_matmul_op_complex.cc | 6 +-
 tensorflow/core/kernels/cholesky_op.cc | 3 +-
 tensorflow/core/kernels/concat_lib.h | 5 +-
 tensorflow/core/kernels/concat_lib_gpu.cc | 5 +-
 tensorflow/core/kernels/concat_lib_gpu.h | 5 +-
 .../core/kernels/concat_lib_gpu_impl.cu.cc | 28 ++---
 tensorflow/core/kernels/concat_op.cc | 7 +-
 tensorflow/core/kernels/diag_op.cc | 12 +-
 .../kernels/dynamic_partition_op_gpu.cu.cc | 3 +-
 tensorflow/core/kernels/dynamic_stitch_op.cc | 15 ++-
 .../core/kernels/dynamic_stitch_op_gpu.cu.cc | 7 +-
 tensorflow/core/kernels/einsum_op_gpu.cu.cc | 7 +-
 tensorflow/core/kernels/gather_functor.cc | 3 +-
 .../core/kernels/gather_functor_batched.cc | 3 +-
 .../kernels/gather_functor_batched_gpu.cu.cc | 5 +-
 .../core/kernels/gather_functor_gpu.cu.cc | 5 +-
 tensorflow/core/kernels/gather_nd_op.cc | 6 +-
 .../core/kernels/gather_nd_op_gpu.cu.cc | 3 +-
 tensorflow/core/kernels/gather_op.cc | 5 +-
 tensorflow/core/kernels/list_kernels.cu.cc | 7 +-
 tensorflow/core/kernels/matmul_op.cc | 15 +--
 .../core/kernels/matrix_band_part_op.cc | 10 +-
 .../kernels/matrix_band_part_op_gpu.cu.cc | 5 +-
 tensorflow/core/kernels/matrix_diag_op.cc | 10 +-
 .../core/kernels/matrix_diag_op_gpu.cu.cc | 5 +-
 tensorflow/core/kernels/matrix_set_diag_op.cc | 10 +-
 .../core/kernels/matrix_set_diag_op_gpu.cu.cc | 5 +-
 .../matrix_triangular_solve_op_complex.cc | 7 +-
 .../core/kernels/mkl_batch_matmul_op.cc | 6 +-
 tensorflow/core/kernels/one_hot_op.cc | 10 +-
 tensorflow/core/kernels/one_hot_op_gpu.cu.cc | 5 +-
 tensorflow/core/kernels/pack_op.cc | 5 +-
 .../core/kernels/reduction_ops_euclidean.cc | 3 +-
 tensorflow/core/kernels/reduction_ops_mean.cc | 3 +-
 tensorflow/core/kernels/reduction_ops_prod.cc | 5 +-
 tensorflow/core/kernels/reduction_ops_sum.cc | 5 +-
 tensorflow/core/kernels/reverse_op.cc | 14 +--
 tensorflow/core/kernels/reverse_op_gpu.cu.cc | 7 +-
 tensorflow/core/kernels/roll_op.cc | 5 +-
 tensorflow/core/kernels/roll_op_gpu.cu.cc | 5 +-
 tensorflow/core/kernels/scatter_nd_op.cc | 9 +-
 .../core/kernels/scatter_nd_op_gpu.cu.cc | 3 +-
 .../kernels/segment_reduction_ops_gpu.cu.cc | 3 +-
 .../kernels/segment_reduction_ops_impl_3.cc | 3 +-
 .../kernels/segment_reduction_ops_impl_4.cc | 3 +-
 tensorflow/core/kernels/slice_op.cc | 12 +-
 tensorflow/core/kernels/slice_op_gpu.cu.cc | 9 +-
 tensorflow/core/kernels/split_lib_gpu.cu.cc | 14 +--
 tensorflow/core/kernels/split_lib_gpu.h | 5 +-
 tensorflow/core/kernels/split_op.cc | 5 +-
 tensorflow/core/kernels/split_v_op.cc | 5 +-
 tensorflow/core/kernels/strided_slice_op.cc | 5 +-
 .../strided_slice_op_gpu_complex.cu.cc | 3 +-
 .../core/kernels/strided_slice_op_impl.h | 12 +-
 tensorflow/core/kernels/tensor_array.cc | 6 +-
 tensorflow/core/kernels/tensor_array.h | 6 +-
 tensorflow/core/kernels/tensor_array_ops.cc | 37 +++---
 tensorflow/core/kernels/training_ops.cc | 105 +++++-------------
tensorflow/core/kernels/unpack_op.cc | 5 +- tensorflow/core/kernels/variable_ops.cc | 5 +- 63 files changed, 169 insertions(+), 397 deletions(-) diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index 47aab2efb61..bc3e5e1743b 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -179,13 +179,14 @@ limitations under the License. TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \ TF_CALL_int8(m) -// Call "m" for all number types, including complex64 and complex128. +#define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m) + +// Call "m" for all number types, including complex types #define TF_CALL_NUMBER_TYPES(m) \ - TF_CALL_REAL_NUMBER_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m) + TF_CALL_REAL_NUMBER_TYPES(m) TF_CALL_COMPLEX_TYPES(m) #define TF_CALL_NUMBER_TYPES_NO_INT32(m) \ - TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ - TF_CALL_complex64(m) TF_CALL_complex128(m) + TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) TF_CALL_COMPLEX_TYPES(m) #define TF_CALL_POD_TYPES(m) TF_CALL_NUMBER_TYPES(m) TF_CALL_bool(m) @@ -202,8 +203,7 @@ limitations under the License. // Call "m" on all types supported on GPU. #define TF_CALL_GPU_ALL_TYPES(m) \ - TF_CALL_GPU_NUMBER_TYPES(m) \ - TF_CALL_bool(m) TF_CALL_complex64(m) TF_CALL_complex128(m) + TF_CALL_GPU_NUMBER_TYPES(m) TF_CALL_COMPLEX_TYPES(m) TF_CALL_bool(m) #define TF_CALL_GPU_NUMBER_TYPES_NO_HALF(m) TF_CALL_float(m) TF_CALL_double(m) @@ -213,11 +213,10 @@ limitations under the License. TF_CALL_qint8(m) TF_CALL_quint8(m) TF_CALL_qint32(m) // Types used for save and restore ops. -#define TF_CALL_SAVE_RESTORE_TYPES(m) \ - TF_CALL_INTEGRAL_TYPES(m) \ - TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) TF_CALL_complex64(m) \ - TF_CALL_complex128(m) TF_CALL_bool(m) TF_CALL_tstring(m) \ - TF_CALL_QUANTIZED_TYPES(m) +#define TF_CALL_SAVE_RESTORE_TYPES(m) \ + TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \ + TF_CALL_COMPLEX_TYPES(m) \ + TF_CALL_QUANTIZED_TYPES(m) TF_CALL_bool(m) TF_CALL_tstring(m) #ifdef TENSORFLOW_SYCL_NO_DOUBLE #define TF_CALL_SYCL_double(m) diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc index 511a5f77a66..79062aee156 100644 --- a/tensorflow/core/kernels/aggregate_ops.cc +++ b/tensorflow/core/kernels/aggregate_ops.cc @@ -48,11 +48,10 @@ REGISTER_ADDN_CPU(Variant); #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) #define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU); TF_CALL_int64(REGISTER_ADDN_GPU); -TF_CALL_complex64(REGISTER_ADDN_GPU); -TF_CALL_complex128(REGISTER_ADDN_GPU); TF_CALL_variant(REGISTER_ADDN_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_ADDN_GPU); #undef REGISTER_ADDN_GPU // A special GPU kernel for int32. 
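
Note on the pattern above: TF_CALL_COMPLEX_TYPES(m) is an X-macro. It takes another macro m and invokes it once per complex dtype, so a single TF_CALL_COMPLEX_TYPES(REGISTER_ADDN_GPU) line now replaces the paired TF_CALL_complex64/TF_CALL_complex128 calls that used to appear in each kernel file. The standalone sketch below mirrors that expansion with simplified stand-ins; AddNKernel and REGISTER_ADDN are illustrative placeholders, not TensorFlow's real registration machinery (which goes through REGISTER_KERNEL_BUILDER).

#include <complex>
#include <iostream>
#include <typeinfo>

using complex64 = std::complex<float>;
using complex128 = std::complex<double>;

// Per-dtype helpers: each applies the caller-supplied macro "m" to one type.
#define TF_CALL_complex64(m) m(complex64)
#define TF_CALL_complex128(m) m(complex128)

// The macro added by this change: one call covers both complex dtypes.
#define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m)

// Stand-in "kernel" and "registration" macro, for illustration only.
template <typename T>
struct AddNKernel {
  static void Run() { std::cout << "AddN registered for " << typeid(T).name() << "\n"; }
};
#define REGISTER_ADDN(type) template struct AddNKernel<type>;

// Expands to:
//   template struct AddNKernel<complex64>; template struct AddNKernel<complex128>;
TF_CALL_COMPLEX_TYPES(REGISTER_ADDN)

int main() {
  AddNKernel<complex64>::Run();
  AddNKernel<complex128>::Run();
  return 0;
}

In the real kernels, m is something like REGISTER_ADDN_GPU or DEFINE_GPU_SPECS, and composed macros such as TF_CALL_NUMBER_TYPES and TF_CALL_GPU_ALL_TYPES are built the same way by chaining the per-dtype helpers.
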
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc index 85bdc2447cd..c73323a795a 100644 --- a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc @@ -154,10 +154,9 @@ struct Add9Functor { template struct functor::Add8pFunctor; \ template struct functor::Add9Functor; -TF_CALL_GPU_NUMBER_TYPES(REGISTER_FUNCTORS); TF_CALL_int64(REGISTER_FUNCTORS); -TF_CALL_complex64(REGISTER_FUNCTORS); -TF_CALL_complex128(REGISTER_FUNCTORS); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_FUNCTORS); +TF_CALL_COMPLEX_TYPES(REGISTER_FUNCTORS); #undef REGISTER_FUNCTORS diff --git a/tensorflow/core/kernels/batch_matmul_op_complex.cc b/tensorflow/core/kernels/batch_matmul_op_complex.cc index 2cf163be0d4..bc36b95d6a1 100644 --- a/tensorflow/core/kernels/batch_matmul_op_complex.cc +++ b/tensorflow/core/kernels/batch_matmul_op_complex.cc @@ -17,12 +17,10 @@ limitations under the License. namespace tensorflow { -TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU); -TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATMUL_CPU); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -TF_CALL_complex64(REGISTER_BATCH_MATMUL_GPU); -TF_CALL_complex128(REGISTER_BATCH_MATMUL_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATMUL_GPU); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc index 71e2604afce..ff8fd08f228 100644 --- a/tensorflow/core/kernels/cholesky_op.cc +++ b/tensorflow/core/kernels/cholesky_op.cc @@ -87,8 +87,7 @@ namespace functor { extern template struct MatrixBandPartFunctor; TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPEC); } // namespace functor template diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h index 1d6d219f7ba..35da7afe3f5 100644 --- a/tensorflow/core/kernels/concat_lib.h +++ b/tensorflow/core/kernels/concat_lib.h @@ -64,15 +64,12 @@ void ConcatGPU( inputs_flat, \ Tensor* output, typename TTypes::Tensor* output_flat); -TF_CALL_GPU_NUMBER_TYPES(REGISTER); -TF_CALL_complex64(REGISTER); -TF_CALL_complex128(REGISTER); TF_CALL_int32(REGISTER); // Needed for TensorLists. TF_CALL_int64(REGISTER); TF_CALL_int16(REGISTER); TF_CALL_bfloat16(REGISTER); -TF_CALL_bool(REGISTER); TF_CALL_uint8(REGISTER); +TF_CALL_GPU_ALL_TYPES(REGISTER); #undef REGISTER #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc index 154b1a31c3a..de029397847 100644 --- a/tensorflow/core/kernels/concat_lib_gpu.cc +++ b/tensorflow/core/kernels/concat_lib_gpu.cc @@ -98,15 +98,12 @@ void ConcatGPU( inputs_flat, \ Tensor* output, typename TTypes::Tensor* output_flat); -TF_CALL_GPU_NUMBER_TYPES(REGISTER); -TF_CALL_complex64(REGISTER); -TF_CALL_complex128(REGISTER); TF_CALL_int32(REGISTER); // Needed for TensorLists. 
TF_CALL_int64(REGISTER); TF_CALL_int16(REGISTER); TF_CALL_bfloat16(REGISTER); -TF_CALL_bool(REGISTER); TF_CALL_uint8(REGISTER); +TF_CALL_GPU_ALL_TYPES(REGISTER); #undef REGISTER diff --git a/tensorflow/core/kernels/concat_lib_gpu.h b/tensorflow/core/kernels/concat_lib_gpu.h index 2db66a7c5a8..a83b6b63d9b 100644 --- a/tensorflow/core/kernels/concat_lib_gpu.h +++ b/tensorflow/core/kernels/concat_lib_gpu.h @@ -66,15 +66,12 @@ void ConcatGPUImpl(const Eigen::GpuDevice& d, const GpuDeviceArrayStruct& ptr_offsets, bool fixed_size, \ int split_size, typename TTypes::Matrix* output); -TF_CALL_GPU_NUMBER_TYPES(REGISTER); -TF_CALL_complex64(REGISTER); -TF_CALL_complex128(REGISTER); TF_CALL_int32(REGISTER); // Needed for TensorLists. TF_CALL_int64(REGISTER); TF_CALL_int16(REGISTER); TF_CALL_bfloat16(REGISTER); -TF_CALL_bool(REGISTER); TF_CALL_uint8(REGISTER); +TF_CALL_GPU_ALL_TYPES(REGISTER); #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc index 8f5458c9b56..859aca2f932 100644 --- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc +++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc @@ -201,45 +201,33 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device, const GpuDeviceArrayStruct& ptr_offsets, bool fixed_size, \ int split_size, typename TTypes::Matrix* output); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32); -TF_CALL_complex64(REGISTER_GPUCONCAT32); -TF_CALL_complex128(REGISTER_GPUCONCAT32); TF_CALL_int32(REGISTER_GPUCONCAT32); // Needed for TensorLists. TF_CALL_int64(REGISTER_GPUCONCAT32); TF_CALL_int16(REGISTER_GPUCONCAT32); TF_CALL_uint8(REGISTER_GPUCONCAT32); -REGISTER_GPUCONCAT32(bfloat16); -REGISTER_GPUCONCAT32(bool); +TF_CALL_bfloat16(REGISTER_GPUCONCAT32); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPUCONCAT32); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64); -TF_CALL_complex64(REGISTER_GPUCONCAT64); -TF_CALL_complex128(REGISTER_GPUCONCAT64); TF_CALL_int32(REGISTER_GPUCONCAT64); // Needed for TensorLists. TF_CALL_int64(REGISTER_GPUCONCAT64); TF_CALL_int16(REGISTER_GPUCONCAT64); TF_CALL_uint8(REGISTER_GPUCONCAT64); -REGISTER_GPUCONCAT64(bfloat16); -REGISTER_GPUCONCAT64(bool); +TF_CALL_bfloat16(REGISTER_GPUCONCAT64); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPUCONCAT64); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32); -TF_CALL_complex64(REGISTER_GPU32); -TF_CALL_complex128(REGISTER_GPU32); TF_CALL_int32(REGISTER_GPU32); // Needed for TensorLists. TF_CALL_int64(REGISTER_GPU32); TF_CALL_int16(REGISTER_GPU32); TF_CALL_uint8(REGISTER_GPU32); -REGISTER_GPU32(bfloat16); -REGISTER_GPU32(bool); +TF_CALL_bfloat16(REGISTER_GPU32); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU32); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64); -TF_CALL_complex64(REGISTER_GPU64); -TF_CALL_complex128(REGISTER_GPU64); TF_CALL_int32(REGISTER_GPU64); // Needed for TensorLists. 
TF_CALL_int64(REGISTER_GPU64); TF_CALL_int16(REGISTER_GPU64); TF_CALL_uint8(REGISTER_GPU64); -REGISTER_GPU64(bfloat16); -REGISTER_GPU64(bool); +TF_CALL_bfloat16(REGISTER_GPU64); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU64); #undef REGISTER_GPUCONCAT32 #undef REGISTER_GPUCONCAT64 diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index 19ed267b441..be3e9a67c5f 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -227,13 +227,10 @@ REGISTER_CONCAT(uint64); .HostMemory("axis"), \ ConcatV2Op) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_bfloat16(REGISTER_GPU); TF_CALL_uint8(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); -REGISTER_GPU(bool); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU); #undef REGISTER_GPU // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc index 811d48af091..f34fc6c6be7 100644 --- a/tensorflow/core/kernels/diag_op.cc +++ b/tensorflow/core/kernels/diag_op.cc @@ -176,8 +176,7 @@ TF_CALL_double(REGISTER_DIAGOP); TF_CALL_float(REGISTER_DIAGOP); TF_CALL_int32(REGISTER_DIAGOP); TF_CALL_int64(REGISTER_DIAGOP); -TF_CALL_complex64(REGISTER_DIAGOP); -TF_CALL_complex128(REGISTER_DIAGOP); +TF_CALL_COMPLEX_TYPES(REGISTER_DIAGOP); TF_CALL_half(REGISTER_DIAGOP); #undef REGISTER_DIAGOP @@ -190,8 +189,7 @@ TF_CALL_double(REGISTER_DIAGPARTOP); TF_CALL_float(REGISTER_DIAGPARTOP); TF_CALL_int32(REGISTER_DIAGPARTOP); TF_CALL_int64(REGISTER_DIAGPARTOP); -TF_CALL_complex64(REGISTER_DIAGPARTOP); -TF_CALL_complex128(REGISTER_DIAGPARTOP); +TF_CALL_COMPLEX_TYPES(REGISTER_DIAGPARTOP); TF_CALL_half(REGISTER_DIAGPARTOP); #undef REGISTER_DIAGPARTOP @@ -217,8 +215,7 @@ TF_CALL_double(REGISTER_DIAGOP_GPU); TF_CALL_float(REGISTER_DIAGOP_GPU); TF_CALL_int32(REGISTER_DIAGOP_GPU); TF_CALL_int64(REGISTER_DIAGOP_GPU); -TF_CALL_complex64(REGISTER_DIAGOP_GPU); -TF_CALL_complex128(REGISTER_DIAGOP_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_DIAGOP_GPU); TF_CALL_half(REGISTER_DIAGOP_GPU); #undef REGISTER_DIAGOP_GPU @@ -242,8 +239,7 @@ TF_CALL_double(REGISTER_DIAGPARTOP_GPU); TF_CALL_float(REGISTER_DIAGPARTOP_GPU); TF_CALL_int32(REGISTER_DIAGPARTOP_GPU); TF_CALL_int64(REGISTER_DIAGPARTOP_GPU); -TF_CALL_complex64(REGISTER_DIAGPARTOP_GPU); -TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_DIAGPARTOP_GPU); TF_CALL_half(REGISTER_DIAGPARTOP_GPU); #undef REGISTER_DIAGPARTOP_GPU diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 98c2fb57833..7b64d9e8484 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -467,8 +467,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { DynamicPartitionOpGPU) TF_CALL_GPU_NUMBER_TYPES(REGISTER_DYNAMIC_PARTITION_GPU); -TF_CALL_complex64(REGISTER_DYNAMIC_PARTITION_GPU); -TF_CALL_complex128(REGISTER_DYNAMIC_PARTITION_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_DYNAMIC_PARTITION_GPU); #undef REGISTER_DYNAMIC_PARTITION_GPU } // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc index 86f9c3e4621..5f6b0357f95 100644 --- a/tensorflow/core/kernels/dynamic_stitch_op.cc +++ b/tensorflow/core/kernels/dynamic_stitch_op.cc @@ -147,11 +147,11 @@ void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device, const int32 first_dim_size, 
\ const GpuDeviceArrayStruct& input_indices, \ const GpuDeviceArrayStruct& input_ptrs, T* output); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -TF_CALL_int64(REGISTER_GPU); + TF_CALL_int32(REGISTER_GPU); +TF_CALL_int64(REGISTER_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU template @@ -357,11 +357,10 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_DYNAMIC_STITCH); .HostMemory("merged"), \ ParallelDynamicStitchOpCPU) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_DYNAMIC_STITCH_GPU); -TF_CALL_complex64(REGISTER_DYNAMIC_STITCH_GPU); -TF_CALL_complex128(REGISTER_DYNAMIC_STITCH_GPU); -TF_CALL_int64(REGISTER_DYNAMIC_STITCH_GPU); TF_CALL_int32(REGISTER_DYNAMIC_STITCH_GPU); +TF_CALL_int64(REGISTER_DYNAMIC_STITCH_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_DYNAMIC_STITCH_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_DYNAMIC_STITCH_GPU); #undef REGISTER_DYNAMIC_STITCH_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc index 22fed448c1f..c0a3df38b5d 100644 --- a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc @@ -70,11 +70,10 @@ void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device, const GpuDeviceArrayStruct& input_indices, \ const GpuDeviceArrayStruct& input_ptrs, T* output); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); +TF_CALL_int32(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); -TF_CALL_int32(REGISTER_GPU) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU diff --git a/tensorflow/core/kernels/einsum_op_gpu.cu.cc b/tensorflow/core/kernels/einsum_op_gpu.cu.cc index 36a97691297..2935b7fd02a 100644 --- a/tensorflow/core/kernels/einsum_op_gpu.cu.cc +++ b/tensorflow/core/kernels/einsum_op_gpu.cu.cc @@ -33,11 +33,8 @@ namespace tensorflow { DECLARE_GPU_SPECS_NDIM(T, 5); \ DECLARE_GPU_SPECS_NDIM(T, 6); -TF_CALL_half(DECLARE_GPU_SPECS); -TF_CALL_float(DECLARE_GPU_SPECS); -TF_CALL_double(DECLARE_GPU_SPECS); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS_NDIM #undef DECLARE_GPU_SPECS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index e4f2182be3c..a0293951660 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -39,8 +39,7 @@ namespace functor { TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS_INDEX diff --git a/tensorflow/core/kernels/gather_functor_batched.cc b/tensorflow/core/kernels/gather_functor_batched.cc index 0960b3a2472..d1ef076260b 100644 --- a/tensorflow/core/kernels/gather_functor_batched.cc +++ b/tensorflow/core/kernels/gather_functor_batched.cc @@ -39,8 +39,7 @@ namespace functor { TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS_INDEX diff --git 
a/tensorflow/core/kernels/gather_functor_batched_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_batched_gpu.cu.cc index f118d8dc72b..40b9894776d 100644 --- a/tensorflow/core/kernels/gather_functor_batched_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_batched_gpu.cu.cc @@ -31,12 +31,9 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_bool(DEFINE_GPU_SPECS); TF_CALL_int32(DEFINE_GPU_SPECS); TF_CALL_int64(DEFINE_GPU_SPECS); -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); -TF_CALL_complex64(DEFINE_GPU_SPECS); -TF_CALL_complex128(DEFINE_GPU_SPECS); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPECS); #undef DEFINE_GPU_SPECS #undef DEFINE_GPU_SPECS_INDEX diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index c548abb8bde..39402ebacec 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,12 +31,9 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_bool(DEFINE_GPU_SPECS); TF_CALL_int32(DEFINE_GPU_SPECS); TF_CALL_int64(DEFINE_GPU_SPECS); -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); -TF_CALL_complex64(DEFINE_GPU_SPECS); -TF_CALL_complex128(DEFINE_GPU_SPECS); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPECS); #undef DEFINE_GPU_SPECS #undef DEFINE_GPU_SPECS_INDEX diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 517f78ff232..ad759489dc6 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -105,8 +105,7 @@ namespace functor { TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS_INDEX @@ -118,8 +117,7 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); TF_CALL_int32(REGISTER_GATHER_ND_GPU); TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); -TF_CALL_complex64(REGISTER_GATHER_ND_GPU); -TF_CALL_complex128(REGISTER_GATHER_ND_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GATHER_ND_GPU); #undef REGISTER_GATHER_ND_GPU diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index 95ead6b0fd3..216ca2de114 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -121,8 +121,7 @@ struct GatherNdSlice { TF_CALL_int32(DEFINE_GPU_SPECS); TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); -TF_CALL_complex64(DEFINE_GPU_SPECS); -TF_CALL_complex128(DEFINE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DEFINE_GPU_SPECS); #undef DEFINE_GPU_SPECS #undef DEFINE_GPU_SPECS_INDEX diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 5e6bd1de9d6..6d493a5f2ea 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -221,12 +221,9 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. 
#define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) -TF_CALL_bool(REGISTER_GATHER_GPU); TF_CALL_int32(REGISTER_GATHER_GPU); TF_CALL_int64(REGISTER_GATHER_GPU); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); -TF_CALL_complex64(REGISTER_GATHER_GPU); -TF_CALL_complex128(REGISTER_GATHER_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_GATHER_GPU); #undef REGISTER_GATHER_GPU diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc index 1a1447ef1e6..b95a065edb8 100644 --- a/tensorflow/core/kernels/list_kernels.cu.cc +++ b/tensorflow/core/kernels/list_kernels.cu.cc @@ -105,13 +105,10 @@ typedef Eigen::GpuDevice GPUDevice; .HostMemory("lengths"), \ TensorListSplit) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_OPS_GPU); -REGISTER_TENSOR_LIST_OPS_GPU(bfloat16); -TF_CALL_complex64(REGISTER_TENSOR_LIST_OPS_GPU); -TF_CALL_complex128(REGISTER_TENSOR_LIST_OPS_GPU); TF_CALL_int32(REGISTER_TENSOR_LIST_OPS_GPU); TF_CALL_int64(REGISTER_TENSOR_LIST_OPS_GPU); -REGISTER_TENSOR_LIST_OPS_GPU(bool); +TF_CALL_bfloat16(REGISTER_TENSOR_LIST_OPS_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_TENSOR_LIST_OPS_GPU); #undef REGISTER_TENSOR_LIST_OPS_GPU diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index 148540a3d82..2e3c120248f 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -581,21 +581,14 @@ struct MatMulFunctor { .Label("cublas"), \ MatMulOp) -TF_CALL_float(REGISTER_CPU); -TF_CALL_double(REGISTER_CPU); -TF_CALL_half(REGISTER_CPU); -TF_CALL_bfloat16(REGISTER_CPU); TF_CALL_int32(REGISTER_CPU); TF_CALL_int64(REGISTER_CPU); -TF_CALL_complex64(REGISTER_CPU); -TF_CALL_complex128(REGISTER_CPU); +TF_CALL_FLOAT_TYPES(REGISTER_CPU); +TF_CALL_COMPLEX_TYPES(REGISTER_CPU); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -TF_CALL_float(REGISTER_GPU); -TF_CALL_double(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -TF_CALL_half(REGISTER_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc index 5254ade87b9..4dcce5a8f58 100644 --- a/tensorflow/core/kernels/matrix_band_part_op.cc +++ b/tensorflow/core/kernels/matrix_band_part_op.cc @@ -210,10 +210,7 @@ namespace functor { }; \ extern template struct MatrixBandPartFunctor; -TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); -TF_CALL_bool(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC); #undef DECLARE_GPU_SPEC } // namespace functor @@ -225,10 +222,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); .HostMemory("num_lower") \ .HostMemory("num_upper"), \ MatrixBandPartOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART_GPU); -TF_CALL_bool(REGISTER_MATRIX_BAND_PART_GPU); -TF_CALL_complex64(REGISTER_MATRIX_BAND_PART_GPU); -TF_CALL_complex128(REGISTER_MATRIX_BAND_PART_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_MATRIX_BAND_PART_GPU); #undef REGISTER_MATRIX_BAND_PART_GPU // Registration of the deprecated kernel. 
diff --git a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc index 4a94c51e878..9eb3e4f72a2 100644 --- a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc @@ -68,10 +68,7 @@ struct MatrixBandPartFunctor { #define DEFINE_GPU_SPEC(T) template struct MatrixBandPartFunctor; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); -TF_CALL_bool(DEFINE_GPU_SPEC); -TF_CALL_complex64(DEFINE_GPU_SPEC); -TF_CALL_complex128(DEFINE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPEC); #undef DEFINE_GPU_SPEC } // namespace functor diff --git a/tensorflow/core/kernels/matrix_diag_op.cc b/tensorflow/core/kernels/matrix_diag_op.cc index 9796fd25e39..05d7e4e6f86 100644 --- a/tensorflow/core/kernels/matrix_diag_op.cc +++ b/tensorflow/core/kernels/matrix_diag_op.cc @@ -469,10 +469,7 @@ namespace functor { const bool left_align_subdiagonal); \ extern template struct MatrixDiagPart; -TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); -TF_CALL_bool(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -513,10 +510,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); .HostMemory("padding_value"), \ MatrixDiagPartOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_DIAG_GPU); -TF_CALL_bool(REGISTER_MATRIX_DIAG_GPU); -TF_CALL_complex64(REGISTER_MATRIX_DIAG_GPU); -TF_CALL_complex128(REGISTER_MATRIX_DIAG_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_MATRIX_DIAG_GPU); #undef REGISTER_MATRIX_DIAG_GPU // Registration of the deprecated kernel. diff --git a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc index 53cd2d2dc46..76271798d5f 100644 --- a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc @@ -163,10 +163,7 @@ struct MatrixDiagPart { template struct MatrixDiag; \ template struct MatrixDiagPart; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); -TF_CALL_bool(DEFINE_GPU_SPEC); -TF_CALL_complex64(DEFINE_GPU_SPEC); -TF_CALL_complex128(DEFINE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPEC); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc index 2701ff788f7..bf98fd0d47d 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op.cc @@ -272,10 +272,7 @@ namespace functor { const bool left_align_superdiagonal, const bool left_align_subdiagonal); \ extern template struct MatrixSetDiag; -TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); -TF_CALL_bool(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -295,10 +292,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); .HostMemory("k"), \ MatrixSetDiagOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG_GPU); -TF_CALL_bool(REGISTER_MATRIX_SET_DIAG_GPU); -TF_CALL_complex64(REGISTER_MATRIX_SET_DIAG_GPU); -TF_CALL_complex128(REGISTER_MATRIX_SET_DIAG_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_MATRIX_SET_DIAG_GPU); #undef REGISTER_MATRIX_SET_DIAG_GPU // Registration of the deprecated kernel. 
diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc index 4f742b90bff..4e32f8a52e8 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc @@ -136,10 +136,7 @@ struct MatrixSetDiag { #define DEFINE_GPU_SPEC(T) template struct MatrixSetDiag; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); -TF_CALL_bool(DEFINE_GPU_SPEC); -TF_CALL_complex64(DEFINE_GPU_SPEC); -TF_CALL_complex128(DEFINE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPEC); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op_complex.cc b/tensorflow/core/kernels/matrix_triangular_solve_op_complex.cc index 47f958ff6a9..ae3702078a0 100644 --- a/tensorflow/core/kernels/matrix_triangular_solve_op_complex.cc +++ b/tensorflow/core/kernels/matrix_triangular_solve_op_complex.cc @@ -13,16 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/matrix_triangular_solve_op_impl.h" namespace tensorflow { -TF_CALL_complex64(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_CPU); -TF_CALL_complex128(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_CPU); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_CPU); #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_GPU); -TF_CALL_complex128(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATRIX_TRIANGULAR_SOLVE_GPU); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index 37888656020..2d0b18edb27 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -316,13 +316,11 @@ class BatchMatMulMkl : public OpKernel { #ifdef ENABLE_MKL TF_CALL_float(REGISTER_BATCH_MATMUL_MKL); TF_CALL_double(REGISTER_BATCH_MATMUL_MKL); -TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL); -TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATMUL_MKL); TF_CALL_float(REGISTER_BATCH_MATMUL_MKL_V2); TF_CALL_double(REGISTER_BATCH_MATMUL_MKL_V2); -TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL_V2); -TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL_V2); +TF_CALL_COMPLEX_TYPES(REGISTER_BATCH_MATMUL_MKL_V2); #if defined(ENABLE_MKLDNN_V1) && defined(ENABLE_INTEL_MKL_BFLOAT16) TF_CALL_bfloat16(REGISTER_BATCH_MATMUL_MKL); diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc index 3badbc294b7..e6cb84bab74 100644 --- a/tensorflow/core/kernels/one_hot_op.cc +++ b/tensorflow/core/kernels/one_hot_op.cc @@ -160,12 +160,9 @@ namespace functor { DECLARE_GPU_SPEC_INDEX(T, int32); \ DECLARE_GPU_SPEC_INDEX(T, int64); -TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); -TF_CALL_bool(DECLARE_GPU_SPEC); TF_CALL_int32(DECLARE_GPU_SPEC); TF_CALL_int64(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC); #undef DECLARE_GPU_SPEC_INDEX #undef DECLARE_GPU_SPEC @@ -186,12 +183,9 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); REGISTER_ONE_HOT_GPU_INDEX(type, int32); \ REGISTER_ONE_HOT_GPU_INDEX(type, int64); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_ONE_HOT_GPU); 
-TF_CALL_bool(REGISTER_ONE_HOT_GPU); TF_CALL_int32(REGISTER_ONE_HOT_GPU); TF_CALL_int64(REGISTER_ONE_HOT_GPU); -TF_CALL_complex64(REGISTER_ONE_HOT_GPU); -TF_CALL_complex128(REGISTER_ONE_HOT_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_ONE_HOT_GPU); #undef REGISTER_ONE_HOT_GPU_INDEX #undef REGISTER_ONE_HOT_GPU diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc index 8df7284caed..47af41477c7 100644 --- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc +++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc @@ -37,12 +37,9 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPEC_INDEX(T, int32); \ DEFINE_GPU_SPEC_INDEX(T, int64) -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); -TF_CALL_bool(DEFINE_GPU_SPEC); TF_CALL_int32(DEFINE_GPU_SPEC); TF_CALL_int64(DEFINE_GPU_SPEC); -TF_CALL_complex64(DEFINE_GPU_SPEC); -TF_CALL_complex128(DEFINE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPEC); #undef DEFINE_GPU_SPEC_INDEX #undef DEFINE_GPU_SPEC diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index cf2b6bb1100..04b5c72b3cf 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -152,13 +152,10 @@ REGISTER_PACK(tstring); Name("Pack").Device(DEVICE_GPU).TypeConstraint("T"), \ PackOp) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); TF_CALL_bfloat16(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); TF_CALL_int16(REGISTER_GPU); -TF_CALL_bool(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU); #undef REGISTER_GPU // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/reduction_ops_euclidean.cc b/tensorflow/core/kernels/reduction_ops_euclidean.cc index cf719e76cd8..9bc11e29069 100644 --- a/tensorflow/core/kernels/reduction_ops_euclidean.cc +++ b/tensorflow/core/kernels/reduction_ops_euclidean.cc @@ -52,8 +52,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); functor::EuclideanNormReducer>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_GPU_KERNELS); -TF_CALL_complex128(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #endif #undef REGISTER_GPU_KERNELS diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc index d314f1953dc..e96d6f829ac 100644 --- a/tensorflow/core/kernels/reduction_ops_mean.cc +++ b/tensorflow/core/kernels/reduction_ops_mean.cc @@ -52,8 +52,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); ReductionOp>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_GPU_KERNELS); -TF_CALL_complex128(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #endif #undef REGISTER_GPU_KERNELS diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc index 0642bad9218..33742e97146 100644 --- a/tensorflow/core/kernels/reduction_ops_prod.cc +++ b/tensorflow/core/kernels/reduction_ops_prod.cc @@ -50,11 +50,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .HostMemory("reduction_indices"), \ ReductionOp>); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int32(REGISTER_GPU_KERNELS); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_GPU_KERNELS); -TF_CALL_complex128(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #endif #undef REGISTER_GPU_KERNELS diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc 
index d79684df290..b5f7a5d7089 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -50,11 +50,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .TypeConstraint("Tidx") \ .HostMemory("reduction_indices"), \ ReductionOp>); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int64(REGISTER_GPU_KERNELS); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_GPU_KERNELS); -TF_CALL_complex128(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #endif #undef REGISTER_GPU_KERNELS diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 98bf8bf8e91..7ec6923492f 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -342,12 +342,7 @@ namespace functor { TF_CALL_uint8(DECLARE_GPU_SPEC); TF_CALL_int8(DECLARE_GPU_SPEC); -TF_CALL_bool(DECLARE_GPU_SPEC); -TF_CALL_half(DECLARE_GPU_SPEC); -TF_CALL_float(DECLARE_GPU_SPEC); -TF_CALL_double(DECLARE_GPU_SPEC); -TF_CALL_complex64(DECLARE_GPU_SPEC); -TF_CALL_complex128(DECLARE_GPU_SPEC); +TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPEC); #undef DECLARE_GPU_SPEC #undef DECLARE_GPU_SPEC_DIM } // namespace functor @@ -373,12 +368,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); ReverseV2Op) TF_CALL_uint8(REGISTER_GPU_KERNELS); TF_CALL_int8(REGISTER_GPU_KERNELS); -TF_CALL_bool(REGISTER_GPU_KERNELS); -TF_CALL_half(REGISTER_GPU_KERNELS); -TF_CALL_float(REGISTER_GPU_KERNELS); -TF_CALL_double(REGISTER_GPU_KERNELS); -TF_CALL_complex64(REGISTER_GPU_KERNELS); -TF_CALL_complex128(REGISTER_GPU_KERNELS); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNEL // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc index 2917a0d5f11..28c50bc66df 100644 --- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc @@ -40,12 +40,7 @@ typedef Eigen::GpuDevice GPUDevice; TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS); TF_CALL_int8(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_bool(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_half(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_float(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_double(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_complex64(DEFINE_REVERSE_ALL_DIMS); -TF_CALL_complex128(DEFINE_REVERSE_ALL_DIMS); +TF_CALL_GPU_ALL_TYPES(DEFINE_REVERSE_ALL_DIMS); #undef DEFINE_REVERSE #undef DEFINE_REVERSE_ALL_DIMS diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc index 8d6147801e7..2a141864e18 100644 --- a/tensorflow/core/kernels/roll_op.cc +++ b/tensorflow/core/kernels/roll_op.cc @@ -397,10 +397,9 @@ TF_CALL_ALL_TYPES(REGISTER_CPU); TF_CALL_int32(REGISTER_KERNEL); TF_CALL_int64(REGISTER_KERNEL); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL); -TF_CALL_complex64(REGISTER_KERNEL); -TF_CALL_complex128(REGISTER_KERNEL); TF_CALL_uint32(REGISTER_KERNEL); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL); +TF_CALL_COMPLEX_TYPES(REGISTER_KERNEL); #undef REGISTER_KERNEL #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/roll_op_gpu.cu.cc b/tensorflow/core/kernels/roll_op_gpu.cu.cc index 7ba37e2f59c..0e81b576330 100644 --- a/tensorflow/core/kernels/roll_op_gpu.cu.cc +++ b/tensorflow/core/kernels/roll_op_gpu.cu.cc @@ -93,10 +93,9 @@ struct Roll { TF_CALL_int32(DEFINE_GPU_SPECS); TF_CALL_int64(DEFINE_GPU_SPECS); +TF_CALL_uint32(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); -TF_CALL_complex64(DEFINE_GPU_SPECS); 
-TF_CALL_complex128(DEFINE_GPU_SPECS); -TF_CALL_uint32(DEFINE_GPU_SPECS) +TF_CALL_COMPLEX_TYPES(DEFINE_GPU_SPECS); #undef DEFINE_GPU_SPECS } // namespace functor diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 96711fce643..c6c93077f01 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -502,8 +502,7 @@ TF_CALL_int64(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_int64(REGISTER_SCATTER_ND_MIN_MAX_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_ND_MIN_MAX_GPU); -TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); -TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_SCATTER_ND_ALL_GPU); #undef REGISTER_SCATTER_ND_ALL_GPU @@ -563,8 +562,7 @@ TF_CALL_int64(REGISTER_SCATTER_ND_TENSOR_GPU); TF_CALL_int64(REGISTER_SCATTER_ND_TENSOR_GPU_MIN_MAX); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_TENSOR_GPU); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_TENSOR_GPU_MIN_MAX); -TF_CALL_complex64(REGISTER_SCATTER_ND_TENSOR_GPU); -TF_CALL_complex128(REGISTER_SCATTER_ND_TENSOR_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_SCATTER_ND_TENSOR_GPU); #undef REGISTER_SCATTER_ND_ADD #undef REGISTER_SCATTER_ND_ADD_SUB @@ -862,8 +860,7 @@ TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_int32(DECLARE_GPU_SPECS_MIN_MAX); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS_MIN_MAX); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS_MIN_MAX #undef DECLARE_GPU_SPECS diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index 7fb2d3916e3..64b69af423f 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -200,8 +200,7 @@ TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_int64(DECLARE_GPU_SPECS_MINMAX); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS_MINMAX); -TF_CALL_complex64(DECLARE_GPU_SPECS); -TF_CALL_complex128(DECLARE_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DECLARE_GPU_SPECS); #undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS_MINMAX diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index bc28e64c4d7..418af1d6b6d 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -244,8 +244,7 @@ TF_CALL_int32(DEFINE_SUM_GPU_SPECS); // TODO(rocm): support atomicAdd for complex numbers on ROCm #if GOOGLE_CUDA -TF_CALL_complex64(DEFINE_SUM_GPU_SPECS); -TF_CALL_complex128(DEFINE_SUM_GPU_SPECS); +TF_CALL_COMPLEX_TYPES(DEFINE_SUM_GPU_SPECS); #endif #undef DEFINE_SORTED_GPU_SPECS_INDEX diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_3.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_3.cc index d0d46938629..eef5a532b29 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl_3.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_impl_3.cc @@ -113,8 +113,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); // TODO(rocm): support atomicAdd for complex numbers on ROCm #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); -TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); 
+TF_CALL_COMPLEX_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); #endif #undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_4.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_4.cc index 92caeb3c544..cad6f8a5e08 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl_4.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_impl_4.cc @@ -113,8 +113,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); // TODO(rocm): support atomicAdd for complex numbers on ROCm #if GOOGLE_CUDA -TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); -TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); +TF_CALL_COMPLEX_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL); #endif #undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 62c63263d48..6d7cd6f2a3d 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -300,14 +300,11 @@ namespace functor { DECLARE_GPU_SPEC(T, 7); \ DECLARE_GPU_SPEC(T, 8); -TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N); -TF_CALL_complex64(DECLARE_FOR_N); -TF_CALL_complex128(DECLARE_FOR_N); TF_CALL_bfloat16(DECLARE_FOR_N); -TF_CALL_bool(DECLARE_FOR_N); TF_CALL_int8(DECLARE_FOR_N); +TF_CALL_int32(DECLARE_FOR_N); TF_CALL_int64(DECLARE_FOR_N); -DECLARE_FOR_N(int32); +TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N); #undef DECLARE_FOR_N #undef DECLARE_GPU_SPEC @@ -321,13 +318,10 @@ DECLARE_FOR_N(int32); .HostMemory("size"), \ SliceOp) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_bfloat16(REGISTER_GPU); -TF_CALL_bool(REGISTER_GPU); TF_CALL_int8(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. 
This kernel diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc index 5a9d2ff950a..c20d01751d9 100644 --- a/tensorflow/core/kernels/slice_op_gpu.cu.cc +++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc @@ -37,14 +37,11 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::Slice; \ template struct functor::Slice; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); -TF_CALL_complex64(DEFINE_GPU_KERNELS); -TF_CALL_complex128(DEFINE_GPU_KERNELS); TF_CALL_bfloat16(DEFINE_GPU_KERNELS); -TF_CALL_bool(DEFINE_GPU_KERNELS); TF_CALL_int8(DEFINE_GPU_KERNELS); -DEFINE_GPU_KERNELS(int32); -DEFINE_GPU_KERNELS(int64); +TF_CALL_int32(DEFINE_GPU_KERNELS); +TF_CALL_int64(DEFINE_GPU_KERNELS); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc index b094a5320f7..b4379a01ce1 100644 --- a/tensorflow/core/kernels/split_lib_gpu.cu.cc +++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc @@ -51,20 +51,16 @@ void SplitCustom::operator()( template struct Split; \ template struct Split; -TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); -TF_CALL_complex64(DEFINE_GPU_KERNELS); -TF_CALL_complex128(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); TF_CALL_bfloat16(DEFINE_GPU_KERNELS); TF_CALL_uint8(DEFINE_GPU_KERNELS); -TF_CALL_bool(DEFINE_GPU_KERNELS); +TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS #define DEFINE_GPU_KERNELS(T) template struct SplitCustom; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); -TF_CALL_complex64(DEFINE_GPU_KERNELS); -TF_CALL_complex128(DEFINE_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(DEFINE_GPU_KERNELS); TF_CALL_bfloat16(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS @@ -248,8 +244,7 @@ void SplitVOpGPULaunch::Run( #define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch; TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL); -TF_CALL_complex64(REGISTER_GPU_KERNEL); -TF_CALL_complex128(REGISTER_GPU_KERNEL); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL); TF_CALL_bfloat16(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL #define REGISTER_GPU_KERNEL(T) \ @@ -257,8 +252,7 @@ TF_CALL_bfloat16(REGISTER_GPU_KERNEL); template struct SplitVOpGPULaunch; TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL); -TF_CALL_complex64(REGISTER_GPU_KERNEL); -TF_CALL_complex128(REGISTER_GPU_KERNEL); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNEL); TF_CALL_bfloat16(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL diff --git a/tensorflow/core/kernels/split_lib_gpu.h b/tensorflow/core/kernels/split_lib_gpu.h index 20feb7df143..f2c343ae5fd 100644 --- a/tensorflow/core/kernels/split_lib_gpu.h +++ b/tensorflow/core/kernels/split_lib_gpu.h @@ -50,12 +50,9 @@ struct SplitVOpGPULaunch { extern template struct SplitVOpGPULaunch; \ extern template struct SplitVOpGPULaunch; -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL); -TF_CALL_complex64(REGISTER_GPU_KERNEL); -TF_CALL_complex128(REGISTER_GPU_KERNEL); TF_CALL_bfloat16(REGISTER_GPU_KERNEL); TF_CALL_uint8(REGISTER_GPU_KERNEL); -TF_CALL_bool(REGISTER_GPU_KERNEL); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index f8bf5bce197..f09740c6198 100644 --- a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -417,10 +417,9 @@ REGISTER_SPLIT(uint64); .HostMemory("split_dim"), \ SplitOpGPU) +TF_CALL_bfloat16(REGISTER_GPU); 
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc index 1eaeda927f8..4569e11dd13 100644 --- a/tensorflow/core/kernels/split_v_op.cc +++ b/tensorflow/core/kernels/split_v_op.cc @@ -471,10 +471,9 @@ TF_CALL_ALL_TYPES(REGISTER_SPLIT_LEN); REGISTER_GPU(type, int32); \ REGISTER_GPU(type, int64); +TF_CALL_bfloat16(REGISTER_GPU_LEN); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_LEN); -TF_CALL_complex64(REGISTER_GPU_LEN); -TF_CALL_complex128(REGISTER_GPU_LEN); -REGISTER_GPU_LEN(bfloat16); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_LEN); #undef REGISTER_GPU_LEN #undef REGISTER_GPU diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index dd23c251897..ccc1984bb98 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -486,12 +486,9 @@ TF_CALL_uint64(REGISTER_STRIDED_SLICE); .HostMemory("strides"), \ StridedSliceAssignOp) -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_bool(REGISTER_GPU); TF_CALL_int8(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); +TF_CALL_GPU_ALL_TYPES(REGISTER_GPU); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc index a930b8a3fac..33d94f4fc71 100644 --- a/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc +++ b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc @@ -21,8 +21,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h" namespace tensorflow { -TF_CALL_complex64(DEFINE_GPU_KERNELS); -TF_CALL_complex128(DEFINE_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(DEFINE_GPU_KERNELS); } // end namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index bf69a19abc5..1ae959b7b3f 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -278,16 +278,12 @@ class HandleStridedSliceAssignCase { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_CALL_GPU_PROXY_TYPES(PREVENT_FOR_N_GPU); -TF_CALL_complex64(PREVENT_FOR_N_GPU); -TF_CALL_complex128(PREVENT_FOR_N_GPU); +TF_CALL_COMPLEX_TYPES(PREVENT_FOR_N_GPU); -TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU); -TF_CALL_complex64(DECLARE_FOR_N_GPU); -TF_CALL_complex128(DECLARE_FOR_N_GPU); -TF_CALL_bool(DECLARE_FOR_N_GPU); TF_CALL_int8(DECLARE_FOR_N_GPU); -DECLARE_FOR_N_GPU(int32); -DECLARE_FOR_N_GPU(int64); +TF_CALL_int32(DECLARE_FOR_N_GPU); +TF_CALL_int64(DECLARE_FOR_N_GPU); +TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N_GPU); #endif // END GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU); diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc index 69efc016a1f..8613ed8c80d 100644 --- a/tensorflow/core/kernels/tensor_array.cc +++ b/tensorflow/core/kernels/tensor_array.cc @@ -46,8 +46,7 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU) #define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T) TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); -TF_CALL_complex64(TENSOR_ARRAY_WRITE_OR_ADD_GPU); -TF_CALL_complex128(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); #undef TENSOR_ARRAY_WRITE_OR_ADD_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -71,8 +70,7 @@ TF_CALL_bool(TENSOR_ARRAY_SET_ZERO_CPU); #define TENSOR_ARRAY_SET_ZERO_GPU(T) TENSOR_ARRAY_SET_ZERO(GPUDevice, T) TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); -TF_CALL_complex64(TENSOR_ARRAY_SET_ZERO_GPU); -TF_CALL_complex128(TENSOR_ARRAY_SET_ZERO_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); #undef TENSOR_ARRAY_SET_ZERO_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index 405eba6c542..9bed5c533cf 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -61,8 +61,7 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU) #define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T) TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); -TF_CALL_complex64(TENSOR_ARRAY_WRITE_OR_ADD_GPU); -TF_CALL_complex128(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); #undef TENSOR_ARRAY_WRITE_OR_ADD_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -89,8 +88,7 @@ TF_CALL_bool(TENSOR_ARRAY_SET_ZERO_CPU); #define TENSOR_ARRAY_SET_ZERO_GPU(T) TENSOR_ARRAY_SET_ZERO(GPUDevice, T) TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); -TF_CALL_complex64(TENSOR_ARRAY_SET_ZERO_GPU); -TF_CALL_complex128(TENSOR_ARRAY_SET_ZERO_GPU); +TF_CALL_COMPLEX_TYPES(TENSOR_ARRAY_SET_ZERO_GPU); #undef TENSOR_ARRAY_SET_ZERO_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index ea8e04a33f4..048048cea9e 100644 
--- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -256,11 +256,10 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU), .HostMemory("handle"), \ TensorArrayOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_bfloat16(REGISTER_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -483,10 +482,9 @@ TF_CALL_ALL_TYPES(REGISTER_WRITE); .HostMemory("index"), \ TensorArrayWriteOp); +TF_CALL_bfloat16(REGISTER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -572,11 +570,10 @@ TF_CALL_ALL_TYPES(REGISTER_READ) .HostMemory("index"), \ TensorArrayReadOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_bfloat16(REGISTER_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -774,10 +771,9 @@ REGISTER_GATHER_AND_PACK(qint32); .HostMemory("handle"), \ TensorArrayPackOrGatherOp); +TF_CALL_bfloat16(REGISTER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU // A special GPU kernel for int32. @@ -995,10 +991,9 @@ REGISTER_CONCAT(qint32); .HostMemory("handle"), \ TensorArrayConcatOp) +TF_CALL_bfloat16(REGISTER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); -REGISTER_GPU(bfloat16); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU // A special GPU kernel for int32. 
@@ -1215,10 +1210,9 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK); TensorArrayUnpackOrScatterOp); -TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); TF_CALL_int64(REGISTER_GPU); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1387,8 +1381,7 @@ TF_CALL_ALL_TYPES(REGISTER_SPLIT); TensorArraySplitOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); -TF_CALL_complex64(REGISTER_GPU); -TF_CALL_complex128(REGISTER_GPU); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU); #undef REGISTER_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 7e2a8b363c5..b3714e4d27e 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -727,12 +727,8 @@ class ApplyGradientDescentOp : public OpKernel { ApplyGradientDescentOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -TF_CALL_half(REGISTER_CPU_KERNELS); -TF_CALL_bfloat16(REGISTER_CPU_KERNELS); -TF_CALL_float(REGISTER_CPU_KERNELS); -TF_CALL_double(REGISTER_CPU_KERNELS); -TF_CALL_complex64(REGISTER_CPU_KERNELS); -TF_CALL_complex128(REGISTER_CPU_KERNELS); +TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. @@ -898,12 +894,8 @@ class ApplyAdadeltaOp : public OpKernel { ApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -TF_CALL_half(REGISTER_CPU_KERNELS); -TF_CALL_bfloat16(REGISTER_CPU_KERNELS); -TF_CALL_float(REGISTER_CPU_KERNELS); -TF_CALL_double(REGISTER_CPU_KERNELS); -TF_CALL_complex64(REGISTER_CPU_KERNELS); -TF_CALL_complex128(REGISTER_CPU_KERNELS); +TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. @@ -1089,12 +1081,8 @@ class SparseApplyAdadeltaOp : public OpKernel { REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); -TF_CALL_half(REGISTER_CPU_KERNELS); -TF_CALL_bfloat16(REGISTER_CPU_KERNELS); -TF_CALL_float(REGISTER_CPU_KERNELS); -TF_CALL_double(REGISTER_CPU_KERNELS); -TF_CALL_complex64(REGISTER_CPU_KERNELS); -TF_CALL_complex128(REGISTER_CPU_KERNELS); +TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS @@ -1383,12 +1371,8 @@ class ApplyAdagradOp : public OpKernel { ApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); -TF_CALL_half(REGISTER_CPU_KERNELS); -TF_CALL_bfloat16(REGISTER_CPU_KERNELS); -TF_CALL_float(REGISTER_CPU_KERNELS); -TF_CALL_double(REGISTER_CPU_KERNELS); -TF_CALL_complex64(REGISTER_CPU_KERNELS); -TF_CALL_complex128(REGISTER_CPU_KERNELS); +TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. 
@@ -1492,12 +1476,8 @@ class ApplyAdagradV2Op : public OpKernel {
                         ApplyAdagradV2Op);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
@@ -1801,12 +1781,8 @@ class SparseApplyAdagradOp : public OpKernel {
   REGISTER_KERNELS(T, int32); \
   REGISTER_KERNELS(T, int64);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
@@ -1976,12 +1952,8 @@ class SparseApplyAdagradV2Op : public OpKernel {
   REGISTER_KERNELS(T, int32); \
   REGISTER_KERNELS(T, int64);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
@@ -3054,12 +3026,8 @@ class ApplyMomentumOp : public OpKernel {
                         ApplyMomentumOp);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
@@ -3209,12 +3177,8 @@ class SparseApplyMomentumOp : public OpKernel {
   REGISTER_KERNELS(T, int32); \
   REGISTER_KERNELS(T, int64);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
@@ -3288,12 +3252,8 @@ class ApplyKerasMomentumOp : public OpKernel {
                         ApplyKerasMomentumOp);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
@@ -3423,12 +3383,9 @@ class SparseApplyKerasMomentumOp : public OpKernel {
   REGISTER_KERNELS(T, CPU, int32); \
   REGISTER_KERNELS(T, CPU, int64);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
+
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -3699,12 +3656,8 @@ class ApplyAdamOp : public OpKernel {
                         ApplyAdamOp);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
@@ -4226,12 +4179,8 @@ class ApplyCenteredRMSPropOp : public OpKernel {
                         ApplyCenteredRMSPropOp);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-TF_CALL_half(REGISTER_CPU_KERNELS);
-TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
-TF_CALL_float(REGISTER_CPU_KERNELS);
-TF_CALL_double(REGISTER_CPU_KERNELS);
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index ce02aa17225..7ac02e8b4d4 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -144,12 +144,9 @@ TF_CALL_ALL_TYPES(REGISTER_UNPACK);
       Name("Unpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       UnpackOp<GPUDevice, type>)
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_uint8(REGISTER_GPU);
-TF_CALL_bool(REGISTER_GPU);
-TF_CALL_complex64(REGISTER_GPU);
-TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_GPU_ALL_TYPES(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index e527c1a2e3f..6f5e0b94eca 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -250,11 +250,10 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL);
                             .HostMemory("is_initialized"), \
                         IsVariableInitializedOp);
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
-TF_CALL_complex64(REGISTER_GPU_KERNELS);
-TF_CALL_complex128(REGISTER_GPU_KERNELS);
 TF_CALL_int64(REGISTER_GPU_KERNELS);
 TF_CALL_uint32(REGISTER_GPU_KERNELS);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
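
For readers unfamiliar with the dispatch pattern used throughout the hunks above, here is a minimal, self-contained C++ sketch of the TF_CALL_* "X-macro" expansion. This is not TensorFlow code: the TF_CALL_* names mirror tensorflow/core/framework/register_types.h, but the dtype aliases and the registration body are hypothetical stand-ins that record dtype names instead of registering OpKernel factories.

// Minimal sketch of the TF_CALL_* X-macro dispatch pattern (illustrative only;
// the real definitions live in tensorflow/core/framework/register_types.h and
// expand into REGISTER_KERNEL_BUILDER calls rather than a string registry).
#include <complex>
#include <iostream>
#include <string>
#include <vector>

using complex64 = std::complex<float>;    // stand-in for tensorflow::complex64
using complex128 = std::complex<double>;  // stand-in for tensorflow::complex128

// Per-dtype dispatch macros: each TF_CALL_<dtype>(m) invokes the macro `m`
// once with that dtype, so a grouping macro expands into one call per dtype.
#define TF_CALL_complex64(m) m(complex64)
#define TF_CALL_complex128(m) m(complex128)
#define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m)

// Hypothetical stand-in for kernel registration: records the dtype name.
std::vector<std::string>& Registry() {
  static std::vector<std::string> registry;
  return registry;
}

template <typename T>
void RegisterKernel(const char* dtype_name) {
  Registry().push_back(dtype_name);
}

#define REGISTER_CPU_KERNELS(T) RegisterKernel<T>(#T);

int main() {
  // Expands to:
  //   RegisterKernel<complex64>("complex64");
  //   RegisterKernel<complex128>("complex128");
  TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS);
  for (const auto& name : Registry()) {
    std::cout << "registered: " << name << "\n";
  }
  return 0;
}

Compiling and running this sketch prints "registered: complex64" and "registered: complex128", i.e. the single TF_CALL_COMPLEX_TYPES(REGISTER_CPU_KERNELS) line expands to the same per-dtype invocations that were previously written out by hand.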