Provide a template overload that works for all primitive types of GpuAtomic[Add|Sub|Min|Max]. CUDA only provides atomic intrinsics for a handful of these types, so we emulate the rest. In particular, CUDA provides no overloads for unsigned long or long.

PiperOrigin-RevId: 331918801
Change-Id: I7cc16d51d8e5d1710abe704ac2644248d26238f6
commit 541b4040fa (parent 85d50dbb7d)
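For context, a minimal sketch of what this enables at a call site: accumulating atomically into an unsigned long buffer through the single generic overload, with no per-type specialization. This is illustrative only; it assumes the wrappers come from tensorflow/core/util/gpu_device_functions.h and are compiled as CUDA inside TensorFlow, and the kernel and parameter names are made up.

// Illustrative caller (not part of this change). Assumes the GpuAtomic*
// wrappers live in tensorflow/core/util/gpu_device_functions.h and that this
// translation unit is compiled as CUDA within TensorFlow; the kernel and its
// parameters are hypothetical.
#include "tensorflow/core/util/gpu_device_functions.h"

namespace tensorflow {

// Each thread bumps one bucket of an unsigned long histogram. Before this
// change there was no GpuAtomicAdd overload for unsigned long; now the
// generic template remaps the pointer via detail::ToCudaSupportedPtr.
__global__ void HistogramKernel(const int* bucket_of, int n,
                                unsigned long* counts) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    GpuAtomicAdd(counts + bucket_of[i], 1ul);
  }
}

}  // namespace tensorflow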
@@ -606,7 +606,7 @@ __device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
   // HIP has a bug in the implementation of __longlong_as_double
   // So workaround it by using reinterpret_cast<double*>.
   uint64_t result =
-      GpuAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
+      GpuAtomicCasHelper(reinterpret_cast<unsigned long long*>(ptr),
                          [accumulate](tensorflow::uint64 a) {
                            return __double_as_longlong(
                                accumulate(*(reinterpret_cast<double*>(&a))));
@@ -614,7 +614,7 @@ __device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
   return *(reinterpret_cast<double*>(&result));
 #else
   return __longlong_as_double(GpuAtomicCasHelper(
-      reinterpret_cast<tensorflow::uint64*>(ptr),
+      reinterpret_cast<unsigned long long*>(ptr),
       [accumulate](tensorflow::uint64 a) {
         return __double_as_longlong(accumulate(__longlong_as_double(a)));
       }));
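The two hunks above only change the pointer type handed to GpuAtomicCasHelper, from tensorflow::uint64* to unsigned long long*, the type CUDA's 64-bit atomicCAS actually operates on. For readers unfamiliar with the pattern being wrapped, here is a standalone sketch of the same compare-and-swap loop for a double, independent of the TensorFlow helpers (the function name is illustrative):

// Sketch of the CAS loop that GpuAtomicCasHelper encapsulates: apply an
// arbitrary update to a double by retrying 64-bit atomicCAS until no other
// thread has intervened. atomicCAS works on unsigned long long, so the
// double's bit pattern is moved back and forth with
// __longlong_as_double / __double_as_longlong.
__device__ double AtomicAddDoubleViaCas(double* ptr, double operand) {
  unsigned long long* addr = reinterpret_cast<unsigned long long*>(ptr);
  unsigned long long old_bits = *addr;
  unsigned long long assumed;
  do {
    assumed = old_bits;
    double updated = __longlong_as_double(assumed) + operand;  // the update step
    old_bits = atomicCAS(addr, assumed, __double_as_longlong(updated));
  } while (assumed != old_bits);  // another thread wrote in between; retry
  return __longlong_as_double(old_bits);  // value observed before our write
}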
@@ -676,6 +676,38 @@ template <typename From, typename To>
 using ToTypeIfConvertible =
     typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
 
+template <typename T>
+struct CudaSupportedTypeImpl {
+  using type = T;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long long> {
+  using type = unsigned long long;
+};
+
+template <>
+struct CudaSupportedTypeImpl<unsigned long> {
+  using type =
+      typename std::conditional<sizeof(unsigned long) == sizeof(unsigned int),
+                                unsigned int, unsigned long long>::type;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long> {
+  // This cast should be safe since module-2 addition should work fine. However,
+  // signed overflow is not handled correctly since it's undefined behavior.
+  using type = typename CudaSupportedTypeImpl<unsigned long>::type;
+};
+
+template <typename T>
+using CudaSupportedType = typename CudaSupportedTypeImpl<T>::type;
+
+template <typename T>
+__device__ CudaSupportedType<T>* ToCudaSupportedPtr(T* ptr) {
+  return reinterpret_cast<CudaSupportedType<T>*>(ptr);
+}
+
 }  // namespace detail
 
 // CUDA provides atomic ops, but not for all types. We provide wrappers
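The trait added above never changes the width of the pointed-to object; it only renames the type to one that CUDA's atomic intrinsics accept. A few compile-time checks make the mapping explicit (a sketch, assuming the trait ends up in tensorflow::detail as the surrounding namespace suggests):

// Compile-time sanity checks for CudaSupportedType (illustrative; assumes the
// header defining the trait is included).
#include <type_traits>

namespace gd = tensorflow::detail;  // assumed namespace of the new trait

// Types CUDA already handles pass through untouched.
static_assert(std::is_same<gd::CudaSupportedType<float>, float>::value,
              "float is unchanged");

// long long is remapped to unsigned long long, for which atomicAdd exists.
static_assert(std::is_same<gd::CudaSupportedType<long long>,
                           unsigned long long>::value,
              "long long maps to unsigned long long");

// long / unsigned long pick whichever supported type has the same width, so
// the reinterpret_cast in ToCudaSupportedPtr never changes object size, on
// both LP64 (8-byte long) and LLP64 (4-byte long) platforms.
static_assert(sizeof(gd::CudaSupportedType<unsigned long>) ==
                  sizeof(unsigned long),
              "width preserved");
static_assert(sizeof(gd::CudaSupportedType<long>) == sizeof(long),
              "width preserved");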
@@ -683,13 +715,7 @@ using ToTypeIfConvertible =
 
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
-  return atomicAdd(ptr, value);
-}
-
-__device__ inline int64 GpuAtomicAdd(int64* ptr, int64 value) {
-  // This cast should be safe since module-2 addition should work fine. However,
-  // signed overflow is not handled correctly since it's undefined behavior.
-  return atomicAdd(reinterpret_cast<uint64*>(ptr), static_cast<uint64>(value));
+  return atomicAdd(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 __device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
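The dedicated int64 overload and its reinterpret_cast to uint64 are gone; the generic template now performs the same remapping through detail::ToCudaSupportedPtr. The comment's modular-arithmetic reasoning can be checked on the host: adding through the unsigned type and casting back reproduces two's-complement wraparound without invoking signed-overflow UB. A small illustrative snippet (not part of the change):

// Host-side illustration of the modular-arithmetic argument: unsigned 64-bit
// addition is defined to wrap modulo 2^64, and casting the wrapped sum back
// to a signed 64-bit value yields the same bit pattern as two's-complement
// signed addition would, without relying on signed overflow (which is UB).
#include <cstdint>
#include <cstdio>

int main() {
  int64_t a = INT64_MAX;  // deliberately at the signed boundary
  int64_t b = 7;
  uint64_t wrapped = static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
  int64_t back = static_cast<int64_t>(wrapped);  // implementation-defined
                                                 // before C++20; two's
                                                 // complement in practice
  // Prints -9223372036854775802: the wraparound of INT64_MAX + 7.
  std::printf("%lld\n", static_cast<long long>(back));
  return 0;
}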
@@ -765,7 +791,7 @@ CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);
 // GpuAtomicMax
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
-  return atomicMax(ptr, value);
+  return atomicMax(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 #if TENSORFLOW_USE_ROCM
@@ -817,11 +843,12 @@ __device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
 __device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
   return detail::GpuAtomicCasHelper(
-      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return max(a, value); });
 }
 
 __device__ inline int64 GpuAtomicMax(int64* ptr, int64 value) {
-  return detail::GpuAtomicCasHelper(ptr,
+  return detail::GpuAtomicCasHelper(detail::ToCudaSupportedPtr(ptr),
                                     [value](int64 a) { return max(a, value); });
 }
 #endif
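The uint64/int64 overloads above emulate atomic max with detail::GpuAtomicCasHelper and a max lambda rather than a native 64-bit atomicMax. A standalone sketch of the equivalent CAS loop for a signed 64-bit max (function name illustrative; the GpuAtomicMin overloads below are the mirror image):

// Standalone sketch of the emulation the int64 GpuAtomicMax overload builds
// out of GpuAtomicCasHelper: retry 64-bit atomicCAS until the stored value is
// the max of the old contents and the candidate.
__device__ long long AtomicMaxInt64ViaCas(long long* ptr, long long value) {
  unsigned long long* addr = reinterpret_cast<unsigned long long*>(ptr);
  unsigned long long old_bits = *addr;
  unsigned long long assumed;
  do {
    assumed = old_bits;
    long long current = static_cast<long long>(assumed);
    long long updated = current > value ? current : value;  // max(a, value)
    old_bits =
        atomicCAS(addr, assumed, static_cast<unsigned long long>(updated));
  } while (assumed != old_bits);  // another thread won the race; retry
  return static_cast<long long>(old_bits);  // previous value, like atomicMax
}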
@@ -830,7 +857,7 @@ CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);
 // GpuAtomicMin
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
-  return atomicMin(ptr, value);
+  return atomicMin(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 #if TENSORFLOW_USE_ROCM
@@ -882,11 +909,12 @@ __device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
 __device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
   return detail::GpuAtomicCasHelper(
-      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return min(a, value); });
 }
 
 __device__ inline int64 GpuAtomicMin(int64* ptr, int64 value) {
-  return detail::GpuAtomicCasHelper(ptr,
+  return detail::GpuAtomicCasHelper(detail::ToCudaSupportedPtr(ptr),
                                     [value](int64 a) { return min(a, value); });
 }
 #endif