Provide a template overload that works for all primitive types of GpuAtomic[Add|Sub|Min|Max]. CUDA only provides atomic intrinsics for a handful of these types, so we emulate the rest. In particular, CUDA provides no overloads for unsigned long or long.

PiperOrigin-RevId: 331918801
Change-Id: I7cc16d51d8e5d1710abe704ac2644248d26238f6
commit 541b4040fa (parent 85d50dbb7d)
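For context, a minimal sketch of what this enables at a call site: accumulating atomically into an unsigned long buffer through the single generic overload, with no per-type specialization. This is illustrative only; it assumes the wrappers come from tensorflow/core/util/gpu_device_functions.h and are compiled as CUDA inside TensorFlow, and the kernel and parameter names are made up.

// Illustrative caller (not part of this change). Assumes the GpuAtomic*
// wrappers live in tensorflow/core/util/gpu_device_functions.h and that this
// translation unit is compiled as CUDA within TensorFlow; the kernel and its
// parameters are hypothetical.
#include "tensorflow/core/util/gpu_device_functions.h"

namespace tensorflow {

// Each thread bumps one bucket of an unsigned long histogram. Before this
// change there was no GpuAtomicAdd overload for unsigned long; now the
// generic template remaps the pointer via detail::ToCudaSupportedPtr.
__global__ void HistogramKernel(const int* bucket_of, int n,
                                unsigned long* counts) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    GpuAtomicAdd(counts + bucket_of[i], 1ul);
  }
}

}  // namespace tensorflow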
@@ -606,7 +606,7 @@ __device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
   // HIP has a bug in the implementation of __longlong_as_double
   // So workaround it by using reinterpret_cast<double*>.
   uint64_t result =
-      GpuAtomicCasHelper(reinterpret_cast<tensorflow::uint64*>(ptr),
+      GpuAtomicCasHelper(reinterpret_cast<unsigned long long*>(ptr),
                          [accumulate](tensorflow::uint64 a) {
                            return __double_as_longlong(
                                accumulate(*(reinterpret_cast<double*>(&a))));
@@ -614,7 +614,7 @@ __device__ double GpuAtomicCasHelper(double* ptr, F accumulate) {
   return *(reinterpret_cast<double*>(&result));
 #else
   return __longlong_as_double(GpuAtomicCasHelper(
-      reinterpret_cast<tensorflow::uint64*>(ptr),
+      reinterpret_cast<unsigned long long*>(ptr),
       [accumulate](tensorflow::uint64 a) {
         return __double_as_longlong(accumulate(__longlong_as_double(a)));
       }));
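The two hunks above only change the pointer type handed to GpuAtomicCasHelper, from tensorflow::uint64* to unsigned long long*, the type CUDA's 64-bit atomicCAS actually operates on. For readers unfamiliar with the pattern being wrapped, here is a standalone sketch of the same compare-and-swap loop for a double, independent of the TensorFlow helpers (the function name is illustrative):

// Sketch of the CAS loop that GpuAtomicCasHelper encapsulates: apply an
// arbitrary update to a double by retrying 64-bit atomicCAS until no other
// thread has intervened. atomicCAS works on unsigned long long, so the
// double's bit pattern is moved back and forth with
// __longlong_as_double / __double_as_longlong.
__device__ double AtomicAddDoubleViaCas(double* ptr, double operand) {
  unsigned long long* addr = reinterpret_cast<unsigned long long*>(ptr);
  unsigned long long old_bits = *addr;
  unsigned long long assumed;
  do {
    assumed = old_bits;
    double updated = __longlong_as_double(assumed) + operand;  // the update step
    old_bits = atomicCAS(addr, assumed, __double_as_longlong(updated));
  } while (assumed != old_bits);  // another thread wrote in between; retry
  return __longlong_as_double(old_bits);  // value observed before our write
}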
@@ -676,6 +676,38 @@ template <typename From, typename To>
 using ToTypeIfConvertible =
     typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
 
+template <typename T>
+struct CudaSupportedTypeImpl {
+  using type = T;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long long> {
+  using type = unsigned long long;
+};
+
+template <>
+struct CudaSupportedTypeImpl<unsigned long> {
+  using type =
+      typename std::conditional<sizeof(unsigned long) == sizeof(unsigned int),
+                                unsigned int, unsigned long long>::type;
+};
+
+template <>
+struct CudaSupportedTypeImpl<long> {
+  // This cast should be safe since module-2 addition should work fine. However,
+  // signed overflow is not handled correctly since it's undefined behavior.
+  using type = typename CudaSupportedTypeImpl<unsigned long>::type;
+};
+
+template <typename T>
+using CudaSupportedType = typename CudaSupportedTypeImpl<T>::type;
+
+template <typename T>
+__device__ CudaSupportedType<T>* ToCudaSupportedPtr(T* ptr) {
+  return reinterpret_cast<CudaSupportedType<T>*>(ptr);
+}
+
 }  // namespace detail
 
 // CUDA provides atomic ops, but not for all types. We provide wrappers
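The trait added above never changes the width of the pointed-to object; it only renames the type to one that CUDA's atomic intrinsics accept. A few compile-time checks make the mapping explicit (a sketch, assuming the trait ends up in tensorflow::detail as the surrounding namespace suggests):

// Compile-time sanity checks for CudaSupportedType (illustrative; assumes the
// header defining the trait is included).
#include <type_traits>

namespace gd = tensorflow::detail;  // assumed namespace of the new trait

// Types CUDA already handles pass through untouched.
static_assert(std::is_same<gd::CudaSupportedType<float>, float>::value,
              "float is unchanged");

// long long is remapped to unsigned long long, for which atomicAdd exists.
static_assert(std::is_same<gd::CudaSupportedType<long long>,
                           unsigned long long>::value,
              "long long maps to unsigned long long");

// long / unsigned long pick whichever supported type has the same width, so
// the reinterpret_cast in ToCudaSupportedPtr never changes object size, on
// both LP64 (8-byte long) and LLP64 (4-byte long) platforms.
static_assert(sizeof(gd::CudaSupportedType<unsigned long>) ==
                  sizeof(unsigned long),
              "width preserved");
static_assert(sizeof(gd::CudaSupportedType<long>) == sizeof(long),
              "width preserved");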
@@ -683,13 +715,7 @@ using ToTypeIfConvertible =
 
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicAdd(T* ptr, U value) {
-  return atomicAdd(ptr, value);
-}
-
-__device__ inline int64 GpuAtomicAdd(int64* ptr, int64 value) {
-  // This cast should be safe since module-2 addition should work fine. However,
-  // signed overflow is not handled correctly since it's undefined behavior.
-  return atomicAdd(reinterpret_cast<uint64*>(ptr), static_cast<uint64>(value));
+  return atomicAdd(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 __device__ inline Eigen::half GpuAtomicAdd(Eigen::half* ptr,
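The dedicated int64 overload and its reinterpret_cast to uint64 are gone; the generic template now performs the same remapping through detail::ToCudaSupportedPtr. The comment's modular-arithmetic reasoning can be checked on the host: adding through the unsigned type and casting back reproduces two's-complement wraparound without invoking signed-overflow UB. A small illustrative snippet (not part of the change):

// Host-side illustration of the modular-arithmetic argument: unsigned 64-bit
// addition is defined to wrap modulo 2^64, and casting the wrapped sum back
// to a signed 64-bit value yields the same bit pattern as two's-complement
// signed addition would, without relying on signed overflow (which is UB).
#include <cstdint>
#include <cstdio>

int main() {
  int64_t a = INT64_MAX;  // deliberately at the signed boundary
  int64_t b = 7;
  uint64_t wrapped = static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
  int64_t back = static_cast<int64_t>(wrapped);  // implementation-defined
                                                 // before C++20; two's
                                                 // complement in practice
  // Prints -9223372036854775802: the wraparound of INT64_MAX + 7.
  std::printf("%lld\n", static_cast<long long>(back));
  return 0;
}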
@@ -765,7 +791,7 @@ CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicSub, CudaAtomicSub);
 // GpuAtomicMax
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMax(T* ptr, U value) {
-  return atomicMax(ptr, value);
+  return atomicMax(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 #if TENSORFLOW_USE_ROCM
@@ -817,11 +843,12 @@ __device__ inline Eigen::half GpuAtomicMax(Eigen::half* ptr,
 __device__ inline tensorflow::uint64 GpuAtomicMax(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
   return detail::GpuAtomicCasHelper(
-      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return max(a, value); });
 }
 
 __device__ inline int64 GpuAtomicMax(int64* ptr, int64 value) {
-  return detail::GpuAtomicCasHelper(ptr,
+  return detail::GpuAtomicCasHelper(detail::ToCudaSupportedPtr(ptr),
                                     [value](int64 a) { return max(a, value); });
 }
 #endif
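The uint64/int64 overloads above emulate atomic max with detail::GpuAtomicCasHelper and a max lambda rather than a native 64-bit atomicMax. A standalone sketch of the equivalent CAS loop for a signed 64-bit max (function name illustrative; the GpuAtomicMin overloads below are the mirror image):

// Standalone sketch of the emulation the int64 GpuAtomicMax overload builds
// out of GpuAtomicCasHelper: retry 64-bit atomicCAS until the stored value is
// the max of the old contents and the candidate.
__device__ long long AtomicMaxInt64ViaCas(long long* ptr, long long value) {
  unsigned long long* addr = reinterpret_cast<unsigned long long*>(ptr);
  unsigned long long old_bits = *addr;
  unsigned long long assumed;
  do {
    assumed = old_bits;
    long long current = static_cast<long long>(assumed);
    long long updated = current > value ? current : value;  // max(a, value)
    old_bits =
        atomicCAS(addr, assumed, static_cast<unsigned long long>(updated));
  } while (assumed != old_bits);  // another thread won the race; retry
  return static_cast<long long>(old_bits);  // previous value, like atomicMax
}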
@@ -830,7 +857,7 @@ CREATE_CUDA_DEVICE_FUNCTION_ALIAS(GpuAtomicMax, CudaAtomicMax);
 // GpuAtomicMin
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> GpuAtomicMin(T* ptr, U value) {
-  return atomicMin(ptr, value);
+  return atomicMin(detail::ToCudaSupportedPtr(ptr), value);
 }
 
 #if TENSORFLOW_USE_ROCM
@@ -882,11 +909,12 @@ __device__ inline Eigen::half GpuAtomicMin(Eigen::half* ptr,
 __device__ inline tensorflow::uint64 GpuAtomicMin(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
   return detail::GpuAtomicCasHelper(
-      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+      detail::ToCudaSupportedPtr(ptr),
+      [value](tensorflow::uint64 a) { return min(a, value); });
 }
 
 __device__ inline int64 GpuAtomicMin(int64* ptr, int64 value) {
-  return detail::GpuAtomicCasHelper(ptr,
+  return detail::GpuAtomicCasHelper(detail::ToCudaSupportedPtr(ptr),
                                     [value](int64 a) { return min(a, value); });
 }
 #endif