Merge pull request #42288 from ROCmSoftwarePlatform:google-upstream-rocm35-0812
PiperOrigin-RevId: 333738541 Change-Id: Ic848afd875bb7dee980bd7c47a637ae6189f90e3
This commit is contained in:
commit
eb0f1eda6d
tensorflow/core/kernels
@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
|
||||
// One extra line in the inner dimension to avoid share memory bank conflict.
|
||||
// This is to mimic the following, but no constructor of T can be invoked.
|
||||
// __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
|
||||
#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
#if GOOGLE_CUDA // || TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
__shared__ __align__(
|
||||
alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
|
||||
typedef T(*SharedMemoryTile)[TileSizeJ + 1];
|
||||
|
@ -234,11 +234,13 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
|
||||
const uint64* counter, random::PhiloxRandom gen,
|
||||
typename Distribution::ResultElementType* data, int64 size,
|
||||
Distribution dist) {
|
||||
if (size == 0) return;
|
||||
const int32 block_size = d.maxGpuThreadsPerBlock();
|
||||
const int32 num_blocks =
|
||||
(d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
|
||||
std::min<int64>(
|
||||
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
|
||||
size + block_size - 1) /
|
||||
block_size;
|
||||
|
||||
TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
|
||||
num_blocks, block_size, 0, d.stream(), key,
|
||||
counter, gen, data, size, dist));
|
||||
|
@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
|
||||
// This is to mimic the following, but without any constructors:
|
||||
// __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
|
||||
// (TF_RED_WARPSIZE+1)];
|
||||
#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
#if GOOGLE_CUDA
|
||||
__shared__ __align__(alignof(value_type)) char
|
||||
partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
|
||||
sizeof(value_type)];
|
||||
@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
|
||||
// This is to mimic the following, but without constructors:
|
||||
// __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
|
||||
// (TF_RED_WARPSIZE + 1)];
|
||||
#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
#if GOOGLE_CUDA
|
||||
__shared__ __align__(alignof(value_type)) char
|
||||
partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
|
||||
sizeof(value_type)];
|
||||
|
@ -279,7 +279,13 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
|
||||
GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
|
||||
num_blocks, block_size, 0, d.stream(), in.data(),
|
||||
out.data(), dimx, dimy, dimz, exclusive, reverse, op));
|
||||
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
// HIP-CLANG has some kind of problem here with 32 threads (possibly because
|
||||
// the warpsize is 64). Reenable when working properly
|
||||
} else if (true) {
|
||||
#else
|
||||
} else if (ideal_block_size >= 64) {
|
||||
#endif
|
||||
const int block_size = 64;
|
||||
TF_CHECK_OK(
|
||||
GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
|
||||
|
@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
|
||||
// due to subtraction of two close values. We have to get fancy
|
||||
root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
|
||||
? rsqrt_helper(im * im * r * r)
|
||||
: 1 + re * r)) *
|
||||
: max(T(0.0), 1 + re * r))) *
|
||||
root2;
|
||||
root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
|
||||
? rsqrt_helper(im * im * r * r)
|
||||
: 1 - re * r)) *
|
||||
: max(T(0.0), 1 - re * r))) *
|
||||
root2 * (im >= 0 ? -1. : 1.);
|
||||
return *(reinterpret_cast<std::complex<T>*>(&root));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user