Merge pull request from ROCmSoftwarePlatform:google-upstream-rocm35-0812

PiperOrigin-RevId: 333738541
Change-Id: Ic848afd875bb7dee980bd7c47a637ae6189f90e3
Author: TensorFlower Gardener
Date:   2020-09-25 08:48:20 -07:00
Commit: eb0f1eda6d
5 changed files with 15 additions and 7 deletions


@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   //   __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T (*SharedMemoryTile)[TileSizeJ + 1];
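
Note: this change leaves only the CUDA toolchain on the raw-byte path for the shared-memory tile. For readers unfamiliar with the trick the comment describes, here is a minimal, self-contained sketch; the kernel and its names (demo_transpose_tile, kTileI, kTileJ) are hypothetical and only illustrate why the buffer is declared as aligned chars rather than as T directly.

    // Hypothetical sketch only: T is assumed to have a non-trivial
    // constructor, which is exactly why `__shared__ T tile[...]` is illegal.
    template <typename T, int kTileI, int kTileJ>
    __global__ void demo_transpose_tile(const T* in, T* out) {
      // Raw, suitably aligned bytes; no T constructor runs for shared memory.
      __shared__ __align__(alignof(T))
          char raw[kTileI * (kTileJ + 1) * sizeof(T)];
      // The "+ 1" column is the padding the comment mentions: it staggers
      // rows across shared-memory banks so column accesses do not conflict.
      typedef T (*Tile)[kTileJ + 1];
      Tile tile = reinterpret_cast<Tile>(raw);
      int i = threadIdx.y, j = threadIdx.x;  // assumes blockDim = (kTileJ, kTileI)
      tile[i][j] = in[i * kTileJ + j];
      __syncthreads();
      out[j * kTileI + i] = tile[i][j];
    }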


@@ -234,11 +234,13 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     const uint64* counter, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
+  if (size == 0) return;
   const int32 block_size = d.maxGpuThreadsPerBlock();
   const int32 num_blocks =
-      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
-      block_size;
+      std::min<int64>(
+          d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+          size + block_size - 1) /
+      block_size;
   TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
                               num_blocks, block_size, 0, d.stream(), key,
                               counter, gen, data, size, dist));
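
Note: the two changes in this hunk make FillPhiloxRandom safe and cheap for tiny requests: it now returns immediately when size is zero (avoiding a zero-block kernel launch), and the grid is capped by the element count instead of always launching a full occupancy's worth of blocks. A small host-side sketch of the new block-count arithmetic, with device-query numbers invented for illustration:

    // Illustrative only; the SM and thread counts below are made up.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int64_t NumBlocks(int64_t size, int32_t block_size, int32_t num_sms,
                      int32_t threads_per_sm) {
      // Cap the launch by whichever is smaller: full GPU occupancy, or the
      // element count rounded up to whole blocks.
      return std::min<int64_t>(static_cast<int64_t>(num_sms) * threads_per_sm,
                               size + block_size - 1) /
             block_size;
    }

    int main() {
      // E.g. 80 SMs x 2048 resident threads, block_size 1024:
      printf("%lld\n", (long long)NumBlocks(100, 1024, 80, 2048));      // 1
      printf("%lld\n", (long long)NumBlocks(1 << 24, 1024, 80, 2048));  // 160
      return 0;
    }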


@@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
   // This is to mimic the following, but without any constructors:
   //   __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   //   (TF_RED_WARPSIZE+1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];
@@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
   // This is to mimic the following, but without constructors:
   //   __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   //   (TF_RED_WARPSIZE + 1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];
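
Note: both reduction hunks narrow the same guard from GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG to just GOOGLE_CUDA, so HIP-Clang now compiles whatever declaration lives in the #else branch, which is not visible in these hunks. The toy kernel below only mirrors the shape of that split; every name (kWarp, demo_partial_reduce) and the #else body are hypothetical, not TensorFlow's actual code.

    // Hypothetical guard structure, for illustration only.
    constexpr int kWarp = 32;

    template <typename V>
    __global__ void demo_partial_reduce(V* out) {
    #if GOOGLE_CUDA
      // CUDA path: raw aligned bytes, so no V constructors run in shared memory.
      __shared__ __align__(alignof(V)) char raw[kWarp * (kWarp + 1) * sizeof(V)];
      V* partial = reinterpret_cast<V*>(raw);
    #else
      // Other toolchains (now including HIP-Clang after this change) take a
      // directly typed declaration instead; shown here only as a placeholder.
      __shared__ V partial[kWarp * (kWarp + 1)];
    #endif
      partial[threadIdx.x] = V(threadIdx.x);
      __syncthreads();
      if (threadIdx.x == 0) *out = partial[0];
    }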


@@ -279,7 +279,13 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
         GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
                         num_blocks, block_size, 0, d.stream(), in.data(),
                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
+#if TENSORFLOW_COMPILER_IS_HIP_CLANG
+    // HIP-Clang has some kind of problem here with 32 threads (possibly
+    // because the warp size is 64). Re-enable when working properly.
+  } else if (true) {
+#else
   } else if (ideal_block_size >= 64) {
+#endif
     const int block_size = 64;
     TF_CHECK_OK(
         GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
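
Note: on HIP-Clang the else-if condition is replaced with true, so the 64-thread configuration also absorbs any case that would otherwise fall through to a smaller block size (which currently misbehaves there, plausibly because the wavefront size is 64). The sketch below is not TensorFlow code; PickBlockSize and its thresholds are invented purely to show what the preprocessor produces on each toolchain.

    // Purely illustrative of the `} else if (true) {` workaround.
    int PickBlockSize(int ideal_block_size) {
      if (ideal_block_size >= 128) return 128;
    #if TENSORFLOW_COMPILER_IS_HIP_CLANG
      // HIP-Clang: never fall through to the 32-thread case.
      else if (true) return 64;
    #else
      else if (ideal_block_size >= 64) return 64;
    #endif
      return 32;  // unreachable when TENSORFLOW_COMPILER_IS_HIP_CLANG is set
    }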


@@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
   // due to subtraction of two close values. We have to get fancy
   root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 + re * r)) *
+                          : max(T(0.0), 1 + re * r))) *
             root2;
   root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 - re * r)) *
+                          : max(T(0.0), 1 - re * r))) *
             root2 * (im >= 0 ? -1. : 1.);
   return *(reinterpret_cast<std::complex<T>*>(&root));
 }
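
Note: the max(T(0.0), ...) clamps guard the general branch of the square roots. When re * r rounds to just past -1 (or +1), the expressions 1 + re * r and 1 - re * r can come out as tiny negative numbers due to cancellation, and the square root of a negative real yields NaN. A small host-side demonstration, with the input value invented for illustration:

    // Illustrative only; shows why clamping to zero before sqrt matters.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      float re_r = -1.0000001f;   // hypothetical value just below -1
      float arg = 1.0f + re_r;    // rounds to roughly -1.2e-7
      printf("unclamped: %f\n", std::sqrt(arg));                   // nan
      printf("clamped:   %f\n", std::sqrt(std::max(0.0f, arg)));   // 0.0
      return 0;
    }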