Merge pull request #42288 from ROCmSoftwarePlatform:google-upstream-rocm35-0812

PiperOrigin-RevId: 333738541
Change-Id: Ic848afd875bb7dee980bd7c47a637ae6189f90e3

commit eb0f1eda6d
tensorflow/core/kernels

@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA // || TENSORFLOW_COMPILER_IS_HIP_CLANG
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
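
Note on the pattern above: declaring the tile as raw, suitably aligned bytes and reinterpreting it avoids running T's constructor in shared memory (CUDA rejects __shared__ variables of types with non-trivial constructors). A minimal standalone sketch of the same trick, assuming a block of TileSizeJ x TileSizeI threads; the kernel name and trivial copy body are hypothetical, not from the patch:

    template <typename T, int TileSizeI, int TileSizeJ>
    __global__ void TiledCopyKernel(const T* in, T* out) {
      // Stand-in for `__shared__ T tile[TileSizeI][TileSizeJ + 1];` that
      // never runs a constructor of T: raw bytes carrying T's alignment.
      __shared__ __align__(
          alignof(T)) char raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
      typedef T (*Tile)[TileSizeJ + 1];  // same typedef shape as the diff
      Tile tile = reinterpret_cast<Tile>(raw);
      const int i = threadIdx.y, j = threadIdx.x;
      if (i < TileSizeI && j < TileSizeJ)
        tile[i][j] = in[i * TileSizeJ + j];  // stage through shared memory
      __syncthreads();
      if (i < TileSizeI && j < TileSizeJ)
        out[i * TileSizeJ + j] = tile[i][j];
    }
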
@@ -234,11 +234,13 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     const uint64* counter, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
+  if (size == 0) return;
   const int32 block_size = d.maxGpuThreadsPerBlock();
   const int32 num_blocks =
-      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
+      std::min<int64>(
+          d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+          size + block_size - 1) /
       block_size;

   TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
                               num_blocks, block_size, 0, d.stream(), key,
                               counter, gen, data, size, dist));
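
The replaced grid computation caps the launch at what the device can keep resident while never requesting more threads than there are elements (the `size + block_size - 1` term becomes a ceiling division once divided by `block_size`), and the new early return skips the launch entirely for empty outputs. A host-side sketch of the same arithmetic, with illustrative parameter names:

    #include <algorithm>
    #include <cstdint>

    // Launch enough blocks to cover `size` elements (ceil division), but
    // never more than the device can keep resident at once.
    int64_t NumBlocks(int64_t size, int32_t block_size, int32_t num_sms,
                      int32_t max_threads_per_sm) {
      if (size == 0) return 0;  // mirrors the new `if (size == 0) return;`
      const int64_t resident_threads =
          static_cast<int64_t>(num_sms) * max_threads_per_sm;
      // min(resident threads, size rounded up to a full block) / block_size
      return std::min<int64_t>(resident_threads, size + block_size - 1) /
             block_size;
    }

Presumably the Philox kernel iterates with a grid-wide stride, so a grid capped below ceil(size / block_size) still covers every element.
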
@@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
   // This is to mimic the following, but without any constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE+1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];

@@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
   // This is to mimic the following, but without constructors:
   // __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
   // (TF_RED_WARPSIZE + 1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(alignof(value_type)) char
       partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                        sizeof(value_type)];
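
Both column-reduce hunks keep the TF_RED_WARPSIZE x (TF_RED_WARPSIZE + 1) scratch layout; the extra column staggers consecutive rows across shared-memory banks. Assuming the usual 32 banks of 4-byte words on NVIDIA hardware (AMD's 64-lane wavefronts bank differently, which may be part of why the HIP branch is being split off here), a compile-time illustration of the effect:

    // Element (row, col) of a float tile with the given row stride lives in
    // bank ((row * stride + col) % 32), assuming 32 four-byte banks.
    constexpr int Bank(int row, int col, int stride) {
      return (row * stride + col) % 32;
    }
    // stride 32: every row of a column hits the same bank (32-way conflict);
    // stride 33: rows of a column spread across banks.
    static_assert(Bank(0, 5, 32) == Bank(7, 5, 32), "stride 32: same bank");
    static_assert(Bank(0, 5, 33) != Bank(7, 5, 33), "stride 33: banks differ");
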
@@ -279,7 +279,13 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
     GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
                     num_blocks, block_size, 0, d.stream(), in.data(),
                     out.data(), dimx, dimy, dimz, exclusive, reverse, op));
+#if TENSORFLOW_COMPILER_IS_HIP_CLANG
+    // HIP-CLANG has some kind of problem here with 32 threads (possibly because
+    // the warpsize is 64). Reenable when working properly
+  } else if (true) {
+#else
   } else if (ideal_block_size >= 64) {
+#endif
     const int block_size = 64;
     TF_CHECK_OK(
         GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
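
The added preprocessor arms force the 64-thread branch whenever HIP-CLANG builds the file, per the in-diff comment about the 64-lane wavefront. The same intent can be expressed as a compile-time constant; the helper below is a hypothetical sketch, not something the patch introduces:

    // Smallest scan block size safe on the target: HIP wavefronts are 64
    // lanes wide, so the 32-thread path is skipped there for now.
    constexpr int MinScanBlockSize() {
    #if TENSORFLOW_COMPILER_IS_HIP_CLANG
      return 64;  // 32-thread blocks currently misbehave under HIP-CLANG
    #else
      return 32;  // CUDA warps are 32 lanes; 32-thread blocks are fine
    #endif
    }
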
@@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
   // due to subtraction of two close values. We have to get fancy
   root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 + re * r)) *
+                          : max(T(0.0), 1 + re * r))) *
            root2;
   root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
                           ? rsqrt_helper(im * im * r * r)
-                          : 1 - re * r)) *
+                          : max(T(0.0), 1 - re * r))) *
            root2 * (im >= 0 ? -1. : 1.);
   return *(reinterpret_cast<std::complex<T>*>(&root));
 }
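
The clamp matters when re * r sits within rounding error of ±1: `1 + re * r` (or `1 - re * r`) can then evaluate to a tiny negative number and feed sqrt a NaN. A small standalone reproduction of the hazard, with illustrative values rather than anything taken from the kernel:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      float re_times_r = std::nextafter(-1.0f, -2.0f);  // just below -1
      float raw = 1.0f + re_times_r;                    // tiny negative value
      std::printf("unclamped: %g\n", std::sqrt(raw));   // prints nan
      std::printf("clamped:   %g\n",
                  std::sqrt(std::max(0.0f, raw)));      // prints 0
      return 0;
    }
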