Merge pull request #42288 from ROCmSoftwarePlatform:google-upstream-rocm35-0812

PiperOrigin-RevId: 333738541 Change-Id: Ic848afd875bb7dee980bd7c47a637ae6189f90e3
2020-09-25 08:48:20 -07:00 · 2020-09-25 08:48:20 -07:00 · eb0f1eda6d
commit eb0f1eda6d
parent 4b6afcd5d7 142c5c10d4
5 changed files with 15 additions and 7 deletions
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
  // One extra line in the inner dimension to avoid share memory bank conflict.
  // This is to mimic the following, but no constructor of T can be invoked.
  //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
  __shared__ __align__(
      alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
  typedef T(*SharedMemoryTile)[TileSizeJ + 1];
--- a/tensorflow/core/kernels/random_op_gpu.h
+++ b/tensorflow/core/kernels/random_op_gpu.h
@ -234,11 +234,13 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
    const uint64* counter, random::PhiloxRandom gen,
    typename Distribution::ResultElementType* data, int64 size,
    Distribution dist) {
+  if (size == 0) return;
  const int32 block_size = d.maxGpuThreadsPerBlock();
  const int32 num_blocks =
-      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
+      std::min<int64>(
+          d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+          size + block_size - 1) /
      block_size;
-
  TF_CHECK_OK(GpuLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
                              num_blocks, block_size, 0, d.stream(), key,
                              counter, gen, data, size, dist));
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@ -276,7 +276,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceMax16ColumnsKernel(
    // This is to mimic the following, but without any constructors:
    //   __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
    //   (TF_RED_WARPSIZE+1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
  __shared__ __align__(alignof(value_type)) char
      partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                       sizeof(value_type)];
@ -337,7 +337,7 @@ __global__ __launch_bounds__(1024) void ColumnReduceKernel(
    // This is to mimic the following, but without constructors:
    //     __shared__ storage_type<value_type> partial_sums[TF_RED_WARPSIZE *
    //     (TF_RED_WARPSIZE + 1)];
-#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
  __shared__ __align__(alignof(value_type)) char
      partial_sums_raw[TF_RED_WARPSIZE * (TF_RED_WARPSIZE + 1) *
                       sizeof(value_type)];
--- a/tensorflow/core/kernels/scan_ops_gpu.h
+++ b/tensorflow/core/kernels/scan_ops_gpu.h
@ -279,7 +279,13 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
        GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
                        num_blocks, block_size, 0, d.stream(), in.data(),
                        out.data(), dimx, dimy, dimz, exclusive, reverse, op));
+#if TENSORFLOW_COMPILER_IS_HIP_CLANG
+    // HIP-CLANG has some kind of problem here with 32 threads (possibly because
+    // the warpsize is 64). Reenable when working properly
+  } else if (true) {
+#else
  } else if (ideal_block_size >= 64) {
+#endif
    const int block_size = 64;
    TF_CHECK_OK(
        GpuLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@ -175,11 +175,11 @@ __device__ std::complex<T> impl_rsqrt(std::complex<T> x) {
  // due to subtraction of two close values. We have to get fancy
  root[0] = sqrt(r * ((std::is_same<T, float>::value && re * r < -0.98)
                          ? rsqrt_helper(im * im * r * r)
-                          : 1 + re * r)) *
+                          : max(T(0.0), 1 + re * r))) *
            root2;
  root[1] = sqrt(r * ((std::is_same<T, float>::value && re * r > 0.98)
                          ? rsqrt_helper(im * im * r * r)
-                          : 1 - re * r)) *
+                          : max(T(0.0), 1 - re * r))) *
            root2 * (im >= 0 ? -1. : 1.);
  return *(reinterpret_cast<std::complex<T>*>(&root));
 }