[ROCm] Fix for ROCm CSB Breakage - 200630

The following commit (which switched G's internal CI to use ROCm 3.5) breaks the ROCm CSB build (which still uses ROCm 3.3):

22def20bae

This PR/commit simply puts back a couple of code blocks that were removed by the previous commit, and makes them conditional on ROCm 3.5.

Note that the ROCm CSB build will be switching to ROCm 3.5 or higher in the near future, at which point all code in the `true` block for `#if TENSORFLOW_COMPILER_IS_HIP_CLANG` will become the default, and the code in the `false` / `#else` block will be removed.
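For context, the guard pattern this change restores can be illustrated with the minimal, self-contained sketch below. This is an illustration only, not TensorFlow code; the function name and printed strings are made up. When TENSORFLOW_COMPILER_IS_HIP_CLANG is defined to a non-zero value (the ROCm 3.5+ / hip-clang toolchain), the `true` branch is compiled; otherwise the ROCm 3.3 fallback branch is compiled.

// Minimal sketch (not TensorFlow code) of the compile-time dispatch this
// commit restores. The ROCm 3.3 branch is the one slated for removal once
// the CSB build moves to ROCm 3.5 or higher.
#include <cstdio>

constexpr const char* RocmCodePath() {
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  return "hip-clang (ROCm 3.5+) path";
#else
  return "hcc-based (ROCm 3.3) fallback path";
#endif
}

int main() {
  std::printf("compiled code path: %s\n", RocmCodePath());
  return 0;
}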
Deven Desai 2020-06-30 19:52:19 +00:00
parent d8dcead440
commit a9798f4432
2 changed files with 36 additions and 1 deletion
tensorflow/core/util
third_party/gpus


@@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
      block_size_limit);
  CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
      block_size_limit);
  CHECK_EQ(err, hipSuccess);
#else
  // Earlier versions of this HIP routine incorrectly returned void.
  // TODO re-enable hipError_t error checking when HIP is fixed.
  // ROCm interface uses unsigned int, convert after checking
  uint32_t block_count_uint = 0;
  uint32_t thread_per_block_uint = 0;
  CHECK_GE(block_size_limit, 0);
  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
                                    func, dynamic_shared_memory_size,
                                    block_size_limit_uint);
  block_count = static_cast<int>(block_count_uint);
  thread_per_block = static_cast<int>(thread_per_block_uint);
#endif
#endif
  block_count =
@@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
      &block_count, func, fixed_block_size, dynamic_shared_memory_size);
  CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &block_count, func, fixed_block_size, dynamic_shared_memory_size);
  CHECK_EQ(err, hipSuccess);
#else
  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
  // that the kernel is quite simple and will largely be memory-limited.
  const int physical_thread_count = std::min(
      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
      work_element_count);
  // Assume the kernel be simple enough that it is okay to use 1024 threads
  // per workgroup.
  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
                         d.getNumGpuMultiProcessors());
#endif
#endif
  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                         DivUp(work_element_count, fixed_block_size));
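
For reference, the ROCm 3.3 fallback in the second hunk above replaces the occupancy API call with a simple heuristic. The standalone sketch below reproduces that arithmetic with hypothetical device numbers; the real code reads them from the Eigen::GpuDevice `d` (e.g. d.getNumGpuMultiProcessors()), so treat the constants and output purely as an illustration.

// Standalone sketch of the ROCm 3.3 fallback heuristic, using hypothetical
// device properties instead of an Eigen::GpuDevice.
#include <algorithm>
#include <cstdio>

int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Hypothetical device/workload values for illustration only.
  const int multiprocessors = 60;
  const int max_threads_per_multiprocessor = 2560;
  const int max_threads_per_block = 1024;
  const int work_element_count = 1 << 20;

  // Cap the thread count by what the device can physically run at once.
  const int physical_thread_count =
      std::min(multiprocessors * max_threads_per_multiprocessor,
               work_element_count);
  // Assume a simple, memory-bound kernel: use up to 1024 threads per block.
  const int thread_per_block = std::min(1024, max_threads_per_block);
  // Enough blocks to cover the work, but at most one block per multiprocessor.
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block),
               multiprocessors);

  std::printf("threads/block = %d, blocks = %d\n", thread_per_block,
              block_count);
  return 0;
}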


@@ -35,7 +35,7 @@ load(
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
_ROCM_TOOLKIT_PATH = "ROCM_PATH"
_TF_ROCM_VERSION = "TF_ROCM_VERSION"
_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
@@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")

    # Support hcc based off clang 10.0.0 (for ROCm 3.3)
    inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
    inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")

    # Add hcc headers
    inc_dirs.append(rocm_toolkit_path + "/hcc/include")

    return inc_dirs

def _enable_rocm(repository_ctx):