diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 0b943e917da..4c2df39e1a2 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
                                    block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
+#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, hipSuccess);
+#else
+  // Earlier versions of this HIP routine incorrectly returned void.
+  // TODO re-enable hipError_t error checking when HIP is fixed.
+  // ROCm interface uses unsigned int, convert after checking
+  uint32_t block_count_uint = 0;
+  uint32_t thread_per_block_uint = 0;
+  CHECK_GE(block_size_limit, 0);
+  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
+  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
+                                    func, dynamic_shared_memory_size,
+                                    block_size_limit_uint);
+  block_count = static_cast<int>(block_count_uint);
+  thread_per_block = static_cast<int>(thread_per_block_uint);
+#endif
 #endif
 
   block_count =
@@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
+#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, hipSuccess);
+#else
+  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
+  // that the kernel is quite simple and will largely be memory-limited.
+  const int physical_thread_count = std::min(
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
+      work_element_count);
+  // Assume the kernel is simple enough that it is okay to use 1024 threads
+  // per workgroup.
+  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
+  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
+                         d.getNumGpuMultiProcessors());
+#endif
 #endif
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 6a1204b87db..d28337de836 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -35,7 +35,7 @@ load(
 
 _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
 _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
-_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
+_ROCM_TOOLKIT_PATH = "ROCM_PATH"
 _TF_ROCM_VERSION = "TF_ROCM_VERSION"
 _TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
 _TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
@@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
     inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
     inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")
 
+    # Support hcc based off clang 10.0.0 (for ROCm 3.3)
+    inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
+    inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")
+
+    # Add hcc headers
+    inc_dirs.append(rocm_toolkit_path + "/hcc/include")
+
     return inc_dirs
 
 def _enable_rocm(repository_ctx):
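
Note on the first hunk: the non-hip-clang branch works around a HIP build in which hipOccupancyMaxPotentialBlockSize returned void and took unsigned int outputs. The standalone C++ sketch below illustrates that signed/unsigned bridging pattern; LegacyOccupancyQuery is a hypothetical stand-in for the old HIP entry point (not the real API), and the values it returns are made up for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the pre-hip-clang occupancy routine: returns
// void and traffics in unsigned ints, like the interface the patch targets.
void LegacyOccupancyQuery(uint32_t* block_count, uint32_t* thread_per_block,
                          uint32_t block_size_limit) {
  // Stub values; a real runtime would derive these from the kernel.
  *thread_per_block = (block_size_limit != 0 && block_size_limit < 256)
                          ? block_size_limit
                          : 256;
  *block_count = 40;
}

// Bridges the int-based caller to the unsigned interface, as in the patch.
void GetLaunchDims(int* block_count, int* thread_per_block,
                   int block_size_limit) {
  // Check the signed value before converting: a negative limit would
  // otherwise wrap around to a huge unsigned number (hence the CHECK_GE
  // in the patch).
  assert(block_size_limit >= 0);
  uint32_t block_count_uint = 0;
  uint32_t thread_per_block_uint = 0;
  LegacyOccupancyQuery(&block_count_uint, &thread_per_block_uint,
                       static_cast<uint32_t>(block_size_limit));
  *block_count = static_cast<int>(block_count_uint);
  *thread_per_block = static_cast<int>(thread_per_block_uint);
}

int main() {
  int blocks = 0, threads = 0;
  GetLaunchDims(&blocks, &threads, /*block_size_limit=*/0);
  std::printf("blocks=%d threads/block=%d\n", blocks, threads);
}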
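Note on the second hunk: the #else branch of GetGpuLaunchConfigFixedBlockSize falls back to the memory-bound heuristic from GetGpuLaunchConfig instead of querying occupancy. A minimal sketch of that arithmetic follows; the device numbers (60 compute units, 2560 threads per CU) are assumptions chosen for illustration, not values queried from any real GPU.

#include <algorithm>
#include <cstdio>

// Same rounding-up division the patch relies on (DivUp in TensorFlow).
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Hypothetical device properties.
  const int num_multiprocessors = 60;      // d.getNumGpuMultiProcessors()
  const int max_threads_per_mp = 2560;     // d.maxGpuThreadsPerMultiProcessor()
  const int max_threads_per_block = 1024;  // d.maxGpuThreadsPerBlock()
  const int work_element_count = 1 << 20;  // one element per thread

  // Cap the thread count at what the device can physically run at once.
  const int physical_thread_count =
      std::min(num_multiprocessors * max_threads_per_mp, work_element_count);

  // Assume a simple, memory-bound kernel: use the widest block allowed.
  const int thread_per_block = std::min(1024, max_threads_per_block);

  // Enough blocks to cover the physical threads, capped at one per CU
  // (the later context line then scales block_count by the CU count).
  const int block_count = std::min(
      DivUp(physical_thread_count, thread_per_block), num_multiprocessors);

  // With these numbers: min(DivUp(153600, 1024), 60) = min(150, 60) = 60.
  std::printf("blocks=%d threads/block=%d\n", block_count, thread_per_block);
}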