[ROCm] Fix for ROCm CSB Breakage - 200630

The following commit (which switched G's internal CI to use ROCm 3.5) breaks the ROCm CSB build (which still uses ROCm 3.3):

22def20bae

This PR/commit simply puts back a couple of code blocks that were removed by the previous commit, and makes them conditional on ROCm 3.5.

Note that the ROCm CSB build will be switching to ROCm 3.5 or higher in the near future, at which point all code in the `true` block for `#if TENSORFLOW_COMPILER_IS_HIP_CLANG` will become the default, and the code in the `false` / `#else` block will be removed.
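For context, the guard pattern this change restores can be illustrated with the minimal, self-contained sketch below. This is an illustration only, not TensorFlow code; the function name and printed strings are made up. When TENSORFLOW_COMPILER_IS_HIP_CLANG is defined to a non-zero value (the ROCm 3.5+ / hip-clang toolchain), the `true` branch is compiled; otherwise the ROCm 3.3 fallback branch is compiled.

// Minimal sketch (not TensorFlow code) of the compile-time dispatch this
// commit restores. The ROCm 3.3 branch is the one slated for removal once
// the CSB build moves to ROCm 3.5 or higher.
#include <cstdio>

constexpr const char* RocmCodePath() {
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  return "hip-clang (ROCm 3.5+) path";
#else
  return "hcc-based (ROCm 3.3) fallback path";
#endif
}

int main() {
  std::printf("compiled code path: %s\n", RocmCodePath());
  return 0;
}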
Deven Desai 2020-06-30 19:52:19 +00:00
parent d8dcead440
commit a9798f4432
2 changed files with 36 additions and 1 deletion
tensorflow/core/util
third_party/gpus


@@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
      block_size_limit);
  CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
      block_size_limit);
  CHECK_EQ(err, hipSuccess);
#else
  // Earlier versions of this HIP routine incorrectly returned void.
  // TODO re-enable hipError_t error checking when HIP is fixed.
  // ROCm interface uses unsigned int, convert after checking
  uint32_t block_count_uint = 0;
  uint32_t thread_per_block_uint = 0;
  CHECK_GE(block_size_limit, 0);
  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
                                    func, dynamic_shared_memory_size,
                                    block_size_limit_uint);
  block_count = static_cast<int>(block_count_uint);
  thread_per_block = static_cast<int>(thread_per_block_uint);
#endif
#endif
  block_count =
@@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
      &block_count, func, fixed_block_size, dynamic_shared_memory_size);
  CHECK_EQ(err, cudaSuccess);
#elif TENSORFLOW_USE_ROCM
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &block_count, func, fixed_block_size, dynamic_shared_memory_size);
  CHECK_EQ(err, hipSuccess);
#else
  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
  // that the kernel is quite simple and will largely be memory-limited.
  const int physical_thread_count = std::min(
      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
      work_element_count);
  // Assume the kernel be simple enough that it is okay to use 1024 threads
  // per workgroup.
  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
                         d.getNumGpuMultiProcessors());
#endif
#endif
  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                         DivUp(work_element_count, fixed_block_size));
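
For reference, the ROCm 3.3 fallback in the second hunk above replaces the occupancy API call with a simple heuristic. The standalone sketch below reproduces that arithmetic with hypothetical device numbers; the real code reads them from the Eigen::GpuDevice `d` (e.g. d.getNumGpuMultiProcessors()), so treat the constants and output purely as an illustration.

// Standalone sketch of the ROCm 3.3 fallback heuristic, using hypothetical
// device properties instead of an Eigen::GpuDevice.
#include <algorithm>
#include <cstdio>

int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Hypothetical device/workload values for illustration only.
  const int multiprocessors = 60;
  const int max_threads_per_multiprocessor = 2560;
  const int max_threads_per_block = 1024;
  const int work_element_count = 1 << 20;

  // Cap the thread count by what the device can physically run at once.
  const int physical_thread_count =
      std::min(multiprocessors * max_threads_per_multiprocessor,
               work_element_count);
  // Assume a simple, memory-bound kernel: use up to 1024 threads per block.
  const int thread_per_block = std::min(1024, max_threads_per_block);
  // Enough blocks to cover the work, but at most one block per multiprocessor.
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block),
               multiprocessors);

  std::printf("threads/block = %d, blocks = %d\n", thread_per_block,
              block_count);
  return 0;
}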


@@ -35,7 +35,7 @@ load(
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
_ROCM_TOOLKIT_PATH = "ROCM_PATH"
_TF_ROCM_VERSION = "TF_ROCM_VERSION"
_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
@@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")

    # Support hcc based off clang 10.0.0 (for ROCm 3.3)
    inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
    inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")

    # Add hcc headers
    inc_dirs.append(rocm_toolkit_path + "/hcc/include")

    return inc_dirs

def _enable_rocm(repository_ctx):