[ROCm] Fix for ROCm CSB Breakage - 200630
The following commit (which switched G's internal CI to use ROCm 3.5) breaks the ROCm CSB build (which still uses ROCm 3.3)
22def20bae
This PR/commit simply puts back a couple of pieces of code that were removed by the previous commit, and makes them conditional on ROCm 3.5.
Note that the ROCm CSB build will be switching to ROCm 3.5 or higher in the near future, at which point all code in the `true` block for `#if TENSORFLOW_COMPILER_IS_HIP_CLANG` will become the default, and the code in the `false / #else` block will be removed.
This commit is contained in:
parent
d8dcead440
commit
a9798f4432
@ -168,10 +168,25 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
|
||||
block_size_limit);
|
||||
CHECK_EQ(err, cudaSuccess);
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
hipError_t err = hipOccupancyMaxPotentialBlockSize(
|
||||
&block_count, &thread_per_block, func, dynamic_shared_memory_size,
|
||||
block_size_limit);
|
||||
CHECK_EQ(err, hipSuccess);
|
||||
#else
|
||||
// Earlier versions of this HIP routine incorrectly returned void.
|
||||
// TODO re-enable hipError_t error checking when HIP is fixed.
|
||||
// ROCm interface uses unsigned int, convert after checking
|
||||
uint32_t block_count_uint = 0;
|
||||
uint32_t thread_per_block_uint = 0;
|
||||
CHECK_GE(block_size_limit, 0);
|
||||
uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
|
||||
hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
|
||||
func, dynamic_shared_memory_size,
|
||||
block_size_limit_uint);
|
||||
block_count = static_cast<int>(block_count_uint);
|
||||
thread_per_block = static_cast<int>(thread_per_block_uint);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
block_count =
|
||||
@ -201,9 +216,22 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
|
||||
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
|
||||
CHECK_EQ(err, cudaSuccess);
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
|
||||
hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&block_count, func, fixed_block_size, dynamic_shared_memory_size);
|
||||
CHECK_EQ(err, hipSuccess);
|
||||
#else
|
||||
// Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
|
||||
// that the kernel is quite simple and will largely be memory-limited.
|
||||
const int physical_thread_count = std::min(
|
||||
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
|
||||
work_element_count);
|
||||
// Assume the kernel is simple enough that it is okay to use 1024 threads
|
||||
// per workgroup.
|
||||
int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
|
||||
block_count = std::min(DivUp(physical_thread_count, thread_per_block),
|
||||
d.getNumGpuMultiProcessors());
|
||||
#endif
|
||||
#endif
|
||||
block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
|
||||
DivUp(work_element_count, fixed_block_size));
|
||||
|
9
third_party/gpus/rocm_configure.bzl
vendored
9
third_party/gpus/rocm_configure.bzl
vendored
@ -35,7 +35,7 @@ load(
|
||||
|
||||
_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
|
||||
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
|
||||
_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
|
||||
_ROCM_TOOLKIT_PATH = "ROCM_PATH"
|
||||
_TF_ROCM_VERSION = "TF_ROCM_VERSION"
|
||||
_TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION"
|
||||
_TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
|
||||
@ -196,6 +196,13 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
|
||||
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
|
||||
inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")
|
||||
|
||||
# Support hcc based off clang 10.0.0 (for ROCm 3.3)
|
||||
inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
|
||||
inc_dirs.append(rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")
|
||||
|
||||
# Add hcc headers
|
||||
inc_dirs.append(rocm_toolkit_path + "/hcc/include")
|
||||
|
||||
return inc_dirs
|
||||
|
||||
def _enable_rocm(repository_ctx):
|
||||
|
Loading…
Reference in New Issue
Block a user