Removing references to TENSORFLOW_COMPILER_IS_HIP_CLANG
Now that we are well past the switch to ROCm 3.5 and above (i.e. hip-clang), the code within `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` blocks is always enabled, and the code within the corresponding `#else` blocks is dead code. This commit removes the references to `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` and deletes their corresponding `#else` blocks.
parent 147a175e37
commit 1498747b30
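For context, the shape of the cleanup is the same at every site: drop the always-true guard and delete the never-compiled `#else` branch. Below is a minimal, self-contained sketch of that pattern; `DoWork` and the hard-coded macro definition are illustrative stand-ins, not code from the TensorFlow tree.

```cpp
#include <cstdio>

// With ROCm 3.5+ builds the macro is always defined, so only the first branch
// was ever compiled and the #else branch was dead code; the commit keeps the
// first branch unconditionally and deletes the rest.
#define TENSORFLOW_COMPILER_IS_HIP_CLANG 1  // hard-coded here for illustration

void DoWork() {
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  std::puts("hip-clang path (always taken)");
#else
  std::puts("legacy HCC path (dead code)");
#endif
}

int main() { DoWork(); }
```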
@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
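The hunk above only drops a stale comment from the `#if GOOGLE_CUDA` guard, but the surrounding code uses a trick worth noting: shared memory cannot hold objects whose constructors would have to run, so the tile is declared as a raw, suitably aligned `char` buffer and reinterpreted as a 2-D array. A minimal device-side sketch of the same pattern follows; the kernel name and tile sizes are illustrative, not TensorFlow identifiers (build with nvcc, or with hipcc plus `#include <hip/hip_runtime.h>`).

```cpp
template <typename T, int TileI, int TileJ>
__global__ void TileKernelSketch() {
  // Same layout as `__shared__ T tile[TileI][TileJ + 1];`, but without running
  // T's constructor. The extra column avoids shared-memory bank conflicts.
  __shared__ __align__(alignof(T)) char raw[TileI * (TileJ + 1) * sizeof(T)];
  typedef T (*Tile)[TileJ + 1];
  Tile tile = reinterpret_cast<Tile>(raw);
  tile[threadIdx.y % TileI][threadIdx.x % (TileJ + 1)] = T(0);
}
```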
@@ -248,10 +248,8 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
   int num_blocks = dimx * dimz;
 
   int ideal_block_size = dimy / items_per_thread;
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   const int rocm_threads_per_warp = 64;
   ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp);
-#endif
 
   // There seems to be a bug when the type is not float and block_size 1024.
   // Launch on the smallest power of 2 block size that we can.
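The clamp kept above exists because wavefronts on the AMD GPUs targeted by ROCm are 64 threads wide, so a block smaller than that would leave part of a wavefront idle. A host-side sketch of the same arithmetic, with made-up `dimy` and `items_per_thread` values:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int dimy = 96;             // illustrative tensor dimension
  const int items_per_thread = 4;  // illustrative work per thread
  int ideal_block_size = dimy / items_per_thread;  // 24
  const int rocm_threads_per_warp = 64;            // AMD wavefront width
  ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp);
  std::printf("ideal_block_size = %d\n", ideal_block_size);  // prints 64
}
```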
@@ -36,11 +36,7 @@ string RocmRoot() {
 }
 
 string RocdlRoot() {
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "lib");
-#else
-  return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "hcc/lib");
-#endif
 }
 
 }  // namespace tensorflow
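With the hip-clang toolchain, the ROCm device-library bitcode that `RocdlRoot()` points at sits directly under `<ROCm root>/lib` rather than the old HCC-era `hcc/lib` subdirectory, which is why only the first branch survives. A small sketch of the two resulting paths, assuming a hypothetical install prefix of `/opt/rocm` and approximating `tensorflow::io::JoinPath` with `std::filesystem`:

```cpp
#include <filesystem>
#include <iostream>

int main() {
  const std::filesystem::path rocm_root = "/opt/rocm";  // assumed install prefix
  std::cout << (rocm_root / "lib").string() << "\n";          // kept:    /opt/rocm/lib
  std::cout << (rocm_root / "hcc" / "lib").string() << "\n";  // removed: /opt/rocm/hcc/lib
}
```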
@@ -168,25 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Earlier versions of this HIP routine incorrectly returned void.
-  // TODO re-enable hipError_t error checking when HIP is fixed.
-  // ROCm interface uses unsigned int, convert after checking
-  uint32_t block_count_uint = 0;
-  uint32_t thread_per_block_uint = 0;
-  CHECK_GE(block_size_limit, 0);
-  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-                                    func, dynamic_shared_memory_size,
-                                    block_size_limit_uint);
-  block_count = static_cast<int>(block_count_uint);
-  thread_per_block = static_cast<int>(thread_per_block_uint);
-#endif
 #endif
 
   block_count =
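The surviving branch relies on hip-clang's `hipOccupancyMaxPotentialBlockSize` matching the CUDA signature: `int*` outputs and a `hipError_t` return value, so the `uint32_t` round-trip needed for the old HCC interface goes away. A minimal standalone HIP sketch of that call (the kernel is a placeholder and the error handling is simplified):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void DummyKernel(float* out) { out[blockIdx.x] = 0.f; }

int main() {
  int block_count = 0;
  int thread_per_block = 0;
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, DummyKernel,
      /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
  if (err != hipSuccess) {
    std::fprintf(stderr, "occupancy query failed: %s\n", hipGetErrorString(err));
    return 1;
  }
  std::printf("block_count=%d thread_per_block=%d\n", block_count, thread_per_block);
  return 0;
}
```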
@@ -216,22 +201,9 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
-#endif
 #endif
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
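After the occupancy query, `block_count` holds blocks per multiprocessor; the tail of the hunk scales it by the multiprocessor count and caps it by the amount of available work. A host-side sketch of that clamp with made-up numbers (`DivUp` is reimplemented here so the example is self-contained):

```cpp
#include <algorithm>
#include <cstdio>

int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int blocks_per_multiprocessor = 4;  // e.g. the occupancy query result
  const int num_multiprocessors = 60;       // e.g. d.getNumGpuMultiProcessors()
  const int work_element_count = 1 << 20;   // illustrative workload
  const int fixed_block_size = 256;

  const int block_count =
      std::min(blocks_per_multiprocessor * num_multiprocessors,
               DivUp(work_element_count, fixed_block_size));
  std::printf("block_count = %d\n", block_count);  // min(240, 4096) = 240
}
```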