Removing references to TENSORFLOW_COMPILER_IS_HIP_CLANG

Now that we are well past the switch to ROCm 3.5 and above (i.e. hip-clang), the code within `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` blocks is always enabled, and the code within the corresponding `#else` blocks is dead code.

This commit removes the `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` guards and their corresponding `#else` blocks.
Deven Desai 2020-09-29 16:53:34 +00:00
parent 147a175e37
commit 1498747b30
4 changed files with 1 addition and 35 deletions
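For context, the guard pattern being deleted looks like the sketch below. This is an illustration only, not code from the diff: the `#define`, `DoHipClangPath`, and `DoLegacyHccPath` are placeholders. With ROCm 3.5+ only the hip-clang branch can ever be compiled, so the `#else` branch is dead code and both the guard and that branch can simply go away.

#define TENSORFLOW_COMPILER_IS_HIP_CLANG 1  // assumption for this sketch: always set with ROCm 3.5+

#include <cstdio>

void DoHipClangPath() { std::printf("hip-clang path\n"); }    // placeholder
void DoLegacyHccPath() { std::printf("legacy hcc path\n"); }  // placeholder, unreachable

int main() {
  // Before this commit a call site looked like this; the #else branch can never
  // be selected once hip-clang is the only supported ROCm compiler.
#if TENSORFLOW_COMPILER_IS_HIP_CLANG
  DoHipClangPath();
#else
  DoLegacyHccPath();
#endif
  // After this commit the guard and the dead branch are gone and the call is
  // simply: DoHipClangPath();
  return 0;
}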


@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA // || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];

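A minimal standalone sketch of the shared-memory trick shown in the hunk above: since T may have no default constructor, the tile is declared as raw, suitably aligned bytes and then reinterpreted as a 2-D array of T. TileCopyKernel, its indexing, and the launch shape are illustrative placeholders, not the TensorFlow kernel.

template <typename T, int TileSizeI, int TileSizeJ>
__global__ void TileCopyKernel(const T* in, T* out) {
  // Raw shared storage aligned for T; no constructors of T are invoked.
  __shared__ __align__(alignof(T))
      char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
  // View the bytes as a [TileSizeI][TileSizeJ + 1] tile; the extra column is
  // padding that avoids shared-memory bank conflicts.
  typedef T (*SharedMemoryTile)[TileSizeJ + 1];
  SharedMemoryTile tile = reinterpret_cast<SharedMemoryTile>(shared_mem_raw);

  int i = threadIdx.y;  // row within the tile
  int j = threadIdx.x;  // column within the tile
  if (i < TileSizeI && j < TileSizeJ) tile[i][j] = in[i * TileSizeJ + j];
  __syncthreads();
  if (i < TileSizeI && j < TileSizeJ) out[i * TileSizeJ + j] = tile[i][j];
}
// Illustrative launch shape: TileCopyKernel<float, 32, 32><<<1, dim3(32, 32)>>>(d_in, d_out);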

@@ -248,10 +248,8 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
   int num_blocks = dimx * dimz;
   int ideal_block_size = dimy / items_per_thread;
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   const int rocm_threads_per_warp = 64;
   ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp);
-#endif
   // There seems to be a bug when the type is not float and block_size 1024.
   // Launch on the smallest power of 2 block size that we can.

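A standalone sketch of the block-size heuristic kept above; PickScanBlockSize is an illustrative helper name, not a function in the diff. After this commit the clamp is simply part of the launch logic, no longer hidden behind the hip-clang guard.

#include <algorithm>

int PickScanBlockSize(int dimy, int items_per_thread) {
  int ideal_block_size = dimy / items_per_thread;
  // ROCm wavefronts are 64 lanes wide, so never pick a block narrower than a warp.
  const int rocm_threads_per_warp = 64;
  return std::max(ideal_block_size, rocm_threads_per_warp);
}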

@@ -36,11 +36,7 @@ string RocmRoot() {
 }
 string RocdlRoot() {
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "lib");
-#else
-  return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "hcc/lib");
-#endif
 }
 } // namespace tensorflow

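After this change RocdlRoot() always joins "lib" onto the ROCm root; the old hcc/lib fallback is gone. A hedged sketch of the resulting behavior, using a local JoinPath stand-in rather than tensorflow::io::JoinPath, and assuming "/opt/rocm" as the install root (the actual root is configurable and not set by this diff):

#include <iostream>
#include <string>

// Local stand-in for tensorflow::io::JoinPath, for this sketch only.
std::string JoinPath(const std::string& a, const std::string& b) {
  return a + "/" + b;
}

int main() {
  const std::string rocm_root = "/opt/rocm";  // assumed default ROCm install root
  // Mirrors the branch kept above: the ROCm device libraries live under <root>/lib.
  std::cout << JoinPath(rocm_root, "lib") << "\n";  // prints /opt/rocm/lib
  return 0;
}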

@@ -168,25 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Earlier versions of this HIP routine incorrectly returned void.
-  // TODO re-enable hipError_t error checking when HIP is fixed.
-  // ROCm interface uses unsigned int, convert after checking
-  uint32_t block_count_uint = 0;
-  uint32_t thread_per_block_uint = 0;
-  CHECK_GE(block_size_limit, 0);
-  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-                                    func, dynamic_shared_memory_size,
-                                    block_size_limit_uint);
-  block_count = static_cast<int>(block_count_uint);
-  thread_per_block = static_cast<int>(thread_per_block_uint);
-#endif
 #endif
   block_count =
@@ -216,22 +201,9 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
-#endif
 #endif
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
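A hedged, standalone sketch of the occupancy-driven launch configuration that the two hunks above leave as the only ROCm path. The AddOne kernel and the work size are placeholders, error handling is reduced to a single check, and this is not the TensorFlow implementation itself.

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdio>

__global__ void AddOne(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] += 1.0f;
}

int main() {
  const int work_element_count = 1 << 20;  // placeholder workload size
  int block_count = 0;
  int thread_per_block = 0;
  // Ask HIP for the block size that maximizes occupancy for this kernel; this call
  // used to sit behind TENSORFLOW_COMPILER_IS_HIP_CLANG and now runs unconditionally.
  hipError_t err = hipOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, AddOne, /*dynSharedMemPerBlk=*/0,
      /*blockSizeLimit=*/0);
  if (err != hipSuccess) return 1;
  // Cap the grid so we do not launch more blocks than there are elements to cover,
  // in the spirit of the DivUp-based capping after the #endif in the hunks above.
  block_count = std::min(
      block_count, (work_element_count + thread_per_block - 1) / thread_per_block);
  std::printf("blocks=%d threads_per_block=%d\n", block_count, thread_per_block);
  return 0;
}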