Removing references to TENSORFLOW_COMPILER_IS_HIP_CLANG
Now that we are well past the switch to ROCm 3.5 and above (i.e. hip-clang), the code within `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` blocks is always enabled, and the code within the corresponding `#else` blocks is dead code. This commit removes the references to `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` and deletes the corresponding `#else` blocks.
parent 147a175e37
commit 1498747b30
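For context, a minimal sketch of the pattern this commit retires; the two helper functions below are hypothetical stand-ins for whatever code each branch guarded, not TensorFlow functions:

    #include <cstdio>

    void UseHipClangPath() { std::puts("hip-clang path"); }
    void UseHccPath() { std::puts("hcc path"); }

    void Dispatch() {
    #if TENSORFLOW_COMPILER_IS_HIP_CLANG
      UseHipClangPath();  // always taken on ROCm 3.5+ builds
    #else
      UseHccPath();  // never compiled anymore: this branch is dead code
    #endif
    }

After the cleanup, only the hip-clang branch survives, with the guard removed.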
@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
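The declaration above uses a raw byte buffer so that no constructor of T runs for shared memory, then views the bytes as a two-dimensional tile with one padding column to avoid bank conflicts. A self-contained sketch of that trick (the kernel name and the cast line are illustrative, not the original code):

    template <typename T, int TileSizeI, int TileSizeJ>
    __global__ void TileSketchKernel() {
      // Raw storage: no T constructor is invoked for shared memory.
      __shared__ __align__(alignof(T))
          char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
      // View the bytes as a TileSizeI x (TileSizeJ + 1) array of T.
      typedef T (*SharedMemoryTile)[TileSizeJ + 1];
      SharedMemoryTile tile = reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
      (void)tile;  // tile[i][j] now indexes the padded tile
    }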
@@ -248,10 +248,8 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
   int num_blocks = dimx * dimz;

   int ideal_block_size = dimy / items_per_thread;
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   const int rocm_threads_per_warp = 64;
   ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp);
-#endif

   // There seems to be a bug when the type is not float and block_size 1024.
   // Launch on the smallest power of 2 block size that we can.
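With the guard gone, the 64-thread clamp is unconditional: ROCm wavefronts are 64 threads wide, so a narrower block would leave lanes idle. A sketch of the resulting heuristic (IdealScanBlockSize is a hypothetical helper distilled from the hunk above, not a TensorFlow function):

    #include <algorithm>

    int IdealScanBlockSize(int dimy, int items_per_thread) {
      int ideal_block_size = dimy / items_per_thread;
      // ROCm wavefronts are 64 threads wide; never launch narrower blocks.
      const int rocm_threads_per_warp = 64;
      return std::max(ideal_block_size, rocm_threads_per_warp);
    }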
@@ -36,11 +36,7 @@ string RocmRoot() {
 }

 string RocdlRoot() {
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "lib");
-#else
-  return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "hcc/lib");
-#endif
 }

 }  // namespace tensorflow
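After this change, RocdlRoot always resolves to <rocm-root>/lib; only the old HCC layout placed the device libraries under hcc/lib. A simplified sketch (JoinPath here is a trivial stand-in for tensorflow::io::JoinPath):

    #include <string>

    std::string JoinPath(const std::string& a, const std::string& b) {
      return a + "/" + b;  // stand-in; the real helper normalizes separators
    }

    std::string RocdlRoot(const std::string& rocm_root) {
      // Previously "hcc/lib" when built with HCC; now always "lib".
      return JoinPath(rocm_root, "lib");
    }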
@@ -168,25 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Earlier versions of this HIP routine incorrectly returned void.
-  // TODO re-enable hipError_t error checking when HIP is fixed.
-  // ROCm interface uses unsigned int, convert after checking
-  uint32_t block_count_uint = 0;
-  uint32_t thread_per_block_uint = 0;
-  CHECK_GE(block_size_limit, 0);
-  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-                                    func, dynamic_shared_memory_size,
-                                    block_size_limit_uint);
-  block_count = static_cast<int>(block_count_uint);
-  thread_per_block = static_cast<int>(thread_per_block_uint);
-#endif
 #endif

   block_count =
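The surviving hip-clang branch mirrors the CUDA path: hipOccupancyMaxPotentialBlockSize returns a hipError_t and takes plain ints, so the unsigned-conversion workaround in the deleted #else block is no longer needed. A self-contained sketch of the call (EmptyKernel and PickLaunchDims are illustrative names):

    #include <hip/hip_runtime.h>

    __global__ void EmptyKernel() {}

    void PickLaunchDims(int* block_count, int* thread_per_block) {
      // Ask HIP for an occupancy-maximizing grid/block suggestion.
      hipError_t err = hipOccupancyMaxPotentialBlockSize(
          block_count, thread_per_block, EmptyKernel,
          /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
      (void)err;  // the TensorFlow code does CHECK_EQ(err, hipSuccess)
    }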
@@ -216,22 +201,9 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
-#endif
 #endif
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
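Likewise, the fixed-block-size variant now relies solely on the occupancy query rather than the hand-rolled heuristic in the deleted branch. A sketch of how the per-multiprocessor count scales to a device-wide block count, as in the hunk above (names other than the HIP API are illustrative):

    #include <hip/hip_runtime.h>

    __global__ void EmptyKernel() {}

    int BlocksForFixedSize(int fixed_block_size, int multiprocessor_count) {
      int blocks_per_sm = 0;
      // How many blocks of this size fit on one multiprocessor?
      hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
          &blocks_per_sm, EmptyKernel, fixed_block_size,
          /*dynSharedMemPerBlk=*/0);
      if (err != hipSuccess) return 0;
      return blocks_per_sm * multiprocessor_count;
    }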