From 1498747b3022a59da5939ec6bb5c1287b09c58b1 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Tue, 29 Sep 2020 16:53:34 +0000 Subject: [PATCH] Removing references to TENSORFLOW_COMPILER_IS_HIP_CLANG Now that we are well past the switch to ROCm 3.5 and above (i.e. hip-clang), the code within `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` is always enabled, and the code within the corresponding `#else` blocks is dead code. This commit removes the references to `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` and their corresponding `#else` blocks. --- tensorflow/core/kernels/conv_2d_gpu.h | 2 +- tensorflow/core/kernels/scan_ops_gpu.h | 2 -- .../core/platform/default/rocm_rocdl_path.cc | 4 --- tensorflow/core/util/gpu_launch_config.h | 28 ------------------- 4 files changed, 1 insertion(+), 35 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 1ed88ca753c..67126f31e27 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( // One extra line in the inner dimension to avoid share memory bank conflict. // This is to mimic the following, but no constructor of T can be invoked. 
// __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1]; -#if GOOGLE_CUDA // || TENSORFLOW_COMPILER_IS_HIP_CLANG +#if GOOGLE_CUDA __shared__ __align__( alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)]; typedef T(*SharedMemoryTile)[TileSizeJ + 1]; diff --git a/tensorflow/core/kernels/scan_ops_gpu.h b/tensorflow/core/kernels/scan_ops_gpu.h index f99f8af3190..7914b7a1103 100644 --- a/tensorflow/core/kernels/scan_ops_gpu.h +++ b/tensorflow/core/kernels/scan_ops_gpu.h @@ -248,10 +248,8 @@ void LaunchScan(const GPUDevice& d, typename TTypes::ConstTensor in, int num_blocks = dimx * dimz; int ideal_block_size = dimy / items_per_thread; -#if TENSORFLOW_COMPILER_IS_HIP_CLANG const int rocm_threads_per_warp = 64; ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp); -#endif // There seems to be a bug when the type is not float and block_size 1024. // Launch on the smallest power of 2 block size that we can. diff --git a/tensorflow/core/platform/default/rocm_rocdl_path.cc b/tensorflow/core/platform/default/rocm_rocdl_path.cc index 9e9261d26c8..948c0e5c4ce 100644 --- a/tensorflow/core/platform/default/rocm_rocdl_path.cc +++ b/tensorflow/core/platform/default/rocm_rocdl_path.cc @@ -36,11 +36,7 @@ string RocmRoot() { } string RocdlRoot() { -#if TENSORFLOW_COMPILER_IS_HIP_CLANG return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "lib"); -#else - return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "hcc/lib"); -#endif } } // namespace tensorflow diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h index 4c2df39e1a2..0b943e917da 100644 --- a/tensorflow/core/util/gpu_launch_config.h +++ b/tensorflow/core/util/gpu_launch_config.h @@ -168,25 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count, block_size_limit); CHECK_EQ(err, cudaSuccess); #elif TENSORFLOW_USE_ROCM -#if TENSORFLOW_COMPILER_IS_HIP_CLANG hipError_t err = hipOccupancyMaxPotentialBlockSize( &block_count, 
&thread_per_block, func, dynamic_shared_memory_size, block_size_limit); CHECK_EQ(err, hipSuccess); -#else - // Earlier versions of this HIP routine incorrectly returned void. - // TODO re-enable hipError_t error checking when HIP is fixed. - // ROCm interface uses unsigned int, convert after checking - uint32_t block_count_uint = 0; - uint32_t thread_per_block_uint = 0; - CHECK_GE(block_size_limit, 0); - uint32_t block_size_limit_uint = static_cast(block_size_limit); - hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint, - func, dynamic_shared_memory_size, - block_size_limit_uint); - block_count = static_cast(block_count_uint); - thread_per_block = static_cast(thread_per_block_uint); -#endif #endif block_count = @@ -216,22 +201,9 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize( &block_count, func, fixed_block_size, dynamic_shared_memory_size); CHECK_EQ(err, cudaSuccess); #elif TENSORFLOW_USE_ROCM -#if TENSORFLOW_COMPILER_IS_HIP_CLANG hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( &block_count, func, fixed_block_size, dynamic_shared_memory_size); CHECK_EQ(err, hipSuccess); -#else - // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&) - // that the kernel is quite simple and will largely be memory-limited. - const int physical_thread_count = std::min( - d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), - work_element_count); - // Assume the kernel be simple enough that it is okay to use 1024 threads - // per workgroup. - int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock()); - block_count = std::min(DivUp(physical_thread_count, thread_per_block), - d.getNumGpuMultiProcessors()); -#endif #endif block_count = std::min(block_count * d.getNumGpuMultiProcessors(), DivUp(work_element_count, fixed_block_size));