Removing references to TENSORFLOW_COMPILER_IS_HIP_CLANG
Now that we are well past the switch to ROCm 3.5 and above (i.e. hip-clang), the code within `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` blocks is always enabled, and the code within the corresponding `#else` blocks is dead code. This commit removes the references to `#ifdef TENSORFLOW_COMPILER_IS_HIP_CLANG` and deletes the corresponding `#else` blocks.
parent 147a175e37
commit 1498747b30
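For context, a minimal sketch of the pattern this commit retires; the two helper functions below are hypothetical stand-ins for whatever code each branch guarded, not TensorFlow functions:

    #include <cstdio>

    void UseHipClangPath() { std::puts("hip-clang path"); }
    void UseHccPath() { std::puts("hcc path"); }

    void Dispatch() {
    #if TENSORFLOW_COMPILER_IS_HIP_CLANG
      UseHipClangPath();  // always taken on ROCm 3.5+ builds
    #else
      UseHccPath();  // never compiled anymore: this branch is dead code
    #endif
    }

After the cleanup, only the hip-clang branch survives, with the guard removed.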
@@ -287,7 +287,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA  // || TENSORFLOW_COMPILER_IS_HIP_CLANG
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
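The declaration above uses a raw byte buffer so that no constructor of T runs for shared memory, then views the bytes as a two-dimensional tile with one padding column to avoid bank conflicts. A self-contained sketch of that trick (the kernel name and the cast line are illustrative, not the original code):

    template <typename T, int TileSizeI, int TileSizeJ>
    __global__ void TileSketchKernel() {
      // Raw storage: no T constructor is invoked for shared memory.
      __shared__ __align__(alignof(T))
          char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
      // View the bytes as a TileSizeI x (TileSizeJ + 1) array of T.
      typedef T (*SharedMemoryTile)[TileSizeJ + 1];
      SharedMemoryTile tile = reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
      (void)tile;  // tile[i][j] now indexes the padded tile
    }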
@@ -248,10 +248,8 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
   int num_blocks = dimx * dimz;

   int ideal_block_size = dimy / items_per_thread;
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   const int rocm_threads_per_warp = 64;
   ideal_block_size = std::max(ideal_block_size, rocm_threads_per_warp);
-#endif

   // There seems to be a bug when the type is not float and block_size 1024.
   // Launch on the smallest power of 2 block size that we can.
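With the guard gone, the 64-thread clamp is unconditional: ROCm wavefronts are 64 threads wide, so a narrower block would leave lanes idle. A sketch of the resulting heuristic (IdealScanBlockSize is a hypothetical helper distilled from the hunk above, not a TensorFlow function):

    #include <algorithm>

    int IdealScanBlockSize(int dimy, int items_per_thread) {
      int ideal_block_size = dimy / items_per_thread;
      // ROCm wavefronts are 64 threads wide; never launch narrower blocks.
      const int rocm_threads_per_warp = 64;
      return std::max(ideal_block_size, rocm_threads_per_warp);
    }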
@@ -36,11 +36,7 @@ string RocmRoot() {
 }

 string RocdlRoot() {
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "lib");
-#else
-  return tensorflow::io::JoinPath(tensorflow::RocmRoot(), "hcc/lib");
-#endif
 }

 }  // namespace tensorflow
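After this change, RocdlRoot always resolves to <rocm-root>/lib; only the old HCC layout placed the device libraries under hcc/lib. A simplified sketch (JoinPath here is a trivial stand-in for tensorflow::io::JoinPath):

    #include <string>

    std::string JoinPath(const std::string& a, const std::string& b) {
      return a + "/" + b;  // stand-in; the real helper normalizes separators
    }

    std::string RocdlRoot(const std::string& rocm_root) {
      // Previously "hcc/lib" when built with HCC; now always "lib".
      return JoinPath(rocm_root, "lib");
    }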
@@ -168,25 +168,10 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxPotentialBlockSize(
       &block_count, &thread_per_block, func, dynamic_shared_memory_size,
       block_size_limit);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Earlier versions of this HIP routine incorrectly returned void.
-  // TODO re-enable hipError_t error checking when HIP is fixed.
-  // ROCm interface uses unsigned int, convert after checking
-  uint32_t block_count_uint = 0;
-  uint32_t thread_per_block_uint = 0;
-  CHECK_GE(block_size_limit, 0);
-  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
-  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
-                                    func, dynamic_shared_memory_size,
-                                    block_size_limit_uint);
-  block_count = static_cast<int>(block_count_uint);
-  thread_per_block = static_cast<int>(thread_per_block_uint);
-#endif
 #endif

   block_count =
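The surviving hip-clang branch mirrors the CUDA path: hipOccupancyMaxPotentialBlockSize returns a hipError_t and takes plain ints, so the unsigned-conversion workaround in the deleted #else block is no longer needed. A self-contained sketch of the call (EmptyKernel and PickLaunchDims are illustrative names):

    #include <hip/hip_runtime.h>

    __global__ void EmptyKernel() {}

    void PickLaunchDims(int* block_count, int* thread_per_block) {
      // Ask HIP for an occupancy-maximizing grid/block suggestion.
      hipError_t err = hipOccupancyMaxPotentialBlockSize(
          block_count, thread_per_block, EmptyKernel,
          /*dynSharedMemPerBlk=*/0, /*blockSizeLimit=*/0);
      (void)err;  // the TensorFlow code does CHECK_EQ(err, hipSuccess)
    }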
@@ -216,22 +201,9 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-#if TENSORFLOW_COMPILER_IS_HIP_CLANG
   hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, hipSuccess);
-#else
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
-#endif
 #endif
   block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
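Likewise, the fixed-block-size variant now relies solely on the occupancy query rather than the hand-rolled heuristic in the deleted branch. A sketch of how the per-multiprocessor count scales to a device-wide block count, as in the hunk above (names other than the HIP API are illustrative):

    #include <hip/hip_runtime.h>

    __global__ void EmptyKernel() {}

    int BlocksForFixedSize(int fixed_block_size, int multiprocessor_count) {
      int blocks_per_sm = 0;
      // How many blocks of this size fit on one multiprocessor?
      hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
          &blocks_per_sm, EmptyKernel, fixed_block_size,
          /*dynSharedMemPerBlk=*/0);
      if (err != hipSuccess) return 0;
      return blocks_per_sm * multiprocessor_count;
    }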