Merge pull request from ROCmSoftwarePlatform:google_upstream_rocm_documentation

PiperOrigin-RevId: 288929060
Change-Id: I9179465fb654df9130bb51c54fd9a35a3cd8f0c1
commit 33963418c6
Author: TensorFlower Gardener
Date:   2020-01-09 10:57:22 -08:00


@@ -168,23 +168,18 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
-  // implemented
-  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
-  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-  //    block_size_limit);
-  // CHECK_EQ(err, hipSuccess);
-
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel is simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
+  // Earlier versions of this HIP routine incorrectly returned void.
+  // TODO re-enable hipError_t error checking when HIP is fixed.
+  // ROCm interface uses unsigned int, convert after checking
+  uint32_t block_count_uint = 0;
+  uint32_t thread_per_block_uint = 0;
+  CHECK_GE(block_size_limit, 0);
+  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
+  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
+                                    func, dynamic_shared_memory_size,
+                                    block_size_limit_uint);
+  block_count = static_cast<int>(block_count_uint);
+  thread_per_block = static_cast<int>(thread_per_block_uint);
 #endif
   block_count =
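
For readers less familiar with the occupancy API being adopted here, the following is a minimal standalone sketch, not TensorFlow code: the kernel, buffer size, and DivUp helper are made up for illustration. It shows the same pattern on the CUDA side, where cudaOccupancyMaxPotentialBlockSize already takes int* arguments and returns cudaError_t, so none of the unsigned-int conversions from the ROCm branch above are needed. The final clamp assumes the caller limits block_count by the amount of work, which is what the truncated block_count = context line after the #endif appears to begin.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical element-wise kernel; it stands in for `func` in the diff above.
__global__ void ScaleKernel(float* data, int n, float factor) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    data[i] *= factor;
  }
}

// Integer ceiling division, analogous to the DivUp used in the diff.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int work_element_count = 1 << 20;
  float* data = nullptr;
  cudaMalloc(&data, work_element_count * sizeof(float));

  // Occupancy-based launch configuration: the CUDA analogue of the
  // hipOccupancyMaxPotentialBlockSize call added by this commit.
  int block_count = 0;
  int thread_per_block = 0;
  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, ScaleKernel,
      /*dynamicSMemSize=*/0, /*blockSizeLimit=*/0);
  if (err != cudaSuccess) {
    fprintf(stderr, "occupancy query failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Do not launch more blocks than there is work for.
  block_count =
      std::min(block_count, DivUp(work_element_count, thread_per_block));

  ScaleKernel<<<block_count, thread_per_block>>>(data, work_element_count,
                                                 2.0f);
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}

On ROCm, the equivalent query is hipOccupancyMaxPotentialBlockSize; the extra uint32_t conversions in the diff exist only because the HIP signature targeted by this change takes unsigned arguments, as the added comments note.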