Merge pull request from ROCmSoftwarePlatform:google_upstream_rocm_documentation

PiperOrigin-RevId: 288929060
Change-Id: I9179465fb654df9130bb51c54fd9a35a3cd8f0c1
commit 33963418c6
Author: TensorFlower Gardener
Date:   2020-01-09 10:57:22 -08:00


@@ -168,23 +168,18 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
-  // implemented
-  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
-  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-  //    block_size_limit);
-  // CHECK_EQ(err, hipSuccess);
-
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel is simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
+  // Earlier versions of this HIP routine incorrectly returned void.
+  // TODO re-enable hipError_t error checking when HIP is fixed.
+  // ROCm interface uses unsigned int, convert after checking
+  uint32_t block_count_uint = 0;
+  uint32_t thread_per_block_uint = 0;
+  CHECK_GE(block_size_limit, 0);
+  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
+  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
+                                    func, dynamic_shared_memory_size,
+                                    block_size_limit_uint);
+  block_count = static_cast<int>(block_count_uint);
+  thread_per_block = static_cast<int>(thread_per_block_uint);
 #endif
   block_count =
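
For readers less familiar with the occupancy API being adopted here, the following is a minimal standalone sketch, not TensorFlow code: the kernel, buffer size, and DivUp helper are made up for illustration. It shows the same pattern on the CUDA side, where cudaOccupancyMaxPotentialBlockSize already takes int* arguments and returns cudaError_t, so none of the unsigned-int conversions from the ROCm branch above are needed. The final clamp assumes the caller limits block_count by the amount of work, which is what the truncated block_count = context line after the #endif appears to begin.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical element-wise kernel; it stands in for `func` in the diff above.
__global__ void ScaleKernel(float* data, int n, float factor) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    data[i] *= factor;
  }
}

// Integer ceiling division, analogous to the DivUp used in the diff.
int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int work_element_count = 1 << 20;
  float* data = nullptr;
  cudaMalloc(&data, work_element_count * sizeof(float));

  // Occupancy-based launch configuration: the CUDA analogue of the
  // hipOccupancyMaxPotentialBlockSize call added by this commit.
  int block_count = 0;
  int thread_per_block = 0;
  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, ScaleKernel,
      /*dynamicSMemSize=*/0, /*blockSizeLimit=*/0);
  if (err != cudaSuccess) {
    fprintf(stderr, "occupancy query failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Do not launch more blocks than there is work for.
  block_count =
      std::min(block_count, DivUp(work_element_count, thread_per_block));

  ScaleKernel<<<block_count, thread_per_block>>>(data, work_element_count,
                                                 2.0f);
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}

On ROCm, the equivalent query is hipOccupancyMaxPotentialBlockSize; the extra uint32_t conversions in the diff exist only because the HIP signature targeted by this change takes unsigned arguments, as the added comments note.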