Merge pull request #35401 from ROCmSoftwarePlatform:google_upstream_rocm_documentation

PiperOrigin-RevId: 288929060
Change-Id: I9179465fb654df9130bb51c54fd9a35a3cd8f0c1
TensorFlower Gardener 2020-01-09 10:57:22 -08:00
commit 33963418c6

@@ -168,23 +168,18 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
       block_size_limit);
   CHECK_EQ(err, cudaSuccess);
 #elif TENSORFLOW_USE_ROCM
-  // ROCM TODO re-enable this after hipOccupancyMaxPotentialBlockSize is
-  // implemented
-  // hipError_t err = hipOccupancyMaxPotentialBlockSize(
-  //    &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-  //    block_size_limit);
-  // CHECK_EQ(err, hipSuccess);
-
-  // Apply the heuristic in GetGpuLaunchConfig(int, const Eigen::GpuDevice&)
-  // that the kernel is quite simple and will largely be memory-limited.
-  const int physical_thread_count = std::min(
-      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
-      work_element_count);
-  // Assume the kernel be simple enough that it is okay to use 1024 threads
-  // per workgroup.
-  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
-  block_count = std::min(DivUp(physical_thread_count, thread_per_block),
-                         d.getNumGpuMultiProcessors());
+  // Earlier versions of this HIP routine incorrectly returned void.
+  // TODO re-enable hipError_t error checking when HIP is fixed.
+  // ROCm interface uses unsigned int, convert after checking
+  uint32_t block_count_uint = 0;
+  uint32_t thread_per_block_uint = 0;
+  CHECK_GE(block_size_limit, 0);
+  uint32_t block_size_limit_uint = static_cast<uint32_t>(block_size_limit);
+  hipOccupancyMaxPotentialBlockSize(&block_count_uint, &thread_per_block_uint,
+                                    func, dynamic_shared_memory_size,
+                                    block_size_limit_uint);
+  block_count = static_cast<int>(block_count_uint);
+  thread_per_block = static_cast<int>(thread_per_block_uint);
 #endif
 
   block_count =
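
For context, the pattern the new lines introduce can be exercised outside TensorFlow. Below is a minimal, self-contained sketch, assuming the HIP headers this commit targets (where hipOccupancyMaxPotentialBlockSize writes through uint32_t out-parameters and its return value cannot yet be checked, exactly as the diff shows). MyDoubleKernel and GetLaunchShape are illustrative names, not TensorFlow code, and assert() stands in for TensorFlow's CHECK_GE macro.

    #include <cassert>
    #include <cstdint>
    #include <hip/hip_runtime.h>

    // A trivial stand-in kernel; occupancy is queried per kernel function.
    __global__ void MyDoubleKernel(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = 2.0f * in[i];
    }

    void GetLaunchShape(int dynamic_shared_memory_size, int block_size_limit,
                        int* block_count, int* thread_per_block) {
      // HIP reports occupancy through unsigned ints, while the callers here
      // use plain int, so convert at the API boundary.
      assert(block_size_limit >= 0);  // a negative limit would wrap when cast
      uint32_t block_count_uint = 0;
      uint32_t thread_per_block_uint = 0;
      hipOccupancyMaxPotentialBlockSize(
          &block_count_uint, &thread_per_block_uint, MyDoubleKernel,
          dynamic_shared_memory_size,
          static_cast<uint32_t>(block_size_limit));
      *block_count = static_cast<int>(block_count_uint);
      *thread_per_block = static_cast<int>(thread_per_block_uint);
    }

Converting once at the call boundary keeps every other computation on block_count and thread_per_block signed, and the up-front check guards against a negative block_size_limit wrapping to a huge unsigned value.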