Move CUDA-specific occupancy calculation into proper file

-Maintain functionality, just move CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor
-Remove CUDA requirement in general class device_description
This commit is contained in:
Matt Conley 2018-09-04 14:20:40 -07:00
parent e93a9f9ccf
commit fa20b59b92
4 changed files with 48 additions and 49 deletions

View File

@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
}
}
// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err =
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
func, NULL, shared_memory_per_block, 0);
CHECK_EQ(err, CUDA_SUCCESS);
return suggested_blocks;
}
// Compute and return the suggested thread count to acheive ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err =
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
func, NULL, shared_memory_per_block, 0);
CHECK_EQ(err, CUDA_SUCCESS);
if (suggested_blocks > *initial_blocks) {
*initial_blocks = suggested_blocks;
return suggested_threads;
} else {
return 0;
}
}
// Forwards an allocation request of `size` bytes to
// CUDADriver::DeviceAllocate using this executor's context_, and returns
// whatever pointer the driver wrapper hands back.
void *CUDAExecutor::Allocate(uint64 size) {
  void *allocation = CUDADriver::DeviceAllocate(context_, size);
  return allocation;
}

View File

@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
// Returns the block count suggested by cuOccupancyMaxPotentialBlockSize for
// `func`, given `shared_memory_per_block` bytes of dynamic shared memory.
// The implementation CHECK-fails if the driver call does not succeed.
// `device_description`, `registers_per_thread` and `thread_dims` are not read
// by the current implementation.
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func);
// Compares the block count suggested by cuOccupancyMaxPotentialBlockSize for
// `func` against `*initial_blocks`. When the suggestion is strictly larger,
// stores it in `*initial_blocks` and returns the suggested thread count;
// otherwise returns zero and leaves `*initial_blocks` unchanged.
// The implementation CHECK-fails if the driver call does not succeed.
// `device_description`, `registers_per_thread` and `thread_dims` are not read
// by the current implementation.
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func);
// Requests `size` bytes; the implementation delegates to
// CUDADriver::DeviceAllocate with this executor's context.
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,

View File

@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
return port::MathUtil::FloorOfRatio(value, n) * n;
}
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err =
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
func, NULL, shared_memory_per_block, 0);
CHECK_EQ(err, CUDA_SUCCESS);
return suggested_blocks;
}
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err =
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
func, NULL, shared_memory_per_block, 0);
CHECK_EQ(err, CUDA_SUCCESS);
if (suggested_blocks > *initial_blocks) {
*initial_blocks = suggested_blocks;
return suggested_threads;
} else {
return 0;
}
}
} // namespace stream_executor

View File

@ -24,7 +24,6 @@ limitations under the License.
#include <memory>
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/platform/port.h"
@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
uint64 element_count, uint64 *threads_per_block,
uint64 *block_count);
// Returns the block count suggested by cuOccupancyMaxPotentialBlockSize for
// `func`, given `shared_memory_per_block` bytes of dynamic shared memory.
// The implementation CHECK-fails if the driver query does not succeed, so a
// failure is not reported through a zero return value.
// NOTE(review): `device_description`, `registers_per_thread` and `thread_dims`
// are not read by the implementation.
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func);
// Computes the launch configuration suggested by
// cuOccupancyMaxPotentialBlockSize for `func` and compares its block count
// with `*initial_blocks`. If the suggested block count is strictly greater,
// `*initial_blocks` is set to it and the suggested thread count is returned.
// Otherwise zero is returned and `*initial_blocks` is left unchanged.
// The implementation CHECK-fails if the driver query does not succeed.
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, CUfunction func);
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_