Move CUDA-specific occupancy calculation into proper file
- Maintain functionality; just move the CalculateOccupancy() and CompareOccupancy() methods from device_description to cuda_gpu_executor. - Remove the CUDA requirement from the general device_description class.
This commit is contained in:
parent
e93a9f9ccf
commit
fa20b59b92
tensorflow/stream_executor
@ -490,6 +490,43 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
|
||||
}
|
||||
}
|
||||
|
||||
// Compute and return maximum blocks per core (occupancy) based on the
|
||||
// device description, some kernel characteristics and the number of threads per
|
||||
// block. If unable to compute occupancy, zero is returned.
|
||||
int CalculateOccupancy(const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func) {
|
||||
int suggested_blocks = 0;
|
||||
int suggested_threads = 0;
|
||||
CUresult err =
|
||||
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
|
||||
func, NULL, shared_memory_per_block, 0);
|
||||
CHECK_EQ(err, CUDA_SUCCESS);
|
||||
return suggested_blocks;
|
||||
}
|
||||
|
||||
// Compute and return the suggested thread count to achieve ideal occupancy.
|
||||
// If the suggested block count does not exceed *initial_blocks, zero is returned.
|
||||
int CompareOccupancy(int* initial_blocks,
|
||||
const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func) {
|
||||
int suggested_blocks = 0;
|
||||
int suggested_threads = 0;
|
||||
CUresult err =
|
||||
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
|
||||
func, NULL, shared_memory_per_block, 0);
|
||||
CHECK_EQ(err, CUDA_SUCCESS);
|
||||
if (suggested_blocks > *initial_blocks) {
|
||||
*initial_blocks = suggested_blocks;
|
||||
return suggested_threads;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
void *CUDAExecutor::Allocate(uint64 size) {
|
||||
return CUDADriver::DeviceAllocate(context_, size);
|
||||
}
|
||||
|
@ -70,6 +70,17 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
|
||||
const BlockDim &block_dims, const KernelBase &k,
|
||||
const KernelArgsArrayBase &args) override;
|
||||
|
||||
int CalculateOccupancy(const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func);
|
||||
|
||||
int CompareOccupancy(int* initial_blocks,
|
||||
const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func);
|
||||
|
||||
void *Allocate(uint64 size) override;
|
||||
|
||||
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
|
||||
|
@ -157,36 +157,4 @@ static uint64 RoundDown(uint64 value, uint64 n) {
|
||||
return port::MathUtil::FloorOfRatio(value, n) * n;
|
||||
}
|
||||
|
||||
int CalculateOccupancy(const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func) {
|
||||
int suggested_blocks = 0;
|
||||
int suggested_threads = 0;
|
||||
CUresult err =
|
||||
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
|
||||
func, NULL, shared_memory_per_block, 0);
|
||||
CHECK_EQ(err, CUDA_SUCCESS);
|
||||
return suggested_blocks;
|
||||
}
|
||||
|
||||
int CompareOccupancy(int* initial_blocks,
|
||||
const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func) {
|
||||
int suggested_blocks = 0;
|
||||
int suggested_threads = 0;
|
||||
CUresult err =
|
||||
cuOccupancyMaxPotentialBlockSize(&suggested_blocks, &suggested_threads,
|
||||
func, NULL, shared_memory_per_block, 0);
|
||||
CHECK_EQ(err, CUDA_SUCCESS);
|
||||
if (suggested_blocks > *initial_blocks) {
|
||||
*initial_blocks = suggested_blocks;
|
||||
return suggested_threads;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace stream_executor
|
||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
||||
#include <memory>
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
|
||||
#include "tensorflow/stream_executor/launch_dim.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
@ -324,22 +323,6 @@ void CalculateDimensionality(const DeviceDescription &device_description,
|
||||
uint64 element_count, uint64 *threads_per_block,
|
||||
uint64 *block_count);
|
||||
|
||||
// Compute and return maximum blocks per core (occupancy) based on the
|
||||
// device description, some kernel characteristics and the number of threads per
|
||||
// block. If unable to compute occupancy, zero is returned.
|
||||
int CalculateOccupancy(const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func);
|
||||
|
||||
// Compute and return the suggested thread count to achieve ideal occupancy.
|
||||
// If the suggested block count does not exceed *initial_blocks, zero is returned.
|
||||
int CompareOccupancy(int* initial_blocks,
|
||||
const DeviceDescription& device_description,
|
||||
uint64 registers_per_thread,
|
||||
uint64 shared_memory_per_block,
|
||||
const ThreadDim& thread_dims, CUfunction func);
|
||||
|
||||
} // namespace stream_executor
|
||||
|
||||
#endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
|
||||
|
Loading…
Reference in New Issue
Block a user