Addressing comments to improve code.

Matt Conley 2019-01-16 16:05:47 -08:00
parent a363b67e4e
commit e31b0d045a
2 changed files with 20 additions and 13 deletions


@@ -1129,18 +1129,20 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
.ValueOrDie());
const char* blank_ptx =
".version 6.0\n"
".target sm_30\n"
".address_size 64\n"
"\n"
" // .globl _Z6ValAddPf\n"
".visible .entry _Z6ValAddPf(\n"
")\n"
"{\n"
" ret;\n"
"}\n";
const char* kernel_name = "_Z6ValAddPf";
// We are loading a dummy PTX kernel to set the device description's
// blocks_per_core_limit by calling the CUDA occupancy calculator. This
// value is currently required by XLA GPU's CalculateLaunchDimensions().
const char* blank_ptx = R"(
.version 6.0
.target sm_30
.address_size 64
// .globl testkernel
.visible .entry testkernel()
{
ret;
})";
const char* kernel_name = "testkernel";
CUmodule blank_module;
CUfunction blank_function;
@@ -1151,7 +1153,12 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
int bpc;
CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
&bpc, blank_function, 1, 1);
if (result != CUDA_SUCCESS) {
VLOG(1) << "Failed to calculate max blocks per SM using dummy kernel.";
bpc = -1;
}
builder.set_blocks_per_core_limit(bpc);
CUDADriver::UnloadModule(context_, blank_module);
auto built = builder.Build();
return built.release();
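
For reference, a minimal standalone sketch of the driver-API sequence this change relies on: load the blank PTX, look up the dummy kernel, and ask the occupancy calculator for the blocks-per-SM limit. This is not part of the commit; the helper name QueryBlocksPerSM and the surrounding scaffolding are illustrative only, and it assumes a CUDA toolkit with the driver API (cuda.h) is available.

// Illustrative only: queries the max resident blocks per SM for a trivial
// kernel, mirroring what PopulateDeviceDescription() does above.
#include <cstdio>
#include <cuda.h>

// Returns -1 on failure, matching the fallback used in the commit.
static int QueryBlocksPerSM(CUdevice device) {
  CUcontext ctx;
  if (cuCtxCreate(&ctx, 0, device) != CUDA_SUCCESS) return -1;

  // Same dummy kernel as in the diff: an empty .entry so the occupancy
  // calculator has something to analyze.
  const char* blank_ptx = R"(
.version 6.0
.target sm_30
.address_size 64
  // .globl testkernel
.visible .entry testkernel()
{
  ret;
})";

  int bpc = -1;
  CUmodule module;
  CUfunction function;
  if (cuModuleLoadData(&module, blank_ptx) == CUDA_SUCCESS) {
    if (cuModuleGetFunction(&function, module, "testkernel") == CUDA_SUCCESS) {
      // Block size 1 and minimal dynamic shared memory, mirroring the call
      // in the change above.
      if (cuOccupancyMaxActiveBlocksPerMultiprocessor(&bpc, function, 1, 1) !=
          CUDA_SUCCESS) {
        bpc = -1;
      }
    }
    cuModuleUnload(module);
  }
  cuCtxDestroy(ctx);
  return bpc;
}

int main() {
  if (cuInit(0) != CUDA_SUCCESS) return 1;
  CUdevice device;
  if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) return 1;
  std::printf("blocks per SM limit: %d\n", QueryBlocksPerSM(device));
  return 0;
}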


@@ -78,7 +78,7 @@ class DeviceDescription {
// legitimate kernel launch request.
const BlockDim &block_dim_limit() const { return block_dim_limit_; }
// Returns the limit on the number of simultaneously resident blocks
// Returns the maximum number of simultaneously resident blocks
// on a multiprocessor.
uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
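
As a hedged illustration of why the header documents blocks_per_core_limit() as a maximum on resident blocks: a launch-size calculation can combine it with the SM count to cap how many blocks are worth launching at once. The sketch below is not from this commit; CapBlockCount and its parameters are invented names, and this is not necessarily how XLA's CalculateLaunchDimensions consumes the value.

#include <algorithm>
#include <cstdint>

// Illustrative only: caps a requested block count at the number of blocks the
// device can keep resident at once (SM count * blocks-per-SM limit).
int64_t CapBlockCount(int64_t blocks_needed, int sm_count,
                      int64_t blocks_per_core_limit) {
  // A non-positive limit (e.g. the -1 fallback set above when the occupancy
  // query fails) means "unknown", so leave the request unchanged.
  if (blocks_per_core_limit <= 0) return blocks_needed;
  return std::min(blocks_needed, int64_t{sm_count} * blocks_per_core_limit);
}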