From e31b0d045a72553fe4eab73b6935c88c71ace40b Mon Sep 17 00:00:00 2001 From: Matt Conley Date: Wed, 16 Jan 2019 16:05:47 -0800 Subject: [PATCH] Addressing comments to improve code. --- .../stream_executor/cuda/cuda_gpu_executor.cc | 31 ++++++++++++------- .../stream_executor/device_description.h | 2 +- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc index 496b0393254..ef32107247c 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -1129,18 +1129,20 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_) .ValueOrDie()); - const char* blank_ptx = - ".version 6.0\n" - ".target sm_30\n" - ".address_size 64\n" - "\n" - " // .globl _Z6ValAddPf\n" - ".visible .entry _Z6ValAddPf(\n" - ")\n" - "{\n" - " ret;\n" - "}\n"; - const char* kernel_name = "_Z6ValAddPf"; + // We are loading a dummy ptx kernel to set the device description's + // blocks_per_core_limit by calling the CUDA occupancy calculator. 
This + // value is currently required by XLA GPU's CalculateLaunchDimensions() + const char* blank_ptx = R"( +.version 6.0 +.target sm_30 +.address_size 64 + + // .globl testkernel +.visible .entry testkernel() +{ + ret; +})"; + const char* kernel_name = "testkernel"; CUmodule blank_module; CUfunction blank_function; @@ -1151,7 +1153,12 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { int bpc; CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor( &bpc, blank_function, 1, 1); + if (result != CUDA_SUCCESS) { + VLOG(1) << "Failed to calculate max blocks per SM using dummy kernel."; + bpc = -1; + } builder.set_blocks_per_core_limit(bpc); + CUDADriver::UnloadModule(context_, blank_module); auto built = builder.Build(); return built.release(); diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h index b71959d8c84..525669bcdcd 100644 --- a/tensorflow/stream_executor/device_description.h +++ b/tensorflow/stream_executor/device_description.h @@ -78,7 +78,7 @@ class DeviceDescription { // legitimate kernel launch request. const BlockDim &block_dim_limit() const { return block_dim_limit_; } - // Returns the limit on the number of simultaneously resident blocks + // Returns the maximum number of simultaneously resident blocks // on a multiprocessor. uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }