From e31b0d045a72553fe4eab73b6935c88c71ace40b Mon Sep 17 00:00:00 2001
From: Matt Conley <mconley@nvidia.com>
Date: Wed, 16 Jan 2019 16:05:47 -0800
Subject: [PATCH] Addressing comments to improve code.

---
 .../stream_executor/cuda/cuda_gpu_executor.cc | 31 ++++++++++++-------
 .../stream_executor/device_description.h      |  2 +-
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 496b0393254..ef32107247c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1129,18 +1129,20 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
           CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
           .ValueOrDie());
 
-  const char* blank_ptx =
-      ".version 6.0\n"
-      ".target sm_30\n"
-      ".address_size 64\n"
-      "\n"
-      "        // .globl       _Z6ValAddPf\n"
-      ".visible .entry _Z6ValAddPf(\n"
-      ")\n"
-      "{\n"
-      "        ret;\n"
-      "}\n";
-  const char* kernel_name = "_Z6ValAddPf";
+  // We are loading a dummy ptx kernel to set the device description's
+  // blocks_per_core_limit by calling the CUDA occupancy calculator.  This
+  // value is currently required by XLA GPU's CalculateLaunchDimensions().
+  const char* blank_ptx = R"(
+.version 6.0
+.target sm_30
+.address_size 64
+
+        // .globl       testkernel
+.visible .entry testkernel()
+{
+        ret;
+})";
+  const char* kernel_name = "testkernel";
 
   CUmodule blank_module;
   CUfunction blank_function;
@@ -1151,7 +1153,12 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   int bpc;
   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
       &bpc, blank_function, 1, 1);
+  if (result != CUDA_SUCCESS) {
+    VLOG(1) << "Failed to calculate max blocks per SM using dummy kernel.";
+    bpc = -1;
+  }
   builder.set_blocks_per_core_limit(bpc);
+  CUDADriver::UnloadModule(context_, blank_module);
 
   auto built = builder.Build();
   return built.release();
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index b71959d8c84..525669bcdcd 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -78,7 +78,7 @@ class DeviceDescription {
   // legitimate kernel launch request.
   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
 
-  // Returns the limit on the number of simultaneously resident blocks
+  // Returns the maximum number of simultaneously resident blocks
   // on a multiprocessor.
   uint64 blocks_per_core_limit() const { return blocks_per_core_limit_; }