Allow generating a cubin for a fixed compute capability.

Currently, se::CompileGpuAsm requires a device in order to query the compute capability to compile for. This patch exposes an additional API that allows the compute capability to be specified manually. PiperOrigin-RevId: 297549709 Change-Id: I0012f9698f3eee227e9bdbf261238a5e107e28f9
2020-02-27 02:36:09 -08:00 · 2020-02-27 02:36:09 -08:00 · f0c854d2db
commit f0c854d2db
parent 730fcad287
2 changed files with 18 additions and 2 deletions
--- a/tensorflow/stream_executor/gpu/asm_compiler.cc
+++ b/tensorflow/stream_executor/gpu/asm_compiler.cc
@ -155,7 +155,12 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
+  return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
+}

+port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
+                                                 const char* ptx_contents,
+                                                 GpuAsmOpts options) {
  string ptxas_path;
  auto env = tensorflow::Env::Default();
  for (const string& cuda_root :
--- a/tensorflow/stream_executor/gpu/asm_compiler.h
+++ b/tensorflow/stream_executor/gpu/asm_compiler.h
@ -26,14 +26,25 @@ limitations under the License.
 namespace stream_executor {

 // Compiles the given PTX string using ptxas and returns the resulting machine
-// code (i.e. a cubin) as a byte array.
+// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
+// capabilities of the device associated with 'device_ordinal'.
 //
-// compile_ptx_options is used to query for the CUDA location in case it is
+// 'options' is used to query for the CUDA location in case it is
 // customized in a passed flag, and for controlling ptxas optimizations.
 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options);

+// Compiles the given PTX string using ptxas and returns the resulting machine
+// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
+// capabilities provided by 'cc_major' and 'cc_minor'.
+//
+// 'options' is used to query for the CUDA location in case it is
+// customized in a passed flag, and for controlling ptxas optimizations.
+port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
+                                                 const char* ptx_contents,
+                                                 GpuAsmOpts options);
+
 // Same as CompileGpuAsm, but caches the result, and returns unowned view of
 // the compiled binary.
 //