Allow generating a cubin for a fixed compute capability.

Currently, se::CompileGpuAsm requires a device so that it can query the
compute capability to use. This patch exposes an additional API that allows
the compute capability to be specified manually.

PiperOrigin-RevId: 297549709
Change-Id: I0012f9698f3eee227e9bdbf261238a5e107e28f9
This commit is contained in:
Stephan Herhut 2020-02-27 02:36:09 -08:00 committed by TensorFlower Gardener
parent 730fcad287
commit f0c854d2db
2 changed files with 18 additions and 2 deletions
tensorflow/stream_executor/gpu

View File

@ -155,7 +155,12 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
int cc_minor;
TF_RETURN_IF_ERROR(
gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
const char* ptx_contents,
GpuAsmOpts options) {
string ptxas_path;
auto env = tensorflow::Env::Default();
for (const string& cuda_root :

View File

@ -26,14 +26,25 @@ limitations under the License.
namespace stream_executor {
// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array.
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capabilities of the device associated with 'device_ordinal'.
//
// compile_ptx_options is used to query for the CUDA location in case it is
// 'options' is used to query for the CUDA location in case it is
// customized in a passed flag, and for controlling ptxas optimizations.
// 'ptx_contents' holds the PTX text to compile.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
const char* ptx_contents,
GpuAsmOpts options);
// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capabilities provided by 'cc_major' and 'cc_minor'.
//
// Unlike the overload taking a device ordinal, the compute capability is
// supplied directly by the caller instead of being queried from a device.
//
// 'ptx_contents' holds the PTX text to compile.
// 'options' is used to query for the CUDA location in case it is
// customized in a passed flag, and for controlling ptxas optimizations.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
const char* ptx_contents,
GpuAsmOpts options);
// Same as CompileGpuAsm, but caches the result, and returns unowned view of
// the compiled binary.
//