Allow generating a cubin for a fixed compute capability.
Currently, se::CompileGpuAsm requires a device to query for the compute capabilities to use. This patch exposes an additional API that allows for manually specifying the compute capability. PiperOrigin-RevId: 297549709 Change-Id: I0012f9698f3eee227e9bdbf261238a5e107e28f9
This commit is contained in:
parent
730fcad287
commit
f0c854d2db
tensorflow/stream_executor/gpu
@ -155,7 +155,12 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
|
||||
int cc_minor;
|
||||
TF_RETURN_IF_ERROR(
|
||||
gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
|
||||
return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
|
||||
}
|
||||
|
||||
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
|
||||
const char* ptx_contents,
|
||||
GpuAsmOpts options) {
|
||||
string ptxas_path;
|
||||
auto env = tensorflow::Env::Default();
|
||||
for (const string& cuda_root :
|
||||
|
@ -26,14 +26,25 @@ limitations under the License.
|
||||
namespace stream_executor {
|
||||
|
||||
// Compiles the given PTX string using ptxas and returns the resulting machine
|
||||
// code (i.e. a cubin) as a byte array.
|
||||
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
|
||||
// capabilities of the device associated with 'device_ordinal'.
|
||||
//
|
||||
// compile_ptx_options is used to query for the CUDA location in case it is
|
||||
// 'options' is used to query for the CUDA location in case it is
|
||||
// customized in a passed flag, and for controlling ptxas optimizations.
|
||||
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
|
||||
const char* ptx_contents,
|
||||
GpuAsmOpts options);
|
||||
|
||||
// Compiles the given PTX string using ptxas and returns the resulting machine
|
||||
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
|
||||
// capabilities provided by 'cc_major' and 'cc_minor'.
|
||||
//
|
||||
// 'options' is used to query for the CUDA location in case it is
|
||||
// customized in a passed flag, and for controlling ptxas optimizations.
|
||||
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
|
||||
const char* ptx_contents,
|
||||
GpuAsmOpts options);
|
||||
|
||||
// Same as CompileGpuAsm, but caches the result, and returns an unowned view of
|
||||
// the compiled binary.
|
||||
//
|
||||
|
Loading…
Reference in New Issue
Block a user