STT-tensorflow/tensorflow/stream_executor/gpu/asm_compiler.h
Stephan Herhut f0c854d2db Allow to generate a cubin for a fixed compute capability.
Currently, se::CompileGpuAsm requires a device to query for the compute
capabilities to use. This patch exposes an additional API that allows for
manually specifying the compute capability.

PiperOrigin-RevId: 297549709
Change-Id: I0012f9698f3eee227e9bdbf261238a5e107e28f9
2020-02-27 02:44:14 -08:00

58 lines
2.5 KiB
C++

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/stream_executor/gpu/gpu_asm_opts.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capabilities of the device associated with 'device_ordinal'.
//
// 'ptx_contents' is the PTX source text to compile (presumably a
// NUL-terminated C string -- confirm against the implementation).
//
// 'options' is used to query for the CUDA location in case it is
// customized in a passed flag, and for controlling ptxas optimizations.
//
// The result is wrapped in a port::StatusOr, i.e. the caller receives either
// the cubin bytes or a status in their place.
// NOTE(review): the concrete failure conditions (e.g. ptxas not found, PTX
// rejected by ptxas) are not visible from this header -- check the
// implementation before relying on specific error codes.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
const char* ptx_contents,
GpuAsmOpts options);
// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capabilities provided by 'cc_major' and 'cc_minor'.
//
// Unlike the overload taking a 'device_ordinal', the target compute
// capability is specified manually by the caller through 'cc_major' (major
// version) and 'cc_minor' (minor version) rather than being derived from a
// device.
//
// 'ptx_contents' is the PTX source text to compile (presumably a
// NUL-terminated C string -- confirm against the implementation).
//
// 'options' is used to query for the CUDA location in case it is
// customized in a passed flag, and for controlling ptxas optimizations.
//
// The result is wrapped in a port::StatusOr, i.e. the caller receives either
// the cubin bytes or a status in their place.
// NOTE(review): whether 'cc_major'/'cc_minor' are validated before ptxas is
// invoked is not visible from this header -- confirm in the implementation.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
const char* ptx_contents,
GpuAsmOpts options);
// Same as the CompileGpuAsm overload that takes a 'device_ordinal', but caches
// the result and returns an unowned view (absl::Span) of the compiled binary
// instead of an owned byte vector.
//
// A copy of the string provided in 'ptx' will be made, so the caller does not
// need to keep that string alive on behalf of this function after the call
// returns.
//
// 'compilation_options' plays the role of 'options' in CompileGpuAsm.
//
// NOTE(review): the returned span does not own its data; how long the cached
// binary stays valid, and which inputs form the cache key, are not visible
// from this header -- confirm in the implementation before storing the span.
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
int device_ordinal, const char* ptx, GpuAsmOpts compilation_options);
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_