Currently, se::CompileGpuAsm requires a device to query for the compute capabilities to use. This patch exposes an additional API that allows for manually specifying the compute capability. PiperOrigin-RevId: 297549709 Change-Id: I0012f9698f3eee227e9bdbf261238a5e107e28f9
58 lines
2.5 KiB
C++
58 lines
2.5 KiB
C++
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
|
|
#define TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
|
|
|
|
#include <vector>
|
|
|
|
#include "absl/types/span.h"
|
|
#include "tensorflow/stream_executor/gpu/gpu_asm_opts.h"
|
|
#include "tensorflow/stream_executor/lib/statusor.h"
|
|
#include "tensorflow/stream_executor/platform/port.h"
|
|
|
|
namespace stream_executor {
|
|
|
|
// Compiles the given PTX string using ptxas and returns the resulting machine
|
|
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
|
|
// capabilities of the device associated with 'device_ordinal'.
|
|
//
|
|
// 'options' is used to query for the CUDA location in case it is
|
|
// customized in a passed flag, and for controlling ptxas optimizations.
//
// 'ptx_contents' is the PTX source text, passed as a C string (presumably
// NUL-terminated -- TODO confirm against the implementation).
//
// The byte vector is wrapped in a port::StatusOr; presumably a failed
// compilation is reported as an error status rather than an empty vector --
// confirm in the implementation before relying on it.
|
|
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
|
|
const char* ptx_contents,
|
|
GpuAsmOpts options);
|
|
|
|
// Compiles the given PTX string using ptxas and returns the resulting machine
|
|
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
|
|
// capabilities provided by 'cc_major' and 'cc_minor'.
|
|
//
|
|
// 'options' is used to query for the CUDA location in case it is
|
|
// customized in a passed flag, and for controlling ptxas optimizations.
//
// This overload takes the target compute capability explicitly (as its major
// and minor components) instead of a device ordinal.
//
// 'ptx_contents' is the PTX source text, passed as a C string (presumably
// NUL-terminated -- TODO confirm against the implementation).
//
// The byte vector is wrapped in a port::StatusOr; presumably a failed
// compilation is reported as an error status rather than an empty vector --
// confirm in the implementation before relying on it.
|
|
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
|
|
const char* ptx_contents,
|
|
GpuAsmOpts options);
|
|
|
|
// Same as CompileGpuAsm, but caches the result, and returns an unowned view of
|
|
// the compiled binary.
|
|
//
|
|
// A copy of the string provided in ptx will be made.
//
// The signature mirrors the CompileGpuAsm overload that takes a
// 'device_ordinal' (not the one taking 'cc_major'/'cc_minor').
//
// NOTE(review): the returned absl::Span does not own the bytes; how long the
// underlying storage stays valid is not visible from this header -- confirm
// the cache's lifetime guarantees before holding on to the span.
|
|
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
|
|
int device_ordinal, const char* ptx, GpuAsmOpts compilation_options);
|
|
|
|
} // namespace stream_executor
|
|
|
|
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
|