cuda:fatbinary dependency causes licensing issues.

PiperOrigin-RevId: 335809522
Change-Id: I374057f1b9b8144420089b04e2a5892f5c64832f
Authored by A. Unique TensorFlower on 2020-10-07 00:40:21 -07:00, committed by TensorFlower Gardener
parent 6876c21157
commit 4d77e555e8
18 changed files with 227 additions and 53 deletions


@@ -74,8 +74,8 @@ tool_names = [
'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate',
'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile',
'json_to_flatbuffer', 'xla-gpu-opt', 'xla-mlir-gpu-opt', 'xla-opt',
'hlo_to_llvm_ir', 'kernel-gen-opt', 'tf_to_gpu_binary', 'xla-thunks-opt',
'tfjs-opt'
'hlo_to_llvm_ir', 'kernel-gen-opt', 'tf_to_kernel', 'tf_to_gpu_binary',
'xla-thunks-opt', 'tfjs-opt'
]
tools = [ToolSubst(s, unresolved='ignore') for s in tool_names]
llvm_config.add_tool_substitutions(tools, tool_dirs)


@@ -105,7 +105,10 @@ tf_cc_binary(
tf_cc_binary(
name = "tf_to_kernel",
srcs = ["tf_to_kernel.cc"],
visibility = ["//tensorflow/core/kernels/mlir_generated:__pkg__"],
visibility = [
"//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel:__pkg__",
"//tensorflow/core/kernels/mlir_generated:__pkg__",
],
deps = [
":kernel_creator",
"//tensorflow/compiler/mlir:init_mlir",


@@ -174,7 +174,8 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only,
llvm::ArrayRef<uint32_t> same_shape,
llvm::StringRef gpu_binary_attr_name,
int32_t architecture) {
llvm::ArrayRef<uint32_t> architectures,
bool generate_fatbin) {
mlir::PassManager pm(module.getContext());
applyTensorflowAndCLOptions(pm);
@@ -187,7 +188,7 @@ Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only,
}
kernel_pm.addPass(mlir::createStripDebugInfoPass());
kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
gpu_binary_attr_name, architecture));
gpu_binary_attr_name, architectures, generate_fatbin));
if (!gpu_binary_only) {
pm.addPass(mlir::kernel_gen::transforms::CreateTFKernelToLLVMPass());
@@ -202,9 +203,9 @@ Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only,
StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
int32_t architecture, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> architectures, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> same_shape,
llvm::ArrayRef<uint32_t> unroll_factors) {
llvm::ArrayRef<uint32_t> unroll_factors, bool generate_fatbin) {
mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry());
mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context);
TF_RETURN_IF_ERROR(
@@ -221,7 +222,8 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get()));
#endif
TF_RETURN_IF_ERROR(LowerGPUToLLVM(module.get(), gpu_binary_only, same_shape,
kGpuBinaryAttrName, architecture));
kGpuBinaryAttrName, architectures,
generate_fatbin));
return module;
}


@@ -38,9 +38,10 @@ namespace kernel_gen {
// false, lowers the host side to LLVM Dialect.
xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
int32_t architecture = 75, llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
llvm::ArrayRef<uint32_t> architectures = {75},
llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
llvm::ArrayRef<uint32_t> same_shape = {},
llvm::ArrayRef<uint32_t> unroll_factors = {});
llvm::ArrayRef<uint32_t> unroll_factors = {}, bool generate_fatbin = true);
// Extracts gpu_binary from the converted module.
xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);
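
(For reference: a minimal caller-side sketch of the updated GenerateKernelForTfCode API, not part of this change. The header path and the tensorflow::kernel_gen namespace are assumed from the diff context; the TF code string mirrors the new tf_to_kernel test below.)

#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h"

// Assumed usage: request kernels for sm_70 and sm_75, bundled into one fatbin.
xla::StatusOr<mlir::OwningModuleRef> BuildTanhKernel(mlir::MLIRContext& context) {
  constexpr char kTfCode[] =
      "func @tanh(%arg: tensor<*xf32>) -> tensor<*xf32> {\n"
      "  %0 = \"tf.Tanh\"(%arg) : (tensor<*xf32>) -> tensor<*xf32>\n"
      "  return %0 : tensor<*xf32>\n"
      "}\n";
  return tensorflow::kernel_gen::GenerateKernelForTfCode(
      context, kTfCode, /*gpu_binary_only=*/false,
      /*architectures=*/{70, 75}, /*tile_sizes=*/{16, 64},
      /*same_shape=*/{}, /*unroll_factors=*/{4},
      /*generate_fatbin=*/true);
}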


@@ -1,6 +1,5 @@
// RUN: tf_to_gpu_binary --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=70
func @tanh(%arg0: tensor<?xf32>) -> tensor<?xf32> {
%0 = "tf.Tanh"(%arg0) { }
: (tensor<?xf32>) -> tensor<?xf32>
%0 = "tf.Tanh"(%arg0) : (tensor<?xf32>) -> tensor<?xf32>
return %0 : tensor<?xf32>
}


@@ -0,0 +1,17 @@
load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
package(licenses = ["notice"])
glob_lit_tests(
data = [
"//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel",
"@llvm-project//mlir:run_lit.sh",
],
default_tags = [
# We need access to the CUDA SDK.
"gpu",
"no_rocm",
],
driver = "//tensorflow/compiler/mlir:run_lit.sh",
test_file_exts = ["mlir"],
)


@@ -0,0 +1,6 @@
// RUN: tf_to_kernel --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=70,75
func @tanh(%arg: tensor<*xf32>) -> tensor<*xf32> {
%0 = "tf.Tanh"(%arg) : (tensor<*xf32>) -> tensor<*xf32>
return %0 : tensor<*xf32>
}


@@ -48,7 +48,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
mlir::OwningModuleRef module,
GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true,
architecture, tile_sizes, same_shape,
unroll_factors));
unroll_factors, /*generate_fatbin=*/false));
// Extract gpu_binary.
TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module));


@@ -95,7 +95,8 @@ xla::StatusOr<std::string> EmitToBinary(mlir::ModuleOp module) {
}
xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
int32_t architecture, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> architectures,
llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> same_shape,
llvm::ArrayRef<uint32_t> unroll_factors) {
// Read TF code.
@@ -107,7 +108,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
TF_ASSIGN_OR_RETURN(
mlir::OwningModuleRef module,
GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false,
architecture, tile_sizes, same_shape,
architectures, tile_sizes, same_shape,
unroll_factors));
// Get binary.
TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
@@ -129,8 +130,8 @@ int main(int argc, char** argv) {
llvm::cl::opt<std::string> output_file(
"output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"),
llvm::cl::init("foo.bin"));
llvm::cl::list<int32_t> architecture(
"arch", llvm::cl::desc("target architecture (e.g. 50 for sm_50)"),
llvm::cl::list<uint32_t> architectures(
"arch", llvm::cl::desc("target architectures (e.g. 50 for sm_50)"),
llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
llvm::cl::list<uint32_t> tile_sizes(
"tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore,
@@ -151,7 +152,7 @@ int main(int argc, char** argv) {
llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n");
auto status =
tensorflow::kernel_gen::Run(input_file, output_file, architecture.front(),
tensorflow::kernel_gen::Run(input_file, output_file, architectures,
tile_sizes, same_shape, unroll_factors);
if (!status.ok()) {
LOG(ERROR) << status;


@@ -117,6 +117,7 @@ cc_library(
"@llvm-project//mlir:AllPassesAndDialects",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:Transforms",
"@llvm-project//llvm:TransformUtils",
"//tensorflow/compiler/mlir/hlo",
"//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo",
"//tensorflow/compiler/mlir/hlo:lhlo",


@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/Target/NVVMIR.h" // from @llvm-project
#include "mlir/Target/ROCDLIR.h" // from @llvm-project
@@ -49,9 +50,12 @@ using xla::InternalError;
class GpuKernelToBlobPass
: public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
public:
GpuKernelToBlobPass(mlir::StringRef blob_annotation, int32_t arch) {
GpuKernelToBlobPass(mlir::StringRef blob_annotation,
llvm::ArrayRef<uint32_t> architectures,
bool generate_fatbin) {
blob_annotation_ = blob_annotation.str();
arch_ = arch;
architectures_ = architectures;
generate_fatbin_ = generate_fatbin;
}
void runOnOperation() override {
@@ -69,7 +73,17 @@ class GpuKernelToBlobPass
xla::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
mlir::gpu::GPUModuleOp gpu_module) {
if (architectures_.empty()) {
return InternalError("Expected at least one GPU architecture.");
}
if (!generate_fatbin_ && architectures_.size() > 1) {
return InternalError(
"Can only generate machine code for more than one architecture as a "
"fatbin.");
}
llvm::LLVMContext llvmContext;
#if TENSORFLOW_USE_ROCM
auto llvmModule = mlir::translateModuleToROCDLIR(gpu_module, llvmContext);
if (!llvmModule) {
@@ -81,9 +95,14 @@ class GpuKernelToBlobPass
xla::HloModuleConfig config;
config.set_debug_options(xla::GetDebugOptionsFromFlags());
std::string libdevice_dir = tensorflow::RocdlRoot();
// TODO(b/169066682): Support fatbin on ROCm.
if (generate_fatbin_) {
return InternalError("Fatbins are not yet supported for ROCm.");
}
return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch_, config,
uint32_t arch = architectures_.front();
std::string libdevice_dir = tensorflow::RocdlRoot();
return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch, config,
libdevice_dir);
#elif GOOGLE_CUDA
@@ -102,19 +121,42 @@ class GpuKernelToBlobPass
target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
};
int32_t cc_major = arch_ / 10;
int32_t cc_minor = arch_ % 10;
// Compile and collect requested cubin and PTX images.
std::vector<tensorflow::se::CubinOrPTXImage> images;
TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
TF_ASSIGN_OR_RETURN(
std::string ptx,
xla::gpu::nvptx::CompileToPtx(llvmModule.get(),
std::make_pair(cc_major, cc_minor),
config, libdevice_dir, enable_fusion));
VLOG(1) << ptx;
auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
for (uint32_t arch : architectures_) {
int32_t cc_major = arch / 10;
int32_t cc_minor = arch % 10;
// Module may be changed by CompileToPtx.
auto llvm_module_copy = llvm::CloneModule(*llvmModule);
TF_ASSIGN_OR_RETURN(
std::string ptx,
xla::gpu::nvptx::CompileToPtx(llvm_module_copy.get(),
std::make_pair(cc_major, cc_minor),
config, libdevice_dir, enable_fusion));
// TODO(b/169066682): If compute_XX profile, collect PTX image here.
VLOG(1) << ptx;
TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
tensorflow::se::CompileGpuAsm(
cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));
return tensorflow::se::CompileGpuAsm(cc_major, cc_minor, ptx.c_str(),
xla::gpu::PtxOptsFromConfig(config));
if (!generate_fatbin_) {
// Skip fatbin generation and return the first and only GPU machine
// code.
return gpu_asm;
}
// Collect cubin image.
images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
}
// TODO(b/169870789): Revisit the use of fatbins.
// Bundle cubin and PTX images into a single fatbin.
return tensorflow::se::BundleGpuAsm(images,
gpu_asm_opts.preferred_cuda_dir);
#endif
return InternalError(
"Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
" Did you specify either --config=rocm or --config=cuda ?");
@@ -141,8 +183,10 @@ class GpuKernelToBlobPass
} // namespace
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
mlir::StringRef blob_annotation, int32_t architecture) {
return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architecture);
mlir::StringRef blob_annotation, ArrayRef<uint32_t> architectures,
bool generate_fatbin) {
return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
generate_fatbin);
}
} // namespace transforms
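
(For reference: a hedged sketch of how the extended pass constructor might be wired into a pass pipeline, mirroring the call in kernel_creator.cc above. The header paths, the helper function, and the "gpu.binary" attribute name are illustrative; kernel_creator.cc passes its own kGpuBinaryAttrName constant.)

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"

// Nest the blob pass on GPU modules and request a two-architecture fatbin.
void AddGpuBlobPass(mlir::PassManager& pm) {
  auto& kernel_pm = pm.nest<mlir::gpu::GPUModuleOp>();
  kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
      /*blob_annotation=*/"gpu.binary", /*architectures=*/{70, 75},
      /*generate_fatbin=*/true));
}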


@@ -61,7 +61,8 @@ CreatePropagateTensorFlowABIKnowledgePass(
// Pass to annotate GPU Module with its PTX.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
mlir::StringRef blob_annotation = "", int32_t architecture = 0);
mlir::StringRef blob_annotation = "", ArrayRef<uint32_t> architectures = {},
bool generate_fatbin = true);
// Pass to unfuse batch norm.
std::unique_ptr<FunctionPass> CreateUnfuseBatchNormPass();


@@ -53,7 +53,10 @@ def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> {
let options = [
Option<"blob_annotation_", "blob-annotation", "std::string",
/*default=*/"", "Blob attribute name">,
Option<"arch_", "arch", "int32_t", /*default=*/"0", "GPU architecture">,
ListOption<"architectures_", "arch", "uint32_t", "GPU architectures">,
Option<"generate_fatbin_", "generate-fatbin", "bool", /*default=*/"true",
"Bundle machine code for the different architectures in one "
"fatbin.">,
];
let constructor = "transforms::CreateGpuKernelToBlobPass()";
}


@@ -306,9 +306,6 @@ def _gen_unranked_kernel_fatbin_impl(ctx):
archs_trimmed.append(arch[3:])
arch_flag = ",".join(archs_trimmed)
# TODO(b/169066682): Generate Fatbin when lowering GPU module.
arch_flag = "75"
filename = "%s.a" % (name)
gpu_bin = ctx.outputs.output
ctx.actions.run(


@@ -106,6 +106,10 @@ cc_library(
# an intermediate target.
cc_library(name = "ptxas_wrapper")
# Buildozer can not remove dependencies inside select guards, so we have to use
# an intermediate target.
cc_library(name = "fatbinary_wrapper")
cc_library(
name = "cuda_driver",
srcs = if_cuda_is_configured(["cuda_driver.cc"]),


@@ -251,6 +251,7 @@ cc_library(
]) + if_cuda_is_configured([
"//tensorflow/stream_executor/cuda:cuda_driver",
"//tensorflow/stream_executor/cuda:ptxas_wrapper",
"//tensorflow/stream_executor/cuda:fatbinary_wrapper",
]),
)


@@ -140,34 +140,44 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
const char* ptx_contents,
GpuAsmOpts options) {
std::string ptxas_path;
auto env = tensorflow::Env::Default();
std::string ptxas_binary_name = "ptxas";
static std::string findCudaExecutable(const std::string binary_name,
const std::string preferred_cuda_dir) {
#if defined(PLATFORM_WINDOWS)
ptxas_binary_name += ".exe";
const std::string binary_filename = binary_name + ".exe";
#else
const std::string& binary_filename = binary_name;
#endif
// Search in cuda root candidates.
auto env = tensorflow::Env::Default();
std::string binary_path;
for (const std::string& cuda_root :
tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) {
ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", ptxas_binary_name);
VLOG(2) << "Looking for ptxas at " << ptxas_path;
if (env->FileExists(ptxas_path).ok()) {
tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
if (env->FileExists(binary_path).ok()) {
break;
}
}
if (!env->FileExists(ptxas_path).ok()) {
if (!env->FileExists(binary_path).ok()) {
// Rely on subprocess invocation to find the correct binary.
ptxas_path = ptxas_binary_name;
binary_path = binary_filename;
}
VLOG(2) << "Using ptxas at " << ptxas_path;
VLOG(2) << "Using " << binary_filename << " at " << binary_path;
return binary_path;
}
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
const char* ptx_contents,
GpuAsmOpts options) {
std::string ptxas_path =
findCudaExecutable("ptxas", options.preferred_cuda_dir);
WarnIfBadPtxasVersion(ptxas_path);
// Write ptx into a temporary file.
std::string ptx_path;
auto env = tensorflow::Env::Default();
if (!env->LocalTempFilename(&ptx_path)) {
return port::InternalError("couldn't get temp PTX file name");
}
@@ -232,4 +242,78 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
return cubin_vector;
}
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir) {
std::string fatbinary_path =
findCudaExecutable("fatbinary", preferred_cuda_dir);
// Write images to temporary files.
std::vector<std::string> image_paths;
auto env = tensorflow::Env::Default();
for (const CubinOrPTXImage& img : images) {
std::string img_path;
if (!env->LocalTempFilename(&img_path)) {
return port::InternalError(
"Could not get temporary filenames for images.");
}
TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
VLOG(2) << "image written to " << img_path;
image_paths.push_back(std::move(img_path));
}
auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
for (const auto& path : image_paths) {
TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
}
});
// Prepare temporary result file.
std::string result_path;
if (!env->LocalTempFilename(&result_path)) {
return port::InternalError(
"Could not get temporary filename for fatbin result.");
}
auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
// This file may never be created, so the failure to delete it should not
// propagate to TF.
tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
});
// Invoke fatbinary and collect its output.
tensorflow::SubProcess fatbinary;
std::vector<std::string> fatbinary_args = {
fatbinary_path, "--64", "--cmdline=--compile-only",
"--link", "--compress-all", absl::StrCat("--create=", result_path)};
assert(images.size() == image_paths.size());
for (int i = 0; i < images.size(); i++) {
fatbinary_args.push_back(absl::StrFormat(
"--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
}
if (VLOG_IS_ON(3)) {
VLOG(3) << absl::StrJoin(fatbinary_args, " ");
}
fatbinary.SetProgram(fatbinary_path, fatbinary_args);
fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
if (!fatbinary.Start()) {
return port::InternalError("Failed to launch fatbinary.");
}
std::string stderr_output;
int exit_status = fatbinary.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
if (exit_status != 0) {
return port::InternalError(absl::StrFormat(
"fatbinary exited with non-zero error code %d, output: %s", exit_status,
stderr_output));
}
if (!stderr_output.empty()) {
VLOG(2) << stderr_output;
}
// Read in the result and return it as a byte vector.
std::string result_blob;
TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
result_path, &result_blob));
return std::vector<uint8>(result_blob.begin(), result_blob.end());
}
} // namespace stream_executor


@@ -52,6 +52,16 @@ port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
int device_ordinal, const char* ptx, GpuAsmOpts compilation_options);
struct CubinOrPTXImage {
std::string profile;
std::vector<uint8> bytes;
};
// Bundles the GPU machine code (cubins) and PTX if requested and returns the
// resulting binary (i.e. a fatbin) as a byte array.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir);
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
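
(For reference: a minimal usage sketch of the new BundleGpuAsm helper, not part of this change. The wrapper function and the hard-coded sm_70/sm_75 profiles are illustrative.)

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

namespace stream_executor {

// Hypothetical helper: bundle cubins compiled for sm_70 and sm_75 into a
// single fatbin. BundleGpuAsm locates the `fatbinary` tool under
// preferred_cuda_dir (or the other candidate CUDA roots) and returns the
// resulting fatbin bytes.
port::StatusOr<std::vector<uint8>> BundleTwoCubins(
    std::vector<uint8> sm70_cubin, std::vector<uint8> sm75_cubin,
    const std::string& preferred_cuda_dir) {
  std::vector<CubinOrPTXImage> images;
  images.push_back({"sm_70", std::move(sm70_cubin)});
  images.push_back({"sm_75", std::move(sm75_cubin)});
  return BundleGpuAsm(std::move(images), preferred_cuda_dir);
}

}  // namespace stream_executor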