diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index d4269c336e9..27a8dbd2809 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -40,6 +40,7 @@ cc_library(
 tf_cc_binary(
     name = "tf_to_cubin",
     srcs = ["tf_to_cubin.cc"],
+    visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"],
     deps = [
         ":cubin_creator",
         "//tensorflow/core:framework_internal",
diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl
new file mode 100644
index 00000000000..b09c515c883
--- /dev/null
+++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl
@@ -0,0 +1,126 @@
+"""Generates cubin headers for TF dialect ops."""
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
+
+def _lookup_file(filegroup, path):
+    """Returns the file in `filegroup` whose path ends with `path`.
+
+    Fails the build with a descriptive message if no file matches, instead of
+    returning None and letting the action that consumes it fail obscurely.
+    """
+    for file in filegroup.files.to_list():
+        if file.path.endswith(path):
+            return file
+    fail("File '%s' not found in %s" % (path, filegroup.label))
+
+def _gen_kernel_image_hdr_impl(ctx):
+    """Builds one cubin per GPU architecture and embeds their fatbin in a header.
+
+    Runs the `_tool` binary once per entry of `gpu_archs` to emit a cubin,
+    bundles all cubins into one fatbin with `fatbinary`, and dumps that fatbin
+    with `bin2c` as a C array named `symbol` into the `out` header.
+    """
+    if not ctx.attr.gpu_archs:
+        fail("No GPU architecture specified, use --config=cuda or similar")
+
+    name = ctx.attr.name
+
+    # The attribute separates tile sizes with "x"; the tool takes them comma-separated.
+    tile_sizes = ctx.attr.tile_size.replace("x", ",")
+    same_shape = []
+    if ctx.attr.same_shape:
+        same_shape.append("--same_shape=%s" % ctx.attr.same_shape)
+
+    cubins = []
+    images = []
+    for arch in ctx.attr.gpu_archs:
+        filename = "%s.%s.cubin" % (name, arch)
+        cubin = ctx.actions.declare_file(filename)
+        ctx.actions.run(
+            outputs = [cubin],
+            executable = ctx.executable._tool,
+            arguments = same_shape + [
+                "--tile_sizes=%s" % tile_sizes,
+                # Only the part after the underscore ("35" of "sm_35") is passed.
+                "--arch=%s" % arch.split("_")[1],
+                "--output=%s" % cubin.path,
+                ctx.attr.op,
+            ],
+            mnemonic = "compile",
+        )
+        cubins.append(cubin)
+        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
+
+    # Generate fatbin file from all cubins.
+    fatbin = ctx.actions.declare_file("%s.fatbin" % name)
+    ctx.actions.run(
+        outputs = [fatbin],
+        inputs = cubins,
+        executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"),
+        arguments = [
+            "--64",
+            "--cmdline=--compile-only",
+            "--link",
+            "--compress-all",
+            "--create=%s" % fatbin.path,
+        ] + images,
+        mnemonic = "fatbinary",
+    )
+
+    # Convert the fatbin into a C array; the command redirects bin2c's stdout into the header.
+    bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c")
+    ctx.actions.run_shell(
+        outputs = [ctx.outputs.out],
+        inputs = [fatbin],
+        tools = [bin2c],
+        command = "%s --static --const --type=int --name=%s %s 1> %s" %
+                  (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
+        mnemonic = "bin2c",
+    )
+
+_gen_kernel_image_hdr = rule(
+    implementation = _gen_kernel_image_hdr_impl,
+    output_to_genfiles = True,
+    attrs = {
+        "op": attr.string(mandatory = True),
+        "tile_size": attr.string(mandatory = True),
+        "same_shape": attr.string(),
+        "out": attr.output(mandatory = True),
+        "symbol": attr.string(mandatory = True),
+        "gpu_archs": attr.string_list(mandatory = True),
+        "_cuda_root": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda_root"),
+        ),
+        "_tool": attr.label(
+            executable = True,
+            default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"),
+            cfg = "host",
+        ),
+    },
+)
+
+def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None):
+    """Generates a C header with fatbin data from a Tensorflow op.
+
+    Args:
+      name: Target name. The header is written to `name` + ".h" and the array
+        symbol is "k" plus the title-cased `name` with underscores removed.
+      op: Op passed as the positional argument of the tf_to_cubin tool.
+      tile_size: Tile sizes separated by "x".
+      tags: Tags forwarded to the underlying rule.
+      same_shape: Optional value for the tool's --same_shape flag.
+    """
+
+    # Instantiate the rule directly: a rule call made inside the argument of a
+    # select() is evaluated unconditionally, so such a wrapper guards nothing.
+    # Without a configured GPU architecture the rule fails at analysis time.
+    _gen_kernel_image_hdr(
+        name = name,
+        op = op,
+        tile_size = tile_size,
+        same_shape = same_shape,
+        out = "%s.h" % name,
+        symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
+        gpu_archs = cuda_gpu_architectures(),
+        tags = tags,
+    )
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index 9d17e1b8f35..92586dd7d11 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -166,6 +166,14 @@ cc_library(
     data = [":cuda-nvvm"],
 )
 
+filegroup(
+    name = "cuda_root",
+    srcs = [
+        "cuda/bin/fatbinary",
+        "cuda/bin/bin2c",
+    ],
+)
+
 bzl_library(
     name = "build_defs_bzl",
     srcs = ["build_defs.bzl"],
diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl
index 3280d6b041f..bba772e2377 100644
--- a/third_party/gpus/cuda/build_defs.bzl.tpl
+++ b/third_party/gpus/cuda/build_defs.bzl.tpl
@@ -51,6 +51,10 @@ def cuda_is_configured():
     """Returns true if CUDA was enabled during the configure process."""
     return %{cuda_is_configured}
 
+def cuda_gpu_architectures():
+    """Returns a list of supported GPU architectures."""
+    return %{cuda_gpu_architectures}
+
 def if_cuda_is_configured(x):
     """Tests if the CUDA was enabled during the configure process.
 
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index c587f117deb..aa8a2f0226d 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -714,6 +714,7 @@ def _create_dummy_repository(repository_ctx):
         {
             "%{cuda_is_configured}": "False",
             "%{cuda_extra_copts}": "[]",
+            "%{cuda_gpu_architectures}": "[]",
         },
     )
     _tpl(
@@ -842,6 +843,17 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
     ]
     return str(capability_flags)
 
+def _compute_cuda_gpu_architectures(repository_ctx, compute_capabilities):
+    """Returns the string form of the de-duplicated "sm_" architecture names."""
+    gpu_architectures = [
+        "sm_" + capability.replace(".", "")
+        for capability in compute_capabilities
+    ]
+
+    # Make the list unique.
+    gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys()
+    return str(gpu_architectures)
+
 def _tpl_path(repository_ctx, filename):
     return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename))
 
@@ -973,6 +985,10 @@ def _create_local_cuda_repository(repository_ctx):
                 repository_ctx,
                 cuda_config.compute_capabilities,
             ),
+            "%{cuda_gpu_architectures}": _compute_cuda_gpu_architectures(
+                repository_ctx,
+                cuda_config.compute_capabilities,
+            ),
         },
     )
 