diff --git a/tensorflow/core/kernels/mlir_generated/build_defs.bzl b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
index f59df3d7f3c..81b869ba624 100644
--- a/tensorflow/core/kernels/mlir_generated/build_defs.bzl
+++ b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
@@ -23,10 +23,10 @@ def _lookup_file(filegroup, path):
 
 GpuBinaryInfo = provider(
     "GPU binaries in either cubin format or hsaco format",
-    fields = ["cubins", "hsacos"],
+    fields = ["gpu_bins"],
 )
 
-def _gen_kernel_cubin_impl_cuda(ctx):
+def _gen_kernel_gpu_bin_impl(ctx):
     name = ctx.attr.name
     tile_sizes = ctx.attr.tile_size.replace("x", ",")
     cmd_args = []
@@ -35,56 +35,29 @@ def _gen_kernel_cubin_impl_cuda(ctx):
     if ctx.attr.unroll_factors:
         cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
 
-    cubins = []
+    gpu_bins = []
     for arch in ctx.attr.gpu_archs:
         # TODO(b/152737872): 'compute_' should generate both SASS and PTX.
         arch = arch.replace("compute_", "sm_")
-        filename = "%s.%s.cubin" % (name, arch)
-        cubin = ctx.actions.declare_file(filename)
+        filename = "%s.%s.bin" % (name, arch)
+        gpu_bin = ctx.actions.declare_file(filename)
         ctx.actions.run(
             inputs = [ctx.file.mlir_op, ctx.file._tfso],
-            outputs = [cubin],
+            outputs = [gpu_bin],
             executable = ctx.executable._tool,
             arguments = cmd_args + [
                 "--tile_sizes=%s" % tile_sizes,
-                "--arch=%s" % arch.split("_")[1],
+                # For ROCM, remove the "gfx" prefix. For CUDA, remove the "sm_" prefix.
+                "--arch=%s" % arch[3:],
                 "--input=%s" % ctx.file.mlir_op.path,
-                "--output=%s" % cubin.path,
+                "--output=%s" % gpu_bin.path,
             ],
             mnemonic = "compile",
         )
-        cubins.append(cubin)
-    return [GpuBinaryInfo(cubins = cubins)]
+        gpu_bins.append(gpu_bin)
+    return [GpuBinaryInfo(gpu_bins = gpu_bins)]
 
-def _gen_kernel_cubin_impl_rocm(ctx):
-    name = ctx.attr.name
-    tile_sizes = ctx.attr.tile_size.replace("x", ",")
-    cmd_args = []
-    if ctx.attr.same_shape:
-        cmd_args.append("--same_shape=%s" % ctx.attr.same_shape)
-    if ctx.attr.unroll_factors:
-        cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
-
-    hsacos = []
-    for arch in ctx.attr.gpu_archs:
-        filename = "%s.%s.hsaco" % (name, arch)
-        hsaco = ctx.actions.declare_file(filename)
-        ctx.actions.run(
-            inputs = [ctx.file.mlir_op, ctx.file._tfso],
-            outputs = [hsaco],
-            executable = ctx.executable._tool,
-            arguments = cmd_args + [
-                "--tile_sizes=%s" % tile_sizes,
-                "--arch=%s" % arch[3:],  # DDD in "gfxDDD"
-                "--input=%s" % ctx.file.mlir_op.path,
-                "--output=%s" % hsaco.path,
-            ],
-            mnemonic = "compile",
-        )
-        hsacos.append(hsaco)
-    return [GpuBinaryInfo(hsacos = hsacos)]
-
-_gen_kernel_cubin_rule = rule(
+_gen_kernel_gpu_bin_rule = rule(
     attrs = {
         "mlir_op": attr.label(mandatory = True, allow_single_file = True),
         "tile_size": attr.string(mandatory = True),
@@ -103,12 +76,12 @@ _gen_kernel_cubin_rule = rule(
         ),
     },
     output_to_genfiles = True,
-    implementation = _gen_kernel_cubin_impl_rocm if rocm_is_configured() else _gen_kernel_cubin_impl_cuda,
+    implementation = _gen_kernel_gpu_bin_impl,
 )
 
 def _gen_kernel_image_hdr_impl_cuda(ctx):
     images = []
-    for cubin in ctx.attr.input[GpuBinaryInfo].cubins:
+    for cubin in ctx.attr.input[GpuBinaryInfo].gpu_bins:
         arch = cubin.path.split(".")[-2]
         images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
 
@@ -116,7 +89,7 @@
     fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
     ctx.actions.run(
         outputs = [fatbin],
-        inputs = ctx.attr.input[GpuBinaryInfo].cubins,
+        inputs = ctx.attr.input[GpuBinaryInfo].gpu_bins,
         executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
         arguments = [
             "--64",
@@ -146,7 +119,7 @@ def _gen_kernel_image_hdr_impl_rocm(ctx):
     hsaco_files.append("/dev/null")
     hsaco_targets.append("host-x86_64-unknown-linux")
 
-    hsacos = ctx.attr.input[GpuBinaryInfo].hsacos
+    hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins
     for hsaco in hsacos:
         gfx_arch = hsaco.path.split(".")[-2]
         hsaco_files.append(hsaco.path)
@@ -196,23 +169,22 @@ _gen_kernel_image_hdr_rule = rule(
     },
 )
 
-def _gen_kernel_image_hdr(name, mlir_op, tile_size, same_shape = None, unroll_factors = None):
+def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, same_shape = None, unroll_factors = None):
     """Generates a C header with fatbin data from a Tensorflow op."""
-    if cuda_gpu_architectures() or rocm_gpu_architectures():
-        _gen_kernel_cubin_rule(
-            name = name + "_cubin",
-            mlir_op = mlir_op,
-            tile_size = tile_size,
-            same_shape = same_shape,
-            unroll_factors = unroll_factors,
-            gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
-        )
-        _gen_kernel_image_hdr_rule(
-            name = name,
-            input = ":" + name + "_cubin",
-            out = "%s.h" % name,
-            symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
-        )
+    _gen_kernel_gpu_bin_rule(
+        name = name + "_cubin",
+        mlir_op = mlir_op,
+        tile_size = tile_size,
+        same_shape = same_shape,
+        unroll_factors = unroll_factors,
+        gpu_archs = gpu_archs,
+    )
+    _gen_kernel_image_hdr_rule(
+        name = name,
+        input = ":" + name + "_cubin",
+        out = "%s.h" % name,
+        symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
+    )
 
 def _gen_mlir_op_impl(ctx):
     ctx.actions.run_shell(
@@ -264,6 +236,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], same_shape = None, unr
         _gen_kernel_image_hdr(
             name = "{name}_{type}_kernel".format(name = name, type = type),
             mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
+            gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
             tile_size = tile_size,
            same_shape = same_shape,
            unroll_factors = unroll_factors,
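
Note on the unified --arch handling above: both the CUDA names (compute_XX, rewritten to sm_XX inside the rule) and the ROCm names (gfxXXX) carry a three-character prefix, which is why a single arch[3:] slice replaces the previous arch.split("_")[1]. A minimal sketch of that convention in plain Python, outside build_defs.bzl; the helper name strip_arch_prefix is hypothetical:

    # Sketch only: mirrors how _gen_kernel_gpu_bin_impl derives the --arch value.
    def strip_arch_prefix(arch):
        # The rule first rewrites "compute_" to "sm_" (see the TODO(b/152737872)
        # context line), so CUDA names arrive as "sm_DD" and ROCm names as "gfxDDD".
        arch = arch.replace("compute_", "sm_")
        # "sm_" and "gfx" are both three characters long, hence arch[3:].
        return arch[3:]

    assert strip_arch_prefix("compute_75") == "75"  # CUDA virtual architecture
    assert strip_arch_prefix("sm_70") == "70"       # CUDA real architecture
    assert strip_arch_prefix("gfx906") == "906"     # ROCm architecture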
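
For context, the public entry point gen_kernel_library keeps its signature; only the internal macros change, with gpu_archs now selected at this top level depending on whether ROCm or CUDA is configured. A hypothetical BUILD invocation, with illustrative type and tile-size values that are not taken from this change:

    load("//tensorflow/core/kernels/mlir_generated:build_defs.bzl", "gen_kernel_library")

    gen_kernel_library(
        name = "tanh",
        types = ["f16", "f32", "f64"],  # illustrative element types
        tile_size = "256",              # illustrative tile size
        unroll_factors = "4",           # optional, illustrative
    )

Each listed type expands through _gen_kernel_image_hdr into a generated header such as tanh_f32_kernel.h (following the "{name}_{type}_kernel" naming in the macro) that embeds the fatbin or hsaco image for the configured GPU architectures.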