[KERNEL_GEN] Remove any trace of ranked MLIR kernels.

PiperOrigin-RevId: 347379437
Change-Id: Ia840f07c6e01d3cbe19059fb02dc676fd0b67136
Authored by Alexander Belyaev on 2020-12-14 07:17:20 -08:00; committed by TensorFlower Gardener
parent 9618f238aa
commit fbcb024d7f
20 changed files with 36 additions and 855 deletions


@ -86,29 +86,6 @@ cc_library(
],
)
tf_cc_binary(
name = "tf_to_gpu_binary",
srcs = [
"crash_handler.h",
"tf_to_gpu_binary.cc",
],
visibility = [
"//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary:__pkg__",
"//tensorflow/core/kernels/mlir_generated:__pkg__",
],
deps = [
":kernel_creator",
"//tensorflow/compiler/mlir:init_mlir",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/core:lib",
"//tensorflow/core/platform",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/strings",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:Pass",
],
)
tf_cc_binary(
name = "tf_to_kernel",
srcs = ["tf_to_kernel.cc"],


@ -90,27 +90,17 @@ struct RemoveUnusedTensorToMemrefOperations
};
} // end anonymous namespace
Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
llvm::ArrayRef<uint32_t> tile_sizes,
Status LowerTFtoGPU(mlir::ModuleOp module, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> unroll_factors,
bool embed_memref_prints) {
mlir::PassManager pm(module.getContext());
applyTensorflowAndCLOptions(pm);
if (gpu_binary_only) {
pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
/*allow_partial_conversion=*/false, /*legalize_chlo=*/true));
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateMaterializeBroadcastsPass());
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateUnfuseBatchNormPass());
} else {
pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
/*allow_partial_conversion=*/false, /*legalize_chlo=*/false));
pm.addNestedPass<mlir::FuncOp>(mlir::createTransformUnrankedHloPass());
pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createChloLegalizeToHloPass());
pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
}
pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
/*allow_partial_conversion=*/false, /*legalize_chlo=*/false));
pm.addNestedPass<mlir::FuncOp>(mlir::createTransformUnrankedHloPass());
pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createChloLegalizeToHloPass());
pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
// Transform HLO operations to LinAlg.
pm.addNestedPass<mlir::FuncOp>(::mlir::mhlo::createLegalizeHloToLinalgPass());
@ -139,12 +129,10 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
pm.addPass(mlir::kernel_gen::transforms::CreateHloBufferizePass());
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
if (!gpu_binary_only) {
// Find candidates for buffer reuse. This is only successful if buffer size
// equality can be determined based on `linalg.generic` operations.
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateBufferReusePass());
}
// Find candidates for buffer reuse. This is only successful if buffer size
// equality can be determined based on `linalg.generic` operations.
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateBufferReusePass());
pm.addNestedPass<mlir::FuncOp>(
mlir::createLinalgTilingToParallelLoopsPass((tiling_for_unrolling)));
// Transform the Linalg ops inside of the loop nest into parallel loops.
@ -188,15 +176,13 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
std::make_unique<RemoveUnusedTensorToMemrefOperations>());
pm.addPass(mlir::createCanonicalizerPass());
pm.addNestedPass<mlir::FuncOp>(mlir::createCSEPass());
if (!gpu_binary_only) {
// Before inserting more allocs, map the ones we already have to the
// tf runtime. That ensures that all allocations for the actual computation
// end up on the device, whereas allocations for shape computation and host
// side things remain on the host.
// Longer term, this should be handled by proper device placement.
pm.addPass(mlir::kernel_gen::tf_framework::
CreateEmbedTFFrameworkFunctionAndAllocPass());
}
// Before inserting more allocs, map the ones we already have to the
// tf runtime. That ensures that all allocations for the actual computation
// end up on the device, whereas allocations for shape computation and host
// side things remain on the host.
// Longer term, this should be handled by proper device placement.
pm.addPass(mlir::kernel_gen::tf_framework::
CreateEmbedTFFrameworkFunctionAndAllocPass());
pm.addPass(mlir::kernel_gen::transforms::CreateFinalBufferizePass());
pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass(64));
// TODO(herhut): Depends on https://bugs.llvm.org/show_bug.cgi?id=48385.
@ -223,11 +209,6 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
// Take launches to launches with kernels.
pm.addPass(::mlir::createGpuKernelOutliningPass());
if (gpu_binary_only) {
// Make kernel signature deterministic so that we can call it externally.
pm.addNestedPass<::mlir::FuncOp>(
xla::mlir_gpu::createRewriteKernelSignaturePass());
}
pm.addPass(::mlir::createLowerAffinePass());
// Constraints are removed as late as possible and before lowering to CFG.
pm.addNestedPass<::mlir::FuncOp>(::mlir::createConvertShapeConstraintsPass());
@ -295,7 +276,7 @@ Status LowerHostSideToFinalForm(mlir::ModuleOp module) {
} // namespace
StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
mlir::MLIRContext& context, llvm::StringRef tf_code,
llvm::ArrayRef<std::string> architectures,
llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
@ -304,8 +285,8 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::RegisterAllTensorFlowDialects(registry);
registry.insert<mlir::chlo::HloClientDialect, mlir::mhlo::MhloDialect>();
mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context);
TF_RETURN_IF_ERROR(LowerTFtoGPU(module.get(), gpu_binary_only, tile_sizes,
unroll_factors, embed_memref_prints));
TF_RETURN_IF_ERROR(LowerTFtoGPU(module.get(), tile_sizes, unroll_factors,
embed_memref_prints));
#if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA)
return InternalError(
"Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
@ -321,9 +302,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
architectures, generate_fatbin,
print_ptx));
if (!gpu_binary_only) {
TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
}
TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
return module;
}


@ -33,11 +33,9 @@ limitations under the License.
namespace tensorflow {
namespace kernel_gen {
// Converts TF code to LLVM/NVVM. If `gpu_binary_only` is true, then the
// conversion stops after gpu_binary blob is generated. If `gpu_binary_only` is
// false, lowers the host side to LLVM Dialect.
// Converts TF code to LLVM/NVVM. Lowers the host side to LLVM Dialect.
xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
mlir::MLIRContext& context, llvm::StringRef tf_code,
llvm::ArrayRef<std::string> architectures = {"sm_75"},
llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
llvm::ArrayRef<uint32_t> unroll_factors = {},
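
For illustration, a minimal caller sketch (not part of this commit) of how the entry point is invoked now that the gpu_binary_only flag is gone; it assumes tf_code, architectures, tile_sizes, unroll_factors, and embed_memref_prints are set up as in the tf_to_kernel.cc caller further down in this diff:

  mlir::MLIRContext context;
  TF_ASSIGN_OR_RETURN(
      mlir::OwningModuleRef module,
      GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
                              unroll_factors, embed_memref_prints,
                              /*generate_fatbin=*/true, /*print_ptx=*/false));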


@ -1,17 +0,0 @@
load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
package(licenses = ["notice"])
glob_lit_tests(
data = [
"//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary",
"@llvm-project//mlir:run_lit.sh",
],
default_tags = [
# We need access to the CUDA SDK.
"gpu",
"no_rocm",
],
driver = "//tensorflow/compiler/mlir:run_lit.sh",
test_file_exts = ["mlir"],
)


@ -1,6 +0,0 @@
// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
func @abs(%arg0: tensor<?xf16>) -> tensor<?xf16> attributes {tf_entry} {
%0 = "tf.Abs"(%arg0) { }
: (tensor<?xf16>) -> tensor<?xf16>
return %0 : tensor<?xf16>
}


@ -1,6 +0,0 @@
// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
func @ceil(%arg0: tensor<?xf64>) -> tensor<?xf64> attributes {tf_entry} {
%0 = "tf.Ceil"(%arg0) { }
: (tensor<?xf64>) -> tensor<?xf64>
return %0 : tensor<?xf64>
}


@ -1,5 +0,0 @@
// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
func @tanh(%arg0: tensor<?xf32>) -> tensor<?xf32> attributes {tf_entry} {
%0 = "tf.Tanh"(%arg0) : (tensor<?xf32>) -> tensor<?xf32>
return %0 : tensor<?xf32>
}


@ -1,96 +0,0 @@
// Copyright 2020 The TensorFlow Runtime Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//===- tf_to_gpu_binary.cc --------------------------------------*- C++ -*-===//
//
// This file implements the entry point to compile a tf op to a gpu binary
//
//===----------------------------------------------------------------------===//
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/string_view.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Pass/PassManager.h" // from @llvm-project
#include "tensorflow/compiler/mlir/init_mlir.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/crash_handler.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace tensorflow {
namespace kernel_gen {
namespace {
xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
std::string architecture, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> unroll_factors) {
// Read TF code.
std::string tf_code;
TF_RETURN_IF_ERROR(
ReadFileToString(Env::Default(), input_file.str(), &tf_code));
// Compile.
mlir::MLIRContext context;
TF_ASSIGN_OR_RETURN(
mlir::OwningModuleRef module,
GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true,
architecture, tile_sizes, unroll_factors,
/*embed_memref_prints=*/false,
/*generate_fatbin=*/false));
// Extract gpu_binary.
TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module));
// Write gpu_binary blob.
TF_RETURN_IF_ERROR(
WriteStringToFile(Env::Default(), output_file.str(), gpu_binary));
return xla::Status::OK();
}
} // namespace
} // namespace kernel_gen
} // namespace tensorflow
int main(int argc, char** argv) {
tensorflow::kernel_gen::SetCrashReportMessage();
llvm::cl::opt<std::string> input_file("input", llvm::cl::desc("input file"),
llvm::cl::value_desc("filename"),
llvm::cl::init("foo.mlir"));
llvm::cl::opt<std::string> output_file(
"output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"),
llvm::cl::init("foo.bin"));
llvm::cl::opt<std::string> architecture(
"arch", llvm::cl::desc("target architecture (e.g. sm_50)"),
llvm::cl::init("sm_50"));
llvm::cl::list<uint32_t> tile_sizes(
"tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore,
llvm::cl::CommaSeparated);
llvm::cl::list<uint32_t> unroll_factors(
"unroll_factors",
llvm::cl::desc("factors to unroll by, separated by commas"),
llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated);
tensorflow::InitMlir y(&argc, &argv);
mlir::registerPassManagerCLOptions();
llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n");
auto status = tensorflow::kernel_gen::Run(
input_file, output_file, architecture, tile_sizes, unroll_factors);
if (!status.ok()) {
LOG(ERROR) << status;
return 1;
}
return 0;
}


@ -115,9 +115,9 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
mlir::MLIRContext context;
TF_ASSIGN_OR_RETURN(
mlir::OwningModuleRef module,
GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false,
architectures, tile_sizes, unroll_factors,
embed_memref_prints, /*generate_fatbin=*/true,
GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
unroll_factors, embed_memref_prints,
/*generate_fatbin=*/true,
/*print_ptx=*/print_ptx));
// Get binary.
TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));


@ -77,20 +77,16 @@ cc_library(
"embed_memref_prints.cc",
"embed_tf_framework_pass.cc",
"gpu_kernel_to_blob_pass.cc",
"materialize_broadcasts_pass.cc",
"parallel_loops_to_sequential.cc",
"same_shape_propagation.cc",
"shape_to_descriptors_pass.cc",
"tensorflow_abi_knowledge_propagation.cc",
"tf_kernel_to_llvm_pass.cc",
"unfuse_batch_norm_pass.cc",
],
hdrs = ["passes.h"],
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]),
deps = [
"@llvm-project//mlir:Affine",
"//tensorflow/compiler/mlir/hlo:materialize_broadcasts", # buildcleaner: keep
"//tensorflow/compiler/mlir/hlo:unfuse_batch_norm", # buildcleaner: keep
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla:debug_options_flags",
"//tensorflow/compiler/xla:statusor",


@ -1,61 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {
#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
struct MaterializeBroadcastsPass
: public MaterializeBroadcastsPassBase<MaterializeBroadcastsPass> {
void runOnFunction() override {
mlir::ConversionTarget conversionTarget(getContext());
mlir::OwningRewritePatternList conversionPatterns;
// Consider the mhlo dialect legal for tests.
conversionTarget.addLegalDialect<mlir::mhlo::MhloDialect>();
// The conversion uses helpers from the Standard dialect.
conversionTarget.addLegalDialect<mlir::StandardOpsDialect>();
mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(),
&conversionTarget);
mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(),
&conversionPatterns);
if (failed(applyPartialConversion(getFunction(), conversionTarget,
std::move(conversionPatterns)))) {
return signalPassFailure();
}
}
};
} // namespace
std::unique_ptr<mlir::FunctionPass> CreateMaterializeBroadcastsPass() {
return std::make_unique<MaterializeBroadcastsPass>();
}
} // namespace transforms
} // namespace kernel_gen
} // namespace mlir


@ -62,9 +62,6 @@ std::unique_ptr<OperationPass<ModuleOp>> CreateHloBufferizePass();
// buffers.
std::unique_ptr<OperationPass<ModuleOp>> CreateFinalBufferizePass();
// Pass to materialize broadcasts.
std::unique_ptr<FunctionPass> CreateMaterializeBroadcastsPass();
// Pass to convert scf::ParallelOp to scf::ForOp.
std::unique_ptr<FunctionPass> CreateParallelLoopsToSequential();
@ -74,9 +71,6 @@ std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
bool print_ptx = false);
// Pass to unfuse batch norm.
std::unique_ptr<FunctionPass> CreateUnfuseBatchNormPass();
// Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
std::unique_ptr<FunctionPass> CreatePropagateTfAbiKnowledgeToKernels();


@ -61,16 +61,6 @@ def FinalBufferizePass : Pass<"final-bufferize", "ModuleOp"> {
let constructor = "transforms::CreateFinalBufferizePass()";
}
def MaterializeBroadcastsPass : FunctionPass<"materialize-broadcast"> {
let summary = "Pass to materialize broadcasts";
let constructor = "transforms::CreateMaterializeBroadcastsPass()";
}
def UnfuseBatchNormPass : FunctionPass<"unfuse-batch-norm"> {
let summary = "Pass to unfuse batch norm";
let constructor = "transforms::CreateUnfuseBatchNormPass()";
}
def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> {
let summary = "Pass to annotate GPU Module with its PTX";
let options = [


@ -1,45 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {
#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
struct UnfuseBatchNormPass
: public UnfuseBatchNormPassBase<UnfuseBatchNormPass> {
void runOnFunction() override {
mlir::OwningRewritePatternList patterns;
mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns);
mlir::applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
};
} // namespace
std::unique_ptr<mlir::FunctionPass> CreateUnfuseBatchNormPass() {
return std::make_unique<UnfuseBatchNormPass>();
}
} // namespace transforms
} // namespace kernel_gen
} // namespace mlir


@ -261,7 +261,6 @@ tf_cuda_cc_test(
gen_kernel_library(
name = "abs",
generate_unranked = True,
tile_size = "256",
types = [
"f16",
@ -275,7 +274,6 @@ gen_kernel_library(
gen_kernel_library(
name = "conj",
generate_unranked = True,
tile_size = "256",
types = [
"f32",
@ -286,7 +284,6 @@ gen_kernel_library(
gen_kernel_library(
name = "imag",
generate_unranked = True,
tile_size = "256",
types = [
"f32",
@ -296,7 +293,6 @@ gen_kernel_library(
gen_kernel_library(
name = "invert",
generate_unranked = True,
tile_size = "256",
types = [
"i8",
@ -309,8 +305,6 @@ gen_kernel_library(
gen_kernel_library(
name = "is_inf",
generate_ranked = False,
generate_unranked = True,
tile_size = "256",
types = [
"f16",
@ -322,14 +316,12 @@ gen_kernel_library(
gen_kernel_library(
name = "logical_not",
generate_unranked = True,
tile_size = "256",
types = ["i1"],
)
gen_kernel_library(
name = "real",
generate_unranked = True,
tile_size = "256",
types = [
"f32",
@ -339,7 +331,6 @@ gen_kernel_library(
gen_kernel_library(
name = "sign",
generate_unranked = True,
tile_size = "256",
types = [
# TODO(b/162577610): Add bf16, c64 and c128.
@ -354,8 +345,6 @@ gen_kernel_library(
gen_kernel_library(
name = "add_v2",
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"f16",
@ -371,8 +360,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"i8",
@ -401,8 +388,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"i1",
@ -419,8 +404,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"f16",
@ -444,8 +427,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"f16",
@ -470,8 +451,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_ranked = False,
generate_unranked = True,
tile_size = "256,1,1",
types = [
"f16",
@ -494,7 +473,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_unranked = True,
tile_size = "256",
types = [
"f16",
@ -516,7 +494,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_unranked = True,
tile_size = "256",
types = [
"f16",
@ -541,7 +518,6 @@ gen_kernel_library(
[
gen_kernel_library(
name = name,
generate_unranked = True,
tile_size = "256",
types = [
"f16",


@ -31,168 +31,6 @@ GpuBinaryInfo = provider(
fields = ["gpu_bins"],
)
def _gen_kernel_gpu_bin_impl(ctx):
name = ctx.attr.name
tile_sizes = ctx.attr.tile_size.replace("x", ",")
cmd_args = []
if ctx.attr.unroll_factors:
cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
if ctx.attr.extra_args:
cmd_args.extend(ctx.attr.extra_args)
gpu_bins = []
for arch in ctx.attr.gpu_archs:
# TODO(b/170283783): 'compute_' should generate both SASS and PTX.
arch = arch.replace("compute_", "sm_")
filename = "%s.%s.bin" % (name, arch)
gpu_bin = ctx.actions.declare_file(filename)
ctx.actions.run(
inputs = [ctx.file.mlir_op, ctx.file._tfso],
outputs = [gpu_bin],
executable = ctx.executable._tool,
arguments = cmd_args + [
"--tile_sizes=%s" % tile_sizes,
"--arch=%s" % arch,
"--input=%s" % ctx.file.mlir_op.path,
"--output=%s" % gpu_bin.path,
],
mnemonic = "compile",
)
gpu_bins.append(gpu_bin)
return [GpuBinaryInfo(gpu_bins = gpu_bins)]
_gen_kernel_gpu_bin_rule = rule(
attrs = {
"mlir_op": attr.label(mandatory = True, allow_single_file = True),
"tile_size": attr.string(mandatory = True),
"unroll_factors": attr.string(),
"gpu_archs": attr.string_list(mandatory = True),
"extra_args": attr.string_list(),
"_tfso": attr.label(
default = Label("//tensorflow:libtensorflow_framework.so.2"),
cfg = "host",
allow_single_file = True,
),
"_tool": attr.label(
executable = True,
default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary"),
cfg = "host",
),
},
output_to_genfiles = True,
implementation = _gen_kernel_gpu_bin_impl,
)
def _gen_kernel_image_hdr_impl_cuda(ctx):
images = []
for cubin in ctx.attr.input[GpuBinaryInfo].gpu_bins:
arch = cubin.path.split(".")[-2]
images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
# Generate fatbin file from all cubins.
fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
ctx.actions.run(
outputs = [fatbin],
inputs = ctx.attr.input[GpuBinaryInfo].gpu_bins,
executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
arguments = [
"--64",
"--cmdline=--compile-only",
"--link",
"--compress-all",
"--create=%s" % fatbin.path,
] + images,
mnemonic = "fatbinary",
)
bin2c = _lookup_file(ctx.attr._gpu_root, "bin/bin2c")
ctx.actions.run_shell(
outputs = [ctx.outputs.out],
inputs = [fatbin],
tools = [bin2c],
command = "%s --static --const --type=char --name=%s %s 1> %s" %
(bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
mnemonic = "bin2c",
)
def _gen_kernel_image_hdr_impl_rocm(ctx):
hsaco_files = []
hsaco_targets = []
# Add a dummy host target triple...clang-offload-bundler requires 1 and only 1 host target triple
hsaco_files.append("/dev/null")
hsaco_targets.append("host-x86_64-unknown-linux")
hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins
for hsaco in hsacos:
gfx_arch = hsaco.path.split(".")[-2]
hsaco_files.append(hsaco.path)
hsaco_targets.append("hip-amdgcn-amd-amdhsa-%s" % gfx_arch)
# Generate fatbin file from all hsacos.
fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
ctx.actions.run(
outputs = [fatbin],
inputs = hsacos,
executable = _lookup_file(ctx.attr._gpu_root, "bin/clang-offload-bundler"),
arguments = [
"--inputs=%s" % ",".join(hsaco_files),
"--targets=%s" % ",".join(hsaco_targets),
"--type=o",
"--outputs=%s" % fatbin.path,
],
mnemonic = "fatbinary",
)
ctx.actions.run_shell(
outputs = [ctx.outputs.out],
inputs = [fatbin],
command = (
("hex=`hexdump -v -e \'/1 \"0x%%02x, \"\' %s` && " +
"len=`echo $hex | wc -c` && " +
"echo 'static const unsigned char %s['$len' + 1] = {' > %s && " +
"echo $hex | cat >> %s && " +
"echo '};' >> %s") % (
fatbin.path,
ctx.attr.symbol,
ctx.outputs.out.path,
ctx.outputs.out.path,
ctx.outputs.out.path,
)
),
)
_gen_kernel_image_hdr_rule = rule(
implementation = _gen_kernel_image_hdr_impl_rocm if rocm_is_configured() else _gen_kernel_image_hdr_impl_cuda,
output_to_genfiles = True,
attrs = {
"input": attr.label(mandatory = True, providers = [GpuBinaryInfo]),
"out": attr.output(mandatory = True),
"symbol": attr.string(mandatory = True),
"_gpu_root": attr.label(
default = Label("@local_config_rocm//rocm:rocm_root") if rocm_is_configured() else Label("@local_config_cuda//cuda:cuda_root"),
),
},
)
def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, unroll_factors = None, extra_args = []):
"""Generates a C header with fatbin data from a Tensorflow op."""
_gen_kernel_gpu_bin_rule(
name = name + "_cubin",
mlir_op = mlir_op,
tile_size = tile_size,
unroll_factors = unroll_factors,
gpu_archs = gpu_archs,
extra_args = extra_args,
)
_gen_kernel_image_hdr_rule(
name = name,
input = ":" + name + "_cubin",
out = "%s.h" % name,
symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
)
type_to_mlir = {
"c64": "complex<f32>",
"c128": "complex<f64>",
@ -204,18 +42,12 @@ def _gen_mlir_op_impl(ctx):
if mlir_type in type_to_mlir:
mlir_type = type_to_mlir[mlir_type]
# In order to generate a ranked kernel we change *xelem_type to ?xelem_type
# and remove element type from the entry function name.
convert_to_ranked = ""
if ctx.attr.unranked == False:
convert_to_ranked = "sed s/*x/?x/g | sed s/_elem_type//g |"
cmd = ctx.actions.run_shell(
inputs = [ctx.file.template],
outputs = [ctx.outputs.out],
command = (
("cat %s | %s sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s") % (
("cat %s | sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s") % (
ctx.file.template.path,
convert_to_ranked,
ctx.attr.type,
mlir_type,
ctx.outputs.out.path,
@ -244,40 +76,6 @@ def _gen_mlir_op(name, type, unranked):
unranked = unranked,
)
def gen_ranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
""" Generate a library with kernels for a specific tensorflow op.
Args:
name: The name of the tensorflow op.
types: The types ("f16", "f32", "f64") for which a kernel should be generated.
tile_size: The tiling specification, e.g. "16x16".
unroll_factors: The unrolling specification, e.g. "4,4"
tags: The tags which should be added to the library.
extra_args: Extra arguments to pass to the generator tool.
"""
if cuda_gpu_architectures() or rocm_gpu_architectures():
for type in types:
_gen_mlir_op(
name = name,
type = type,
unranked = False,
)
_gen_kernel_image_hdr(
name = "{name}_{type}_kernel".format(name = name, type = type),
mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
tile_size = tile_size,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
native.cc_library(
name = name + "_kernels",
hdrs = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
tags = tags,
)
################################################################################
# Unranked kernels build rules.
################################################################################
@ -410,22 +208,12 @@ def gen_unranked_kernel_library(name, types, tile_size, tags = [], unroll_factor
tags = tags,
)
def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = [], generate_ranked = True, generate_unranked = False):
if (generate_ranked):
gen_ranked_kernel_library(
name = name,
types = types,
tile_size = tile_size,
tags = tags,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
if (generate_unranked):
gen_unranked_kernel_library(
name = name + "_unranked",
types = types,
tile_size = tile_size,
tags = tags,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
gen_unranked_kernel_library(
name = name + "_unranked",
types = types,
tile_size = tile_size,
tags = tags,
unroll_factors = unroll_factors,
extra_args = extra_args,
)


@ -1,40 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string>
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/mlir_generated/abs_f16_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/abs_f32_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/abs_f64_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/abs_i32_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/abs_i64_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
namespace tensorflow {
namespace {
GENERATE_OP_KERNEL_BASE(Abs);
} // namespace
GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F16, Eigen::half);
GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F32, float);
GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F64, double);
GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, I32, int32);
GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, I64, int64);
} // namespace tensorflow


@ -1,129 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
#include <memory>
#include <string>
#include <vector>
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
namespace tensorflow {
namespace {
Status CreateKernel(absl::string_view kernel_name, uint64_t num_args,
absl::string_view ptx, absl::Span<const uint8_t> cubin_data,
se::StreamExecutor* stream_exec,
std::unique_ptr<se::KernelBase>& kernel_base) {
se::MultiKernelLoaderSpec loader_spec(num_args);
if (!cubin_data.empty()) {
loader_spec.AddCudaCubinInMemory(
reinterpret_cast<const char*>(cubin_data.data()), kernel_name);
}
kernel_base.reset(new se::KernelBase(stream_exec));
return stream_exec->GetKernel(loader_spec, kernel_base.get());
}
struct LaunchConfig {
se::BlockDim blockDim;
se::ThreadDim threadDim;
};
LaunchConfig GetLaunchConfiguration(std::vector<uint64> tile_sizes,
std::vector<uint64> unrolling_factors,
std::vector<uint64> shape) {
LaunchConfig result;
// Ensure the vectors are length 3 and pad with ones.
tile_sizes.resize(3, 1);
unrolling_factors.resize(3, 1);
shape.resize(3, 1);
// The number of threads is given by the tiling size.
result.threadDim = se::ThreadDim(tile_sizes[0], tile_sizes[1], tile_sizes[2]);
// We know that the kernel was generated by mapping the three outer-most
// dimensions to x,y,z dimensions. So we only need to compute those.
std::vector<int> block_dims(3);
for (int i = 0; i < 3; ++i) {
// Compute the number of grids. We use ceildiv here as we have to allocate
// an extra thread/block if the division is not even. The kernel contains
// code to handle the boundaries.
uint64 number_of_threads = Eigen::divup(shape[i], unrolling_factors[i]);
int number_of_grids = Eigen::divup(number_of_threads, tile_sizes[i]);
block_dims[i] = number_of_grids;
}
result.blockDim = se::BlockDim(block_dims[0], block_dims[1], block_dims[2]);
return result;
}
} // namespace
void MlirGeneratedUnaryOp::Compute(OpKernelContext* ctx) {
auto* stream = ctx->op_device_context()->stream();
se::KernelBase* kernel;
{
absl::MutexLock l(&mu_);
if (!kernel_) {
OP_REQUIRES_OK(ctx, CreateKernel(name_, 10, "", cubin_data_,
stream->parent(), kernel_));
}
kernel = kernel_.get();
}
const Tensor& inp = ctx->input(0);
Tensor* out = nullptr;
OP_REQUIRES_OK(
ctx, ctx->forward_input_or_allocate_output({0}, 0, inp.shape(), &out));
if (inp.NumElements() == 0) {
return;
}
se::KernelArgsArray<10> args;
args.add_device_memory_argument(
stream_executor::DeviceMemoryBase(inp.data(), inp.TotalBytes()));
args.add_device_memory_argument(
stream_executor::DeviceMemoryBase(inp.data(), inp.TotalBytes()));
args.add_argument<int64_t>(0);
args.add_argument<int64_t>(inp.NumElements());
args.add_argument<int64_t>(1);
args.add_device_memory_argument(
stream_executor::DeviceMemoryBase(out->data(), out->TotalBytes()));
args.add_device_memory_argument(
stream_executor::DeviceMemoryBase(out->data(), out->TotalBytes()));
args.add_argument<int64_t>(0);
args.add_argument<int64_t>(inp.NumElements());
args.add_argument<int64_t>(1);
// This has to be aligned with the configuration that was used when building
// the kernels. See the corresponding build rules in the `BUILD` file.
LaunchConfig config = GetLaunchConfiguration(
{256}, {4}, {static_cast<uint64>(inp.NumElements())});
OP_REQUIRES_OK(ctx, stream->parent()->Launch(stream, config.threadDim,
config.blockDim, *kernel, args));
}
} // namespace tensorflow
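
For context, a hedged worked example of the launch configuration that the deleted GetLaunchConfiguration helper above produced (illustrative values, not taken from this commit):

  // tile_sizes = {256}, unrolling_factors = {4}, shape = {1000000}
  // threadDim = (256, 1, 1)
  // number_of_threads = divup(1000000, 4) = 250000
  // number_of_grids   = divup(250000, 256) = 977
  // blockDim = (977, 1, 1)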


@ -1,76 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_
#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_
#include <memory>
#include <string>
#include "absl/strings/ascii.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/stream_executor.h"
namespace tensorflow {
class MlirGeneratedUnaryOp : public OpKernel {
public:
MlirGeneratedUnaryOp(OpKernelConstruction* ctx, std::string name,
absl::Span<const uint8_t> cubin_data)
: OpKernel(ctx), name_(name), cubin_data_(cubin_data) {}
void Compute(OpKernelContext* ctx) override;
private:
std::string name_;
absl::Span<const uint8_t> cubin_data_;
std::unique_ptr<se::KernelBase> kernel_;
absl::Mutex mu_;
};
#define GENERATE_OP_KERNEL_BASE(kernel_name) \
class MlirGenerated##kernel_name##Op : public MlirGeneratedUnaryOp { \
public: \
MlirGenerated##kernel_name##Op(OpKernelConstruction* ctx, \
absl::Span<const uint8_t> cubin_data) \
: MlirGeneratedUnaryOp(ctx, #kernel_name "_kernel", cubin_data) {} \
};
#define GENERATE_OP_KERNEL_FOR(kernel_name, data_type) \
class MlirGenerated##kernel_name##data_type##Op \
: public MlirGenerated##kernel_name##Op { \
public: \
explicit MlirGenerated##kernel_name##data_type##Op( \
OpKernelConstruction* ctx) \
: MlirGenerated##kernel_name \
##Op(ctx, k##kernel_name##data_type##Kernel) {} \
};
#define GENERATE_AND_REGISTER_UNARY_KERNEL(kernel_name, data_type, \
native_data_type) \
namespace { \
GENERATE_OP_KERNEL_FOR(kernel_name, data_type) \
} \
REGISTER_KERNEL_BUILDER(Name(#kernel_name) \
.Device(DEVICE_GPU) \
.TypeConstraint<native_data_type>("T"), \
MlirGenerated##kernel_name##data_type##Op);
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_


@ -1,36 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string>
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
#include "tensorflow/core/kernels/mlir_generated/tanh_f16_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/tanh_f32_kernel.h"
#include "tensorflow/core/kernels/mlir_generated/tanh_f64_kernel.h"
namespace tensorflow {
namespace {
GENERATE_OP_KERNEL_BASE(Tanh);
} // namespace
GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F16, Eigen::half)
GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F32, float)
GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F64, double)
} // namespace tensorflow