[KERNEL_GEN] Remove any trace of ranked MLIR kernels.
PiperOrigin-RevId: 347379437
Change-Id: Ia840f07c6e01d3cbe19059fb02dc676fd0b67136

parent 9618f238aa
commit fbcb024d7f
tensorflow/compiler/mlir/tools/kernel_gen/BUILD

@@ -86,29 +86,6 @@ cc_library(
     ],
 )
 
-tf_cc_binary(
-    name = "tf_to_gpu_binary",
-    srcs = [
-        "crash_handler.h",
-        "tf_to_gpu_binary.cc",
-    ],
-    visibility = [
-        "//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary:__pkg__",
-        "//tensorflow/core/kernels/mlir_generated:__pkg__",
-    ],
-    deps = [
-        ":kernel_creator",
-        "//tensorflow/compiler/mlir:init_mlir",
-        "//tensorflow/compiler/mlir/tensorflow",
-        "//tensorflow/core:lib",
-        "//tensorflow/core/platform",
-        "//tensorflow/stream_executor/lib",
-        "@com_google_absl//absl/strings",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:Pass",
-    ],
-)
-
 tf_cc_binary(
     name = "tf_to_kernel",
     srcs = ["tf_to_kernel.cc"],
tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc

@@ -90,27 +90,17 @@ struct RemoveUnusedTensorToMemrefOperations
 };
 }  // end anonymous namespace
 
-Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
-                    llvm::ArrayRef<uint32_t> tile_sizes,
+Status LowerTFtoGPU(mlir::ModuleOp module, llvm::ArrayRef<uint32_t> tile_sizes,
                     llvm::ArrayRef<uint32_t> unroll_factors,
                     bool embed_memref_prints) {
   mlir::PassManager pm(module.getContext());
   applyTensorflowAndCLOptions(pm);
 
-  if (gpu_binary_only) {
-    pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
-        /*allow_partial_conversion=*/false, /*legalize_chlo=*/true));
-    pm.addNestedPass<mlir::FuncOp>(
-        mlir::kernel_gen::transforms::CreateMaterializeBroadcastsPass());
-    pm.addNestedPass<mlir::FuncOp>(
-        mlir::kernel_gen::transforms::CreateUnfuseBatchNormPass());
-  } else {
-    pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
-        /*allow_partial_conversion=*/false, /*legalize_chlo=*/false));
-    pm.addNestedPass<mlir::FuncOp>(mlir::createTransformUnrankedHloPass());
-    pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createChloLegalizeToHloPass());
-    pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
-  }
+  pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
+      /*allow_partial_conversion=*/false, /*legalize_chlo=*/false));
+  pm.addNestedPass<mlir::FuncOp>(mlir::createTransformUnrankedHloPass());
+  pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createChloLegalizeToHloPass());
+  pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
 
   // Transform HLO operations to LinAlg.
   pm.addNestedPass<mlir::FuncOp>(::mlir::mhlo::createLegalizeHloToLinalgPass());
@@ -139,12 +129,10 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
   pm.addPass(mlir::kernel_gen::transforms::CreateHloBufferizePass());
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
-  if (!gpu_binary_only) {
-    // Find candidates for buffer reuse. This is only successful if buffer size
-    // equality can be determined based on `linalg.generic` operations.
-    pm.addNestedPass<mlir::FuncOp>(
-        mlir::kernel_gen::transforms::CreateBufferReusePass());
-  }
+  // Find candidates for buffer reuse. This is only successful if buffer size
+  // equality can be determined based on `linalg.generic` operations.
+  pm.addNestedPass<mlir::FuncOp>(
+      mlir::kernel_gen::transforms::CreateBufferReusePass());
   pm.addNestedPass<mlir::FuncOp>(
       mlir::createLinalgTilingToParallelLoopsPass((tiling_for_unrolling)));
   // Transform the Linalg ops inside of the loop nest into parallel loops.
@@ -188,15 +176,13 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
       std::make_unique<RemoveUnusedTensorToMemrefOperations>());
   pm.addPass(mlir::createCanonicalizerPass());
   pm.addNestedPass<mlir::FuncOp>(mlir::createCSEPass());
-  if (!gpu_binary_only) {
-    // Before inserting more allocs, map the ones we already have to the
-    // tf runtime. That ensures that all allocations for the actual computation
-    // end up on the device, whereas allocations for shape computation and host
-    // side things remain on the host.
-    // Longer term, this should be handled by proper device placement.
-    pm.addPass(mlir::kernel_gen::tf_framework::
-                   CreateEmbedTFFrameworkFunctionAndAllocPass());
-  }
+  // Before inserting more allocs, map the ones we already have to the
+  // tf runtime. That ensures that all allocations for the actual computation
+  // end up on the device, whereas allocations for shape computation and host
+  // side things remain on the host.
+  // Longer term, this should be handled by proper device placement.
+  pm.addPass(mlir::kernel_gen::tf_framework::
+                 CreateEmbedTFFrameworkFunctionAndAllocPass());
   pm.addPass(mlir::kernel_gen::transforms::CreateFinalBufferizePass());
   pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass(64));
   // TODO(herhut): Depends on https://bugs.llvm.org/show_bug.cgi?id=48385.
@@ -223,11 +209,6 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
   // Take launches to launches with kernels.
   pm.addPass(::mlir::createGpuKernelOutliningPass());
 
-  if (gpu_binary_only) {
-    // Make kernel signature deterministic so that we can call it externally.
-    pm.addNestedPass<::mlir::FuncOp>(
-        xla::mlir_gpu::createRewriteKernelSignaturePass());
-  }
   pm.addPass(::mlir::createLowerAffinePass());
   // Constraints are removed as late as possible and before lowering to CFG.
   pm.addNestedPass<::mlir::FuncOp>(::mlir::createConvertShapeConstraintsPass());
@@ -295,7 +276,7 @@ Status LowerHostSideToFinalForm(mlir::ModuleOp module) {
 }  // namespace
 
 StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
-    mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
+    mlir::MLIRContext& context, llvm::StringRef tf_code,
     llvm::ArrayRef<std::string> architectures,
     llvm::ArrayRef<uint32_t> tile_sizes,
     llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
@@ -304,8 +285,8 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   mlir::RegisterAllTensorFlowDialects(registry);
   registry.insert<mlir::chlo::HloClientDialect, mlir::mhlo::MhloDialect>();
   mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context);
-  TF_RETURN_IF_ERROR(LowerTFtoGPU(module.get(), gpu_binary_only, tile_sizes,
-                                  unroll_factors, embed_memref_prints));
+  TF_RETURN_IF_ERROR(LowerTFtoGPU(module.get(), tile_sizes, unroll_factors,
+                                  embed_memref_prints));
 #if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA)
   return InternalError(
       "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
@@ -321,9 +302,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
                                         architectures, generate_fatbin,
                                         print_ptx));
-  if (!gpu_binary_only) {
-    TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
-  }
+  TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
   return module;
 }
 
tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h

@@ -33,11 +33,9 @@ limitations under the License.
 namespace tensorflow {
 namespace kernel_gen {
 
-// Converts TF code to LLVM/NVVM. If `gpu_binary_only` is true, then the
-// conversion stops after gpu_binary blob is generated. If `gpu_binary_only` is
-// false, lowers the host side to LLVM Dialect.
+// Converts TF code to LLVM/NVVM. Lowers the host side to LLVM Dialect.
 xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
-    mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
+    mlir::MLIRContext& context, llvm::StringRef tf_code,
     llvm::ArrayRef<std::string> architectures = {"sm_75"},
     llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
     llvm::ArrayRef<uint32_t> unroll_factors = {},
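For reference, a minimal sketch of calling the updated entry point. The argument order after `unroll_factors` mirrors the tf_to_kernel.cc call site further below; the concrete flag values here are assumptions for illustration only:

    // Sketch only; mirrors the updated tf_to_kernel.cc call site below.
    // Values for the boolean flags are illustrative assumptions.
    mlir::MLIRContext context;
    TF_ASSIGN_OR_RETURN(
        mlir::OwningModuleRef module,
        GenerateKernelForTfCode(context, tf_code, /*architectures=*/{"sm_75"},
                                /*tile_sizes=*/{16, 64}, /*unroll_factors=*/{},
                                /*embed_memref_prints=*/false,
                                /*generate_fatbin=*/true,
                                /*print_ptx=*/false));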
tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/BUILD (deleted)

@@ -1,17 +0,0 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
-
-package(licenses = ["notice"])
-
-glob_lit_tests(
-    data = [
-        "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary",
-        "@llvm-project//mlir:run_lit.sh",
-    ],
-    default_tags = [
-        # We need access to the CUDA SDK.
-        "gpu",
-        "no_rocm",
-    ],
-    driver = "//tensorflow/compiler/mlir:run_lit.sh",
-    test_file_exts = ["mlir"],
-)
tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/abs.mlir (deleted)

@@ -1,6 +0,0 @@
-// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
-func @abs(%arg0: tensor<?xf16>) -> tensor<?xf16> attributes {tf_entry} {
-  %0 = "tf.Abs"(%arg0) { }
-    : (tensor<?xf16>) -> tensor<?xf16>
-  return %0 : tensor<?xf16>
-}
tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/ceil.mlir (deleted)

@@ -1,6 +0,0 @@
-// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
-func @ceil(%arg0: tensor<?xf64>) -> tensor<?xf64> attributes {tf_entry} {
-  %0 = "tf.Ceil"(%arg0) { }
-    : (tensor<?xf64>) -> tensor<?xf64>
-  return %0 : tensor<?xf64>
-}
tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir (deleted)

@@ -1,5 +0,0 @@
-// RUN: tf_to_gpu_binary --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70
-func @tanh(%arg0: tensor<?xf32>) -> tensor<?xf32> attributes {tf_entry} {
-  %0 = "tf.Tanh"(%arg0) : (tensor<?xf32>) -> tensor<?xf32>
-  return %0 : tensor<?xf32>
-}
tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc (deleted)

@@ -1,96 +0,0 @@
-// Copyright 2020 The TensorFlow Runtime Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//===- tf_to_gpu_binary.cc --------------------------------------*- C++ -*-===//
-//
-// This file implements the entry point to compile a tf op to a gpu binary
-//
-//===----------------------------------------------------------------------===//
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "absl/strings/string_view.h"
-#include "llvm/Support/CommandLine.h"
-#include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/init_mlir.h"
-#include "tensorflow/compiler/mlir/tools/kernel_gen/crash_handler.h"
-#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-
-namespace tensorflow {
-namespace kernel_gen {
-namespace {
-
-xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
-                std::string architecture, llvm::ArrayRef<uint32_t> tile_sizes,
-                llvm::ArrayRef<uint32_t> unroll_factors) {
-  // Read TF code.
-  std::string tf_code;
-  TF_RETURN_IF_ERROR(
-      ReadFileToString(Env::Default(), input_file.str(), &tf_code));
-  // Compile.
-  mlir::MLIRContext context;
-  TF_ASSIGN_OR_RETURN(
-      mlir::OwningModuleRef module,
-      GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true,
-                              architecture, tile_sizes, unroll_factors,
-                              /*embed_memref_prints=*/false,
-                              /*generate_fatbin=*/false));
-  // Extract gpu_binary.
-  TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module));
-
-  // Write gpu_binary blob.
-  TF_RETURN_IF_ERROR(
-      WriteStringToFile(Env::Default(), output_file.str(), gpu_binary));
-  return xla::Status::OK();
-}
-
-}  // namespace
-}  // namespace kernel_gen
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  tensorflow::kernel_gen::SetCrashReportMessage();
-  llvm::cl::opt<std::string> input_file("input", llvm::cl::desc("input file"),
-                                        llvm::cl::value_desc("filename"),
-                                        llvm::cl::init("foo.mlir"));
-  llvm::cl::opt<std::string> output_file(
-      "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"),
-      llvm::cl::init("foo.bin"));
-  llvm::cl::opt<std::string> architecture(
-      "arch", llvm::cl::desc("target architecture (e.g. sm_50)"),
-      llvm::cl::init("sm_50"));
-  llvm::cl::list<uint32_t> tile_sizes(
-      "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore,
-      llvm::cl::CommaSeparated);
-  llvm::cl::list<uint32_t> unroll_factors(
-      "unroll_factors",
-      llvm::cl::desc("factors to unroll by, separated by commas"),
-      llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated);
-
-  tensorflow::InitMlir y(&argc, &argv);
-  mlir::registerPassManagerCLOptions();
-  llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n");
-
-  auto status = tensorflow::kernel_gen::Run(
-      input_file, output_file, architecture, tile_sizes, unroll_factors);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return 1;
-  }
-  return 0;
-}
tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc

@@ -115,9 +115,9 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
   mlir::MLIRContext context;
   TF_ASSIGN_OR_RETURN(
       mlir::OwningModuleRef module,
-      GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false,
-                              architectures, tile_sizes, unroll_factors,
-                              embed_memref_prints, /*generate_fatbin=*/true,
+      GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
+                              unroll_factors, embed_memref_prints,
+                              /*generate_fatbin=*/true,
                               /*print_ptx=*/print_ptx));
   // Get binary.
   TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD

@@ -77,20 +77,16 @@ cc_library(
         "embed_memref_prints.cc",
         "embed_tf_framework_pass.cc",
         "gpu_kernel_to_blob_pass.cc",
-        "materialize_broadcasts_pass.cc",
         "parallel_loops_to_sequential.cc",
         "same_shape_propagation.cc",
         "shape_to_descriptors_pass.cc",
         "tensorflow_abi_knowledge_propagation.cc",
         "tf_kernel_to_llvm_pass.cc",
-        "unfuse_batch_norm_pass.cc",
     ],
     hdrs = ["passes.h"],
     copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]),
     deps = [
         "@llvm-project//mlir:Affine",
-        "//tensorflow/compiler/mlir/hlo:materialize_broadcasts",  # buildcleaner: keep
-        "//tensorflow/compiler/mlir/hlo:unfuse_batch_norm",  # buildcleaner: keep
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc (deleted)

@@ -1,61 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
-#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
-#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
-
-namespace mlir {
-namespace kernel_gen {
-namespace transforms {
-namespace {
-
-#define GEN_PASS_CLASSES
-#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
-
-struct MaterializeBroadcastsPass
-    : public MaterializeBroadcastsPassBase<MaterializeBroadcastsPass> {
-  void runOnFunction() override {
-    mlir::ConversionTarget conversionTarget(getContext());
-    mlir::OwningRewritePatternList conversionPatterns;
-
-    // Consider the mhlo dialect legal for tests.
-    conversionTarget.addLegalDialect<mlir::mhlo::MhloDialect>();
-    // The conversion uses helpers from the Standard dialect.
-    conversionTarget.addLegalDialect<mlir::StandardOpsDialect>();
-
-    mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(),
-                                                   &conversionTarget);
-    mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(),
-                                                      &conversionPatterns);
-
-    if (failed(applyPartialConversion(getFunction(), conversionTarget,
-                                      std::move(conversionPatterns)))) {
-      return signalPassFailure();
-    }
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<mlir::FunctionPass> CreateMaterializeBroadcastsPass() {
-  return std::make_unique<MaterializeBroadcastsPass>();
-}
-
-}  // namespace transforms
-}  // namespace kernel_gen
-}  // namespace mlir
tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h

@@ -62,9 +62,6 @@ std::unique_ptr<OperationPass<ModuleOp>> CreateHloBufferizePass();
 // buffers.
 std::unique_ptr<OperationPass<ModuleOp>> CreateFinalBufferizePass();
 
-// Pass to materialize broadcasts.
-std::unique_ptr<FunctionPass> CreateMaterializeBroadcastsPass();
-
 // Pass to convert scf::ParallelOp to scf::ForOp.
 std::unique_ptr<FunctionPass> CreateParallelLoopsToSequential();
 
@@ -74,9 +71,6 @@ std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
     bool print_ptx = false);
 
-// Pass to unfuse batch norm.
-std::unique_ptr<FunctionPass> CreateUnfuseBatchNormPass();
-
 // Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
 std::unique_ptr<FunctionPass> CreatePropagateTfAbiKnowledgeToKernels();
 
tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td

@@ -61,16 +61,6 @@ def FinalBufferizePass : Pass<"final-bufferize", "ModuleOp"> {
   let constructor = "transforms::CreateFinalBufferizePass()";
 }
 
-def MaterializeBroadcastsPass : FunctionPass<"materialize-broadcast"> {
-  let summary = "Pass to materialize broadcasts";
-  let constructor = "transforms::CreateMaterializeBroadcastsPass()";
-}
-
-def UnfuseBatchNormPass : FunctionPass<"unfuse-batch-norm"> {
-  let summary = "Pass to unfuse batch norm";
-  let constructor = "transforms::CreateUnfuseBatchNormPass()";
-}
-
 def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> {
   let summary = "Pass to annotate GPU Module with its PTX";
   let options = [
tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc (deleted)

@@ -1,45 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
-#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
-
-namespace mlir {
-namespace kernel_gen {
-namespace transforms {
-namespace {
-
-#define GEN_PASS_CLASSES
-#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
-
-struct UnfuseBatchNormPass
-    : public UnfuseBatchNormPassBase<UnfuseBatchNormPass> {
-  void runOnFunction() override {
-    mlir::OwningRewritePatternList patterns;
-    mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns);
-    mlir::applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<mlir::FunctionPass> CreateUnfuseBatchNormPass() {
-  return std::make_unique<UnfuseBatchNormPass>();
-}
-
-}  // namespace transforms
-}  // namespace kernel_gen
-}  // namespace mlir
tensorflow/core/kernels/mlir_generated/BUILD

@@ -261,7 +261,6 @@ tf_cuda_cc_test(
 
 gen_kernel_library(
     name = "abs",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "f16",
@@ -275,7 +274,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "conj",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "f32",
@@ -286,7 +284,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "imag",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "f32",
@@ -296,7 +293,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "invert",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "i8",
@@ -309,8 +305,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "is_inf",
-    generate_ranked = False,
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "f16",
@@ -322,14 +316,12 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "logical_not",
-    generate_unranked = True,
     tile_size = "256",
     types = ["i1"],
 )
 
 gen_kernel_library(
     name = "real",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         "f32",
@@ -339,7 +331,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "sign",
-    generate_unranked = True,
     tile_size = "256",
     types = [
         # TODO(b/162577610): Add bf16, c64 and c128.
@@ -354,8 +345,6 @@ gen_kernel_library(
 
 gen_kernel_library(
     name = "add_v2",
-    generate_ranked = False,
-    generate_unranked = True,
     tile_size = "256,1,1",
     types = [
         "f16",
@@ -371,8 +360,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_ranked = False,
-        generate_unranked = True,
         tile_size = "256,1,1",
         types = [
            "i8",
@@ -401,8 +388,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_ranked = False,
-        generate_unranked = True,
         tile_size = "256,1,1",
         types = [
            "i1",
@@ -419,8 +404,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_ranked = False,
-        generate_unranked = True,
         tile_size = "256,1,1",
         types = [
            "f16",
@@ -444,8 +427,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_ranked = False,
-        generate_unranked = True,
         tile_size = "256,1,1",
         types = [
            "f16",
@@ -470,8 +451,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_ranked = False,
-        generate_unranked = True,
         tile_size = "256,1,1",
         types = [
            "f16",
@@ -494,7 +473,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_unranked = True,
         tile_size = "256",
         types = [
            "f16",
@@ -516,7 +494,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_unranked = True,
         tile_size = "256",
         types = [
            "f16",
@@ -541,7 +518,6 @@ gen_kernel_library(
 [
     gen_kernel_library(
         name = name,
-        generate_unranked = True,
         tile_size = "256",
         types = [
            "f16",
tensorflow/core/kernels/mlir_generated/build_defs.bzl

@@ -31,168 +31,6 @@ GpuBinaryInfo = provider(
     fields = ["gpu_bins"],
 )
 
-def _gen_kernel_gpu_bin_impl(ctx):
-    name = ctx.attr.name
-    tile_sizes = ctx.attr.tile_size.replace("x", ",")
-    cmd_args = []
-    if ctx.attr.unroll_factors:
-        cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
-
-    if ctx.attr.extra_args:
-        cmd_args.extend(ctx.attr.extra_args)
-
-    gpu_bins = []
-    for arch in ctx.attr.gpu_archs:
-        # TODO(b/170283783): 'compute_' should generate both SASS and PTX.
-        arch = arch.replace("compute_", "sm_")
-        filename = "%s.%s.bin" % (name, arch)
-        gpu_bin = ctx.actions.declare_file(filename)
-        ctx.actions.run(
-            inputs = [ctx.file.mlir_op, ctx.file._tfso],
-            outputs = [gpu_bin],
-            executable = ctx.executable._tool,
-            arguments = cmd_args + [
-                "--tile_sizes=%s" % tile_sizes,
-                "--arch=%s" % arch,
-                "--input=%s" % ctx.file.mlir_op.path,
-                "--output=%s" % gpu_bin.path,
-            ],
-            mnemonic = "compile",
-        )
-        gpu_bins.append(gpu_bin)
-    return [GpuBinaryInfo(gpu_bins = gpu_bins)]
-
-_gen_kernel_gpu_bin_rule = rule(
-    attrs = {
-        "mlir_op": attr.label(mandatory = True, allow_single_file = True),
-        "tile_size": attr.string(mandatory = True),
-        "unroll_factors": attr.string(),
-        "gpu_archs": attr.string_list(mandatory = True),
-        "extra_args": attr.string_list(),
-        "_tfso": attr.label(
-            default = Label("//tensorflow:libtensorflow_framework.so.2"),
-            cfg = "host",
-            allow_single_file = True,
-        ),
-        "_tool": attr.label(
-            executable = True,
-            default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary"),
-            cfg = "host",
-        ),
-    },
-    output_to_genfiles = True,
-    implementation = _gen_kernel_gpu_bin_impl,
-)
-
-def _gen_kernel_image_hdr_impl_cuda(ctx):
-    images = []
-    for cubin in ctx.attr.input[GpuBinaryInfo].gpu_bins:
-        arch = cubin.path.split(".")[-2]
-        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
-
-    # Generate fatbin file from all cubins.
-    fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
-    ctx.actions.run(
-        outputs = [fatbin],
-        inputs = ctx.attr.input[GpuBinaryInfo].gpu_bins,
-        executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
-        arguments = [
-            "--64",
-            "--cmdline=--compile-only",
-            "--link",
-            "--compress-all",
-            "--create=%s" % fatbin.path,
-        ] + images,
-        mnemonic = "fatbinary",
-    )
-
-    bin2c = _lookup_file(ctx.attr._gpu_root, "bin/bin2c")
-    ctx.actions.run_shell(
-        outputs = [ctx.outputs.out],
-        inputs = [fatbin],
-        tools = [bin2c],
-        command = "%s --static --const --type=char --name=%s %s 1> %s" %
-                  (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
-        mnemonic = "bin2c",
-    )
-
-def _gen_kernel_image_hdr_impl_rocm(ctx):
-    hsaco_files = []
-    hsaco_targets = []
-
-    # Add a dummy host target triple...clang-offload-bundler requires 1 and only 1 host target triple
-    hsaco_files.append("/dev/null")
-    hsaco_targets.append("host-x86_64-unknown-linux")
-
-    hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins
-    for hsaco in hsacos:
-        gfx_arch = hsaco.path.split(".")[-2]
-        hsaco_files.append(hsaco.path)
-        hsaco_targets.append("hip-amdgcn-amd-amdhsa-%s" % gfx_arch)
-
-    # Generate fatbin file from all hsacos.
-    fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
-    ctx.actions.run(
-        outputs = [fatbin],
-        inputs = hsacos,
-        executable = _lookup_file(ctx.attr._gpu_root, "bin/clang-offload-bundler"),
-        arguments = [
-            "--inputs=%s" % ",".join(hsaco_files),
-            "--targets=%s" % ",".join(hsaco_targets),
-            "--type=o",
-            "--outputs=%s" % fatbin.path,
-        ],
-        mnemonic = "fatbinary",
-    )
-
-    ctx.actions.run_shell(
-        outputs = [ctx.outputs.out],
-        inputs = [fatbin],
-        command = (
-            ("hex=`hexdump -v -e \'/1 \"0x%%02x, \"\' %s` && " +
-             "len=`echo $hex | wc -c` && " +
-             "echo 'static const unsigned char %s['$len' + 1] = {' > %s && " +
-             "echo $hex | cat >> %s && " +
-             "echo '};' >> %s") % (
-                fatbin.path,
-                ctx.attr.symbol,
-                ctx.outputs.out.path,
-                ctx.outputs.out.path,
-                ctx.outputs.out.path,
-            )
-        ),
-    )
-
-_gen_kernel_image_hdr_rule = rule(
-    implementation = _gen_kernel_image_hdr_impl_rocm if rocm_is_configured() else _gen_kernel_image_hdr_impl_cuda,
-    output_to_genfiles = True,
-    attrs = {
-        "input": attr.label(mandatory = True, providers = [GpuBinaryInfo]),
-        "out": attr.output(mandatory = True),
-        "symbol": attr.string(mandatory = True),
-        "_gpu_root": attr.label(
-            default = Label("@local_config_rocm//rocm:rocm_root") if rocm_is_configured() else Label("@local_config_cuda//cuda:cuda_root"),
-        ),
-    },
-)
-
-def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, unroll_factors = None, extra_args = []):
-    """Generates a C header with fatbin data from a Tensorflow op."""
-    _gen_kernel_gpu_bin_rule(
-        name = name + "_cubin",
-        mlir_op = mlir_op,
-        tile_size = tile_size,
-        unroll_factors = unroll_factors,
-        gpu_archs = gpu_archs,
-        extra_args = extra_args,
-    )
-    _gen_kernel_image_hdr_rule(
-        name = name,
-        input = ":" + name + "_cubin",
-        out = "%s.h" % name,
-        symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
-    )
-
 type_to_mlir = {
     "c64": "complex<f32>",
     "c128": "complex<f64>",
@@ -204,18 +42,12 @@ def _gen_mlir_op_impl(ctx):
     if mlir_type in type_to_mlir:
         mlir_type = type_to_mlir[mlir_type]
 
-    # In order to generate a ranked kernel we change *xelem_type to ?xelem_type
-    # and remove element type from the entry function name.
-    convert_to_ranked = ""
-    if ctx.attr.unranked == False:
-        convert_to_ranked = "sed s/*x/?x/g | sed s/_elem_type//g |"
     cmd = ctx.actions.run_shell(
         inputs = [ctx.file.template],
         outputs = [ctx.outputs.out],
         command = (
-            ("cat %s | %s sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s") % (
+            ("cat %s | sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s") % (
                 ctx.file.template.path,
-                convert_to_ranked,
                 ctx.attr.type,
                 mlir_type,
                 ctx.outputs.out.path,
@@ -244,40 +76,6 @@ def _gen_mlir_op(name, type, unranked):
         unranked = unranked,
     )
 
-def gen_ranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
-    """ Generate a library with kernels for a specific tensorflow op.
-
-    Args:
-      name: The name of the tensorflow op.
-      types: The types ("f16", "f32", "f64") for which a kernel should be generated.
-      tile_size: The tiling specification, e.g. "16x16".
-      unroll_factors: The unrolling specification, e.g. "4,4"
-      tags: The tags which should be added to the library.
-      extra_args: Extra arguments to pass to the generator tool.
-    """
-
-    if cuda_gpu_architectures() or rocm_gpu_architectures():
-        for type in types:
-            _gen_mlir_op(
-                name = name,
-                type = type,
-                unranked = False,
-            )
-            _gen_kernel_image_hdr(
-                name = "{name}_{type}_kernel".format(name = name, type = type),
-                mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
-                gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
-                tile_size = tile_size,
-                unroll_factors = unroll_factors,
-                extra_args = extra_args,
-            )
-
-    native.cc_library(
-        name = name + "_kernels",
-        hdrs = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
-        tags = tags,
-    )
-
 ################################################################################
 # Unranked kernels build rules.
 ################################################################################
@@ -410,22 +208,12 @@ def gen_unranked_kernel_library(name, types, tile_size, tags = [], unroll_factor
         tags = tags,
     )
 
-def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = [], generate_ranked = True, generate_unranked = False):
-    if (generate_ranked):
-        gen_ranked_kernel_library(
-            name = name,
-            types = types,
-            tile_size = tile_size,
-            tags = tags,
-            unroll_factors = unroll_factors,
-            extra_args = extra_args,
-        )
-    if (generate_unranked):
-        gen_unranked_kernel_library(
-            name = name + "_unranked",
-            types = types,
-            tile_size = tile_size,
-            tags = tags,
-            unroll_factors = unroll_factors,
-            extra_args = extra_args,
-        )
+def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
+    gen_unranked_kernel_library(
+        name = name + "_unranked",
+        types = types,
+        tile_size = tile_size,
+        tags = tags,
+        unroll_factors = unroll_factors,
+        extra_args = extra_args,
+    )
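After this change, every `gen_kernel_library` call produces only unranked kernels, so the `generate_ranked`/`generate_unranked` flags disappear from call sites as well. A minimal sketch of the simplified usage, mirroring the `abs` target in the mlir_generated BUILD hunks above (the `unroll_factors` value and the full type list are assumptions for illustration):

    gen_kernel_library(
        name = "abs",
        tile_size = "256",
        types = [
            "f16",
            "f32",
        ],
        unroll_factors = "4",
    )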
tensorflow/core/kernels/mlir_generated (deleted Abs kernel registration file)

@@ -1,40 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <string>
-#include <vector>
-
-#include "absl/types/span.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/mlir_generated/abs_f16_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/abs_f32_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/abs_f64_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/abs_i32_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/abs_i64_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
-
-namespace tensorflow {
-namespace {
-GENERATE_OP_KERNEL_BASE(Abs);
-}  // namespace
-
-GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F16, Eigen::half);
-GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F32, float);
-GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, F64, double);
-GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, I32, int32);
-GENERATE_AND_REGISTER_UNARY_KERNEL(Abs, I64, int64);
-}  // namespace tensorflow
tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.cc (deleted)

@@ -1,129 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "absl/strings/string_view.h"
-#include "absl/synchronization/mutex.h"
-#include "absl/types/span.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
-
-namespace tensorflow {
-namespace {
-Status CreateKernel(absl::string_view kernel_name, uint64_t num_args,
-                    absl::string_view ptx, absl::Span<const uint8_t> cubin_data,
-                    se::StreamExecutor* stream_exec,
-                    std::unique_ptr<se::KernelBase>& kernel_base) {
-  se::MultiKernelLoaderSpec loader_spec(num_args);
-
-  if (!cubin_data.empty()) {
-    loader_spec.AddCudaCubinInMemory(
-        reinterpret_cast<const char*>(cubin_data.data()), kernel_name);
-  }
-
-  kernel_base.reset(new se::KernelBase(stream_exec));
-  return stream_exec->GetKernel(loader_spec, kernel_base.get());
-}
-
-struct LaunchConfig {
-  se::BlockDim blockDim;
-  se::ThreadDim threadDim;
-};
-
-LaunchConfig GetLaunchConfiguration(std::vector<uint64> tile_sizes,
-                                    std::vector<uint64> unrolling_factors,
-                                    std::vector<uint64> shape) {
-  LaunchConfig result;
-  // Ensure the vectors are length 3 and pad with ones.
-  tile_sizes.resize(3, 1);
-  unrolling_factors.resize(3, 1);
-  shape.resize(3, 1);
-  // The number of threads is given by the tiling size.
-  result.threadDim = se::ThreadDim(tile_sizes[0], tile_sizes[1], tile_sizes[2]);
-  // We know that the kernel was generated by mapping the three outer-most
-  // dimensions to x,y,z dimensions. So we only need to compute those.
-  std::vector<int> block_dims(3);
-  for (int i = 0; i < 3; ++i) {
-    // Compute the number of grids. We use ceildiv here as we have to allocate
-    // an extra thread/block if the division is not even. The kernel contains
-    // code to handle the boundaries.
-    uint64 number_of_threads = Eigen::divup(shape[i], unrolling_factors[i]);
-    int number_of_grids = Eigen::divup(number_of_threads, tile_sizes[i]);
-    block_dims[i] = number_of_grids;
-  }
-  result.blockDim = se::BlockDim(block_dims[0], block_dims[1], block_dims[2]);
-  return result;
-}
-}  // namespace
-
-void MlirGeneratedUnaryOp::Compute(OpKernelContext* ctx) {
-  auto* stream = ctx->op_device_context()->stream();
-  se::KernelBase* kernel;
-  {
-    absl::MutexLock l(&mu_);
-    if (!kernel_) {
-      OP_REQUIRES_OK(ctx, CreateKernel(name_, 10, "", cubin_data_,
-                                       stream->parent(), kernel_));
-    }
-    kernel = kernel_.get();
-  }
-
-  const Tensor& inp = ctx->input(0);
-  Tensor* out = nullptr;
-  OP_REQUIRES_OK(
-      ctx, ctx->forward_input_or_allocate_output({0}, 0, inp.shape(), &out));
-
-  if (inp.NumElements() == 0) {
-    return;
-  }
-
-  se::KernelArgsArray<10> args;
-
-  args.add_device_memory_argument(
-      stream_executor::DeviceMemoryBase(inp.data(), inp.TotalBytes()));
-  args.add_device_memory_argument(
-      stream_executor::DeviceMemoryBase(inp.data(), inp.TotalBytes()));
-  args.add_argument<int64_t>(0);
-  args.add_argument<int64_t>(inp.NumElements());
-  args.add_argument<int64_t>(1);
-
-  args.add_device_memory_argument(
-      stream_executor::DeviceMemoryBase(out->data(), out->TotalBytes()));
-  args.add_device_memory_argument(
-      stream_executor::DeviceMemoryBase(out->data(), out->TotalBytes()));
-  args.add_argument<int64_t>(0);
-  args.add_argument<int64_t>(inp.NumElements());
-  args.add_argument<int64_t>(1);
-
-  // This has to be aligned with the configuration that was used when building
-  // the kernels. See the corresponding build rules in the `BUILD` file.
-  LaunchConfig config = GetLaunchConfiguration(
-      {256}, {4}, {static_cast<uint64>(inp.NumElements())});
-  OP_REQUIRES_OK(ctx, stream->parent()->Launch(stream, config.threadDim,
-                                               config.blockDim, *kernel, args));
-}
-
-}  // namespace tensorflow
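For intuition, the deleted GetLaunchConfiguration helper sized the launch per dimension as threads-per-block = tile_size and blocks = ceildiv(ceildiv(shape, unroll_factor), tile_size). A worked example under the configuration hard-coded above (tile size 256, unroll factor 4; the element count is an assumption for illustration): for 1,000,000 elements, ceildiv(1000000, 4) = 250000 logical threads, and ceildiv(250000, 256) = 977 blocks, so the op launched 977 blocks of 256 threads and relied on the generated kernel's boundary checks to absorb the overshoot.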
tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h (deleted)

@@ -1,76 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_
-#define TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_
-
-#include <memory>
-#include <string>
-
-#include "absl/strings/ascii.h"
-#include "absl/synchronization/mutex.h"
-#include "absl/types/span.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/stream_executor.h"
-
-namespace tensorflow {
-class MlirGeneratedUnaryOp : public OpKernel {
- public:
-  MlirGeneratedUnaryOp(OpKernelConstruction* ctx, std::string name,
-                       absl::Span<const uint8_t> cubin_data)
-      : OpKernel(ctx), name_(name), cubin_data_(cubin_data) {}
-
-  void Compute(OpKernelContext* ctx) override;
-
- private:
-  std::string name_;
-  absl::Span<const uint8_t> cubin_data_;
-  std::unique_ptr<se::KernelBase> kernel_;
-  absl::Mutex mu_;
-};
-
-#define GENERATE_OP_KERNEL_BASE(kernel_name)                               \
-  class MlirGenerated##kernel_name##Op : public MlirGeneratedUnaryOp {     \
-   public:                                                                 \
-    MlirGenerated##kernel_name##Op(OpKernelConstruction* ctx,              \
-                                   absl::Span<const uint8_t> cubin_data)   \
-        : MlirGeneratedUnaryOp(ctx, #kernel_name "_kernel", cubin_data) {} \
-  };
-
-#define GENERATE_OP_KERNEL_FOR(kernel_name, data_type)      \
-  class MlirGenerated##kernel_name##data_type##Op           \
-      : public MlirGenerated##kernel_name##Op {             \
-   public:                                                  \
-    explicit MlirGenerated##kernel_name##data_type##Op(     \
-        OpKernelConstruction* ctx)                          \
-        : MlirGenerated##kernel_name                        \
-          ##Op(ctx, k##kernel_name##data_type##Kernel) {}   \
-  };
-
-#define GENERATE_AND_REGISTER_UNARY_KERNEL(kernel_name, data_type,    \
-                                           native_data_type)          \
-  namespace {                                                         \
-  GENERATE_OP_KERNEL_FOR(kernel_name, data_type)                      \
-  }                                                                   \
-  REGISTER_KERNEL_BUILDER(Name(#kernel_name)                          \
-                              .Device(DEVICE_GPU)                     \
-                              .TypeConstraint<native_data_type>("T"), \
-                          MlirGenerated##kernel_name##data_type##Op);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_MLIR_GENERATED_CWISE_OP_GPU_BASE_H_
tensorflow/core/kernels/mlir_generated (deleted Tanh kernel registration file)

@@ -1,36 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <string>
-#include <vector>
-
-#include "absl/types/span.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/mlir_generated/cwise_op_gpu_base.h"
-#include "tensorflow/core/kernels/mlir_generated/tanh_f16_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/tanh_f32_kernel.h"
-#include "tensorflow/core/kernels/mlir_generated/tanh_f64_kernel.h"
-
-namespace tensorflow {
-namespace {
-GENERATE_OP_KERNEL_BASE(Tanh);
-}  // namespace
-
-GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F16, Eigen::half)
-GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F32, float)
-GENERATE_AND_REGISTER_UNARY_KERNEL(Tanh, F64, double)
-}  // namespace tensorflow