Enable denormal-fp-math-f32 mode for f32 kernels.
This is necessary to be compatible with TensorFlow, which uses the flag -fcuda-flush-denormals-to-zero when compiling CUDA code with clang.
PiperOrigin-RevId: 347999405
Change-Id: If303ef6d922ae239fe6ba4e0581be9f12c2edeb2
This commit is contained in:
parent
84de9ebadc
commit
800a3b1fb4
@ -245,7 +245,8 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module) {
|
||||
Status GenerateDeviceCode(mlir::ModuleOp module,
|
||||
llvm::StringRef gpu_binary_attr_name,
|
||||
llvm::ArrayRef<std::string> architectures,
|
||||
bool generate_fatbin, bool print_ptx) {
|
||||
bool generate_fatbin, bool print_ptx,
|
||||
bool enable_ftz) {
|
||||
mlir::PassManager pm(module.getContext());
|
||||
applyTensorflowAndCLOptions(pm);
|
||||
|
||||
@ -253,7 +254,8 @@ Status GenerateDeviceCode(mlir::ModuleOp module,
|
||||
// Remove debug information to ensure we do not create debug PTX.
|
||||
kernel_pm.addPass(mlir::createStripDebugInfoPass());
|
||||
kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
|
||||
gpu_binary_attr_name, architectures, generate_fatbin, print_ptx));
|
||||
gpu_binary_attr_name, architectures, generate_fatbin, print_ptx,
|
||||
enable_ftz));
|
||||
|
||||
return failed(pm.run(module))
|
||||
? InternalError("Generating device code failed.")
|
||||
@ -281,7 +283,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
|
||||
llvm::ArrayRef<std::string> architectures,
|
||||
llvm::ArrayRef<uint32_t> tile_sizes,
|
||||
llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
|
||||
bool generate_fatbin, bool print_ptx) {
|
||||
bool generate_fatbin, bool print_ptx, bool enable_ftz) {
|
||||
auto& registry = context.getDialectRegistry();
|
||||
mlir::RegisterAllTensorFlowDialects(registry);
|
||||
registry.insert<mlir::chlo::HloClientDialect, mlir::mhlo::MhloDialect>();
|
||||
@ -302,7 +304,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
|
||||
TF_RETURN_IF_ERROR(AmendKernelLLVMIRWithStaticKnowledge(module.get()));
|
||||
TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
|
||||
architectures, generate_fatbin,
|
||||
print_ptx));
|
||||
print_ptx, enable_ftz));
|
||||
TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
|
||||
return module;
|
||||
}
|
||||
|
@ -40,7 +40,7 @@ xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
|
||||
llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
|
||||
llvm::ArrayRef<uint32_t> unroll_factors = {},
|
||||
bool embed_memref_prints = false, bool generate_fatbin = true,
|
||||
bool print_ptx = false);
|
||||
bool print_ptx = false, bool enable_ftz = false);
|
||||
|
||||
// Extracts gpu_binary from the converted module.
|
||||
xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);
|
||||
|
@ -106,7 +106,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
|
||||
llvm::ArrayRef<std::string> architectures,
|
||||
llvm::ArrayRef<uint32_t> tile_sizes,
|
||||
llvm::ArrayRef<uint32_t> unroll_factors,
|
||||
bool embed_memref_prints, bool print_ptx) {
|
||||
bool embed_memref_prints, bool print_ptx, bool enable_ftz) {
|
||||
// Read TF code.
|
||||
std::string tf_code;
|
||||
TF_RETURN_IF_ERROR(
|
||||
@ -117,8 +117,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
|
||||
mlir::OwningModuleRef module,
|
||||
GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
|
||||
unroll_factors, embed_memref_prints,
|
||||
/*generate_fatbin=*/true,
|
||||
/*print_ptx=*/print_ptx));
|
||||
/*generate_fatbin=*/true, print_ptx, enable_ftz));
|
||||
// Get binary.
|
||||
TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
|
||||
|
||||
@ -147,6 +146,11 @@ int main(int argc, char** argv) {
|
||||
"print-ptx",
|
||||
llvm::cl::desc("Print generated PTX code per target architecture."),
|
||||
llvm::cl::init(false));
|
||||
llvm::cl::opt<bool> enable_ftz(
|
||||
"enable_ftz",
|
||||
llvm::cl::desc(
|
||||
"Enable the denormal flush to zero mode when generating code."),
|
||||
llvm::cl::init(false));
|
||||
llvm::cl::list<std::string> architectures(
|
||||
"arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"),
|
||||
llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
|
||||
@ -166,7 +170,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
auto status = tensorflow::kernel_gen::Run(
|
||||
input_file, output_file, architectures, tile_sizes, unroll_factors,
|
||||
embed_memref_prints, print_ptx);
|
||||
embed_memref_prints, print_ptx, enable_ftz);
|
||||
if (!status.ok()) {
|
||||
LOG(ERROR) << status;
|
||||
return 1;
|
||||
|
@ -53,13 +53,14 @@ class GpuKernelToBlobPass
|
||||
public:
|
||||
GpuKernelToBlobPass(mlir::StringRef blob_annotation,
|
||||
llvm::ArrayRef<std::string> architectures,
|
||||
bool generate_fatbin, bool print_ptx) {
|
||||
bool generate_fatbin, bool print_ptx, bool enable_ftz) {
|
||||
if (!blob_annotation.empty()) {
|
||||
blob_annotation_ = blob_annotation.str();
|
||||
}
|
||||
architectures_ = architectures;
|
||||
generate_fatbin_ = generate_fatbin;
|
||||
print_ptx_ = print_ptx;
|
||||
enable_ftz_ = enable_ftz;
|
||||
}
|
||||
|
||||
void runOnOperation() override {
|
||||
@ -99,7 +100,9 @@ class GpuKernelToBlobPass
|
||||
llvmModule->setModuleIdentifier("acme");
|
||||
|
||||
xla::HloModuleConfig config;
|
||||
config.set_debug_options(xla::GetDebugOptionsFromFlags());
|
||||
xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
|
||||
options.set_xla_gpu_ftz(enable_ftz_);
|
||||
config.set_debug_options(options);
|
||||
|
||||
using AmdGpuHsaco = std::vector<tensorflow::uint8>;
|
||||
std::vector<tensorflow::se::HsacoImage> images;
|
||||
@ -148,7 +151,9 @@ class GpuKernelToBlobPass
|
||||
llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);
|
||||
|
||||
xla::HloModuleConfig config;
|
||||
config.set_debug_options(xla::GetDebugOptionsFromFlags());
|
||||
xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
|
||||
options.set_xla_gpu_ftz(enable_ftz_);
|
||||
config.set_debug_options(options);
|
||||
|
||||
auto enable_fusion = [](llvm::TargetMachine* target) {
|
||||
target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
|
||||
@ -241,15 +246,16 @@ class GpuKernelToBlobPass
|
||||
return InternalError(
|
||||
"Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
|
||||
}
|
||||
bool enable_ftz_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
|
||||
mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
|
||||
bool generate_fatbin, bool print_ptx) {
|
||||
return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
|
||||
generate_fatbin, print_ptx);
|
||||
bool generate_fatbin, bool print_ptx, bool enable_ftz) {
|
||||
return std::make_unique<GpuKernelToBlobPass>(
|
||||
blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
|
||||
}
|
||||
|
||||
} // namespace transforms
|
||||
|
@ -69,7 +69,7 @@ std::unique_ptr<FunctionPass> CreateParallelLoopsToSequential();
|
||||
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
|
||||
mlir::StringRef blob_annotation = {},
|
||||
ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
|
||||
bool print_ptx = false);
|
||||
bool print_ptx = false, bool enable_ftz = false);
|
||||
|
||||
// Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
|
||||
std::unique_ptr<FunctionPass> CreatePropagateTfAbiKnowledgeToKernels();
|
||||
|
@ -111,6 +111,7 @@ def _gen_kernel_fatbin_impl(ctx):
|
||||
"--arch=%s" % arch_flag,
|
||||
"--input=%s" % ctx.file.mlir_op.path,
|
||||
"--output=%s" % gpu_bin.path,
|
||||
"--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
|
||||
],
|
||||
mnemonic = "compile",
|
||||
)
|
||||
@ -131,6 +132,7 @@ def _gen_kernel_fatbin_impl(ctx):
|
||||
_gen_kernel_fatbin_rule = rule(
|
||||
attrs = {
|
||||
"mlir_op": attr.label(mandatory = True, allow_single_file = True),
|
||||
"data_type": attr.string(mandatory = True),
|
||||
"tile_size": attr.string(mandatory = True),
|
||||
"unroll_factors": attr.string(),
|
||||
"gpu_archs": attr.string_list(mandatory = True),
|
||||
@ -174,6 +176,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None,
|
||||
_gen_kernel_fatbin_rule(
|
||||
name = "{name}_{type}_kernel_generator".format(name = name, type = type),
|
||||
mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
|
||||
data_type = type,
|
||||
gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
|
||||
tile_size = tile_size,
|
||||
unroll_factors = unroll_factors,
|
||||
|
Loading…
Reference in New Issue
Block a user