Add lowering pipeline from LHLO to NVVM.

This adds a first, simple lowering pipeline to the mlir_gpu compiler that lowers an MLIR module containing LHLO operations to a module whose kernel functions contain only the LLVM/NVVM dialects.

PiperOrigin-RevId: 266903498
parent 185a465225
commit dc48199572
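
The pipeline is exposed through two entry points, LowerLHLOToGPU and LowerKernelBodiesToNVVM, declared in kernel_lowering.h below. As a rough sketch (not part of this commit; the helper name is invented for illustration), a caller that already has an MLIR module populated with LHLO could drive the two stages back to back:

#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/core/lib/core/errors.h"

namespace xla {
namespace mlir_gpu {

// Hypothetical driver: LHLO -> affine -> gpu.launch_func + outlined kernels,
// then kernel bodies -> LLVM/NVVM dialects, all applied in place to `module`.
Status LowerLHLOModuleToNVVM(mlir::ModuleOp module) {
  TF_RETURN_IF_ERROR(LowerLHLOToGPU(module));
  TF_RETURN_IF_ERROR(LowerKernelBodiesToNVVM(module));
  return Status::OK();
}

}  // namespace mlir_gpu
}  // namespace xla

This mirrors how MlirCompiler::RunBackend invokes the two lowerings in the diff further down.
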
@@ -36,7 +36,10 @@ cc_library(
    hdrs = ["mlir_compiler.h"],
    deps = [
        ":failover_compiler",
        ":kernel_lowering",
        ":lhlo_dialect_emitter",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service:buffer_assignment",
        "//tensorflow/compiler/xla/service:compiler",
        "//tensorflow/compiler/xla/service:dump",
        "//tensorflow/compiler/xla/service:hlo",
@@ -49,6 +52,7 @@ cc_library(
        "//tensorflow/compiler/xla/service/gpu:target_constants",
        "//tensorflow/core:lib",
        "//tensorflow/stream_executor:stream_executor_headers",
        "@local_config_mlir//:GPUDialect",
        "@local_config_mlir//:IR",
        "@local_config_mlir//:LLVMDialect",
    ],
@@ -92,6 +96,29 @@ cc_library(
    ],
)

cc_library(
    name = "kernel_lowering",
    srcs = ["kernel_lowering.cc"],
    hdrs = ["kernel_lowering.h"],
    deps = [
        "//tensorflow/compiler/mlir/xla:hlo",
        "//tensorflow/compiler/xla:status",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service/mlir_gpu/transforms:legalize_to_affine",
        "@com_google_absl//absl/memory",
        "@local_config_mlir//:GPUDialect",
        "@local_config_mlir//:GPUToNVVMTransforms",
        "@local_config_mlir//:GPUTransforms",
        "@local_config_mlir//:IR",
        "@local_config_mlir//:LLVMDialect",
        "@local_config_mlir//:LLVMTransforms",
        "@local_config_mlir//:LoopsToGPUPass",
        "@local_config_mlir//:NVVMDialect",
        "@local_config_mlir//:Pass",
        "@local_config_mlir//:Transforms",
    ],
)

cc_library(
    name = "mlir_irgen_test_base",
    testonly = True,
tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc (new file, 123 lines)
@@ -0,0 +1,123 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h"

#include <memory>

#include "absl/memory/memory.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"  // TF:local_config_mlir
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"  // TF:local_config_mlir
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"  // TF:local_config_mlir
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"  // TF:local_config_mlir
#include "mlir/Dialect/GPU/GPUDialect.h"  // TF:local_config_mlir
#include "mlir/Dialect/GPU/Passes.h"  // TF:local_config_mlir
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // TF:local_config_mlir
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"  // TF:local_config_mlir
#include "mlir/IR/Attributes.h"  // TF:local_config_mlir
#include "mlir/IR/Function.h"  // TF:local_config_mlir
#include "mlir/IR/Module.h"  // TF:local_config_mlir
#include "mlir/IR/OperationSupport.h"  // TF:local_config_mlir
#include "mlir/Pass/Pass.h"  // TF:local_config_mlir
#include "mlir/Pass/PassManager.h"  // TF:local_config_mlir
#include "mlir/Transforms/DialectConversion.h"  // TF:local_config_mlir
#include "mlir/Transforms/Passes.h"  // TF:local_config_mlir
#include "tensorflow/compiler/xla/service/mlir_gpu/transforms/legalize_to_affine.h"
#include "tensorflow/compiler/xla/util.h"

namespace xla {
namespace mlir_gpu {
namespace {

using ::mlir::ConversionTarget;
using ::mlir::FuncOp;
using ::mlir::LLVMTypeConverter;
using ::mlir::ModulePass;
using ::mlir::ModulePassBase;
using ::mlir::OwningRewritePatternList;
using ::mlir::PassManager;
using ::mlir::gpu::GPUDialect;
using ::mlir::LLVM::LLVMDialect;
using ::mlir::NVVM::NVVMDialect;

struct LowerKernelBodiesToNVVMPass
    : public ModulePass<LowerKernelBodiesToNVVMPass> {
 public:
  explicit LowerKernelBodiesToNVVMPass() = default;

  void runOnModule() override {
    auto module = getModule();
    ConversionTarget target(*module.getContext());
    LLVMTypeConverter converter(module.getContext());

    target.addLegalDialect<LLVMDialect>();
    target.addLegalDialect<NVVMDialect>();
    target.addDynamicallyLegalOp<FuncOp>(
        [&](FuncOp op) { return converter.isSignatureLegal(op.getType()); });

    OwningRewritePatternList patterns;
    populateStdToLLVMConversionPatterns(converter, patterns);
    populateGpuToNVVMConversionPatterns(converter, patterns);

    module.walk([this, &target, &patterns, &converter](FuncOp function) {
      if (!GPUDialect::isKernel(function)) {
        return;
      }
      if (failed(applyFullConversion(function, target, patterns, &converter))) {
        signalPassFailure();
      }
    });
  }
};

}  // namespace

Status LowerLHLOToGPU(mlir::ModuleOp module) {
  PassManager pm(module.getContext());

  // Transform element-wise operations to Affine.
  pm.addPass(createLegalizeAffinePass());
  // Transform affine loops to gpu launches.
  // TODO(b/137624192) This pass requires known dimensions. Generalize it.
  pm.addPass(::mlir::createSimpleLoopsToGPUPass(/*numBlockDims=*/0,
                                                /*numThreadDims=*/2));
  // Outline launch bodies into separate kernel functions.
  pm.addPass(::mlir::createGpuKernelOutliningPass());
  // Some basic cleanup.
  pm.addPass(::mlir::createCSEPass());

  if (failed(pm.run(module))) {
    return InternalError("Lowering to GPU kernels failed.");
  }
  return Status::OK();
}

Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) {
  // We cannot verify as the signature of the kernel is rewritten.
  PassManager pm(module.getContext(), /*verifyPasses=*/false);

  // Rewrite kernel functions to LLVM IR.
  pm.addPass(absl::make_unique<LowerKernelBodiesToNVVMPass>());
  // Some basic cleanup.
  pm.addPass(::mlir::createCSEPass());

  if (failed(pm.run(module))) {
    return InternalError("Lowering to NVVM IR failed.");
  }
  return Status::OK();
}

}  // namespace mlir_gpu
}  // namespace xla
tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h (new file, 32 lines)
@@ -0,0 +1,32 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_KERNEL_LOWERING_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_KERNEL_LOWERING_H_

#include "mlir/IR/Module.h"  // TF:local_config_mlir
#include "tensorflow/compiler/xla/status.h"

namespace xla {
namespace mlir_gpu {

Status LowerLHLOToGPU(mlir::ModuleOp module);

Status LowerKernelBodiesToNVVM(mlir::ModuleOp module);

}  // namespace mlir_gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_KERNEL_LOWERING_H_
@@ -17,10 +17,13 @@ limitations under the License.

#include <memory>

#include "mlir/Dialect/GPU/GPUDialect.h"  // TF:local_config_mlir
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // TF:local_config_mlir
#include "mlir/IR/Function.h"  // TF:local_config_mlir
#include "mlir/IR/Location.h"  // TF:local_config_mlir
#include "mlir/IR/MLIRContext.h"  // TF:local_config_mlir
#include "mlir/IR/Module.h"  // TF:local_config_mlir
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/dump.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
@@ -30,9 +33,12 @@
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"

namespace xla {
@@ -152,7 +158,22 @@ StatusOr<std::unique_ptr<Executable>> MlirCompiler::RunBackend(
  TF_RETURN_IF_ERROR(
      lhlo_emitter.EmitComputation(*module->entry_computation()));

  if (module_hook_.callback && !module_hook_.apply_on_lowered) {
  if (module_hook_.callback &&
      module_hook_.stage == IRHook::LoweringStage::LHLO) {
    module_hook_.callback(*mlir_module);
  }

  TF_RETURN_IF_ERROR(LowerLHLOToGPU(*mlir_module));

  if (module_hook_.callback &&
      module_hook_.stage == IRHook::LoweringStage::GPU) {
    module_hook_.callback(*mlir_module);
  }

  TF_RETURN_IF_ERROR(LowerKernelBodiesToNVVM(*mlir_module));

  if (module_hook_.callback &&
      module_hook_.stage == IRHook::LoweringStage::LLVM) {
    module_hook_.callback(*mlir_module);
  }

@@ -194,7 +215,9 @@ void MlirCompiler::SetModuleHook(IRHook module_hook) {
  module_hook_ = module_hook;
}

void MlirCompiler::RemoveModuleHook() { module_hook_ = {nullptr, false}; }
void MlirCompiler::RemoveModuleHook() {
  module_hook_ = {nullptr, IRHook::LoweringStage::LHLO};
}

}  // namespace mlir_gpu
}  // namespace xla
@@ -57,8 +57,10 @@ class MlirCompiler : public Compiler {
  }

  struct IRHook {
    enum class LoweringStage { LHLO, GPU, LLVM };

    std::function<void(mlir::ModuleOp)> callback;
    bool apply_on_lowered;
    LoweringStage stage;
  };

  void SetModuleHook(IRHook module_hook);
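
For reference, a minimal sketch of how a caller might use the new staged hook (illustration only; the free function and its name are invented here, and the include path for mlir_compiler.h is taken from the BUILD target above):

#include "mlir/IR/Module.h"  // TF:local_config_mlir
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"

// Hypothetical helper: dump the MLIR module right after the LHLO->GPU stage.
void DumpGpuStageIr(xla::mlir_gpu::MlirCompiler* compiler) {
  using IRHook = xla::mlir_gpu::MlirCompiler::IRHook;
  compiler->SetModuleHook(
      {[](mlir::ModuleOp module) { module.dump(); },
       IRHook::LoweringStage::GPU});
  // ... compile here, e.g. through the usual RunBackend path ...
  compiler->RemoveModuleHook();
}
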
@@ -31,7 +31,7 @@ namespace mlir_gpu {

void MlirIrGenTestBase::CompileAndVerifyIr(
    std::unique_ptr<HloModule> hlo_module, const string& pattern,
    bool match_lowered_ir) {
    LoweringStage stage) {
  MlirCompiler* compiler = GetMLIRCompiler();
  string ir;
  compiler->SetModuleHook({[&ir](mlir::ModuleOp module) -> Status {
@@ -42,7 +42,7 @@ void MlirIrGenTestBase::CompileAndVerifyIr(
                             ir = buffer_string;
                             return Status::OK();
                           },
                           match_lowered_ir});
                           stage});
  Status status = CompileToExecutable(std::move(hlo_module)).status();
  compiler->RemoveModuleHook();
  TF_ASSERT_OK(status);
@@ -54,12 +54,12 @@ void MlirIrGenTestBase::CompileAndVerifyIr(

void MlirIrGenTestBase::CompileAndVerifyIr(const string& hlo_text,
                                           const string& expected_llvm_ir,
                                           bool match_lowered_ir) {
                                           LoweringStage stage) {
  HloModuleConfig config;
  config.set_debug_options(GetDebugOptionsForTest());
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnUnverifiedModule(hlo_text, config));
  CompileAndVerifyIr(std::move(module), expected_llvm_ir, match_lowered_ir);
  CompileAndVerifyIr(std::move(module), expected_llvm_ir, stage);
}

MlirCompiler* MlirIrGenTestBase::GetMLIRCompiler() {
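
The body of the hook lambda is truncated in the hunk above (only the trailing `ir = buffer_string;` is visible). A common way to capture a module's textual IR into a string for this kind of FileCheck matching is to print it into an llvm::raw_string_ostream; the sketch below is an assumption about that elided part, not the actual code:

#include <string>

#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Module.h"  // TF:local_config_mlir

// Hypothetical helper: serialize an MLIR module to a string.
std::string ModuleToString(mlir::ModuleOp module) {
  std::string buffer_string;
  llvm::raw_string_ostream ostream(buffer_string);
  module.print(ostream);
  ostream.flush();  // Make sure everything is written into buffer_string.
  return buffer_string;
}
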
@@ -27,6 +27,8 @@ namespace mlir_gpu {
// Tests that verify IR emitted by the CPU/GPU backend is as expected.
class MlirIrGenTestBase : public CodegenTestBase {
 protected:
  using LoweringStage = MlirCompiler::IRHook::LoweringStage;

  // Compiles the given HLO module to MLIR IR and verifies the IR matches the
  // given pattern. `pattern` is in the FileCheck pattern matching syntax
  // (http://llvm.org/docs/CommandGuide/FileCheck.html).
@@ -37,13 +39,14 @@ class MlirIrGenTestBase : public CodegenTestBase {
  // steps to LLVM IR are applied; otherwise, the IR before lowering is
  // matched.
  void CompileAndVerifyIr(std::unique_ptr<HloModule> hlo_module,
                          const string& pattern, bool match_lowered_ir = false);
                          const string& pattern,
                          LoweringStage stage = LoweringStage::LHLO);

  // A thin wrapper around CompileAndVerifyIr that parses `hlo_text` to create
  // an HLO module.
  void CompileAndVerifyIr(const string& hlo_text,
                          const string& expected_llvm_ir,
                          bool match_lowered_ir = false);
                          LoweringStage stage = LoweringStage::LHLO);

  // Compiles and returns module with optimizations from a given HLO.
  StatusOr<std::unique_ptr<HloModule>> GetOptimizedModule(
@@ -47,6 +47,49 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
)");
}

TEST_F(LhloGenTest, AddInGPUDialect) {
  CompileAndVerifyIr(R"(
HloModule Add

ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
  %x = f32[2,2]{1,0} parameter(0)
  %y = f32[2,2]{1,0} parameter(1)
  ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
})",
                     R"(
;CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) {
;CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[ARG2]]
;CHECK: }
;CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]]
;CHECK: load %[[ARG0]][[INDEX:.*]]
;CHECK: load %[[ARG1]][[INDEX]]
;CHECK: store %{{.*}}, %[[ARG2]][[INDEX]]
)",
                     LoweringStage::GPU);
}

TEST_F(LhloGenTest, AddInLVVMDialect) {
  CompileAndVerifyIr(R"(
HloModule Add

ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
  %x = f32[2,2]{1,0} parameter(0)
  %y = f32[2,2]{1,0} parameter(1)
  ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
})",
                     R"(
;CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm<.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]
;CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ARG0]][[INDEX:.*]]
;CHECK: %[[VAL0:.*]] = llvm.load %[[GEP0]]
;CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[ARG1]][[INDEX]]
;CHECK: %[[VAL1:.*]] = llvm.load %[[GEP1]]
;CHECK: %[[VAL2:.*]] = llvm.fadd %[[VAL0]], %[[VAL1]]
;CHECK: %[[GEP2:.*]] = llvm.getelementptr %[[ARG2]][[INDEX]]
;CHECK: llvm.store %[[VAL2]], %[[GEP2]]
)",
                     LoweringStage::LLVM);
}

TEST_F(LhloGenTest, AddMultiply) {
  CompileAndVerifyIr(R"(
HloModule AddMultiply
@@ -24,7 +24,7 @@ namespace xla {
namespace mlir_gpu {

// Lowers from LHLO dialect to affine dialect.
std::unique_ptr<::mlir::FunctionPassBase> createLegalizeAffine();
std::unique_ptr<::mlir::FunctionPassBase> createLegalizeAffinePass();

// Adds patterns to convert LHLO binary ops to affine loops.
void AppendBinaryOpsPatterns(::mlir::MLIRContext* context,