Drop the implementation of the mlir_gpu plugin prototype in XLA.

This leaves some passes and the kernel_lowering in place for now. Those are pending removal, too.

PiperOrigin-RevId: 348621555
Change-Id: I4ff2ed11fb646bf76ceaa780956c6257d89f54ae
Stephan Herhut 2020-12-22 05:56:55 -08:00 committed by TensorFlower Gardener
parent 852b977596
commit 1a44a1f9dd
54 changed files with 0 additions and 3400 deletions


@ -1036,31 +1036,8 @@ cc_library(
],
)
# This flag enables experimental MLIR GPU support.
config_setting(
name = "with_mlir_gpu_support",
define_values = {"with_mlir_gpu_support": "true"},
visibility = ["//visibility:public"],
)
# Lets us choose the right GPU plugin depending on whether the experimental MLIR
# GPU plugin should be used or not.
cc_library(
name = "gpu_plugin",
deps = select(
{
":with_mlir_gpu_support": [
":gpu_plugin_mlir",
],
"//conditions:default": [
":gpu_plugin_no_mlir",
],
},
),
)
cc_library(
name = "gpu_plugin_no_mlir",
deps = [
":service",
"//tensorflow/compiler/xla/service/gpu:gpu_compiler",
@ -1075,17 +1052,6 @@ cc_library(
]) + internal_cuda_deps(),
)
cc_library(
name = "gpu_plugin_mlir",
deps = [
":service",
"//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager",
"//tensorflow/core/platform:stream_executor_no_cuda",
] + if_cuda_is_configured([
"//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler_impl",
]) + internal_cuda_deps(),
)
cc_library(
name = "interpreter_plugin",
deps = [


@ -9,10 +9,6 @@ load("//tensorflow:tensorflow.bzl", "filegroup")
# buildifier: disable=same-origin-load
load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")
load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load(
"//tensorflow/core/platform/default:cuda_build_defs.bzl",
"if_cuda_is_configured",
)
load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
package( package(
@ -34,139 +30,6 @@ filegroup(
]),
)
cc_library(
name = "failover_compiler",
srcs = ["failover_compiler.cc"],
hdrs = ["failover_compiler.h"],
deps = [
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/core:lib",
],
)
cc_library(
name = "emission_context",
srcs = ["emission_context.cc"],
hdrs = ["emission_context.h"],
deps = [
"//tensorflow/compiler/mlir/hlo",
"//tensorflow/compiler/mlir/hlo:lhlo",
"//tensorflow/compiler/xla/service:hlo",
"@com_google_absl//absl/strings",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
],
)
cc_library(
name = "inject_errors_pass",
srcs = ["inject_errors_pass.cc"],
hdrs = ["inject_errors_pass.h"],
deps = [
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:StandardOps",
],
)
cc_library(
name = "mlir_compiler",
srcs = ["mlir_compiler.cc"],
hdrs = ["mlir_compiler.h"],
deps = [
":emission_context",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service/gpu:target_constants",
"//tensorflow/core/platform:stream_executor_no_cuda",
"@llvm-project//llvm:Core",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:LLVMDialect",
],
)
cc_library(
name = "mlir_compiler_impl",
srcs = if_cuda_is_configured(["mlir_compiler_impl.cc"]),
deps = if_cuda_is_configured([
":mlir_compiler",
":failover_compiler",
":emission_context",
":kernel_lowering",
":lhlo_dialect_emitter",
"@com_google_absl//absl/container:flat_hash_map",
"@llvm-project//llvm:Core",
"@llvm-project//mlir:GPUDialect",
"@llvm-project//mlir:AllPassesAndDialectsNoRegistration",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:LLVMDialect",
"@llvm-project//mlir:LLVMTransforms",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TargetNVVMIR",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:buffer_assignment",
"//tensorflow/compiler/xla/service:dump",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service/gpu:gpu_constants",
"//tensorflow/compiler/xla/service/gpu:gpu_executable",
"//tensorflow/compiler/xla/service/gpu:gpu_hlo_schedule",
"//tensorflow/compiler/xla/service/gpu:gpu_types",
"//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
"//tensorflow/compiler/xla/service/gpu:nvptx_compiler_impl",
"//tensorflow/compiler/xla/service/gpu:launch_dimensions",
"//tensorflow/compiler/xla/service/gpu:stream_assignment",
"//tensorflow/compiler/xla/service/gpu:stream_executor_util",
"//tensorflow/compiler/xla/service/gpu:target_constants",
"//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
"//tensorflow/core/platform:cuda_libdevice_path",
"//tensorflow/core:lib",
"//tensorflow/stream_executor/gpu:asm_compiler",
]),
alwayslink = True, # Contains compiler registration
)
cc_library(
name = "hlo_dialect_emitter",
srcs = ["hlo_dialect_emitter.cc"],
hdrs = ["hlo_dialect_emitter.h"],
deps = [
":emission_context",
"//tensorflow/compiler/mlir/hlo",
"//tensorflow/compiler/mlir/xla:hlo_utils",
"//tensorflow/compiler/xla:comparison_util",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla/service:hlo",
"@com_google_absl//absl/types:span",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
],
)
cc_library(
name = "lhlo_dialect_emitter",
srcs = ["lhlo_dialect_emitter.cc"],
hdrs = ["lhlo_dialect_emitter.h"],
deps = [
":emission_context",
":hlo_dialect_emitter",
"//tensorflow/compiler/mlir/hlo:lhlo",
"//tensorflow/compiler/mlir/xla:hlo_utils",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla/service:buffer_assignment",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service/gpu:thunk",
"//tensorflow/compiler/xla/service/gpu:thunk_emitter",
"//tensorflow/core:lib",
"//tensorflow/stream_executor:stream_executor_headers",
"@com_google_absl//absl/container:flat_hash_map",
"@llvm-project//llvm:Core",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:LLVMDialect",
"@llvm-project//mlir:StandardOps",
],
)
gentbl(
name = "passes_inc_gen",
compatible_with = get_compatible_with_cloud(),
@ -238,51 +101,6 @@ cc_library(
],
)
cc_library(
name = "xla_gpu_opt_lib",
testonly = True,
srcs = ["xla_gpu_opt.cc"],
hdrs = ["xla_gpu_opt.h"],
tags = ["no_pip"],
deps = [
":failover_compiler",
":inject_errors_pass",
":mlir_compiler",
"//tensorflow/compiler/xla:debug_options_flags",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/service:backend",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/tests:verified_hlo_module",
"//tensorflow/core:lib",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/strings",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
],
)
tf_cc_binary(
name = "xla-gpu-opt",
testonly = True,
srcs = ["xla_gpu_opt_main.cc"],
tags = ["no_pip"],
deps = [
":mlir_compiler",
":xla_gpu_opt_lib",
"//tensorflow/compiler/mlir:init_mlir",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla/service:gpu_plugin_mlir",
"//tensorflow/core:lib",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:SideEffects",
"@llvm-project//mlir:Support",
],
)
tf_cc_binary(
name = "xla-mlir-gpu-opt",
srcs = ["xla_mlir_gpu_opt.cc"],


@ -1,137 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
#include "absl/strings/substitute.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Location.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
namespace xla {
namespace mlir_gpu {
EmissionContext::EmissionContext(std::unique_ptr<HloModule> module)
: module_(std::move(module)), context_() {
context_.loadDialect<mlir::mhlo::MhloDialect, mlir::lmhlo::LmhloDialect,
mlir::StandardOpsDialect>();
error_handler_ = [](const ErrorMap& instructions_with_error,
HloModule* module) {
std::set<const HloComputation*> computations_with_error;
for (const auto& err : instructions_with_error) {
computations_with_error.insert(err.first->parent());
}
LOG(ERROR) << module->ToString(
HloPrintOptions()
.set_print_instruction(
[&instructions_with_error](const HloInstruction* instr) {
return instructions_with_error.count(instr);
})
.set_format_instruction(
// Returns the string representation of `instr` in the following
// format.
//
// ROOT? instr_name
// FAILED: err_0
// FAILED: err_1
// ...
[&instructions_with_error](const HloInstruction* instr,
const string& instr_name, int indent,
bool is_root) {
const string tab(2 * indent, ' ');
if (!instructions_with_error.count(instr)) {
return absl::StrCat(tab, is_root ? "ROOT " : "",
instr_name);
}
static constexpr char kStartBold[] = "\033[1m";
static constexpr char kStartRed[] = "\033[31m";
static constexpr char kBackToNormal[] = "\033[0m";
string result =
absl::StrCat(tab, kStartBold, is_root ? "ROOT " : "",
instr_name, kBackToNormal);
for (const string& err : instructions_with_error.at(instr)) {
absl::SubstituteAndAppend(
&result, "\n$0 $1$2FAILED:$3 $4$5$6", tab, kStartBold,
kStartRed, kBackToNormal, kStartBold, err,
kBackToNormal);
}
return result;
})
.set_print_computation(
[&computations_with_error](const HloComputation* comp) {
return computations_with_error.find(comp) !=
computations_with_error.end();
}));
};
registerDiagnosticHandler();
}
EmissionContext::EmissionContext(
std::unique_ptr<HloModule> module,
std::function<void(const ErrorMap&, HloModule*)> callback)
: module_(std::move(module)), context_(), error_handler_(callback) {
registerDiagnosticHandler();
}
EmissionContext::~EmissionContext() { callErrorHandlerCallback(); }
mlir::Location EmissionContext::getLocation(const HloInstruction* instr) {
return mlir::OpaqueLoc::get<const HloInstruction*>(instr, &context_);
}
void EmissionContext::addError(const HloInstruction* hlo_instruction,
const string& str) {
instructions_with_error_[hlo_instruction].push_back(str);
}
void EmissionContext::setErrorHandler(
std::function<void(const ErrorMap&, HloModule*)> callback) {
error_handler_ = callback;
}
std::unique_ptr<HloModule> EmissionContext::releaseHloModule() {
callErrorHandlerCallback();
return std::move(module_);
}
HloModule* EmissionContext::getHloModule() const { return module_.get(); }
mlir::MLIRContext* EmissionContext::getContext() { return &context_; }
void EmissionContext::registerDiagnosticHandler() {
context_.getDiagEngine().registerHandler([&](mlir::Diagnostic& diag) {
const HloInstruction* hloInstruction =
mlir::OpaqueLoc::getUnderlyingLocationOrNull<const HloInstruction*>(
diag.getLocation());
assert(hloInstruction);
addError(hloInstruction, diag.str());
return mlir::success();
});
}
void EmissionContext::callErrorHandlerCallback() {
if (module_.get() && !instructions_with_error_.empty()) {
error_handler_(instructions_with_error_, module_.get());
}
}
} // namespace mlir_gpu
} // namespace xla


@ -1,89 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EMISSION_CONTEXT_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EMISSION_CONTEXT_H_
#include <memory>
#include "mlir/IR/Diagnostics.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
namespace xla {
namespace mlir_gpu {
// Registers a diagnostic handler and collects all the errors as a map from
// HloInstruction* to a vector of string representations of all the errors that
// occurred at that hlo instruction. Also, it takes a function that handles
// those errors at the point when the instance gets destroyed or
// `releaseHloModule()` is called.
//
// EmissionContext uses an RAII pattern: it owns its hlo module and mlir
// context.
class EmissionContext {
public:
using ErrorMap =
std::unordered_map<const HloInstruction*, std::vector<std::string>>;
// Gets an hlo module and sets the default error handler which writes to the
// ERROR log and is executed when the instance gets destroyed or
// `releaseHloModule()` is called.
explicit EmissionContext(std::unique_ptr<HloModule> module);
// Gets an hlo module and an error handler function which is executed when the
// instance gets destroyed or `releaseHloModule()` is called.
EmissionContext(std::unique_ptr<HloModule> module,
std::function<void(const ErrorMap&, HloModule*)> callback);
// Handles all the errors according to the error handler function before
// getting destroyed.
~EmissionContext();
// Returns a location constructed from `instr` that then is used by
// the diagnostic handler to collect the errors.
mlir::Location getLocation(const HloInstruction* instr);
// Adds an error message associated with provided hlo instruction.
void addError(const HloInstruction* hlo_instruction, const string& str);
// Sets a function that handles the errors at the point when the instance
// gets destroyed or `releaseHloModule()` is called.
void setErrorHandler(
std::function<void(const ErrorMap&, HloModule*)> callback);
// Releases hlo module and handles all the errors according to the error
// handler function.
std::unique_ptr<HloModule> releaseHloModule();
HloModule* getHloModule() const;
mlir::MLIRContext* getContext();
private:
void registerDiagnosticHandler();
void callErrorHandlerCallback();
std::unique_ptr<HloModule> module_;
ErrorMap instructions_with_error_;
mlir::MLIRContext context_;
std::function<void(const ErrorMap&, HloModule*)> error_handler_;
};
} // namespace mlir_gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_EMISSION_CONTEXT_H_
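For context on what is being removed here: EmissionContext collects per-instruction errors via an MLIR diagnostic handler and runs a caller-supplied handler when it is destroyed or when releaseHloModule() is called. A minimal usage sketch against the interface above; the helper name MakeQuietContext and the string-aggregating handler are illustrative only, not part of the deleted code:

#include <memory>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"

namespace xla {
namespace mlir_gpu {

// Illustrative helper: builds an EmissionContext whose handler appends the
// collected errors to `log` instead of writing the module to the ERROR log.
std::unique_ptr<EmissionContext> MakeQuietContext(
    std::unique_ptr<HloModule> module, std::string* log) {
  auto handler = [log](const EmissionContext::ErrorMap& errors,
                       HloModule* /*hlo_module*/) {
    for (const auto& entry : errors) {
      for (const std::string& message : entry.second) {
        absl::StrAppend(log, entry.first->name(), ": ", message, "\n");
      }
    }
  };
  return std::make_unique<EmissionContext>(std::move(module), handler);
}

}  // namespace mlir_gpu
}  // namespace xla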


@ -1,119 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h"
#include <memory>
#include "tensorflow/core/lib/core/errors.h"
namespace xla {
template <typename T>
bool IsUnimplemented(StatusOr<T>& result) {
return result.status().code() == tensorflow::error::Code::UNIMPLEMENTED;
}
StatusOr<std::unique_ptr<HloModule>> FailoverCompiler::RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) {
auto result = primary_->RunHloPasses(module->Clone(), stream_exec, options);
if (IsUnimplemented(result)) {
VLOG(2) << "RunHloPasses resulted in " << result.status()
<< ", falling back to secondary backend";
return secondary_->RunHloPasses(std::move(module), stream_exec, options);
}
return result;
}
StatusOr<std::unique_ptr<Executable>> FailoverCompiler::RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) {
auto result = primary_->RunBackend(module->Clone(), stream_exec, options);
if (IsUnimplemented(result)) {
VLOG(2) << "RunBackend resulted in " << result.status()
<< ", falling back to secondary backend";
return secondary_->RunBackend(std::move(module), stream_exec, options);
}
return result;
}
StatusOr<std::vector<std::unique_ptr<Executable>>> FailoverCompiler::Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
const CompileOptions& options) {
std::vector<std::unique_ptr<Executable>> result;
std::vector<std::unique_ptr<HloModule>> modules =
module_group->ConsumeModules();
for (size_t i = 0; i < modules.size(); i++) {
if (stream_execs[i].size() != 1) {
// This is not supported by GPU compiler anyway.
return Unimplemented(
"Model partitioning not implemented for the failover compiler!");
}
auto executable = [stream_execs, &options, i,
this](std::unique_ptr<HloModule> module)
-> StatusOr<std::unique_ptr<Executable>> {
TF_ASSIGN_OR_RETURN(auto processed_module,
primary_->RunHloPasses(std::move(module),
stream_execs[i][0], options));
TF_ASSIGN_OR_RETURN(auto result,
primary_->RunBackend(std::move(processed_module),
stream_execs[i][0], options));
return result;
}(modules[i]->Clone());
if (IsUnimplemented(executable)) {
VLOG(2) << "Compile resulted in " << executable.status()
<< ", falling back to secondary backend";
TF_ASSIGN_OR_RETURN(
modules[i], secondary_->RunHloPasses(std::move(modules[i]),
stream_execs[i][0], options));
TF_ASSIGN_OR_RETURN(executable,
secondary_->RunBackend(std::move(modules[i]),
stream_execs[i][0], options));
}
if (!executable.ok()) {
return executable.status();
}
result.push_back(std::move(executable.ValueOrDie()));
}
return {std::move(result)};
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
FailoverCompiler::CompileAheadOfTime(
std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) {
// This is not supported by GPU compiler anyway.
return Unimplemented(
"CompileAheadOfTime not implemented in failover compiler!");
}
HloCostAnalysis::ShapeSizeFunction FailoverCompiler::ShapeSizeBytesFunction()
const {
auto prim_fun = primary_->ShapeSizeBytesFunction();
auto second_fun = secondary_->ShapeSizeBytesFunction();
return [prim_fun, second_fun](const Shape& shape) -> int64 {
int64 primary = prim_fun(shape);
assert(primary == second_fun(shape));
return primary;
};
}
} // namespace xla


@ -1,81 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_FAILOVER_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_FAILOVER_COMPILER_H_
#include <memory>
#include "tensorflow/compiler/xla/service/compiler.h"
namespace xla {
// FailoverCompiler implements a compiler that fails over between a primary
// and secondary compiler.
//
// For all methods, first the primary compiler is invoked. If that compiler's
// implementation of the method fails with an unimplemented error, the
// secondary's compiler method is invoked. In all other cases, the result of
// the primary compiler's method is returned.
//
// The primary compiler is invoked on a clone of the supplied HloModule. This
// ensures that partial updates to the module by one compiler do not leak into
// the other compiler.
//
// The FailoverCompiler is used to layer a partial compiler implementation on
// top of a full implementation.
class FailoverCompiler final : public Compiler {
public:
FailoverCompiler(std::unique_ptr<Compiler> primary,
std::unique_ptr<Compiler> secondary)
: primary_(std::move(primary)), secondary_(std::move(secondary)) {
// Both compilers should serve the same platform id.
assert(primary_->PlatformId() == secondary_->PlatformId());
}
se::Platform::Id PlatformId() const override {
return primary_->PlatformId();
}
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
const CompileOptions& options) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) override;
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
Compiler* GetPrimary() const { return primary_.get(); }
Compiler* GetSecondary() const { return secondary_.get(); }
private:
std::unique_ptr<Compiler> primary_;
std::unique_ptr<Compiler> secondary_;
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_FAILOVER_COMPILER_H_
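To recap how the failover layering was used before this removal: a partial, MLIR-based compiler was stacked on top of the full XLA GPU compiler, with the actual wiring living in mlir_compiler_impl.cc (also deleted in this commit). A hedged sketch of that wiring; the factory function and parameter names are placeholders:

#include <memory>
#include <utility>

#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h"

namespace xla {

// Placeholder wiring: `partial` is a compiler that may return UNIMPLEMENTED
// (e.g. the MLIR-based prototype), `full` is the complete GPU compiler it
// falls back to. Both must report the same platform id; the FailoverCompiler
// constructor asserts this.
std::unique_ptr<Compiler> MakeFailoverCompiler(
    std::unique_ptr<Compiler> partial, std::unique_ptr<Compiler> full) {
  return std::make_unique<FailoverCompiler>(std::move(partial),
                                            std::move(full));
}

}  // namespace xla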


@ -1,276 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.h"
#include <utility>
#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project
#include "mlir/IR/Types.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/xla/hlo_utils.h"
#include "tensorflow/compiler/xla/comparison_util.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
namespace xla {
namespace mlir_gpu {
namespace {
using ::mlir::ArrayRef;
using ::mlir::Attribute;
using ::mlir::Identifier;
using ::mlir::Location;
using ::mlir::NamedAttribute;
using ::mlir::OpBuilder;
using ::mlir::RankedTensorType;
using ::mlir::Type;
using ::mlir::Value;
namespace hlo = ::mlir::mhlo;
// TODO(b/137624192) Use tablegen for this.
StatusOr<Value> InsertMlirOp(HloOpcode opcode, OpBuilder func_builder,
Location loc, ArrayRef<Type> rets,
ArrayRef<Value> args,
ArrayRef<std::pair<Identifier, Attribute>> attrs) {
switch (opcode) {
case HloOpcode::kAbs:
return {func_builder.create<hlo::AbsOp>(loc, rets, args, attrs)};
case HloOpcode::kAdd:
return {func_builder.create<hlo::AddOp>(loc, rets, args, attrs)};
case HloOpcode::kAnd:
return {func_builder.create<hlo::AndOp>(loc, rets, args, attrs)};
case HloOpcode::kCeil:
return {func_builder.create<hlo::CeilOp>(loc, rets, args, attrs)};
case HloOpcode::kComplex:
return {func_builder.create<hlo::ComplexOp>(loc, rets, args, attrs)};
case HloOpcode::kCopy:
return {func_builder.create<hlo::CopyOp>(loc, rets, args, attrs)};
case HloOpcode::kCos:
return {func_builder.create<hlo::CosOp>(loc, rets, args, attrs)};
case HloOpcode::kDivide:
return {func_builder.create<hlo::DivOp>(loc, rets, args, attrs)};
case HloOpcode::kExp:
return {func_builder.create<hlo::ExpOp>(loc, rets, args, attrs)};
case HloOpcode::kImag:
return {func_builder.create<hlo::ImagOp>(loc, rets, args, attrs)};
case HloOpcode::kLog:
return {func_builder.create<hlo::LogOp>(loc, rets, args, attrs)};
case HloOpcode::kMaximum:
return {func_builder.create<hlo::MaxOp>(loc, rets, args, attrs)};
case HloOpcode::kMinimum:
return {func_builder.create<hlo::MinOp>(loc, rets, args, attrs)};
case HloOpcode::kMultiply:
return {func_builder.create<hlo::MulOp>(loc, rets, args, attrs)};
case HloOpcode::kNegate:
return {func_builder.create<hlo::NegOp>(loc, rets, args, attrs)};
case HloOpcode::kReal:
return {func_builder.create<hlo::RealOp>(loc, rets, args, attrs)};
case HloOpcode::kRemainder:
return {func_builder.create<hlo::RemOp>(loc, rets, args, attrs)};
case HloOpcode::kRsqrt:
return {func_builder.create<hlo::RsqrtOp>(loc, rets, args, attrs)};
case HloOpcode::kSelect:
return {func_builder.create<hlo::SelectOp>(loc, rets, args, attrs)};
case HloOpcode::kSign:
return {func_builder.create<hlo::SignOp>(loc, rets, args, attrs)};
case HloOpcode::kSqrt:
return {func_builder.create<hlo::SqrtOp>(loc, rets, args, attrs)};
case HloOpcode::kSubtract:
return {func_builder.create<hlo::SubOp>(loc, rets, args, attrs)};
case HloOpcode::kTanh:
return {func_builder.create<hlo::TanhOp>(loc, rets, args, attrs)};
default:
return tensorflow::errors::Internal(absl::StrCat(
"HLO Opcode ", HloOpcodeString(opcode), " is not supported."));
}
}
} // namespace
mlir::Location HloDialectEmitter::getLocation(
const HloInstruction* instr) const {
return emission_context_->getLocation(instr);
}
StatusOr<Value> HloDialectEmitter::EmitComputation(
const HloComputation& computation) {
const auto root = computation.root_instruction();
TF_RETURN_IF_ERROR(root->Accept(this));
return instruction_to_values_[root];
}
Status HloDialectEmitter::DefaultAction(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
llvm::SmallVector<Value, 4> arguments;
arguments.reserve(instr->operand_count());
for (auto operand : instr->operands()) {
arguments.push_back(instruction_to_values_[operand]);
}
TF_ASSIGN_OR_RETURN(
auto inserted, InsertMlirOp(instr->opcode(), builder_, getLocation(instr),
res_type, arguments, llvm::None));
instruction_to_values_[instr] = inserted;
return Status::OK();
}
Status HloDialectEmitter::HandleBroadcast(HloInstruction* instr) {
mlir::DenseIntElementsAttr broadcast_dim =
CreateDenseIntElementsAttrFromVector(instr->dimensions(), builder_);
TF_ASSIGN_OR_RETURN(Type res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
instruction_to_values_[instr] = builder_.create<hlo::BroadcastInDimOp>(
getLocation(instr), llvm::makeArrayRef(res_type),
instruction_to_values_[instr->operand(0)], broadcast_dim);
return Status::OK();
}
Status HloDialectEmitter::HandleConcatenate(HloInstruction* instr) {
int64 concatenate_dim = instr->concatenate_dimension();
TF_ASSIGN_OR_RETURN(Type res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
llvm::SmallVector<Value, 4> arguments;
arguments.reserve(instr->operand_count());
for (auto operand : instr->operands()) {
arguments.push_back(instruction_to_values_[operand]);
}
instruction_to_values_[instr] = builder_.create<hlo::ConcatenateOp>(
getLocation(instr), llvm::makeArrayRef(res_type), arguments,
builder_.getI64IntegerAttr(concatenate_dim));
return Status::OK();
}
Status HloDialectEmitter::HandleParameter(HloInstruction* instr) {
auto argValue = arguments_[instr->parameter_number()];
instruction_to_values_[instr] = argValue;
return Status::OK();
}
Status HloDialectEmitter::HandleConstant(HloInstruction* instr) {
auto shape = instr->shape();
if (!shape.IsArray() || shape.rank() != 0) {
return Unimplemented("non-scalar constants are not supported yet");
}
TF_ASSIGN_OR_RETURN(auto type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
TF_ASSIGN_OR_RETURN(auto value, CreateDenseElementsAttrFromLiteral(
instr->literal(), builder_));
auto const_value =
builder_.create<hlo::ConstOp>(getLocation(instr), type, value);
instruction_to_values_[instr] = const_value;
return Status::OK();
}
Status HloDialectEmitter::HandleGather(HloInstruction* instr) {
HloGatherInstruction* gather = static_cast<HloGatherInstruction*>(instr);
mlir::mhlo::GatherDimensionNumbers dimension_numbers =
xla::CreateGatherDimensionNumbers(gather->gather_dimension_numbers(),
builder_);
mlir::DenseIntElementsAttr slice_sizes = CreateDenseIntElementsAttrFromVector(
llvm::SmallVector<int64, 4>{gather->gather_slice_sizes().begin(),
gather->gather_slice_sizes().end()},
builder_);
mlir::BoolAttr indices_are_sorted =
builder_.getBoolAttr(gather->indices_are_sorted());
TF_ASSIGN_OR_RETURN(Type res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
instruction_to_values_[instr] = builder_.create<hlo::GatherOp>(
getLocation(instr), res_type, instruction_to_values_[instr->operand(0)],
instruction_to_values_[instr->operand(1)], dimension_numbers, slice_sizes,
indices_are_sorted);
return Status::OK();
}
Status HloDialectEmitter::HandleReduce(HloInstruction* instr) {
llvm::SmallVector<Value, 4> operands;
for (auto operand : instr->operands()) {
operands.push_back(instruction_to_values_.at(operand));
}
const unsigned num_inputs = operands.size() / 2;
TF_ASSIGN_OR_RETURN(
const auto return_type,
ConvertTensorShapeToType<RankedTensorType>(instr->shape(), builder_));
const auto dimensions_attr =
CreateDenseIntElementsAttrFromVector(instr->dimensions(), builder_);
auto reduceOp = builder_.create<hlo::ReduceOp>(
getLocation(instr), return_type,
llvm::makeArrayRef(operands).take_front(num_inputs),
llvm::makeArrayRef(operands).take_back(num_inputs), dimensions_attr);
{
auto computation = instr->to_apply();
auto block = new mlir::Block();
llvm::SmallVector<Value, 4> arguments;
arguments.reserve(computation->num_parameters());
for (auto parameter : computation->parameter_instructions()) {
TF_ASSIGN_OR_RETURN(auto param_type,
ConvertTensorShapeToType<RankedTensorType>(
parameter->shape(), builder_));
arguments.push_back(block->addArgument(param_type));
}
reduceOp.body().push_back(block);
HloDialectEmitter emitter(emission_context_, &reduceOp.body(), arguments);
TF_ASSIGN_OR_RETURN(auto result, emitter.EmitComputation(*computation));
OpBuilder body_builder = OpBuilder::atBlockEnd(block);
body_builder.setInsertionPointToEnd(block);
body_builder.create<hlo::ReturnOp>(getLocation(instr),
ArrayRef<Value>{result});
}
// TODO(b/137624192) Add support for multiple results.
instruction_to_values_[instr] = reduceOp.getResult(0);
return Status::OK();
}
Status HloDialectEmitter::HandleCompare(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(Type res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
auto comparison_direction_attr = builder_.getNamedAttr(
"comparison_direction",
builder_.getStringAttr(
ComparisonDirectionToString(instr->comparison_direction())));
llvm::SmallVector<Value, 4> arguments;
arguments.reserve(instr->operand_count());
for (auto operand : instr->operands()) {
arguments.push_back(instruction_to_values_[operand]);
}
instruction_to_values_[instr] = builder_.create<hlo::CompareOp>(
getLocation(instr), llvm::makeArrayRef(res_type), arguments,
comparison_direction_attr);
return Status::OK();
}
Status HloDialectEmitter::HandleIota(HloInstruction* instr) {
mlir::IntegerAttr iota_dim = builder_.getI64IntegerAttr(
static_cast<HloIotaInstruction*>(instr)->iota_dimension());
TF_ASSIGN_OR_RETURN(Type res_type, ConvertTensorShapeToType<RankedTensorType>(
instr->shape(), builder_));
instruction_to_values_[instr] =
builder_.create<hlo::IotaOp>(getLocation(instr), res_type, iota_dim);
return Status::OK();
}
} // namespace mlir_gpu
} // namespace xla


@ -1,76 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_HLO_DIALECT_EMITTER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_HLO_DIALECT_EMITTER_H_
#include <memory>
#include "absl/types/span.h"
#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
#include "tensorflow/compiler/xla/status.h"
namespace xla {
namespace mlir_gpu {
class HloDialectEmitter : public DfsHloVisitorWithDefault {
public:
HloDialectEmitter(xla::mlir_gpu::EmissionContext* emission_context,
::mlir::Region* region,
llvm::ArrayRef<::mlir::Value> arguments)
: emission_context_(emission_context),
builder_(region),
arguments_(arguments) {}
HloDialectEmitter(xla::mlir_gpu::EmissionContext* emission_context,
::mlir::OpBuilder builder,
llvm::ArrayRef<::mlir::Value> arguments)
: emission_context_(emission_context),
builder_(builder),
arguments_(arguments) {}
StatusOr<mlir::Value> EmitComputation(const HloComputation& computation);
Status DefaultAction(HloInstruction* instr) override;
Status HandleBroadcast(HloInstruction* instr) override;
Status HandleCompare(HloInstruction* instr) override;
Status HandleConcatenate(HloInstruction* instr) override;
Status HandleConstant(HloInstruction* instr) override;
Status HandleGather(HloInstruction* instr) override;
Status HandleIota(HloInstruction* instr) override;
Status HandleParameter(HloInstruction* instr) override;
Status HandleReduce(HloInstruction* instr) override;
private:
mlir::Location getLocation(const HloInstruction* instr) const;
xla::mlir_gpu::EmissionContext* emission_context_;
::mlir::OpBuilder builder_;
llvm::ArrayRef<::mlir::Value> arguments_;
absl::flat_hash_map<const xla::HloInstruction*, ::mlir::Value>
instruction_to_values_;
};
} // namespace mlir_gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_HLO_DIALECT_EMITTER_H_
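For reference, HloDialectEmitter was driven by handing it an emission context, a target region, and one mlir::Value per computation parameter, then asking it to emit a computation. A minimal sketch, assuming the interface above; EmitIntoRegion is a hypothetical wrapper, not part of the deleted code:

#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Region.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {
namespace mlir_gpu {

// Hypothetical wrapper: emits `computation` into `region`, where `arguments`
// already holds one mlir::Value per computation parameter, and returns the
// value produced for the root instruction.
StatusOr<::mlir::Value> EmitIntoRegion(EmissionContext* context,
                                       ::mlir::Region* region,
                                       llvm::ArrayRef<::mlir::Value> arguments,
                                       const HloComputation& computation) {
  HloDialectEmitter emitter(context, region, arguments);
  return emitter.EmitComputation(computation);
}

}  // namespace mlir_gpu
}  // namespace xla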


@ -1,41 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
namespace mlir {
namespace {
struct InjectErrorsForTestingPass
: public PassWrapper<InjectErrorsForTestingPass, FunctionPass> {
void runOnFunction() override {
getFunction().getBody().walk([&](Operation *op) {
op->emitError() << "failed for testing: " << op->getName();
});
}
};
} // namespace
std::unique_ptr<OperationPass<FuncOp>> createInjectErrorsForTestingPass() {
return std::make_unique<InjectErrorsForTestingPass>();
}
static PassRegistration<InjectErrorsForTestingPass> pass(
"inject-errors", "Emits errors from all operations");
} // namespace mlir


@ -1,29 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_INJECT_ERRORS_PASS_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_INJECT_ERRORS_PASS_H_
#include "mlir/Pass/Pass.h" // from @llvm-project
namespace mlir {
// Returns a function pass that emits errors from all operations inside the
// function.
std::unique_ptr<OperationPass<FuncOp>> createInjectErrorsForTestingPass();
} // namespace mlir
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_INJECT_ERRORS_PASS_H_
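The pass registers itself under the flag "inject-errors" for opt tools and is also pulled in by the xla-gpu-opt test driver above. A sketch of programmatic use, assuming the MLIR pass-manager API of this revision (exact header paths may differ):

#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h"

// Illustrative test helper: runs the error-injecting pass on every function in
// `module`, so every operation emits a diagnostic that the surrounding error
// machinery can be tested against.
mlir::LogicalResult InjectErrorsForTesting(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addNestedPass<mlir::FuncOp>(mlir::createInjectErrorsForTestingPass());
  return pm.run(module);
}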


@ -1,504 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h"
#include <utility>
#include "llvm/IR/DataLayout.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project
#include "mlir/IR/Identifier.h" // from @llvm-project
#include "mlir/IR/Types.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/mlir/xla/hlo_utils.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/lib/core/errors.h"
namespace xla {
namespace mlir_gpu {
namespace {
using ::mlir::ArrayRef;
using ::mlir::Attribute;
using ::mlir::Builder;
using ::mlir::DenseIntElementsAttr;
using ::mlir::FuncOp;
using ::mlir::Identifier;
using ::mlir::Location;
using ::mlir::MemRefType;
using ::mlir::ModuleOp;
using ::mlir::OpBuilder;
using ::mlir::Type;
using ::mlir::Value;
using ::mlir::LLVM::LLVMDialect;
using ::xla::gpu::Thunk;
using ::xla::gpu::ThunkEmitter;
using ::xla::gpu::ThunkSequence;
namespace lhlo = ::mlir::lmhlo;
// TODO(b/137624192) Use tablegen for this.
Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc,
ArrayRef<Type> rets, ArrayRef<Value> args,
ArrayRef<std::pair<Identifier, Attribute>> attrs) {
switch (opcode) {
case HloOpcode::kAbs:
func_builder.create<lhlo::AbsOp>(loc, rets, args, attrs);
break;
case HloOpcode::kAdd:
func_builder.create<lhlo::AddOp>(loc, rets, args, attrs);
break;
case HloOpcode::kAnd:
func_builder.create<lhlo::AndOp>(loc, rets, args, attrs);
break;
case HloOpcode::kCeil:
func_builder.create<lhlo::CeilOp>(loc, rets, args, attrs);
break;
case HloOpcode::kComplex:
func_builder.create<lhlo::ComplexOp>(loc, rets, args, attrs);
break;
case HloOpcode::kCopy:
func_builder.create<lhlo::CopyOp>(loc, rets, args, attrs);
break;
case HloOpcode::kCos:
func_builder.create<lhlo::CosOp>(loc, rets, args, attrs);
break;
case HloOpcode::kDivide:
func_builder.create<lhlo::DivOp>(loc, rets, args, attrs);
break;
case HloOpcode::kExp:
func_builder.create<lhlo::ExpOp>(loc, rets, args, attrs);
break;
case HloOpcode::kImag:
func_builder.create<lhlo::ImagOp>(loc, rets, args, attrs);
break;
case HloOpcode::kLog:
func_builder.create<lhlo::LogOp>(loc, rets, args, attrs);
break;
case HloOpcode::kMaximum:
func_builder.create<lhlo::MaxOp>(loc, rets, args, attrs);
break;
case HloOpcode::kMinimum:
func_builder.create<lhlo::MinOp>(loc, rets, args, attrs);
break;
case HloOpcode::kMultiply:
func_builder.create<lhlo::MulOp>(loc, rets, args, attrs);
break;
case HloOpcode::kNegate:
func_builder.create<lhlo::NegOp>(loc, rets, args, attrs);
break;
case HloOpcode::kReal:
func_builder.create<lhlo::RealOp>(loc, rets, args, attrs);
break;
case HloOpcode::kRemainder:
func_builder.create<lhlo::RemOp>(loc, rets, args, attrs);
break;
case HloOpcode::kRsqrt:
func_builder.create<lhlo::RsqrtOp>(loc, rets, args, attrs);
break;
case HloOpcode::kSelect:
func_builder.create<lhlo::SelectOp>(loc, rets, args, attrs);
break;
case HloOpcode::kSign:
func_builder.create<lhlo::SignOp>(loc, rets, args, attrs);
break;
case HloOpcode::kSqrt:
func_builder.create<lhlo::SqrtOp>(loc, rets, args, attrs);
break;
case HloOpcode::kSubtract:
func_builder.create<lhlo::SubOp>(loc, rets, args, attrs);
break;
case HloOpcode::kTanh:
func_builder.create<lhlo::TanhOp>(loc, rets, args, attrs);
break;
default:
return tensorflow::errors::Internal(absl::StrCat(
"LHLO opcode ", HloOpcodeString(opcode), " is not supported."));
}
return Status::OK();
}
StatusOr<llvm::SmallVector<Type, 4>> GetInstructionArgTypes(
const HloInstruction& instruction, Builder builder) {
llvm::SmallVector<Type, 4> arg_types;
for (auto operand : instruction.operands()) {
TF_ASSIGN_OR_RETURN(auto operand_type, ConvertShapeToType<MemRefType>(
operand->shape(), builder));
arg_types.push_back(operand_type);
}
TF_ASSIGN_OR_RETURN(auto operand_type, ConvertShapeToType<MemRefType>(
instruction.shape(), builder));
arg_types.push_back(operand_type);
return arg_types;
}
// Converts HloComputation into a block with HLO dialect ops. The block gets
// memref arguments corresponding to HloComputation arguments and results.
Status SpliceHloComputation(OpBuilder builder, mlir::Location loc,
const HloComputation& hlo_computation,
xla::mlir_gpu::EmissionContext* emission_context) {
auto block = builder.getInsertionBlock();
builder.setInsertionPoint(block->getTerminator());
llvm::SmallVector<Value, 4> arg_values;
// First map parameters to memrefs on the operation.
for (auto param : hlo_computation.parameter_instructions()) {
TF_ASSIGN_OR_RETURN(
auto arg_type, ConvertShapeToType<MemRefType>(param->shape(), builder));
auto block_arg = block->addArgument(arg_type);
arg_values.push_back(builder.create<::mlir::TensorLoadOp>(loc, block_arg));
}
HloDialectEmitter hlo_emitter(emission_context, builder, arg_values);
TF_ASSIGN_OR_RETURN(auto result,
hlo_emitter.EmitComputation(hlo_computation));
// Now add a block arg and store for the result.
builder.setInsertionPoint(block->getTerminator());
TF_ASSIGN_OR_RETURN(
auto result_type,
ConvertShapeToType<MemRefType>(
hlo_computation.root_instruction()->shape(), builder));
auto block_arg = block->addArgument(result_type);
builder.create<::mlir::TensorStoreOp>(loc, result, block_arg);
return Status::OK();
}
} // namespace
mlir::Location LhloDialectEmitter::getLocation(
const HloInstruction* instr) const {
return emission_context_->getLocation(instr);
}
LhloDialectEmitter::LhloDialectEmitter(
xla::mlir_gpu::EmissionContext* emission_context,
const BufferAssignment& assignment, const se::Platform* platform,
ModuleOp mlir_module)
: emission_context_(emission_context),
mlir_module_(mlir_module),
builder_(mlir_module_.getContext()),
buffer_assignment_(assignment),
platform_(platform) {
llvm::DataLayout data_layout("");
if (auto data_layout_attr = mlir_module.getAttrOfType<mlir::StringAttr>(
mlir::LLVM::LLVMDialect::getDataLayoutAttrName())) {
data_layout.reset(data_layout_attr.getValue());
}
pointer_size_ = data_layout.getPointerSize();
}
void LhloDialectEmitter::AddThunkToThunkSequence(std::unique_ptr<Thunk> thunk) {
thunk_sequence_.push_back(std::move(thunk));
}
StatusOr<BufferAllocation::Slice> LhloDialectEmitter::MaybeGetAllocationSlice(
const HloInstruction& hlo, const ShapeIndex& index) const {
return buffer_assignment_.GetUniqueSlice(&hlo, index);
}
int64 LhloDialectEmitter::ByteSizeOf(const Shape& shape) const {
return ShapeUtil::ByteSizeOf(shape, pointer_size_);
}
absl::string_view LhloDialectEmitter::platform_name() const {
return platform_->Name();
}
StatusOr<FuncOp> LhloDialectEmitter::CreateFunction(
const HloInstruction& instr) {
TF_ASSIGN_OR_RETURN(auto args, GetInstructionArgTypes(instr, builder_));
auto function_type = builder_.getFunctionType(args, {});
auto function =
FuncOp::create(getLocation(&instr), instr.name(), function_type);
mlir_module_.push_back(function);
function.addEntryBlock();
OpBuilder op_builder(function.getBody());
op_builder.create<::mlir::ReturnOp>(getLocation(&instr));
instruction_to_mlir_func_[&instr] = function;
return function;
}
Status LhloDialectEmitter::DefaultAction(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
llvm::SmallVector<Value, 4> arg_values{function.args_begin(),
function.args_end()};
TF_RETURN_IF_ERROR(InsertMlirOp(instr->opcode(), func_builder,
getLocation(instr), ArrayRef<Type>{},
arg_values, llvm::None));
return Status::OK();
}
Status LhloDialectEmitter::HandleBroadcast(HloInstruction* instr) {
DenseIntElementsAttr broadcast_dim =
CreateDenseIntElementsAttrFromVector(instr->dimensions(), builder_);
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
func_builder.create<lhlo::BroadcastInDimOp>(
getLocation(instr), function.getArgument(0), function.getArgument(1),
broadcast_dim);
return Status::OK();
}
Status LhloDialectEmitter::HandleConcatenate(HloInstruction* instr) {
mlir::IntegerAttr concatenate_dim = builder_.getI64IntegerAttr(
static_cast<HloConcatenateInstruction*>(instr)->concatenate_dimension());
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
func_builder.create<lhlo::ConcatenateOp>(
getLocation(instr), function.getArguments().drop_back(),
function.getArguments().back(), concatenate_dim);
return Status::OK();
}
Status LhloDialectEmitter::HandleFusion(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
auto fusion_op = func_builder.create<lhlo::FusionOp>(getLocation(instr));
// Load the HLO argument tensors from the corresponding buffers. The last
// argument is for the result, so no need to load it.
OpBuilder body_builder(fusion_op.region());
llvm::SmallVector<Value, 4> arg_values;
for (int i = 0, e = function.getNumArguments() - 1; i < e; ++i) {
arg_values.push_back(body_builder.create<::mlir::TensorLoadOp>(
getLocation(instr), function.getArgument(i)));
}
HloDialectEmitter hlo_emitter(emission_context_, body_builder, arg_values);
TF_ASSIGN_OR_RETURN(
auto result,
hlo_emitter.EmitComputation(*instr->fused_instructions_computation()));
// Insert the write-back from the HLO computation to the result argument
// buffer.
body_builder.setInsertionPoint(fusion_op.region().back().getTerminator());
Value result_memref = function.getArguments().back();
body_builder.create<::mlir::TensorStoreOp>(getLocation(instr), result,
result_memref);
return Status::OK();
}
Status LhloDialectEmitter::HandleGather(HloInstruction* instr) {
HloGatherInstruction* gather = static_cast<HloGatherInstruction*>(instr);
mlir::mhlo::GatherDimensionNumbers dim_numbers =
xla::CreateGatherDimensionNumbers(gather->gather_dimension_numbers(),
builder_);
mlir::DenseIntElementsAttr slice_sizes = CreateDenseIntElementsAttrFromVector(
llvm::SmallVector<int64, 4>{gather->gather_slice_sizes().begin(),
gather->gather_slice_sizes().end()},
builder_);
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
func_builder.create<lhlo::GatherOp>(
getLocation(instr), function.getArgument(0), function.getArgument(1),
dim_numbers, slice_sizes, function.getArgument(2));
return Status::OK();
}
Status LhloDialectEmitter::HandleReduce(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
llvm::SmallVector<Value, 4> arg_values{function.args_begin(),
function.args_end()};
OpBuilder builder(function.getBody());
auto loc = getLocation(instr);
int input_count = instr->operand_count() / 3;
auto inputs = llvm::makeArrayRef(arg_values).slice(input_count);
auto init_values =
llvm::makeArrayRef(arg_values).slice(input_count, input_count);
auto results =
llvm::makeArrayRef(arg_values).slice(2 * input_count, input_count);
auto dimensions_attr =
CreateDenseIntElementsAttrFromVector(instr->dimensions(), builder_);
auto reduce_op = builder.create<lhlo::ReduceOp>(loc, inputs, init_values,
results, dimensions_attr);
builder.createBlock(&reduce_op.body());
OpBuilder::atBlockEnd(&reduce_op.body().front())
.create<lhlo::TerminatorOp>(getLocation(instr));
return SpliceHloComputation(OpBuilder{&reduce_op.body()}, loc,
*instr->to_apply(), emission_context_);
}
Status LhloDialectEmitter::HandleReduceWindow(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
llvm::SmallVector<Value, 4> arg_values{function.args_begin(),
function.args_end()};
OpBuilder builder(function.getBody());
auto loc = getLocation(instr);
// Collect attribute values.
llvm::SmallVector<int64, 2> window_dimensions, window_strides, base_dilations,
window_dilations;
llvm::SmallVector<int64, 4> padding;
int64 rank = instr->window().dimensions_size();
window_dimensions.reserve(rank);
window_strides.reserve(rank);
base_dilations.reserve(rank);
window_dilations.reserve(rank);
padding.reserve(2 * rank);
for (const auto& window : instr->window().dimensions()) {
window_dimensions.push_back(window.size());
window_strides.push_back(window.stride());
base_dilations.push_back(window.base_dilation());
window_dilations.push_back(window.window_dilation());
padding.push_back(window.padding_low());
padding.push_back(window.padding_high());
}
auto reduce_window_op = builder.create<lhlo::ReduceWindowOp>(
loc, /*operand=*/arg_values[0], /*init_value=*/arg_values[1],
/*out=*/arg_values[2],
CreateDenseIntElementsAttrFromVector(window_dimensions, builder),
CreateDenseIntElementsAttrFromVector(window_strides, builder),
CreateDenseIntElementsAttrFromVector(base_dilations, builder),
CreateDenseIntElementsAttrFromVector(window_dilations, builder),
CreateDenseIntElementsAttrFromVector(padding, builder, {rank, 2}));
reduce_window_op.ensureTerminator(reduce_window_op.body(), builder, loc);
return SpliceHloComputation(OpBuilder{&reduce_window_op.body()}, loc,
*instr->to_apply(), emission_context_);
}
Status LhloDialectEmitter::HandleSelectAndScatter(HloInstruction* instr) {
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
llvm::SmallVector<Value, 4> arg_values{function.args_begin(),
function.args_end()};
OpBuilder builder(function.getBody());
auto loc = getLocation(instr);
// Collect attribute values.
llvm::SmallVector<int64, 2> window_dimensions, window_strides, padding;
int64 rank = instr->window().dimensions_size();
window_dimensions.reserve(rank);
window_strides.reserve(rank);
padding.reserve(2 * rank);
for (const auto& window : instr->window().dimensions()) {
window_dimensions.push_back(window.size());
window_strides.push_back(window.stride());
padding.push_back(window.padding_low());
padding.push_back(window.padding_high());
}
auto select_scatter_op = builder.create<lhlo::SelectAndScatterOp>(
loc, /*operand=*/arg_values[0], /*source=*/arg_values[1],
/*init_value=*/arg_values[2],
/*out=*/arg_values[3],
CreateDenseIntElementsAttrFromVector(window_dimensions, builder),
CreateDenseIntElementsAttrFromVector(window_strides, builder),
CreateDenseIntElementsAttrFromVector(padding, builder, {rank, 2}));
// Convert `select` computation.
builder.createBlock(&select_scatter_op.select());
OpBuilder select_builder{&select_scatter_op.select()};
select_builder.create<lhlo::TerminatorOp>(loc);
TF_RETURN_IF_ERROR(SpliceHloComputation(select_builder, loc, *instr->select(),
emission_context_));
// Convert `scatter` computation.
builder.createBlock(&select_scatter_op.scatter());
OpBuilder scatter_builder{&select_scatter_op.scatter()};
scatter_builder.create<lhlo::TerminatorOp>(loc);
TF_RETURN_IF_ERROR(SpliceHloComputation(
scatter_builder, loc, *instr->scatter(), emission_context_));
return Status::OK();
}
Status LhloDialectEmitter::HandleCustomCall(HloInstruction* instr) {
return ThunkEmitter(this).HandleCustomCall(instr);
}
Status LhloDialectEmitter::HandleParameter(HloInstruction* instr) {
return Status::OK();
}
Status LhloDialectEmitter::HandleCompare(HloInstruction* instr) {
auto comparison_direction_attr = builder_.getNamedAttr(
"comparison_direction",
builder_.getStringAttr(
ComparisonDirectionToString(instr->comparison_direction())));
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
llvm::SmallVector<Value, 4> arg_values{function.args_begin(),
function.args_end()};
func_builder.create<lhlo::CompareOp>(getLocation(instr), llvm::None,
arg_values, comparison_direction_attr);
return Status::OK();
}
Status LhloDialectEmitter::HandleConstant(HloInstruction* instr) {
auto shape = instr->shape();
if (!shape.IsArray() || shape.rank() != 0) {
return Unimplemented("non-scalar constants are not supported yet");
}
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
TF_ASSIGN_OR_RETURN(auto value, CreateDenseElementsAttrFromLiteral(
instr->literal(), func_builder));
func_builder.create<lhlo::ConstOp>(getLocation(instr), value,
*function.args_begin());
return Status::OK();
}
Status LhloDialectEmitter::HandleIota(HloInstruction* instr) {
mlir::IntegerAttr iota_dim = builder_.getI64IntegerAttr(
static_cast<HloIotaInstruction*>(instr)->iota_dimension());
TF_ASSIGN_OR_RETURN(auto function, CreateFunction(*instr));
OpBuilder func_builder(function.getBody());
func_builder.create<lhlo::IotaOp>(getLocation(instr), iota_dim,
function.getArgument(0));
return Status::OK();
}
Status LhloDialectEmitter::HandleTuple(HloInstruction* instr) {
// For the root node of the entry computation we can elide writing the tuple
// buffer. We can always figure out the contents of the tuples from buffer
// assignment because we insert copies to ensure non-ambiguous output buffers.
// GpuExecutable never reads the tuple buffer.
if (instr ==
instr->parent()->parent()->entry_computation()->root_instruction()) {
return Status::OK();
}
return Unimplemented("handling of typles not yet implemented");
}
Status LhloDialectEmitter::FinishVisit(HloInstruction* root) {
return Status::OK();
}
} // namespace mlir_gpu
} // namespace xla

View File

@ -1,111 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_
#include <memory>
#include <utility>
#include "absl/container/flat_hash_map.h"
#include "mlir/IR/Builders.h" // from @llvm-project
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_emitter.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
#include "tensorflow/compiler/xla/status.h"
namespace xla {
namespace mlir_gpu {
// Implementation for the translation of HLO instructions to a ThunkSequence
// via MLIR using the LHLO dialect.
// Implements the DfsHloVisitor interface, emits LHLO computations as MLIR
// functions, and transforms them into gpu::Thunks.
class LhloDialectEmitter : public DfsHloVisitorWithDefault,
private gpu::ThunkEmitter::EmissionContext {
public:
LhloDialectEmitter(xla::mlir_gpu::EmissionContext* emission_context,
const BufferAssignment& assignment,
const se::Platform* platform,
::mlir::ModuleOp mlir_module);
~LhloDialectEmitter() override = default;
// The following methods implement the DfsHloVisitor interface.
//
// Default action which emits code for most operations. Operations which are
// special in some way are handled explicitly in HandleFoo methods.
Status DefaultAction(HloInstruction* instr) override;
Status HandleBroadcast(HloInstruction* instr) override;
Status HandleCompare(HloInstruction* instr) override;
Status HandleConcatenate(HloInstruction* instr) override;
Status HandleConstant(HloInstruction* instr) override;
Status HandleCustomCall(HloInstruction* instr) override;
Status HandleFusion(HloInstruction* instr) override;
Status HandleGather(HloInstruction* instr) override;
Status HandleIota(HloInstruction* instr) override;
Status HandleParameter(HloInstruction* instr) override;
Status HandleReduce(HloInstruction* instr) override;
Status HandleReduceWindow(HloInstruction* instr) override;
Status HandleSelectAndScatter(HloInstruction* instr) override;
Status HandleTuple(HloInstruction* instr) override;
Status FinishVisit(HloInstruction* root) override;
// Transfers the ownership of thunk_sequence_ out.
gpu::ThunkSequence ConsumeThunkSequence() {
gpu::ThunkSequence result;
std::swap(result, thunk_sequence_);
return result;
}
const absl::flat_hash_map<const xla::HloInstruction*, ::mlir::FuncOp>&
InstructionToFunctionMap() const {
return instruction_to_mlir_func_;
}
private:
StatusOr<::mlir::FuncOp> CreateFunction(const HloInstruction& instr);
// Interface required by ThunkEmitter
void AddThunkToThunkSequence(std::unique_ptr<gpu::Thunk> thunk) override;
StatusOr<BufferAllocation::Slice> MaybeGetAllocationSlice(
const HloInstruction& hlo, const ShapeIndex& index) const override;
int64 ByteSizeOf(const Shape& shape) const override;
absl::string_view platform_name() const override;
mlir::Location getLocation(const HloInstruction* instr) const;
xla::mlir_gpu::EmissionContext* emission_context_;
::mlir::ModuleOp mlir_module_;
::mlir::Builder builder_;
absl::flat_hash_map<const xla::HloInstruction*, ::mlir::FuncOp>
instruction_to_mlir_func_;
const BufferAssignment& buffer_assignment_;
const se::Platform* platform_;
// Cached pointer size extracted from the mlir module.
unsigned pointer_size_;
// The thunk sequence this IrEmitter generates for the input computation.
gpu::ThunkSequence thunk_sequence_;
TF_DISALLOW_COPY_AND_ASSIGN(LhloDialectEmitter);
};
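// Illustrative usage sketch (added for exposition; not part of the original
// header). The names below mirror the driver loop in mlir_compiler_impl.cc:
//
//   EmissionContext emission_context(std::move(hlo_module));
//   LhloDialectEmitter emitter(&emission_context, buffer_assignment,
//                              stream_exec->platform(), mlir_module);
//   for (HloInstruction* instruction : thunk_launch_order) {
//     TF_RETURN_IF_ERROR(instruction->Visit(&emitter));
//   }
//   gpu::ThunkSequence thunks = emitter.ConsumeThunkSequence();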
} // namespace mlir_gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_LHLO_DIALECT_EMITTER_H_

View File

@ -1,50 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"
#include <memory>
#include "llvm/IR/Module.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
namespace mlir_gpu {
MlirCompiler::MlirCompiler() : data_layout_("") {}
se::Platform::Id MlirCompiler::PlatformId() const {
return stream_executor::cuda::kCudaPlatformId;
}
void MlirCompiler::SetModuleHook(IRHook module_hook) {
module_hook_ = module_hook;
}
void MlirCompiler::RemoveModuleHook() {
module_hook_ = {nullptr, IRHook::LoweringStage::LHLO};
}
void MlirCompiler::SetErrorHandler(ErrorHandler error_handler) {
error_handler_ = error_handler;
}
void MlirCompiler::RemoveErrorHandler() { error_handler_ = nullptr; }
} // namespace mlir_gpu
} // namespace xla

View File

@ -1,70 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_
#include "llvm/IR/DataLayout.h"
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
namespace xla {
namespace mlir_gpu {
// A Compiler implementation that converts XLA's IR to a matching MLIR dialect,
// performs all lowering on the MLIR IR, and finally converts MLIR to LLVM IR
// for generation of a thunk suitable for XLA's runtime. MlirCompilerImpl
// contains the implementation.
class MlirCompiler : public Compiler {
using ErrorHandler =
std::function<void(const EmissionContext::ErrorMap&, HloModule*)>;
public:
MlirCompiler();
se::Platform::Id PlatformId() const override;
struct IRHook {
enum class LoweringStage { LHLO, GPU, LLVM, KERNEL };
Status invoke(LoweringStage stage_, mlir::ModuleOp module) {
if (callback && stage == stage_) {
return callback(module);
}
return Status::OK();
}
std::function<Status(mlir::ModuleOp)> callback;
LoweringStage stage;
};
void SetModuleHook(IRHook module_hook);
void RemoveModuleHook();
void SetErrorHandler(ErrorHandler error_handler);
void RemoveErrorHandler();
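// Illustrative usage sketch (added for exposition; not part of the original
// header): installing a hook that dumps the module once lowering reaches the
// GPU dialect stage. `compiler` is assumed to point at an MlirCompiler.
//
//   compiler->SetModuleHook({[](mlir::ModuleOp module) {
//                              module.dump();
//                              return Status::OK();
//                            },
//                            MlirCompiler::IRHook::LoweringStage::GPU});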
protected:
::mlir::MLIRContext context_;
llvm::DataLayout data_layout_;
IRHook module_hook_;
ErrorHandler error_handler_;
};
} // namespace mlir_gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_

View File

@ -1,629 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <string>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "llvm/IR/LLVMContext.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project
#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project
#include "mlir/IR/Location.h" // from @llvm-project
#include "mlir/IR/OperationSupport.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Support/LLVM.h" // from @llvm-project
#include "mlir/Target/NVVMIR.h" // from @llvm-project
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/dump.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
namespace xla {
namespace mlir_gpu {
namespace {
using ::mlir::BlockArgument;
using ::mlir::dyn_cast;
using ::mlir::FuncOp;
using ::mlir::ModuleOp;
using ::mlir::OwningModuleRef;
using ::mlir::UnknownLoc;
using ::mlir::Value;
using ::mlir::gpu::LaunchFuncOp;
using ::mlir::LLVM::LLVMDialect;
using ::mlir::LLVM::LLVMFuncOp;
using ::mlir::LLVM::LLVMType;
using ::xla::gpu::GpuExecutable;
using ::xla::gpu::GpuHloSchedule;
using ::xla::gpu::GpuVersion;
using ::xla::gpu::StreamAssignment;
using ::xla::gpu::ThunkSchedule;
// A Compiler implementation that converts XLA's IR to a matching MLIR dialect,
// performs all lowering on the MLIR IR, and finally converts MLIR to LLVM IR
// for generation of a thunk suitable for XLA's runtime.
class MlirCompilerImpl : public MlirCompiler {
public:
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
const CompileOptions& options) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
const AotCompilationOptions& options) override;
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
int64 pointer_size = data_layout_.getPointerSize();
return [pointer_size](const Shape& shape) {
return ShapeUtil::ByteSizeOf(shape, pointer_size);
};
}
};
// TODO(b/137624192) Share with NVPTX compiler
static std::vector<std::string> CandidateCudaRoots(
const HloModuleConfig& config) {
return tensorflow::CandidateCudaRoots(
config.debug_options().xla_gpu_cuda_data_dir());
}
void PrintCantFindCudaMessage(absl::string_view msg,
const HloModuleConfig& hlo_module_config) {
LOG(WARNING) << msg;
LOG(WARNING) << "Searched for CUDA in the following directories:";
for (const auto& dir : CandidateCudaRoots(hlo_module_config)) {
LOG(WARNING) << " " << dir;
}
LOG(WARNING)
<< "You can choose the search directory by setting xla_gpu_cuda_data_dir "
"in HloModule's DebugOptions. For most apps, setting the environment "
"variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.";
}
// Returns the directory containing nvvm libdevice files.
std::string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
for (const string& cuda_root : CandidateCudaRoots(hlo_module_config)) {
const std::string libdevice_dir =
tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
VLOG(2) << "Looking for libdevice at " << libdevice_dir;
if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
VLOG(2) << "Found libdevice dir " << libdevice_dir;
return libdevice_dir;
}
}
PrintCantFindCudaMessage(
"Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may "
"result in compilation or runtime failures, if the program we try to run "
"uses routines from libdevice.",
hlo_module_config);
// CandidateCudaRoots always includes ".", but if everything fails, we
// return it anyway. Better than returning the empty string.
return ".";
}
StatusOr<std::unique_ptr<HloModule>> MlirCompilerImpl::RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) {
// Until we find a reason to do something different, run the same passes
// that the normal GPU backend runs.
gpu::NVPTXCompiler xla_compiler;
TF_RETURN_IF_ERROR(xla_compiler.OptimizeHloModule(module.get(), stream_exec,
options.device_allocator));
TF_RETURN_IF_ERROR(xla_compiler.PrepareHloModuleForIrEmitting(module.get()));
return std::move(module);
}
// TODO(b/137624192): Move this to custom call handling and share.
absl::optional<bool> CanShareBufferHint(const HloInstruction* user,
const HloInstruction* operand,
const ShapeIndex& user_index) {
if (user->opcode() == HloOpcode::kCustomCall) {
// Share the bias buffer with the parent instruction.
if (user->custom_call_target() == xla::gpu::kGemmCallTarget) {
if (user->operand_count() == 3 && user->operand(2) == operand) {
return true;
}
}
// The operand of cholesky can be shared with the first output.
if (user->custom_call_target() == xla::gpu::kCusolverCholeskyCallTarget) {
return user_index.size() == 1 && user_index[0] == 0;
}
}
return absl::nullopt;
}
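// Worked example (added for exposition, not part of the original file): for a
// gemm custom call of the form D = custom-call(A, B, C) that computes
// D = A * B + C, the hint above lets operand 2 (the bias C) alias the output
// buffer D, so the addition can be performed in place.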
// TODO(b/137624192): Share this with nvptx backend.
GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) {
int cc_major, cc_minor;
const auto& device_description = stream_exec->GetDeviceDescription();
if (!device_description.cuda_compute_capability(&cc_major, &cc_minor)) {
LOG(WARNING)
<< "Couldn't get compute capability for device; assuming sm_20.";
cc_major = 2;
cc_minor = 0;
}
return std::make_pair(cc_major, cc_minor);
}
// Returns the constant launch bound along the "x" dimension in "dim" if all
// the other dimensions are 1. Returns nullopt otherwise, or when any of the
// bounds is not constant.
static absl::optional<int64> getLaunchBound(const mlir::gpu::KernelDim3& dim) {
auto get_constant = [](mlir::Operation* op,
mlir::StringRef name) -> absl::optional<int64> {
if (auto constant = llvm::dyn_cast_or_null<mlir::ConstantOp>(op)) {
return constant.value().cast<mlir::IntegerAttr>().getInt();
}
op->emitError() << "bound " << name << " is not constant";
return absl::nullopt;
};
auto y_op = dim.y.getDefiningOp();
auto dim_y = get_constant(y_op, "y");
if (!dim_y.has_value() || dim_y.value() != 1) {
y_op->emitError() << "bound 'y' is not constant 1";
return absl::nullopt;
}
auto z_op = dim.z.getDefiningOp();
auto dim_z = get_constant(z_op, "z");
if (!dim_z.has_value() || dim_z.value() != 1) {
z_op->emitError() << "bound 'z' is not constant 1";
return absl::nullopt;
}
return get_constant(dim.x.getDefiningOp(), "x");
}
// Indices of a range of arguments in a GPU function. This is used to track
// the range of kernel arguments that correspond to a lowered kernel argument
// that previously had memref type.
struct LaunchFuncArgument {
int kernel_argument_begin;
int kernel_argument_size;
};
using OperandToValueMap =
absl::flat_hash_map<const HloInstruction*, std::vector<LaunchFuncArgument>>;
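// Illustrative sketch (added for exposition, not part of the original file):
// under MLIR's standard memref-to-LLVM lowering, a statically shaped ranked
// memref unpacks into an allocated pointer, an aligned pointer, an offset,
// and one size plus one stride per dimension, so a rank-r argument occupies
// 2 * r + 3 consecutive kernel arguments. LaunchFuncArgument records exactly
// that range via kernel_argument_begin and kernel_argument_size.
static int ExpectedUnpackedValues(int rank) { return 2 * rank + 3; }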
static StatusOr<std::vector<const HloInstruction*>> ComputeOperandToValueMap(
OperandToValueMap* operand_to_value_map, const HloInstruction* instr,
LaunchFuncOp launchOp, LLVMFuncOp kernel) {
auto operands = instr->operands();
std::vector<const HloInstruction*> ordered_operands;
bool has_failed = false;
// A memref expands into multiple kernel operands; accumulate their number
// in order to find them later.
int cur_operand_position = 0;
for (int kernel_index = 0; kernel_index < launchOp.getNumKernelOperands();
++kernel_index) {
auto launchop_operand =
launchOp.getKernelOperand(kernel_index).dyn_cast<BlockArgument>();
if (!launchop_operand) {
launchOp.emitError("argument to kernel is not a function input");
has_failed = true;
continue;
}
auto memref_type =
launchop_operand.getType().dyn_cast<::mlir::MemRefType>();
if (!memref_type) {
launchOp.emitError("only memref-typed arguments are supported");
has_failed = true;
break;
}
// host_index is the argument position in the surrounding function that
// contains the launch. This index corresponds to HLO operand indices
// by construction.
auto host_index = launchop_operand.getArgNumber();
// The trailing arguments to the outer function are the results.
auto operand =
(host_index < operands.size()) ? operands[host_index] : instr;
if (!operand_to_value_map->count(operand)) {
ordered_operands.push_back(operand);
}
// Associate the HLO operand with the argument values of the kernel
// function.
int num_unpacked =
mlir::MemRefDescriptor::getNumUnpackedValues(memref_type);
(*operand_to_value_map)[operand].push_back(
{cur_operand_position, num_unpacked});
cur_operand_position += num_unpacked;
}
if (has_failed) {
return InternalError("Mapping operands to kernel arguments has failed.");
}
return ordered_operands;
}
Status InsertBufferLoadPreduleIntoKernel(
LLVMFuncOp kernel, const OperandToValueMap& operand_to_value_map,
const std::vector<const HloInstruction*>& ordered_operands,
BufferAssignment* assignment,
const std::vector<const BufferAllocation*>& buffers) {
mlir::OpBuilder builder(kernel.getBody());
auto* context = kernel.getContext();
auto offset_type = LLVMType::getInt64Ty(context);
auto ptr_type = LLVMType::getInt8PtrTy(context);
auto void_type = LLVMType::getVoidTy(context);
auto loc = kernel.getLoc();
auto num_original_args = kernel.getNumArguments();
std::vector<LLVMType> new_arg_types(buffers.size(), ptr_type);
kernel->setAttr(kernel.getTypeAttrName(),
mlir::TypeAttr::get(LLVMType::getFunctionTy(
void_type, new_arg_types, /*isVarArg=*/false)));
std::vector<Value> original_args(kernel.args_begin(), kernel.args_end());
std::vector<mlir::Type> as_mlir_types(new_arg_types.begin(),
new_arg_types.end());
auto new_args = kernel.front().addArguments(as_mlir_types);
std::vector<Value> buffer_args(new_args.begin(), new_args.end());
for (auto operand : ordered_operands) {
TF_ASSIGN_OR_RETURN(auto slice,
assignment->GetUniqueTopLevelSlice(operand));
auto buffer = std::find(buffers.begin(), buffers.end(), slice.allocation());
auto index = buffer - buffers.begin();
auto offset = builder.create<mlir::LLVM::ConstantOp>(
loc, offset_type, builder.getI64IntegerAttr(slice.offset()));
auto ptr = buffer_args[index];
// Replace uses of function arguments pertaining to memref descriptors with
// values derived from HLO buffers. The instructions inserting these values
// into memref descriptors were already introduced during the lowering phase
// as per MLIR calling convention.
for (auto arg : operand_to_value_map.at(operand)) {
mlir::MemRefDescriptorView original(
mlir::ValueRange(original_args)
.slice(arg.kernel_argument_begin, arg.kernel_argument_size));
// Allocated and aligned pointers are the same.
auto casted = builder.create<mlir::LLVM::BitcastOp>(
loc, original.alignedPtr().getType().cast<LLVMType>(),
mlir::ValueRange(ptr));
original.alignedPtr().replaceAllUsesWith(casted);
original.allocatedPtr().replaceAllUsesWith(casted);
// Use the offset of the HLO buffer instead of the one expected in the
// function call.
original.offset().replaceAllUsesWith(offset);
// Unless the operand is a scalar, also fill the shape and strides of its
// memref descriptor.
auto shape = operand->shape();
if (shape.dimensions().empty()) {
continue;
}
// TODO(b/137624192) Pass in the descriptor to allow for dynamic shapes.
assert(shape.IsArray() && shape.is_static());
for (auto extent : llvm::enumerate(shape.dimensions())) {
auto shape = builder.create<mlir::LLVM::ConstantOp>(
loc, original.size(extent.index()).getType(),
builder.getI64IntegerAttr(extent.value()));
original.size(extent.index()).replaceAllUsesWith(shape);
}
// Finally, fill the strides.
// TODO(b/137624192): Take assigned layout into account.
uint64_t accumulator = 0;
for (int64_t idx = shape.rank() - 1; idx >= 0; --idx) {
if (accumulator == 0) {
accumulator = 1;
} else {
accumulator *= shape.dimensions(idx + 1);
}
auto stride = builder.create<mlir::LLVM::ConstantOp>(
loc, original.stride(idx).getType(),
builder.getI64IntegerAttr(accumulator));
original.stride(idx).replaceAllUsesWith(stride);
}
}
}
// Now we can remove the original arguments, as they should have no more
// users.
for (int i = 0; i < num_original_args; ++i) {
kernel.front().eraseArgument(0);
}
return Status::OK();
}
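// Illustrative sketch (added for exposition, not part of the original file):
// the row-major stride computation performed in the loop above, written as a
// standalone helper. For a static shape [2, 3, 4] it yields strides
// {12, 4, 1}.
static std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  int64_t accumulator = 1;
  for (int64_t idx = static_cast<int64_t>(dims.size()) - 1; idx >= 0; --idx) {
    strides[idx] = accumulator;
    accumulator *= dims[idx];
  }
  return strides;
}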
StatusOr<std::unique_ptr<gpu::KernelThunk>> TransformKernelToXlaThunk(
FuncOp func, const HloInstruction* const instr, ModuleOp kernel_module,
BufferAssignment* assignment) {
// Find the single LaunchFuncOp and compute a mapping from operands of
// the hlo instruction to the corresponding values of the kernel
// function in the target module.
LaunchFuncOp launchOp;
auto walkResult = func.walk([&launchOp](LaunchFuncOp op) {
if (launchOp) {
op.emitError("multiple kernels for single top-level HLO");
return mlir::WalkResult::interrupt();
}
launchOp = op;
return mlir::WalkResult::advance();
});
if (walkResult.wasInterrupted()) {
return InternalError("Multiple kernels for single top-level HLO");
}
if (!launchOp) {
// If there was no launchOp, then no kernel was generated, so the lowering
// from the LHLO ops to the GPU dialect is not implemented yet.
return Unimplemented("No kernel was generated.");
}
auto kernel =
kernel_module.lookupSymbol<LLVMFuncOp>(launchOp.getKernelName());
// Store the assignment of operands to block arguments. Note that an operand
// might be used in multiple argument positions, hence the vector.
OperandToValueMap operand_to_value_map;
TF_ASSIGN_OR_RETURN(
auto ordered_operands,
ComputeOperandToValueMap(&operand_to_value_map, instr, launchOp, kernel));
// Get the required buffers to support the inputs. Use a set and vector here
// to keep the order fixed. This is mostly useful for testing.
std::unordered_set<const BufferAllocation*> buffers_needed;
std::vector<const BufferAllocation*> buffers;
// TODO(b/137624192) Add support for tuples.
for (auto operand : ordered_operands) {
TF_ASSIGN_OR_RETURN(auto buffer,
assignment->GetUniqueTopLevelSlice(operand));
if (buffers_needed.insert(buffer.allocation()).second) {
buffers.push_back(buffer.allocation());
}
}
// TODO(b/137624192) Add support for temp buffer.
// TODO(b/137624192) Add support for constant buffers.
// Change the signature to match what the XLA runtime expects from the
// kernel.
TF_RETURN_IF_ERROR(InsertBufferLoadPreduleIntoKernel(
kernel, operand_to_value_map, ordered_operands, assignment, buffers));
// Finally, create the thunk and set the launch dimensions.
gpu::Thunk::ThunkInfo info;
auto thunk = absl::make_unique<gpu::KernelThunk>(info, buffers,
kernel.getName().str());
// Set launch bounds.
mlir::gpu::KernelDim3 block = launchOp.getBlockSizeOperandValues();
mlir::gpu::KernelDim3 grid = launchOp.getGridSizeOperandValues();
absl::optional<int64> num_threads = getLaunchBound(block);
absl::optional<int64> num_blocks = getLaunchBound(grid);
if (!num_threads || !num_blocks) {
return Unimplemented("Unsupported launch bounds");
}
thunk->SetLaunchDimensions(gpu::LaunchDimensions(*num_blocks, *num_threads));
return std::move(thunk);
}
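// Worked example (added for exposition, not part of the original file): a
// lowered kernel launched as
//   gpu.launch_func blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) ...
// passes getLaunchBound for both grid and block, yielding num_blocks = 4 and
// num_threads = 256, i.e. LaunchDimensions(4, 256). Non-constant bounds or a
// non-unit "y"/"z" dimension make the lowering bail out with "Unsupported
// launch bounds".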
StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) {
// Determine the HLO schedule, which is an ordering of HLO instructions. This
// is used by buffer assignment to enable buffer reuse, and the same ordering
// must also be used to determine the thunk launch schedule.
std::unique_ptr<StreamAssignment> stream_assignment =
xla::gpu::AssignStreams(*module);
TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuHloSchedule> hlo_schedule,
GpuHloSchedule::Build(*module, *stream_assignment,
data_layout_.getPointerSize()));
// Run buffer analysis on the HLO graph. This analysis figures out which
// temporary buffers are required to run the computation.
TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferAssignment> buffer_assignment,
BufferAssigner::Run(
module.get(), hlo_schedule->ConsumeHloOrdering(),
BufferSizeBytesFunction(),
/*color_alignment=*/
[](LogicalBuffer::Color) {
return xla::gpu::kXlaAllocatedBufferAlignBytes;
},
/*allocate_buffers_for_constants=*/true,
/*colorer=*/BufferAssigner::DefaultColorer(),
/*must_not_live_out=*/{}, &CanShareBufferHint));
DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations");
EmissionContext emission_context(std::move(module));
if (error_handler_) {
emission_context.setErrorHandler(error_handler_);
}
OwningModuleRef mlir_module =
ModuleOp::create(UnknownLoc::get(emission_context.getContext()));
LhloDialectEmitter lhlo_emitter(&emission_context, *buffer_assignment,
stream_exec->platform(), *mlir_module);
absl::flat_hash_map<const HloInstruction*, std::unique_ptr<gpu::Thunk>>
hlo_to_thunk;
for (HloInstruction* instruction : hlo_schedule->ThunkLaunchOrder()) {
TF_RETURN_IF_ERROR(instruction->Visit(&lhlo_emitter));
gpu::ThunkSequence thunks = lhlo_emitter.ConsumeThunkSequence();
TF_RET_CHECK(thunks.size() <= 1) << instruction->ToString();
if (!thunks.empty()) {
auto thunk = std::move(thunks.front());
hlo_to_thunk[instruction] = std::move(thunk);
}
}
TF_RETURN_IF_ERROR(
module_hook_.invoke(IRHook::LoweringStage::LHLO, *mlir_module));
TF_RETURN_IF_ERROR(LowerLHLOToGPU(*mlir_module));
TF_RETURN_IF_ERROR(
module_hook_.invoke(IRHook::LoweringStage::GPU, *mlir_module));
TF_RETURN_IF_ERROR(LowerKernelBodiesToNVVM(*mlir_module));
TF_RETURN_IF_ERROR(
module_hook_.invoke(IRHook::LoweringStage::LLVM, *mlir_module));
TF_ASSIGN_OR_RETURN(OwningModuleRef kernel_module,
ExtractKernelModule(*mlir_module));
for (auto entry : lhlo_emitter.InstructionToFunctionMap()) {
TF_ASSIGN_OR_RETURN(
auto thunk,
TransformKernelToXlaThunk(entry.second, entry.first, *kernel_module,
buffer_assignment.get()));
hlo_to_thunk[entry.first] = std::move(thunk);
}
absl::flat_hash_map<const gpu::Thunk*, const HloInstruction*> thunk_to_hlo;
gpu::ThunkSequence thunk_sequence;
{
for (HloInstruction* hlo : hlo_schedule->ThunkLaunchOrder()) {
auto it = hlo_to_thunk.find(hlo);
if (it != hlo_to_thunk.end()) {
const HloInstruction* hlo = it->first;
auto& thunk = it->second;
thunk_to_hlo[thunk.get()] = hlo;
thunk_sequence.push_back(std::move(thunk));
}
}
}
TF_RETURN_IF_ERROR(
module_hook_.invoke(IRHook::LoweringStage::KERNEL, *kernel_module));
// Translate to LLVM IR in a fresh context. The module is further translated
// to textual PTX and a CUBIN blob so there is no need for the context to live
// longer than this function.
llvm::LLVMContext llvmContext;
auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module, llvmContext);
if (!llvmModule) {
return InternalError("Translation to LLVM failed");
}
llvmModule->setModuleIdentifier(emission_context.getHloModule()->name());
// TODO(herhut): Why is this needed, and why does it not come from the template?
llvmModule->setDataLayout(gpu::nvptx::kDataLayout);
const auto& config = emission_context.getHloModule()->config();
TF_ASSIGN_OR_RETURN(
auto ptx, xla::gpu::nvptx::CompileToPtx(llvmModule.get(),
GetGpuVersion(stream_exec),
config, GetLibdeviceDir(config)));
// Allow falling back to driver compilation when ptxas is unable to
// compile.
StatusOr<std::vector<uint8>> maybe_cubin =
se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(),
gpu::PtxOptsFromConfig(config));
std::vector<uint8> cubin;
if (maybe_cubin.ok()) {
cubin = std::move(maybe_cubin).ValueOrDie();
} else if (maybe_cubin.status().code() ==
tensorflow::error::Code::UNIMPLEMENTED) {
xla::gpu::WarnIfBadDriverJITVersion();
} else {
return maybe_cubin.status();
}
auto thunk_schedule = absl::make_unique<ThunkSchedule>(
std::make_unique<gpu::ThunkSequence>(std::move(thunk_sequence)),
std::move(stream_assignment), std::move(thunk_to_hlo));
if (DumpingEnabledForHloModule(*emission_context.getHloModule())) {
DumpToFileInDirOrStdout(*emission_context.getHloModule(), "",
"thunk_schedule", thunk_schedule->ToString());
}
module = emission_context.releaseHloModule();
TF_ASSIGN_OR_RETURN(auto output_info,
xla::gpu::GetOutputInfo(*module, *buffer_assignment));
std::vector<BufferAllocation> allocations =
buffer_assignment->ReleaseAllocations();
// TODO(b/137624192): Add profiling support.
return {absl::make_unique<GpuExecutable>(GpuExecutable::Params{
std::move(ptx), std::move(cubin), GetGpuVersion(stream_exec),
std::move(thunk_schedule), std::vector<GpuExecutable::ConstantInfo>(),
std::move(output_info), std::move(module), std::move(allocations)})};
}
StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
const CompileOptions& options) {
return Unimplemented("Not yet implemented in MLIR compiler");
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
MlirCompilerImpl::CompileAheadOfTime(
std::unique_ptr<HloModuleGroup> /*module_group*/,
const AotCompilationOptions& /*options*/) {
return Unimplemented("Not yet implemented in MLIR compiler");
}
} // namespace
} // namespace mlir_gpu
} // namespace xla
static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(
stream_executor::cuda::kCudaPlatformId, []() {
return absl::make_unique<xla::FailoverCompiler>(
absl::make_unique<xla::mlir_gpu::MlirCompilerImpl>(),
absl::make_unique<xla::gpu::NVPTXCompiler>());
});
return true;
}
static bool module_initialized = InitModule();

View File

@ -1,48 +0,0 @@
load("//tensorflow:tensorflow.bzl", "filegroup")
load(
"//tensorflow/core/platform:build_config_root.bzl",
"tf_cuda_tests_tags",
"tf_exec_properties",
)
load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
package(
default_visibility = [":friends"],
licenses = ["notice"], # Apache 2.0
)
package_group(
name = "friends",
includes = [
"//tensorflow/compiler/xla:friends",
],
)
glob_lit_tests(
data = [
":test_utilities",
"@llvm-project//mlir:run_lit.sh",
],
default_tags = tf_cuda_tests_tags() + [
"no_pip",
"config-cuda-only",
"no_rocm",
],
driver = "//tensorflow/compiler/mlir:run_lit.sh",
exclude = [
# TODO(b/137624192): Reenable once we can fuse reductions.
"fused_reduce.hlo",
],
exec_properties = tf_exec_properties({"tags": tf_cuda_tests_tags()}),
test_file_exts = ["hlo"],
)
# Bundle together all of the test utilities that are used by tests.
filegroup(
name = "test_utilities",
testonly = True,
data = [
"//tensorflow/compiler/xla/service/mlir_gpu:xla-gpu-opt",
"@llvm-project//llvm:FileCheck",
],
)

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Abs
ENTRY %Abs (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %abs = f32[2,2]{1,0} abs(f32[2,2]{1,0} %val)
}
// CHECK: func @abs(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,12 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Add
ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
}
// CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,63 +0,0 @@
// RUN: xla-gpu-opt -lowering-stage=KERNEL %s | FileCheck %s
HloModule Add
ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
}
// CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm\..*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]
//
// Check that relevant sizes and strides are emitted.
//
// CHECK: %[[CAST0:.*]] = llvm.bitcast %[[ARG0:.*]] : !llvm.ptr<i8> to !llvm.ptr<float>
// CHECK: %[[SIZE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[SIZE01:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[STRIDE01:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64
// CHECK: %[[STRIDE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG1:.*]] : !llvm.ptr<i8> to !llvm.ptr<float>
// CHECK: %[[SIZE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[SIZE11:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[STRIDE11:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64
// CHECK: %[[STRIDE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[ARG2:.*]] : !llvm.ptr<i8> to !llvm.ptr<float>
// CHECK: %[[SIZE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[SIZE21:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
// CHECK: %[[STRIDE21:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64
// CHECK: %[[STRIDE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64
//
// Check that the emitted sizes and strides, as well the pointers to HLO buffers,
// are inserted into the memref descriptors.
//
// CHECK: %[[DESC0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC01:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC0]][0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC02:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC01]][1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC03:.*]] = llvm.insertvalue %{{.*}}, %[[DESC02]][2] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC04:.*]] = llvm.insertvalue %[[SIZE00]], %[[DESC03]][3, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC05:.*]] = llvm.insertvalue %[[STRIDE00]], %[[DESC04]][4, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC06:.*]] = llvm.insertvalue %[[SIZE01]], %[[DESC05]][3, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE01]], %[[DESC06]][4, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC11:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC1]][0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC12:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC11]][1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC13:.*]] = llvm.insertvalue %{{.*}}, %[[DESC12]][2] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC14:.*]] = llvm.insertvalue %[[SIZE10]], %[[DESC13]][3, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC15:.*]] = llvm.insertvalue %[[STRIDE10]], %[[DESC14]][4, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC16:.*]] = llvm.insertvalue %[[SIZE11]], %[[DESC15]][3, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE11]], %[[DESC16]][4, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC2:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC21:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC2]][0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC22:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC21]][1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC23:.*]] = llvm.insertvalue %{{.*}}, %[[DESC22]][2] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC24:.*]] = llvm.insertvalue %[[SIZE20]], %[[DESC23]][3, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC25:.*]] = llvm.insertvalue %[[STRIDE20]], %[[DESC24]][4, 0] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[DESC26:.*]] = llvm.insertvalue %[[SIZE21]], %[[DESC25]][3, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE21]], %[[DESC26]][4, 1] : !llvm.struct<(ptr<float>, ptr<float>, i64, array<2 x i64>, array<2 x i64>)>

View File

@ -1,22 +0,0 @@
// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s
HloModule Add
ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
}
// CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) {
// CHECK: gpu.launch_func
// CHECK-SAME: blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args
// CHECK-SAME: (%[[ARG0]] : [[TYPE]], %[[ARG1]] : [[TYPE]], %[[ARG2]] : [[TYPE]])
// CHECK: }
// CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]]
// CHECK-DAG: subview %[[ARG0]]{{\[}}[[INDEX:.*]]]
// CHECK-DAG: subview %[[ARG1]]{{\[}}[[INDEX]]]
// CHECK-DAG: subview %[[ARG2]]{{\[}}[[INDEX]]]
// CHECK: %[[VAL1:.*]] = load %{{.*\[}}[[INDEX:.*]]]
// CHECK: %[[VAL2:.*]] = load %{{.*\[}}[[INDEX]]]
// CHECK: %[[RES:.*]] = addf %[[VAL1]], %[[VAL2]]
// CHECK: store %[[RES]], %{{.*\[}}[[INDEX]]]

View File

@ -1,22 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule AddMultiply
ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
%z = f32[2,2]{1,0} parameter(2)
%add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z)
}
// CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]])
// CHECK: "lmhlo.fusion"() ( {
// CHECK: %[[REF0:.*]] = tensor_load %[[ARG0]] : [[TYPE]]
// CHECK: %[[REF1:.*]] = tensor_load %[[ARG1]] : [[TYPE]]
// CHECK: %[[REF2:.*]] = tensor_load %[[ARG2]] : [[TYPE]]
// CHECK: %[[ADD:.*]] = mhlo.add %[[REF1]], %[[REF2]]
// CHECK: %[[MUL:.*]] = mhlo.multiply %[[ADD]], %[[REF0]]
// CHECK: tensor_store %[[MUL]], %[[RESULT]]
// CHECK: "lmhlo.terminator"()
// CHECK-NEXT: }

View File

@ -1,23 +0,0 @@
// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s
HloModule AddMultiply
ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
%z = f32[2,2]{1,0} parameter(2)
%add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z)
}
// CHECK: func @fusion_kernel(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]])
// CHECK-DAG: subview %[[ARG0]]{{\[}}[[INDEX:.*]]]
// CHECK-DAG: subview %[[ARG1]]{{\[}}[[INDEX]]]
// CHECK-DAG: subview %[[ARG2]]{{\[}}[[INDEX]]]
// CHECK-DAG: subview %[[RESULT]]{{\[}}[[INDEX]]]
// CHECK: %[[V0:.*]] = load %{{.*\[}}[[CSTIDX:.*]]]
// CHECK: %[[V1:.*]] = load %{{.*\[}}[[CSTIDX:.*]]]
// CHECK: %[[ADD:.*]] = addf %[[V0]], %[[V1]]
// CHECK: %[[V2:.*]] = load %{{.*\[}}[[CSTIDX:.*]]]
// CHECK: %[[MUL:.*]] = mulf %[[ADD]], %[[V2]]
// CHECK: store %[[MUL]], %{{.*\[}}[[CSTIDX:.*]]]
// CHECK: return

View File

@ -1,24 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule AddReduce
%add (x: f32[], y: f32[]) -> f32[] {
%x = f32[] parameter(0)
%y = f32[] parameter(1)
ROOT %add = f32[] add(f32[] %x, f32[] %y)
}
ENTRY %AddReduce (x: f32[100,10], c: f32[]) -> f32[100] {
%x = f32[100,10]{1,0} parameter(0)
%c = f32[] parameter(1)
ROOT %reduce = f32[100]{0} reduce(f32[100,10]{1,0} %x, f32[] %c), dimensions={1}, to_apply=%add
}
// CHECK: func @reduce(%[[ARG:.*]]: [[ARGT:.*]], %[[CST:.*]]: memref<f32>, %[[RES:.*]]: [[REST:.*]]) {
// CHECK: "lmhlo.reduce"(%[[ARG]], %[[CST]], %[[RES]]) ( {
// CHECK: ^bb0(%[[FARG0:.*]]: memref<f32>, %[[FARG1:.*]]: memref<f32>, %[[FRES:.*]]: memref<f32>):
// CHECK: %[[LHS:.*]] = tensor_load %[[FARG0]] : memref<f32>
// CHECK: %[[RHS:.*]] = tensor_load %[[FARG1]] : memref<f32>
// CHECK: %[[RES:.*]] = mhlo.add %[[LHS]], %[[RHS]] : tensor<f32>
// CHECK: tensor_store %[[RES]], %[[FRES]] : memref<f32>
// CHECK: "lmhlo.terminator"() : () -> ()
// CHECK-NEXT: }) {dimensions = dense<1> : tensor<1xi64>} : ([[ARGT]], memref<f32>, [[REST]]) -> ()

View File

@ -1,14 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Broadcast
ENTRY %Broadcast (x: f32[10]) -> f32[10, 5] {
%x = f32[10]{0} parameter(0)
ROOT %broadcast = f32[10, 5]{1,0} broadcast(f32[10]{0} %x), dimensions={0}
}
// CHECK: func @broadcast(%[[IN:.*]]: [[IN_T:.*]], %[[OUT:.*]]: [[OUT_T:.*]]) {
// CHECK: "lmhlo.broadcast_in_dim"(%[[IN]], %[[OUT]])
// CHECK: {broadcast_dimensions = dense<0> : tensor<1xi64>}
// CHECK: : ([[IN_T]], [[OUT_T]]) -> ()
// CHECK: }

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt -verify-errors %s | FileCheck %s
HloModule Add
ENTRY %Add (x: f32[2,2,2], y: f32[2,2,2]) -> f32[2,2,2] {
%x = f32[2,2,2]{2,1,0} parameter(0)
%y = f32[2,2,2]{2,1,0} parameter(1)
ROOT %add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y), metadata={op_name="original_tf_op"}
}
// CHECK: ERRORS FOUND: [%add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y), metadata={op_name="original_tf_op"}: failed for testing: lmhlo.add; failed for testing: std.return]

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Ceil
ENTRY %Ceil (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %ceil = f32[2,2]{1,0} ceil(f32[2,2]{1,0} %val)
}
// CHECK: func @ceil(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.ceil"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,13 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Compare
ENTRY %Compare (x: f32[2,2], y: f32[2,2]) -> pred[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %compare = pred[2,2]{1,0} compare(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y), direction=EQ
}
// CHECK: func @compare(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[PRED:.*]]: [[PRED_TYPE:.*]]) {
// CHECK: "lmhlo.compare"(%[[ARG0]], %[[ARG1]], %[[PRED]])
// CHECK: {comparison_direction = "EQ"} : ([[TYPE]], [[TYPE]], [[PRED_TYPE]]) -> ()
// CHECK: }

View File

@ -1,12 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Complex
ENTRY %Complex (real: f32[2,2]{0,1}, imag: f32[2,2]{0,1}) -> c64[2,2] {
%real = f32[2,2]{0,1} parameter(0)
%imag = f32[2,2]{0,1} parameter(1)
ROOT %compl = c64[2,2]{0,1} complex(%real, %imag)
}
// CHECK: func @complex(%[[REAL:.*]]: [[BUF_F32:.*]], %[[IMAG:.*]]: [[BUF_F32]], %[[OUT:.*]]: [[BUF_C64:.*]]) {
// CHECK: "lmhlo.complex"(%[[REAL]], %[[IMAG]], %[[OUT]]) : ([[BUF_F32]], [[BUF_F32]], [[BUF_C64]]) -> ()
// CHECK: }

View File

@ -1,13 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Concatenate
ENTRY %Concatenate (x: f32[2,3], y: f32[2,2]) -> f32[2,5] {
%x = f32[2,3]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %concatenate = f32[2,5]{1,0} concatenate(f32[2,3]{1,0} %x, f32[2,2]{1,0} %y), dimensions={1}
}
// CHECK: func @concatenate(%[[ARG0:.*]]: [[TYPE0:.*]], %[[ARG1:.*]]: [[TYPE1:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) {
// CHECK: "lmhlo.concatenate"(%[[ARG0]], %[[ARG1]], %[[RESULT]])
// CHECK: {dimension = 1 : i64} : ([[TYPE0]], [[TYPE1]], [[RTYPE]]) -> ()
// CHECK: }

View File

@ -1,12 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Const
ENTRY %Const () -> s32[100] {
%const.0 = s32[] constant(10)
ROOT %broadcast.0 = s32[100]{0} broadcast(s32[] %const.0), dimensions={}
}
// CHECK: func @constant(%[[ARG0:.*]]: memref<i32>)
// CHECK: "lmhlo.constant"(%[[ARG0]]) {value = dense<10> : tensor<i32>}
// CHECK: func @broadcast(%[[ARG1:.*]]: memref<i32>, %[[ARG2:.*]]: memref<100xi32>)
// CHECK: "lmhlo.broadcast_in_dim"(%[[ARG1]], %[[ARG2]]) {broadcast_dimensions = dense<> : tensor<0xi64>}

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Copy
ENTRY %Copy (x: f32[2,4]) -> f32[2,4] {
%x = f32[2,4] parameter(0)
ROOT %copy = f32[2,4] copy(f32[2,4] %x)
}
// CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>, %[[RESULT:.*]]: memref<2x4xf32>) {
// CHECK: "lmhlo.copy"(%[[OPERAND]], %[[RESULT]]) : (memref<2x4xf32>, memref<2x4xf32>) -> ()

View File

@ -1,13 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule CopyTranspose
ENTRY %CopyTranspose (x: f32[2,4]) -> f32[2,4]{0,1} {
%x = f32[2,4] parameter(0)
ROOT %copy = f32[2,4]{0,1} copy(f32[2,4] %x)
}
// CHECK: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 2)>
// CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>,
// CHECK-SAME: %[[RESULT:.*]]: memref<2x4xf32, #[[MAP0]]>)
// CHECK: "lmhlo.copy"(%[[OPERAND]], %[[RESULT]])
// CHECK-SAME: : (memref<2x4xf32>, memref<2x4xf32, #[[MAP0]]>)

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Cos
ENTRY %Cos (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %cos = f32[2,2]{1,0} cosine(f32[2,2]{1,0} %val)
}
// CHECK: func @cosine(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.cosine"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,12 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Exp
ENTRY %Exp (x: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
ROOT %exp = f32[2,2]{1,0} exponential(f32[2,2]{1,0} %x)
}
// CHECK: func @exponential(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.exponential"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,35 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule FusedReduce
%add (x: f32[], y: f32[]) -> f32[] {
%x = f32[] parameter(0)
%y = f32[] parameter(1)
ROOT %add = f32[] add(f32[] %x, f32[] %y)
}
%fused_computation (param: f32[100,10]) -> f32[10] {
%param = f32[100,10] parameter(0)
%constant = f32[] constant(0)
ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant),
dimensions={0}, to_apply=%add
}
ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] {
%x = f32[100,10] parameter(0)
ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput,
calls=%fused_computation
}
// CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]])
// CHECK: "lmhlo.fusion"() ( {
// CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]]
// CHECK: %[[CT0:.*]] = mhlo.constant dense<0.000000e+00>
// CHECK: %[[RED:.*]] = "mhlo.reduce"(%0, %1) ( {
// CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]])
// CHECK: %[[ADD:.*]] = mhlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]]
// CHECK: "mhlo.return"(%[[ADD]])
// CHECK: })
// CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]]
// CHECK: "lmhlo.terminator"()
// CHECK-NEXT: })

View File

@ -1,22 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Gather
ENTRY %Gather (x: f32[100,10], y: s64[4,6]) -> f32[4,6,10] {
%x = f32[100,10] parameter(0)
%y = s64[4,6] parameter(1)
ROOT %gather = f32[4,6,10]{2,1,0} gather(f32[100,10]{1,0} %x, s64[4,6]{1,0} %y),
collapsed_slice_dims={0}, index_vector_dim=2, offset_dims={2},
slice_sizes={1,10}, start_index_map={0}
}
// CHECK: func @gather(%[[ARG0:.*]]: [[TYPE0:.*]], %[[ARG1:.*]]: [[TYPE1:.*]],
// CHECK-SAME: %[[RESULT:.*]]: [[RTYPE:.*]]) {
// CHECK-NEXT: "lmhlo.gather"(%[[ARG0]], %[[ARG1]], %[[RESULT]]) {
// CHECK-SAME: dimension_numbers = {
// CHECK-SAME: collapsed_slice_dims = dense<0> : tensor<1xi64>,
// CHECK-SAME: index_vector_dim = 2 : i64,
// CHECK-SAME: offset_dims = dense<2> : tensor<1xi64>,
// CHECK-SAME: start_index_map = dense<0> : tensor<1xi64>
// CHECK-SAME: },
// CHECK-SAME: slice_sizes = dense<[1, 10]> : tensor<2xi64>
// CHECK-SAME: } : ([[TYPE0]], [[TYPE1]], [[RTYPE]]) -> ()

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Imag
ENTRY %Imag (x: c64[2,2]{0,1}) -> f32[2,2] {
%x = c64[2,2]{0,1} parameter(0)
ROOT %imag = f32[2,2]{0,1} imag(%x)
}
// CHECK: func @imag(%[[IN:.*]]: [[BUF_C64:.*]], %[[OUT:.*]]: [[BUF_F32:.*]]) {
// CHECK: "lmhlo.imag"(%[[IN]], %[[OUT]]) : ([[BUF_C64]], [[BUF_F32]]) -> ()
// CHECK: }

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Iota
ENTRY %Iota() -> s64[10, 5] {
ROOT %iota = s64[10, 5]{1,0} iota(), iota_dimension=0
}
// CHECK: func @iota(%[[OUT:.*]]: [[OUT_T:.*]]) {
// CHECK: "lmhlo.iota"(%[[OUT]])
// CHECK: {iota_dimension = 0 : i64} : ([[OUT_T]]) -> ()
// CHECK: }

View File

@ -1,16 +0,0 @@
// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s
HloModule AddSubtract
ENTRY %AddSubtract (x: s32[2,2], y: s32[2,2]) -> s32[2,2] {
%x = s32[2,2]{1,0} parameter(0)
%y = s32[2,2]{1,0} parameter(1)
%add = s32[2,2]{1,0} add(s32[2,2]{1,0} %x, s32[2,2]{1,0} %y)
%iota = s32[2, 2]{1,0} iota(), iota_dimension=0
ROOT %sub = s32[2,2]{1,0} subtract(s32[2,2]{1,0} %add, s32[2,2]{1,0} %iota)
}
// CHECK-NOT: store
// CHECK: [[RESULT:%.*]] = subi %{{.*}}, %{{.*}}
// CHECK: store [[RESULT]]

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Log
ENTRY %Log (x: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
ROOT %log = f32[2,2]{1,0} log(f32[2,2]{1,0} %x)
}
// CHECK: func @log(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.log"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Neg
ENTRY %Neg (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %neg = f32[2,2]{1,0} negate(f32[2,2]{1,0} %val)
}
// CHECK: func @negate(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.negate"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Real
ENTRY %Real (x: c64[2,2]{0,1}) -> f32[2,2] {
%x = c64[2,2]{0,1} parameter(0)
ROOT %real = f32[2,2]{0,1} real(%x)
}
// CHECK: func @real(%[[IN:.*]]: [[BUF_C64:.*]], %[[OUT:.*]]: [[BUF_F32:.*]]) {
// CHECK: "lmhlo.real"(%[[IN]], %[[OUT]]) : ([[BUF_C64]], [[BUF_F32]]) -> ()
// CHECK: }

View File

@ -1,35 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule ReduceWindow
%max (x: f32[], y: f32[]) -> f32[] {
%x = f32[] parameter(0)
%y = f32[] parameter(1)
ROOT %max = f32[] maximum(f32[] %x, f32[] %y)
}
ENTRY %ReduceWindow (x: f32[128,64,112,112], y: f32[]) -> f32[128,64,56,56] {
%x = f32[128,64,112,112] parameter(0)
%y = f32[] parameter(1)
ROOT %reduce-window = f32[128,64,56,56] reduce-window(
f32[128,64,112,112] %x,
f32[] %y
),
window={size=1x1x3x3 stride=1x1x2x2 pad=0_0x0_0x0_1x0_1}, to_apply=%max
}
// CHECK: func @"reduce-window"(
// CHECK-SAME: [[ARG:%.*]]: [[ARGT:.*]], [[CST:%.*]]: memref<f32>, [[RES:%.*]]: [[REST:.*]]) {
// CHECK: "lmhlo.reduce_window"([[LHS:%.*]], [[RHS:%.*]], [[OUT:%.*]]) ( {
// CHECK: ^bb0([[LHS:%.*]]: memref<f32>, [[RHS:%.*]]: memref<f32>, [[OUT:%.*]]: memref<f32>):
// CHECK: [[LHS_TENSOR:%.*]] = tensor_load [[LHS]]
// CHECK: [[RHS_TENSOR:%.*]] = tensor_load [[RHS]]
// CHECK: [[OUT_TENSOR:%.*]] = mhlo.maximum [[LHS_TENSOR]], [[RHS_TENSOR]]
// CHECK: tensor_store [[OUT_TENSOR]], [[OUT]]
// CHECK: "lmhlo.terminator"() : () -> ()
// CHECK: }) {
// CHECK-SAME: base_dilations = dense<1> : tensor<4xi64>
// CHECK-SAME: padding = dense<{{\[}}[0, 0], [0, 0], [0, 1], [0, 1]]>
// CHECK-SAME: window_dilations = dense<1> : tensor<4xi64>
// CHECK-SAME: window_dimensions = dense<[1, 1, 3, 3]>
// CHECK-SAME: window_strides = dense<[1, 1, 2, 2]>
// CHECK: } : ([[ARGT]], memref<f32>, [[REST]]) -> ()

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Rem
ENTRY %Rem(x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
%y = f32[2,2]{1,0} parameter(1)
ROOT %rem = f32[2,2]{1,0} remainder(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
}
// CHECK: func @remainder(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.remainder"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,11 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Rsqrt
ENTRY %Rsqrt (x: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
ROOT %rsqrt = f32[2,2]{1,0} rsqrt(f32[2,2]{1,0} %x)
}
// CHECK: func @rsqrt(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.rsqrt"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,14 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Select
ENTRY %Select (p: pred[2,2], x: f32[2,2], y: f32[2,2]) -> f32[2,2] {
%p = pred[2,2]{1,0} parameter(0)
%x = f32[2,2]{1,0} parameter(1)
%y = f32[2,2]{1,0} parameter(2)
ROOT %select = f32[2,2]{1,0} select(pred[2,2]{1,0} %p, f32[2,2]{1,0} %x, f32[2,2]{1,0} %y)
}
// CHECK: func @select(%[[PRED:.*]]: [[PRED_TYPE:.*]], %[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.select"(%[[PRED]], %[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[PRED_TYPE]], [[TYPE]], [[TYPE]], [[TYPE]]) -> ()
// CHECK: }

View File

@ -1,54 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule SelectAndScatter
%ge (x: f32[], y: f32[]) -> pred[] {
%x = f32[] parameter(0)
%y = f32[] parameter(1)
ROOT %compare = pred[] compare(f32[] %x, f32[] %y), direction=GE
}
%add (x: f32[], y: f32[]) -> f32[] {
%x = f32[] parameter(0)
%y = f32[] parameter(1)
ROOT %add = f32[] add(f32[] %x, f32[] %y)
}
ENTRY %SelectAndScatter (x: f32[128,64,112,112],
y: f32[128,64,56,56],
z: f32[]) -> f32[128,64,112,112] {
%x = f32[128,64,112,112] parameter(0)
%y = f32[128,64,56,56] parameter(1)
%z = f32[] parameter(2)
ROOT %result = f32[128,64,112,112] select-and-scatter(
f32[128,64,112,112] %x,
f32[128,64,56,56] %y,
f32[] %z),
window={size=1x1x3x3 stride=1x1x2x2 pad=0_0x0_0x0_1x0_1},
select=%ge,
scatter=%add
}
// CHECK: func @"select-and-scatter"(
// CHECK-SAME: [[ARG:%.*]]: [[ARGT:.*]], [[SRC:%.*]]: [[SRCT:.*]], [[CST:%.*]]: memref<f32>, [[RES:%.*]]: [[REST:.*]]) {
// CHECK: "lmhlo.select_and_scatter"([[ARG]], [[SRC]], [[CST]], [[RES]]) ( {
// CHECK: ^bb0([[LHS:%.*]]: memref<f32>, [[RHS:%.*]]: memref<f32>,
// CHECK-SAME: [[OUT:%.*]]: memref<i1>):
// CHECK: [[LHS_TENSOR:%.*]] = tensor_load [[LHS]]
// CHECK: [[RHS_TENSOR:%.*]] = tensor_load [[RHS]]
// CHECK: [[OUT_TENSOR:%.*]] = "mhlo.compare"
// CHECK-SAME: ([[LHS_TENSOR]], [[RHS_TENSOR]]) {comparison_direction = "GE"}
// CHECK: tensor_store [[OUT_TENSOR]], [[OUT]]
// CHECK: lmhlo.terminator
// CHECK: }, {
// CHECK: ^bb0([[LHS_:%.*]]: memref<f32>, [[RHS_:%.*]]: memref<f32>,
// CHECK-SAME: [[OUT_:%.*]]: memref<f32>):
// CHECK: [[LHS_TENSOR_:%.*]] = tensor_load [[LHS_]]
// CHECK: [[RHS_TENSOR_:%.*]] = tensor_load [[RHS_]]
// CHECK: [[OUT_TENSOR_:%.*]] = mhlo.add [[LHS_TENSOR_]], [[RHS_TENSOR_]]
// CHECK: tensor_store [[OUT_TENSOR_]], [[OUT_]]
// CHECK: lmhlo.terminator
// CHECK: }) {
// CHECK-SAME: padding = dense<{{\[}}[0, 0], [0, 0], [0, 1], [0, 1]]>
// CHECK-SAME: window_dimensions = dense<[1, 1, 3, 3]>
// CHECK-SAME: window_strides = dense<[1, 1, 2, 2]>
// CHECK-SAME: } : ([[ARGT]], [[SRCT]], memref<f32>, [[REST]]) -> ()


@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Sign
ENTRY %Sign (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %sign = f32[2,2]{1,0} sign(f32[2,2]{1,0} %val)
}
// CHECK: func @sign(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.sign"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }


@ -1,12 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Sqrt
ENTRY %Sqrt (x: f32[2,2]) -> f32[2,2] {
%x = f32[2,2]{1,0} parameter(0)
ROOT %sqrt = f32[2,2]{1,0} sqrt(f32[2,2]{1,0} %x)
}
// CHECK: func @sqrt(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.sqrt"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }


@ -1,10 +0,0 @@
// RUN: xla-gpu-opt %s | FileCheck %s
HloModule Tanh
ENTRY %Tanh (val: f32[2,2]) -> f32[2,2] {
%val = f32[2,2]{1,0} parameter(0)
ROOT %tanh = f32[2,2]{1,0} tanh(f32[2,2]{1,0} %val)
}
// CHECK: func @tanh(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) {
// CHECK: "lmhlo.tanh"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> ()
// CHECK: }


@ -1,167 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h"
#include <memory>
#include <string>
#include "absl/strings/str_join.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/Pass/PassManager.h" // from @llvm-project
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/failover_compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/tests/verified_hlo_module.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace xla {
namespace mlir_gpu {
Status XlaGpuOpt::CompileIr(std::unique_ptr<HloModule> hlo_module,
const MlirCompiler::IRHook& ir_hook) {
MlirCompiler* compiler = GetMLIRCompiler();
compiler->SetModuleHook(ir_hook);
TF_ASSIGN_OR_RETURN(hlo_module, backend_->compiler()->RunHloPasses(
std::move(hlo_module),
backend_->default_stream_executor(),
/*device_allocator=*/nullptr));
Status status = backend_->compiler()
->RunBackend(std::move(hlo_module),
backend_->default_stream_executor(),
/*device_allocator=*/nullptr)
.status();
compiler->RemoveModuleHook();
return status;
}
StatusOr<std::string> XlaGpuOpt::CompileIr(
std::unique_ptr<HloModule> hlo_module,
MlirCompiler::IRHook::LoweringStage printing_stage) {
std::string ir;
TF_RETURN_IF_ERROR(CompileIr(
std::move(hlo_module), {[&ir](mlir::ModuleOp module) -> Status {
std::string buffer_string;
llvm::raw_string_ostream ostream(buffer_string);
module.print(ostream);
ostream.flush();
ir = buffer_string;
return Status::OK();
},
printing_stage}));
return ir;
}
Status XlaGpuOpt::CompileAndOutputIr(std::unique_ptr<HloModule> hlo_module,
llvm::raw_ostream& os,
LoweringStage printing_stage) {
TF_ASSIGN_OR_RETURN(std::string ir,
CompileIr(std::move(hlo_module), printing_stage));
os << ir;
return Status::OK();
}
Status XlaGpuOpt::CompileAndOutputIr(const std::string& hlo_text,
llvm::raw_ostream& os,
LoweringStage printing_stage) {
TF_ASSIGN_OR_RETURN(auto module, GetVerifiedHloModule(hlo_text));
return CompileAndOutputIr(std::move(module), os, printing_stage);
}
MlirCompiler::IRHook XlaGpuOpt::GetIRHookBreakingLoweringStage(
LoweringStage breaking_stage) {
return {[](mlir::ModuleOp module) -> Status {
mlir::PassManager pm(module.getContext());
pm.addNestedPass<::mlir::FuncOp>(
::mlir::createInjectErrorsForTestingPass());
if (failed(pm.run(module))) {
return InternalError("InjectErrorsForTestingPass failed.");
}
return Status::OK();
},
breaking_stage};
}
StatusOr<string> XlaGpuOpt::CompileAndInjectErrors(
std::unique_ptr<HloModule> hlo_module, LoweringStage breaking_stage) {
std::string errors;
auto error_handler = [&errors](const EmissionContext::ErrorMap& error_map,
HloModule* hlo_module) {
errors = "ERRORS FOUND: ";
for (auto& err : error_map) {
errors += "[" + err.first->ToString() + ": " +
absl::StrJoin(err.second, "; ") + "]";
}
};
MlirCompiler* compiler = GetMLIRCompiler();
compiler->SetModuleHook(GetIRHookBreakingLoweringStage(breaking_stage));
compiler->SetErrorHandler(error_handler);
TF_ASSIGN_OR_RETURN(
hlo_module, compiler->RunHloPasses(std::move(hlo_module),
backend_->default_stream_executor(),
/*device_allocator=*/nullptr));
Status status = compiler
->RunBackend(std::move(hlo_module),
backend_->default_stream_executor(),
/*device_allocator=*/nullptr)
.status();
compiler->RemoveModuleHook();
compiler->RemoveErrorHandler();
if (status.ok()) {
return errors;
}
return status;
}
Status XlaGpuOpt::CompileAndExpectErrors(const std::string& hlo_text,
llvm::raw_ostream& os,
LoweringStage breaking_stage) {
TF_ASSIGN_OR_RETURN(auto module, GetVerifiedHloModule(hlo_text));
TF_ASSIGN_OR_RETURN(
std::string errors,
CompileAndInjectErrors(std::move(module), breaking_stage));
os << errors;
return Status::OK();
}
StatusOr<std::unique_ptr<VerifiedHloModule>> XlaGpuOpt::GetVerifiedHloModule(
const std::string& hlo_text) {
HloModuleConfig config;
auto debug_options = GetDebugOptionsFromFlags();
debug_options.add_xla_disable_hlo_passes("constant_folding");
config.set_debug_options(debug_options);
auto module = absl::make_unique<VerifiedHloModule>(
"Module", config, /*verifier_layout_sensitive=*/true,
/*allow_mixed_precision_in_hlo_verifier=*/false,
/*shape_size_function=*/ShapeUtil::ByteSizeOfElements);
TF_RETURN_IF_ERROR(module->ParseHloStringAndVerifyModule(hlo_text));
return std::move(module);
}
MlirCompiler* XlaGpuOpt::GetMLIRCompiler() {
// TODO(b/137624192): Remove failover once no longer in place.
auto* failover = static_cast<FailoverCompiler*>(backend_->compiler());
return static_cast<MlirCompiler*>(failover->GetPrimary());
}
} // namespace mlir_gpu
} // namespace xla
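
For context, the IR hooks used throughout this file can also be registered directly on the compiler. The snippet below is a minimal sketch, not part of the deleted sources; it assumes the MlirCompiler::IRHook API shown above and dumps the module to stderr once lowering reaches the GPU dialect stage.

#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"

namespace xla {
namespace mlir_gpu {

// Hypothetical helper: install a hook that prints the module at the GPU
// lowering stage, run compilation through the usual Compiler interface,
// then remove the hook again.
void DumpGpuStageIr(MlirCompiler* compiler) {
  compiler->SetModuleHook({[](mlir::ModuleOp module) -> Status {
                             module.print(llvm::errs());
                             return Status::OK();
                           },
                           MlirCompiler::IRHook::LoweringStage::GPU});
  // ... RunHloPasses / RunBackend would be invoked here ...
  compiler->RemoveModuleHook();
}

}  // namespace mlir_gpu
}  // namespace xla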


@ -1,76 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_
#include <memory>
#include <string>
#include "llvm/Support/raw_ostream.h"
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/tests/verified_hlo_module.h"
namespace xla {
namespace mlir_gpu {
// Prints the IR created by the MLIR GPU backend at a certain lowering stage.
class XlaGpuOpt {
public:
using LoweringStage = MlirCompiler::IRHook::LoweringStage;
XlaGpuOpt() {
backend_ = std::move(Backend::CreateDefaultBackend().ValueOrDie());
}
// Compiles the HLO module given in 'hlo_text' to a GpuExecutable and prints
// the IR at the lowering stage 'printing_stage' to the 'os' stream.
//
// This function invokes the JIT compiler.
Status CompileAndOutputIr(const std::string& hlo_text, llvm::raw_ostream& os,
LoweringStage printing_stage = LoweringStage::LHLO);
// Adds the InjectErrorsForTestingPass to the MlirCompiler at the provided
// lowering stage 'breaking_stage', parses and compiles 'hlo_text', and prints
// the resulting errors to the 'os' stream.
Status CompileAndExpectErrors(const std::string& hlo_text,
llvm::raw_ostream& os,
LoweringStage breaking_stage);
private:
std::unique_ptr<Backend> backend_;
StatusOr<std::unique_ptr<VerifiedHloModule>> GetVerifiedHloModule(
const std::string& hlo_text);
Status CompileAndOutputIr(std::unique_ptr<HloModule> hlo_module,
llvm::raw_ostream& os,
LoweringStage printing_stage);
Status CompileIr(std::unique_ptr<HloModule> hlo_module,
const MlirCompiler::IRHook& ir_hook);
StatusOr<std::string> CompileIr(std::unique_ptr<HloModule> hlo_module,
LoweringStage printing_stage);
MlirCompiler::IRHook GetIRHookBreakingLoweringStage(
LoweringStage breaking_stage);
StatusOr<std::string> CompileAndInjectErrors(
std::unique_ptr<HloModule> hlo_module, LoweringStage breaking_stage);
MlirCompiler* GetMLIRCompiler();
};
} // namespace mlir_gpu
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_XLA_GPU_OPT_H_
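
As a usage illustration (a hypothetical sketch, not taken from the deleted sources), the helper declared above could be driven programmatically as follows, reusing the Tanh HLO from one of the tests removed earlier in this change:

#include <string>

#include "llvm/Support/raw_ostream.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h"

int main() {
  const std::string hlo_text = R"(
    HloModule Tanh
    ENTRY %Tanh (val: f32[2,2]) -> f32[2,2] {
      %val = f32[2,2]{1,0} parameter(0)
      ROOT %tanh = f32[2,2]{1,0} tanh(f32[2,2]{1,0} %val)
    })";
  xla::mlir_gpu::XlaGpuOpt opt;
  // Print the LHLO-stage IR for the module to stdout.
  xla::Status status = opt.CompileAndOutputIr(
      hlo_text, llvm::outs(), xla::mlir_gpu::XlaGpuOpt::LoweringStage::LHLO);
  if (!status.ok()) {
    llvm::errs() << status.error_message() << "\n";
    return 1;
  }
  return 0;
}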


@ -1,90 +0,0 @@
/* Copyright 2020 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string>
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ToolOutputFile.h"
#include "mlir/Pass/PassManager.h" // from @llvm-project
#include "mlir/Support/FileUtilities.h" // from @llvm-project
#include "tensorflow/compiler/mlir/init_mlir.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/xla_gpu_opt.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/core/platform/logging.h"
// NOLINTNEXTLINE
static llvm::cl::opt<std::string> input_filename(llvm::cl::Positional,
llvm::cl::desc("<input file>"),
llvm::cl::init("-"));
// NOLINTNEXTLINE
static llvm::cl::opt<std::string> output_filename(
"o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
llvm::cl::init("-"));
// NOLINTNEXTLINE
static llvm::cl::opt<bool> verify_errors(
"verify-errors",
llvm::cl::desc("Whether we expect errors which should be verified"),
llvm::cl::init(false));
static llvm::cl::opt<xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage>
// NOLINTNEXTLINE
lowering_stage(
"lowering-stage",
llvm::cl::desc(
"The lowering stage up to which the compiler will be run"),
llvm::cl::values(
clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LHLO,
"LHLO", "LHLO"),
clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::GPU,
"GPU", "GPU"),
clEnumValN(xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LLVM,
"LLVM", "LLVM"),
clEnumValN(
xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::KERNEL,
"KERNEL", "Kernel")),
llvm::cl::init(
xla::mlir_gpu::MlirCompiler::IRHook::LoweringStage::LHLO));
int main(int argc, char **argv) {
tensorflow::InitMlir y(&argc, &argv);
mlir::registerPassManagerCLOptions();
llvm::cl::ParseCommandLineOptions(argc, argv,
"XLA GPU modular optimizer driver\n");
// Set up the input file.
std::string error_message;
auto file = mlir::openInputFile(input_filename, &error_message);
QCHECK(file) << error_message;
auto output = mlir::openOutputFile(output_filename, &error_message);
QCHECK(output) << error_message;
xla::mlir_gpu::XlaGpuOpt opt;
xla::Status status =
verify_errors ? opt.CompileAndExpectErrors(file->getBuffer().str(),
output->os(), lowering_stage)
: opt.CompileAndOutputIr(file->getBuffer().str(),
output->os(), lowering_stage);
if (!status.ok()) {
LOG(ERROR) << status.error_message();
return 1;
}
output->keep();
return 0;
}
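
For reference, the lit tests deleted above drove this binary through RUN lines; a hedged example of the flag surface follows (the flag names are taken from the llvm::cl options declared in this file, while the concrete FileCheck setup is an assumption):

// RUN: xla-gpu-opt -lowering-stage=GPU %s | FileCheck %s
// RUN: xla-gpu-opt -verify-errors -lowering-stage=LHLO %s -o - | FileCheck %s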


@ -94,7 +94,6 @@ tf_cc_binary(
],
)
# To run with MLIR GPU plugin enabled, pass --define=with_mlir_gpu_support=true.
tf_cc_binary(
name = "replay_computation_gpu",
tags = ["gpu"],
@ -328,7 +327,6 @@ cc_library(
],
)
# To run with MLIR GPU plugin enabled, pass --define=with_mlir_gpu_support=true.
tf_cc_binary(
name = "run_hlo_module",
testonly = True,