Remove the experimental xla quantization work from open source (II)

PiperOrigin-RevId: 305821944
Change-Id: I3d606fd11cb3f92691fee3a85b9d35a29b2038da
Feng Liu 2020-04-09 21:38:22 -07:00 committed by TensorFlower Gardener
parent 84ff3e44b2
commit f22174826d
19 changed files with 4 additions and 1576 deletions

View File

@@ -70,7 +70,6 @@ cc_library(
"//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize",
"//tensorflow/compiler/mlir/lite/quantization:quantization_passes",
"//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_to_quant",
"//tensorflow/compiler/mlir/lite/quantization/xla:hlo_xla_quantization_passes",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",

View File

@@ -14,7 +14,10 @@ package(
package_group(
name = "friends",
includes = ["//third_party/mlir:subpackages"],
packages = ["//tensorflow/compiler/mlir/..."],
packages = [
"//learning/brain/experimental/mlir/quantization/...",
"//tensorflow/compiler/mlir/...",
],
)
exports_files([

View File

@@ -1,86 +0,0 @@
load(
"//third_party/mlir:tblgen.bzl",
"gentbl",
)
package(
default_visibility = [
":friends",
],
licenses = ["notice"], # Apache 2.0
)
package_group(
name = "friends",
includes = ["//third_party/mlir:subpackages"],
packages = [
"//learning/brain/experimental/mlir/quantization/...",
"//tensorflow/compiler/mlir/...",
"//tensorflow/compiler/mlir/lite/...",
],
)
cc_library(
name = "hlo_xla_quantization_passes",
srcs = [
"cpu_kernel_fusion.cc",
"generated_cpu_kernel_fusion.inc",
"materialize.cc",
"op_quant_spec.inc",
"propagate.cc",
],
hdrs = [
"passes.h",
],
deps = [
":cpu_device_target",
"//tensorflow/compiler/mlir/lite/quantization:quantization_config",
"//tensorflow/compiler/mlir/lite/quantization:quantization_context",
"//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
"//tensorflow/compiler/mlir/xla:hlo",
"//tensorflow/compiler/xla/client/lib:quantize",
"@com_google_absl//absl/memory",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TransformUtils",
],
alwayslink = 1,
)
cc_library(
name = "cpu_device_target",
srcs = [
"cpu_device_target.cc",
],
hdrs = [
"cpu_device_target.h",
],
deps = [
"//tensorflow/compiler/mlir/lite/quantization:device_target",
"//tensorflow/compiler/mlir/lite/quantization:quantization_context",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:Support",
],
)
gentbl(
name = "cpu_kernel_fusion_inc_gen",
tbl_outs = [
(
"-gen-rewriters",
"generated_cpu_kernel_fusion.inc",
),
],
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "cpu_kernel_fusion.td",
td_srcs = [
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/xla:hlo_ops_td_files",
"//tensorflow/compiler/mlir/lite/quantization:quantization_td_files",
],
)

View File

@@ -1,67 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/mlir/lite/quantization/xla/cpu_device_target.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_context.h"
namespace mlir {
namespace xla_hlo {
namespace ph = std::placeholders;
CpuDeviceTarget::CpuDeviceTarget(MLIRContext* ctx) : DeviceTarget(ctx) {
RegisterKernel("generic.concat", {qi8_, qi8_, qi8_},
quant::ScaleConstraintType::OutputInputSameScale);
// TODO(fengliuai): All the combinations are required to be listed. We need to
// improve this.
RegisterKernel("generic.reshape", {qi8_, any_},
quant::ScaleConstraintType::OutputInputSameScale);
RegisterKernel("generic.reshape", {any_, qi8_},
quant::ScaleConstraintType::OutputInputSameScale);
RegisterKernel("generic.mul", {qi8_, qi8_, qi8_},
quant::ScaleConstraintType::OutputInputFreeScale);
RegisterKernel("generic.mul_add", {qi8_, qi8n_, any_, qi8_},
std::bind(&CpuDeviceTarget::HandleMultiplyAccumulateScale,
this, ph::_1, ph::_2, ph::_3, ph::_4));
RegisterKernel("generic.matmul_add", {qi8_, qi8n_, any_, qi8_},
std::bind(&CpuDeviceTarget::HandleMultiplyAccumulateScale,
this, ph::_1, ph::_2, ph::_3, ph::_4));
}
LogicalResult CpuDeviceTarget::HandleMultiplyAccumulateScale(
quant::QuantizeContext* ctx, Operation* op,
quant::AdjacentOperations* new_items, bool* changed) {
auto bias_params = ctx->GetOperandParams(op, 2);
if (!EmptyParams(bias_params)) {
return success();
}
std::vector<quant::QuantParams> op_types{ctx->GetOperandParams(op, 0),
ctx->GetOperandParams(op, 1)};
auto bias_scale = GetUniformQuantizedTypeForBias(op_types);
if (bias_scale && ctx->SetOperandParams(op, 2, bias_scale)) {
*changed = true;
new_items->push_back(op->getOperand(2).getDefiningOp());
}
return success();
}
} // namespace xla_hlo
} // namespace mlir
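
The mul_add and matmul_add handlers above defer the bias parameters to GetUniformQuantizedTypeForBias once both inputs have parameters. A minimal C++ sketch of the arithmetic that constraint implies, assuming the usual affine mapping real = scale * (storage - zero_point); the struct and function names here are illustrative, not part of the deleted sources:

#include <cstdint>

struct UniformQuantParams {
  double scale;        // real_value = scale * (storage_value - zero_point)
  int64_t zero_point;
  int storage_bits;
};

// For a fused multiply-accumulate, the bias is quantized with the product of
// the input and weight scales, a zero point of 0, and a wider storage type
// (assumed 32-bit) so the accumulation does not overflow.
UniformQuantParams DeriveBiasParams(const UniformQuantParams& input,
                                    const UniformQuantParams& weight) {
  return {input.scale * weight.scale, /*zero_point=*/0, /*storage_bits=*/32};
}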

View File

@@ -1,40 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/device_target.h"
namespace mlir {
namespace xla_hlo {
// Target specs for cpu kernels
class CpuDeviceTarget : public quant::DeviceTarget {
public:
explicit CpuDeviceTarget(MLIRContext* ctx);
private:
LogicalResult HandleMultiplyAccumulateScale(
quant::QuantizeContext* ctx, Operation* op,
quant::AdjacentOperations* new_items, bool* changed);
};
} // namespace xla_hlo
} // namespace mlir
#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_

View File

@@ -1,347 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <math.h>
#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <iterator>
#include <numeric>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project
#include "mlir/IR/Function.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/Matchers.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/StandardTypes.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "mlir/Support/LLVM.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
#include "tensorflow/compiler/xla/client/lib/quantize.h"
#define DEBUG_TYPE "quant-kernel-fusion"
constexpr int kFakeQuantOperandsNum = 5;
constexpr int kFakeQuantPerChannelOperandsNum = 6;
namespace mlir {
namespace xla_hlo {
namespace {
TypeAttr GetQuantSpec(Operation* op) {
auto fake_quant = llvm::dyn_cast_or_null<CustomCallOp>(op);
if (!fake_quant || fake_quant.getNumOperands() < kFakeQuantOperandsNum ||
fake_quant.getNumOperands() > kFakeQuantPerChannelOperandsNum ||
fake_quant.call_target_name() != "fake_quant_with_min_max_vars")
return {};
DenseFPElementsAttr min, max;
DenseIntElementsAttr bit_width, narrow_range, quant_dim;
if (!matchPattern(fake_quant.getOperand(1), m_Constant(&min)) ||
!matchPattern(fake_quant.getOperand(2), m_Constant(&max)) ||
!matchPattern(fake_quant.getOperand(3), m_Constant(&bit_width)) ||
!matchPattern(fake_quant.getOperand(4), m_Constant(&narrow_range)))
return {};
auto bit_width_val = (*bit_width.attr_value_begin()).cast<IntegerAttr>();
auto narrow_range_val = (*narrow_range.int_value_begin()).getSExtValue();
int quant_dim_val = -1;
if (fake_quant.getNumOperands() == kFakeQuantPerChannelOperandsNum &&
matchPattern(fake_quant.getOperand(kFakeQuantPerChannelOperandsNum - 1),
m_Constant(&quant_dim))) {
quant_dim_val = (*quant_dim.int_value_begin()).getSExtValue();
}
OpBuilder builder(op);
Type input_type =
fake_quant.getOperand(0).getType().cast<ShapedType>().getElementType();
return quant::GetQuantizedTypeAttr(
builder, input_type, min, max, quant_dim_val, bit_width_val,
builder.getBoolAttr(narrow_range_val), /*is_signed=*/true);
}
// Collects the input values of 'ops' that are defined outside of 'ops'.
void CollectInputs(llvm::ArrayRef<Operation*> ops,
llvm::SmallVectorImpl<Value>* inputs,
llvm::SmallVectorImpl<Attribute>* input_specs) {
for (Operation* op : ops) {
for (Value operand : op->getOperands()) {
if (std::find(inputs->begin(), inputs->end(), operand) != inputs->end()) {
continue;
}
if (Operation* def_op = operand.getDefiningOp()) {
if (std::find(ops.begin(), ops.end(), def_op) == ops.end()) {
inputs->push_back(operand);
}
} else { // argument value
inputs->push_back(operand);
}
}
}
for (Value input : *inputs) {
ShapedType input_type = input.getType().cast<ShapedType>();
if (TypeAttr spec = GetQuantSpec(input.getDefiningOp())) {
input_specs->push_back(spec);
} else {
input_specs->push_back(TypeAttr::get(input_type.getElementType()));
}
}
}
// Collects values that are produced by 'ops' and have uses outside of 'ops'.
// TODO(fengliuai): if it is a single user and QDQ, write that to the specs.
void CollectRets(llvm::ArrayRef<Operation*> ops,
llvm::SmallVectorImpl<Value>* rets,
llvm::SmallVectorImpl<Type>* ret_types,
llvm::SmallVectorImpl<Attribute>* ret_specs) {
for (Operation* op : ops) {
// The constant will not be shared outside the region.
if (llvm::isa<ConstantOp>(op)) continue;
for (Value result : op->getResults()) {
for (Operation* user : result.getUsers()) {
// If there is any user outside of 'ops'
if (std::find(ops.begin(), ops.end(), user) == ops.end()) {
ShapedType ret_type = result.getType().cast<ShapedType>();
rets->push_back(result);
ret_types->push_back(ret_type);
if (TypeAttr spec = GetQuantSpec(user)) {
ret_specs->push_back(spec);
} else {
ret_specs->push_back(TypeAttr::get(ret_type.getElementType()));
}
break;
}
}
}
}
}
enum FusedActivationFunc { NONE, RELU, RELU1, RELU6 };
#define FLOAT_EQ(value, x) fabs(value - x) <= 1e-6
// If the op is max(in, 0.0), we consider it to be from a Relu, so both this op
// and the constant 0.0 will be fused.
// If the op is clamp(0.0, in, 1.0) or clamp(0.0, in, 6.0), we consider it to be
// from a Relu1 or Relu6, so all the constants and this op will be fused.
// Returns the activation function type.
FusedActivationFunc FuseReluX(Operation* op,
llvm::SmallVectorImpl<Operation*>* fused) {
if (auto max = llvm::dyn_cast<xla_hlo::MaxOp>(op)) {
Value min_val = max.rhs();
llvm::SmallVector<Operation*, 4> broadcast_ops;
if (auto broadcast = llvm::dyn_cast_or_null<xla_hlo::BroadcastInDimOp>(
min_val.getDefiningOp())) {
min_val = broadcast.operand();
broadcast_ops.push_back(broadcast);
}
DenseFPElementsAttr min;
if (!matchPattern(min_val, m_Constant(&min))) {
// In case the min value is lhs.
min_val = max.lhs();
broadcast_ops.clear();
if (auto broadcast = llvm::dyn_cast_or_null<xla_hlo::BroadcastInDimOp>(
min_val.getDefiningOp())) {
min_val = broadcast.operand();
broadcast_ops.push_back(broadcast);
}
if (!matchPattern(min_val, m_Constant(&min))) {
return NONE;
}
}
if (!min.isSplat() ||
!(FLOAT_EQ(min.getSplatValue().cast<FloatAttr>().getValueAsDouble(),
0.0))) {
return NONE;
}
// Include the constant 0.0 as well, so it doesn't get quantized.
fused->push_back(min_val.getDefiningOp());
fused->append(broadcast_ops.begin(), broadcast_ops.end());
fused->push_back(max);
return RELU;
}
if (auto clamp = llvm::dyn_cast<xla_hlo::ClampOp>(op)) {
DenseFPElementsAttr lower, upper;
if (!matchPattern(clamp.min(), m_Constant(&lower)) ||
!matchPattern(clamp.max(), m_Constant(&upper)) || !lower.isSplat() ||
!upper.isSplat() ||
!(FLOAT_EQ(lower.getSplatValue().cast<FloatAttr>().getValueAsDouble(),
0.0))) {
return NONE;
}
double upper_value =
upper.getSplatValue().cast<FloatAttr>().getValueAsDouble();
if (FLOAT_EQ(upper_value, 1.0) || FLOAT_EQ(upper_value, 6.0)) {
fused->push_back(clamp.min().getDefiningOp());
fused->push_back(clamp.max().getDefiningOp());
fused->push_back(op);
return (FLOAT_EQ(upper_value, 1.0) ? RELU1 : RELU6);
}
}
return NONE;
}
llvm::SmallVector<Value, 0> FuseOps(PatternRewriter* rewriter,
const std::initializer_list<Value>& results,
StringRef kernel) {
// Collect all the operations to be fused.
llvm::SmallVector<Operation*, 4> fused;
llvm::SmallVector<Location, 4> locs;
fused.reserve(results.size());
locs.reserve(results.size());
for (auto value : results) {
Operation* op = value.getDefiningOp();
fused.push_back(op);
locs.push_back(op->getLoc());
}
Operation* root = fused.back();
FusedActivationFunc act_func = FusedActivationFunc::NONE;
// If there is a Relu, Relu1 or Relu6, fuse it as well.
if (results.size() > 0 && std::rbegin(results)->hasOneUse()) {
act_func = FuseReluX(*std::rbegin(results)->user_begin(), &fused);
}
// Collect the inputs that come from outside of 'ops'.
llvm::SmallVector<Value, 4> inputs;
llvm::SmallVector<Attribute, 4> input_specs;
CollectInputs(fused, &inputs, &input_specs);
// Collect the outputs of 'ops' that are used outside of 'ops'.
llvm::SmallVector<Value, 4> rets;
llvm::SmallVector<Type, 4> ret_types;
llvm::SmallVector<Attribute, 4> ret_specs;
CollectRets(fused, &rets, &ret_types, &ret_specs);
// TODO(fengliuai): make activation function an attribute.
std::string kernel_name;
switch (act_func) {
case RELU:
kernel_name = llvm::Twine(kernel, "_relu").str();
break;
case RELU1:
kernel_name = llvm::Twine(kernel, "_relu1").str();
break;
case RELU6:
kernel_name = llvm::Twine(kernel, "_relu6").str();
break;
default:
kernel_name = kernel.str();
}
// Create the region op with the return.
auto region = rewriter->create<quant::QuantizeRegionOp>(
rewriter->getFusedLoc(locs), ret_types, inputs,
rewriter->getArrayAttr(input_specs), rewriter->getArrayAttr(ret_specs),
kernel_name);
auto* body = new Block();
region.body().push_back(body);
OpBuilder builder = OpBuilder::atBlockEnd(body);
BlockAndValueMapping mapping;
// Create block arguments and add them to the block value mapping.
for (Value input : inputs) {
mapping.map(input, body->addArgument(input.getType()));
}
// Clone the fused operations into the region.
for (Operation* op : fused) {
builder.clone(*op, mapping);
}
llvm::SmallVector<Value, 4> new_rets;
new_rets.reserve(rets.size());
for (auto ret : llvm::enumerate(rets)) {
Value new_ret = mapping.lookupOrNull(ret.value());
assert(new_ret && "couldn't find return value.");
new_rets.push_back(new_ret);
ret.value().replaceAllUsesWith(region.getResult(ret.index()));
}
builder.create<quant::ReturnOp>(builder.getUnknownLoc(), new_rets);
LLVM_DEBUG({
assert(region.verify().Success && "failed to create quant region.");
llvm::dbgs() << "\ncreated region: ";
region.print(llvm::dbgs());
llvm::dbgs() << "\n\n\n";
});
// All uses of the fused ops are replaced, so the values in this vector
// will not be used.
SmallVector<Value, 0> new_values(root->getNumResults(), region.getResult(0));
return new_values;
}
struct CpuKernelFusionPass
: public PassWrapper<CpuKernelFusionPass, FunctionPass> {
explicit CpuKernelFusionPass() = default;
CpuKernelFusionPass(const CpuKernelFusionPass&) {}
void runOnFunction() override;
};
#include "tensorflow/compiler/mlir/lite/quantization/xla/generated_cpu_kernel_fusion.inc"
void CpuKernelFusionPass::runOnFunction() {
Operation* op = getOperation();
MLIRContext* ctx = op->getContext();
OwningRewritePatternList patterns;
populateWithGenerated(ctx, &patterns);
applyPatternsGreedily(op->getRegions(), patterns);
}
} // namespace
// Creates an instance of the xla_hlo cpu kernel fusion pass.
std::unique_ptr<OperationPass<FuncOp>> CreateCpuKernelFusionPass() {
return std::make_unique<CpuKernelFusionPass>();
}
static PassRegistration<CpuKernelFusionPass> pass(
"xla-hlo-cpu-fusion", "Fuse xla hlo ops into cpu kernels");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,70 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td"
include "mlir/IR/OpBase.td"
include "mlir/Dialect/StandardOps/IR/Ops.td"
class Fused1Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0}, \"" # kernel # "\")">;
class Fused2Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1}, \"" # kernel # "\")">;
class Fused3Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1, $2}, \"" # kernel # "\")">;
class Fused4Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1, $2, $3}, \"" # kernel # "\")">;
// We shouldn't revisit ops which have already been fused. This constraint is
// required because the greedy pattern rewriter will visit and match any new
// ops, so once the source patterns are matched and wrapped by the quant region
// op, these ops would be matched again. To prevent this, the constraint
// bypasses any ops that are already inside a quant region.
def NeedsToBeFused : Constraint<CPred<
"!$0.getDefiningOp()->getParentOfType<quant::QuantizeRegionOp>()">>;
// dummy example
def : Pat<(HLO_AddOp:$add (HLO_MulOp:$mul $_, $_, $_), $_, $_),
(Fused2Ops<"generic.mul_add"> $mul, $add),
[(NeedsToBeFused $add)]>;
// add
def : Pat<(HLO_AddOp:$add $_, $_, $_),
(Fused1Ops<"generic.add"> $add),
[(NeedsToBeFused $add)]>;
// reduce_window: maxpool, avgpool
def : Pat<(HLO_ReduceWindowOp:$reduce $_, $_, $_, $_, $_, $_, $_),
(Fused1Ops<"generic.reduce_window"> $reduce),
[(NeedsToBeFused $reduce)]>;
// reshape
def : Pat<(HLO_ReshapeOp:$reshape $_), (Fused1Ops<"generic.reshape"> $reshape),
[(NeedsToBeFused $reshape)]>;
// broadcast
def : Pat<(HLO_BroadcastInDimOp:$broadcast $_, $_),
(Fused1Ops<"generic.broadcast"> $broadcast),
[(NeedsToBeFused $broadcast)]>;
// dot -> add
def : Pat<(HLO_AddOp:$add (HLO_DotOp:$dot $_, $_, $_), $_, $_),
(Fused2Ops<"generic.biased_dot"> $dot, $add),
[(NeedsToBeFused $add)]>;
// conv -> add
def : Pat<(HLO_AddOp:$add
(HLO_ConvOp:$conv $_, $_, $_, $_, $_, $_, $_, $_, $_, $_), $_, $_),
(Fused2Ops<"generic.biased_conv"> $conv, $add),
[(NeedsToBeFused $add)]>;

View File

@@ -1,175 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This transformation pass quantizes the constants and rewrites the
// quantization ops with xla_hlo primitive ops.
#include <cstdint>
#include <iterator>
#include <numeric>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/StandardTypes.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
#include "tensorflow/compiler/xla/client/lib/quantize.h"
//===----------------------------------------------------------------------===//
// The pass to materialize the quantization results by xla primitive ops.
//
namespace mlir {
namespace xla_hlo {
namespace {
// This pattern matches the "constant->qcast->dcast" chain and replaces it with
// "quantized constant->xla_hlo.dequantize". If it only matches a
// "non-constant->qcast->dcast" chain, it removes both the "qcast" and "dcast"
// ops. We match the chain as a whole to bypass the type checks of the normal
// xla_hlo ops.
// TODO(fengliuai): make this pass work for bf16 input.
class RewriteDequantize : public OpRewritePattern<quant::DequantizeCastOp> {
public:
explicit RewriteDequantize(int64_t size, MLIRContext *context)
: OpRewritePattern<quant::DequantizeCastOp>(context), size_(size) {}
LogicalResult matchAndRewrite(quant::DequantizeCastOp op,
PatternRewriter &rewriter) const override {
// quant.dcast
// xla_hlo dequantize only takes min/max, so let's recover them from
// the quantization parameters.
Value dcast = op.arg();
auto type = quant::QuantizedType::getQuantizedElementType(dcast.getType());
if (!type || !type.isa<quant::UniformQuantizedType>()) {
return failure();
}
auto qtype = type.cast<quant::UniformQuantizedType>();
double scale = qtype.getScale();
int64_t zero_point = qtype.getZeroPoint();
float min = scale * (qtype.getStorageTypeMin() - zero_point);
float max = scale * (qtype.getStorageTypeMax() - zero_point);
// quant.qcast
auto qcast =
llvm::dyn_cast_or_null<quant::QuantizeCastOp>(dcast.getDefiningOp());
if (!qcast) return failure();
// constant
DenseFPElementsAttr attr;
// If it isn't a floating-point constant or the size is too small, remove
// the quantization. Also, the last dimension size should be a multiple of 4,
// so the shape isn't broken during packing and unpacking.
if (!matchPattern(qcast.arg(), m_Constant(&attr)) ||
attr.getNumElements() <= size_ ||
attr.getType().getDimSize(attr.getType().getRank() - 1) % 4 != 0) {
op.getResult().replaceAllUsesWith(qcast.arg());
return success();
}
// TODO(fengliuai): implement transpose if it has high dimension.
// Create the quantized result
auto quantized_result =
quant::Quantize(attr, qtype).dyn_cast_or_null<DenseIntElementsAttr>();
if (!quantized_result) {
return failure();
}
// Pack the uint8 bits into uint32. The shape is changed from
// [n0, n1, ..., nk] to [n0, n1, ..., nk / 4].
std::vector<uint8_t> raw_data;
for (auto d : quantized_result.getValues<uint8_t>()) {
raw_data.push_back(d);
}
// The packing might increase the data size due to padding.
auto packed_data = xla::PackToUint32<uint8_t>(raw_data);
auto packed_shape = attr.getType().getShape().vec();
int lower_dims = std::accumulate(
packed_shape.begin(),
std::next(packed_shape.begin(), packed_shape.size() - 1), 1,
std::multiplies<int>());
packed_shape[packed_shape.size() - 1] = packed_data.size() / lower_dims;
auto packed_type =
RankedTensorType::get(packed_shape, rewriter.getIntegerType(32));
auto packed_quantized_result =
DenseElementsAttr::get<uint32_t>(packed_type, packed_data);
auto quantized_constant =
rewriter.create<ConstantOp>(qcast.getLoc(), packed_quantized_result);
// Create the xla dequantize op with bf16 output
auto dequantized_type = RankedTensorType::get(attr.getType().getShape(),
rewriter.getBF16Type());
auto dequantize = rewriter.create<DequantizeOp>(
qcast.getLoc(), dequantized_type, quantized_constant,
rewriter.getF32FloatAttr(min), rewriter.getF32FloatAttr(max),
rewriter.getStringAttr("MIN_COMBINED"), rewriter.getBoolAttr(false),
rewriter.getBoolAttr(false));
// Convert bf16 output back to f32
rewriter.replaceOpWithNewOp<ConvertOp>(op, op.getResult().getType(),
dequantize);
return success();
}
private:
int64_t size_;
};
// Materialize the quantization results by hlo primitive ops.
struct MaterializeToXlaPass
: public PassWrapper<MaterializeToXlaPass, FunctionPass> {
explicit MaterializeToXlaPass() = default;
MaterializeToXlaPass(const MaterializeToXlaPass &) {}
void runOnFunction() override;
};
void MaterializeToXlaPass::runOnFunction() {
FuncOp func = getFunction();
MLIRContext *ctx = &getContext();
OwningRewritePatternList patterns;
// TODO(fengliuai): make the size 6 configurable.
patterns.insert<RewriteDequantize>(6, ctx);
applyPatternsGreedily(func, patterns);
}
} // namespace
// Creates an instance of the xla_hlo dialect quantization materialization pass.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializeToXlaPass() {
return std::make_unique<MaterializeToXlaPass>();
}
static PassRegistration<MaterializeToXlaPass> pass(
"xla-hlo-materialize-quant",
"Materialize the quantization results by xla primitve ops");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,7 +0,0 @@
// TODO(fengliuai): automatically generate this file
// TODO(fengliuai): add all the xla_hlo ops
static std::unique_ptr<quant::OpQuantSpec> GetOpQuantSpec(mlir::Operation *op) {
auto spec = absl::make_unique<quant::OpQuantSpec>();
return spec;
}

View File

@@ -1,40 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
#include <memory>
#include "mlir/IR/Function.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
namespace mlir {
namespace xla_hlo {
// Propagate the quantization information to all the tensors according to the
// op quant spec.
std::unique_ptr<OperationPass<FuncOp>> CreatePropagateQuantPass();
// Rewrite the graph and quantize the constants.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializeToXlaPass();
// Fuse HLO ops into quantized regions.
std::unique_ptr<OperationPass<FuncOp>> CreateCpuKernelFusionPass();
} // namespace xla_hlo
} // namespace mlir
#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
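
Together, the three factories above describe the intended pipeline: fuse HLO ops into quant.region kernels, propagate quantization parameters across the regions, then materialize the results as primitive ops. A hedged sketch of wiring them into a pass manager; the helper name and the nesting/ordering are assumptions, since the deleted sources only registered each pass individually through PassRegistration:

#include "mlir/IR/Function.h"       // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/xla/passes.h"

// Illustrative helper: run the deleted passes in their logical order on every
// function in the module.
void AddXlaHloQuantizationPasses(mlir::PassManager& pm) {
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreateCpuKernelFusionPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreatePropagateQuantPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreateMaterializeToXlaPass());
}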

View File

@@ -1,108 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This transformation pass applies quantization propagation on the xla_hlo dialect.
#include <iterator>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_context.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/lite/quantization/xla/cpu_device_target.h"
// NOLINTNEXTLINE
static llvm::cl::opt<bool> disable_per_channel(
"xla-disable-per-channel", llvm::cl::value_desc("bool"),
llvm::cl::desc("Whether disable per-channel quantized weights."),
llvm::cl::init(false));
//===----------------------------------------------------------------------===//
// The quantization propagation Pass.
//
namespace mlir {
namespace xla_hlo {
namespace {
// Applies quantization propagation on the input function. During the
// propagation, two facts are respected:
// - The quantization types (params) already annotated on the ops in the function
// - The quantization specs of the ops
// The propagation result should assign quantization types to all the tensors
// while respecting these two restrictions.
struct PropagateQuantPass
: public PassWrapper<PropagateQuantPass, FunctionPass> {
explicit PropagateQuantPass() = default;
PropagateQuantPass(const PropagateQuantPass &) {}
void runOnFunction() override;
};
#include "tensorflow/compiler/mlir/lite/quantization/xla/op_quant_spec.inc"
void PropagateQuantPass::runOnFunction() {
FuncOp func = getFunction();
// TODO(fengliuai): deprecate this old code generation path.
// XLA only supports uint8/uint16 quantization for now.
ApplyQuantizationParamsPropagation(func, /*is_signed*/ false,
disable_per_channel, GetOpQuantSpec);
CpuDeviceTarget spec(&getContext());
quant::QuantizeContext ctx(func, spec);
std::vector<quant::QuantizeRegionOp> work_list = ctx.GetAllOps();
bool changed = false;
while (!work_list.empty()) {
quant::QuantizeRegionOp op = work_list.back();
work_list.pop_back();
llvm::SmallVector<Operation *, 4> new_items;
if (failed(ctx.Handle(op, &new_items, &changed))) {
// The IR is still valid, thus we shouldn't fail.
signalPassFailure();
}
for (auto item : new_items) {
if (auto reg = llvm::dyn_cast_or_null<quant::QuantizeRegionOp>(item))
work_list.push_back(reg);
}
}
if (!changed) return;
if (failed(ctx.Finalize())) {
signalPassFailure();
}
}
} // namespace
// Creates an instance of the xla_hlo dialect quantization propagation pass.
std::unique_ptr<OperationPass<FuncOp>> CreatePropagateQuantPass() {
return std::make_unique<PropagateQuantPass>();
}
static PassRegistration<PropagateQuantPass> pass(
"xla-hlo-propagate-quant", "Propagate quantization information");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,33 +0,0 @@
load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
package(licenses = ["notice"])
glob_lit_tests(
data = [
":graph_config_files",
":test_utilities",
],
driver = "@llvm-project//mlir:run_lit.sh",
exclude = ["fadd_quant.mlir"],
test_file_exts = ["mlir"],
)
# Bundle together all of the test utilities that are used by tests.
filegroup(
name = "test_utilities",
testonly = True,
data = [
"//tensorflow/compiler/aot:tfcompile",
"//tensorflow/compiler/mlir:tf-opt",
"@llvm-project//llvm:FileCheck",
"@llvm-project//llvm:not",
],
)
# Bundle together all the graph files that are used by the tests.
filegroup(
name = "graph_config_files",
srcs = glob(
["**/*.pbtxt"],
),
)

View File

@@ -1,199 +0,0 @@
// RUN: tf-opt -xla-hlo-cpu-fusion %s | FileCheck %s
// CHECK-LABEL: @mul_add_source
func @mul_add_source(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%1 = "xla_hlo.add"(%0, %arg2) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %1 : tensor<4xf32>
// CHECK: %[[region:.*]] = "quant.region"(%arg0, %arg1, %arg2) ( {
// CHECK: ^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
// CHECK: %[[mul:.*]] = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
// CHECK: %[[add:.*]] = xla_hlo.add %[[mul]], %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
// CHECK: "quant.return"(%[[add]]) : (tensor<4xf32>) -> ()
// CHECK: }) {input_specs = [f32, f32, f32], logical_kernel = "generic.mul_add", output_specs = [f32]} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
// CHECK: return %[[region]] : tensor<4xf32>
}
// CHECK-LABEL: @mul_add_annotated
func @mul_add_annotated(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2x4xf32>) -> (tensor<2x4xf32>) {
%cst = constant dense<0.0> : tensor<f32>
%cst_0 = constant dense<255.0> : tensor<f32>
%cst_1 = constant dense<8> : tensor<i32>
%cst_2 = constant dense<false> : tensor<i1>
%qin = "xla_hlo.custom_call"(%arg0, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.1"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
%qw = "xla_hlo.custom_call"(%arg1, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.2"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
%0 = "xla_hlo.multiply"(%qin, %qw) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
%1 = "xla_hlo.add"(%0, %arg2) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
%r = "xla_hlo.custom_call"(%1, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.3"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
return %r : tensor<2x4xf32>
// CHECK: %[[region:.*]] = "quant.region"
// CHECK: ^bb0(%arg3: tensor<2x4xf32>, %arg4: tensor<2x4xf32>, %arg5: tensor<2x4xf32>): // no predecessors
// CHECK: %[[mul:.*]] = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<2x4xf32>
// CHECK: %[[add:.*]] = xla_hlo.add %[[mul]], %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<2x4xf32>
// CHECK: "quant.return"(%[[add]]) : (tensor<2x4xf32>) -> ()
// CHECK: }) {input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8:f32, 1.000000e+00:-128>, f32],
// CHECK-SAME: logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]} :
// CHECK-SAME: (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
// CHECK: %[[r:.*]] = "xla_hlo.custom_call"(%[[region]]
// CHECK: return %[[r]] : tensor<2x4xf32>
}
// CHECK-LABEL: @reduce_window
func @reduce_window(%arg0: tensor<1x28x28x32xf32>, %arg1: tensor<f32>) -> (tensor<1x14x14x32xf32>) {
%0 = "xla_hlo.reduce_window"(%arg0, %arg1) ({
^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
%1 = xla_hlo.maximum %arg2, %arg3 : tensor<f32>
"xla_hlo.return"(%1) : (tensor<f32>) -> ()
}) {
base_dilations = dense<1> : tensor<4xi64>,
padding = dense<0> : tensor<4x2xi64>,
window_dilations = dense<1> : tensor<4xi64>,
window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>,
window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>
} : (tensor<1x28x28x32xf32>, tensor<f32>) -> tensor<1x14x14x32xf32>
return %0 : tensor<1x14x14x32xf32>
// CHECK: "quant.region"(%arg0, %arg1) ( {
// CHECK: ^bb0(%arg2: tensor<1x28x28x32xf32>, %arg3: tensor<f32>): // no predecessors
// CHECK: %[[rw:.*]] = "xla_hlo.reduce_window"(%arg2, %arg3) ( {
// CHECK: ^bb0(%arg4: tensor<f32>, %arg5: tensor<f32>): // no predecessors
// CHECK: %[[max:.*]] = xla_hlo.maximum %arg4, %arg5 : tensor<f32>
// CHECK: "xla_hlo.return"(%[[max]]) : (tensor<f32>) -> ()
// CHECK: })
// CHECK: "quant.return"(%[[rw]])
// CHECK: }) {input_specs = [f32, f32], logical_kernel = "generic.reduce_window", output_specs = [f32]}
}
// CHECK-LABEL: @reshape
func @reshape(%arg0: tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>) {
%0 = "xla_hlo.reshape"(%arg0) : (tensor<1x7x7x64xf32>) -> tensor<1x3136xf32>
return %0 : tensor<1x3136xf32>
// CHECK: "quant.region"(%arg0)
// CHECK: logical_kernel = "generic.reshape"
}
// CHECK-LABEL: @broadcast
func @broadcast(%arg0: tensor<64xf32>) -> (tensor<1x14x14x64xf32>) {
%0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x14x14x64xf32>
return %0 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0)
// CHECK: logical_kernel = "generic.broadcast"
}
// CHECK-LABEL: @biased_dot
func @biased_dot(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x10xf32>, %arg2: tensor<1x10xf32>) -> (tensor<1x10xf32>) {
%0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x1024xf32>, tensor<1024x10xf32>) -> tensor<1x10xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x10xf32>
return %1 : tensor<1x10xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: xla_hlo.dot
// CHECK: xla_hlo.add
// CHECK: logical_kernel = "generic.biased_dot"
}
// CHECK-LABEL: @biased_conv
func @biased_conv(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
return %1 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: logical_kernel = "generic.biased_conv"
}
// CHECK-LABEL: @biased_dot_relu
func @biased_dot_relu(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x10xf32>, %arg2: tensor<1x10xf32>) -> (tensor<1x10xf32>) {
%cst = constant dense<0.0> : tensor<1x10xf32>
%0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x1024xf32>, tensor<1024x10xf32>) -> tensor<1x10xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x10xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x10xf32>
return %2 : tensor<1x10xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.dot
// CHECK: xla_hlo.add
// CHECK: xla_hlo.maximum
// CHECK: logical_kernel = "generic.biased_dot_relu"
}
// CHECK-LABEL: @biased_conv_relu
func @biased_conv_relu(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%cst = constant dense<0.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x14x14x64xf32>
return %2 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: xla_hlo.maximum
// CHECK: logical_kernel = "generic.biased_conv_relu"
}
// CHECK-LABEL: @biased_conv_relu_shared
func @biased_conv_relu_shared(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>) {
%cst = constant dense<0.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x14x14x64xf32>
return %cst, %2 : tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: %[[max:.*]] = xla_hlo.maximum
// CHECK: "quant.return"(%[[max]])
// CHECK: logical_kernel = "generic.biased_conv_relu"
}
// CHECK-LABEL: @biased_conv_relu6
func @biased_conv_relu6(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%min = constant dense<0.0> : tensor<1x14x14x64xf32>
%max = constant dense<6.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = "xla_hlo.clamp"(%min, %1, %max) : (tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
return %2 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: xla_hlo.clamp
// CHECK: logical_kernel = "generic.biased_conv_relu6"
}

View File

@@ -1,10 +0,0 @@
# RUN: not tfcompile --graph=%s.pbtxt --config=%s.config.pbtxt --experimental_quantize --cpp_class="::test::fadd_quant" 2>&1 | FileCheck %s -dump-input-on-failure
# TODO(fengliuai): update this file with the progress of the implementation
// CHECK: "quant.region"
// CHECK: ^bb0(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>): // no predecessors
// CHECK: xla_hlo.add %arg0, %arg1
// CHECK: "quant.return"
// CHECK: }) {input_specs = [!quant.uniform<i8:f32, 0.49803921568627452:-128>, !quant.uniform<i8:f32, 0.49803921568627452:-128>],
// CHECK-SAME: logical_kernel = "generic.add", output_specs = [!quant.uniform<i8:f32, 0.49803921568627452:-128>]}
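
The 0.49803921568627452:-128 parameters in the CHECK lines above follow from the fake-quant range configured in the .pbtxt files below (min 0.0, max 127.0, 8 bits, narrow_range false) under the standard signed affine quantization; a small C++ sketch of that arithmetic, with the nudging and rounding details left out as assumptions:

#include <cstdio>

int main() {
  const double rmin = 0.0, rmax = 127.0;  // fake-quant range from the graph
  const int qmin = -128, qmax = 127;      // signed 8-bit, narrow_range = false
  const double scale = (rmax - rmin) / (qmax - qmin);            // 127 / 255
  const int zero_point = qmin - static_cast<int>(rmin / scale);  // -128
  std::printf("scale = %.17g, zero_point = %d\n", scale, zero_point);
  return 0;
}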

View File

@@ -1,26 +0,0 @@
feed {
id { node_name: "input0" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
feed {
id { node_name: "input1" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
fetch {
id { node_name: "Add/FakeQuantWithMinMaxVars" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
conversion_options {
custom_fake_quant_op_calls: true
}

View File

@@ -1,218 +0,0 @@
node: {
name: "Add/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "Add"
input: "Add/FakeQuantWithMinMaxVars/min"
input: "Add/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "Add/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "Add/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "Add"
op: "Add"
input: "input0/FakeQuantWithMinMaxVars"
input: "input1/FakeQuantWithMinMaxVars"
attr {
key: "T"
value {
type: DT_FLOAT
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "input0"
input: "input0/FakeQuantWithMinMaxVars/min"
input: "input0/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "input0"
op: "Placeholder"
attr {
key: "dtype"
value {
type: DT_FLOAT
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "input1"
input: "input1/FakeQuantWithMinMaxVars/min"
input: "input1/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "input1"
op: "Placeholder"
attr {
key: "dtype"
value {
type: DT_FLOAT
}
}
}
versions {
producer: 27
}

View File

@@ -1,54 +0,0 @@
// RUN: tf-opt -xla-hlo-materialize-quant %s | FileCheck %s
// CHECK-LABEL: func @quantize_rewrite
func @quantize_rewrite(%arg0: tensor<2x4xf32>) -> tensor<2x4xf32> {
// CHECK: %[[qcst:.*]] = constant dense<{{\[\[}}21004416], [-1056997248]]> : tensor<2x1xi32>
// CHECK-NEXT: %[[dq:.*]] = "xla_hlo.dequantize"(%[[qcst]]) {is_16bits = false, max_range = 0.996078431 : f32, min_range = -1.00392163 : f32,
// CHECK-SAME: mode = "MIN_COMBINED", transpose_output = false} : (tensor<2x1xi32>) -> tensor<2x4xbf16>
// CHECK-NEXT: %[[cast:.*]] = "xla_hlo.convert"(%[[dq]]) : (tensor<2x4xbf16>) -> tensor<2x4xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[cast]] : tensor<2x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x4xf32>
%w = constant dense<[[-1.0, -0.5, 0.0, 0.0], [0.5, 1.0, 0.0, 0.0]]> : tensor<2x4xf32>
%q = "quant.qcast"(%w) : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x4xf32>
return %mul: tensor<2x4xf32>
}
// CHECK-LABEL: func @quantize_small
func @quantize_small(%arg0: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK: %[[w:.*]] = constant dense<1.000000e+00> : tensor<1x4xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[w]] : tensor<1x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<1x4xf32>
%w = constant dense<1.0> : tensor<1x4xf32>
%q = "quant.qcast"(%w) : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<1x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<1x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<1x4xf32>
return %mul: tensor<1x4xf32>
}
// CHECK-LABEL: func @quantize_non_cst
func @quantize_non_cst(%arg0: tensor<2x4xf32>) -> tensor<2x4xf32> {
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %arg0 : tensor<2x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x4xf32>
%q = "quant.qcast"(%arg0) : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x4xf32>
return %mul: tensor<2x4xf32>
}
// CHECK-LABEL: func @quantize_non_4x
func @quantize_non_4x(%arg0: tensor<2x5xf32>) -> tensor<2x5xf32> {
// CHECK: %[[w:.*]] = constant dense<1.000000e+00> : tensor<2x5xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[w]] : tensor<2x5xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x5xf32>
%w = constant dense<1.0> : tensor<2x5xf32>
%q = "quant.qcast"(%w) : (tensor<2x5xf32>) -> tensor<2x5x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x5x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x5xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x5xf32>
return %mul: tensor<2x5xf32>
}
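
The dequantize attributes checked in @quantize_rewrite above (min_range = -1.00392163, max_range = 0.996078431) come from the u8 type !quant.uniform<u8:f32, 0.0078431372549019607:128> via the same formula used in RewriteDequantize, min/max = scale * (storage_min/max - zero_point), with the CHECK values being the f32-rounded results. A quick check of that arithmetic:

#include <cstdio>

int main() {
  const double scale = 0.0078431372549019607;  // 2 / 255, from the test above
  const int zero_point = 128;
  const double min_range = scale * (0 - zero_point);    // about -1.00392157
  const double max_range = scale * (255 - zero_point);  // about  0.99607843
  std::printf("min_range = %f, max_range = %f\n", min_range, max_range);
  return 0;
}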

View File

@@ -1,69 +0,0 @@
// RUN: tf-opt -xla-hlo-propagate-quant %s | FileCheck %s --dump-input-on-failure
// -----
// CHECK-LABEL: @mul_add_source_no_params
func @mul_add_source_no_params(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [f32, f32, f32], logical_kernel = "generic.mul_add", output_specs = [f32]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [f32, f32, f32]
// CHECK-SAME: output_specs = [f32]
}
// -----
// CHECK-LABEL: @mul_add_annotated_no_narrow_range
func @mul_add_annotated_no_narrow_range(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0:-128>, !quant.uniform<i8:f32, 1.0:-128>, f32],
logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.0:-128>]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8:f32, 1.000000e+00:-128>, f32]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]
}
// -----
// CHECK-LABEL: @mul_add_annotated
func @mul_add_annotated(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0:-128>, !quant.uniform<i8<-127:127>:f32, 1.0:-128>, f32],
logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.0:-128>]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8<-127:127>:f32, 1.000000e+00:-128>, !quant.uniform<i32:f32, 1.000000e+00>]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]
}
// -----
// CHECK-LABEL: @same_scale_1_1
func @same_scale_1_1(%arg0: tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>) {
%region = "quant.region"(%arg0) ( {
^bb0(%arg1: tensor<1x7x7x64xf32>): // no predecessors
%r = "xla_hlo.reshape"(%arg1) : (tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>)
"quant.return"(%r) : (tensor<1x3136xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0>], logical_kernel = "generic.reshape", output_specs = [f32]} : (tensor<1x7x7x64xf32>) -> tensor<1x3136xf32>
return %region : tensor<1x3136xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00>]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00>]
}

View File

@@ -1,25 +0,0 @@
// RUN: tf-opt -xla-hlo-propagate-quant %s | FileCheck %s
// CHECK-LABEL: func @mul
func @mul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
// CHECK: %[[w:.*]] = constant dense<{{\[\[}}-1.000000e+00, -5.000000e-01], [5.000000e-01, 1.000000e+00]]> : tensor<2x2xf32>
// CHECK-NEXT: %[[q:.*]] = "quant.qcast"(%[[w]]) : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
// CHECK-NEXT: %[[dq:.*]] = "quant.dcast"(%[[q]]) : (tensor<2x2x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x2xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[dq]] : tensor<2x2xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x2xf32>
%w = constant dense<[[-1.0, -0.5], [0.5, 1.0]]> : tensor<2x2xf32>
%mul = xla_hlo.multiply %arg0, %w : tensor<2x2xf32>
return %mul: tensor<2x2xf32>
}
// CHECK-LABEL: func @add
func @add(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
// CHECK: %[[b:.*]] = constant dense<1.000000e+00> : tensor<2xf32>
// CHECK-NEXT: %[[q:.*]] = "quant.qcast"(%[[b]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform<u8:f32, 0.0039215686274509803>>
// CHECK-NEXT: %[[dq:.*]] = "quant.dcast"(%[[q]]) : (tensor<2x!quant.uniform<u8:f32, 0.0039215686274509803>>) -> tensor<2xf32>
// CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg0, %[[dq]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>, tensor<2xf32>) -> tensor<2x2xf32>
// CHECK-NEXT: return %[[add]] : tensor<2x2xf32>
%b = constant dense<1.0> : tensor<2xf32>
%add = "xla_hlo.add"(%arg0, %b) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>, tensor<2xf32>) -> tensor<2x2xf32>
return %add: tensor<2x2xf32>
}