Remove the experimental xla quantization work from open source (II)

PiperOrigin-RevId: 305821944
Change-Id: I3d606fd11cb3f92691fee3a85b9d35a29b2038da
Feng Liu 2020-04-09 21:38:22 -07:00 committed by TensorFlower Gardener
parent 84ff3e44b2
commit f22174826d
19 changed files with 4 additions and 1576 deletions

View File

@@ -70,7 +70,6 @@ cc_library(
"//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize",
"//tensorflow/compiler/mlir/lite/quantization:quantization_passes",
"//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_to_quant",
"//tensorflow/compiler/mlir/lite/quantization/xla:hlo_xla_quantization_passes",
"//tensorflow/compiler/mlir/tensorflow",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration",
"//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",

View File

@@ -14,7 +14,10 @@ package(
package_group(
name = "friends",
includes = ["//third_party/mlir:subpackages"],
packages = ["//tensorflow/compiler/mlir/..."],
packages = [
"//learning/brain/experimental/mlir/quantization/...",
"//tensorflow/compiler/mlir/...",
],
)
exports_files([

View File

@@ -1,86 +0,0 @@
load(
"//third_party/mlir:tblgen.bzl",
"gentbl",
)
package(
default_visibility = [
":friends",
],
licenses = ["notice"], # Apache 2.0
)
package_group(
name = "friends",
includes = ["//third_party/mlir:subpackages"],
packages = [
"//learning/brain/experimental/mlir/quantization/...",
"//tensorflow/compiler/mlir/...",
"//tensorflow/compiler/mlir/lite/...",
],
)
cc_library(
name = "hlo_xla_quantization_passes",
srcs = [
"cpu_kernel_fusion.cc",
"generated_cpu_kernel_fusion.inc",
"materialize.cc",
"op_quant_spec.inc",
"propagate.cc",
],
hdrs = [
"passes.h",
],
deps = [
":cpu_device_target",
"//tensorflow/compiler/mlir/lite/quantization:quantization_config",
"//tensorflow/compiler/mlir/lite/quantization:quantization_context",
"//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
"//tensorflow/compiler/mlir/xla:hlo",
"//tensorflow/compiler/xla/client/lib:quantize",
"@com_google_absl//absl/memory",
"@llvm-project//llvm:support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:StandardOps",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TransformUtils",
],
alwayslink = 1,
)
cc_library(
name = "cpu_device_target",
srcs = [
"cpu_device_target.cc",
],
hdrs = [
"cpu_device_target.h",
],
deps = [
"//tensorflow/compiler/mlir/lite/quantization:device_target",
"//tensorflow/compiler/mlir/lite/quantization:quantization_context",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:QuantOps",
"@llvm-project//mlir:Support",
],
)
gentbl(
name = "cpu_kernel_fusion_inc_gen",
tbl_outs = [
(
"-gen-rewriters",
"generated_cpu_kernel_fusion.inc",
),
],
tblgen = "@llvm-project//mlir:mlir-tblgen",
td_file = "cpu_kernel_fusion.td",
td_srcs = [
"@llvm-project//mlir:StdOpsTdFiles",
"//tensorflow/compiler/mlir/xla:hlo_ops_td_files",
"//tensorflow/compiler/mlir/lite/quantization:quantization_td_files",
],
)

View File

@@ -1,67 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/mlir/lite/quantization/xla/cpu_device_target.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_context.h"
namespace mlir {
namespace xla_hlo {
namespace ph = std::placeholders;
CpuDeviceTarget::CpuDeviceTarget(MLIRContext* ctx) : DeviceTarget(ctx) {
RegisterKernel("generic.concat", {qi8_, qi8_, qi8_},
quant::ScaleConstraintType::OutputInputSameScale);
// TODO(fengliuai): All the combinations are required to be listed. We need to
// improve this.
RegisterKernel("generic.reshape", {qi8_, any_},
quant::ScaleConstraintType::OutputInputSameScale);
RegisterKernel("generic.reshape", {any_, qi8_},
quant::ScaleConstraintType::OutputInputSameScale);
RegisterKernel("generic.mul", {qi8_, qi8_, qi8_},
quant::ScaleConstraintType::OutputInputFreeScale);
RegisterKernel("generic.mul_add", {qi8_, qi8n_, any_, qi8_},
std::bind(&CpuDeviceTarget::HandleMultiplyAccumulateScale,
this, ph::_1, ph::_2, ph::_3, ph::_4));
RegisterKernel("generic.matmul_add", {qi8_, qi8n_, any_, qi8_},
std::bind(&CpuDeviceTarget::HandleMultiplyAccumulateScale,
this, ph::_1, ph::_2, ph::_3, ph::_4));
}
LogicalResult CpuDeviceTarget::HandleMultiplyAccumulateScale(
quant::QuantizeContext* ctx, Operation* op,
quant::AdjacentOperations* new_items, bool* changed) {
auto bias_params = ctx->GetOperandParams(op, 2);
if (!EmptyParams(bias_params)) {
return success();
}
std::vector<quant::QuantParams> op_types{ctx->GetOperandParams(op, 0),
ctx->GetOperandParams(op, 1)};
auto bias_scale = GetUniformQuantizedTypeForBias(op_types);
if (bias_scale && ctx->SetOperandParams(op, 2, bias_scale)) {
*changed = true;
new_items->push_back(op->getOperand(2).getDefiningOp());
}
return success();
}
} // namespace xla_hlo
} // namespace mlir
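
The mul_add and matmul_add handlers above defer the bias parameters to GetUniformQuantizedTypeForBias once both inputs have parameters. A minimal C++ sketch of the arithmetic that constraint implies, assuming the usual affine mapping real = scale * (storage - zero_point); the struct and function names here are illustrative, not part of the deleted sources:

#include <cstdint>

struct UniformQuantParams {
  double scale;        // real_value = scale * (storage_value - zero_point)
  int64_t zero_point;
  int storage_bits;
};

// For a fused multiply-accumulate, the bias is quantized with the product of
// the input and weight scales, a zero point of 0, and a wider storage type
// (assumed 32-bit) so the accumulation does not overflow.
UniformQuantParams DeriveBiasParams(const UniformQuantParams& input,
                                    const UniformQuantParams& weight) {
  return {input.scale * weight.scale, /*zero_point=*/0, /*storage_bits=*/32};
}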

View File

@@ -1,40 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/device_target.h"
namespace mlir {
namespace xla_hlo {
// Target specs for cpu kernels
class CpuDeviceTarget : public quant::DeviceTarget {
public:
explicit CpuDeviceTarget(MLIRContext* ctx);
private:
LogicalResult HandleMultiplyAccumulateScale(
quant::QuantizeContext* ctx, Operation* op,
quant::AdjacentOperations* new_items, bool* changed);
};
} // namespace xla_hlo
} // namespace mlir
#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_CPU_DEVICE_TARGET_H_

View File

@@ -1,347 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <math.h>
#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <iterator>
#include <numeric>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project
#include "mlir/IR/Function.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/Matchers.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/StandardTypes.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "mlir/Support/LLVM.h" // from @llvm-project
#include "mlir/Support/LogicalResult.h" // from @llvm-project
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
#include "tensorflow/compiler/xla/client/lib/quantize.h"
#define DEBUG_TYPE "quant-kernel-fusion"
constexpr int kFakeQuantOperandsNum = 5;
constexpr int kFakeQuantPerChannelOperandsNum = 6;
namespace mlir {
namespace xla_hlo {
namespace {
TypeAttr GetQuantSpec(Operation* op) {
auto fake_quant = llvm::dyn_cast_or_null<CustomCallOp>(op);
if (!fake_quant || fake_quant.getNumOperands() < kFakeQuantOperandsNum ||
fake_quant.getNumOperands() > kFakeQuantPerChannelOperandsNum ||
fake_quant.call_target_name() != "fake_quant_with_min_max_vars")
return {};
DenseFPElementsAttr min, max;
DenseIntElementsAttr bit_width, narrow_range, quant_dim;
if (!matchPattern(fake_quant.getOperand(1), m_Constant(&min)) ||
!matchPattern(fake_quant.getOperand(2), m_Constant(&max)) ||
!matchPattern(fake_quant.getOperand(3), m_Constant(&bit_width)) ||
!matchPattern(fake_quant.getOperand(4), m_Constant(&narrow_range)))
return {};
auto bit_width_val = (*bit_width.attr_value_begin()).cast<IntegerAttr>();
auto narrow_range_val = (*narrow_range.int_value_begin()).getSExtValue();
int quant_dim_val = -1;
if (fake_quant.getNumOperands() == kFakeQuantPerChannelOperandsNum &&
matchPattern(fake_quant.getOperand(kFakeQuantPerChannelOperandsNum - 1),
m_Constant(&quant_dim))) {
quant_dim_val = (*quant_dim.int_value_begin()).getSExtValue();
}
OpBuilder builder(op);
Type input_type =
fake_quant.getOperand(0).getType().cast<ShapedType>().getElementType();
return quant::GetQuantizedTypeAttr(
builder, input_type, min, max, quant_dim_val, bit_width_val,
builder.getBoolAttr(narrow_range_val), /*is_signed=*/true);
}
// Collects the input values of 'ops' that are defined outside of 'ops'.
void CollectInputs(llvm::ArrayRef<Operation*> ops,
llvm::SmallVectorImpl<Value>* inputs,
llvm::SmallVectorImpl<Attribute>* input_specs) {
for (Operation* op : ops) {
for (Value operand : op->getOperands()) {
if (std::find(inputs->begin(), inputs->end(), operand) != inputs->end()) {
continue;
}
if (Operation* def_op = operand.getDefiningOp()) {
if (std::find(ops.begin(), ops.end(), def_op) == ops.end()) {
inputs->push_back(operand);
}
} else { // argument value
inputs->push_back(operand);
}
}
}
for (Value input : *inputs) {
ShapedType input_type = input.getType().cast<ShapedType>();
if (TypeAttr spec = GetQuantSpec(input.getDefiningOp())) {
input_specs->push_back(spec);
} else {
input_specs->push_back(TypeAttr::get(input_type.getElementType()));
}
}
}
// Collects values that are produced by 'ops' and have uses outside of 'ops'.
// TODO(fengliuai): if it is a single user and QDQ, write that to the specs.
void CollectRets(llvm::ArrayRef<Operation*> ops,
llvm::SmallVectorImpl<Value>* rets,
llvm::SmallVectorImpl<Type>* ret_types,
llvm::SmallVectorImpl<Attribute>* ret_specs) {
for (Operation* op : ops) {
// The constant will not be shared outside the region.
if (llvm::isa<ConstantOp>(op)) continue;
for (Value result : op->getResults()) {
for (Operation* user : result.getUsers()) {
// If there is any user outside of 'ops'
if (std::find(ops.begin(), ops.end(), user) == ops.end()) {
ShapedType ret_type = result.getType().cast<ShapedType>();
rets->push_back(result);
ret_types->push_back(ret_type);
if (TypeAttr spec = GetQuantSpec(user)) {
ret_specs->push_back(spec);
} else {
ret_specs->push_back(TypeAttr::get(ret_type.getElementType()));
}
break;
}
}
}
}
}
enum FusedActivationFunc { NONE, RELU, RELU1, RELU6 };
#define FLOAT_EQ(value, x) fabs(value - x) <= 1e-6
// If the op is max(in, 0.0), we consider it to be from a Relu, so both this op
// and the constant 0.0 will be fused.
// If the op is clamp(0.0, in, 1.0) or clamp(0.0, in, 6.0), we consider it to be
// from a Relu1 or Relu6, so all the constants and this op will be fused.
// Returns the activation function type.
FusedActivationFunc FuseReluX(Operation* op,
llvm::SmallVectorImpl<Operation*>* fused) {
if (auto max = llvm::dyn_cast<xla_hlo::MaxOp>(op)) {
Value min_val = max.rhs();
llvm::SmallVector<Operation*, 4> broadcast_ops;
if (auto broadcast = llvm::dyn_cast_or_null<xla_hlo::BroadcastInDimOp>(
min_val.getDefiningOp())) {
min_val = broadcast.operand();
broadcast_ops.push_back(broadcast);
}
DenseFPElementsAttr min;
if (!matchPattern(min_val, m_Constant(&min))) {
// In case the min value is lhs.
min_val = max.lhs();
broadcast_ops.clear();
if (auto broadcast = llvm::dyn_cast_or_null<xla_hlo::BroadcastInDimOp>(
min_val.getDefiningOp())) {
min_val = broadcast.operand();
broadcast_ops.push_back(broadcast);
}
if (!matchPattern(min_val, m_Constant(&min))) {
return NONE;
}
}
if (!min.isSplat() ||
!(FLOAT_EQ(min.getSplatValue().cast<FloatAttr>().getValueAsDouble(),
0.0))) {
return NONE;
}
// Include the constant 0.0 as well, so it doesn't get quantized.
fused->push_back(min_val.getDefiningOp());
fused->append(broadcast_ops.begin(), broadcast_ops.end());
fused->push_back(max);
return RELU;
}
if (auto clamp = llvm::dyn_cast<xla_hlo::ClampOp>(op)) {
DenseFPElementsAttr lower, upper;
if (!matchPattern(clamp.min(), m_Constant(&lower)) ||
!matchPattern(clamp.max(), m_Constant(&upper)) || !lower.isSplat() ||
!upper.isSplat() ||
!(FLOAT_EQ(lower.getSplatValue().cast<FloatAttr>().getValueAsDouble(),
0.0))) {
return NONE;
}
double upper_value =
upper.getSplatValue().cast<FloatAttr>().getValueAsDouble();
if (FLOAT_EQ(upper_value, 1.0) || FLOAT_EQ(upper_value, 6.0)) {
fused->push_back(clamp.min().getDefiningOp());
fused->push_back(clamp.max().getDefiningOp());
fused->push_back(op);
return (FLOAT_EQ(upper_value, 1.0) ? RELU1 : RELU6);
}
}
return NONE;
}
llvm::SmallVector<Value, 0> FuseOps(PatternRewriter* rewriter,
const std::initializer_list<Value>& results,
StringRef kernel) {
// Collect all the operations to be fused.
llvm::SmallVector<Operation*, 4> fused;
llvm::SmallVector<Location, 4> locs;
fused.reserve(results.size());
locs.reserve(results.size());
for (auto value : results) {
Operation* op = value.getDefiningOp();
fused.push_back(op);
locs.push_back(op->getLoc());
}
Operation* root = fused.back();
FusedActivationFunc act_func = FusedActivationFunc::NONE;
// If there is a Relu, Relu1 or Relu6, fuse it as well.
if (results.size() > 0 && std::rbegin(results)->hasOneUse()) {
act_func = FuseReluX(*std::rbegin(results)->user_begin(), &fused);
}
// Collect the inputs that come from outside of 'ops'.
llvm::SmallVector<Value, 4> inputs;
llvm::SmallVector<Attribute, 4> input_specs;
CollectInputs(fused, &inputs, &input_specs);
// Collect the outputs of 'ops' that are used outside of 'ops'.
llvm::SmallVector<Value, 4> rets;
llvm::SmallVector<Type, 4> ret_types;
llvm::SmallVector<Attribute, 4> ret_specs;
CollectRets(fused, &rets, &ret_types, &ret_specs);
// TODO(fengliuai): make activation function an attribute.
std::string kernel_name;
switch (act_func) {
case RELU:
kernel_name = llvm::Twine(kernel, "_relu").str();
break;
case RELU1:
kernel_name = llvm::Twine(kernel, "_relu1").str();
break;
case RELU6:
kernel_name = llvm::Twine(kernel, "_relu6").str();
break;
default:
kernel_name = kernel.str();
}
// Create the region op with the return.
auto region = rewriter->create<quant::QuantizeRegionOp>(
rewriter->getFusedLoc(locs), ret_types, inputs,
rewriter->getArrayAttr(input_specs), rewriter->getArrayAttr(ret_specs),
kernel_name);
auto* body = new Block();
region.body().push_back(body);
OpBuilder builder = OpBuilder::atBlockEnd(body);
BlockAndValueMapping mapping;
// Create block arguments and add them to the block value mapping.
for (Value input : inputs) {
mapping.map(input, body->addArgument(input.getType()));
}
// Clone the fused operations into the region.
for (Operation* op : fused) {
builder.clone(*op, mapping);
}
llvm::SmallVector<Value, 4> new_rets;
new_rets.reserve(rets.size());
for (auto ret : llvm::enumerate(rets)) {
Value new_ret = mapping.lookupOrNull(ret.value());
assert(new_ret && "couldn't find return value.");
new_rets.push_back(new_ret);
ret.value().replaceAllUsesWith(region.getResult(ret.index()));
}
builder.create<quant::ReturnOp>(builder.getUnknownLoc(), new_rets);
LLVM_DEBUG({
assert(region.verify().Success && "failed to create quant region.");
llvm::dbgs() << "\ncreated region: ";
region.print(llvm::dbgs());
llvm::dbgs() << "\n\n\n";
});
// All uses of the fused ops are replaced, so the values in this vector
// will not be used.
SmallVector<Value, 0> new_values(root->getNumResults(), region.getResult(0));
return new_values;
}
struct CpuKernelFusionPass
: public PassWrapper<CpuKernelFusionPass, FunctionPass> {
explicit CpuKernelFusionPass() = default;
CpuKernelFusionPass(const CpuKernelFusionPass&) {}
void runOnFunction() override;
};
#include "tensorflow/compiler/mlir/lite/quantization/xla/generated_cpu_kernel_fusion.inc"
void CpuKernelFusionPass::runOnFunction() {
Operation* op = getOperation();
MLIRContext* ctx = op->getContext();
OwningRewritePatternList patterns;
populateWithGenerated(ctx, &patterns);
applyPatternsGreedily(op->getRegions(), patterns);
}
} // namespace
// Creates an instance of the xla_hlo cpu kernel fusion pass.
std::unique_ptr<OperationPass<FuncOp>> CreateCpuKernelFusionPass() {
return std::make_unique<CpuKernelFusionPass>();
}
static PassRegistration<CpuKernelFusionPass> pass(
"xla-hlo-cpu-fusion", "Fuse xla hlo ops into cpu kernels");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,70 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td"
include "mlir/IR/OpBase.td"
include "mlir/Dialect/StandardOps/IR/Ops.td"
class Fused1Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0}, \"" # kernel # "\")">;
class Fused2Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1}, \"" # kernel # "\")">;
class Fused3Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1, $2}, \"" # kernel # "\")">;
class Fused4Ops<string kernel> : NativeCodeCall<
"FuseOps(&$_builder, {$0, $1, $2, $3}, \"" # kernel # "\")">;
// We shouldn't revisit ops which have already been fused. This constraint is
// required because the greedy pattern rewriter will visit and match any new
// ops, so once the source patterns are matched and wrapped by the quant region
// op, these ops would be matched again. To prevent this, the constraint
// bypasses any ops that are already inside a quant region.
def NeedsToBeFused : Constraint<CPred<
"!$0.getDefiningOp()->getParentOfType<quant::QuantizeRegionOp>()">>;
// dummy example
def : Pat<(HLO_AddOp:$add (HLO_MulOp:$mul $_, $_, $_), $_, $_),
(Fused2Ops<"generic.mul_add"> $mul, $add),
[(NeedsToBeFused $add)]>;
// add
def : Pat<(HLO_AddOp:$add $_, $_, $_),
(Fused1Ops<"generic.add"> $add),
[(NeedsToBeFused $add)]>;
// reduce_window: maxpool, avgpool
def : Pat<(HLO_ReduceWindowOp:$reduce $_, $_, $_, $_, $_, $_, $_),
(Fused1Ops<"generic.reduce_window"> $reduce),
[(NeedsToBeFused $reduce)]>;
// reshape
def : Pat<(HLO_ReshapeOp:$reshape $_), (Fused1Ops<"generic.reshape"> $reshape),
[(NeedsToBeFused $reshape)]>;
// broadcast
def : Pat<(HLO_BroadcastInDimOp:$broadcast $_, $_),
(Fused1Ops<"generic.broadcast"> $broadcast),
[(NeedsToBeFused $broadcast)]>;
// dot -> add
def : Pat<(HLO_AddOp:$add (HLO_DotOp:$dot $_, $_, $_), $_, $_),
(Fused2Ops<"generic.biased_dot"> $dot, $add),
[(NeedsToBeFused $add)]>;
// conv -> add
def : Pat<(HLO_AddOp:$add
(HLO_ConvOp:$conv $_, $_, $_, $_, $_, $_, $_, $_, $_, $_), $_, $_),
(Fused2Ops<"generic.biased_conv"> $conv, $add),
[(NeedsToBeFused $add)]>;

View File

@@ -1,175 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This transformation pass quantizes the constants and rewrites the
// quantization ops with xla_hlo primitive ops.
#include <cstdint>
#include <iterator>
#include <numeric>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/IR/Attributes.h" // from @llvm-project
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/StandardTypes.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
#include "tensorflow/compiler/xla/client/lib/quantize.h"
//===----------------------------------------------------------------------===//
// The pass to materialize the quantization results by xla primitive ops.
//
namespace mlir {
namespace xla_hlo {
namespace {
// This pattern matches the "constant->qcast->dcast" chain and replaces it with
// "quantized constant->xla_hlo.dequantize". If it only matches a
// "non-constant->qcast->dcast" chain, it removes both the "qcast" and "dcast"
// ops. We match the chain as a whole to bypass the type checks of the normal
// xla_hlo ops.
// TODO(fengliuai): make this pass work for bf16 input.
class RewriteDequantize : public OpRewritePattern<quant::DequantizeCastOp> {
public:
explicit RewriteDequantize(int64_t size, MLIRContext *context)
: OpRewritePattern<quant::DequantizeCastOp>(context), size_(size) {}
LogicalResult matchAndRewrite(quant::DequantizeCastOp op,
PatternRewriter &rewriter) const override {
// quant.dcast
// xla_hlo dequantize only takes min/max, so let's recover them from
// the quantization parameters.
Value dcast = op.arg();
auto type = quant::QuantizedType::getQuantizedElementType(dcast.getType());
if (!type || !type.isa<quant::UniformQuantizedType>()) {
return failure();
}
auto qtype = type.cast<quant::UniformQuantizedType>();
double scale = qtype.getScale();
int64_t zero_point = qtype.getZeroPoint();
float min = scale * (qtype.getStorageTypeMin() - zero_point);
float max = scale * (qtype.getStorageTypeMax() - zero_point);
// quant.qcast
auto qcast =
llvm::dyn_cast_or_null<quant::QuantizeCastOp>(dcast.getDefiningOp());
if (!qcast) return failure();
// constant
DenseFPElementsAttr attr;
// If it isn't a floating-point constant or the size is too small, remove
// the quantization. Also, the last dimension size should be a multiple of 4,
// so the shape isn't broken during packing and unpacking.
if (!matchPattern(qcast.arg(), m_Constant(&attr)) ||
attr.getNumElements() <= size_ ||
attr.getType().getDimSize(attr.getType().getRank() - 1) % 4 != 0) {
op.getResult().replaceAllUsesWith(qcast.arg());
return success();
}
// TODO(fengliuai): implement transpose if it has high dimension.
// Create the quantized result
auto quantized_result =
quant::Quantize(attr, qtype).dyn_cast_or_null<DenseIntElementsAttr>();
if (!quantized_result) {
return failure();
}
// Pack the uint8 bits into uint32. The shape is changed from
// [n0, n1, ..., nk] to [n0, n1, ..., nk / 4].
std::vector<uint8_t> raw_data;
for (auto d : quantized_result.getValues<uint8_t>()) {
raw_data.push_back(d);
}
// The packing might increase the data size due to padding.
auto packed_data = xla::PackToUint32<uint8_t>(raw_data);
auto packed_shape = attr.getType().getShape().vec();
int lower_dims = std::accumulate(
packed_shape.begin(),
std::next(packed_shape.begin(), packed_shape.size() - 1), 1,
std::multiplies<int>());
packed_shape[packed_shape.size() - 1] = packed_data.size() / lower_dims;
auto packed_type =
RankedTensorType::get(packed_shape, rewriter.getIntegerType(32));
auto packed_quantized_result =
DenseElementsAttr::get<uint32_t>(packed_type, packed_data);
auto quantized_constant =
rewriter.create<ConstantOp>(qcast.getLoc(), packed_quantized_result);
// Create the xla dequantize op with bf16 output
auto dequantized_type = RankedTensorType::get(attr.getType().getShape(),
rewriter.getBF16Type());
auto dequantize = rewriter.create<DequantizeOp>(
qcast.getLoc(), dequantized_type, quantized_constant,
rewriter.getF32FloatAttr(min), rewriter.getF32FloatAttr(max),
rewriter.getStringAttr("MIN_COMBINED"), rewriter.getBoolAttr(false),
rewriter.getBoolAttr(false));
// Convert bf16 output back to f32
rewriter.replaceOpWithNewOp<ConvertOp>(op, op.getResult().getType(),
dequantize);
return success();
}
private:
int64_t size_;
};
// Materialize the quantization results by hlo primitive ops.
struct MaterializeToXlaPass
: public PassWrapper<MaterializeToXlaPass, FunctionPass> {
explicit MaterializeToXlaPass() = default;
MaterializeToXlaPass(const MaterializeToXlaPass &) {}
void runOnFunction() override;
};
void MaterializeToXlaPass::runOnFunction() {
FuncOp func = getFunction();
MLIRContext *ctx = &getContext();
OwningRewritePatternList patterns;
// TODO(fengliuai): make the size 6 configurable.
patterns.insert<RewriteDequantize>(6, ctx);
applyPatternsGreedily(func, patterns);
}
} // namespace
// Creates an instance of the xla_hlo dialect quantization materialization pass.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializeToXlaPass() {
return std::make_unique<MaterializeToXlaPass>();
}
static PassRegistration<MaterializeToXlaPass> pass(
"xla-hlo-materialize-quant",
"Materialize the quantization results by xla primitve ops");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,7 +0,0 @@
// TODO(fengliuai): automatically generate this file
// TODO(fengliuai): add all the xla_hlo ops
static std::unique_ptr<quant::OpQuantSpec> GetOpQuantSpec(mlir::Operation *op) {
auto spec = absl::make_unique<quant::OpQuantSpec>();
return spec;
}

View File

@@ -1,40 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
#include <memory>
#include "mlir/IR/Function.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
namespace mlir {
namespace xla_hlo {
// Propagate the quantization information to all the tensors according to the
// op quant spec.
std::unique_ptr<OperationPass<FuncOp>> CreatePropagateQuantPass();
// Rewrite the graph and quantize the constants.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializeToXlaPass();
// Fuse HLO ops into quantized regions.
std::unique_ptr<OperationPass<FuncOp>> CreateCpuKernelFusionPass();
} // namespace xla_hlo
} // namespace mlir
#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_PASSES_H_
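
Together, the three factories above describe the intended pipeline: fuse HLO ops into quant.region kernels, propagate quantization parameters across the regions, then materialize the results as primitive ops. A hedged sketch of wiring them into a pass manager; the helper name and the nesting/ordering are assumptions, since the deleted sources only registered each pass individually through PassRegistration:

#include "mlir/IR/Function.h"       // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/xla/passes.h"

// Illustrative helper: run the deleted passes in their logical order on every
// function in the module.
void AddXlaHloQuantizationPasses(mlir::PassManager& pm) {
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreateCpuKernelFusionPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreatePropagateQuantPass());
  pm.addNestedPass<mlir::FuncOp>(mlir::xla_hlo::CreateMaterializeToXlaPass());
}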

View File

@@ -1,108 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This transformation pass applies quantization propagation on the xla_hlo dialect.
#include <iterator>
#include <string>
#include "absl/memory/memory.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/IR/MLIRContext.h" // from @llvm-project
#include "mlir/IR/PatternMatch.h" // from @llvm-project
#include "mlir/IR/Value.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_context.h"
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
#include "tensorflow/compiler/mlir/lite/quantization/xla/cpu_device_target.h"
// NOLINTNEXTLINE
static llvm::cl::opt<bool> disable_per_channel(
"xla-disable-per-channel", llvm::cl::value_desc("bool"),
llvm::cl::desc("Whether disable per-channel quantized weights."),
llvm::cl::init(false));
//===----------------------------------------------------------------------===//
// The quantization propagation Pass.
//
namespace mlir {
namespace xla_hlo {
namespace {
// Applies quantization propagation on the input function. During the
// propagation, two facts are respected:
// - The quantization types (params) already annotated on the ops in the function
// - The quantization specs of the ops
// The propagation result should assign quantization types to all the tensors
// while respecting these two restrictions.
struct PropagateQuantPass
: public PassWrapper<PropagateQuantPass, FunctionPass> {
explicit PropagateQuantPass() = default;
PropagateQuantPass(const PropagateQuantPass &) {}
void runOnFunction() override;
};
#include "tensorflow/compiler/mlir/lite/quantization/xla/op_quant_spec.inc"
void PropagateQuantPass::runOnFunction() {
FuncOp func = getFunction();
// TODO(fengliuai): deprecate this old code generation path.
// XLA only supports uint8/uint16 quantization for now.
ApplyQuantizationParamsPropagation(func, /*is_signed*/ false,
disable_per_channel, GetOpQuantSpec);
CpuDeviceTarget spec(&getContext());
quant::QuantizeContext ctx(func, spec);
std::vector<quant::QuantizeRegionOp> work_list = ctx.GetAllOps();
bool changed = false;
while (!work_list.empty()) {
quant::QuantizeRegionOp op = work_list.back();
work_list.pop_back();
llvm::SmallVector<Operation *, 4> new_items;
if (failed(ctx.Handle(op, &new_items, &changed))) {
// The IR is still valid, thus we shouldn't fail.
signalPassFailure();
}
for (auto item : new_items) {
if (auto reg = llvm::dyn_cast_or_null<quant::QuantizeRegionOp>(item))
work_list.push_back(reg);
}
}
if (!changed) return;
if (failed(ctx.Finalize())) {
signalPassFailure();
}
}
} // namespace
// Creates an instance of the xla_hlo dialect quantization propagation pass.
std::unique_ptr<OperationPass<FuncOp>> CreatePropagateQuantPass() {
return std::make_unique<PropagateQuantPass>();
}
static PassRegistration<PropagateQuantPass> pass(
"xla-hlo-propagate-quant", "Propagate quantization information");
} // namespace xla_hlo
} // namespace mlir

View File

@@ -1,33 +0,0 @@
load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
package(licenses = ["notice"])
glob_lit_tests(
data = [
":graph_config_files",
":test_utilities",
],
driver = "@llvm-project//mlir:run_lit.sh",
exclude = ["fadd_quant.mlir"],
test_file_exts = ["mlir"],
)
# Bundle together all of the test utilities that are used by tests.
filegroup(
name = "test_utilities",
testonly = True,
data = [
"//tensorflow/compiler/aot:tfcompile",
"//tensorflow/compiler/mlir:tf-opt",
"@llvm-project//llvm:FileCheck",
"@llvm-project//llvm:not",
],
)
# Bundle together all the graph files that are used by the tests.
filegroup(
name = "graph_config_files",
srcs = glob(
["**/*.pbtxt"],
),
)

View File

@@ -1,199 +0,0 @@
// RUN: tf-opt -xla-hlo-cpu-fusion %s | FileCheck %s
// CHECK-LABEL: @mul_add_source
func @mul_add_source(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%0 = "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
%1 = "xla_hlo.add"(%0, %arg2) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %1 : tensor<4xf32>
// CHECK: %[[region:.*]] = "quant.region"(%arg0, %arg1, %arg2) ( {
// CHECK: ^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
// CHECK: %[[mul:.*]] = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
// CHECK: %[[add:.*]] = xla_hlo.add %[[mul]], %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
// CHECK: "quant.return"(%[[add]]) : (tensor<4xf32>) -> ()
// CHECK: }) {input_specs = [f32, f32, f32], logical_kernel = "generic.mul_add", output_specs = [f32]} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
// CHECK: return %[[region]] : tensor<4xf32>
}
// CHECK-LABEL: @mul_add_annotated
func @mul_add_annotated(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2x4xf32>) -> (tensor<2x4xf32>) {
%cst = constant dense<0.0> : tensor<f32>
%cst_0 = constant dense<255.0> : tensor<f32>
%cst_1 = constant dense<8> : tensor<i32>
%cst_2 = constant dense<false> : tensor<i1>
%qin = "xla_hlo.custom_call"(%arg0, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.1"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
%qw = "xla_hlo.custom_call"(%arg1, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.2"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
%0 = "xla_hlo.multiply"(%qin, %qw) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
%1 = "xla_hlo.add"(%0, %arg2) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
%r = "xla_hlo.custom_call"(%1, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars",
has_side_effect = false, name = "custom-call.3"} : (tensor<2x4xf32>, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i1>) -> tensor<2x4xf32>
return %r : tensor<2x4xf32>
// CHECK: %[[region:.*]] = "quant.region"
// CHECK: ^bb0(%arg3: tensor<2x4xf32>, %arg4: tensor<2x4xf32>, %arg5: tensor<2x4xf32>): // no predecessors
// CHECK: %[[mul:.*]] = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<2x4xf32>
// CHECK: %[[add:.*]] = xla_hlo.add %[[mul]], %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<2x4xf32>
// CHECK: "quant.return"(%[[add]]) : (tensor<2x4xf32>) -> ()
// CHECK: }) {input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8:f32, 1.000000e+00:-128>, f32],
// CHECK-SAME: logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]} :
// CHECK-SAME: (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
// CHECK: %[[r:.*]] = "xla_hlo.custom_call"(%[[region]]
// CHECK: return %[[r]] : tensor<2x4xf32>
}
// CHECK-LABEL: @reduce_window
func @reduce_window(%arg0: tensor<1x28x28x32xf32>, %arg1: tensor<f32>) -> (tensor<1x14x14x32xf32>) {
%0 = "xla_hlo.reduce_window"(%arg0, %arg1) ({
^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
%1 = xla_hlo.maximum %arg2, %arg3 : tensor<f32>
"xla_hlo.return"(%1) : (tensor<f32>) -> ()
}) {
base_dilations = dense<1> : tensor<4xi64>,
padding = dense<0> : tensor<4x2xi64>,
window_dilations = dense<1> : tensor<4xi64>,
window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>,
window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>
} : (tensor<1x28x28x32xf32>, tensor<f32>) -> tensor<1x14x14x32xf32>
return %0 : tensor<1x14x14x32xf32>
// CHECK: "quant.region"(%arg0, %arg1) ( {
// CHECK: ^bb0(%arg2: tensor<1x28x28x32xf32>, %arg3: tensor<f32>): // no predecessors
// CHECK: %[[rw:.*]] = "xla_hlo.reduce_window"(%arg2, %arg3) ( {
// CHECK: ^bb0(%arg4: tensor<f32>, %arg5: tensor<f32>): // no predecessors
// CHECK: %[[max:.*]] = xla_hlo.maximum %arg4, %arg5 : tensor<f32>
// CHECK: "xla_hlo.return"(%[[max]]) : (tensor<f32>) -> ()
// CHECK: })
// CHECK: "quant.return"(%[[rw]])
// CHECK: }) {input_specs = [f32, f32], logical_kernel = "generic.reduce_window", output_specs = [f32]}
}
// CHECK-LABEL: @reshape
func @reshape(%arg0: tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>) {
%0 = "xla_hlo.reshape"(%arg0) : (tensor<1x7x7x64xf32>) -> tensor<1x3136xf32>
return %0 : tensor<1x3136xf32>
// CHECK: "quant.region"(%arg0)
// CHECK: logical_kernel = "generic.reshape"
}
// CHECK-LABEL: @broadcast
func @broadcast(%arg0: tensor<64xf32>) -> (tensor<1x14x14x64xf32>) {
%0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x14x14x64xf32>
return %0 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0)
// CHECK: logical_kernel = "generic.broadcast"
}
// CHECK-LABEL: @biased_dot
func @biased_dot(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x10xf32>, %arg2: tensor<1x10xf32>) -> (tensor<1x10xf32>) {
%0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x1024xf32>, tensor<1024x10xf32>) -> tensor<1x10xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x10xf32>
return %1 : tensor<1x10xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: xla_hlo.dot
// CHECK: xla_hlo.add
// CHECK: logical_kernel = "generic.biased_dot"
}
// CHECK-LABEL: @biased_conv
func @biased_conv(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
return %1 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: logical_kernel = "generic.biased_conv"
}
// CHECK-LABEL: @biased_dot_relu
func @biased_dot_relu(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x10xf32>, %arg2: tensor<1x10xf32>) -> (tensor<1x10xf32>) {
%cst = constant dense<0.0> : tensor<1x10xf32>
%0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x1024xf32>, tensor<1024x10xf32>) -> tensor<1x10xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x10xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x10xf32>
return %2 : tensor<1x10xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.dot
// CHECK: xla_hlo.add
// CHECK: xla_hlo.maximum
// CHECK: logical_kernel = "generic.biased_dot_relu"
}
// CHECK-LABEL: @biased_conv_relu
func @biased_conv_relu(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%cst = constant dense<0.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x14x14x64xf32>
return %2 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: xla_hlo.maximum
// CHECK: logical_kernel = "generic.biased_conv_relu"
}
// CHECK-LABEL: @biased_conv_relu_shared
func @biased_conv_relu_shared(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>) {
%cst = constant dense<0.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = xla_hlo.maximum %1, %cst : tensor<1x14x14x64xf32>
return %cst, %2 : tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: %[[max:.*]] = xla_hlo.maximum
// CHECK: "quant.return"(%[[max]])
// CHECK: logical_kernel = "generic.biased_conv_relu"
}
// CHECK-LABEL: @biased_conv_relu6
func @biased_conv_relu6(%arg0: tensor<1x14x14x32xf32>, %arg1: tensor<5x5x32x64xf32>, %arg2: tensor<1x14x14x64xf32>) -> (tensor<1x14x14x64xf32>) {
%min = constant dense<0.0> : tensor<1x14x14x64xf32>
%max = constant dense<6.0> : tensor<1x14x14x64xf32>
%0 = "xla_hlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64,
input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64,
kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64,
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, lhs_dilations = dense<1> : tensor<2xi64>,
padding = dense<2> : tensor<2x2xi64>, precision_config = ["DEFAULT", "DEFAULT"], rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
} : (tensor<1x14x14x32xf32>, tensor<5x5x32x64xf32>) -> tensor<1x14x14x64xf32>
%1 = xla_hlo.add %0, %arg2 : tensor<1x14x14x64xf32>
%2 = "xla_hlo.clamp"(%min, %1, %max) : (tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
return %2 : tensor<1x14x14x64xf32>
// CHECK: "quant.region"(%arg0, %arg1, %arg2)
// CHECK: constant
// CHECK: constant
// CHECK: xla_hlo.convolution
// CHECK: xla_hlo.add
// CHECK: xla_hlo.clamp
// CHECK: logical_kernel = "generic.biased_conv_relu6"
}

View File

@@ -1,10 +0,0 @@
# RUN: not tfcompile --graph=%s.pbtxt --config=%s.config.pbtxt --experimental_quantize --cpp_class="::test::fadd_quant" 2>&1 | FileCheck %s -dump-input-on-failure
# TODO(fengliuai): update this file with the progress of the implementation
// CHECK: "quant.region"
// CHECK: ^bb0(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>): // no predecessors
// CHECK: xla_hlo.add %arg0, %arg1
// CHECK: "quant.return"
// CHECK: }) {input_specs = [!quant.uniform<i8:f32, 0.49803921568627452:-128>, !quant.uniform<i8:f32, 0.49803921568627452:-128>],
// CHECK-SAME: logical_kernel = "generic.add", output_specs = [!quant.uniform<i8:f32, 0.49803921568627452:-128>]}
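
The 0.49803921568627452:-128 parameters in the CHECK lines above follow from the fake-quant range configured in the .pbtxt files below (min 0.0, max 127.0, 8 bits, narrow_range false) under the standard signed affine quantization; a small C++ sketch of that arithmetic, with the nudging and rounding details left out as assumptions:

#include <cstdio>

int main() {
  const double rmin = 0.0, rmax = 127.0;  // fake-quant range from the graph
  const int qmin = -128, qmax = 127;      // signed 8-bit, narrow_range = false
  const double scale = (rmax - rmin) / (qmax - qmin);            // 127 / 255
  const int zero_point = qmin - static_cast<int>(rmin / scale);  // -128
  std::printf("scale = %.17g, zero_point = %d\n", scale, zero_point);
  return 0;
}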

View File

@@ -1,26 +0,0 @@
feed {
id { node_name: "input0" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
feed {
id { node_name: "input1" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
fetch {
id { node_name: "Add/FakeQuantWithMinMaxVars" }
shape {
dim { size: 2 }
dim { size: 4 }
}
}
conversion_options {
custom_fake_quant_op_calls: true
}

View File

@@ -1,218 +0,0 @@
node: {
name: "Add/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "Add"
input: "Add/FakeQuantWithMinMaxVars/min"
input: "Add/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "Add/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "Add/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "Add"
op: "Add"
input: "input0/FakeQuantWithMinMaxVars"
input: "input1/FakeQuantWithMinMaxVars"
attr {
key: "T"
value {
type: DT_FLOAT
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "input0"
input: "input0/FakeQuantWithMinMaxVars/min"
input: "input0/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "input0/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "input0"
op: "Placeholder"
attr {
key: "dtype"
value {
type: DT_FLOAT
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars"
op: "FakeQuantWithMinMaxVars"
input: "input1"
input: "input1/FakeQuantWithMinMaxVars/min"
input: "input1/FakeQuantWithMinMaxVars/max"
attr: {
key: "num_bits"
value: {
i: 8
}
}
attr: {
key: "narrow_range"
value: {
b: false
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars/min"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 0.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node: {
name: "input1/FakeQuantWithMinMaxVars/max"
op: "Const"
attr: {
key: "value"
value: {
tensor: {
dtype: DT_FLOAT
tensor_shape: {
}
float_val: 127.0
}
}
}
attr: {
key: "dtype"
value: {
type: DT_FLOAT
}
}
}
node {
name: "input1"
op: "Placeholder"
attr {
key: "dtype"
value {
type: DT_FLOAT
}
}
}
versions {
producer: 27
}

View File

@@ -1,54 +0,0 @@
// RUN: tf-opt -xla-hlo-materialize-quant %s | FileCheck %s
// CHECK-LABEL: func @quantize_rewrite
func @quantize_rewrite(%arg0: tensor<2x4xf32>) -> tensor<2x4xf32> {
// CHECK: %[[qcst:.*]] = constant dense<{{\[\[}}21004416], [-1056997248]]> : tensor<2x1xi32>
// CHECK-NEXT: %[[dq:.*]] = "xla_hlo.dequantize"(%[[qcst]]) {is_16bits = false, max_range = 0.996078431 : f32, min_range = -1.00392163 : f32,
// CHECK-SAME: mode = "MIN_COMBINED", transpose_output = false} : (tensor<2x1xi32>) -> tensor<2x4xbf16>
// CHECK-NEXT: %[[cast:.*]] = "xla_hlo.convert"(%[[dq]]) : (tensor<2x4xbf16>) -> tensor<2x4xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[cast]] : tensor<2x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x4xf32>
%w = constant dense<[[-1.0, -0.5, 0.0, 0.0], [0.5, 1.0, 0.0, 0.0]]> : tensor<2x4xf32>
%q = "quant.qcast"(%w) : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x4xf32>
return %mul: tensor<2x4xf32>
}
// CHECK-LABEL: func @quantize_small
func @quantize_small(%arg0: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK: %[[w:.*]] = constant dense<1.000000e+00> : tensor<1x4xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[w]] : tensor<1x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<1x4xf32>
%w = constant dense<1.0> : tensor<1x4xf32>
%q = "quant.qcast"(%w) : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<1x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<1x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<1x4xf32>
return %mul: tensor<1x4xf32>
}
// CHECK-LABEL: func @quantize_non_cst
func @quantize_non_cst(%arg0: tensor<2x4xf32>) -> tensor<2x4xf32> {
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %arg0 : tensor<2x4xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x4xf32>
%q = "quant.qcast"(%arg0) : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x4x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x4xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x4xf32>
return %mul: tensor<2x4xf32>
}
// CHECK-LABEL: func @quantize_non_4x
func @quantize_non_4x(%arg0: tensor<2x5xf32>) -> tensor<2x5xf32> {
// CHECK: %[[w:.*]] = constant dense<1.000000e+00> : tensor<2x5xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[w]] : tensor<2x5xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x5xf32>
%w = constant dense<1.0> : tensor<2x5xf32>
%q = "quant.qcast"(%w) : (tensor<2x5xf32>) -> tensor<2x5x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
%dq = "quant.dcast"(%q) : (tensor<2x5x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x5xf32>
%mul = xla_hlo.multiply %arg0, %dq : tensor<2x5xf32>
return %mul: tensor<2x5xf32>
}
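
The dequantize attributes checked in @quantize_rewrite above (min_range = -1.00392163, max_range = 0.996078431) come from the u8 type !quant.uniform<u8:f32, 0.0078431372549019607:128> via the same formula used in RewriteDequantize, min/max = scale * (storage_min/max - zero_point), with the CHECK values being the f32-rounded results. A quick check of that arithmetic:

#include <cstdio>

int main() {
  const double scale = 0.0078431372549019607;  // 2 / 255, from the test above
  const int zero_point = 128;
  const double min_range = scale * (0 - zero_point);    // about -1.00392157
  const double max_range = scale * (255 - zero_point);  // about  0.99607843
  std::printf("min_range = %f, max_range = %f\n", min_range, max_range);
  return 0;
}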

View File

@@ -1,69 +0,0 @@
// RUN: tf-opt -xla-hlo-propagate-quant %s | FileCheck %s --dump-input-on-failure
// -----
// CHECK-LABEL: @mul_add_source_no_params
func @mul_add_source_no_params(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [f32, f32, f32], logical_kernel = "generic.mul_add", output_specs = [f32]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [f32, f32, f32]
// CHECK-SAME: output_specs = [f32]
}
// -----
// CHECK-LABEL: @mul_add_annotated_no_narrow_range
func @mul_add_annotated_no_narrow_range(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0:-128>, !quant.uniform<i8:f32, 1.0:-128>, f32],
logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.0:-128>]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8:f32, 1.000000e+00:-128>, f32]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]
}
// -----
// CHECK-LABEL: @mul_add_annotated
func @mul_add_annotated(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4xf32>) -> (tensor<4xf32>) {
%region = "quant.region"(%arg0, %arg1, %arg2) ( {
^bb0(%arg3: tensor<4xf32>, %arg4: tensor<4xf32>, %arg5: tensor<4xf32>): // no predecessors
%mul = xla_hlo.multiply %arg3, %arg4 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
%add = xla_hlo.add %mul, %arg5 {broadcast_dimensions = dense<[]> : tensor<0xi64>} : tensor<4xf32>
"quant.return"(%add) : (tensor<4xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0:-128>, !quant.uniform<i8<-127:127>:f32, 1.0:-128>, f32],
logical_kernel = "generic.mul_add", output_specs = [!quant.uniform<i8:f32, 1.0:-128>]} :
(tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
return %region : tensor<4xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>, !quant.uniform<i8<-127:127>:f32, 1.000000e+00:-128>, !quant.uniform<i32:f32, 1.000000e+00>]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00:-128>]
}
// -----
// CHECK-LABEL: @same_scale_1_1
func @same_scale_1_1(%arg0: tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>) {
%region = "quant.region"(%arg0) ( {
^bb0(%arg1: tensor<1x7x7x64xf32>): // no predecessors
%r = "xla_hlo.reshape"(%arg1) : (tensor<1x7x7x64xf32>) -> (tensor<1x3136xf32>)
"quant.return"(%r) : (tensor<1x3136xf32>) -> ()
}) {input_specs = [!quant.uniform<i8:f32, 1.0>], logical_kernel = "generic.reshape", output_specs = [f32]} : (tensor<1x7x7x64xf32>) -> tensor<1x3136xf32>
return %region : tensor<1x3136xf32>
// CHECK: input_specs = [!quant.uniform<i8:f32, 1.000000e+00>]
// CHECK-SAME: output_specs = [!quant.uniform<i8:f32, 1.000000e+00>]
}

View File

@@ -1,25 +0,0 @@
// RUN: tf-opt -xla-hlo-propagate-quant %s | FileCheck %s
// CHECK-LABEL: func @mul
func @mul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
// CHECK: %[[w:.*]] = constant dense<{{\[\[}}-1.000000e+00, -5.000000e-01], [5.000000e-01, 1.000000e+00]]> : tensor<2x2xf32>
// CHECK-NEXT: %[[q:.*]] = "quant.qcast"(%[[w]]) : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<u8:f32, 0.0078431372549019607:128>>
// CHECK-NEXT: %[[dq:.*]] = "quant.dcast"(%[[q]]) : (tensor<2x2x!quant.uniform<u8:f32, 0.0078431372549019607:128>>) -> tensor<2x2xf32>
// CHECK-NEXT: %[[mul:.*]] = xla_hlo.multiply %arg0, %[[dq]] : tensor<2x2xf32>
// CHECK-NEXT: return %[[mul]] : tensor<2x2xf32>
%w = constant dense<[[-1.0, -0.5], [0.5, 1.0]]> : tensor<2x2xf32>
%mul = xla_hlo.multiply %arg0, %w : tensor<2x2xf32>
return %mul: tensor<2x2xf32>
}
// CHECK-LABEL: func @add
func @add(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
// CHECK: %[[b:.*]] = constant dense<1.000000e+00> : tensor<2xf32>
// CHECK-NEXT: %[[q:.*]] = "quant.qcast"(%[[b]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform<u8:f32, 0.0039215686274509803>>
// CHECK-NEXT: %[[dq:.*]] = "quant.dcast"(%[[q]]) : (tensor<2x!quant.uniform<u8:f32, 0.0039215686274509803>>) -> tensor<2xf32>
// CHECK-NEXT: %[[add:.*]] = "xla_hlo.add"(%arg0, %[[dq]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>, tensor<2xf32>) -> tensor<2x2xf32>
// CHECK-NEXT: return %[[add]] : tensor<2x2xf32>
%b = constant dense<1.0> : tensor<2xf32>
%add = "xla_hlo.add"(%arg0, %b) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>, tensor<2xf32>) -> tensor<2x2xf32>
return %add: tensor<2x2xf32>
}