Adds QuantizeAndDequantize kernel to OpenGL & OpenCL backends. This is not a TFLite op, but will be used to support inference on quantized models with future CLs.

PiperOrigin-RevId: 301229478 Change-Id: I7379a801ba355616a6730578a01c077253494670
2020-03-16 13:43:27 -07:00 · 2020-03-16 13:43:27 -07:00 · e61ff10d8b
commit e61ff10d8b
parent eb6b2831f8
16 changed files with 803 additions and 0 deletions
--- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@ -991,6 +991,45 @@ cc_test(
    ],
 )

+cc_library(
+    name = "quantize_and_dequantize",
+    srcs = ["quantize_and_dequantize.cc"],
+    hdrs = ["quantize_and_dequantize.h"],
+    deps = [
+        ":flt_type",
+        ":gpu_operation",
+        ":util",
+        "//tensorflow/lite/delegates/gpu/cl:cl_context",
+        "//tensorflow/lite/delegates/gpu/cl:cl_kernel",
+        "//tensorflow/lite/delegates/gpu/cl:linear_storage",
+        "//tensorflow/lite/delegates/gpu/common:data_type",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/common:tensor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:variant",
+    ],
+)
+
+cc_test(
+    name = "quantize_and_dequantize_test",
+    srcs = ["quantize_and_dequantize_test.cc"],
+    linkstatic = True,
+    tags = tf_gpu_tests_tags() + [
+        "linux",
+        "local",
+    ],
+    deps = [
+        ":cl_test",
+        ":quantize_and_dequantize",
+        "//tensorflow/lite/delegates/gpu/cl:tensor",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
    name = "relu",
    srcs = ["relu.cc"],
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc
@ -0,0 +1,128 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/variant.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
+#include "tensorflow/lite/delegates/gpu/common/tensor.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+// Builds the operation from quantization attributes. Each attribute is
+// wrapped in an FLT constructed with `scalar_precision` so that it can later
+// be declared and bound as a kernel argument.
+QuantizeAndDequantize::QuantizeAndDequantize(
+    const OperationDef& definition, const QuantizeAndDequantizeAttributes& attr,
+    CalculationsPrecision scalar_precision)
+    : ElementwiseOperation(definition),
+      min_(FLT(scalar_precision, attr.min)),
+      max_(FLT(scalar_precision, attr.max)),
+      scale_(FLT(scalar_precision, attr.scale)) {}
+
+// Move constructor: moves the base-class state and the three scalar kernel
+// arguments (min, max, scale) out of `operation`.
+// NOTE(review): `operation` is still read after being passed through
+// std::move to the base-class constructor; this presumably relies on the base
+// consuming only its own sub-object — confirm against ElementwiseOperation.
+QuantizeAndDequantize::QuantizeAndDequantize(QuantizeAndDequantize&& operation)
+    : ElementwiseOperation(std::move(operation)),
+      min_(std::move(operation.min_)),
+      max_(std::move(operation.max_)),
+      scale_(std::move(operation.scale_)) {}
+
+// Move assignment: takes over the scalar arguments first and the base-class
+// state last. Self-assignment leaves the object untouched.
+QuantizeAndDequantize& QuantizeAndDequantize::operator=(
+    QuantizeAndDequantize&& operation) {
+  if (this == &operation) {
+    return *this;
+  }
+  min_ = std::move(operation.min_);
+  max_ = std::move(operation.max_);
+  scale_ = std::move(operation.scale_);
+  ElementwiseOperation::operator=(std::move(operation));
+  return *this;
+}
+
+// Gives each scalar argument a name of the form
+// "quantize_and_dequantize_<field>_<index>".
+void QuantizeAndDequantize::SetLinkIndex(int index) {
+  const std::string prefix = "quantize_and_dequantize_";
+  min_.SetName(absl::StrCat(prefix, "min_", index));
+  max_.SetName(absl::StrCat(prefix, "max_", index));
+  scale_.SetName(absl::StrCat(prefix, "scale_", index));
+}
+
+// Returns one kernel statement that overwrites `context.var_name` with its
+// quantized-then-dequantized value:
+//   var = round((min(max_v, max(min_v, var)) - min_v) / scale) * scale + min_v
+std::string QuantizeAndDequantize::GetCoreCode(
+    const LinkingContext& context) const {
+  // Renders a scalar as an FLT4 expression: the argument name when the scalar
+  // is active, otherwise the given literal.
+  const auto as_flt4 = [](const FLT& scalar, const char* fallback) {
+    if (scalar.Active()) {
+      return absl::StrCat("(FLT4)(", scalar.GetName(), ")");
+    }
+    return absl::StrCat("(FLT4)(", fallback, ")");
+  };
+  const std::string step = as_flt4(scale_, "1.0f");
+  const std::string upper = as_flt4(max_, "0.0f");
+  const std::string lower = as_flt4(min_, "0.0f");
+
+  const std::string clamped = absl::StrCat("min(", upper, ", max(", lower,
+                                           ", ", context.var_name, "))");
+  const std::string quantized =
+      absl::StrCat("round((", clamped, " - ", lower, ") / ", step, ")");
+
+  return absl::StrCat(context.var_name, " = ", quantized, " * ", step, " + ",
+                      lower, ";\n");
+}
+
+// Returns the kernel-argument declarations for min, max and scale (in that
+// order), each preceded by a comma, a newline and four spaces.
+std::string QuantizeAndDequantize::GetArgsDeclaration() const {
+  const std::string separator = ",\n    ";
+  return absl::StrCat(separator, min_.GetDeclaration(),  //
+                      separator, max_.GetDeclaration(),  //
+                      separator, scale_.GetDeclaration());
+}
+
+// Passes min, max and scale to `kernel`, in the same order in which
+// GetArgsDeclaration() declares them. A failing call is propagated through
+// RETURN_IF_ERROR; otherwise OkStatus() is returned.
+// NOTE(review): presumably SetBytesAuto fills consecutive argument slots, so
+// this order has to stay in sync with the declaration order — confirm.
+Status QuantizeAndDequantize::BindArguments(CLKernel* kernel) {
+  RETURN_IF_ERROR(kernel->SetBytesAuto(min_));
+  RETURN_IF_ERROR(kernel->SetBytesAuto(max_));
+  RETURN_IF_ERROR(kernel->SetBytesAuto(scale_));
+  return OkStatus();
+}
+
+// Creates a QuantizeAndDequantize operation in `*result` and assigns it link
+// index 0. Always returns OkStatus().
+//
+// Scalars are built with F32 precision on PowerVR devices and with
+// `definition.precision` otherwise. When the definition uses F16 or F32_F16
+// and `attr.scale` is below 0.000062f, the scale is raised to that value.
+Status CreateQuantizeAndDequantize(const CreationContext& creation_context,
+                                   const OperationDef& definition,
+                                   const QuantizeAndDequantizeAttributes& attr,
+                                   QuantizeAndDequantize* result) {
+  // The smallest positive normal half-precision number is 2^-14 ~ 0.000062f;
+  // smaller scales are replaced by this value.
+  const float min_fp16_scale = 0.000062f;
+
+  const CalculationsPrecision scalar_precision =
+      creation_context.device->IsPowerVR() ? CalculationsPrecision::F32
+                                           : definition.precision;
+  const bool uses_fp16 =
+      definition.precision == CalculationsPrecision::F16 ||
+      definition.precision == CalculationsPrecision::F32_F16;
+
+  QuantizeAndDequantizeAttributes effective_attr = attr;
+  if (uses_fp16 && effective_attr.scale < min_fp16_scale) {
+    effective_attr.scale = min_fp16_scale;
+  }
+
+  *result = QuantizeAndDequantize(definition, effective_attr, scalar_precision);
+  result->SetLinkIndex(0);
+  return OkStatus();
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
@ -0,0 +1,100 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
+
+#include <string>
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
+#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
+#include "tensorflow/lite/delegates/gpu/common/data_type.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/tensor.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+// Performs the operation: {Quantize, Dequantize} on floating-point data.
+// We need this operation to emulate the error introduced by quantization
+// on the GPU, which cannot represent int8 tensors.
+//
+// Implemented as:
+// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) / qscale)
+// dq_value = qvalue * qscale + qmin
+// Here, qmin, qmax & qscale refer to the quantization values as implemented in
+// TensorFlow Lite's 'FakeQuant' kernel. OpenCL's round() resolves halfway
+// cases away from zero.
+//
+// NOTE: We do not need to nudge min/max values in this op, since they would
+// already be adjusted while generating the quantized model.
+class QuantizeAndDequantize : public ElementwiseOperation {
+ public:
+  QuantizeAndDequantize() = default;
+  // Move only
+  QuantizeAndDequantize(QuantizeAndDequantize&& operation);
+  QuantizeAndDequantize& operator=(QuantizeAndDequantize&& operation);
+  QuantizeAndDequantize(const QuantizeAndDequantize&) = delete;
+  QuantizeAndDequantize& operator=(const QuantizeAndDequantize&) = delete;
+
+  // Names the min/max/scale kernel arguments, using `index` as a suffix.
+  void SetLinkIndex(int index) override;
+  // Returns the kernel statement that rewrites `context.var_name` in place.
+  std::string GetCoreCode(const LinkingContext& context) const override;
+  // Returns the declarations of the min, max and scale kernel arguments.
+  std::string GetArgsDeclaration() const override;
+  // Passes min, max and scale to `kernel`.
+  Status BindArguments(CLKernel* kernel) override;
+
+  // Factory; needs friendship because the value constructor is private.
+  friend Status CreateQuantizeAndDequantize(
+      const CreationContext& creation_context, const OperationDef& definition,
+      const QuantizeAndDequantizeAttributes& attr,
+      QuantizeAndDequantize* result);
+
+ private:
+  // Wraps attr.min / attr.max / attr.scale in FLT values built with
+  // `scalar_precision`.
+  QuantizeAndDequantize(const OperationDef& definition,
+                        const QuantizeAndDequantizeAttributes& attr,
+                        CalculationsPrecision scalar_precision);
+
+  // NOTE(review): the definition in this header ignores both arguments and
+  // only returns OkStatus(); presumably a placeholder — confirm.
+  template <DataType T>
+  Status UploadParameters(const ::tflite::gpu::Tensor<Linear, T>& parameters,
+                          CLContext* context);
+
+  // Scalar kernel arguments: clamp bounds and quantization step.
+  FLT min_;
+  FLT max_;
+  FLT scale_;
+};
+
+// Creates a QuantizeAndDequantize operation from `attr` and stores it in
+// `*result`. `attr` is expected to hold already-nudged min/max/scale values.
+Status CreateQuantizeAndDequantize(const CreationContext& creation_context,
+                                   const OperationDef& definition,
+                                   const QuantizeAndDequantizeAttributes& attr,
+                                   QuantizeAndDequantize* result);
+
+// NOTE(review): as written, this helper fills a local LinearStorageCreateInfo
+// from the operation definition, never uses it, ignores both `parameters` and
+// `context`, and returns OkStatus(). Presumably leftover scaffolding from a
+// tensor-parameter variant of the op — confirm before relying on it.
+template <DataType T>
+Status QuantizeAndDequantize::UploadParameters(
+    const ::tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context) {
+  LinearStorageCreateInfo create_info;
+  create_info.storage_type =
+      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
+  create_info.data_type = definition_.GetPrimaryDataType();
+  return OkStatus();
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc
@ -0,0 +1,182 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h"
+
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+namespace {
+
+// 8-bit quantization of the [0, 1] range on a 1x3x2x1 tensor.
+TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits8) {
+  const BHWC shape(1, 3, 2, 1);
+  TensorFloat32 src_tensor;
+  src_tensor.shape = shape;
+  src_tensor.data = {0.0f, 1.0f, 0.25f, 0.50f, 0.4444444f, 0.00001f};
+  const std::vector<float> expected = {0.0f,      1.0f,      0.25098f,
+                                       0.498039f, 0.443137f, 0.0f};
+
+  // The op expects pre-nudged parameters (nudging is done at model conversion,
+  // unlike in TFLite's FakeQuant kernel), so the range is nudged here.
+  const int bit_width = 8;
+  const int q_lo = 0;
+  const int q_hi = (1 << bit_width) - 1;
+  QuantizeAndDequantizeAttributes attr;
+  NudgeQuantizationRange(0.0, 1.0, q_lo, q_hi, &attr.min, &attr.max,
+                         &attr.scale);
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      const bool is_f32 = precision == CalculationsPrecision::F32;
+      const float tolerance = is_f32 ? 1e-6f : 1e-2f;
+      const auto data_type = DeduceDataTypeFromPrecision(precision);
+
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+
+      QuantizeAndDequantize operation;
+      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
+                                            &operation));
+      TensorFloat32 dst_tensor;
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    shape, &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(tolerance), expected));
+    }
+  }
+}
+
+// 8-bit quantization of the [-0.9, 0.9] range on a 1x3x1x2 tensor.
+TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits8_NegativeRange) {
+  const BHWC shape(1, 3, 1, 2);
+  TensorFloat32 src_tensor;
+  src_tensor.shape = shape;
+  src_tensor.data = {0.0f, -0.9f, 0.25f, 0.50f, 0.4444444f, -0.00001f};
+  const std::vector<float> expected = {0.0f,      -0.896471f, 0.247059f,
+                                       0.501176f, 0.444706f,  0.0f};
+
+  // The op expects pre-nudged parameters (nudging is done at model conversion,
+  // unlike in TFLite's FakeQuant kernel), so the range is nudged here.
+  const int bit_width = 8;
+  const int q_lo = 0;
+  const int q_hi = (1 << bit_width) - 1;
+  QuantizeAndDequantizeAttributes attr;
+  NudgeQuantizationRange(-0.9, 0.9, q_lo, q_hi, &attr.min, &attr.max,
+                         &attr.scale);
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      const bool is_f32 = precision == CalculationsPrecision::F32;
+      const float tolerance = is_f32 ? 1e-6f : 1e-2f;
+      const auto data_type = DeduceDataTypeFromPrecision(precision);
+
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+
+      QuantizeAndDequantize operation;
+      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
+                                            &operation));
+      TensorFloat32 dst_tensor;
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    shape, &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(tolerance), expected));
+    }
+  }
+}
+
+// 16-bit quantization of the [0, 1] range on a 1x3x1x2 tensor.
+TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits16) {
+  const BHWC shape(1, 3, 1, 2);
+  TensorFloat32 src_tensor;
+  src_tensor.shape = shape;
+  src_tensor.data = {0.0f, 1.0f, 0.25f, 0.50f, 0.4444444f, 0.00001f};
+  const std::vector<float> expected = {0.0f,      1.0f,     0.250004f,
+                                       0.500008f, 0.44445f, 1.5259e-05f};
+
+  // The op expects pre-nudged parameters (nudging is done at model conversion,
+  // unlike in TFLite's FakeQuant kernel), so the range is nudged here.
+  const int bit_width = 16;
+  const int q_lo = 0;
+  const int q_hi = (1 << bit_width) - 1;
+  QuantizeAndDequantizeAttributes attr;
+  NudgeQuantizationRange(0.0, 1.0, q_lo, q_hi, &attr.min, &attr.max,
+                         &attr.scale);
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      const bool is_f32 = precision == CalculationsPrecision::F32;
+      const float tolerance = is_f32 ? 1e-6f : 1e-3f;
+      const auto data_type = DeduceDataTypeFromPrecision(precision);
+
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+
+      QuantizeAndDequantize operation;
+      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
+                                            &operation));
+      TensorFloat32 dst_tensor;
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    shape, &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(tolerance), expected));
+    }
+  }
+}
+
+// 16-bit quantization of the [-0.9, 0.9] range on a 1x3x2x1 tensor.
+TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits16_NegativeRange) {
+  const BHWC shape(1, 3, 2, 1);
+  TensorFloat32 src_tensor;
+  src_tensor.shape = shape;
+  src_tensor.data = {0.0f, -0.9f, 0.25f, 0.50f, 0.4444444f, -0.00001f};
+  const std::vector<float> expected = {0.0f,      -0.900014f, 0.249998f,
+                                       0.499995f, 0.444431f,  0.0f};
+
+  // The op expects pre-nudged parameters (nudging is done at model conversion,
+  // unlike in TFLite's FakeQuant kernel), so the range is nudged here.
+  const int bit_width = 16;
+  const int q_lo = 0;
+  const int q_hi = (1 << bit_width) - 1;
+  QuantizeAndDequantizeAttributes attr;
+  NudgeQuantizationRange(-0.9, 0.9, q_lo, q_hi, &attr.min, &attr.max,
+                         &attr.scale);
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      const bool is_f32 = precision == CalculationsPrecision::F32;
+      const float tolerance = is_f32 ? 1e-6f : 1e-2f;
+      const auto data_type = DeduceDataTypeFromPrecision(precision);
+
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+
+      QuantizeAndDequantize operation;
+      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
+                                            &operation));
+      TensorFloat32 dst_tensor;
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    shape, &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(tolerance), expected));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD
@ -132,6 +132,7 @@ cc_library(
        "//tensorflow/lite/delegates/gpu/cl/kernels:padding",
        "//tensorflow/lite/delegates/gpu/cl/kernels:pooling",
        "//tensorflow/lite/delegates/gpu/cl/kernels:prelu",
+        "//tensorflow/lite/delegates/gpu/cl/kernels:quantize_and_dequantize",
        "//tensorflow/lite/delegates/gpu/cl/kernels:relu",
        "//tensorflow/lite/delegates/gpu/cl/kernels:reshape",
        "//tensorflow/lite/delegates/gpu/cl/kernels:reshapex4",
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@ -279,6 +279,12 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
      auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes);
      return SelectPReLU(attr, creation_context, op_def, gpu_op);
    }
+    case OperationType::QUANTIZE_AND_DEQUANTIZE: {
+      auto attr = absl::any_cast<QuantizeAndDequantizeAttributes>(
+          node.operation.attributes);
+      return SelectQuantizeAndDequantize(attr, creation_context, op_def,
+                                         gpu_op);
+    }
    case OperationType::RELU: {
      auto attr = absl::any_cast<ReLUAttributes>(node.operation.attributes);
      SelectReLU(creation_context, attr, op_def, gpu_op);
--- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc
@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/prelu.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/relu.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/reshape.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.h"
@ -218,6 +219,17 @@ Status SelectWinograd36To4x4(
  return OkStatus();
 }

+// Creates a QuantizeAndDequantize kernel for `attr` and transfers ownership of
+// it to `*ptr`. A creation error is returned as-is, leaving `*ptr` untouched.
+Status SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
+                                   const CreationContext& creation_context,
+                                   const OperationDef& op_def,
+                                   std::unique_ptr<GPUOperation>* ptr) {
+  QuantizeAndDequantize quantize_op;
+  RETURN_IF_ERROR(CreateQuantizeAndDequantize(creation_context, op_def, attr,
+                                              &quantize_op));
+  auto owned =
+      absl::make_unique<QuantizeAndDequantize>(std::move(quantize_op));
+  *ptr = std::move(owned);
+  return OkStatus();
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h
@ -100,6 +100,11 @@ Status SelectWinograd36To4x4(
    const ::tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
    std::unique_ptr<GPUOperation>* ptr);

+// Selects the OpenCL kernel for a QUANTIZE_AND_DEQUANTIZE node: builds the
+// operation from `attr` / `op_def` and stores it in `*ptr`.
+Status SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
+                                   const CreationContext& creation_context,
+                                   const OperationDef& op_def,
+                                   std::unique_ptr<GPUOperation>* ptr);
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/common/operations.cc
+++ b/tensorflow/lite/delegates/gpu/common/operations.cc
@ -118,6 +118,8 @@ std::string ToString(enum OperationType op) {
      return "pow";
    case OperationType::PRELU:
      return "prelu";
+    case OperationType::QUANTIZE_AND_DEQUANTIZE:
+      return "quantize_and_dequantize";
    case OperationType::RELU:
      return "relu";
    case OperationType::RESHAPE:
@ -183,6 +185,7 @@ OperationType OperationTypeFromString(const std::string& name) {
          {"pooling_2d", OperationType::POOLING_2D},
          {"pow", OperationType::POW},
          {"prelu", OperationType::PRELU},
+          {"quantize_and_dequantize", OperationType::QUANTIZE_AND_DEQUANTIZE},
          {"relu", OperationType::RELU},
          {"resize", OperationType::RESIZE},
          {"reshape", OperationType::RESHAPE},
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@ -57,6 +57,8 @@ enum class OperationType {
  POOLING_2D,
  POW,
  PRELU,
+  // Used to accurately run inference on quantized models.
+  QUANTIZE_AND_DEQUANTIZE,
  RELU,
  RESHAPE,
  RESIZE,
@ -478,6 +480,14 @@ struct SpaceToDepthAttributes {
  int block_size;
 };

+// Parameters of the combined Quantize + Dequantize step, which adjusts float
+// values the way quantized inference would.
+struct QuantizeAndDequantizeAttributes {
+  // Lower clamp bound; also the offset added back when dequantizing.
+  float min = 0.0f;
+  // Upper clamp bound.
+  float max = 0.0f;
+  // Quantization step: values are divided by it, rounded, then multiplied back.
+  float scale = 0.0f;
+};
+
 }  // namespace gpu
 }  // namespace tflite

--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@ -451,6 +451,38 @@ cc_test(
    ],
 )

+cc_library(
+    name = "quantize_and_dequantize",
+    srcs = ["quantize_and_dequantize.cc"],
+    hdrs = ["quantize_and_dequantize.h"],
+    deps = [
+        "//tensorflow/lite/delegates/gpu/common:convert",
+        "//tensorflow/lite/delegates/gpu/common:data_type",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:shape",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/common:types",
+        "//tensorflow/lite/delegates/gpu/gl:node_shader",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_test(
+    name = "quantize_and_dequantize_test",
+    srcs = ["quantize_and_dequantize_test.cc"],
+    tags = tf_gpu_tests_tags() + [
+        "notap",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_and_dequantize",
+        ":test_util",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
    name = "relu",
    srcs = ["relu.cc"],
@ -699,6 +731,7 @@ TFLITE_GPU_BINARY_RELEASE_OPERATORS = [
    "pad",
    "pooling",
    "prelu",
+    "quantize_and_dequantize",
    "relu",
    "mean",
    "reshape",
--- a/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.cc
@ -0,0 +1,74 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/delegates/gpu/common/data_type.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/types.h"
+
+namespace tflite {
+namespace gpu {
+namespace gl {
+namespace {
+
+// Node shader that clamps value_0 to [quant_min, quant_max], snaps it to the
+// nearest multiple of quant_scale above quant_min, and writes it back.
+class QuantizeAndDequantize : public NodeShader {
+ public:
+  // Fills `generated_code` with the shader source and its three uniform
+  // parameters taken from the node's QuantizeAndDequantizeAttributes.
+  Status GenerateCode(const GenerationContext& ctx,
+                      GeneratedCode* generated_code) const final {
+    const auto& attr = absl::any_cast<const QuantizeAndDequantizeAttributes&>(
+        ctx.node->operation.attributes);
+
+    // Adjacent literals are concatenated into a single source string.
+    const std::string shader_source =
+        // Constants
+        "vec4 scale = vec4($quant_scale$);"
+        "vec4 min_bound = vec4($quant_min$);"
+        "vec4 max_bound = vec4($quant_max$);"
+        // Quantize
+        "value_0 = clamp(value_0, min_bound, max_bound);"
+        "value_0 = (value_0 - min_bound) / scale;"
+        "value_0 = floor(value_0 + vec4(0.5));"
+        // Dequantize
+        "value_0 = value_0 * scale + min_bound;";
+
+    *generated_code = {
+        /*parameters=*/{{"quant_min", attr.min},
+                        {"quant_max", attr.max},
+                        {"quant_scale", attr.scale}},
+        /*objects=*/{},
+        /*shared_variables=*/{},
+        /*workload=*/uint3(),
+        /*workgroup=*/uint3(),
+        /*source_code=*/shader_source,
+        /*input=*/IOStructure::AUTO,
+        /*output=*/IOStructure::AUTO,
+    };
+    return OkStatus();
+  }
+};
+
+}  // namespace
+
+// Returns a newly allocated QuantizeAndDequantize node shader, owned by the
+// caller through the returned unique_ptr.
+std::unique_ptr<NodeShader> NewQuantizeAndDequantizeNodeShader() {
+  return absl::make_unique<QuantizeAndDequantize>();
+}
+
+}  // namespace gl
+}  // namespace gpu
+}  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h
@ -0,0 +1,47 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
+
+#include <memory>
+
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
+
+namespace tflite {
+namespace gpu {
+namespace gl {
+
+// Performs the operation: {Quantize, Dequantize} on floating-point data.
+// We need this operation to emulate the error introduced by quantization
+// on the GPU, which cannot represent int8 tensors.
+//
+// Implemented as:
+// qvalue = floor((clamp(src_val, qmin, qmax) - qmin) / qscale + 0.5)
+// dq_value = qvalue * qscale + qmin
+// Here, qmin, qmax & qscale refer to the quantization values as implemented in
+// TensorFlow Lite's 'FakeQuant' kernel. floor(x + 0.5) rounds x to the
+// nearest integer, with halfway cases rounded up.
+//
+// NOTE: We do not need to nudge min/max values in this op, since they would
+// already be adjusted while generating the quantized model.
+std::unique_ptr<NodeShader> NewQuantizeAndDequantizeNodeShader();
+
+}  // namespace gl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
--- a/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize_test.cc
@ -0,0 +1,159 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/gl/kernels/test_util.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+namespace tflite {
+namespace gpu {
+namespace gl {
+namespace {
+
+TEST(QuantizeAndDequantizeTest, Dim2Bits8) {
+  // Unlike TFLite's FakeQuant kernel, the op assumes its min/max/scale were
+  // already nudged (this should be done during model conversion), so the test
+  // nudges the 8-bit range for [0, 1] itself before building the op.
+  constexpr int kNumBits = 8;
+  constexpr int kQuantMin = 0;
+  constexpr int kQuantMax = (1 << kNumBits) - 1;
+  QuantizeAndDequantizeAttributes attrs;
+  NudgeQuantizationRange(/*original_min*/ 0.0, /*original_max*/ 1.0, kQuantMin,
+                         kQuantMax, &attrs.min, &attrs.max, &attrs.scale);
+
+  TensorRef<BHWC> src;
+  src.type = DataType::FLOAT32;
+  src.ref = 0;
+  src.shape = BHWC(1, 3, 2, 1);
+
+  TensorRef<BHWC> dst;
+  dst.type = DataType::FLOAT32;
+  dst.ref = 1;
+  dst.shape = BHWC(1, 3, 2, 1);
+
+  SingleOpModel model(
+      {ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attrs}, {src}, {dst});
+  ASSERT_TRUE(
+      model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
+  ASSERT_OK(model.Invoke(*NewQuantizeAndDequantizeNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6),
+                        {0.0f, 1.0f, 0.25098f, 0.498039f, 0.443137f, 0.0f}));
+}
+
+TEST(QuantizeAndDequantizeTest, Dim3Bits8_NegativeRange) {
+  // Unlike TFLite's FakeQuant kernel, the op assumes its min/max/scale were
+  // already nudged (this should be done during model conversion), so the test
+  // nudges the 8-bit range for [-0.9, 0.9] itself before building the op.
+  constexpr int kNumBits = 8;
+  constexpr int kQuantMin = 0;
+  constexpr int kQuantMax = (1 << kNumBits) - 1;
+  QuantizeAndDequantizeAttributes attrs;
+  NudgeQuantizationRange(/*original_min*/ -0.9, /*original_max*/ 0.9, kQuantMin,
+                         kQuantMax, &attrs.min, &attrs.max, &attrs.scale);
+
+  TensorRef<BHWC> src;
+  src.type = DataType::FLOAT32;
+  src.ref = 0;
+  src.shape = BHWC(1, 3, 1, 2);
+
+  TensorRef<BHWC> dst;
+  dst.type = DataType::FLOAT32;
+  dst.ref = 1;
+  dst.shape = BHWC(1, 3, 1, 2);
+
+  SingleOpModel model(
+      {ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attrs}, {src}, {dst});
+  ASSERT_TRUE(
+      model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
+  ASSERT_OK(model.Invoke(*NewQuantizeAndDequantizeNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {0.0f, -0.896471f, 0.247059f,
+                                          0.501176f, 0.444706f, 0.0f}));
+}
+
+TEST(QuantizeAndDequantizeTest, Dim3Bits16) {
+  // Unlike TFLite's FakeQuant kernel, the op assumes its min/max/scale were
+  // already nudged (this should be done during model conversion), so the test
+  // nudges the 16-bit range for [0, 1] itself before building the op.
+  constexpr int kNumBits = 16;
+  constexpr int kQuantMin = 0;
+  constexpr int kQuantMax = (1 << kNumBits) - 1;
+  QuantizeAndDequantizeAttributes attrs;
+  NudgeQuantizationRange(/*original_min*/ 0.0, /*original_max*/ 1.0, kQuantMin,
+                         kQuantMax, &attrs.min, &attrs.max, &attrs.scale);
+
+  TensorRef<BHWC> src;
+  src.type = DataType::FLOAT32;
+  src.ref = 0;
+  src.shape = BHWC(1, 3, 1, 2);
+
+  TensorRef<BHWC> dst;
+  dst.type = DataType::FLOAT32;
+  dst.ref = 1;
+  dst.shape = BHWC(1, 3, 1, 2);
+
+  SingleOpModel model(
+      {ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attrs}, {src}, {dst});
+  ASSERT_TRUE(
+      model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
+  ASSERT_OK(model.Invoke(*NewQuantizeAndDequantizeNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {0.0f, 1.0f, 0.250004f, 0.500008f,
+                                          0.44445f, 1.5259e-05f}));
+}
+
+TEST(QuantizeAndDequantizeTest, Dim2Bits16_NegativeRange) {
+  // Unlike TFLite's FakeQuant kernel, the op assumes its min/max/scale were
+  // already nudged (this should be done during model conversion), so the test
+  // nudges the 16-bit range for [-0.9, 0.9] itself before building the op.
+  constexpr int kNumBits = 16;
+  constexpr int kQuantMin = 0;
+  constexpr int kQuantMax = (1 << kNumBits) - 1;
+  QuantizeAndDequantizeAttributes attrs;
+  NudgeQuantizationRange(/*original_min*/ -0.9, /*original_max*/ 0.9, kQuantMin,
+                         kQuantMax, &attrs.min, &attrs.max, &attrs.scale);
+
+  TensorRef<BHWC> src;
+  src.type = DataType::FLOAT32;
+  src.ref = 0;
+  src.shape = BHWC(1, 3, 2, 1);
+
+  TensorRef<BHWC> dst;
+  dst.type = DataType::FLOAT32;
+  dst.ref = 1;
+  dst.shape = BHWC(1, 3, 2, 1);
+
+  SingleOpModel model(
+      {ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attrs}, {src}, {dst});
+  ASSERT_TRUE(
+      model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
+  ASSERT_OK(model.Invoke(*NewQuantizeAndDequantizeNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {0.0f, -0.900014f, 0.249998f,
+                                          0.499995f, 0.444431f, 0.0f}));
+}
+
+}  // namespace
+}  // namespace gl
+}  // namespace gpu
+}  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc
@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/gl/kernels/pad.h"
 #include "tensorflow/lite/delegates/gpu/gl/kernels/pooling.h"
 #include "tensorflow/lite/delegates/gpu/gl/kernels/prelu.h"
+#include "tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h"
 #include "tensorflow/lite/delegates/gpu/gl/kernels/relu.h"
 #include "tensorflow/lite/delegates/gpu/gl/kernels/reshape.h"
 #include "tensorflow/lite/delegates/gpu/gl/kernels/resize.h"
@ -85,6 +86,8 @@ class Registry : public NodeShader {
    insert_op(Type::PAD, NewPadNodeShader);
    insert_op(Type::POOLING_2D, NewPoolingNodeShader);
    insert_op(Type::PRELU, NewPReLUNodeShader);
+    insert_op(Type::QUANTIZE_AND_DEQUANTIZE,
+              NewQuantizeAndDequantizeNodeShader);
    insert_op(Type::RELU, NewReLUNodeShader);
    insert_op(Type::RESIZE, NewResizeNodeShader);
    insert_op(Type::RESHAPE, NewReshapeNodeShader);
--- a/tensorflow/lite/delegates/gpu/metal/api.cc
+++ b/tensorflow/lite/delegates/gpu/metal/api.cc
@ -305,6 +305,7 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
    case OperationType::BATCH_TO_SPACE:
    case OperationType::CONST:
    case OperationType::LSTM:
+    case OperationType::QUANTIZE_AND_DEQUANTIZE:
    case OperationType::SPACE_TO_BATCH:
    case OperationType::TRANSPOSE:
    case OperationType::UNKNOWN: