Adds QuantizeAndDequantize kernel to Metal backend

* Fixed comment to match implementation. PiperOrigin-RevId: 314684033 Change-Id: Ice38717811c8a5f6abf4a4fb65ecbdf7b9ed42ca
2020-06-04 00:58:02 -07:00 · 2020-06-04 00:58:02 -07:00 · 282f234242
commit 282f234242
parent 1421933a1d
7 changed files with 336 additions and 7 deletions
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
@ -37,11 +37,10 @@ namespace cl {
 // on the GPU, which cannot represent int8 tensors.
 //
 // Implemented as:
-// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale) + 0.5)
+// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
 // dq_value = qvalue * qscale + qmin
 // Here, qmin, qmax & qscale refer to the quantization values as implemented in
-// TensorFlow Lite's 'FakeQuant' kernel. round(x + 0.5) ensures we round away
+// TensorFlow Lite's 'FakeQuant' kernel.
 // from zero.
 //
 // NOTE: We do not need to nudge min/max values in this op, since they would
 // already be adjusted while generating the quantized model.
--- a/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/quantize_and_dequantize.h
@ -30,11 +30,10 @@ namespace gl {
 // on the GPU, which cannot represent int8 tensors.
 //
 // Implemented as:
-// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale) + 0.5)
+// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
 // dq_value = qvalue * qscale + qmin
 // Here, qmin, qmax & qscale refer to the quantization values as implemented in
-// TensorFlow Lite's 'FakeQuant' kernel. round(x + 0.5) ensures we round away
+// TensorFlow Lite's 'FakeQuant' kernel.
 // from zero.
 //
 // NOTE: We do not need to nudge min/max values in this op, since they would
 // already be adjusted while generating the quantized model.
--- a/tensorflow/lite/delegates/gpu/metal/api.cc
+++ b/tensorflow/lite/delegates/gpu/metal/api.cc
@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/metal/kernels/padding.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/prelu.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/relu.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/reshape.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/resize.h"
@ -96,6 +97,12 @@ std::vector<ComputeTaskDescriptorPtr> SelectConvolutionTransposed(
  }
 }
 std::vector<ComputeTaskDescriptorPtr> SelectQuantizeAndDequantize(
    int id, ValueId input_id, ValueId output_id,
    const QuantizeAndDequantizeAttributes& attr) {
  return QuantizeAndDequantize(id, input_id, output_id, attr);
 }
 std::vector<ComputeTaskDescriptorPtr> SelectPReLU(
    const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id,
    const PReLUAttributes& attr, const metal::RuntimeOptions& options) {
@ -351,6 +358,12 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
      *tasks = ReLU(node_id, inputs[0], outputs[0],
                    absl::any_cast<ReLUAttributes>(node->operation.attributes));
      break;
    case OperationType::QUANTIZE_AND_DEQUANTIZE:
      *tasks = SelectQuantizeAndDequantize(
          node_id, inputs[0], outputs[0],
          absl::any_cast<QuantizeAndDequantizeAttributes>(
              node->operation.attributes));
      break;
    case OperationType::RESHAPE:
      *tasks = SelectReshape(
          graph, node_id, inputs[0], outputs[0],
@ -427,7 +440,6 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
    case OperationType::BATCH_TO_SPACE:
    case OperationType::CONST:
    case OperationType::LSTM:
    case OperationType::QUANTIZE_AND_DEQUANTIZE:
    case OperationType::SPACE_TO_BATCH:
    case OperationType::TRANSPOSE:
    case OperationType::UNKNOWN:
--- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
@ -31,6 +31,7 @@ cc_library(
        ":padding",
        ":pooling",
        ":prelu",
        ":quantize_and_dequantize",
        ":relu",
        ":reshape",
        ":resize",
@ -539,6 +540,53 @@ ios_unit_test(
    deps = [":prelu_test_lib"],
 )
 cc_library(
    name = "quantize_and_dequantize",
    srcs = ["quantize_and_dequantize.cc"],
    hdrs = ["quantize_and_dequantize.h"],
    deps = [
        "//tensorflow/lite/delegates/gpu/common:model",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
        "//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
        "//tensorflow/lite/delegates/gpu/metal:runtime_options",
        "@com_google_absl//absl/strings",
    ],
 )
 objc_library(
    name = "quantize_and_dequantize_test_lib",
    testonly = 1,
    srcs = ["quantize_and_dequantize_test.mm"],
    sdk_frameworks = ["XCTest"],
    deps = [
        ":quantize_and_dequantize",
        ":test_util",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:shape",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:tensor",
        "//tensorflow/lite/delegates/gpu/common:util",
        "//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
        "//tensorflow/lite/delegates/gpu/metal:runtime_options",
        "//tensorflow/lite/kernels/internal:quantization_util",
    ],
 )
 ios_unit_test(
    name = "quantize_and_dequantize_test",
    testonly = 1,
    minimum_os_version = "11.0",
    runner = tflite_ios_lab_runner("IOS_LATEST"),
    tags = tf_gpu_tests_tags() + [
        "notap",
        "tflite_not_portable_android",
    ],
    deps = [":quantize_and_dequantize_test_lib"],
 )
 cc_library(
    name = "relu",
    srcs = ["relu.cc"],
--- a/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.cc
@ -0,0 +1,54 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
 #include "tensorflow/lite/delegates/gpu/common/model.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 namespace tflite {
 namespace gpu {
 namespace metal {
 std::vector<ComputeTaskDescriptorPtr> QuantizeAndDequantize(
    int id, ValueId input_id, ValueId output_id,
    const QuantizeAndDequantizeAttributes& attr) {
  auto desc = std::make_shared<ComputeTaskDescriptor>();
  desc->id = id;
  desc->is_linkable = true;
  desc->shader_source = R"(
    FLT4 linkable$0(FLT4 value, int linear_index, uint3 gid, float3 params) {
      value = clamp(value, FLT4(params.x), FLT4(params.y));
      value = (value - FLT4(params.x)) / FLT4(params.z);
      return round(value) * FLT4(params.z) + FLT4(params.x);
    }
  )";
  desc->input_buffers = {{input_id}};
  desc->output_buffer = {output_id};
  desc->uniform_buffers = {
      {"constant float3&",
       [attr](const std::map<ValueId, BHWC>& buffers) {
         return GetByteBuffer(
             std::vector<float>{attr.min, attr.max, attr.scale});
       }},
  };
  return {desc};
 }
 }  // namespace metal
 }  // namespace gpu
 }  // namespace tflite
--- a/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h
@ -0,0 +1,50 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
 #include <vector>
 #include "tensorflow/lite/delegates/gpu/common/model.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
 namespace tflite {
 namespace gpu {
 namespace metal {
 // Performs the operation: {Quantize, Dequantize} on floating-point data.
 // We need this operation to emulate the error introduced by quantization
 // on the GPU, which cannot represent int8 tensors.
 //
 // Implemented as:
 // qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
 // dq_value = qvalue * qscale + qmin
 // Here, qmin, qmax & qscale refer to the quantization values as implemented in
 // TensorFlow Lite's 'FakeQuant' kernel.
 //
 // NOTE: We do not need to nudge min/max values in this op, since they would
 // already be adjusted while generating the quantized model.
 std::vector<ComputeTaskDescriptorPtr> QuantizeAndDequantize(
    int id, ValueId input_id, ValueId output_id,
    const QuantizeAndDequantizeAttributes& attr);
 }  // namespace metal
 }  // namespace gpu
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
--- a/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize_test.mm
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize_test.mm
@ -0,0 +1,167 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #import <XCTest/XCTest.h>
 #include <string>
 #include <vector>
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
 #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 using ::tflite::NudgeQuantizationRange;
 using ::tflite::gpu::DataType;
 using ::tflite::gpu::BHWC;
 using ::tflite::gpu::OperationType;
 using ::tflite::gpu::QuantizeAndDequantizeAttributes;
 using ::tflite::gpu::TensorRef;
 using ::tflite::gpu::metal::CompareVectors;
 using ::tflite::gpu::metal::SingleOpModel;
 // TODO: Add per-op test if possible.
@interface QuantizeAndDequantizeTest : XCTestCase
@end
@implementation QuantizeAndDequantizeTest
 - (void)setUp {
  [super setUp];
 }
 - (void)testDim2Bits8 {
  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 2, 1);
  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  const int num_bits = 8;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/**original_min**/ 0.0, /**original_max**/ 1.0, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);
  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 2, 1);
  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
  std::vector<float> expected_output = {0.0f, 1.0f, 0.25098f, 0.498039f, 0.443137f, 0.0f};
  status =
      CompareVectors({0.0f, 1.0f, 0.25098f, 0.498039f, 0.443137f, 0.0f}, model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
 - (void)testDim3Bits8_NegativeRange {
  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 1, 2);
  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  const int num_bits = 8;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/**original_min**/ -0.9, /**original_max**/ 0.9, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);
  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 1, 2);
  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
  status = CompareVectors({0.0f, -0.896471f, 0.247059f, 0.501176f, 0.444706f, 0.0f},
                          model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
 - (void)testDim3Bits16 {
  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 1, 2);
  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  const int num_bits = 16;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/**original_min**/ 0.0, /**original_max**/ 1.0, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);
  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 1, 2);
  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
  status = CompareVectors({0.0f, 1.0f, 0.250004f, 0.500008f, 0.44445f, 1.5259e-05f},
                          model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
 - (void)testDim2Bits16_NegativeRange {
  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 2, 1);
  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  const int num_bits = 16;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/**original_min**/ -0.9, /**original_max**/ 0.9, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);
  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 2, 1);
  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
  status =
      CompareVectors({0.0f, -0.900014f, 0.249998f, 0.499995f, 0.444431f, 0.0f}, model.GetOutput(0),
                     1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
@end