Adds QuantizeAndDequantize kernel to Metal backend

* Fixed comment to match implementation.

PiperOrigin-RevId: 314684033
Change-Id: Ice38717811c8a5f6abf4a4fb65ecbdf7b9ed42ca
This commit is contained in:
Taehee Jeong 2020-06-04 00:58:02 -07:00 committed by TensorFlower Gardener
parent 1421933a1d
commit 282f234242
7 changed files with 336 additions and 7 deletions

View File

@ -37,11 +37,10 @@ namespace cl {
// on the GPU, which cannot represent int8 tensors.
//
// Implemented as:
// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale) + 0.5)
// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
// dq_value = qvalue * qscale + qmin
// Here, qmin, qmax & qscale refer to the quantization values as implemented in
// TensorFlow Lite's 'FakeQuant' kernel. round(x + 0.5) ensures we round away
// from zero.
// TensorFlow Lite's 'FakeQuant' kernel.
//
// NOTE: We do not need to nudge min/max values in this op, since they would
// already be adjusted while generating the quantized model.

View File

@ -30,11 +30,10 @@ namespace gl {
// on the GPU, which cannot represent int8 tensors.
//
// Implemented as:
// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale) + 0.5)
// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
// dq_value = qvalue * qscale + qmin
// Here, qmin, qmax & qscale refer to the quantization values as implemented in
// TensorFlow Lite's 'FakeQuant' kernel. round(x + 0.5) ensures we round away
// from zero.
// TensorFlow Lite's 'FakeQuant' kernel.
//
// NOTE: We do not need to nudge min/max values in this op, since they would
// already be adjusted while generating the quantized model.

View File

@ -39,6 +39,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/metal/kernels/padding.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/prelu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/relu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/reshape.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/resize.h"
@ -96,6 +97,12 @@ std::vector<ComputeTaskDescriptorPtr> SelectConvolutionTransposed(
}
}
// Dispatch helper for QUANTIZE_AND_DEQUANTIZE nodes: forwards the node id,
// tensor ids and quantization attributes straight to the Metal kernel
// implementation (no device- or shape-dependent selection is needed).
std::vector<ComputeTaskDescriptorPtr> SelectQuantizeAndDequantize(
    int id, ValueId input_id, ValueId output_id,
    const QuantizeAndDequantizeAttributes& attr) {
  auto tasks = QuantizeAndDequantize(id, input_id, output_id, attr);
  return tasks;
}
std::vector<ComputeTaskDescriptorPtr> SelectPReLU(
const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id,
const PReLUAttributes& attr, const metal::RuntimeOptions& options) {
@ -351,6 +358,12 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
*tasks = ReLU(node_id, inputs[0], outputs[0],
absl::any_cast<ReLUAttributes>(node->operation.attributes));
break;
case OperationType::QUANTIZE_AND_DEQUANTIZE:
*tasks = SelectQuantizeAndDequantize(
node_id, inputs[0], outputs[0],
absl::any_cast<QuantizeAndDequantizeAttributes>(
node->operation.attributes));
break;
case OperationType::RESHAPE:
*tasks = SelectReshape(
graph, node_id, inputs[0], outputs[0],
@ -427,7 +440,6 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
case OperationType::BATCH_TO_SPACE:
case OperationType::CONST:
case OperationType::LSTM:
case OperationType::QUANTIZE_AND_DEQUANTIZE:
case OperationType::SPACE_TO_BATCH:
case OperationType::TRANSPOSE:
case OperationType::UNKNOWN:

View File

@ -31,6 +31,7 @@ cc_library(
":padding",
":pooling",
":prelu",
":quantize_and_dequantize",
":relu",
":reshape",
":resize",
@ -539,6 +540,53 @@ ios_unit_test(
deps = [":prelu_test_lib"],
)
# Metal compute-task implementation of the QuantizeAndDequantize op.
cc_library(
name = "quantize_and_dequantize",
srcs = ["quantize_and_dequantize.cc"],
hdrs = ["quantize_and_dequantize.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
"//tensorflow/lite/delegates/gpu/metal:runtime_options",
"@com_google_absl//absl/strings",
],
)
# XCTest library for the kernel; Objective-C++ because the tests drive the
# Metal runtime through SingleOpModel.
objc_library(
name = "quantize_and_dequantize_test_lib",
testonly = 1,
srcs = ["quantize_and_dequantize_test.mm"],
sdk_frameworks = ["XCTest"],
deps = [
":quantize_and_dequantize",
":test_util",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
"//tensorflow/lite/delegates/gpu/metal:runtime_options",
# Provides NudgeQuantizationRange used to derive min/max/scale in tests.
"//tensorflow/lite/kernels/internal:quantization_util",
],
)
# iOS device/simulator test target; "notap" because it needs a GPU and
# cannot run on TAP's headless executors.
ios_unit_test(
name = "quantize_and_dequantize_test",
testonly = 1,
minimum_os_version = "11.0",
runner = tflite_ios_lab_runner("IOS_LATEST"),
tags = tf_gpu_tests_tags() + [
"notap",
"tflite_not_portable_android",
],
deps = [":quantize_and_dequantize_test_lib"],
)
cc_library(
name = "relu",
srcs = ["relu.cc"],

View File

@ -0,0 +1,54 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
namespace tflite {
namespace gpu {
namespace metal {
// Emits a linkable Metal task that emulates quantization error on float data:
// the value is clamped to [attr.min, attr.max], mapped onto the quantization
// grid (step attr.scale), rounded, and mapped back to float.
// Uniform float3 layout: x = attr.min, y = attr.max, z = attr.scale.
std::vector<ComputeTaskDescriptorPtr> QuantizeAndDequantize(
    int id, ValueId input_id, ValueId output_id,
    const QuantizeAndDequantizeAttributes& attr) {
  auto descriptor = std::make_shared<ComputeTaskDescriptor>();
  descriptor->id = id;
  // Linkable: this op is fused into the producer's shader rather than run as
  // a stand-alone dispatch.
  descriptor->is_linkable = true;
  descriptor->shader_source = R"(
FLT4 linkable$0(FLT4 value, int linear_index, uint3 gid, float3 params) {
  value = clamp(value, FLT4(params.x), FLT4(params.y));
  value = (value - FLT4(params.x)) / FLT4(params.z);
  return round(value) * FLT4(params.z) + FLT4(params.x);
}
)";
  descriptor->input_buffers = {{input_id}};
  descriptor->output_buffer = {output_id};
  // The attributes are baked into a small uniform buffer; capture by value so
  // the lambda stays valid after `attr` goes out of scope.
  descriptor->uniform_buffers = {
      {"constant float3&",
       [attr](const std::map<ValueId, BHWC>& buffers) {
         std::vector<float> params{attr.min, attr.max, attr.scale};
         return GetByteBuffer(params);
       }},
  };
  return {descriptor};
}
} // namespace metal
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,50 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
namespace tflite {
namespace gpu {
namespace metal {
// Performs the operation: {Quantize, Dequantize} on floating-point data.
// We need this operation to emulate the error introduced by quantization
// on the GPU, which cannot represent int8 tensors.
//
// Implemented as:
// qvalue = round((min(qmax, max(qmin, src_val)) - qmin) * (1/qscale))
// dq_value = qvalue * qscale + qmin
// Here, qmin, qmax & qscale refer to the quantization values as implemented in
// TensorFlow Lite's 'FakeQuant' kernel.
//
// NOTE: We do not need to nudge min/max values in this op, since they would
// already be adjusted while generating the quantized model.
std::vector<ComputeTaskDescriptorPtr> QuantizeAndDequantize(
int id, ValueId input_id, ValueId output_id,
const QuantizeAndDequantizeAttributes& attr);
} // namespace metal
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_QUANTIZE_AND_DEQUANTIZE_H_

View File

@ -0,0 +1,167 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#import <XCTest/XCTest.h>
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
using ::tflite::NudgeQuantizationRange;
using ::tflite::gpu::DataType;
using ::tflite::gpu::BHWC;
using ::tflite::gpu::OperationType;
using ::tflite::gpu::QuantizeAndDequantizeAttributes;
using ::tflite::gpu::TensorRef;
using ::tflite::gpu::metal::CompareVectors;
using ::tflite::gpu::metal::SingleOpModel;
// TODO: Add per-op test if possible.
@interface QuantizeAndDequantizeTest : XCTestCase
@end
@implementation QuantizeAndDequantizeTest
// No per-test fixtures are required; defer to XCTest's default setup.
- (void)setUp {
[super setUp];
}
// 8-bit quantize->dequantize round trip over a nudged [0, 1] range on a
// (1, 3, 2, 1) tensor. Expected values are the nearest representable points
// on the 8-bit grid (e.g. 0.25 -> 64/255 = 0.25098).
- (void)testDim2Bits8 {
  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 2, 1);

  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  const int num_bits = 8;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/*original_min=*/0.0, /*original_max=*/1.0, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);

  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 2, 1);

  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
  // Fix: `expected_output` was previously declared but never used — the same
  // literal was duplicated inside the CompareVectors call. Use the variable.
  std::vector<float> expected_output = {0.0f, 1.0f, 0.25098f, 0.498039f, 0.443137f, 0.0f};
  status = CompareVectors(expected_output, model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
// 8-bit round trip over a range spanning zero, [-0.9, 0.9], on a
// (1, 3, 1, 2) tensor; verifies negative inputs snap to the nudged grid.
- (void)testDim3Bits8_NegativeRange {
  const int num_bits = 8;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;

  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 1, 2);

  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 1, 2);

  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/*original_min=*/-0.9, /*original_max=*/0.9, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);

  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());

  status = CompareVectors({0.0f, -0.896471f, 0.247059f, 0.501176f, 0.444706f, 0.0f},
                          model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
// 16-bit round trip over [0, 1] on a (1, 3, 1, 2) tensor; with 65535 levels
// the dequantized values are nearly identical to the inputs.
- (void)testDim3Bits16 {
  const int num_bits = 16;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;

  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 1, 2);

  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 1, 2);

  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/*original_min=*/0.0, /*original_max=*/1.0, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);

  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, 1.0, 0.25, 0.50, 0.4444444, 0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());

  status = CompareVectors({0.0f, 1.0f, 0.250004f, 0.500008f, 0.44445f, 1.5259e-05f},
                          model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
// 16-bit round trip over [-0.9, 0.9] on a (1, 3, 2, 1) tensor; exercises the
// combination of a negative range with the fine-grained 16-bit grid.
- (void)testDim2Bits16_NegativeRange {
  const int num_bits = 16;
  const int quant_min = 0;
  const int quant_max = (1 << num_bits) - 1;

  TensorRef<BHWC> input;
  input.type = DataType::FLOAT32;
  input.ref = 0;
  input.shape = BHWC(1, 3, 2, 1);

  TensorRef<BHWC> output;
  output.type = DataType::FLOAT32;
  output.ref = 1;
  output.shape = BHWC(1, 3, 2, 1);

  // Unlike TFLite's FakeQuant kernel, we assume that the incoming values are
  // pre-nudged, since this should be done during model conversion.
  QuantizeAndDequantizeAttributes attr;
  NudgeQuantizationRange(/*original_min=*/-0.9, /*original_max=*/0.9, quant_min, quant_max,
                         &attr.min, &attr.max, &attr.scale);

  SingleOpModel model({ToString(OperationType::QUANTIZE_AND_DEQUANTIZE), attr}, {input}, {output});
  XCTAssertTrue(model.PopulateTensor(0, {0.0, -0.9, 0.25, 0.50, 0.4444444, -0.00001}));
  auto status = model.Invoke();
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());

  status = CompareVectors({0.0f, -0.900014f, 0.249998f, 0.499995f, 0.444431f, 0.0f},
                          model.GetOutput(0), 1e-6f);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
@end