Optimize int8 depthwise_conv. It is not yet as fast as the uint8 variant for MobileNet because the specialized 3x3 kernel path is not done yet.
PiperOrigin-RevId: 243555952
parent a41e83060f
commit 9b0656f8ac
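Background for the diff below: the int8 per-channel kernels carry one fixed-point (multiplier, shift) pair per output channel, derived from real_multiplier = input_scale * filter_scale[c] / output_scale and stored in per_channel_output_multiplier / per_channel_output_shift. A minimal sketch of that derivation and of requantizing an int32 accumulator back to int8; QuantizeMultiplierSketch and RequantizeSketch are illustrative names, and the double-based requantize is for clarity only (the shipped kernels use fixed-point arithmetic such as MultiplyByQuantizedMultiplier instead):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Decompose real_multiplier into a Q31 fixed-point multiplier and a left
// shift (negative means right shift), mirroring what TFLite's
// QuantizeMultiplier computes for each channel.
void QuantizeMultiplierSketch(double real_multiplier, int32_t* multiplier,
                              int* shift) {
  if (real_multiplier == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) {  // rounding pushed q up to exactly 1.0
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}

// Scale an int32 accumulator back to int8 (double-based for clarity).
int8_t RequantizeSketch(int32_t acc, int32_t multiplier, int shift,
                        int32_t output_offset) {
  const double real = static_cast<double>(multiplier) / (1ll << 31) *
                      std::pow(2.0, shift);
  const int32_t out =
      static_cast<int32_t>(std::round(acc * real)) + output_offset;
  return static_cast<int8_t>(
      std::min<int32_t>(127, std::max<int32_t>(-128, out)));
}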
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
@@ -265,6 +267,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
 }
 
+template <KernelType kernel_type>
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                              TfLiteDepthwiseConvParams* params, OpData* data,
                              const TfLiteTensor* input,
@@ -282,7 +285,11 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
   op_params.input_offset = -input->params.zero_point;
   op_params.weights_offset = 0;
   op_params.output_offset = output->params.zero_point;
+  // TODO(b/130439627): Use calculated value for clamping.
+  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
+  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
 
+  if (kernel_type == kReference) {
     reference_integer_ops::DepthwiseConvPerChannel(
         op_params, data->per_channel_output_multiplier.data(),
         data->per_channel_output_shift.data(), GetTensorShape(input),
@@ -290,6 +297,16 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
         GetTensorData<int8>(filter), GetTensorShape(bias),
         GetTensorData<int32>(bias), GetTensorShape(output),
         GetTensorData<int8>(output));
+  } else {
+    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+    optimized_integer_ops::DepthwiseConvPerChannel(
+        op_params, data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data(), GetTensorShape(input),
+        GetTensorData<int8>(input), GetTensorShape(filter),
+        GetTensorData<int8>(filter), GetTensorShape(bias),
+        GetTensorData<int32>(bias), GetTensorShape(output),
+        GetTensorData<int8>(output), gemm_context);
+  }
 }
 
 template <KernelType kernel_type>
@@ -316,8 +333,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                            bias, output);
       break;
     case kTfLiteInt8: {
-      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
-                              output);
+      EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
+                                           filter, bias, output);
       break;
     }
     default:
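For readers following the dispatch above: both branches compute the same per-channel convolution. Here is a stripped-down sketch of that computation, assuming stride 1, no padding or dilation, depth multiplier 1, NHWC layout with batch 1, and the hypothetical RequantizeSketch helper from the earlier sketch; the shipped implementations are reference_integer_ops::DepthwiseConvPerChannel and the new optimized_integer_ops::DepthwiseConvPerChannel:

#include <cstdint>
#include <vector>

// Reuses RequantizeSketch from the sketch above.
void DepthwiseConvPerChannelSketch(
    const std::vector<int8_t>& input, int in_h, int in_w, int channels,
    const std::vector<int8_t>& filter, int f_h, int f_w,
    const std::vector<int32_t>& bias, int32_t input_offset,
    const std::vector<int32_t>& out_multiplier,  // one per channel
    const std::vector<int>& out_shift,           // one per channel
    int32_t output_offset, std::vector<int8_t>* output) {
  const int out_h = in_h - f_h + 1;  // VALID padding
  const int out_w = in_w - f_w + 1;
  output->assign(out_h * out_w * channels, 0);
  for (int oy = 0; oy < out_h; ++oy) {
    for (int ox = 0; ox < out_w; ++ox) {
      for (int c = 0; c < channels; ++c) {
        // Bias is pre-quantized at input_scale * filter_scale[c].
        int32_t acc = bias[c];
        for (int fy = 0; fy < f_h; ++fy) {
          for (int fx = 0; fx < f_w; ++fx) {
            const int32_t in_val =
                input[((oy + fy) * in_w + (ox + fx)) * channels + c];
            const int32_t filter_val = filter[(fy * f_w + fx) * channels + c];
            // int8 weights are symmetrically quantized (zero point 0), so
            // only the input needs an offset term.
            acc += filter_val * (in_val + input_offset);
          }
        }
        (*output)[(oy * out_w + ox) * channels + c] = RequantizeSketch(
            acc, out_multiplier[c], out_shift[c], output_offset);
      }
    }
  }
}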
@@ -692,7 +692,14 @@ class PerChannelQuantizedDepthwiseConvolutionOpModel
   }
 };
 
-TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
+class PerChannelQuantizedDepthwiseConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, SimpleTest) {
   PerChannelQuantizedDepthwiseConvolutionOpModel m(
       GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
       {TensorType_INT8,
@@ -702,9 +709,9 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
        0,
        0,
        0,
-       /*per_channel=*/true,
-       /*per_channel_scales=*/{1, 2, 3, 4},
-       /*per_channel_zeros=*/{0, 0, 0, 0},
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{1, 2, 3, 4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
        /*channel_index=*/3},
       {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
   m.SetInput({
@@ -738,6 +745,102 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
               ElementsAreArray({80, 95, 53, 79, 0, -9, -49, -73}));
 }
 
+// Same as previous test, except the shift will be negative for the outputs.
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
+       SimpleTestNegativeOutputShift) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{0.1, 0.2, 0.3, 0.4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40, 50, 14.5, 16.5, 0, -2, -32, -42})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({79, 99, 28, 32, -1, -5, -65, -85}));
+}
+
+// Same as previous test, except the shift will be mixed for the outputs.
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
+       SimpleTestMixedOutputShift) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{0.1, 2, 3, 0.4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40, 48, 27, 16.5, 0, -4, -24, -42})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({79, 95, 53, 32, -1, -9, -49, -85}));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
@@ -746,6 +849,11 @@ INSTANTIATE_TEST_SUITE_P(
     QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
+INSTANTIATE_TEST_SUITE_P(
+    PerChannelQuantizedDepthwiseConvolutionOpTest,
+    PerChannelQuantizedDepthwiseConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
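A note on the test names above: with input and output scale both 0.5, each channel's requantization multiplier equals its filter scale, so per-channel scales below 1 decompose to a negative output shift and scales above 1 to a positive one. A small worked example using the hypothetical QuantizeMultiplierSketch from earlier:

#include <cstdint>
#include <cstdio>

int main() {
  // Filter scales from the mixed-shift test; input and output scale are 0.5,
  // so real_multiplier = 0.5 * filter_scale / 0.5 = filter_scale.
  const double filter_scales[] = {0.1, 2.0, 3.0, 0.4};
  for (double fs : filter_scales) {
    const double real_multiplier = 0.5 * fs / 0.5;
    int32_t multiplier;
    int shift;
    QuantizeMultiplierSketch(real_multiplier, &multiplier, &shift);
    // 0.1 = 0.8 * 2^-3 -> shift = -3 (negative);
    // 2.0 = 0.5 * 2^2  -> shift = +2 (positive).
    std::printf("scale %.1f: multiplier=%d shift=%d\n", fs,
                static_cast<int>(multiplier), shift);
  }
  return 0;
}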
@@ -178,6 +178,7 @@ cc_library(
         "optimized/im2col_utils.h",
         "optimized/integer_ops/add.h",
         "optimized/integer_ops/conv.h",
+        "optimized/integer_ops/depthwise_conv.h",
         "optimized/integer_ops/fully_connected.h",
         "optimized/integer_ops/mul.h",
         "optimized/integer_ops/pooling.h",
File diff suppressed because it is too large
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
 
+#include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
@@ -27,6 +28,7 @@ inline void DepthwiseConvPerChannel(
     const int32* bias_data, const RuntimeShape& output_shape,
     int8* output_data) {
   // Get parameters.
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvInt8");
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
@@ -36,10 +38,8 @@ inline void DepthwiseConvPerChannel(
   const int depth_multiplier = params.depth_multiplier;
   const int32 input_offset = params.input_offset;
   const int32 output_offset = params.output_offset;
-
-  // Set min and max value of the output.
-  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
-  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
 
   // Check dimensions of the tensors.
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
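The hunk above moves the clamp range from hardcoded int8 limits into the op params; the TODO(b/130439627) in the kernel dispatch notes that the caller still passes the full int8 range for now. A hypothetical sketch of the eventual calculated clamp (names and flags are illustrative, not the shipped code; TFLite's real helper for this is CalculateActivationRangeQuantized):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Derive int8 clamp bounds from a fused activation and the output tensor's
// quantization parameters, instead of always using [-128, 127].
void Int8ActivationRangeSketch(float output_scale, int32_t output_zero_point,
                               bool fused_relu, bool fused_relu6,
                               int32_t* act_min, int32_t* act_max) {
  *act_min = -128;
  *act_max = 127;
  auto quantize = [&](float v) {
    return output_zero_point +
           static_cast<int32_t>(std::round(v / output_scale));
  };
  if (fused_relu || fused_relu6) {
    *act_min = std::max(*act_min, quantize(0.0f));  // clamp below at real 0
  }
  if (fused_relu6) {
    *act_max = std::min(*act_max, quantize(6.0f));  // clamp above at real 6
  }
}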