Optimize int8 depthwise_conv. It is not yet as fast as the uint8 variant for MobileNet because the specialized 3x3 kernel is not done yet.

PiperOrigin-RevId: 243555952
Renjie Liu 2019-04-14 22:50:11 -07:00 committed by TensorFlower Gardener
parent a41e83060f
commit 9b0656f8ac
5 changed files with 2215 additions and 17 deletions
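
Background for reviewers: the per-channel int8 path accumulates in int32 and rescales each output channel with its own multiplier. Below is a minimal, self-contained sketch of that arithmetic with made-up values and names; it is illustrative only and is not code from this change.

// Sketch only: how one int8 per-channel depthwise-conv output is formed.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quantization parameters for one output channel.
  const float input_scale = 0.5f;    // input: real = (q - zero_point) * scale
  const float filter_scale = 0.1f;   // this channel's filter scale (zero point 0)
  const float output_scale = 0.5f;
  const int32_t input_offset = 1;    // = -input_zero_point
  const int32_t output_offset = -1;  // = output_zero_point
  // One 2x2 window of quantized inputs and this channel's filter taps.
  const int8_t in_q[4] = {6, 2, 8, 4};
  const int8_t w_q[4] = {10, 30, 70, 30};
  const int32_t bias_q = 60;  // bias quantized with input_scale * filter_scale

  int32_t acc = bias_q;
  for (int i = 0; i < 4; ++i) {
    acc += (static_cast<int32_t>(in_q[i]) + input_offset) * w_q[i];
  }
  // TFLite applies this rescale with a per-channel fixed-point multiplier and
  // shift (per_channel_output_multiplier/shift); the float form below is what
  // that fixed-point pair approximates.
  const float effective_scale = input_scale * filter_scale / output_scale;
  int32_t out =
      static_cast<int32_t>(std::lround(acc * effective_scale)) + output_offset;
  out = std::min<int32_t>(127, std::max<int32_t>(-128, out));  // clamp to int8
  std::printf("quantized output = %d\n", out);
  return 0;
}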

tensorflow/lite/kernels/depthwise_conv.cc

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
@@ -265,6 +267,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
 }

+template <KernelType kernel_type>
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                              TfLiteDepthwiseConvParams* params, OpData* data,
                              const TfLiteTensor* input,
@@ -282,7 +285,11 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
   op_params.input_offset = -input->params.zero_point;
   op_params.weights_offset = 0;
   op_params.output_offset = output->params.zero_point;
+  // TODO(b/130439627): Use calculated value for clamping.
+  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
+  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
+  if (kernel_type == kReference) {
     reference_integer_ops::DepthwiseConvPerChannel(
         op_params, data->per_channel_output_multiplier.data(),
         data->per_channel_output_shift.data(), GetTensorShape(input),
@@ -290,6 +297,16 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
         GetTensorData<int8>(filter), GetTensorShape(bias),
         GetTensorData<int32>(bias), GetTensorShape(output),
         GetTensorData<int8>(output));
+  } else {
+    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+    optimized_integer_ops::DepthwiseConvPerChannel(
+        op_params, data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data(), GetTensorShape(input),
+        GetTensorData<int8>(input), GetTensorShape(filter),
+        GetTensorData<int8>(filter), GetTensorShape(bias),
+        GetTensorData<int32>(bias), GetTensorShape(output),
+        GetTensorData<int8>(output), gemm_context);
+  }
 }

 template <KernelType kernel_type>
@@ -316,8 +333,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                       bias, output);
       break;
     case kTfLiteInt8: {
-      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
-                              output);
+      EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
+                                           filter, bias, output);
       break;
     }
     default:
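
Note on the TODO(b/130439627) above: it refers to deriving the clamping bounds from the fused activation instead of always using the full int8 range. In TFLite kernels this is typically done with CalculateActivationRangeQuantized from kernel_util.h; a hedged sketch of that hypothetical follow-up (not part of this commit) could look like:

// Hypothetical follow-up: compute activation bounds from the fused activation
// instead of hard-coding the int8 min/max.
int32_t act_min, act_max;
TF_LITE_ENSURE_OK(context,
                  CalculateActivationRangeQuantized(
                      context, params->activation, output, &act_min, &act_max));
op_params.quantized_activation_min = act_min;
op_params.quantized_activation_max = act_max;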

tensorflow/lite/kernels/depthwise_conv_test.cc

@@ -692,7 +692,14 @@ class PerChannelQuantizedDepthwiseConvolutionOpModel
   }
 };

-TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
+class PerChannelQuantizedDepthwiseConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, SimpleTest) {
   PerChannelQuantizedDepthwiseConvolutionOpModel m(
       GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
       {TensorType_INT8,
@@ -702,9 +709,9 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
        0,
        0,
        0,
-       /*per_channel=*/true,
-       /*per_channel_scales=*/{1, 2, 3, 4},
-       /*per_channel_zeros=*/{0, 0, 0, 0},
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{1, 2, 3, 4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
        /*channel_index=*/3},
       {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
   m.SetInput({
@@ -738,6 +745,102 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
               ElementsAreArray({80, 95, 53, 79, 0, -9, -49, -73}));
 }

+// Same as previous test, except the shift will be negative for the outputs.
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
+       SimpleTestNegativeOutputShift) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{0.1, 0.2, 0.3, 0.4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40, 50, 14.5, 16.5, 0, -2, -32, -42})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({79, 99, 28, 32, -1, -5, -65, -85}));
+}
+
+// Same as previous test, except the shift will be mixed for the outputs.
+TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
+       SimpleTestMixedOutputShift) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel_quantization=*/true,
+       /*per_channel_quantization_scales=*/{0.1, 2, 3, 0.4},
+       /*per_channel_quantization_offsets=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40, 48, 27, 16.5, 0, -4, -24, -42})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({79, 95, 53, 32, -1, -9, -49, -85}));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
@@ -746,6 +849,11 @@ INSTANTIATE_TEST_SUITE_P(
     QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));

+INSTANTIATE_TEST_SUITE_P(
+    PerChannelQuantizedDepthwiseConvolutionOpTest,
+    PerChannelQuantizedDepthwiseConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
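
For reference, the new tests rely on symmetric per-channel weight quantization: each filter value is divided by its output channel's scale and stored as int8 with a zero point of 0. A small standalone sketch of that mapping (illustrative only; the 0.1 scale and the value 7 mirror channel 0 of SimpleTestNegativeOutputShift, everything else is assumed):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric per-channel quantization of a single filter value.
int8_t QuantizeFilterValue(float value, float channel_scale) {
  const int32_t q = static_cast<int32_t>(std::lround(value / channel_scale));
  // TFLite's int8 quantization scheme keeps symmetric weights in [-127, 127].
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-127, q)));
}

int main() {
  // With channel 0's scale of 0.1, the real weight 7.0 is stored as 70.
  std::printf("%d\n", QuantizeFilterValue(7.0f, 0.1f));
  return 0;
}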

tensorflow/lite/kernels/internal/BUILD

@@ -178,6 +178,7 @@ cc_library(
         "optimized/im2col_utils.h",
         "optimized/integer_ops/add.h",
         "optimized/integer_ops/conv.h",
+        "optimized/integer_ops/depthwise_conv.h",
         "optimized/integer_ops/fully_connected.h",
         "optimized/integer_ops/mul.h",
         "optimized/integer_ops/pooling.h",

tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h

File diff suppressed because it is too large.

tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h

@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_

+#include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"

 namespace tflite {
@@ -27,6 +28,7 @@ inline void DepthwiseConvPerChannel(
     const int32* bias_data, const RuntimeShape& output_shape,
     int8* output_data) {
   // Get parameters.
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvInt8");
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
@@ -36,10 +38,8 @@ inline void DepthwiseConvPerChannel(
   const int depth_multiplier = params.depth_multiplier;
   const int32 input_offset = params.input_offset;
   const int32 output_offset = params.output_offset;
-  // Set min and max value of the output.
-  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
-  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;

   // Check dimensions of the tensors.
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);