Optimize int8 depthwise_conv. It is not as fast as the uint8 variant for MobileNet yet because the 3x3 kernel is not done.

PiperOrigin-RevId: 243555952
Renjie Liu 2019-04-14 22:50:11 -07:00 committed by TensorFlower Gardener
parent a41e83060f
commit 9b0656f8ac
5 changed files with 2215 additions and 17 deletions
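For context, and not part of this diff: the per_channel_output_multiplier and per_channel_output_shift arrays that both the reference and optimized kernels consume are typically derived from the per-channel filter scales. A minimal sketch follows, assuming the existing QuantizeMultiplier() helper from tensorflow/lite/kernels/internal/quantization_util.h; the helper name PopulatePerChannelQuantParams and its parameter list are illustrative only. An effective scale below 1.0 produces a negative shift, which is what the new SimpleTestNegativeOutputShift and SimpleTestMixedOutputShift cases below exercise.

#include <cstdint>
#include <vector>

#include "tensorflow/lite/kernels/internal/quantization_util.h"

// Illustrative helper (hypothetical name): turn per-channel filter scales
// into the fixed-point multiplier/shift pairs consumed by
// DepthwiseConvPerChannel.
void PopulatePerChannelQuantParams(float input_scale, float output_scale,
                                   const std::vector<float>& filter_scales,
                                   std::vector<int32_t>* multipliers,
                                   std::vector<int>* shifts) {
  multipliers->resize(filter_scales.size());
  shifts->resize(filter_scales.size());
  for (size_t c = 0; c < filter_scales.size(); ++c) {
    // Effective scale that maps int32 accumulators back to int8 outputs.
    const double effective_scale =
        static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    // QuantizeMultiplier splits the scale into a fixed-point multiplier and
    // a power-of-two shift; scales below 1.0 yield a negative shift.
    tflite::QuantizeMultiplier(effective_scale, &(*multipliers)[c],
                               &(*shifts)[c]);
  }
}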


@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cassert>
#include <cmath>
#include <cstdio>
@@ -24,6 +25,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/gemm_support.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
@@ -265,6 +267,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
}
template <KernelType kernel_type>
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
@@ -282,14 +285,28 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
// TODO(b/130439627): Use calculated value for clamping.
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier.data(),
data->per_channel_output_shift.data(), GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
if (kernel_type == kReference) {
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier.data(),
data->per_channel_output_shift.data(), GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
} else {
gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
optimized_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier.data(),
data->per_channel_output_shift.data(), GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output), gemm_context);
}
}
template <KernelType kernel_type>
@@ -316,8 +333,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
bias, output);
break;
case kTfLiteInt8: {
EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
output);
EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
filter, bias, output);
break;
}
default:


@@ -692,7 +692,14 @@ class PerChannelQuantizedDepthwiseConvolutionOpModel
}
};
TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
class PerChannelQuantizedDepthwiseConvolutionOpTest : public SingleOpTest {
protected:
const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
return *kKernelMap;
}
};
TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, SimpleTest) {
PerChannelQuantizedDepthwiseConvolutionOpModel m(
GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
{TensorType_INT8,
@@ -702,9 +709,9 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
0,
0,
0,
/*per_channel=*/true,
/*per_channel_scales=*/{1, 2, 3, 4},
/*per_channel_zeros=*/{0, 0, 0, 0},
/*per_channel_quantization=*/true,
/*per_channel_quantization_scales=*/{1, 2, 3, 4},
/*per_channel_quantization_offsets=*/{0, 0, 0, 0},
/*channel_index=*/3},
{TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
m.SetInput({
@@ -738,6 +745,102 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
ElementsAreArray({80, 95, 53, 79, 0, -9, -49, -73}));
}
// Same as previous test, except the shift will be negative for the outputs.
TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
SimpleTestNegativeOutputShift) {
PerChannelQuantizedDepthwiseConvolutionOpModel m(
GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
{TensorType_INT8,
// [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
{1, 2, 2, 4},
0,
0,
0,
0,
/*per_channel_quantization=*/true,
/*per_channel_quantization_scales=*/{0.1, 0.2, 0.3, 0.4},
/*per_channel_quantization_offsets=*/{0, 0, 0, 0},
/*channel_index=*/3},
{TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
m.SetInput({
// [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
3, 2, // batch = 0, y = 0, x = 0
1, -1, // batch = 0, y = 0, x = 1
-2, -3, // batch = 0, y = 0, x = 2
4, 3, // batch = 0, y = 1, x = 0
2, -2, // batch = 0, y = 1, x = 1
-3, -4, // batch = 0, y = 1, x = 2
});
m.SetFilter(
/*filter data*/
{
// [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
// depth multiplier = 2
1, 2, 3, 4, // y = 0, x = 0
3, 4, 5, 6, // y = 0, x = 1
7, 8, 5, 6, // y = 1, x = 0
3, 4, 1, 2, // y = 1, x = 1
});
m.SetBias({3, -2, 4, 6});
// Invoke and verify output.
// output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
m.Invoke();
EXPECT_THAT(
m.GetDequantizedOutput(),
ElementsAreArray(ArrayFloatNear({40, 50, 14.5, 16.5, 0, -2, -32, -42})));
EXPECT_THAT(m.GetOutput(),
ElementsAreArray({79, 99, 28, 32, -1, -5, -65, -85}));
}
// Same as previous test, except the shift will be mixed for the outputs.
TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
SimpleTestMixedOutputShift) {
PerChannelQuantizedDepthwiseConvolutionOpModel m(
GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
{TensorType_INT8,
// [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
{1, 2, 2, 4},
0,
0,
0,
0,
/*per_channel_quantization=*/true,
/*per_channel_quantization_scales=*/{0.1, 2, 3, 0.4},
/*per_channel_quantization_offsets=*/{0, 0, 0, 0},
/*channel_index=*/3},
{TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
m.SetInput({
// [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
3, 2, // batch = 0, y = 0, x = 0
1, -1, // batch = 0, y = 0, x = 1
-2, -3, // batch = 0, y = 0, x = 2
4, 3, // batch = 0, y = 1, x = 0
2, -2, // batch = 0, y = 1, x = 1
-3, -4, // batch = 0, y = 1, x = 2
});
m.SetFilter(
/*filter data*/
{
// [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
// depth multiplier = 2
1, 2, 3, 4, // y = 0, x = 0
3, 4, 5, 6, // y = 0, x = 1
7, 8, 5, 6, // y = 1, x = 0
3, 4, 1, 2, // y = 1, x = 1
});
m.SetBias({3, -2, 4, 6});
// Invoke and verify output.
// output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
m.Invoke();
EXPECT_THAT(
m.GetDequantizedOutput(),
ElementsAreArray(ArrayFloatNear({40, 48, 27, 16.5, 0, -4, -24, -42})));
EXPECT_THAT(m.GetOutput(),
ElementsAreArray({79, 95, 53, 32, -1, -9, -49, -85}));
}
INSTANTIATE_TEST_SUITE_P(
DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
@@ -746,6 +849,11 @@ INSTANTIATE_TEST_SUITE_P(
QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
INSTANTIATE_TEST_SUITE_P(
PerChannelQuantizedDepthwiseConvolutionOpTest,
PerChannelQuantizedDepthwiseConvolutionOpTest,
::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
} // namespace
} // namespace tflite


@@ -178,6 +178,7 @@ cc_library(
"optimized/im2col_utils.h",
"optimized/integer_ops/add.h",
"optimized/integer_ops/conv.h",
"optimized/integer_ops/depthwise_conv.h",
"optimized/integer_ops/fully_connected.h",
"optimized/integer_ops/mul.h",
"optimized/integer_ops/pooling.h",

File diff suppressed because it is too large.


@@ -15,6 +15,7 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
#include "public/gemmlowp.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
@@ -27,6 +28,7 @@ inline void DepthwiseConvPerChannel(
const int32* bias_data, const RuntimeShape& output_shape,
int8* output_data) {
// Get parameters.
gemmlowp::ScopedProfilingLabel label("DepthwiseConvInt8");
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
@@ -36,10 +38,8 @@ inline void DepthwiseConvPerChannel(
const int depth_multiplier = params.depth_multiplier;
const int32 input_offset = params.input_offset;
const int32 output_offset = params.output_offset;
// Set min and max value of the output.
const int32 output_activation_min = std::numeric_limits<int8_t>::min();
const int32 output_activation_max = std::numeric_limits<int8_t>::max();
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
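For reference, a condensed sketch (not the full kernel) of the per-output-channel requantization that the reference path above and the optimized path both apply to each int32 accumulator, using the clamp bounds now taken from params. MultiplyByQuantizedMultiplier() is the existing helper in tensorflow/lite/kernels/internal/common.h; the function name RequantizePerChannel and its parameter list are illustrative only.

#include <algorithm>
#include <cstdint>

#include "tensorflow/lite/kernels/internal/common.h"

// Illustrative only: map one int32 accumulator back to int8 using the
// channel's multiplier, (possibly negative) shift, output zero point and
// the activation clamp bounds carried in params.
inline int8_t RequantizePerChannel(int32_t acc, int32_t output_multiplier,
                                   int output_shift, int32_t output_offset,
                                   int32_t output_activation_min,
                                   int32_t output_activation_max) {
  acc = tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
  acc += output_offset;
  acc = std::max(acc, output_activation_min);
  acc = std::min(acc, output_activation_max);
  return static_cast<int8_t>(acc);
}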