From f6a39be9f488e431bbb891520100e4d290fc07f6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 5 Feb 2019 09:51:21 -0800
Subject: [PATCH] Introduce rounding into depthwise conv reference / test.

PiperOrigin-RevId: 232504607
---
 .../internal/depthwiseconv_quantized_test.cc  |  77 +++++--
 .../internal/reference/depthwiseconv_uint8.h  | 201 +++++++++++-------
 2 files changed, 186 insertions(+), 92 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 963ab4d8ff6..b396e6256c7 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -57,7 +57,8 @@ enum class CoverageExtension {

 // The TestParam structure below is the preferred parameterization of tests. A
 // tuple version is defined in order to support value-parameterized tests.
-typedef std::tuple<DepthwiseConvInvocation, int, bool, bool, bool>
+typedef std::tuple<DepthwiseConvInvocation, int, bool, bool, bool,
+                   DepthwiseConvOutputRounding, bool>
     TestParamTuple;

 struct TestParam {
@@ -68,7 +69,9 @@ struct TestParam {
         tests_to_run(::testing::get<1>(param_tuple)),
         test_stride(::testing::get<2>(param_tuple)),
         test_pad(::testing::get<3>(param_tuple)),
-        test_depth_multiplier(::testing::get<4>(param_tuple)) {}
+        test_depth_multiplier(::testing::get<4>(param_tuple)),
+        output_rounding(::testing::get<5>(param_tuple)),
+        loose_tolerance(::testing::get<6>(param_tuple)) {}

   static std::string TestNameSuffix(
       const ::testing::TestParamInfo<TestParamTuple>& info) {
@@ -84,6 +87,9 @@ struct TestParam {
   bool test_stride = false;
   bool test_pad = false;
   bool test_depth_multiplier = false;
+  DepthwiseConvOutputRounding output_rounding =
+      DepthwiseConvOutputRounding::kNone;
+  bool loose_tolerance = false;
 };

 inline void DispatchDepthwiseConv(
@@ -183,9 +189,30 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   op_params.output_offset = output_offset;
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kUpward:
+      reference_ops::DepthwiseConvBasicKernel<
+          DepthwiseConvOutputRounding::kUpward>::Run(op_params,
+                                                     input_shape,
+                                                     input_data,
+                                                     filter_shape,
+                                                     filter_data,
+                                                     bias_shape,
+                                                     bias_data,
+                                                     output_shape,
+                                                     reference_output_data
+                                                         .data());
+      break;
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      reference_ops::DepthwiseConv(
+          op_params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, reference_output_data.data());
+      break;
+    case DepthwiseConvOutputRounding::kNone:
+    default:
+      EXPECT_NE(test_param.output_rounding, DepthwiseConvOutputRounding::kNone);
+      break;
+  }
   DispatchDepthwiseConv(test_param, op_params, input_shape, input_data,
                         filter_shape, filter_data, bias_shape, bias_data,
                         output_shape, output_data.data());
@@ -221,10 +248,10 @@ int TestOneDepthwiseConvWithGivenOutputShift(

   // Normally we should require bit-for-bit exact results. Unfortunately a bug
   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
+  // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
+  // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
+  // few off-by-one errors for now, yet still ensure that no more than a small
+  // minority of values are wrong.
   EXPECT_LT(std::abs(mean_diff), mean_tolerance);
   EXPECT_LT(mean_abs_diff, mean_tolerance);
   EXPECT_LE(std::abs(median_diff),
@@ -482,16 +509,21 @@ bool TryTestOneNeonDot3x3(const TestParam& test_param,
                            dilation_width_factor, dilation_height_factor,
                            padding_type);
 }

-void TestOneDepthwiseConv(DepthwiseConvInvocation forced_invocation) {
+void TestOneDepthwiseConv(DepthwiseConvInvocation forced_invocation,
+                          DepthwiseConvOutputRounding output_rounding) {
   TestParam test_param;
   test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
   while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
   }
 }

-void TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation forced_invocation) {
+void TestOneDepthwiseConv3x3Filter(
+    DepthwiseConvInvocation forced_invocation,
+    DepthwiseConvOutputRounding output_rounding) {
   TestParam test_param;
   test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
   while (!TryTestOneDepthwiseConv3x3Filter(test_param,
                                            ParamsSpecialization::kNone)) {
   }
 }
@@ -505,7 +537,8 @@ void TestOneNeonDot3x3(const TestParam& test_param) {
 TEST(TestDepthwiseConv, TestDepthwiseConv) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(DepthwiseConvInvocation::kNone);
+    TestOneDepthwiseConv(DepthwiseConvInvocation::kNone,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -513,14 +546,16 @@ TEST(TestDepthwiseConv, TestDepthwiseConv) {
 TEST(TestDepthwiseConv, TestGenericKernel) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(DepthwiseConvInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv(DepthwiseConvInvocation::kUseGenericKernel,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

 TEST(TestDepthwiseConv, TestKernel3x3Filter) {
   const int kTestsToRun = 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kNone);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kNone,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -529,7 +564,8 @@ TEST(TestDepthwiseConv, TestKernel3x3Filter) {
 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseGenericKernel,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -537,7 +573,8 @@ TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
 TEST(TestDepthwiseConv, TestNeon3x3Filter) {
   const int kTestsToRun = 3 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseNeon3x3);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 #endif
@@ -559,7 +596,9 @@ INSTANTIATE_TEST_SUITE_P(
         Values(1000),   // tests_to_run
         Bool(),         // test_stride
         Values(false),  // test_pad
-        Values(false)   // test_depth_multiplier
+        Values(false),  // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)   // loose_tolerance
         ),
     TestParam::TestNameSuffix);
 #endif
@@ -574,7 +613,9 @@ INSTANTIATE_TEST_SUITE_P(
         Values(100),    // tests_to_run
         Bool(),         // test_stride
         Bool(),         // test_pad
-        Bool()          // test_depth_multiplier
+        Bool(),         // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)   // loose_tolerance
         ),
     TestParam::TestNameSuffix);

diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 99c03426f52..7cc5679dcb6 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -44,6 +44,14 @@ enum class DepthwiseConvInvocation {
   kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
 };

+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
 // Category of depthwise convolution depth multiplication.
 enum class DepthwiseConvDepthMultiplication {
   kNoMultiplication = 0,  // Depth multiplier = 1.
@@ -52,88 +60,133 @@ enum class DepthwiseConvDepthMultiplication {

 namespace reference_ops {

+template <DepthwiseConvOutputRounding output_rounding>
+inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
+                                int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(const DepthwiseParams& params,
+                         const RuntimeShape& input_shape,
+                         const uint8* input_data,
+                         const RuntimeShape& filter_shape,
+                         const uint8* filter_data,
+                         const RuntimeShape& bias_shape, const int32* bias_data,
+                         const RuntimeShape& output_shape, uint8* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 input_offset = params.input_offset;
+    const int32 filter_offset = params.weights_offset;
+    const int32 output_offset = params.output_offset;
+    const int32 output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32 acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32 input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32 filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
+                }
+              }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8>(acc);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
     uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
-                }
-              }
-            }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
-          }
-        }
-      }
-    }
-  }
+  return DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
 }

-}  // end namespace reference_ops
+}  // namespace reference_ops
 }  // end namespace tflite

 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
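Reviewer note (not part of the patch): the difference between the two rounding modes is easiest to see on an accumulator whose scaled value lands exactly on a negative half. The standalone sketch below is only an illustration under stated assumptions: it mimics the kAwayFromZero path with a gemmlowp-style rounding divide-by-power-of-two and the kUpward path with the add-offset-then-shift used by DepthwiseConvRound<kUpward> above, re-implements SaturatingRoundingDoublingHighMul locally so it compiles without gemmlowp, and assumes a non-positive shift (a pure right shift). The file and function names are made up for this demo and are not TFLite APIs.

// rounding_demo.cc -- standalone illustration only; not part of TensorFlow Lite.
#include <cstdint>
#include <cstdio>
#include <limits>

// Local stand-in for gemmlowp::SaturatingRoundingDoublingHighMul: returns the
// high 32 bits of 2*a*b with rounding, saturating the single overflow case
// (a == b == INT32_MIN).
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
  const bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t result = static_cast<int32_t>((ab + nudge) / (1LL << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// kAwayFromZero-style path: rounding right shift where exact halves move away
// from zero (gemmlowp-style RoundingDivideByPOT), assuming shift <= 0.
int32_t RoundAwayFromZero(int32_t x, int32_t multiplier, int shift) {
  const int right_shift = -shift;
  const int32_t v = SaturatingRoundingDoublingHighMul(x, multiplier);
  const int32_t mask = (1LL << right_shift) - 1;
  const int32_t remainder = v & mask;
  const int32_t threshold = (mask >> 1) + (v < 0 ? 1 : 0);
  return (v >> right_shift) + (remainder > threshold ? 1 : 0);
}

// kUpward-style path: add half of the divisor, then shift; exact halves move
// towards +infinity, mirroring DepthwiseConvRound<kUpward> in the patch.
int32_t RoundUpward(int32_t x, int32_t multiplier, int shift) {
  const int right_shift = -shift;
  const int32_t rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
  return (SaturatingRoundingDoublingHighMul(x, multiplier) + rounding_offset) >>
         right_shift;
}

int main() {
  // multiplier == 2^30 halves the accumulator exactly; shift == -1 then
  // divides by 2, so acc == -22 maps to -5.5: away-from-zero gives -6,
  // upward gives -5. For acc == +22 (+5.5) both give 6.
  const int32_t multiplier = 1 << 30;
  const int shift = -1;
  for (int32_t acc : {-22, 22}) {
    std::printf("acc=%3d  away_from_zero=%d  upward=%d\n",
                static_cast<int>(acc),
                static_cast<int>(RoundAwayFromZero(acc, multiplier, shift)),
                static_cast<int>(RoundUpward(acc, multiplier, shift)));
  }
  return 0;
}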