From f6a39be9f488e431bbb891520100e4d290fc07f6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 5 Feb 2019 09:51:21 -0800
Subject: [PATCH] Introduce rounding into depthwise conv reference / test.

PiperOrigin-RevId: 232504607
---
 .../internal/depthwiseconv_quantized_test.cc  |  77 +++++--
 .../internal/reference/depthwiseconv_uint8.h  | 201 +++++++++++-------
 2 files changed, 186 insertions(+), 92 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 963ab4d8ff6..b396e6256c7 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -57,7 +57,8 @@ enum class CoverageExtension {

 // The TestParam structure below is the preferred parameterization of tests. A
 // tuple version is defined in order to support value-parameterized tests.
-typedef std::tuple<DepthwiseConvInvocation, int, bool, bool, bool>
+typedef std::tuple<DepthwiseConvInvocation, int, bool, bool, bool,
+                   DepthwiseConvOutputRounding, bool>
     TestParamTuple;

 struct TestParam {
@@ -68,7 +69,9 @@ struct TestParam {
         tests_to_run(::testing::get<1>(param_tuple)),
         test_stride(::testing::get<2>(param_tuple)),
         test_pad(::testing::get<3>(param_tuple)),
-        test_depth_multiplier(::testing::get<4>(param_tuple)) {}
+        test_depth_multiplier(::testing::get<4>(param_tuple)),
+        output_rounding(::testing::get<5>(param_tuple)),
+        loose_tolerance(::testing::get<6>(param_tuple)) {}

   static std::string TestNameSuffix(
       const ::testing::TestParamInfo<TestParamTuple>& info) {
@@ -84,6 +87,9 @@ struct TestParam {
   bool test_stride = false;
   bool test_pad = false;
   bool test_depth_multiplier = false;
+  DepthwiseConvOutputRounding output_rounding =
+      DepthwiseConvOutputRounding::kNone;
+  bool loose_tolerance = false;
 };

 inline void DispatchDepthwiseConv(
@@ -183,9 +189,30 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   op_params.output_offset = output_offset;
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kUpward:
+      reference_ops::DepthwiseConvBasicKernel<
+          DepthwiseConvOutputRounding::kUpward>::Run(op_params,
+                                                     input_shape,
+                                                     input_data,
+                                                     filter_shape,
+                                                     filter_data,
+                                                     bias_shape,
+                                                     bias_data,
+                                                     output_shape,
+                                                     reference_output_data
+                                                         .data());
+      break;
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      reference_ops::DepthwiseConv(
+          op_params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, reference_output_data.data());
+      break;
+    case DepthwiseConvOutputRounding::kNone:
+    default:
+      EXPECT_NE(test_param.output_rounding, DepthwiseConvOutputRounding::kNone);
+      break;
+  }
   DispatchDepthwiseConv(test_param, op_params, input_shape, input_data,
                         filter_shape, filter_data, bias_shape, bias_data,
                         output_shape, output_data.data());
@@ -221,10 +248,10 @@ int TestOneDepthwiseConvWithGivenOutputShift(

   // Normally we should require bit-for-bit exact results. Unfortunately a bug
   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
+  // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
+  // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
+  // few off-by-one errors for now, yet still ensure that no more than a small
+  // minority of values are wrong.
   EXPECT_LT(std::abs(mean_diff), mean_tolerance);
   EXPECT_LT(mean_abs_diff, mean_tolerance);
   EXPECT_LE(std::abs(median_diff),
@@ -482,16 +509,21 @@ bool TryTestOneNeonDot3x3(const TestParam& test_param,
                            dilation_width_factor, dilation_height_factor,
                            padding_type);
 }

-void TestOneDepthwiseConv(DepthwiseConvInvocation forced_invocation) {
+void TestOneDepthwiseConv(DepthwiseConvInvocation forced_invocation,
+                          DepthwiseConvOutputRounding output_rounding) {
   TestParam test_param;
   test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
   while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
   }
 }

-void TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation forced_invocation) {
+void TestOneDepthwiseConv3x3Filter(
+    DepthwiseConvInvocation forced_invocation,
+    DepthwiseConvOutputRounding output_rounding) {
   TestParam test_param;
   test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
   while (!TryTestOneDepthwiseConv3x3Filter(test_param,
                                            ParamsSpecialization::kNone)) {
   }
 }
@@ -505,7 +537,8 @@ void TestOneNeonDot3x3(const TestParam& test_param) {
 TEST(TestDepthwiseConv, TestDepthwiseConv) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(DepthwiseConvInvocation::kNone);
+    TestOneDepthwiseConv(DepthwiseConvInvocation::kNone,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -513,14 +546,16 @@ TEST(TestDepthwiseConv, TestDepthwiseConv) {
 TEST(TestDepthwiseConv, TestGenericKernel) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(DepthwiseConvInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv(DepthwiseConvInvocation::kUseGenericKernel,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

 TEST(TestDepthwiseConv, TestKernel3x3Filter) {
   const int kTestsToRun = 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kNone);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kNone,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -529,7 +564,8 @@ TEST(TestDepthwiseConv, TestKernel3x3Filter) {
 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseGenericKernel,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }

@@ -537,7 +573,8 @@ TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
 TEST(TestDepthwiseConv, TestNeon3x3Filter) {
   const int kTestsToRun = 3 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseNeon3x3);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvInvocation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 #endif
@@ -559,7 +596,9 @@ INSTANTIATE_TEST_SUITE_P(
         Values(1000),   // tests_to_run
         Bool(),         // test_stride
         Values(false),  // test_pad
-        Values(false)   // test_depth_multiplier
+        Values(false),  // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)   // loose_tolerance
         ),
     TestParam::TestNameSuffix);
 #endif
@@ -574,7 +613,9 @@ INSTANTIATE_TEST_SUITE_P(
         Values(100),    // tests_to_run
         Bool(),         // test_stride
         Bool(),         // test_pad
-        Bool()          // test_depth_multiplier
+        Bool(),         // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)   // loose_tolerance
         ),
     TestParam::TestNameSuffix);

diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 99c03426f52..7cc5679dcb6 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -44,6 +44,14 @@ enum class DepthwiseConvInvocation {
   kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
 };

+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
 // Category of depthwise convolution depth multiplication.
 enum class DepthwiseConvDepthMultiplication {
   kNoMultiplication = 0,  // Depth multiplier = 1.
@@ -52,88 +60,133 @@ enum class DepthwiseConvDepthMultiplication {

 namespace reference_ops {

+template <DepthwiseConvOutputRounding output_rounding>
+inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
+                                int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(const DepthwiseParams& params,
+                         const RuntimeShape& input_shape,
+                         const uint8* input_data,
+                         const RuntimeShape& filter_shape,
+                         const uint8* filter_data,
+                         const RuntimeShape& bias_shape, const int32* bias_data,
+                         const RuntimeShape& output_shape, uint8* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 input_offset = params.input_offset;
+    const int32 filter_offset = params.weights_offset;
+    const int32 output_offset = params.output_offset;
+    const int32 output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32 acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32 input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32 filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
+                }
+              }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8>(acc);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
     uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
-                }
-              }
-            }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
-          }
-        }
-      }
-    }
-  }
+  return DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
 }

-}  // end namespace reference_ops
+}  // namespace reference_ops
 }  // end namespace tflite

 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
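Reviewer note (not part of the patch): the difference between the two rounding modes is easiest to see on an accumulator whose scaled value lands exactly on a negative half. The standalone sketch below is only an illustration under stated assumptions: it mimics the kAwayFromZero path with a gemmlowp-style rounding divide-by-power-of-two and the kUpward path with the add-offset-then-shift used by DepthwiseConvRound<kUpward> above, re-implements SaturatingRoundingDoublingHighMul locally so it compiles without gemmlowp, and assumes a non-positive shift (a pure right shift). The file and function names are made up for this demo and are not TFLite APIs.

// rounding_demo.cc -- standalone illustration only; not part of TensorFlow Lite.
#include <cstdint>
#include <cstdio>
#include <limits>

// Local stand-in for gemmlowp::SaturatingRoundingDoublingHighMul: returns the
// high 32 bits of 2*a*b with rounding, saturating the single overflow case
// (a == b == INT32_MIN).
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
  const bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t result = static_cast<int32_t>((ab + nudge) / (1LL << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// kAwayFromZero-style path: rounding right shift where exact halves move away
// from zero (gemmlowp-style RoundingDivideByPOT), assuming shift <= 0.
int32_t RoundAwayFromZero(int32_t x, int32_t multiplier, int shift) {
  const int right_shift = -shift;
  const int32_t v = SaturatingRoundingDoublingHighMul(x, multiplier);
  const int32_t mask = (1LL << right_shift) - 1;
  const int32_t remainder = v & mask;
  const int32_t threshold = (mask >> 1) + (v < 0 ? 1 : 0);
  return (v >> right_shift) + (remainder > threshold ? 1 : 0);
}

// kUpward-style path: add half of the divisor, then shift; exact halves move
// towards +infinity, mirroring DepthwiseConvRound<kUpward> in the patch.
int32_t RoundUpward(int32_t x, int32_t multiplier, int shift) {
  const int right_shift = -shift;
  const int32_t rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
  return (SaturatingRoundingDoublingHighMul(x, multiplier) + rounding_offset) >>
         right_shift;
}

int main() {
  // multiplier == 2^30 halves the accumulator exactly; shift == -1 then
  // divides by 2, so acc == -22 maps to -5.5: away-from-zero gives -6,
  // upward gives -5. For acc == +22 (+5.5) both give 6.
  const int32_t multiplier = 1 << 30;
  const int shift = -1;
  for (int32_t acc : {-22, 22}) {
    std::printf("acc=%3d  away_from_zero=%d  upward=%d\n",
                static_cast<int>(acc),
                static_cast<int>(RoundAwayFromZero(acc, multiplier, shift)),
                static_cast<int>(RoundUpward(acc, multiplier, shift)));
  }
  return 0;
}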