Merge pull request #36129 from wwwind:depthwise_conv2d_16x8

PiperOrigin-RevId: 296852392 Change-Id: I3d0e26349a926863a60030b9c410d59a257a87cb
2020-02-24 03:04:36 -08:00 · 2020-02-24 03:04:36 -08:00 · ad497b69a0
commit ad497b69a0
parent 4e62ba9dfa 078e347389
5 changed files with 477 additions and 5 deletions
--- a/tensorflow/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/kernels/depthwise_conv.cc
@ -114,16 +114,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4);

  const TfLiteType data_type = input->type;
+
  const TfLiteType filter_type = filter->type;
  const bool is_hybrid =
      data_type == kTfLiteFloat32 && filter_type == kTfLiteInt8;
-  TF_LITE_ENSURE(context, data_type == kTfLiteFloat32 ||
-                              data_type == kTfLiteUInt8 ||
-                              data_type == kTfLiteInt8);
+  TF_LITE_ENSURE(context,
+                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
+                     data_type == kTfLiteInt8 || data_type == kTfLiteInt16);
  TF_LITE_ENSURE_EQ(context, output->type, data_type);
  if (!is_hybrid) {
-    TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+    TF_LITE_ENSURE(context,
+                   filter->type == data_type || data_type == kTfLiteInt16);
  }
+
  // Filter in DepthwiseConv is expected to be [1, H, W, O].
  TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 0), 1);

@ -132,6 +135,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
    if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) {
      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+    } else if (data_type == kTfLiteInt16) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64);
+      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+      TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
    }
@ -398,6 +406,34 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
  return kTfLiteOk;
 }

+TfLiteStatus EvalQuantizedPerChannel16x8(
+    const TfLiteDepthwiseConvParams* params, const OpData* data,
+    const TfLiteTensor* input, const TfLiteTensor* filter,
+    const TfLiteTensor* bias, TfLiteTensor* output) {
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.weights_offset = 0;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+
+  reference_integer_ops::DepthwiseConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int16>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<std::int64_t>(bias), GetTensorShape(output),
+      GetTensorData<int16>(output));
+
+  return kTfLiteOk;
+}
+
 template <KernelType kernel_type>
 TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
                                  TfLiteDepthwiseConvParams* params,
@ -435,6 +471,7 @@ TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.depth_multiplier = params->depth_multiplier;
+
  op_params.weights_offset = 0;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
@ -457,6 +494,7 @@ TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
        GetTensorData<float>(output), affine_quantization->scale->data,
        input_offset_ptr, CpuBackendContext::GetFromContext(context));
  }
+
  return kTfLiteOk;
 }

@ -495,6 +533,11 @@ TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
    case kTfLiteInt8:
      return EvalQuantizedPerChannel<kernel_type>(context, node, params, data,
                                                  input, filter, bias, output);
+      break;
+    case kTfLiteInt16:
+      return EvalQuantizedPerChannel16x8(params, data, input, filter, bias,
+                                         output);
+      break;
    default:
      context->ReportError(context, "Type %d not currently supported.",
                           input->type);
@ -513,6 +556,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
      return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
    case kTfLiteInt8:
      return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
+    case kTfLiteInt16:
+      return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
    default:
      context->ReportError(context, "Type %d not currently supported.",
                           input->type);
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@ -71,7 +71,11 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
              input.scale * filter.per_channel_quantization_scales[i];
          bias_zero_points[i] = 0;
        }
-        TensorData bias{TensorType_INT32,
+        tflite::TensorType bias_type = TensorType_INT32;
+        if (input.type == TensorType_INT16) {
+          bias_type = TensorType_INT64;
+        }
+        TensorData bias{bias_type,
                        {bias_size},
                        /*min=*/0,
                        /*max=*/0,
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@ -843,6 +843,23 @@ cc_test(
    ],
 )

+cc_test(
+    name = "depthwiseconv_per_channel_quantized_16x8_test",
+    srcs = [
+        "depthwiseconv_per_channel_quantized_16x8_test.cc",
+    ],
+    shard_count = 2,
+    deps = [
+        ":common",
+        ":optimized_base",
+        ":quantization_util",
+        ":reference_base",
+        ":test_util",
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_test(
    name = "resize_bilinear_test",
    srcs = ["resize_bilinear_test.cc"],
--- a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_16x8_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_16x8_test.cc
@ -0,0 +1,320 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace {
+
+void PickOutputMultiplier(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const int16* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    float* output_multiplier) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  std::int64_t output_accu_min = std::numeric_limits<std::int64_t>::max();
+  std::int64_t output_accu_max = std::numeric_limits<std::int64_t>::min();
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            std::int64_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            output_accu_max = std::max(acc, output_accu_max);
+            output_accu_min = std::min(acc, output_accu_min);
+          }
+        }
+      }
+    }
+  }
+
+  // Since int16 ranges from -32768 to 32767, we need to squeeze the accumulator
+  // min/max fit in those ranges correspondingly as much as possible.
+  if (std::abs(output_accu_max) > std::abs(output_accu_min)) {
+    *output_multiplier = 32767.0f / std::abs(output_accu_max);
+  } else {
+    *output_multiplier = 32768.0f / std::abs(output_accu_min);
+  }
+}
+
+void PickReasonableMultiplier(
+    const DepthwiseParams& params, int output_activation_min,
+    int output_activation_max, int output_depth,
+    const RuntimeShape& input_shape_inference, const std::int16_t* input_data,
+    const RuntimeShape& filter_shape_inference, const std::int8_t* filter_data,
+    const RuntimeShape& bias_shape_inference, const std::int64_t* bias_data,
+    const RuntimeShape& output_shape_inference,
+    std::int32_t* output_multiplier_ptr, std::int32_t* output_shift_ptr,
+    std::int16_t* output_data) {
+  float output_multiplier;
+  PickOutputMultiplier(params, input_shape_inference, input_data,
+                       filter_shape_inference, filter_data,
+                       bias_shape_inference, bias_data, output_shape_inference,
+                       &output_multiplier);
+
+  int base_multiplier;
+  int base_shift;
+  QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
+  for (int i = 0; i < output_depth; ++i) {
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
+    output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
+    output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
+  }
+}
+
+bool GenerateValidShapeConfigurations(
+    int filter_width, int filter_height, int depth_multiplier,
+    int dilation_width_factor, int dilation_height_factor,
+    RuntimeShape* input_shape_inference, RuntimeShape* filter_shape_inference,
+    RuntimeShape* output_shape_inference, int* pad_width, int* pad_height,
+    int* stride) {
+  const int batch = UniformRandomInt(1, 3);
+  const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
+  const int input_width = UniformRandomInt(5, 50);
+  const int input_height = UniformRandomInt(5, 50);
+  *stride = UniformRandomInt(1, 2);
+  const bool test_pad = UniformRandomInt(0, 1);
+  const auto padding_type = test_pad ? PaddingType::kValid : PaddingType::kSame;
+
+  const int output_depth = input_depth * depth_multiplier;
+
+  input_shape_inference->BuildFrom(
+      {batch, input_height, input_width, input_depth});
+
+  filter_shape_inference->BuildFrom(
+      {1, filter_height, filter_width, output_depth});
+
+  EXPECT_TRUE(ComputeConvSizes(
+      *input_shape_inference, output_depth, filter_width, filter_height,
+      *stride, dilation_width_factor, dilation_height_factor, padding_type,
+      output_shape_inference, pad_width, pad_height));
+
+  return true;
+}
+
+void IntToFloat(std::vector<float>* d, std::vector<std::int8_t>* s) {
+  for (unsigned int i = 0; i < s->size(); i++) {
+    d->data()[i] = (float)s->data()[i];
+  }
+}
+
+void IntToFloat(std::vector<float>* d, std::vector<std::int64_t>* s) {
+  for (unsigned int i = 0; i < s->size(); i++) {
+    d->data()[i] = (float)s->data()[i];
+  }
+}
+
+void TryTestOneDepthwiseConv3x3Filter() {
+  const int filter_width = 3;
+  const int filter_height = 3;
+  const int depth_multiplier = 1;
+  // We don't support dilations in the 3x3 filter.
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+
+  const int output_activation_min = -32768;
+  const int output_activation_max = 32767;
+
+  RuntimeShape input_shape_inference;
+  RuntimeShape filter_shape_inference;
+  RuntimeShape output_shape_inference;
+  int pad_width, pad_height;
+  int stride;
+
+  // Keeps trying until we get valid shape/configurations for 3x3 filter case.
+  bool generated_valid_configurations_for_3x3_kernel = false;
+  while (!generated_valid_configurations_for_3x3_kernel) {
+    generated_valid_configurations_for_3x3_kernel =
+        GenerateValidShapeConfigurations(
+            filter_width, filter_height, depth_multiplier,
+            dilation_width_factor, dilation_height_factor,
+            &input_shape_inference, &filter_shape_inference,
+            &output_shape_inference, &pad_width, &pad_height, &stride);
+  }
+
+  const int output_depth = output_shape_inference.Dims(3);
+
+  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
+  const int input_buffer_size = input_shape_inference.FlatSize();
+  const int filter_buffer_size = filter_shape_inference.FlatSize();
+  const int output_buffer_size = output_shape_inference.FlatSize();
+  std::vector<std::int16_t> input_data(input_buffer_size);
+  std::vector<std::int8_t> filter_data(filter_buffer_size);
+  std::vector<std::int64_t> bias_data(output_depth);
+
+  FillRandom(&input_data);
+  FillRandom(&filter_data);
+  for (int i = 0; i < output_depth; i++) {
+    bias_data.data()[i] = 0;
+  }
+
+  DepthwiseParams params;
+  params.stride_width = stride;
+  params.stride_height = stride;
+  params.dilation_height_factor = dilation_height_factor;
+  params.dilation_width_factor = dilation_width_factor;
+  params.padding_values.width = pad_width;
+  params.padding_values.height = pad_height;
+  params.depth_multiplier = depth_multiplier;
+  params.weights_offset = 0;
+  params.quantized_activation_min = output_activation_min;
+  params.quantized_activation_max = output_activation_max;
+  params.float_activation_max = (float)(1LL << 40);
+  params.float_activation_min = -params.float_activation_max;
+
+  std::vector<std::int16_t> reference_output_data(output_buffer_size);
+  std::vector<std::int16_t> neon_output_data(output_buffer_size);
+
+  std::vector<std::int32_t> output_multiplier(output_depth);
+  std::vector<std::int32_t> output_shift(output_depth);
+
+  // It's hard to come up with a right multiplier, random guess basically makes
+  // all the results saturated and becomes meaningfulless, so we first use
+  // reference impl to poke the min/max value of the accumulation, then use that
+  // value as a guided suggestion for us to populate meaningful mulitplier &
+  // shift.
+  PickReasonableMultiplier(
+      params, output_activation_min, output_activation_max, output_depth,
+      input_shape_inference, input_data.data(), filter_shape_inference,
+      filter_data.data(), bias_shape_inference, bias_data.data(),
+      output_shape_inference, output_multiplier.data(), output_shift.data(),
+      reference_output_data.data());
+
+  // The following tests compare reference impl for 16x8 version and
+  // float reference operator.
+  reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier.data(), output_shift.data(),
+      input_shape_inference, input_data.data(), filter_shape_inference,
+      filter_data.data(), bias_shape_inference, bias_data.data(),
+      output_shape_inference, reference_output_data.data());
+
+  std::vector<float> input_data_float(input_buffer_size);
+  std::vector<float> filter_data_float(filter_buffer_size);
+  std::vector<float> bias_data_float(output_depth);
+  std::vector<float> output_data_float(output_buffer_size);
+
+  for (int i = 0; i < input_buffer_size; i++) {
+    input_data_float.data()[i] = (float)(input_data.data()[i]);
+  }
+  IntToFloat(&filter_data_float, &filter_data);
+  IntToFloat(&bias_data_float, &bias_data);
+
+  reference_ops::DepthwiseConv(
+      params, input_shape_inference, input_data_float.data(),
+      filter_shape_inference, filter_data_float.data(), bias_shape_inference,
+      bias_data_float.data(), output_shape_inference, output_data_float.data());
+
+  for (int n = 0; n < output_shape_inference.Dims(0); n++) {
+    for (int h = 0; h < output_shape_inference.Dims(1); h++) {
+      for (int w = 0; w < output_shape_inference.Dims(2); w++) {
+        for (int c = 0; c < output_shape_inference.Dims(3); c++) {
+          int offset = Offset(output_shape_inference, n, h, w, c);
+          float float_res = output_data_float.data()[offset];
+          int16 int16_res = reference_output_data.data()[offset];
+          int32 output_mul = output_multiplier.data()[c];
+          int shift = output_shift.data()[c];
+          float scale = (float)output_mul / (float)(1ULL << 31);
+          if (shift > 0) scale = scale * (float)(1 << shift);
+          if (shift < 0) scale = scale / (float)(1 << -shift);
+          int ref_res = floor(float_res * scale + 0.5);
+          if (ref_res < output_activation_min) ref_res = output_activation_min;
+          if (ref_res > output_activation_max) ref_res = output_activation_max;
+          int e = (ref_res - int16_res);
+          if (e < 0) e = -e;
+          if (e > 1) {
+            printf(
+                "(%d,%d,%d,%d) scale=%08x shift=%d res=%d float=%f (%f,%f)\n",
+                n, h, w, c, output_mul, shift, int16_res, float_res * scale,
+                float_res, scale);
+            EXPECT_TRUE(false);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(QuantizedDepthwiseConvPerChannelTest, FastKernelTest) {
+  for (int i = 0; i < 30; ++i) {
+    TryTestOneDepthwiseConv3x3Filter();
+  }
+}
+
+}  // namespace
+}  // namespace tflite
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@ -119,6 +119,92 @@ inline void DepthwiseConvPerChannel(
  }
 }

+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int16* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            std::int64_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 64 bits accumulator.
+                  // We assume maximum of 2^16 accumulations as with the 8-bit
+                  // case so actually the value in the accumulator should not
+                  // exceed 40 bits
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            int32 scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
 inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, float* scaling_factors_ptr,
    const RuntimeShape& input_shape, const int8* input_data,