From 3a3b1132d7b1ed02b28f24db9db7f430ac2614a7 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar
Date: Tue, 19 Mar 2019 15:13:49 -0700
Subject: [PATCH] Optimized per-channel convolution.

PiperOrigin-RevId: 239279531
---
 tensorflow/lite/kernels/conv.cc               |  83 +++++-
 tensorflow/lite/kernels/conv_test.cc          |   2 +-
 tensorflow/lite/kernels/internal/BUILD        |   3 +
 .../kernels/internal/optimized/im2col_utils.h | 235 ++++++++++++++++++
 .../internal/optimized/integer_ops/conv.h     | 159 ++++++++++++
 .../internal/optimized/optimized_ops.h        | 209 +---------------
 .../internal/reference/integer_ops/conv.h     |   2 +-
 tensorflow/workspace.bzl                      |   8 +-
 8 files changed, 468 insertions(+), 233 deletions(-)
 create mode 100644 tensorflow/lite/kernels/internal/optimized/im2col_utils.h
 create mode 100644 tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h

diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index c1c19a1ed18..fb6b30eac84 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -24,12 +24,14 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
 #include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -495,27 +497,70 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
 }
 
+template <KernelType kernel_type>
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                              TfLiteConvParams* params, OpData* data,
                              TfLiteTensor* input, TfLiteTensor* filter,
-                             TfLiteTensor* bias, TfLiteTensor* output) {
-  ConvParams op_params;
-  op_params.input_offset = input->params.zero_point;
-  op_params.output_offset = output->params.zero_point;
-  op_params.stride_height = params->stride_height;
-  op_params.stride_width = params->stride_width;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.padding_values.height = data->padding.height;
-  op_params.padding_values.width = data->padding.width;
+                             TfLiteTensor* bias, TfLiteTensor* output,
+                             TfLiteTensor* im2col) {
+  KernelType effective_kernel_type;
+  effective_kernel_type = kernel_type;
 
-  reference_integer_ops::ConvPerChannel(
-      op_params, data->per_channel_output_multiplier.data(),
-      data->per_channel_output_shift.data(), GetTensorShape(input),
-      GetTensorData<int8>(input), GetTensorShape(filter),
-      GetTensorData<int8>(filter), GetTensorShape(bias),
-      GetTensorData<int32>(bias), GetTensorShape(output),
-      GetTensorData<int8>(output));
+  // If not running on NEON we force a fallback to the reference kernels, until
+  // we have optimized support on other platforms.
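+  // (Clarifying note: all three optimized kernel types funnel into the same
+  // gemmlowp-based path below, which currently exists only for NEON; without
+  // GEMMLOWP_NEON they all fall back to the reference per-channel kernel.)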
+#ifndef GEMMLOWP_NEON
+  effective_kernel_type = kReference;
+#endif
+
+  switch (effective_kernel_type) {
+    case kReference: {
+      ConvParams op_params;
+      // Note the negated zero point: the per-channel kernels compute
+      // input_val + input_offset, so we pass input_offset = -zero_point.
+      op_params.input_offset = -input->params.zero_point;
+      op_params.output_offset = output->params.zero_point;
+      op_params.stride_height = params->stride_height;
+      op_params.stride_width = params->stride_width;
+      op_params.dilation_height_factor = params->dilation_height_factor;
+      op_params.dilation_width_factor = params->dilation_width_factor;
+      op_params.padding_values.height = data->padding.height;
+      op_params.padding_values.width = data->padding.width;
+
+      reference_integer_ops::ConvPerChannel(
+          op_params, data->per_channel_output_multiplier.data(),
+          data->per_channel_output_shift.data(), GetTensorShape(input),
+          GetTensorData<int8>(input), GetTensorShape(filter),
+          GetTensorData<int8>(filter), GetTensorShape(bias),
+          GetTensorData<int32>(bias), GetTensorShape(output),
+          GetTensorData<int8>(output));
+      break;
+    }
+    case kGenericOptimized:
+    case kMultithreadOptimized:
+    case kCblasOptimized: {
+#ifdef GEMMLOWP_NEON
+      gemmlowp::GemmContext* gemm_context =
+          gemm_support::GetFromContext(context);
+      ConvParams op_params;
+      op_params.input_offset = -input->params.zero_point;
+      op_params.output_offset = output->params.zero_point;
+      op_params.stride_height = params->stride_height;
+      op_params.stride_width = params->stride_width;
+      op_params.dilation_height_factor = params->dilation_height_factor;
+      op_params.dilation_width_factor = params->dilation_width_factor;
+      op_params.padding_values.height = data->padding.height;
+      op_params.padding_values.width = data->padding.width;
+
+      optimized_integer_ops::ConvPerChannel(
+          op_params, data->per_channel_output_multiplier.data(),
+          data->per_channel_output_shift.data(), GetTensorShape(input),
+          GetTensorData<int8>(input), GetTensorShape(filter),
+          GetTensorData<int8>(filter), GetTensorShape(bias),
+          GetTensorData<int32>(bias), GetTensorShape(output),
+          GetTensorData<int8>(output), GetTensorShape(im2col),
+          GetTensorData<int8>(im2col), gemm_context);
+#endif
+      break;
+    }
+  }
+}
 
 template <KernelType kernel_type>
@@ -707,8 +752,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                              bias, im2col, hwcn_weights, output);
       break;
     case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
-                              output);
+      EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
+                                           filter, bias, output, im2col);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index 7c562a530ab..e36c251e7a1 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -1130,7 +1130,7 @@ class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel {
   }
 };
 
-TEST_P(ConvolutionOpTest, SimpleTest) {
+TEST_P(ConvolutionOpTest, SimplePerChannelTest) {
   PerChannelQuantizedConvolutionOpModel m(
       GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
       {TensorType_INT8,
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 4a18ee3c097..6588e9d89e3 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -175,6 +175,8 @@ cc_library(
         "optimized/depthwiseconv_float.h",
         "optimized/depthwiseconv_uint8.h",
         "optimized/depthwiseconv_uint8_3x3_filter.h",
+        "optimized/im2col_utils.h",
+        "optimized/integer_ops/conv.h",
        "optimized/optimized_ops.h",
    ],
    copts = tflite_copts(),
@@ -209,6 +211,7 @@ cc_library(
        "optimized/depthwiseconv_float.h",
        "optimized/depthwiseconv_uint8.h",
        "optimized/depthwiseconv_uint8_3x3_filter.h",
+        "optimized/im2col_utils.h",
        "optimized/legacy_optimized_ops.h",
        "optimized/optimized_ops.h",
    ],
diff --git a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h
new file mode 100644
index 00000000000..e3600a783ff
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h
@@ -0,0 +1,235 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
+                                         int h, int b, int kheight, int kwidth,
+                                         int stride_width, int stride_height,
+                                         int pad_width, int pad_height,
+                                         int in_width, int in_height,
+                                         int in_depth, int single_buffer_length,
+                                         int buffer_id, const T* in_data,
+                                         T* conv_buffer_data, uint8 zero_byte) {
+  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  // This chunk of code reshapes all the inputs corresponding to
+  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
+  const int kwidth_times_indepth = kwidth * in_depth;
+  const int inwidth_times_indepth = in_width * in_depth;
+  const int ih_ungated_start = h * stride_height - pad_height;
+  const int ih_ungated_end = (ih_ungated_start + kheight);
+  const int ih_end = std::min(ih_ungated_end, in_height);
+  const int iw_ungated_start = w * stride_width - pad_width;
+  const int iw_ungated_end = (iw_ungated_start + kwidth);
+  const int iw_end = std::min(iw_ungated_end, in_width);
+  // If the patch is off the edge of the input image, skip writing those rows
+  // and columns from the patch into the output array.
+  const int h_offset = std::max(0, -ih_ungated_start);
+  const int w_offset = std::max(0, -iw_ungated_start);
+  const int ih_start = std::max(0, ih_ungated_start);
+  const int iw_start = std::max(0, iw_ungated_start);
+  const int single_row_num =
+      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
+  const int output_row_offset = (buffer_id * single_buffer_length);
+  int out_offset =
+      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
+  int in_offset = Offset(input_shape, b, ih_start, iw_start, 0);
+
+  // Express all of the calculations as padding around the input patch.
+  // (Worked example, added for clarity: with stride_height = 1, kheight = 3
+  // and pad_height = 1, the h = 0 patch has ih_ungated_start = -1, giving
+  // h_offset = 1, i.e. one zero-filled row above the first copied input row.)
+  const int top_padding = h_offset;
+  const int bottom_padding = (ih_ungated_end - ih_end);
+  const int left_padding = w_offset;
+  const int right_padding = (iw_ungated_end - iw_end);
+  assert(single_row_num ==
+         ((kwidth - (left_padding + right_padding)) * in_depth));
+
+  // Write out zeroes to the elements representing the top rows of the input
+  // patch that are off the edge of the input image.
+  if (top_padding > 0) {
+    const int top_row_elements = (top_padding * kwidth * in_depth);
+    memset(conv_buffer_data + output_row_offset, zero_byte,
+           (top_row_elements * sizeof(T)));
+  }
+
+  // If the patch is on the interior of the input image horizontally, just copy
+  // over the rows sequentially, otherwise add zero padding at the start or end.
+  if ((left_padding == 0) && (right_padding == 0)) {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
+    }
+  } else {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      if (left_padding > 0) {
+        const int left_start = (out_offset - (left_padding * in_depth));
+        memset(conv_buffer_data + left_start, zero_byte,
+               (left_padding * in_depth * sizeof(T)));
+      }
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      if (right_padding > 0) {
+        const int right_start = (out_offset + single_row_num);
+        memset(conv_buffer_data + right_start, zero_byte,
+               (right_padding * in_depth * sizeof(T)));
+      }
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
+    }
+  }
+
+  // If the bottom of the patch falls off the input image, pad the values
+  // representing those input rows with zeroes.
+  if (bottom_padding > 0) {
+    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
+    const int bottom_start =
+        output_row_offset +
+        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+    memset(conv_buffer_data + bottom_start, zero_byte,
+           (bottom_row_elements * sizeof(T)));
+  }
+}
+
+template <typename T>
+void DilatedIm2col(const ConvParams& params, uint8 zero_byte,
+                   const RuntimeShape& input_shape, const T* input_data,
+                   const RuntimeShape& filter_shape,
+                   const RuntimeShape& output_shape, T* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  // For dilated convolution, the input pixels are not contiguous, therefore we
+  // can't use the same optimizations as Im2Col(). Though note this code would
+  // work fine for the non-dilated case too (though likely a bit slower).
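+  // (Illustrative sizing, not in the original comment: for a 1x5x5x8 input
+  // and a 3x3 filter with dilation factor 2, each output pixel becomes one
+  // im2col row of Kh * Kw * Din = 3 * 3 * 8 = 72 values, gathered from input
+  // rows in_y, in_y + 2 and in_y + 4.)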
+  gemmlowp::ScopedProfilingLabel label("DilatedIm2col");
+  TFLITE_DCHECK(dilation_width_factor != 1 || dilation_height_factor != 1);
+  TFLITE_DCHECK(im2col_data);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  MatchingDim(output_shape, 3, filter_shape, 0);
+
+  // Construct the MxN sized im2col matrix.
+  // The rows M, are sub-ordered B x H x W
+  const RuntimeShape row_shape({1, batches, output_height, output_width});
+  // The columns, N, are sub-ordered Kh x Kw x Din
+  const RuntimeShape col_shape({1, filter_height, filter_width, input_depth});
+  // Use dimensions M and N to construct dims for indexing directly into im2col
+  const RuntimeShape im2col_shape(
+      {1, 1, row_shape.FlatSize(), col_shape.FlatSize()});
+
+  // Loop through the output rows (B x H x W)
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        // Each im2col row is an output pixel. Arrange the input data in this
+        // row in an order we can conveniently multiply with the filter data.
+        int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        // Loop through all the pixels of the filter (Kh x Kw)
+        for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+          const int in_y = in_y_origin + dilation_height_factor * filter_y;
+          if ((in_y >= 0) && (in_y < input_height)) {
+            // Filter row is within the input data.
+            // Loop through all the filter pixels in this row.
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
+              T* dst = im2col_data +
+                       Offset(im2col_shape, 0, 0, row_offset, col_offset);
+              if ((in_x >= 0) && (in_x < input_width)) {
+                // Filter pixel is within the input, copy the input data.
+                T const* src =
+                    input_data + Offset(input_shape, batch, in_y, in_x, 0);
+                memcpy(dst, src, input_depth * sizeof(T));
+              } else {
+                // Filter pixel is outside the input, zero it out.
+                memset(dst, zero_byte, input_depth * sizeof(T));
+              }
+            }
+          } else {
+            // Filter row is outside the input, zero out the entire filter row.
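+            // (Clarifying note: the Kw * Din entries for this filter row are
+            // contiguous within the im2col row, so one memset covers them.)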
+            int col_offset = Offset(col_shape, 0, filter_y, 0, 0);
+            T* dst = im2col_data +
+                     Offset(im2col_shape, 0, 0, row_offset, col_offset);
+            memset(dst, zero_byte, filter_width * input_depth * sizeof(T));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte,
+            const RuntimeShape& input_shape, const T* input_data,
+            const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Im2col");
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int output_depth = output_shape.Dims(3);
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+
+  int buffer_id = 0;
+  // Loop over the output nodes.
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < output_height; ++h) {
+      for (int w = 0; w < output_width; ++w) {
+        ExtractPatchIntoBufferColumn(
+            input_shape, w, h, b, kheight, kwidth, stride_width, stride_height,
+            pad_width, pad_height, input_width, input_height, input_depth,
+            output_depth, buffer_id, input_data, output_data, zero_byte);
+        ++buffer_id;
+      }
+    }
+  }
+}
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h
new file mode 100644
index 00000000000..a2fa85bd731
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h
@@ -0,0 +1,159 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
+
+#ifdef GEMMLOWP_NEON
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "public/map.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_integer_ops {
+
+struct GemmlowpOutputPipelineFixedPointPCLhs {
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+                     gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
+                         gemmlowp::VectorShape::Col>,
+                     gemmlowp::OutputStageClamp,
+                     gemmlowp::OutputStageSaturatingCastToInt8>
+      Pipeline;
+  static Pipeline MakeExp(const int32* bias_data, int output_rows,
+                          const int32 output_offset,
+                          const int32* output_multiplier,
+                          const int* output_left_shift,
+                          int32 output_activation_min,
+                          int32 output_activation_max) {
+    ColVectorMap bias_vector(bias_data, output_rows);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+
+    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
+        gemmlowp::VectorShape::Col>
+        quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_offset;
+    quantize_down_stage.result_fixedpoint_multiplier =
+        ColVectorMap(output_multiplier, output_rows);
+    quantize_down_stage.result_exponent =
+        ColVectorMap(output_left_shift, output_rows);
+
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = output_activation_min;
+    clamp_stage.max = output_activation_max;
+    gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           clamp_stage, saturating_cast_stage);
+  }
+};
+
+// Fixed-point per-channel-quantization convolution kernel.
+inline void ConvPerChannel(
+    const ConvParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
+    const RuntimeShape& im2col_shape, int8* im2col_data,
+    gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("Conv/8bit");
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int32 input_offset = params.input_offset;
+  const int32 output_offset = params.output_offset;
+  // Set min and max value of the output.
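+  // (Clarifying note: for this int8 kernel the bounds below are simply the
+  // full representable range, [-128, 127], fed to the clamp stage.)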
+  static constexpr int32 output_activation_min =
+      std::numeric_limits<int8_t>::min();
+  static constexpr int32 output_activation_max =
+      std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int8* gemm_input_data = nullptr;
+  const RuntimeShape* gemm_input_shape = nullptr;
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  const int8 input_zero_point = -input_offset;
+  TFLITE_DCHECK_GE(input_zero_point, output_activation_min);
+  TFLITE_DCHECK_LE(input_zero_point, output_activation_max);
+  const uint8 zero_point_byte =
+      *reinterpret_cast<const uint8*>(&input_zero_point);
+  if (need_dilated_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::DilatedIm2col(params, zero_point_byte, input_shape,
+                                 input_data, filter_shape, output_shape,
+                                 im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::Im2col(params, filter_height, filter_width, zero_point_byte,
+                          input_shape, input_data, im2col_shape, im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_shape = &im2col_shape;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_shape = &input_shape;
+  }
+
+  const int gemm_input_rows = gemm_input_shape->Dims(3);
+  const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
+  const int filter_rows = filter_shape.Dims(0);
+  const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
+  const int output_rows = output_shape.Dims(3);
+  // See b/79927784.
+  // const int output_cols = FlatSizeSkipDim(output_shape, 3);
+  const int output_cols =
+      output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, filter_rows, filter_cols);
+  gemmlowp::MatrixMap<const int8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      gemm_input_data, gemm_input_rows, gemm_input_cols);
+  gemmlowp::MatrixMap<int8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols);
+
+  const auto& output_pipeline = GemmlowpOutputPipelineFixedPointPCLhs::MakeExp(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+
+  gemmlowp::GemmWithOutputPipeline<
+      int8, int8, gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix,
+      /*filter_offset*/ 0, input_offset, output_pipeline);
+}
+
+}  // namespace optimized_integer_ops
+}  // namespace tflite
+
+#endif  // GEMMLOWP_NEON
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index d234d537f3d..2ec75ee7ad5 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/round.h"
@@ -1969,214 +1970,6 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
-template <typename T>
-inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
-                                         int h, int b, int kheight, int kwidth,
-                                         int stride_width, int stride_height,
-                                         int pad_width, int pad_height,
-                                         int in_width, int in_height,
-                                         int in_depth, int single_buffer_length,
-                                         int buffer_id, const T* in_data,
-                                         T* conv_buffer_data, uint8 zero_byte) {
-  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  // This chunk of code reshapes all the inputs corresponding to
-  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
-  const int kwidth_times_indepth = kwidth * in_depth;
-  const int inwidth_times_indepth = in_width * in_depth;
-  const int ih_ungated_start = h * stride_height - pad_height;
-  const int ih_ungated_end = (ih_ungated_start + kheight);
-  const int ih_end = std::min(ih_ungated_end, in_height);
-  const int iw_ungated_start = w * stride_width - pad_width;
-  const int iw_ungated_end = (iw_ungated_start + kwidth);
-  const int iw_end = std::min(iw_ungated_end, in_width);
-  // If the patch is off the edge of the input image, skip writing those rows
-  // and columns from the patch into the output array.
-  const int h_offset = std::max(0, -ih_ungated_start);
-  const int w_offset = std::max(0, -iw_ungated_start);
-  const int ih_start = std::max(0, ih_ungated_start);
-  const int iw_start = std::max(0, iw_ungated_start);
-  const int single_row_num =
-      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
-  const int output_row_offset = (buffer_id * single_buffer_length);
-  int out_offset =
-      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
-  int in_offset = Offset(input_shape, b, ih_start, iw_start, 0);
-
-  // Express all of the calculations as padding around the input patch.
-  const int top_padding = h_offset;
-  const int bottom_padding = (ih_ungated_end - ih_end);
-  const int left_padding = w_offset;
-  const int right_padding = (iw_ungated_end - iw_end);
-  assert(single_row_num ==
-         ((kwidth - (left_padding + right_padding)) * in_depth));
-
-  // Write out zeroes to the elements representing the top rows of the input
-  // patch that are off the edge of the input image.
-  if (top_padding > 0) {
-    const int top_row_elements = (top_padding * kwidth * in_depth);
-    memset(conv_buffer_data + output_row_offset, zero_byte,
-           (top_row_elements * sizeof(T)));
-  }
-
-  // If the patch is on the interior of the input image horizontally, just copy
-  // over the rows sequentially, otherwise add zero padding at the start or end.
-  if ((left_padding == 0) && (right_padding == 0)) {
-    for (int ih = ih_start; ih < ih_end; ++ih) {
-      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
-             single_row_num * sizeof(T));
-      out_offset += kwidth_times_indepth;
-      in_offset += inwidth_times_indepth;
-    }
-  } else {
-    for (int ih = ih_start; ih < ih_end; ++ih) {
-      if (left_padding > 0) {
-        const int left_start = (out_offset - (left_padding * in_depth));
-        memset(conv_buffer_data + left_start, zero_byte,
-               (left_padding * in_depth * sizeof(T)));
-      }
-      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
-             single_row_num * sizeof(T));
-      if (right_padding > 0) {
-        const int right_start = (out_offset + single_row_num);
-        memset(conv_buffer_data + right_start, zero_byte,
-               (right_padding * in_depth * sizeof(T)));
-      }
-      out_offset += kwidth_times_indepth;
-      in_offset += inwidth_times_indepth;
-    }
-  }
-
-  // If the bottom of the patch falls off the input image, pad the values
-  // representing those input rows with zeroes.
-  if (bottom_padding > 0) {
-    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
-    const int bottom_start =
-        output_row_offset +
-        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
-    memset(conv_buffer_data + bottom_start, zero_byte,
-           (bottom_row_elements * sizeof(T)));
-  }
-}
-
-template <typename T>
-void DilatedIm2col(const ConvParams& params, uint8 zero_byte,
-                   const RuntimeShape& input_shape, const T* input_data,
-                   const RuntimeShape& filter_shape,
-                   const RuntimeShape& output_shape, T* im2col_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  // For dilated convolution, the input pixels are not contiguous therefore we
-  // can't use the same opitimizations as Im2Col(). Though note this code would
-  // work fine for the non-dilated case too (though likely a bit slower).
-  gemmlowp::ScopedProfilingLabel label("DilatedIm2col");
-  TFLITE_DCHECK(dilation_width_factor != 1 || dilation_height_factor != 1);
-  TFLITE_DCHECK(im2col_data);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  MatchingDim(output_shape, 3, filter_shape, 0);
-
-  // Construct the MxN sized im2col matrix.
-  // The rows M, are sub-ordered B x H x W
-  const RuntimeShape row_shape({1, batches, output_height, output_width});
-  // The columns, N, are sub-ordered Kh x Kw x Din
-  const RuntimeShape col_shape({1, filter_height, filter_width, input_depth});
-  // Use dimensions M and N to construct dims for indexing directly into im2col
-  const RuntimeShape im2col_shape(
-      {1, 1, row_shape.FlatSize(), col_shape.FlatSize()});
-
-  // Loop through the output rows (B x H x W)
-  for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        // Each im2col row is an output pixel. Arrange the input data in this
-        // row in an order we can conveniently multiply with the filter data.
-        int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
-        const int in_x_origin = (out_x * stride_width) - pad_width;
-        const int in_y_origin = (out_y * stride_height) - pad_height;
-        // Loop through all the pixels of the filter (Kh x Kw)
-        for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-          const int in_y = in_y_origin + dilation_height_factor * filter_y;
-          if ((in_y >= 0) && (in_y < input_height)) {
-            // Filter row is within the input data.
-            // Loop through all the filter pixels in this row.
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              const int in_x = in_x_origin + dilation_width_factor * filter_x;
-              int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
-              T* dst = im2col_data +
-                       Offset(im2col_shape, 0, 0, row_offset, col_offset);
-              if ((in_x >= 0) && (in_x < input_width)) {
-                // Filter pixel is within the input, copy the input data.
-                T const* src =
-                    input_data + Offset(input_shape, batch, in_y, in_x, 0);
-                memcpy(dst, src, input_depth * sizeof(T));
-              } else {
-                // Filter pixel is outside the input, zero it out.
-                memset(dst, zero_byte, input_depth * sizeof(T));
-              }
-            }
-          } else {
-            // Filter row is outside the input, zero out the entire filter row.
-            int col_offset = Offset(col_shape, 0, filter_y, 0, 0);
-            T* dst = im2col_data +
-                     Offset(im2col_shape, 0, 0, row_offset, col_offset);
-            memset(dst, zero_byte, filter_width * input_depth * sizeof(T));
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte,
-            const RuntimeShape& input_shape, const T* input_data,
-            const RuntimeShape& output_shape, T* output_data) {
-  gemmlowp::ScopedProfilingLabel label("Im2col");
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int input_depth = input_shape.Dims(3);
-  const int input_width = input_shape.Dims(2);
-  const int input_height = input_shape.Dims(1);
-  const int output_depth = output_shape.Dims(3);
-  const int output_width = output_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-
-  int buffer_id = 0;
-  // Loop over the output nodes.
-  for (int b = 0; b < batches; ++b) {
-    for (int h = 0; h < output_height; ++h) {
-      for (int w = 0; w < output_width; ++w) {
-        ExtractPatchIntoBufferColumn(
-            input_shape, w, h, b, kheight, kwidth, stride_width, stride_height,
-            pad_width, pad_height, input_width, input_height, input_depth,
-            output_depth, buffer_id, input_data, output_data, zero_byte);
-        ++buffer_id;
-      }
-    }
-  }
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const float* input_data, const RuntimeShape& filter_shape,
                  const float* filter_data, const RuntimeShape& bias_shape,
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
index 4ad2a70b31c..270b91f7296 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -100,7 +100,7 @@ inline void ConvPerChannel(
               // we have seen so far.
               // TODO(jianlijianli): Add a check to make sure the
               // accumulator depth is smaller than 2^16.
-              acc += filter_val * (input_val - input_offset);
+              acc += filter_val * (input_val + input_offset);
             }
           }
         }
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 930914148a4..6b94bc79850 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -216,11 +216,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "gemmlowp",
-        sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
-        strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
+        sha256 = "4da5404de25eeda40e7ceb18cf4ac1ce935db91c61ca2b4b84ef9d03e0ad1d4c",
+        strip_prefix = "gemmlowp-1bf3b9c582c70bddb07b8004fc031d9765684f79",
         urls = [
-            "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
-            "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+            "https://mirror.bazel.build/github.com/google/gemmlowp/archive/1bf3b9c582c70bddb07b8004fc031d9765684f79.zip",
+            "https://github.com/google/gemmlowp/archive/1bf3b9c582c70bddb07b8004fc031d9765684f79.zip",
        ],
    )
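
Note (editorial sketch, not part of the committed patch): the per-channel
pipeline above consumes one int32 fixed-point multiplier and one exponent per
output channel (data->per_channel_output_multiplier / _shift in conv.cc).
Below is a minimal sketch of how such a pair is typically derived from the
float rescale factor input_scale * filter_scale[c] / output_scale, mirroring
the behavior of tflite::QuantizeMultiplier; the function name here is
illustrative, not from this patch.

  #include <cmath>
  #include <cstdint>

  // Decompose scale as quantized_multiplier * 2^(exponent - 31), with the
  // multiplier held as a 31-fractional-bit fixed-point value in [2^30, 2^31).
  void DecomposeScale(double scale, int32_t* quantized_multiplier,
                      int* exponent) {
    if (scale == 0.0) {
      *quantized_multiplier = 0;
      *exponent = 0;
      return;
    }
    // frexp gives scale = q * 2^exponent with q in [0.5, 1).
    const double q = std::frexp(scale, exponent);
    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) {  // q rounded up to exactly 1.0
      q_fixed /= 2;
      ++*exponent;
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
  }

For example, scale = 0.75 gives q = 0.75 and exponent = 0, so the multiplier
is round(0.75 * 2^31) = 1610612736; the gemmlowp quantize-down stage then
applies the fixed-point multiply followed by a shift by the per-channel
exponent, which is exactly what OutputStageScaleInt32ByFixedPointAndExponentPC
does with the result_fixedpoint_multiplier and result_exponent vectors above.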