Add gemmlowp-threadpool multithreading to the depthwiseconv implementation for the quantized path.
PiperOrigin-RevId: 239959051
parent 36f817a9f3
commit 152095e319
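In outline, the change gives the optimized quantized kernels (DepthwiseConvGeneral and DepthwiseConv3x3Filter) an explicit slice of work described by thread_start, thread_end and thread_dim, picks whether to split along the batch dimension (dim 0) or the output-row dimension (dim 1), and runs the slices as gemmlowp::Task objects on the worker pool of the shared GemmContext. The sketch below is an illustration of that sharding pattern only, not code from this commit: ProcessSlice stands in for the per-slice kernel body, and it assumes gemmlowp's public header exposes Task, GemmContext and WorkersPool the way the diff below uses them.

// Illustration only: sharding a range of work units across gemmlowp's
// thread pool, mirroring the pattern this commit adds to DepthwiseConv.
#include <algorithm>
#include <vector>

#include "public/gemmlowp.h"

void ProcessSlice(int start, int end);  // hypothetical per-slice worker body

struct SliceTask : public gemmlowp::Task {
  SliceTask(int start, int end) : start_(start), end_(end) {}
  // Run() is invoked on one of the pool's worker threads.
  void Run() override { ProcessSlice(start_, end_); }
  int start_, end_;
};

void RunSharded(gemmlowp::GemmContext* gemm_context, int total_units,
                int desired_threads) {
  const int thread_count =
      std::max(1, std::min(desired_threads, gemm_context->max_num_threads()));
  if (thread_count == 1) {
    ProcessSlice(0, total_units);  // not worth the dispatch overhead
    return;
  }
  std::vector<gemmlowp::Task*> tasks(thread_count);
  int start = 0;
  for (int i = 0; i < thread_count; ++i) {
    // Spread the remaining units evenly over the remaining threads.
    const int end = start + (total_units - start) / (thread_count - i);
    tasks[i] = new SliceTask(start, end);
    start = end;
  }
  // Execute() blocks until every task has run; as used in TFLite, the pool
  // also reclaims the task objects.
  gemm_context->workers_pool()->Execute(tasks);
}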
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -66,6 +67,7 @@ struct OpData {
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  gemm_support::IncrementUsageCounter(context);
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
   // Eval().
@@ -73,6 +75,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
   delete reinterpret_cast<OpData*>(buffer);
 }
 
@@ -230,17 +233,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
-                         const uint8*, const RuntimeShape&, const uint8*,
-                         const RuntimeShape&, const int32*, const RuntimeShape&,
-                         uint8*);
-
-  if (kernel_type == kReference) {
-    depthwise_conv = &reference_ops::DepthwiseConv;
-  } else {
-    depthwise_conv = &optimized_ops::DepthwiseConv;
-  }
-
   DepthwiseParams op_params;
   op_params.padding_type = PaddingType::kSame;
   op_params.padding_values.width = data->padding.width;
@@ -257,11 +249,20 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   op_params.output_shift = -data->output_shift;
   op_params.quantized_activation_min = data->output_activation_min;
   op_params.quantized_activation_max = data->output_activation_max;
-  depthwise_conv(op_params, GetTensorShape(input),
-                 GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                 GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                 GetTensorData<int32_t>(bias), GetTensorShape(output),
-                 GetTensorData<uint8_t>(output));
+  if (kernel_type == kReference) {
+    reference_ops::DepthwiseConv(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<uint8_t>(output));
+  } else {
+    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+    optimized_ops::DepthwiseConv(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<uint8_t>(output), gemm_context);
+  }
 }
 
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
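Note on the dispatch above: the reference path stays single-threaded so it keeps serving as the bit-exact baseline, while the optimized path now fetches the gemmlowp context that gemm_support maintains per TfLiteContext, which is why Init() and Free() gained the IncrementUsageCounter/DecrementUsageCounter calls. The thread count configured on the interpreter through SetNumThreads() is expected to reach the kernel via that shared context; the new multithreaded tests below exercise exactly this with m.SetNumThreads(4).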
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+#include <initializer_list>
 #include <gtest/gtest.h>
 #include "absl/memory/memory.h"
 #include "tensorflow/lite/interpreter.h"
@@ -501,6 +502,172 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
+TEST_P(DepthwiseConvolutionOpTest, MultithreadOnRowUint8GeneralTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 28;
+  const int image_batch_count = 3;
+  const int filter_size = 3;
+  const int filter_count = 1;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  });
+  // clang-format on
+
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 24, 24, 39, 39,
+          45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 21, 21, 6, 6,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 48, 48, 78, 78,
+          90, 90, 90, 90, 90, 90, 90, 90,
+          90, 90, 90, 90, 90, 90, 90, 90,
+          90, 90, 90, 90, 42, 42, 12, 12,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 72, 72, 117, 117,
+          135, 135, 135, 135, 135, 135, 135, 135,
+          135, 135, 135, 135, 135, 135, 135, 135,
+          135, 135, 135, 135, 63, 63, 18, 18,
+          0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0,
+      }));
+  // clang-format on
+}
+
+TEST_P(DepthwiseConvolutionOpTest, MultithreadOnBatchUint8GeneralTest) {
+  const int depth = 1;
+  const int image_width = 8;
+  const int image_height = 4;
+  const int image_batch_count = 6;
+  const int filter_size = 3;
+  const int filter_count = 1;
+
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID);
+
+  // clang-format off
+  m.SetInput({
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+  });
+  // clang-format on
+
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.SetNumThreads(4);
+  m.Invoke();
+
+  // clang-format off
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21,
+
+          39, 39, 39, 39, 39, 39,
+          21, 21, 21, 21, 21, 21
+      }));
+  // clang-format on
+}
+
 class PerChannelQuantizedDepthwiseConvolutionOpModel
     : public BaseDepthwiseConvolutionOpModel {
  public:
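As a sanity check on the expected values above: with the 3x3 filter {1, ..., 9}, zero bias and VALID padding, each output element is simply the sum of the filter weights that land on non-zero input rows. In the batch-split test every image has rows (0, 1, 1, 0), so output row 0 sees the ones under filter rows (4, 5, 6) and (7, 8, 9), giving 4+5+6+7+8+9 = 39, and output row 1 sees them under (1, 2, 3) and (4, 5, 6), giving 21. In the row-split test the windows fully inside the band of ones give 1+2+...+9 = 45, the partially covered windows give 24, 39, 21 and 6, and the second and third batches scale these by 2 and 3.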
@@ -139,7 +139,8 @@ inline void DispatchDepthwiseConv(
       // Call kernel optimized for depthwise convolutions using 3x3 filters.
       optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
 #else
       break;
@@ -242,7 +243,8 @@ inline void DispatchDepthwiseConv(
     case DepthwiseConvImplementation::kUseGenericKernel: {
       optimized_ops::depthwise_conv::DepthwiseConvGeneral(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     }
     case DepthwiseConvImplementation::kNone:
@@ -271,13 +273,15 @@ inline void DispatchDepthwiseConv(
       optimized_ops::DepthwiseConvWithRounding<
           DepthwiseConvOutputRounding::kAwayFromZero>(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     case DepthwiseConvOutputRounding::kUpward:
       optimized_ops::DepthwiseConvWithRounding<
           DepthwiseConvOutputRounding::kUpward>(
           params, input_shape, input_data, filter_shape, filter_data,
-          bias_shape, bias_data, output_shape, output_data);
+          bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0,
+          /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1);
       return;
     default:
       break;
@@ -1662,7 +1662,7 @@ inline void DepthwiseConvGeneral(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int pad_width = params.padding_values.width;
@@ -1684,7 +1684,7 @@ inline void DepthwiseConvGeneral(
   const int input_depth = input_shape.Dims(3);
   const int filter_height = filter_shape.Dims(1);
   const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
+  const int output_rows = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
 #ifdef USE_NEON
   const bool shift_left = (output_shift > 0);
@@ -1700,6 +1700,7 @@ inline void DepthwiseConvGeneral(
                    kAccBufferActualSize);
   TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
   TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
 
   // row_accum_func will point to the core accumulation function to be used
   // for this DepthwiseConv op.
@@ -1766,9 +1767,34 @@ inline void DepthwiseConvGeneral(
   const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
 
   // Now that we have determined row_accum_func, we can start work.
-  uint8* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
+  int batch_start = 0;
+  int batch_end = batches;
+  int row_start = 0;
+  int row_end = output_rows;
+  int output_ptr_offset = 0;
+
+  switch (thread_dim) {
+    case 0:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, batches);
+      batch_start = thread_start;
+      batch_end = thread_end;
+      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+      break;
+    case 1:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, output_rows);
+      row_start = thread_start;
+      row_end = thread_end;
+      output_ptr_offset = row_start * output_width * output_depth;
+      break;
+  }
+
+  uint8* output_ptr = output_data + output_ptr_offset;
+  int batch_step =
+      (output_rows + row_start - row_end) * output_width * output_depth;
+  for (int b = batch_start; b < batch_end; ++b) {
+    for (int out_y = row_start; out_y < row_end; ++out_y) {
       const int in_y_origin = (out_y * stride_height) - pad_height;
       const int filter_y_start =
           std::max(0, (-in_y_origin + dilation_height_factor - 1) /
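The batch_step introduced above compensates for row splitting: when thread_dim == 1 each batch only writes rows [row_start, row_end), so after finishing a batch the output pointer must skip the output_rows - row_end trailing rows it did not produce plus the row_start leading rows of the next batch, i.e. (output_rows + row_start - row_end) * output_width * output_depth elements. With output_rows = 26, output_width = 2, output_depth = 1 and a slice covering rows [6, 12), that is (26 + 6 - 12) * 2 * 1 = 40 elements; when splitting along batches the term is zero and the pointer advances contiguously.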
@@ -1944,6 +1970,7 @@ inline void DepthwiseConvGeneral(
         }
       }
     }
+    output_ptr += batch_step;
   }
 }
 
@@ -1955,7 +1982,7 @@ inline void DepthwiseConvWithRounding(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
   const int depth_multiplier = params.depth_multiplier;
   const int32 output_activation_min = params.quantized_activation_min;
@@ -1991,7 +2018,8 @@ inline void DepthwiseConvWithRounding(
     gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
     depthwise_conv::DepthwiseConv3x3Filter(
         params, input_shape, input_data, filter_shape, filter_data, bias_shape,
-        bias_data, output_shape, output_data);
+        bias_data, output_shape, output_data, thread_start, thread_end,
+        thread_dim);
     return;
   }
 #endif
@@ -2000,7 +2028,77 @@ inline void DepthwiseConvWithRounding(
       "DepthwiseConv/8bit/General");
   depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
                                        filter_shape, filter_data, bias_shape,
-                                       bias_data, output_shape, output_data);
+                                       bias_data, output_shape, output_data,
+                                       thread_start, thread_end, thread_dim);
+}
+
+inline void DepthwiseConvImpl(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
+  return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
+      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+      bias_data, output_shape, output_data, thread_start, thread_end,
+      thread_dim);
+}
+
+template <typename T, typename TS>
+struct DepthwiseConvWorkerTask : public gemmlowp::Task {
+  DepthwiseConvWorkerTask(const DepthwiseParams& params,
+                          const RuntimeShape& input_shape, const T* input_data,
+                          const RuntimeShape& filter_shape,
+                          const T* filter_data, const RuntimeShape& bias_shape,
+                          const TS* bias_data, const RuntimeShape& output_shape,
+                          T* output_data, int thread_start, int thread_end,
+                          int thread_dim)
+      : params_(params),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        thread_start_(thread_start),
+        thread_end_(thread_end),
+        thread_dim_(thread_dim) {}
+
+  void Run() override {
+    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
+                      filter_data_, bias_shape_, bias_data_, output_shape_,
+                      output_data_, thread_start_, thread_end_, thread_dim_);
+  }
+
+ private:
+  const DepthwiseParams& params_;
+  const RuntimeShape& input_shape_;
+  const T* input_data_;
+  const RuntimeShape& filter_shape_;
+  const T* filter_data_;
+  const RuntimeShape& bias_shape_;
+  const TS* bias_data_;
+  const RuntimeShape& output_shape_;
+  T* output_data_;
+  int thread_start_;
+  int thread_end_;
+  int thread_dim_;
+};
+
+inline int HowManyConvThreads(const RuntimeShape& output_shape,
+                              const RuntimeShape& filter_shape,
+                              int thread_dim) {
+  constexpr int kMinMulPerThread = 8;
+  const int output_units = output_shape.Dims(thread_dim);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int num_mul_per_unit =
+      FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
+  const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
+  int thread_count = output_units / min_units_per_thread;
+  return thread_count;
 }
 
 inline void DepthwiseConv(
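HowManyConvThreads is a coarse cap rather than a precise cost model: with kMinMulPerThread = 8 the heuristic only refuses to split work whose per-unit multiply count is tiny. For the row-split test above (output shape [3, 26, 2, 1], 3x3 filter, thread_dim = 1) it computes num_mul_per_unit = 3 * 2 * 1 * 9 = 54, hence min_units_per_thread = 8 / 54 + 1 = 1 and up to 26 candidate threads; the caller then clamps that to the gemmlowp context's max_num_threads (4 in the tests).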
@@ -2008,10 +2106,50 @@ inline void DepthwiseConv(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
-      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
-      bias_data, output_shape, output_data);
+    uint8* output_data, gemmlowp::GemmContext* gemm_context = nullptr) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int output_batches = output_shape.Dims(0);
+  const int output_rows = output_shape.Dims(1);
+  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
+  int thread_dim, thread_count, thread_dim_size;
+  if (thread_count_batch > thread_count_row) {
+    thread_dim = 0;
+    thread_dim_size = output_batches;
+    thread_count = thread_count_batch;
+  } else {
+    thread_dim = 1;
+    thread_dim_size = output_rows;
+    thread_count = thread_count_row;
+  }
+
+  const int max_threads = gemm_context ? gemm_context->max_num_threads() : 1;
+  thread_count = std::max(1, std::min(thread_count, max_threads));
+
+  if (thread_count == 1) {
+    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
+                      filter_data, bias_shape, bias_data, output_shape,
+                      output_data, /*thread_start=*/0,
+                      /*thread_end=*/output_rows, /*thread_dim=*/1);
+  } else {
+    std::vector<gemmlowp::Task*> tasks(thread_count);
+    int thread_start = 0;
+    for (int i = 0; i < thread_count; ++i) {
+      int thread_end =
+          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+      tasks[i] = new DepthwiseConvWorkerTask<uint8, int32>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data, thread_start,
+          thread_end, thread_dim);
+      thread_start = thread_end;
+    }
+    gemm_context->workers_pool()->Execute(tasks);
+  }
 }
 
 }  // namespace optimized_ops
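The slice boundaries above are chosen so that the remaining units are always divided as evenly as possible: thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i). Splitting 26 output rows across 4 threads this way yields slices of 6, 6, 7 and 7 rows, and the last slice always ends exactly at thread_dim_size, so no unit is dropped or processed twice. Note also that gemm_context defaults to nullptr, in which case max_threads is 1 and the call degenerates to the single-threaded DepthwiseConvImpl path.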
@@ -3535,7 +3535,7 @@ inline void DepthwiseConv3x3Filter(
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
   gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
   DepthwiseConvParams params;
 
@@ -3586,6 +3586,7 @@ inline void DepthwiseConv3x3Filter(
   TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
   TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
   TFLITE_DCHECK(pad_width == pad_height);
+  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);
 
   const int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int64_t input_batch_size = params.input_row_size * params.input_height;
@@ -3619,14 +3620,35 @@ inline void DepthwiseConv3x3Filter(
   // used in gemmlowp.
   uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
 
-  for (int32 b = 0; b < batches; ++b) {
+  int batch_start = 0;
+  int batch_end = batches;
+  int row_start = 0;
+  int row_end = params.output_height;
+
+  switch (thread_dim) {
+    case 0:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, batches);
+      batch_start = thread_start;
+      batch_end = thread_end;
+      break;
+    case 1:
+      TFLITE_DCHECK_GE(thread_start, 0);
+      TFLITE_DCHECK_LE(thread_end, params.output_height);
+      row_start = thread_start;
+      row_end = thread_end;
+      break;
+  }
+
+  for (int32 b = batch_start; b < batch_end; ++b) {
     const uint8* input_ptr = input_data + b * input_batch_size;
-    uint8* output_ptr = output_data + b * output_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size +
+                        row_start * params.output_width * params.output_depth;
 
     int32 out_x = 0;
-    int32 out_y = 0;
+    int32 out_y = row_start;
     int32 end_x = params.output_width;
-    int32 end_y = params.output_height;
+    int32 end_y = row_end;
 
     if (pad_width == 1 && pad_height == 1) {
       DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr,
@@ -3635,8 +3657,8 @@ inline void DepthwiseConv3x3Filter(
       // Update extents now that the edges have been handled.
       out_x = 1;
       end_x = params.output_width - 1;
-      out_y = 1;
-      end_y = params.output_height - 1;
+      out_y = std::max(1, out_y);
+      end_y = std::min(params.output_height - 1, end_y);
       const int in_x = (out_x * stride_width) - pad_width;
       const int in_y = (out_y * stride_height) - pad_height;
       input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
@@ -234,9 +234,14 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
   op_params.output_shift = kDepthwiseReverseShift * output_shift;
 
-  DepthwiseConv(op_params, DimsToShape(input_dims), input_data,
-                DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                bias_data, DimsToShape(output_dims), output_data);
+  const RuntimeShape output_shape = DimsToShape(output_dims);
+  const int output_height = output_shape.Dims(1);
+
+  DepthwiseConvImpl(op_params, DimsToShape(input_dims), input_data,
+                    DimsToShape(filter_dims), filter_data,
+                    DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+                    output_data, /*thread_start=*/0,
+                    /*thread_end=*/output_height, /*thread_dim=*/1);
 }
 
 inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
@@ -327,6 +327,7 @@ class SingleOpModel {
   }
 
   void SetNumThreads(int num_threads) {
+    CHECK(interpreter_ != nullptr);
     interpreter_->SetNumThreads(num_threads);
   }
 