From dd473010edc8959354128138e42628f8c87f0fa2 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 5 Mar 2019 15:53:49 -0800 Subject: [PATCH] Optimize depthwise conv for particular filter width used in micro speech example PiperOrigin-RevId: 236937295 --- .../sparkfun_edge/command_responder.cc | 16 +- .../lite/experimental/micro/kernels/BUILD | 49 ++ .../micro/kernels/depthwise_conv_test.cc | 85 ++++ .../portable_optimized/depthwise_conv.cc | 439 ++++++++++++++++++ .../experimental/micro/tools/make/Makefile | 4 +- .../make/targets/apollo3evb_makefile.inc | 3 + .../micro/tools/make/targets/osx_makefile.inc | 3 + 7 files changed, 594 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc index f1ad9b017e9..78469f2b7d7 100644 --- a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc +++ b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc @@ -33,21 +33,29 @@ void RespondToCommand(tflite::ErrorReporter* error_reporter, } static int count = 0; - // Toggle the yellow LED every time an inference is performed. + // Toggle the blue LED every time an inference is performed. ++count; if (count & 1) { - am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_set(AM_BSP_GPIO_LED_BLUE); } else { - am_hal_gpio_output_clear(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_BLUE); } - // Turn on the red LED if 'yes' was heard. + // Turn on the yellow LED if 'yes' was heard. am_hal_gpio_output_clear(AM_BSP_GPIO_LED_RED); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_GREEN); if (is_new_command) { error_reporter->Report("Heard %s (%d) @%dms", found_command, score, current_time); if (found_command[0] == 'y') { + am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW); + } + if (found_command[0] == 'n') { am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED); } + if (found_command[0] == 'u') { + am_hal_gpio_output_set(AM_BSP_GPIO_LED_GREEN); + } } } diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD index e2d3164d4c3..451eed28528 100644 --- a/tensorflow/lite/experimental/micro/kernels/BUILD +++ b/tensorflow/lite/experimental/micro/kernels/BUILD @@ -46,6 +46,42 @@ cc_library( ], ) +cc_library( + name = "portable_optimized_micro_ops", + srcs = [ + "fully_connected.cc", + "portable_optimized/depthwise_conv.cc", + "softmax.cc", + ], + hdrs = [ + ], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels:op_macros", + "//tensorflow/lite/kernels:padding", + "//tensorflow/lite/kernels/internal:quantization_util", + "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:tensor", + ], +) + +cc_library( + name = "portable_optimized_ops_resolver", + srcs = [ + "all_ops_resolver.cc", + ], + hdrs = [ + "all_ops_resolver.h", + ], + copts = tflite_copts(), + deps = [ + ":portable_optimized_micro_ops", + "//tensorflow/lite/experimental/micro:micro_framework", + ], +) + tflite_micro_cc_test( name = "depthwise_conv_test", srcs = [ @@ -59,6 +95,19 @@ tflite_micro_cc_test( ], ) +tflite_micro_cc_test( + name = 
"portable_optimized_depthwise_conv_test", + srcs = [ + "depthwise_conv_test.cc", + ], + deps = [ + ":portable_optimized_ops_resolver", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/experimental/micro:micro_framework", + "//tensorflow/lite/experimental/micro/testing:micro_test", + ], +) + tflite_micro_cc_test( name = "fully_connected_test", srcs = [ diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc index 05ba8798c0d..ff952b39c00 100644 --- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc @@ -50,6 +50,11 @@ void TestDepthwiseConvFloat(std::initializer_list input_dims_data, CreateFloatTensor(output_data, output_dims, "output_tensor"), }; + // Place a unique value in the uninitialized output buffer. + for (int i = 0; i < output_dims_count; ++i) { + output_data[i] = 23; + } + TfLiteContext context; PopulateContext(tensors, tensors_size, &context); @@ -403,4 +408,84 @@ TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { kTfLiteActRelu, output_data); } +TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = 0; + const float input_max = 255.0f; + const float filter_min = -63.5f; + const float filter_max = 64.0f; + const float bias_min = 0.0f; + const float bias_max = 128.0f * (1 << 24); + const float output_min = -127.0f; + const float output_max = 128.0f; + const int output_dims_count = 9; + uint8_t output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvQuantized( // + {4, 1, 1, 9, 1}, // Input shape. + { + // Input values. + F2Q(1, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(7, input_min, input_max), + F2Q(8, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(9, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(5, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(11, input_min, input_max), + F2Q(12, input_min, input_max), + }, + input_min, input_max, // Input quantization range. + {4, 2, 1, 8, 1}, // Filter shape. + { + // Filter values. + F2Q(1, filter_min, filter_max), + F2Q(2, filter_min, filter_max), + F2Q(3, filter_min, filter_max), + F2Q(4, filter_min, filter_max), + F2Q(-9, filter_min, filter_max), + F2Q(10, filter_min, filter_max), + F2Q(-11, filter_min, filter_max), + F2Q(12, filter_min, filter_max), + F2Q(5, filter_min, filter_max), + F2Q(6, filter_min, filter_max), + F2Q(7, filter_min, filter_max), + F2Q(8, filter_min, filter_max), + F2Q(13, filter_min, filter_max), + F2Q(-14, filter_min, filter_max), + F2Q(15, filter_min, filter_max), + F2Q(-16, filter_min, filter_max), + }, + filter_min, filter_max, // Filter quantization range. + {1, 1}, // Bias shape. + { + // Bias values. + F2Q32(1, bias_min, bias_max), + F2Q32(2, bias_min, bias_max), + F2Q32(3, bias_min, bias_max), + F2Q32(4, bias_min, bias_max), + }, + bias_min, bias_max, // Bias quantization range. + { + // Expected results. + 220, + 184, + 140, + 150, + 161, + 200, + 172, + 148, + 133, + }, + {4, 1, 1, 9, 1}, // Output shape. + output_min, output_max, // Output quantization range. 
+ kTfLiteActNone, output_data); +} + TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc new file mode 100644 index 00000000000..f1ddf6b0733 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc @@ -0,0 +1,439 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" + +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" + +namespace tflite { +namespace ops { +namespace micro { +namespace depthwise_conv { +namespace { + +constexpr int kInputTensor = 0; +constexpr int kFilterTensor = 1; +constexpr int kBiasTensor = 2; +constexpr int kOutputTensor = 0; + +// Size of the cached buffer we'll be using to hold reordered weights. +constexpr int kReshapedFilterDataSize = 1 * 1024; + +struct OpData { + TfLitePaddingValues padding; + // The scaling factor from input to output (aka the 'real multiplier') can + // be represented as a fixed point multiplier plus a left shift. + int32_t output_multiplier; + int output_shift; + // The range of the fused activation layer. For example for kNone and + // uint8_t these would be 0 and 255. + int32_t output_activation_min; + int32_t output_activation_max; +}; + +TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, int width, + int height, int filter_width, int filter_height, + int out_width, int out_height, + const TfLiteType data_type, OpData* data) { + data->padding.height = ComputePadding(params->stride_height, 1, height, + filter_height, out_height); + data->padding.width = + ComputePadding(params->stride_width, 1, width, filter_width, out_width); + + // Note that quantized inference requires that all tensors have their + // parameters set. This is usually done during quantized training. 
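+ // The combined scale (input_scale * filter_scale / output_scale) is then
+ // folded into a 32-bit fixed-point multiplier plus a shift. As a
+ // hypothetical worked example, a real multiplier of 0.1875 is normalized to
+ // 0.75 * 2^-2, so QuantizeMultiplier returns round(0.75 * 2^31) = 1610612736
+ // with an exponent of -2, which is stored below as output_shift = 2.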
+ if (data_type != kTfLiteFloat32) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + const TfLiteTensor* bias = + GetOptionalInputTensor(context, node, kBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + double real_multiplier = 0.0; + TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( + context, input, filter, bias, output, &real_multiplier)); + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; + CalculateActivationRangeUint8(params->activation, output, + &data->output_activation_min, + &data->output_activation_max); + } + return kTfLiteOk; +} + +// Specialized implementation of the depthwise convolution operation designed to +// work with the particular filter width of eight used by the default micro +// speech sample code. It uses 1KB of RAM to hold reordered weight parameters, +// converted from TFLite's NHWC format to NCHW format, and expressed as signed +// eight bit integers, rather than unsigned. Care must be taken when calling +// this not to use it for more than one node since there's only a single static +// buffer holding the weights. You should use this implementation if depthwise +// convolutions are a performance bottleneck, you have a layer that meets the +// parameter requirements, and the extra RAM usage and additional code size are +// not an issue. +static inline void DepthwiseConvOptimizedForFilterWidthEight( + TfLiteContext* context, const DepthwiseParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& filter_shape, const uint8* filter_data, + const RuntimeShape& bias_shape, const int32* bias_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + static int8_t reshaped_filter_data[kReshapedFilterDataSize]; + const int needed_size = + output_depth * filter_width * filter_height * input_depth; + if (needed_size > kReshapedFilterDataSize) { 
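+ // EvalQuantized only selects this kernel when needed_size fits within
+ // kReshapedFilterDataSize, so this branch is a defensive check. As a
+ // hypothetical example, a 10x8 filter over a single-channel input with
+ // eight output channels needs 8 * 8 * 10 * 1 = 640 bytes and fits
+ // comfortably; anything larger than the 1KB cache is rejected here.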
+ context->ReportError(
+ context,
+ "Size too large for reshaped weight buffer (%d needed, %d available)",
+ needed_size, kReshapedFilterDataSize);
+ return;
+ }
+
+ RuntimeShape reshaped_filter_shape;
+ reshaped_filter_shape.BuildFrom(
+ {1, output_depth, filter_height, filter_width});
+
+ // If this is the first time through, repack the weights into a cached buffer
+ // so that they can be accessed sequentially.
+ static bool is_reshaped_filter_initialized = false;
+ if (!is_reshaped_filter_initialized) {
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ for (int oc = 0; oc < output_depth; ++oc) {
+ const uint8* current_filter =
+ filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc);
+ int8* reshaped_filter =
+ reshaped_filter_data +
+ Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x);
+ *reshaped_filter = (int32_t)(*current_filter) + filter_offset;
+ }
+ }
+ }
+ is_reshaped_filter_initialized = true;
+ }
+
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int ic = 0; ic < input_depth; ++ic) {
+ for (int m = 0; m < depth_multiplier; m++) {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32 acc = 0;
+ int in_y_start = in_y_origin;
+ int filter_y_start = 0;
+ if (in_y_origin < 0) {
+ in_y_start = 0;
+ filter_y_start = 0 - in_y_origin;
+ }
+ int filter_y_end = filter_height;
+ if ((in_y_origin + filter_height) >= input_height) {
+ filter_y_end -= (in_y_origin + filter_height) - input_height;
+ }
+ int in_y = in_y_start;
+ int in_x_start = in_x_origin;
+ int filter_x_start = 0;
+ bool is_out_of_x_bounds = false;
+ if (in_x_origin < 0) {
+ in_x_start = 0;
+ filter_x_start = 0 - in_x_origin;
+ is_out_of_x_bounds = true;
+ }
+ int filter_x_end = filter_width;
+ if ((in_x_origin + filter_width) >= input_width) {
+ filter_x_end -= (in_x_origin + filter_width) - input_width;
+ is_out_of_x_bounds = true;
+ }
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y, ++in_y) {
+ const uint8* current_input =
+ input_data + Offset(input_shape, b, in_y, in_x_start, ic);
+ if ((filter_width == 8) && !is_out_of_x_bounds) {
+ int8* current_filter =
+ reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc,
+ filter_y, filter_x_start);
+ const uint32_t input_vals0 =
+ *reinterpret_cast<const uint32_t*>(current_input);
+ current_input += 4;
+ const int32_t filter_vals0 =
+ *reinterpret_cast<const int32_t*>(current_filter);
+ current_filter += 4;
+ const uint8 input_val0 = input_vals0 & 0xff;
+ const int8 filter_val0 = filter_vals0 & 0xff;
+ acc += filter_val0 * input_val0;
+ const uint8 input_val1 = (input_vals0 >> 8) & 0xff;
+ const int8 filter_val1 = (filter_vals0 >> 8) & 0xff;
+ acc += filter_val1 * input_val1;
+ const uint8 input_val2 = (input_vals0 >> 16) & 0xff;
+ const int8 filter_val2 = (filter_vals0 >> 16) & 0xff;
+ acc += filter_val2 * input_val2;
+ const uint8 input_val3 = (input_vals0 >> 24) & 0xff;
+ const int8 filter_val3 = (filter_vals0 >> 24) & 0xff;
+ acc += filter_val3 * input_val3;
+
+ const uint32_t input_vals1 =
+ *reinterpret_cast<const uint32_t*>(current_input);
+ const int32_t filter_vals1 =
+ *reinterpret_cast<const int32_t*>(current_filter);
+ const uint8 input_val4 = input_vals1 & 0xff;
+ const int8 filter_val4 = filter_vals1 & 0xff;
+ acc += filter_val4 * input_val4;
+ const uint8 input_val5 = (input_vals1 >> 8) & 0xff;
+ const int8 filter_val5 = (filter_vals1 >> 8) & 0xff;
+ acc += filter_val5 * input_val5;
+ const uint8 input_val6 = (input_vals1 >> 16) & 0xff;
+ const int8 filter_val6 = (filter_vals1 >> 16) & 0xff;
+ acc += filter_val6 * input_val6;
+ const uint8 input_val7 = (input_vals1 >> 24) & 0xff;
+ const int8 filter_val7 = (filter_vals1 >> 24) & 0xff;
+ acc += filter_val7 * input_val7;
+ } else {
+ const uint8* current_filter =
+ filter_data +
+ Offset(filter_shape, 0, filter_y, filter_x_start, oc);
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ int32 input_val = *current_input;
+ current_input += input_depth;
+ int32 filter_val = *current_filter;
+ current_filter += output_depth;
+ acc +=
+ (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ if (bias_data) {
+ acc += bias_data[oc];
+ }
+ acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+ DepthwiseConvOutputRounding::kAwayFromZero>(
+ acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+ static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ const TfLiteTensor* input, const TfLiteTensor* filter,
+ const TfLiteTensor* bias, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRange(params->activation, &output_activation_min,
+ &output_activation_max);
+
+ tflite::DepthwiseParams op_params;
+ // Padding type is ignored, but still set.
+ op_params.padding_type = PaddingType::kSame;
+ op_params.padding_values.width = data->padding.width;
+ op_params.padding_values.height = data->padding.height;
+ op_params.stride_width = params->stride_width;
+ op_params.stride_height = params->stride_height;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.depth_multiplier = params->depth_multiplier;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ tflite::reference_ops::DepthwiseConv(
+ op_params, GetTensorShape(input), GetTensorData<float>(input),
+ GetTensorShape(filter), GetTensorData<float>(filter),
+ GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+ GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ const TfLiteTensor* input, const TfLiteTensor* filter,
+ const TfLiteTensor* bias, TfLiteTensor* output) {
+ const int32_t input_offset = -input->params.zero_point;
+ const int32_t filter_offset = -filter->params.zero_point;
+ const int32_t output_offset = output->params.zero_point;
+
+ tflite::DepthwiseParams op_params;
+ // Padding type is ignored, but still set.
+ op_params.padding_type = PaddingType::kSame;
+ op_params.padding_values.width = data->padding.width;
+ op_params.padding_values.height = data->padding.height;
+ op_params.stride_width = params->stride_width;
+ op_params.stride_height = params->stride_height;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.depth_multiplier = params->depth_multiplier;
+ op_params.quantized_activation_min = data->output_activation_min;
+ op_params.quantized_activation_max = data->output_activation_max;
+ op_params.input_offset = input_offset;
+ op_params.weights_offset = filter_offset;
+ op_params.output_offset = output_offset;
+ op_params.output_multiplier = data->output_multiplier;
+ // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+ op_params.output_shift = -data->output_shift;
+
+ // Figure out if we can use the optimized path for this set of parameters.
+ const int filter_width = GetTensorShape(filter).Dims(2);
+ const int input_depth = GetTensorShape(input).Dims(3);
+ const int output_depth = GetTensorShape(filter).Dims(3);
+ const int filter_height = GetTensorShape(filter).Dims(1);
+ const int needed_size =
+ output_depth * filter_width * filter_height * input_depth;
+ bool use_optimized_path = false;
+ if ((filter_width == 8) && (input_offset == 0) && (filter_offset == -127) &&
+ (input_depth == 1) && (needed_size <= kReshapedFilterDataSize)) {
+ // FIXME(petewarden) - We need a more robust way of handling this, ideally
+ // with an allocation mechanism available through the context API.
+ // Use the address of the node as a proxy for its identity, since we need
+ // to ensure the weight values are consistent between calls, and there's
+ // no easy way to do that quickly other than relying on the identity of
+ // the owning node.
+ static TfLiteNode* initialized_node_address = node;
+ if (initialized_node_address == node) {
+ use_optimized_path = true;
+ } else {
+ static bool has_warned = false;
+ if (!has_warned) {
+ context->ReportError(
+ context,
+ "Multiple depthwise conv ops match optimization parameters, but "
+ "only the first will use the fast path, because there's only one "
+ "RAM cache available");
+ has_warned = true;
+ }
+ }
+ }
+ if (use_optimized_path) {
+ DepthwiseConvOptimizedForFilterWidthEight(
+ context, op_params, GetTensorShape(input),
+ GetTensorData<uint8_t>(input), GetTensorShape(filter),
+ GetTensorData<uint8_t>(filter), GetTensorShape(bias),
+ GetTensorData<int32_t>(bias), GetTensorShape(output),
+ GetTensorData<uint8_t>(output));
+ } else {
+ tflite::reference_ops::DepthwiseConv(
+ op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+ GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+ GetTensorShape(bias), GetTensorData<int32_t>(bias),
+ GetTensorShape(output), GetTensorData<uint8_t>(output));
+ }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+ const TfLiteTensor* bias =
+ (NumInputs(node) == 3) ?
GetInput(context, node, kBiasTensor) : nullptr; + + const TfLiteType data_type = input->type; + int width = SizeOfDimension(input, 2); + int height = SizeOfDimension(input, 1); + int filter_width = SizeOfDimension(filter, 2); + int filter_height = SizeOfDimension(filter, 1); + int out_width = ComputeOutSize(params->padding, width, filter_width, + params->stride_width); + int out_height = ComputeOutSize(params->padding, height, filter_height, + params->stride_height); + OpData local_data_object; + OpData* data = &local_data_object; + TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, + filter_width, filter_height, out_width, + out_height, data_type, data)); + + // TODO(aselle): Consider whether float conv and quantized conv should be + // separate ops to avoid dispatch overhead here. + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: + EvalFloat(context, node, params, data, input, filter, bias, output); + break; + case kTfLiteUInt8: + EvalQuantized(context, node, params, data, input, filter, bias, output); + break; + default: + context->ReportError(context, "Type %d not currently supported.", + input->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace depthwise_conv + +TfLiteRegistration* Register_DEPTHWISE_CONV_2D() { + static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free, + depthwise_conv::Prepare, depthwise_conv::Eval}; + return &r; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile index 1179b285067..e11e8a8cf09 100644 --- a/tensorflow/lite/experimental/micro/tools/make/Makefile +++ b/tensorflow/lite/experimental/micro/tools/make/Makefile @@ -87,7 +87,6 @@ tensorflow/lite/core/api/op_resolver.cc \ tensorflow/lite/kernels/kernel_util.cc \ tensorflow/lite/kernels/internal/quantization_util.cc MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS)) -MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS)) MICROLITE_CC_HDRS := \ $(wildcard tensorflow/lite/experimental/micro/*.h) \ @@ -151,6 +150,9 @@ KEIL_PROJECT_FILES := \ # keep this main makefile focused on the sources and dependencies. include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) +# Call specialize here so that platform-specific tags can be taken into account. +MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS)) + ALL_TAGS += $(TARGET_ARCH) ALL_SRCS := \ diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc index 4b3e79849e9..4df26a7bf70 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc @@ -10,6 +10,9 @@ ifeq ($(TARGET),$(filter $(TARGET),apollo3evb sparkfun_edge)) # with the hard interfaces. GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ + # Use the faster depthwise conv implementation. 
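+ # The portable_optimized tag makes the specialize call in the main Makefile
+ # substitute kernels/portable_optimized/depthwise_conv.cc for the reference
+ # kernel when building for this target.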
+ ALL_TAGS += portable_optimized + PLATFORM_FLAGS = \ -DPART_apollo3 \ -DAM_PACKAGE_BGA \ diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc index 3b91eeff9fd..090b4fa101d 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc @@ -1,6 +1,9 @@ # Settings for Mac OS platforms. ifeq ($(TARGET), osx) + # Make sure we can find the embedded GCC compiler. + export PATH := ${PATH}:tensorflow/lite/experimental/micro/tools/make/downloads/gcc_embedded/bin/ + PLATFORM_FLAGS = \ -DTF_LITE_DISABLE_X86_NEON