Remove portable_optimized depthwise_conv implementation.
PR #34999 surfaced inconsistencies between the reference and portable_optimized implementations, and we are also gradually transitioning to CMSIS-NN for the optimized implementations on ARM Cortex-M targets. There may still be an issue with the reference depthwise conv when dilation > 1, but that would be a separate bug.

PiperOrigin-RevId: 317802639
Change-Id: Ic8c9acffb060bb3ec5f802a2045ac6ac8b9f2233
This commit is contained in:
parent 40e860b7a4
commit 934df8dcea
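With the portable_optimized kernels gone, Cortex-M builds fall back to the reference kernels unless an optimized library is selected at build time. A rough sketch of the two make invocations, reusing TARGET=${TARGET} from the CI script below and assuming the cmsis-nn tag name is how the CMSIS-NN kernels are selected in the Micro makefiles (treat that tag as an assumption, not part of this change):

  # Reference kernels only (the default after this change).
  make -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} test

  # CMSIS-NN optimized kernels for Cortex-M targets (assumed: TAGS=cmsis-nn).
  make -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} TAGS=cmsis-nn test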
@@ -102,30 +102,6 @@ cc_library(
    ],
)

# TODO(b/144176795): This target should really be handled differently so that we
# do not have a fork in the build graph. The bug has some initial ideas.
cc_library(
    name = "portable_optimized_op_resolver",
    srcs = [
        "all_ops_resolver.cc",
        "micro_mutable_op_resolver.h",
        "micro_op_resolver.h",
    ],
    hdrs = [
        "all_ops_resolver.h",
    ],
    copts = micro_copts(),
    deps = [
        ":micro_compatibility",
        "//tensorflow/lite/c:common",
        "//tensorflow/lite/core/api",
        "//tensorflow/lite/kernels:op_macros",
        "//tensorflow/lite/kernels/internal:compatibility",
        "//tensorflow/lite/micro/kernels:portable_optimized_micro_ops",
        "//tensorflow/lite/schema:schema_fbs",
    ],
)

cc_library(
    name = "debug_log",
    srcs = [
@@ -20,7 +20,6 @@ package_group(
    packages = ["//tensorflow/lite/micro"],
)

# LINT.IfChange(micro_ops)
cc_library(
    name = "micro_ops",
    srcs = [
@@ -106,73 +105,6 @@ cc_library(
        ],
    }),
)
# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:portable_optimized_micro_ops)

# LINT.IfChange(portable_optimized_micro_ops)
cc_library(
    name = "portable_optimized_micro_ops",
    srcs = [
        "activations.cc",
        "add.cc",
        "arg_min_max.cc",
        "ceil.cc",
        "circular_buffer.cc",
        "comparisons.cc",
        "concatenation.cc",
        "conv.cc",
        "dequantize.cc",
        "elementwise.cc",
        "ethosu.cc",
        "floor.cc",
        "fully_connected.cc",
        "l2norm.cc",
        "logical.cc",
        "logistic.cc",
        "maximum_minimum.cc",
        "mul.cc",
        "neg.cc",
        "pack.cc",
        "pad.cc",
        "pooling.cc",
        "portable_optimized/depthwise_conv.cc",
        "prelu.cc",
        "quantize.cc",
        "reduce.cc",
        "reshape.cc",
        "resize_nearest_neighbor.cc",
        "round.cc",
        "softmax.cc",
        "split.cc",
        "strided_slice.cc",
        "sub.cc",
        "svdf.cc",
        "tanh.cc",
        "unpack.cc",
    ],
    hdrs = ["micro_ops.h"],
    copts = micro_copts(),
    visibility = [
        # Needed for micro:portable_optimized_ops_resolver but visibility can not be
        # finer-grained than a package.
        ":micro_top_level",
    ],
    deps = [
        ":activation_utils",
        ":micro_utils",
        "//tensorflow/lite/c:common",
        "//tensorflow/lite/kernels:kernel_util",
        "//tensorflow/lite/kernels:op_macros",
        "//tensorflow/lite/kernels:padding",
        "//tensorflow/lite/kernels/internal:common",
        "//tensorflow/lite/kernels/internal:compatibility",
        "//tensorflow/lite/kernels/internal:quantization_util",
        "//tensorflow/lite/kernels/internal:reference_base",
        "//tensorflow/lite/kernels/internal:tensor",
        "//tensorflow/lite/kernels/internal:types",
        "//tensorflow/lite/micro:micro_utils",
    ],
)
# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:micro_ops)

test_suite(
    name = "all_tests",
@@ -214,19 +146,6 @@ tflite_micro_cc_test(
    ],
)

tflite_micro_cc_test(
    name = "portable_optimized_depthwise_conv_test",
    srcs = [
        "depthwise_conv_test.cc",
    ],
    deps = [
        "//tensorflow/lite/c:common",
        "//tensorflow/lite/kernels/internal:tensor",
        "//tensorflow/lite/micro:portable_optimized_op_resolver",
        "//tensorflow/lite/micro/testing:micro_test",
    ],
)

tflite_micro_cc_test(
    name = "fully_connected_test",
    srcs = [
@@ -1,515 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"

namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace {

constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;

// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
constexpr int kDepthwiseConvQuantizedDimension = 3;

// Size of the cached buffer we'll be using to hold reordered weights.
constexpr int kReshapedFilterDataSize = 1 * 1024;

struct OpData {
  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;

  // Per channel output multiplier and shift.
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
};

TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, int width,
                             int height, int filter_width, int filter_height,
                             int out_width, int out_height,
                             const TfLiteType data_type, OpData* data) {
  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);

  // Matching GetWindowedOutputSize in TensorFlow.
  auto padding = params->padding;
  data->padding = ComputePaddingHeightWidth(
      params->stride_height, params->stride_width,
      params->dilation_height_factor, params->dilation_width_factor, height,
      width, filter_height, filter_width, padding, &out_height, &out_width);

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
  if (data_type != kTfLiteFloat32) {
    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
    const TfLiteTensor* bias =
        GetOptionalInputTensor(context, node, kBiasTensor);
    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
    int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];

    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier,
        reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
  }
  return kTfLiteOk;
}

// Specialized implementation of the depthwise convolution operation designed to
// work with the particular filter width of eight used by the default micro
// speech sample code. It uses 1KB of RAM to hold reordered weight parameters,
// converted from TFLite's NHWC format to NCHW format, and expressed as signed
// eight bit integers, rather than unsigned. Care must be taken when calling
// this not to use it for more than one node since there's only a single static
// buffer holding the weights. You should use this implementation if depthwise
// convolutions are a performance bottleneck, you have a layer that meets the
// parameter requirements, and the extra RAM usage and additional code size are
// not an issue.
static inline void DepthwiseConvOptimizedForFilterWidthEight(
    TfLiteContext* context, const DepthwiseParams& params,
    const RuntimeShape& input_shape, const uint8* input_data,
    const RuntimeShape& filter_shape, const uint8* filter_data,
    const RuntimeShape& bias_shape, const int32* bias_data,
    const RuntimeShape& output_shape, uint8* output_data) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  static int16_t reshaped_filter_data[kReshapedFilterDataSize];
  const int needed_size =
      output_depth * filter_width * filter_height * input_depth;
  if (needed_size > kReshapedFilterDataSize) {
    TF_LITE_KERNEL_LOG(
        context,
        "Size too large for reshaped weight buffer (%d needed, %d available)",
        needed_size, kReshapedFilterDataSize);
    return;
  }

  RuntimeShape reshaped_filter_shape;
  reshaped_filter_shape.BuildFrom(
      {1, output_depth, filter_height, filter_width});

  // If this is the first time through, repack the weights into a cached buffer
  // so that they can be accessed sequentially.
  static bool is_reshaped_filter_initialized = false;
  if (!is_reshaped_filter_initialized) {
    for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
      for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
        for (int oc = 0; oc < output_depth; ++oc) {
          const uint8* current_filter =
              filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc);
          int16_t* reshaped_filter =
              reshaped_filter_data +
              Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x);
          *reshaped_filter =
              static_cast<int16_t>(*current_filter) + filter_offset;
        }
      }
    }
    is_reshaped_filter_initialized = true;
  }

  for (int b = 0; b < batches; ++b) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int ic = 0; ic < input_depth; ++ic) {
          for (int m = 0; m < depth_multiplier; m++) {
            const int oc = m + ic * depth_multiplier;
            const int in_x_origin = (out_x * stride_width) - pad_width;
            const int in_y_origin = (out_y * stride_height) - pad_height;
            int32 acc = 0;
            int in_y_start = in_y_origin;
            int filter_y_start = 0;
            if (in_y_origin < 0) {
              in_y_start = 0;
              filter_y_start = 0 - in_y_origin;
            }
            int filter_y_end = filter_height;
            if ((in_y_origin + filter_height) >= input_height) {
              filter_y_end -= (in_y_origin + filter_height) - input_height;
            }
            int in_y = in_y_start;
            int in_x_start = in_x_origin;
            int filter_x_start = 0;
            bool is_out_of_x_bounds = false;
            if (in_x_origin < 0) {
              in_x_start = 0;
              filter_x_start = 0 - in_x_origin;
              is_out_of_x_bounds = true;
            }
            int filter_x_end = filter_width;
            if ((in_x_origin + filter_width) >= input_width) {
              filter_x_end -= (in_x_origin + filter_width) - input_width;
              is_out_of_x_bounds = true;
            }
            for (int filter_y = filter_y_start; filter_y < filter_y_end;
                 ++filter_y, ++in_y) {
              const uint8* current_input =
                  input_data + Offset(input_shape, b, in_y, in_x_start, ic);
              if ((filter_width == 8) && !is_out_of_x_bounds) {
                int16* current_filter =
                    reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc,
                                                  filter_y, filter_x_start);
                const uint32_t input_vals0 =
                    *reinterpret_cast<const uint32_t*>(current_input);
                current_input += 4;
                const int32_t filter_vals0 =
                    *reinterpret_cast<const int32_t*>(current_filter);
                current_filter += 2;
                const uint8 input_val0 = input_vals0 & 0xff;
                const int16 filter_val0 = filter_vals0 & 0xffff;
                acc += filter_val0 * input_val0;
                const uint8 input_val1 = (input_vals0 >> 8) & 0xff;
                const int16 filter_val1 = (filter_vals0 >> 16) & 0xffff;
                acc += filter_val1 * input_val1;

                const int32_t filter_vals1 =
                    *reinterpret_cast<const int32_t*>(current_filter);
                current_filter += 2;
                const uint8 input_val2 = (input_vals0 >> 16) & 0xff;
                const int16 filter_val2 = filter_vals1 & 0xffff;
                acc += filter_val2 * input_val2;
                const uint8 input_val3 = (input_vals0 >> 24) & 0xff;
                const int16 filter_val3 = (filter_vals1 >> 16) & 0xffff;
                acc += filter_val3 * input_val3;

                const uint32_t input_vals1 =
                    *reinterpret_cast<const uint32_t*>(current_input);
                const int32_t filter_vals2 =
                    *reinterpret_cast<const int32_t*>(current_filter);
                current_filter += 2;
                const uint8 input_val4 = input_vals1 & 0xff;
                const int16 filter_val4 = filter_vals2 & 0xffff;
                acc += filter_val4 * input_val4;
                const uint8 input_val5 = (input_vals1 >> 8) & 0xff;
                const int16 filter_val5 = (filter_vals2 >> 16) & 0xffff;
                acc += filter_val5 * input_val5;

                const int32_t filter_vals3 =
                    *reinterpret_cast<const int32_t*>(current_filter);
                const uint8 input_val6 = (input_vals1 >> 16) & 0xff;
                const int16 filter_val6 = filter_vals3 & 0xffff;
                acc += filter_val6 * input_val6;
                const uint8 input_val7 = (input_vals1 >> 24) & 0xff;
                const int16 filter_val7 = (filter_vals3 >> 16) & 0xffff;
                acc += filter_val7 * input_val7;
              } else {
                const uint8* current_filter =
                    filter_data +
                    Offset(filter_shape, 0, filter_y, filter_x_start, oc);
                for (int filter_x = filter_x_start; filter_x < filter_x_end;
                     ++filter_x) {
                  int32 input_val = *current_input;
                  current_input += input_depth;
                  int32 filter_val = *current_filter;
                  current_filter += output_depth;
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
            if (bias_data) {
              acc += bias_data[oc];
            }
            acc = reference_ops::depthwise_conv::DepthwiseConvRound<
                DepthwiseConvOutputRounding::kAwayFromZero>(
                acc, output_multiplier, output_shift);
            acc += output_offset;
            acc = std::max(acc, output_activation_min);
            acc = std::min(acc, output_activation_max);
            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
                static_cast<uint8>(acc);
          }
        }
      }
    }
  }
}

}  // namespace

void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteDepthwiseConvParams* params, OpData* data,
               const TfLiteTensor* input, const TfLiteTensor* filter,
               const TfLiteTensor* bias, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);

  tflite::DepthwiseParams op_params;
  // Padding type is ignored, but still set.
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = 1;
  op_params.dilation_height_factor = 1;
  op_params.depth_multiplier = params->depth_multiplier;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;

  tflite::reference_ops::DepthwiseConv(
      op_params, GetTensorShape(input), GetTensorData<float>(input),
      GetTensorShape(filter), GetTensorData<float>(filter),
      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
      GetTensorData<float>(output));
}

// TODO(njeff): Optimize for int8 like we do for uint8.

void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, OpData* data,
                             const TfLiteTensor* input,
                             const TfLiteTensor* filter,
                             const TfLiteTensor* bias, TfLiteTensor* output) {
  DepthwiseParams op_params;
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.depth_multiplier = params->depth_multiplier;
  op_params.input_offset = -input->params.zero_point;
  op_params.weights_offset = 0;
  op_params.output_offset = output->params.zero_point;
  // TODO(b/130439627): Use calculated value for clamping.
  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();

  reference_integer_ops::DepthwiseConvPerChannel(
      op_params, data->per_channel_output_multiplier,
      data->per_channel_output_shift, GetTensorShape(input),
      GetTensorData<int8>(input), GetTensorShape(filter),
      GetTensorData<int8>(filter), GetTensorShape(bias),
      GetTensorData<int32>(bias), GetTensorShape(output),
      GetTensorData<int8>(output));
}

void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteDepthwiseConvParams* params, OpData* data,
                   const TfLiteTensor* input, const TfLiteTensor* filter,
                   const TfLiteTensor* bias, TfLiteTensor* output) {
  const int32_t input_offset = -input->params.zero_point;
  const int32_t filter_offset = -filter->params.zero_point;
  const int32_t output_offset = output->params.zero_point;

  tflite::DepthwiseParams op_params;
  // Padding type is ignored, but still set.
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = 1;
  op_params.dilation_height_factor = 1;
  op_params.depth_multiplier = params->depth_multiplier;
  op_params.quantized_activation_min = data->output_activation_min;
  op_params.quantized_activation_max = data->output_activation_max;
  op_params.input_offset = input_offset;
  op_params.weights_offset = filter_offset;
  op_params.output_offset = output_offset;
  op_params.output_multiplier = data->output_multiplier;
  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
  op_params.output_shift = -data->output_shift;

  // Figure out if we can use the optimized path for this set of parameters.
  const int filter_width = GetTensorShape(filter).Dims(2);
  const int input_depth = GetTensorShape(input).Dims(3);
  const int output_depth = GetTensorShape(filter).Dims(3);
  const int filter_height = GetTensorShape(filter).Dims(1);
  const int needed_size =
      output_depth * filter_width * filter_height * input_depth;
  bool use_optimized_path = false;
  if ((filter_width == 8) && (input_offset == 0) && (input_depth == 1) &&
      (needed_size <= kReshapedFilterDataSize)) {
    // FIXME(petewarden) - We need a more robust way of handling this, ideally
    // with an allocation mechanism available through the context API.
    // Use the address of the node as a proxy for its identity, since we need
    // to ensure the weight values are consistent between calls, and there's
    // no easy way to do that quickly other than relying on the identity of
    // the owning node.
    static TfLiteNode* initialized_node_address = node;
    if (initialized_node_address == node) {
      use_optimized_path = true;
    } else {
      static bool has_warned = false;
      if (!has_warned) {
        TF_LITE_KERNEL_LOG(
            context,
            "Multiple depthwise conv ops match optimization parameters, but "
            "only the first will use the fast path, because there's only one "
            "RAM cache available");
        has_warned = true;
      }
    }
  }
  if (use_optimized_path) {
    DepthwiseConvOptimizedForFilterWidthEight(
        context, op_params, GetTensorShape(input),
        GetTensorData<uint8_t>(input), GetTensorShape(filter),
        GetTensorData<uint8_t>(filter), GetTensorShape(bias),
        GetTensorData<int32_t>(bias), GetTensorShape(output),
        GetTensorData<uint8_t>(output));
  } else {
    tflite::reference_ops::DepthwiseConv(
        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
        GetTensorShape(bias), GetTensorData<int32_t>(bias),
        GetTensorShape(output), GetTensorData<uint8_t>(output));
  }
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;

  const TfLiteType data_type = input->type;
  int width = SizeOfDimension(input, 2);
  int height = SizeOfDimension(input, 1);
  int filter_width = SizeOfDimension(filter, 2);
  int filter_height = SizeOfDimension(filter, 1);
  int out_width = ComputeOutSize(params->padding, width, filter_width,
                                 params->stride_width);
  int out_height = ComputeOutSize(params->padding, height, filter_height,
                                  params->stride_height);
  OpData data;

  // All per-channel quantized tensors need valid zero point and scale arrays.
  if (input->type == kTfLiteInt8) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);

    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    TF_LITE_ENSURE(context, affine_quantization->zero_point);
    TF_LITE_ENSURE(
        context, affine_quantization->scale->size == 1 ||
                     affine_quantization->scale->size ==
                         filter->dims->data[kDepthwiseConvQuantizedDimension]);
    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
                      affine_quantization->zero_point->size);
  }

  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
                                        filter_width, filter_height, out_width,
                                        out_height, data_type, &data));

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are same.
    case kTfLiteFloat32:
      EvalFloat(context, node, params, &data, input, filter, bias, output);
      break;
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
                              output);
      break;
    case kTfLiteUInt8:
      EvalQuantized(context, node, params, &data, input, filter, bias, output);
      break;
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                         TfLiteTypeGetName(input->type), input->type);
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace depthwise_conv

TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
  static TfLiteRegistration r = {/*init=*/nullptr,
                                 /*free=*/nullptr,
                                 /*prepare=*/nullptr,
                                 /*invoke=*/depthwise_conv::Eval,
                                 /*profiling_string=*/nullptr,
                                 /*builtin_code=*/0,
                                 /*custom_name=*/nullptr,
                                 /*version=*/0};
  return &r;
}

}  // namespace micro
}  // namespace ops
}  // namespace tflite
@@ -49,7 +49,7 @@ fi

make -f tensorflow/lite/micro/tools/make/Makefile \
  TARGET=${TARGET} \
-  TAGS="portable_optimized disco_f746ng" \
+  TAGS="disco_f746ng" \
  ${PROJECTS}

readable_run tensorflow/lite/micro/tools/ci_build/install_mbed_cli.sh
@@ -24,9 +24,6 @@ ifeq ($(TARGET),$(filter $(TARGET),\
$(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)
endif

-# Use the faster depthwise conv implementation.
-ALL_TAGS += portable_optimized

PLATFORM_FLAGS = \
  -DPART_apollo3 \
  -DAM_PACKAGE_BGA \