PR #32168: Port mul from TensorFlow Lite to TensorFlow Lite Micro

Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/32168

This PR ports the mul kernel to TensorFlow Lite Micro. Supported data types are float, int8, and uint8.
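For readers wiring the kernel into an application, below is a minimal registration sketch. It mirrors the AllOpsResolver change in this diff, but the MicroMutableOpResolver type and its header path are assumptions about the 2019-era TFLM API rather than part of this PR:

#include "tensorflow/lite/experimental/micro/kernels/micro_ops.h"
#include "tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h"

// Hedged sketch: register only the MUL kernel with a micro op resolver
// (resolver type and header are assumed, not taken from this diff).
void RegisterMulKernel(tflite::MicroMutableOpResolver* resolver) {
  // Register_MUL() is declared in micro_ops.h by this PR; the kernel
  // handles float32, int8 and uint8 tensors.
  resolver->AddBuiltin(tflite::BuiltinOperator_MUL,
                       tflite::ops::micro::Register_MUL());
}

An interpreter built against such a resolver can then execute models that use the MUL builtin without pulling in every other kernel.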
Copybara import of the project:

--
b163a6795209c367d1368311da554881617bc2b7 by Jens Elofsson <jens.elofsson@arm.com>:

Move the Mul ops to their own header file to remove the gemmlowp dependency.

--
3b6c71fdd0f816c121264f28d1888be91f19fa0e by Jens Elofsson <jens.elofsson@arm.com>:

Add MUL kernel to Tensorflow Lite Micro.

--
e383f4fff2dcf713f012e842f3a027f6a8090d22 by Jens Elofsson <jens.elofsson@arm.com>:

Add tests for the Mul kernel in Tensorflow Lite Micro.

--
1082b9e9101820b35c797a711e8c9677df129aaf by Jens Elofsson <jens.elofsson@arm.com>:

Remove duplicated BroadcastMul4DSlow function.

--
584725fedac52c8c9c16eb904562e2797e294d8d by Jens Elofsson <jens.elofsson@arm.com>:

Add Register_MUL to micro_ops.h and remove include that's failing.

PiperOrigin-RevId: 279365723
Change-Id: Id64e22adbacb678f4d7b0034f506940e0540754d
commit e93595f423 (parent 115d4c6f66)
Pete Warden, 2019-11-08 12:19:13 -08:00, committed by TensorFlower Gardener
10 changed files with 792 additions and 148 deletions

View File

@@ -28,6 +28,7 @@ cc_library(
"logical.cc",
"logistic.cc",
"maximum_minimum.cc",
"mul.cc",
"neg.cc",
"pack.cc",
"pooling.cc",
@@ -89,6 +90,7 @@ cc_library(
"logical.cc",
"logistic.cc",
"maximum_minimum.cc",
"mul.cc",
"neg.cc",
"pack.cc",
"pooling.cc",
@@ -321,6 +323,19 @@ tflite_micro_cc_test(
],
)
tflite_micro_cc_test(
name = "mul_test",
srcs = [
"mul_test.cc",
],
deps = [
":all_ops_resolver",
"//tensorflow/lite/c:c_api_internal",
"//tensorflow/lite/experimental/micro:micro_framework",
"//tensorflow/lite/experimental/micro/testing:micro_test",
],
)
tflite_micro_cc_test(
name = "arg_min_max_test",
srcs = [

View File

@@ -62,6 +62,7 @@ AllOpsResolver::AllOpsResolver() {
AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
AddBuiltin(BuiltinOperator_NEG, Register_NEG());
AddBuiltin(BuiltinOperator_ADD, Register_ADD());
AddBuiltin(BuiltinOperator_MUL, Register_MUL());
AddBuiltin(BuiltinOperator_QUANTIZE, Register_QUANTIZE(), 1, 4);
AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(), 1, 4);
AddBuiltin(BuiltinOperator_RELU, Register_RELU());

View File

@@ -54,6 +54,7 @@ TfLiteRegistration* Register_LOGISTIC();
TfLiteRegistration* Register_MAXIMUM();
TfLiteRegistration* Register_MAX_POOL_2D();
TfLiteRegistration* Register_MINIMUM();
TfLiteRegistration* Register_MUL();
TfLiteRegistration* Register_NEG();
TfLiteRegistration* Register_NOT_EQUAL();
TfLiteRegistration* Register_PACK();

View File

@@ -0,0 +1,181 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
namespace tflite {
namespace ops {
namespace micro {
namespace mul {
constexpr int kInput1Tensor = 0;
constexpr int kInput2Tensor = 1;
constexpr int kOutputTensor = 0;
struct OpData {
int32_t output_activation_min;
int32_t output_activation_max;
int32_t output_multiplier;
int output_shift;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data) {
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
if (output->type == kTfLiteUInt8) {
CalculateActivationRangeUint8(params->activation, output,
&data->output_activation_min,
&data->output_activation_max);
} else if (output->type == kTfLiteInt8) {
CalculateActivationRangeInt8(params->activation, output,
&data->output_activation_min,
&data->output_activation_max);
}
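// The raw int32 product of two quantized inputs carries scale
// input1_scale * input2_scale; QuantizeMultiplier factors the ratio
// (input1_scale * input2_scale) / output_scale into the fixed-point
// multiplier and shift used to rescale that product to the output
// quantization.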
double real_multiplier =
input1->params.scale * input2->params.scale / output->params.scale;
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
&data->output_shift);
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
return nullptr;
}
void Free(TfLiteContext* context, void* buffer) {}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8) {
tflite::ArithmeticParams op_params;
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
op_params.input1_offset = -input1->params.zero_point;
op_params.input2_offset = -input2->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
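// TF_LITE_MUL expands into a call to the reference kernels below: the
// namespace argument selects the int8 (reference_integer_ops) or uint8
// (reference_ops) variant, and need_broadcast chooses BroadcastMul4DSlow
// over the plain elementwise Mul.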
#define TF_LITE_MUL(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
} else {
TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
}
} else if (output->type == kTfLiteUInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
} else {
TF_LITE_MUL(reference_ops, Mul, uint8_t);
}
}
#undef TF_LITE_MUL
}
}
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_MUL(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output));
if (need_broadcast) {
TF_LITE_MUL(BroadcastMul4DSlow);
} else {
TF_LITE_MUL(Mul);
}
#undef TF_LITE_MUL
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
CalculateOpData(context, node, params, &data);
switch (input1->type) {
case kTfLiteUInt8:
case kTfLiteInt8:
EvalQuantized(context, node, params, &data, input1, input2, output);
break;
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input1, input2, output);
break;
default:
context->ReportError(context, "Type %d not currently supported.",
input1->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace mul
TfLiteRegistration* Register_MUL() {
static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@@ -0,0 +1,423 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
void TestMulFloat(std::initializer_list<int> input1_dims_data,
std::initializer_list<float> input1_data,
std::initializer_list<int> input2_dims_data,
std::initializer_list<float> input2_data,
std::initializer_list<int> output_dims_data,
std::initializer_list<float> expected_output_data,
float* output_data, TfLiteFusedActivation activation) {
TfLiteIntArray* input1_dims = IntArrayFromInitializer(input1_dims_data);
TfLiteIntArray* input2_dims = IntArrayFromInitializer(input2_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
::tflite::ops::micro::AllOpsResolver resolver;
constexpr int inputs_size = 2;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateFloatTensor(input1_data, input1_dims, "input1_tensor"),
CreateFloatTensor(input2_data, input2_dims, "input2_tensor"),
CreateFloatTensor(output_data, output_dims, "output_tensor"),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, &context);
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_MUL, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLiteMulParams builtin_data = {
.activation = activation,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {2, 0, 1};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 2};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
for (int i = 0; i < output_dims_count; i++) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
1e-5f);
}
}
template <typename T>
void TestMulQuantized(std::initializer_list<int> input1_dims_data,
std::initializer_list<T> input1_data,
std::initializer_list<int> input2_dims_data,
std::initializer_list<T> input2_data,
const float input_min, const float input_max,
std::initializer_list<int> output_dims_data,
const float output_min, const float output_max,
std::initializer_list<T> expected_output_data,
T* output_data, TfLiteFusedActivation activation,
int error_tolerance) {
TfLiteIntArray* input1_dims = IntArrayFromInitializer(input1_dims_data);
TfLiteIntArray* input2_dims = IntArrayFromInitializer(input2_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
::tflite::ops::micro::AllOpsResolver resolver;
constexpr int inputs_size = 2;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input1_data, input1_dims, "input1_tensor",
input_min, input_max),
CreateQuantizedTensor(input2_data, input2_dims, "input2_tensor",
input_min, input_max),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, &context);
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_MUL, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLiteMulParams builtin_data = {
.activation = activation,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {2, 0, 1};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 2};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
for (int i = 0; i < output_dims_count; i++) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
error_tolerance);
}
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
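// Note on the shape initializer lists used below: they follow the
// TfLiteIntArray convention from test_utils.h, where the first element is
// the number of dimensions, e.g. {4, 1, 2, 2, 1} is the 4-D shape 1x2x2x1
// and {1, 1} is a 1-D shape holding a single element.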
TF_LITE_MICRO_TEST(Int8NoActivation) {
using tflite::testing::F2QS;
const float input_min = -1;
const float input_max = 1;
const float output_min = -1;
const float output_max = 1;
int8_t output_data[4];
tflite::testing::TestMulQuantized({4, 1, 2, 2, 1}, // input1 dims
{
F2QS(-0.8, input_min, input_max),
F2QS(0.2, input_min, input_max),
F2QS(0.9, input_min, input_max),
F2QS(0.7, input_min, input_max),
}, // input1 data
{4, 1, 2, 2, 1}, // input2 dims
{
F2QS(0.6, input_min, input_max),
F2QS(0.4, input_min, input_max),
F2QS(0.9, input_min, input_max),
F2QS(0.8, input_min, input_max),
}, // input2 data
input_min, input_max,
{4, 1, 2, 2, 1}, // output dims
output_min, output_max,
{
F2QS(-0.48, output_min, output_max),
F2QS(0.08, output_min, output_max),
F2QS(0.81, output_min, output_max),
F2QS(0.56, output_min, output_max),
}, // expected output data
output_data, kTfLiteActNone, 1);
}
TF_LITE_MICRO_TEST(Int8NoActivationLargeMultiplier) {
using tflite::testing::F2QS;
const float input_min = -100;
const float input_max = 100;
const float output_min = -10;
const float output_max = 10;
int8_t output_data[4];
tflite::testing::TestMulQuantized(
{4, 1, 2, 2, 1},
{
F2QS(-4, input_min, input_max),
F2QS(2, input_min, input_max),
F2QS(3, input_min, input_max),
F2QS(1, input_min, input_max),
},
{4, 1, 2, 2, 1},
{
F2QS(-1, input_min, input_max),
F2QS(-3, input_min, input_max),
F2QS(4, input_min, input_max),
F2QS(2, input_min, input_max),
},
input_min, input_max, {4, 1, 2, 2, 1}, output_min, output_max,
{
F2QS(4, output_min, output_max),
F2QS(-6, output_min, output_max),
F2QS(12, output_min, output_max),
F2QS(2, output_min, output_max),
},
// In TensorFlow Lite, this test has a maximum allowed error of 1.4f.
// With the output range [-10, 10] spread over 256 quantized levels
// (a step of roughly 0.078), a float difference of 1.4 corresponds to
// about 18 quantized units.
output_data, kTfLiteActNone, 18);
}
TF_LITE_MICRO_TEST(Int8NoActivationBroadcast) {
using tflite::testing::F2QS;
const float input_min = -3.0;
const float input_max = 3.0;
const float output_min = -3.0;
const float output_max = 3.0;
int8_t output_data[6];
tflite::testing::TestMulQuantized({4, 1, 3, 1, 2}, // input1 shape
{
F2QS(-2.0, input_min, input_max),
F2QS(0.2, input_min, input_max),
F2QS(0.7, input_min, input_max),
F2QS(0.8, input_min, input_max),
F2QS(1.1, input_min, input_max),
F2QS(2.0, input_min, input_max),
}, // input1 data
{1, 1}, // input2 shape
{
F2QS(0.1, input_min, input_max),
}, // input2 data
input_min, input_max,
{4, 1, 3, 1, 2}, // output shape
output_min, output_max,
{
F2QS(-0.2, output_min, output_max),
F2QS(0.02, output_min, output_max),
F2QS(0.07, output_min, output_max),
F2QS(0.08, output_min, output_max),
F2QS(0.11, output_min, output_max),
F2QS(0.2, output_min, output_max),
}, // expected output data
output_data, kTfLiteActNone, 1);
}
TF_LITE_MICRO_TEST(UInt8NoActivation) {
using tflite::testing::F2Q;
const float input_min = -1;
const float input_max = 1;
const float output_min = -1;
const float output_max = 1;
uint8_t output_data[4];
tflite::testing::TestMulQuantized({4, 1, 2, 2, 1}, // input1 dims
{
F2Q(-0.8, input_min, input_max),
F2Q(0.2, input_min, input_max),
F2Q(0.9, input_min, input_max),
F2Q(0.7, input_min, input_max),
}, // input1 data
{4, 1, 2, 2, 1}, // input2 dims
{
F2Q(0.6, input_min, input_max),
F2Q(0.4, input_min, input_max),
F2Q(0.9, input_min, input_max),
F2Q(0.8, input_min, input_max),
}, // input2 data
input_min, input_max,
{4, 1, 2, 2, 1}, // output dims
output_min, output_max,
{
F2Q(-0.48, output_min, output_max),
F2Q(0.08, output_min, output_max),
F2Q(0.81, output_min, output_max),
F2Q(0.56, output_min, output_max),
}, // expected output data
output_data, kTfLiteActNone, 1);
}
TF_LITE_MICRO_TEST(UInt8NoActivationLargeMultiplier) {
using tflite::testing::F2Q;
const float input_min = -100;
const float input_max = 100;
const float output_min = -10;
const float output_max = 10;
uint8_t output_data[4];
tflite::testing::TestMulQuantized(
{4, 1, 2, 2, 1},
{
F2Q(-4, input_min, input_max),
F2Q(2, input_min, input_max),
F2Q(3, input_min, input_max),
F2Q(1, input_min, input_max),
},
{4, 1, 2, 2, 1},
{
F2Q(-1, input_min, input_max),
F2Q(-3, input_min, input_max),
F2Q(4, input_min, input_max),
F2Q(2, input_min, input_max),
},
input_min, input_max, {4, 1, 2, 2, 1}, output_min, output_max,
{
F2Q(4, output_min, output_max),
F2Q(-6, output_min, output_max),
F2Q(12, output_min, output_max),
F2Q(2, output_min, output_max),
},
// In TensorFlow Lite, this test has a maximum allowed error of 1.4f.
// With the output range [-10, 10] spread over 256 quantized levels
// (a step of roughly 0.078), a float difference of 1.4 corresponds to
// about 18 quantized units.
output_data, kTfLiteActNone, 18);
}
TF_LITE_MICRO_TEST(UInt8NoActivationBroadcast) {
using tflite::testing::F2Q;
const float input_min = -3.0;
const float input_max = 3.0;
const float output_min = -3.0;
const float output_max = 3.0;
uint8_t output_data[6];
tflite::testing::TestMulQuantized({4, 1, 3, 1, 2}, // input1 shape
{
F2Q(-2.0, input_min, input_max),
F2Q(0.2, input_min, input_max),
F2Q(0.7, input_min, input_max),
F2Q(0.8, input_min, input_max),
F2Q(1.1, input_min, input_max),
F2Q(2.0, input_min, input_max),
}, // input1 data
{1, 1}, // input2 shape
{
F2Q(0.1, input_min, input_max),
}, // input2 data
input_min, input_max,
{4, 1, 3, 1, 2}, // output shape
output_min, output_max,
{
F2Q(-0.2, output_min, output_max),
F2Q(0.02, output_min, output_max),
F2Q(0.07, output_min, output_max),
F2Q(0.08, output_min, output_max),
F2Q(0.11, output_min, output_max),
F2Q(0.2, output_min, output_max),
}, // expected output data
output_data, kTfLiteActNone, 1);
}
TF_LITE_MICRO_TEST(FloatNoActivation) {
float output_data[4];
tflite::testing::TestMulFloat(
{4, 1, 2, 2, 1}, // input1 shape
{-2.0, 0.2, 0.7, 0.8}, // input1 data
{4, 1, 2, 2, 1}, // input2 shape
{0.1, 0.2, 0.3, 0.5}, // input2 data
{4, 1, 2, 2, 1}, // output shape
{-0.2, 0.04, 0.21, 0.4}, // expected output data
output_data, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(FloatRelu) {
float output_data[4];
tflite::testing::TestMulFloat(
{4, 1, 2, 2, 1}, // input1 shape
{-2.0, 0.2, 0.7, 0.8}, // input1 data
{4, 1, 2, 2, 1}, // input2 shape
{0.1, 0.2, 0.3, 0.5}, // input2 data
{4, 1, 2, 2, 1}, // output shape
{-0.2, 0.04, 0.21, 0.4}, // expected output data
output_data, kTfLiteActRelu1);
}
TF_LITE_MICRO_TEST(FloatBroadcast) {
float output_data[6];
tflite::testing::TestMulFloat(
{4, 1, 3, 1, 2}, // input1 shape
{-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}, // input1 data
{1, 1}, // input2 shape
{0.1}, // input2 data
{4, 1, 3, 1, 2}, // output shape
{-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, // expected output data
output_data, kTfLiteActNone);
}
TF_LITE_MICRO_TESTS_END

View File

@@ -133,7 +133,9 @@ tensorflow/lite/kernels/internal/reference/integer_ops/add.h \
tensorflow/lite/kernels/internal/reference/integer_ops/conv.h \
tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h \
tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h \
tensorflow/lite/kernels/internal/reference/integer_ops/mul.h \
tensorflow/lite/kernels/internal/reference/maximum_minimum.h \
tensorflow/lite/kernels/internal/reference/mul.h \
tensorflow/lite/kernels/internal/reference/neg.h \
tensorflow/lite/kernels/internal/reference/pooling.h \
tensorflow/lite/kernels/internal/reference/prelu.h \

View File

@@ -415,6 +415,7 @@ cc_library(
"reference/integer_ops/tanh.h",
"reference/logistic.h",
"reference/maximum_minimum.h",
"reference/mul.h",
"reference/neg.h",
"reference/non_max_suppression.h",
"reference/pooling.h",
@@ -472,6 +473,7 @@ cc_library(
"reference/legacy_reference_ops.h",
"reference/logistic.h",
"reference/maximum_minimum.h",
"reference/mul.h",
"reference/neg.h",
"reference/pooling.h",
"reference/prelu.h",

View File

@@ -16,7 +16,6 @@ limitations under the License.
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
#include "fixedpoint/fixedpoint.h"
#include "profiling/instrumentation.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
@@ -46,7 +45,6 @@ inline void Mul(const ArithmeticParams& params,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
gemmlowp::ScopedProfilingLabel label("Mul/8bit");
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
@@ -58,7 +56,6 @@ inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
gemmlowp::ScopedProfilingLabel label("Mul/Int16Int8");
int32 output_offset = params.output_offset;
int32 output_activation_min = params.quantized_activation_min;
int32 output_activation_max = params.quantized_activation_max;
@@ -90,8 +87,6 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data) {
gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
// The input shapes are extended as part of NdArrayDesc initialization.

View File

@@ -0,0 +1,166 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
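// The input offsets are the negated zero points, so input*_val below are the
// integer values re-centered at zero; their int32 product is rescaled to the
// output scale by MultiplyByQuantizedMultiplier, shifted by the output zero
// point (output_offset) and clamped to the fused activation range.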
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8>(clamped_output);
}
}
template <typename T>
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
const int flat_size =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] * input2_data[i], output_activation_min,
output_activation_max);
}
}
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
}
}
}
}
}
template <typename T>
void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < output_shape.Dims(0); ++b) {
for (int y = 0; y < output_shape.Dims(1); ++y) {
for (int x = 0; x < output_shape.Dims(2); ++x) {
for (int c = 0; c < output_shape.Dims(3); ++c) {
output_data[Offset(output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
output_activation_min, output_activation_max);
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_

View File

@@ -43,6 +43,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/reference/logistic.h"
#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/kernels/internal/reference/neg.h"
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "tensorflow/lite/kernels/internal/reference/prelu.h"
@@ -358,24 +359,6 @@ inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
}
}
template <typename T>
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] * input2_data[i], output_activation_min,
output_activation_max);
}
}
// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
@@ -384,89 +367,6 @@ inline void Mul(const ArithmeticParams& params,
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T>
void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow");
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < output_shape.Dims(0); ++b) {
for (int y = 0; y < output_shape.Dims(1); ++y) {
for (int x = 0; x < output_shape.Dims(2); ++x) {
for (int c = 0; c < output_shape.Dims(3); ++c) {
output_data[Offset(output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
output_activation_min, output_activation_max);
}
}
}
}
}
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8>(clamped_output);
}
}
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
gemmlowp::ScopedProfilingLabel label("Mul/8bit");
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8* unswitched_input1_data,
@@ -519,48 +419,6 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
}
}
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
// The input shapes are extended as part of NdArrayDesc initialization.
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
}
}
}
}
}
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,