From 00c62a3f6e6afeb744214af31de5bc6fd4a6ecb6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 1 Aug 2019 23:08:17 -0700 Subject: [PATCH] Add maximum and minimum ops for TF Micro PiperOrigin-RevId: 261267068 --- .../lite/experimental/micro/kernels/BUILD | 15 + .../micro/kernels/all_ops_resolver.cc | 5 +- .../micro/kernels/maximum_minimum.cc | 141 ++++++++ .../micro/kernels/maximum_minimum_test.cc | 314 ++++++++++++++++++ .../experimental/micro/tools/make/Makefile | 1 + tensorflow/lite/kernels/internal/BUILD | 2 + .../internal/reference/maximum_minimum.h | 61 ++++ .../internal/reference/reference_ops.h | 36 +- 8 files changed, 539 insertions(+), 36 deletions(-) create mode 100644 tensorflow/lite/experimental/micro/kernels/maximum_minimum.cc create mode 100644 tensorflow/lite/experimental/micro/kernels/maximum_minimum_test.cc create mode 100644 tensorflow/lite/kernels/internal/reference/maximum_minimum.h diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD index 85cbaf986b6..ca013a304e4 100644 --- a/tensorflow/lite/experimental/micro/kernels/BUILD +++ b/tensorflow/lite/experimental/micro/kernels/BUILD @@ -20,6 +20,7 @@ cc_library( "elementwise.cc", "floor.cc", "fully_connected.cc", + "maximum_minimum.cc", "pooling.cc", "prelu.cc", "softmax.cc", @@ -63,6 +64,7 @@ cc_library( "elementwise.cc", "floor.cc", "fully_connected.cc", + "maximum_minimum.cc", "pooling.cc", "portable_optimized/depthwise_conv.cc", "prelu.cc", @@ -214,6 +216,19 @@ tflite_micro_cc_test( ], ) +tflite_micro_cc_test( + name = "maximum_minimum_test", + srcs = [ + "maximum_minimum_test.cc", + ], + deps = [ + ":all_ops_resolver", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/experimental/micro:micro_framework", + "//tensorflow/lite/experimental/micro/testing:micro_test", + ], +) + tflite_micro_cc_test( name = "arg_min_max_test", srcs = [ diff --git a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc 
b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc index dc86d034349..42f9b108832 100644 --- a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc +++ b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc @@ -25,9 +25,10 @@ TfLiteRegistration* Register_MAX_POOL_2D(); TfLiteRegistration* Register_ABS(); TfLiteRegistration* Register_PRELU(); TfLiteRegistration* Register_FLOOR(); +TfLiteRegistration* Register_MAXIMUM(); +TfLiteRegistration* Register_MINIMUM(); TfLiteRegistration* Register_ARG_MAX(); TfLiteRegistration* Register_ARG_MIN(); - AllOpsResolver::AllOpsResolver() { AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), @@ -40,6 +41,8 @@ AllOpsResolver::AllOpsResolver() { AddBuiltin(BuiltinOperator_ABS, Register_ABS()); AddBuiltin(BuiltinOperator_PRELU, Register_PRELU()); AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR()); + AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM()); + AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM()); AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX()); AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN()); } diff --git a/tensorflow/lite/experimental/micro/kernels/maximum_minimum.cc b/tensorflow/lite/experimental/micro/kernels/maximum_minimum.cc new file mode 100644 index 00000000000..bbbfb03f182 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/maximum_minimum.cc @@ -0,0 +1,141 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace micro { +namespace maximum_minimum { +namespace { + +// This file has a reference implementation of TFMaximum/TFMinimum. +enum KernelType { + kReference, +}; + +constexpr int kInputTensor1 = 0; +constexpr int kInputTensor2 = 1; +constexpr int kOutputTensor = 0; + +struct OpContext { + OpContext(TfLiteContext* context, TfLiteNode* node) { + input1 = GetInput(context, node, kInputTensor1); + input2 = GetInput(context, node, kInputTensor2); + output = GetOutput(context, node, kOutputTensor); + } + const TfLiteTensor* input1; + const TfLiteTensor* input2; + TfLiteTensor* output; +}; + +struct MaximumOp { + template <typename data_type> + static data_type op(data_type el1, data_type el2) { + return el1 > el2 ? el1 : el2; + } +}; + +struct MinimumOp { + template <typename data_type> + static data_type op(data_type el1, data_type el2) { + return el1 < el2 ? 
el1 : el2; + } +}; + +} // namespace + +template <KernelType kernel_type, typename data_type, typename op_type> +void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, + const OpContext& op_context) { + reference_ops::MaximumMinimumBroadcast4DSlow( + GetTensorShape(op_context.input1), + GetTensorData<data_type>(op_context.input1), + GetTensorShape(op_context.input2), + GetTensorData<data_type>(op_context.input2), + GetTensorShape(op_context.output), + GetTensorData<data_type>(op_context.output), + op_type::template op<data_type>); +} + +template <KernelType kernel_type, typename OpType> +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpContext op_context(context, node); + + if (kernel_type == kReference) { + switch (op_context.output->type) { + case kTfLiteFloat32: + TFLiteOperation<kernel_type, float, OpType>(context, node, op_context); + break; + case kTfLiteUInt8: + TFLiteOperation<kernel_type, uint8_t, OpType>(context, node, op_context); + break; + case kTfLiteInt8: + TFLiteOperation<kernel_type, int8_t, OpType>(context, node, op_context); + break; + case kTfLiteInt32: + TFLiteOperation<kernel_type, int32_t, OpType>(context, node, op_context); + break; + case kTfLiteInt64: + TFLiteOperation<kernel_type, int64_t, OpType>(context, node, op_context); + break; + default: + context->ReportError(context, + "Type %d is not supported by Maximum/Minimum.", + op_context.output->type); + return kTfLiteError; + } + } else { + context->ReportError(context, + "Kernel type not supported by Maximum/Minimum.", + op_context.output->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace maximum_minimum + +TfLiteRegistration* Register_MAXIMUM() { + static TfLiteRegistration r = { + /* init */ nullptr, + /* free */ nullptr, + /* prepare */ nullptr, + maximum_minimum::Eval<maximum_minimum::kReference, maximum_minimum::MaximumOp>}; + return &r; +} + +TfLiteRegistration* Register_MINIMUM() { + static TfLiteRegistration r = { + /* init */ nullptr, + /* free */ nullptr, + /* prepare */ nullptr, + maximum_minimum::Eval<maximum_minimum::kReference, maximum_minimum::MinimumOp>}; + return &r; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/micro/kernels/maximum_minimum_test.cc b/tensorflow/lite/experimental/micro/kernels/maximum_minimum_test.cc new file mode 100644 index 
00000000000..b944b4bd841 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/maximum_minimum_test.cc @@ -0,0 +1,314 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h" +#include "tensorflow/lite/experimental/micro/testing/micro_test.h" +#include "tensorflow/lite/experimental/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestMaxMinFloat(tflite::BuiltinOperator op, + std::initializer_list<int> input1_dims_data, + std::initializer_list<float> input1_data, + std::initializer_list<int> input2_dims_data, + std::initializer_list<float> input2_data, + std::initializer_list<float> expected_output_data, + std::initializer_list<int> output_dims_data, + float* output_data) { + TfLiteIntArray* input1_dims = IntArrayFromInitializer(input1_dims_data); + TfLiteIntArray* input2_dims = IntArrayFromInitializer(input2_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 2; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor 
tensors[tensors_size] = { + CreateFloatTensor(input1_data, input1_dims, "input1_tensor"), + CreateFloatTensor(input2_data, input2_dims, "input2_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = resolver.FindOp(op, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); + TfLiteIntArray* outputs_array = IntArrayFromInitializer({1, 2}); + TfLiteIntArray* temporaries_array = IntArrayFromInitializer({0}); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = nullptr; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5); + } +} + +void TestMaxMinQuantized( + tflite::BuiltinOperator op, std::initializer_list<int> input1_dims_data, + std::initializer_list<uint8_t> input1_data, float input1_min, + float input1_max, std::initializer_list<int> input2_dims_data, + std::initializer_list<uint8_t> input2_data, float input2_min, + float input2_max, std::initializer_list<uint8_t> expected_output_data, + float output_min, float output_max, + std::initializer_list<int> output_dims_data, uint8_t* output_data) { + TfLiteIntArray* input1_dims = IntArrayFromInitializer(input1_dims_data); + TfLiteIntArray* input2_dims = IntArrayFromInitializer(input2_dims_data); + TfLiteIntArray* output_dims = 
IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 2; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input1_data, input1_dims, "input1_tensor", + input1_min, input1_max), + CreateQuantizedTensor(input2_data, input2_dims, "input2_tensor", + input2_min, input2_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = resolver.FindOp(op, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); + TfLiteIntArray* outputs_array = IntArrayFromInitializer({1, 2}); + TfLiteIntArray* temporaries_array = IntArrayFromInitializer({0}); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = nullptr; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + } +} + +void TestMaxMinQuantizedInt32( + tflite::BuiltinOperator op, std::initializer_list<int> input1_dims_data, + std::initializer_list<int32_t> input1_data, float input1_min, + float input1_max, std::initializer_list<int> input2_dims_data, + std::initializer_list<int32_t> input2_data, float input2_min, + float input2_max, std::initializer_list<int32_t> 
expected_output_data, + float output_min, float output_max, + std::initializer_list<int> output_dims_data, int32_t* output_data) { + TfLiteIntArray* input1_dims = IntArrayFromInitializer(input1_dims_data); + TfLiteIntArray* input2_dims = IntArrayFromInitializer(input2_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 2; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantized32Tensor(input1_data, input1_dims, "input1_tensor", + input1_min, input1_max), + CreateQuantized32Tensor(input2_data, input2_dims, "input2_tensor", + input2_min, input2_max), + CreateQuantized32Tensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = resolver.FindOp(op, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); + TfLiteIntArray* outputs_array = IntArrayFromInitializer({1, 2}); + TfLiteIntArray* temporaries_array = IntArrayFromInitializer({0}); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = nullptr; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + } +} + +} 
// namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(FloatTest) { + std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44}; + std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43}; + float output_data[6]; + + tflite::testing::TestMaxMinFloat( + tflite::BuiltinOperator_MAXIMUM, {3, 3, 1, 2}, + data1, // input1 shape and data + {3, 3, 1, 2}, data2, // input2 shape and data + {1.0, 0.0, 1.0, 12.0, -2.0, -1.43}, // expected output + {3, 3, 1, 2}, output_data); // output shape and data buffer + + tflite::testing::TestMaxMinFloat( + tflite::BuiltinOperator_MINIMUM, {3, 3, 1, 2}, + data1, // input1 shape and data + {3, 3, 1, 2}, data2, // input2 shape and data + {-1.0, 0.0, -1.0, 11.0, -3.0, -1.44}, // expected output + {3, 3, 1, 2}, output_data); // output shape and data buffer +} + +TF_LITE_MICRO_TEST(Uint8Test) { + std::initializer_list<uint8_t> data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list<uint8_t> data2 = {0, 0, 1, 12, 255, 1}; + const float input1_min = -63.5; + const float input1_max = 64; + const float input2_min = -63.5; + const float input2_max = 64; + const float output_min = -63.5; + const float output_max = 64; + + uint8_t output_data[6]; + + tflite::testing::TestMaxMinQuantized( + tflite::BuiltinOperator_MAXIMUM, + // input1 shape, data and bounds + {3, 3, 1, 2}, data1, input1_min, input1_max, + // input2 shape, data and bounds + {3, 3, 1, 2}, data2, input2_min, input2_max, + // expected output + {1, 0, 2, 12, 255, 23}, + // output bounds, shape and data buffer + output_min, output_max, {3, 3, 1, 2}, output_data); + + tflite::testing::TestMaxMinQuantized( + tflite::BuiltinOperator_MINIMUM, + // input1 shape, data and bounds + {3, 3, 1, 2}, data1, input1_min, input1_max, + // input2 shape, data and bounds + {3, 3, 1, 2}, data2, input2_min, input2_max, + // expected output + {0, 0, 1, 11, 2, 1}, + // output bounds, shape and data buffer + output_min, output_max, {3, 3, 1, 2}, output_data); 
+} + +TF_LITE_MICRO_TEST(FloatWithBroadcastTest) { + std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0}; + std::initializer_list<float> data2 = {0.5, 2.0}; + float output_data[6]; + + tflite::testing::TestMaxMinFloat( + tflite::BuiltinOperator_MAXIMUM, {3, 3, 1, 2}, + data1, // input1 shape and data + {1, 2}, data2, // input2 shape and data + {1.0, 2.0, 0.5, 2.0, 0.5, 11.0}, // expected output + {3, 3, 1, 2}, output_data); // output shape and data buffer + + tflite::testing::TestMaxMinFloat( + tflite::BuiltinOperator_MINIMUM, {3, 3, 1, 2}, + data1, // input1 shape and data + {1, 2}, data2, // input2 shape and data + {0.5, 0.0, -1.0, -2.0, -1.44, 2.0}, // expected output + {3, 3, 1, 2}, output_data); // output shape and data buffer +} + +TF_LITE_MICRO_TEST(Int32WithBroadcastTest) { + const float input1_min = -63.5; + const float input1_max = 64; + const float input2_min = -63.5; + const float input2_max = 64; + const float output_min = -63.5; + const float output_max = 64; + std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11}; + std::initializer_list<int32_t> data2 = {2}; + int32_t output_data[6]; + + tflite::testing::TestMaxMinQuantizedInt32( + tflite::BuiltinOperator_MAXIMUM, + // input1 shape, data and bounds + {3, 3, 1, 2}, data1, input1_min, input1_max, + // input2 shape, data and bounds + {1, 1}, data2, input2_min, input2_max, + // expected output + {2, 2, 2, 2, 3, 11}, + // output bounds, shape and data buffer + output_min, output_max, {3, 3, 1, 2}, output_data); + + tflite::testing::TestMaxMinQuantizedInt32( + tflite::BuiltinOperator_MINIMUM, + // input1 shape, data and bounds + {3, 3, 1, 2}, data1, input1_min, input1_max, + // input2 shape, data and bounds + {1, 1}, data2, input2_min, input2_max, + // expected output + {1, 0, -1, -2, 2, 2}, + // output bounds, shape and data buffer + output_min, output_max, {3, 3, 1, 2}, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile 
b/tensorflow/lite/experimental/micro/tools/make/Makefile index 36366128f60..f51be430df3 100644 --- a/tensorflow/lite/experimental/micro/tools/make/Makefile +++ b/tensorflow/lite/experimental/micro/tools/make/Makefile @@ -114,6 +114,7 @@ tensorflow/lite/kernels/internal/reference/floor.h \ tensorflow/lite/kernels/internal/reference/fully_connected.h \ tensorflow/lite/kernels/internal/reference/pooling.h \ tensorflow/lite/kernels/internal/reference/prelu.h \ +tensorflow/lite/kernels/internal/reference/maximum_minimum.h \ tensorflow/lite/kernels/internal/reference/softmax.h \ tensorflow/lite/kernels/internal/reference/arg_min_max.h \ tensorflow/lite/kernels/internal/round.h \ diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 7c5889f82e1..f1e91450fe1 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -366,6 +366,7 @@ cc_library( "reference/integer_ops/pooling.h", "reference/integer_ops/softmax.h", "reference/integer_ops/tanh.h", + "reference/maximum_minimum.h", "reference/pooling.h", "reference/prelu.h", "reference/reference_ops.h", @@ -409,6 +410,7 @@ cc_library( "reference/floor.h", "reference/fully_connected.h", "reference/legacy_reference_ops.h", + "reference/maximum_minimum.h", "reference/pooling.h", "reference/prelu.h", "reference/reference_ops.h", diff --git a/tensorflow/lite/kernels/internal/reference/maximum_minimum.h b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h new file mode 100644 index 00000000000..480069aa13e --- /dev/null +++ b/tensorflow/lite/kernels/internal/reference/maximum_minimum.h @@ -0,0 +1,61 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_ + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +template <typename T, typename Op> +void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data, Op op) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = op(in1_val, in2_val); + } + } + } + 
} +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_ diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index db22827dc79..932df39fe33 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/conv.h" #include "tensorflow/lite/kernels/internal/reference/floor.h" #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h" #include "tensorflow/lite/kernels/internal/reference/pooling.h" #include "tensorflow/lite/kernels/internal/reference/prelu.h" #include "tensorflow/lite/kernels/internal/reference/softmax.h" @@ -3538,41 +3539,6 @@ inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data, Maximum(input1_shape, input1_data, input2_data, output_shape, output_data); } -template <typename T, typename Op> -void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape, - const T* input1_data, - const RuntimeShape& unextended_input2_shape, - const T* input2_data, - const RuntimeShape& unextended_output_shape, - T* output_data, Op op) { - gemmlowp::ScopedProfilingLabel label("MaximumMinimumBroadcast4DSlow"); - TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); - const RuntimeShape output_shape = - RuntimeShape::ExtendedShape(4, unextended_output_shape); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, - unextended_input2_shape, &desc1, &desc2); - - for (int b = 0; b < output_shape.Dims(0); ++b) { - for (int y = 0; y < output_shape.Dims(1); 
++y) { - for (int x = 0; x < output_shape.Dims(2); ++x) { - for (int c = 0; c < output_shape.Dims(3); ++c) { - auto out_idx = Offset(output_shape, b, y, x, c); - auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); - auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); - auto in1_val = input1_data[in1_idx]; - auto in2_val = input2_data[in2_idx]; - output_data[out_idx] = op(in1_val, in2_val); - } - } - } - } -} - template void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data, const T3* input2_data, const RuntimeShape& output_shape,