Support Global Average Pooling in XNNPACK delegate
- MEAN over spatial dimensions is converted as a Global Average Pooling PiperOrigin-RevId: 316031672 Change-Id: Icbecf2ccf2920c701ee2f6b04b6dcf9972b9ce0b
This commit is contained in:
parent
6642441bee
commit
ffc7592c82
@ -172,6 +172,22 @@ cc_library(
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "reduce_tester",
|
||||
testonly = 1,
|
||||
srcs = ["reduce_tester.cc"],
|
||||
hdrs = ["reduce_tester.h"],
|
||||
deps = [
|
||||
"//tensorflow/lite:framework",
|
||||
"//tensorflow/lite:schema_fbs_version",
|
||||
"//tensorflow/lite/c:common",
|
||||
"//tensorflow/lite/kernels:builtin_ops",
|
||||
"//tensorflow/lite/schema:schema_fbs",
|
||||
"@com_google_googletest//:gtest",
|
||||
"@flatbuffers",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "softmax_tester",
|
||||
testonly = 1,
|
||||
@ -429,6 +445,21 @@ cc_test(
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "mean_test",
|
||||
srcs = ["mean_test.cc"],
|
||||
linkopts = select({
|
||||
"//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
deps = [
|
||||
":reduce_tester",
|
||||
":test_main",
|
||||
":xnnpack_delegate_test_mode",
|
||||
"@com_google_googletest//:gtest",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "minimum_test",
|
||||
srcs = ["minimum_test.cc"],
|
||||
|
@ -165,6 +165,16 @@ Below is the list of current operators and limitations:
|
||||
|
||||
* Inputs and outputs must be in 32-bit floating-point format.
|
||||
|
||||
### `MEAN`
|
||||
|
||||
* The first input and the output must be 4D tensors in 32-bit
|
||||
floating-point format.
|
||||
* The second input (the input with the axes specification) must be static
|
||||
(use `kTfLiteMmapRo` allocation type).
|
||||
* Only [1, 2] or [2, 1] axes specification (i.e. reduction across spatial
|
||||
dimensions) is supported.
|
||||
* Only `keep_dims = True` parameter value is supported.
|
||||
|
||||
### `MINIMUM`
|
||||
|
||||
* Inputs and outputs must be in 32-bit floating-point format.
|
||||
|
265
tensorflow/lite/delegates/xnnpack/mean_test.cc
Normal file
265
tensorflow/lite/delegates/xnnpack/mean_test.cc
Normal file
@ -0,0 +1,265 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "tensorflow/lite/delegates/xnnpack/reduce_tester.h"
|
||||
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace xnnpack {
|
||||
|
||||
TEST(Mean, DISABLED_4DReduceBatch) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({0})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_4DReduceHeight) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({1})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_4DReduceWidth) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({2})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, 4DReduceHeightWidth) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({1, 2})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({2, 1})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_4DReduceChannels) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({3})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_3DReduceBatch) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, width, channels})
|
||||
.Axes({0})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_3DReduceWidth) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, width, channels})
|
||||
.Axes({1})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_3DReduceChannels) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, width, channels})
|
||||
.Axes({2})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_2DReduceBatch) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, channels})
|
||||
.Axes({0})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_2DReduceChannels) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, channels})
|
||||
.Axes({1})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, DISABLED_1D) {
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
|
||||
ReduceTester().InputShape({batch}).Axes({0}).Test(BuiltinOperator_MEAN,
|
||||
xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
TEST(Mean, MultiThreading) {
|
||||
TfLiteXNNPackDelegateOptions delegate_options =
|
||||
TfLiteXNNPackDelegateOptionsDefault();
|
||||
delegate_options.num_threads = 2;
|
||||
std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
|
||||
xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
|
||||
TfLiteXNNPackDelegateDelete);
|
||||
|
||||
std::random_device random_device;
|
||||
auto rng = std::mt19937(random_device());
|
||||
auto shape_rng =
|
||||
std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
|
||||
const auto batch = shape_rng();
|
||||
const auto height = shape_rng();
|
||||
const auto width = shape_rng();
|
||||
const auto channels = shape_rng();
|
||||
|
||||
ReduceTester()
|
||||
.InputShape({batch, height, width, channels})
|
||||
.Axes({1, 2})
|
||||
.Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
|
||||
}
|
||||
|
||||
} // namespace xnnpack
|
||||
} // namespace tflite
|
171
tensorflow/lite/delegates/xnnpack/reduce_tester.cc
Normal file
171
tensorflow/lite/delegates/xnnpack/reduce_tester.cc
Normal file
@ -0,0 +1,171 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/lite/delegates/xnnpack/reduce_tester.h"

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#include <gtest/gtest.h>
#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
|
||||
namespace tflite {
|
||||
namespace xnnpack {
|
||||
|
||||
// Runs the reduction model once with `delegate` applied and once with the
// default TFLite kernels on identical random inputs, and asserts that both
// executions produce numerically close outputs.
void ReduceTester::Test(tflite::BuiltinOperator reduce_op,
                        TfLiteDelegate* delegate) const {
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto input_rng = std::bind(
      std::uniform_real_distribution<float>(-15.0f, 15.0f), std::ref(rng));

  std::vector<char> buffer = CreateTfLiteModel(reduce_op);
  const Model* model = GetModel(buffer.data());

  std::unique_ptr<Interpreter> delegate_interpreter;
  ASSERT_EQ(
      InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())(
          &delegate_interpreter),
      kTfLiteOk);
  std::unique_ptr<Interpreter> default_interpreter;
  ASSERT_EQ(
      InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())(
          &default_interpreter),
      kTfLiteOk);

  ASSERT_TRUE(delegate_interpreter);
  ASSERT_TRUE(default_interpreter);

  ASSERT_EQ(delegate_interpreter->inputs().size(), 1);
  ASSERT_EQ(default_interpreter->inputs().size(), 1);

  ASSERT_EQ(delegate_interpreter->outputs().size(), 1);
  ASSERT_EQ(default_interpreter->outputs().size(), 1);

  ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk);
  ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk);

  ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk);

  // Fill the reference interpreter's input with random values, then mirror
  // the exact same values into the delegated interpreter's input.
  float* default_input_data = default_interpreter->typed_tensor<float>(
      default_interpreter->inputs()[0]);
  std::generate(default_input_data, default_input_data + InputSize(),
                std::ref(input_rng));

  float* delegate_input_data = delegate_interpreter->typed_tensor<float>(
      delegate_interpreter->inputs()[0]);
  std::copy(default_input_data, default_input_data + InputSize(),
            delegate_input_data);

  ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk);
  ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk);

  float* default_output_data = default_interpreter->typed_tensor<float>(
      default_interpreter->outputs()[0]);
  float* delegate_output_data = delegate_interpreter->typed_tensor<float>(
      delegate_interpreter->outputs()[0]);

  const int32_t output_size = OutputSize();
  // Fix: iterate with int32_t (was size_t) to match output_size and avoid a
  // signed/unsigned comparison.
  for (int32_t i = 0; i < output_size; i++) {
    // Tolerance scales with the magnitude of the reference value via
    // RelativeTolerance(), with an absolute floor of one epsilon.
    ASSERT_NEAR(
        default_output_data[i], delegate_output_data[i],
        std::numeric_limits<float>::epsilon() *
            std::max(std::abs(default_output_data[i]) * RelativeTolerance(),
                     1.0f));
  }
}
|
||||
|
||||
// Serializes a single-operator TFLite model that applies `reduce_op` to a
// float input tensor, with the reduction axes stored as a static int32 tensor.
std::vector<char> ReduceTester::CreateTfLiteModel(
    tflite::BuiltinOperator reduce_op) const {
  flatbuffers::FlatBufferBuilder builder;

  // Operator-code table: a single entry for the reduction operator.
  const flatbuffers::Offset<OperatorCode> reduce_opcode =
      CreateOperatorCode(builder, reduce_op);

  // Buffer 0 is the mandatory empty buffer; buffer 1 carries the axes data.
  const std::array<flatbuffers::Offset<Buffer>, 2> buffers{{
      CreateBuffer(builder, builder.CreateVector({})),
      CreateBuffer(builder, builder.CreateVector(
                                reinterpret_cast<const uint8_t*>(Axes().data()),
                                sizeof(int32_t) * Axes().size())),
  }};

  const std::vector<int32_t> output_shape = OutputShape();
  const std::array<int32_t, 1> axes_shape{
      {static_cast<int32_t>(Axes().size())}};

  // Tensor 0: float input; tensor 1: static int32 axes; tensor 2: float output.
  const std::array<flatbuffers::Offset<Tensor>, 3> tensors{{
      CreateTensor(builder,
                   builder.CreateVector<int32_t>(InputShape().data(),
                                                 InputShape().size()),
                   TensorType_FLOAT32),
      CreateTensor(
          builder,
          builder.CreateVector<int32_t>(axes_shape.data(), axes_shape.size()),
          TensorType_INT32, /*buffer=*/1),
      CreateTensor(builder,
                   builder.CreateVector<int32_t>(output_shape.data(),
                                                 output_shape.size()),
                   TensorType_FLOAT32),
  }};

  const flatbuffers::Offset<ReducerOptions> reducer_options =
      CreateReducerOptions(builder, KeepDims());

  const std::array<int32_t, 2> op_inputs{{0, 1}};
  const std::array<int32_t, 1> op_outputs{{2}};
  const flatbuffers::Offset<Operator> reduce_operator = CreateOperator(
      builder, /*opcode_index=*/0,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
      tflite::BuiltinOptions_ReducerOptions, reducer_options.Union());

  const std::array<int32_t, 1> subgraph_inputs{{0}};
  const std::array<int32_t, 1> subgraph_outputs{{2}};
  const flatbuffers::Offset<SubGraph> subgraph = CreateSubGraph(
      builder, builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(subgraph_inputs.data(),
                                    subgraph_inputs.size()),
      builder.CreateVector<int32_t>(subgraph_outputs.data(),
                                    subgraph_outputs.size()),
      builder.CreateVector(&reduce_operator, 1));

  const flatbuffers::Offset<flatbuffers::String> description =
      builder.CreateString("Reduce model");

  const flatbuffers::Offset<Model> model_offset = CreateModel(
      builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&reduce_opcode, 1),
      builder.CreateVector(&subgraph, 1), description,
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_offset);

  return std::vector<char>(builder.GetBufferPointer(),
                           builder.GetBufferPointer() + builder.GetSize());
}
|
||||
|
||||
// Returns the total element count of `shape`: the product of all dimensions
// (1 for an empty shape).
int32_t ReduceTester::ComputeSize(const std::vector<int32_t>& shape) {
  int32_t size = 1;
  for (const int32_t dim : shape) {
    size *= dim;
  }
  return size;
}
|
||||
|
||||
} // namespace xnnpack
|
||||
} // namespace tflite
|
117
tensorflow/lite/delegates/xnnpack/reduce_tester.h
Normal file
117
tensorflow/lite/delegates/xnnpack/reduce_tester.h
Normal file
@ -0,0 +1,117 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_REDUCE_TESTER_H_
|
||||
#define TENSORFLOW_LITE_DELEGATES_XNNPACK_REDUCE_TESTER_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace xnnpack {
|
||||
|
||||
class ReduceTester {
|
||||
public:
|
||||
ReduceTester() = default;
|
||||
ReduceTester(const ReduceTester&) = delete;
|
||||
ReduceTester& operator=(const ReduceTester&) = delete;
|
||||
|
||||
inline ReduceTester& InputShape(std::initializer_list<int32_t> shape) {
|
||||
for (auto it = shape.begin(); it != shape.end(); ++it) {
|
||||
EXPECT_GT(*it, 0);
|
||||
}
|
||||
input_shape_ = std::vector<int32_t>(shape.begin(), shape.end());
|
||||
input_size_ = ReduceTester::ComputeSize(input_shape_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline const std::vector<int32_t>& InputShape() const { return input_shape_; }
|
||||
|
||||
inline int32_t InputSize() const { return input_size_; }
|
||||
|
||||
inline ReduceTester& Axes(std::initializer_list<int32_t> axes) {
|
||||
for (auto it = axes.begin(); it != axes.end(); ++it) {
|
||||
EXPECT_GE(*it, 0);
|
||||
}
|
||||
axes_ = std::vector<int32_t>(axes.begin(), axes.end());
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline const std::vector<int32_t>& Axes() const { return axes_; }
|
||||
|
||||
inline ReduceTester& KeepDims(bool keep_dims) {
|
||||
keep_dims_ = keep_dims;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline bool KeepDims() const { return keep_dims_; }
|
||||
|
||||
inline std::vector<int32_t> OutputShape() const {
|
||||
std::vector<int32_t> output_shape;
|
||||
output_shape.reserve(InputShape().size());
|
||||
std::unordered_set<int32_t> axes_set(Axes().cbegin(), Axes().cend());
|
||||
for (int32_t i = 0; i < InputShape().size(); i++) {
|
||||
if (axes_set.count(i) != 0) {
|
||||
if (KeepDims()) {
|
||||
output_shape.push_back(1);
|
||||
}
|
||||
} else {
|
||||
output_shape.push_back(InputShape()[i]);
|
||||
}
|
||||
}
|
||||
return output_shape;
|
||||
}
|
||||
|
||||
inline int32_t OutputSize() const {
|
||||
int32_t output_size = 1;
|
||||
std::unordered_set<int32_t> axes_set(Axes().cbegin(), Axes().cend());
|
||||
for (int32_t i = 0; i < InputShape().size(); i++) {
|
||||
if (axes_set.count(i) == 0) {
|
||||
output_size *= InputShape()[i];
|
||||
}
|
||||
}
|
||||
return output_size;
|
||||
}
|
||||
|
||||
inline ReduceTester& RelativeTolerance(float relative_tolerance) {
|
||||
relative_tolerance_ = relative_tolerance;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline float RelativeTolerance() const { return relative_tolerance_; }
|
||||
|
||||
void Test(tflite::BuiltinOperator reduce_op, TfLiteDelegate* delegate) const;
|
||||
|
||||
private:
|
||||
std::vector<char> CreateTfLiteModel(tflite::BuiltinOperator reduce_op) const;
|
||||
|
||||
static int32_t ComputeSize(const std::vector<int32_t>& shape);
|
||||
|
||||
std::vector<int32_t> input_shape_;
|
||||
std::vector<int32_t> axes_;
|
||||
int32_t input_size_;
|
||||
bool keep_dims_ = true;
|
||||
float relative_tolerance_ = 10.0f;
|
||||
};
|
||||
|
||||
} // namespace xnnpack
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_REDUCE_TESTER_H_
|
@ -150,8 +150,9 @@ class Subgraph {
|
||||
}
|
||||
|
||||
switch (registration->builtin_code) {
|
||||
case kTfLiteBuiltinMean:
|
||||
case kTfLiteBuiltinPad:
|
||||
// Ignore the second input (static padding), because it is
|
||||
// Ignore the second input (static padding, or axes), because it is
|
||||
// represented as parameters of the XNNPACK operator rather than
|
||||
// extra input.
|
||||
{
|
||||
@ -723,6 +724,20 @@ class Subgraph {
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
static TfLiteStatus CheckAxesTensorShape(TfLiteContext* context,
|
||||
const TfLiteTensor& tensor,
|
||||
int tensor_index, int node_index) {
|
||||
if (tensor.dims->size != 1) {
|
||||
TF_LITE_MAYBE_KERNEL_LOG(context,
|
||||
"unexpected number of shape dimensions (%d) in "
|
||||
"axes tensor #%d in node #%d: "
|
||||
"expected a 1D tensor",
|
||||
tensor.dims->size, tensor_index, node_index);
|
||||
return kTfLiteError;
|
||||
}
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
static TfLiteStatus CheckTensorNonDynamicAllocation(
|
||||
TfLiteContext* context, const TfLiteTensor& tensor, int tensor_index,
|
||||
int node_index) {
|
||||
@ -846,6 +861,13 @@ class Subgraph {
|
||||
case kTfLiteBuiltinMaximum:
|
||||
return VisitMaximumNode(subgraph, logging_context, node_index, node,
|
||||
context->tensors, xnnpack_tensors);
|
||||
case kTfLiteBuiltinMean: {
|
||||
const TfLiteReducerParams* reducer_params =
|
||||
static_cast<const TfLiteReducerParams*>(node->builtin_data);
|
||||
|
||||
return VisitMeanNode(subgraph, logging_context, node_index, node,
|
||||
context->tensors, reducer_params, xnnpack_tensors);
|
||||
}
|
||||
case kTfLiteBuiltinMinimum:
|
||||
return VisitMinimumNode(subgraph, logging_context, node_index, node,
|
||||
context->tensors, xnnpack_tensors);
|
||||
@ -1723,6 +1745,85 @@ class Subgraph {
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
static TfLiteStatus VisitMeanNode(
|
||||
xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index,
|
||||
TfLiteNode* node, const TfLiteTensor* tensors,
|
||||
const TfLiteReducerParams* reducer_params,
|
||||
const std::vector<uint32_t>& xnnpack_tensors) {
|
||||
TF_LITE_ENSURE_STATUS(
|
||||
CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index));
|
||||
|
||||
const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]];
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorFloatType(
|
||||
logging_context, input_tensor, node->inputs->data[0], node_index));
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 4,
|
||||
node->inputs->data[0]));
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
|
||||
logging_context, input_tensor, node->inputs->data[0], node_index));
|
||||
|
||||
const TfLiteTensor& axes_tensor = tensors[node->inputs->data[1]];
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorType(logging_context, axes_tensor,
|
||||
kTfLiteInt32, node->inputs->data[1],
|
||||
node_index));
|
||||
TF_LITE_ENSURE_STATUS(CheckAxesTensorShape(
|
||||
logging_context, axes_tensor, node->inputs->data[1], node_index));
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation(
|
||||
logging_context, axes_tensor, node->inputs->data[1], node_index));
|
||||
|
||||
const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]];
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorFloatType(
|
||||
logging_context, output_tensor, node->outputs->data[0], node_index));
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 4,
|
||||
node->outputs->data[0]));
|
||||
TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
|
||||
logging_context, output_tensor, node->outputs->data[0], node_index));
|
||||
|
||||
if (!reducer_params->keep_dims) {
|
||||
TF_LITE_MAYBE_KERNEL_LOG(
|
||||
logging_context,
|
||||
"unsupported MEAN reduction without keep_dims attributes in node %d",
|
||||
node_index);
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
if (axes_tensor.dims->data[0] != 2) {
|
||||
TF_LITE_MAYBE_KERNEL_LOG(
|
||||
logging_context,
|
||||
"unsupported MEAN reduction along %d axes in node %d",
|
||||
axes_tensor.dims->data[0], node_index);
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
const int32_t* axes_data =
|
||||
reinterpret_cast<const int32_t*>(axes_tensor.data.data);
|
||||
if (std::min(axes_data[0], axes_data[1]) != 1 ||
|
||||
std::max(axes_data[0], axes_data[1]) != 2) {
|
||||
TF_LITE_MAYBE_KERNEL_LOG(logging_context,
|
||||
"unsupported MEAN reduction along non-spatial "
|
||||
"axes %d and %d in node %d",
|
||||
std::min(axes_data[0], axes_data[1]),
|
||||
std::max(axes_data[0], axes_data[1]),
|
||||
node_index);
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
if (subgraph != nullptr) {
|
||||
const xnn_status status = xnn_define_global_average_pooling_2d(
|
||||
subgraph,
|
||||
/*output_min=*/-std::numeric_limits<float>::infinity(),
|
||||
/*output_max=*/+std::numeric_limits<float>::infinity(),
|
||||
/*input_id=*/xnnpack_tensors[node->inputs->data[0]],
|
||||
/*output_id=*/xnnpack_tensors[node->outputs->data[0]], /*flags=*/0);
|
||||
if (status != xnn_status_success) {
|
||||
TF_LITE_KERNEL_LOG(logging_context, "failed to delegate MEAN node #%d",
|
||||
node_index);
|
||||
return kTfLiteError;
|
||||
}
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
static TfLiteStatus VisitMediaPipeDeconvolutionNode(
|
||||
xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index,
|
||||
TfLiteNode* node, const TfLiteTensor* tensors,
|
||||
|
@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
|
||||
|
||||
tf_http_archive(
|
||||
name = "XNNPACK",
|
||||
sha256 = "abdb7ec410e5ee5264178973665d0071362223699639dc08de37a4c3ca4b0a61",
|
||||
strip_prefix = "XNNPACK-af4524811a6d3123aa5fd603a232d97b6be2c7c9",
|
||||
sha256 = "7469a0a634bfa90395ed311d07a21b1d0003604b37b12745bad1cf17860984e1",
|
||||
strip_prefix = "XNNPACK-a059b7da184954fb6c01db0e7959352ee805e9f3",
|
||||
urls = [
|
||||
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/af4524811a6d3123aa5fd603a232d97b6be2c7c9.zip",
|
||||
"https://github.com/google/XNNPACK/archive/af4524811a6d3123aa5fd603a232d97b6be2c7c9.zip",
|
||||
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/a059b7da184954fb6c01db0e7959352ee805e9f3.zip",
|
||||
"https://github.com/google/XNNPACK/archive/a059b7da184954fb6c01db0e7959352ee805e9f3.zip",
|
||||
],
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user