From 5dbc34f565304a89216038a751302c32207377b3 Mon Sep 17 00:00:00 2001
From: Marat Dukhan <maratek@google.com>
Date: Mon, 25 May 2020 15:49:01 -0700
Subject: [PATCH] Support Pad with static paddings in XNNPACK delegate

PiperOrigin-RevId: 313107760
Change-Id: I7b04b9977081e760e9604f72d9da5f499ada88f3
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  31 ++
 tensorflow/lite/delegates/xnnpack/README.md   |  37 +--
 tensorflow/lite/delegates/xnnpack/pad_test.cc | 279 ++++++++++++++++++
 .../lite/delegates/xnnpack/pad_tester.cc      | 187 ++++++++++++
 .../lite/delegates/xnnpack/pad_tester.h       |  89 ++++++
 .../delegates/xnnpack/xnnpack_delegate.cc     | 199 ++++++++++++-
 tensorflow/workspace.bzl                      |   8 +-
 7 files changed, 782 insertions(+), 48 deletions(-)
 create mode 100644 tensorflow/lite/delegates/xnnpack/pad_test.cc
 create mode 100644 tensorflow/lite/delegates/xnnpack/pad_tester.cc
 create mode 100644 tensorflow/lite/delegates/xnnpack/pad_tester.h

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index e8e6c061160..6edb757e83f 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -91,6 +91,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "pad_tester",
+    testonly = 1,
+    srcs = ["pad_tester.cc"],
+    hdrs = ["pad_tester.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
 cc_library(
     name = "pool_2d_tester",
     testonly = 1,
@@ -293,6 +309,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "pad_test",
+    srcs = ["pad_test.cc"],
+    linkopts = select({
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":pad_tester",
+        ":test_main",
+        ":xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "relu_test",
     srcs = ["relu_test.cc"],
diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md
index c4e3f540faf..98a08a4f647 100644
--- a/tensorflow/lite/delegates/xnnpack/README.md
+++ b/tensorflow/lite/delegates/xnnpack/README.md
@@ -92,8 +92,6 @@ Below is the list of current operators and limitations:
 * Only addition with two inputs is supported.
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `AVERAGE_POOL_2D`
 
@@ -101,8 +99,6 @@ Below is the list of current operators and limitations:
 * 1x1 pooling is not supported.
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `CONV_2D`
 
@@ -111,8 +107,6 @@ Below is the list of current operators and limitations:
 * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type).
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output
-  are not supported.
 
 ### `DEPTHWISE_CONV_2D`
 
@@ -121,8 +115,6 @@ Below is the list of current operators and limitations:
 * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type).
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output
-  are not supported.
 
 ### `FULLY_CONNECTED`
 
@@ -131,20 +123,14 @@ Below is the list of current operators and limitations:
 * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type).
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output
-  are not supported.
 
 ### `HARD_SWISH`
 
 * Inputs and outputs must be in 32-bit floating-point format.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `LOGISTIC`
 
 * Inputs and outputs must be in 32-bit floating-point format.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `MAX_POOL_2D`
 
@@ -152,16 +138,19 @@ Below is the list of current operators and limitations:
 * 1x1 pooling is not supported.
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `MUL`
 
 * Inputs and outputs must be in 32-bit floating-point format.
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
+
+### `PAD`
+
+* The first input and the output must be in 32-bit floating-point format.
+* The second input (the input with the padding specification) must be static
+  (use `kTfLiteMmapRo` allocation type).
+* The numbers of padding elements must be non-negative.
 
 ### `PRELU`
 
@@ -169,36 +158,28 @@ Below is the list of current operators and limitations:
 * Slope must be static (use `kTfLiteMmapRo` allocation type).
 * Slope must be either a 1D tensor, or have all its non-channel dimensions equal
   1.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output
-  are not supported.
 
 ### `RELU`
 
 * Inputs and outputs must be in 32-bit floating-point format.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `RELU6`
 
 * Inputs and outputs must be in 32-bit floating-point format.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `RELU_N1_TO_1`
 
 * Inputs and outputs must be in 32-bit floating-point format.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### `SOFTMAX`
 
 * Inputs and outputs must be in 32-bit floating-point format.
 * Only `beta = 1.0` is supported.
-* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
-  output are not supported.
 
 ### Other limitations
 
+* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and
+  outputs are not supported.
 * Resizing model inputs (via `Interpreter::ResizeInputTensor`) is supported, but
   cause a complete reinitialization of the delegate instance, which has
   considerable overhead.
diff --git a/tensorflow/lite/delegates/xnnpack/pad_test.cc b/tensorflow/lite/delegates/xnnpack/pad_test.cc
new file mode 100644
index 00000000000..c93ff8ab661
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/pad_test.cc
@@ -0,0 +1,279 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/pad_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(Pad, Full4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), pad_rng(), pad_rng(), pad_rng()})
+      .InputPostPaddings({pad_rng(), pad_rng(), pad_rng(), pad_rng()})
+      .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Batch4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), 0, 0, 0})
+      .InputPostPaddings({pad_rng(), 0, 0, 0})
+      .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, HeightAndWidth4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, pad_rng(), pad_rng(), 0})
+      .InputPostPaddings({0, pad_rng(), pad_rng(), 0})
+      .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Channels4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, 0, 0, pad_rng()})
+      .InputPostPaddings({0, 0, 0, pad_rng()})
+      .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Full3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), pad_rng(), pad_rng()})
+      .InputPostPaddings({pad_rng(), pad_rng(), pad_rng()})
+      .InputShape({shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Batch3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), 0, 0})
+      .InputPostPaddings({pad_rng(), 0, 0})
+      .InputShape({shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Width3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, pad_rng(), 0})
+      .InputPostPaddings({0, pad_rng(), 0})
+      .InputShape({shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Channels3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, 0, pad_rng()})
+      .InputPostPaddings({0, 0, pad_rng()})
+      .InputShape({shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Full2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), pad_rng()})
+      .InputPostPaddings({pad_rng(), pad_rng()})
+      .InputShape({shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Batch2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), 0})
+      .InputPostPaddings({pad_rng(), 0})
+      .InputShape({shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, Channels2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, pad_rng()})
+      .InputPostPaddings({0, pad_rng()})
+      .InputShape({shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, 1D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({pad_rng(), pad_rng()})
+      .InputPostPaddings({pad_rng(), pad_rng()})
+      .InputShape({shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(Pad, MultiThreading) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto pad_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(1, 3), std::ref(rng));
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+
+  PadTester()
+      .InputPrePaddings({0, 0, 0, pad_rng()})
+      .InputPostPaddings({0, 0, 0, pad_rng()})
+      .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()})
+      .Test(xnnpack_delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/pad_tester.cc b/tensorflow/lite/delegates/xnnpack/pad_tester.cc
new file mode 100644
index 00000000000..e364b880124
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/pad_tester.cc
@@ -0,0 +1,187 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/xnnpack/pad_tester.h"
+
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+namespace tflite {
+namespace xnnpack {
+
+std::vector<int32_t> PadTester::OutputShape() const {
+  std::vector<int32_t> output_shape;
+  output_shape.reserve(InputShape().size());
+  for (size_t i = 0; i < InputShape().size(); i++) {
+    int32_t output_dim = InputShape()[i];
+    if (i < InputPrePaddings().size()) {
+      output_dim += InputPrePaddings()[i];
+    }
+    if (i < InputPostPaddings().size()) {
+      output_dim += InputPostPaddings()[i];
+    }
+    output_shape.push_back(output_dim);
+  }
+  return output_shape;
+}
+
+void PadTester::Test(TfLiteDelegate* delegate) const {
+  ASSERT_EQ(InputPrePaddings().size(), InputPostPaddings().size());
+  ASSERT_LE(InputPrePaddings().size(), InputShape().size());
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto input_rng =
+      std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+
+  std::vector<char> buffer = CreateTfLiteModel();
+  const Model* model = GetModel(buffer.data());
+
+  std::unique_ptr<Interpreter> delegate_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())(
+          &delegate_interpreter),
+      kTfLiteOk);
+  std::unique_ptr<Interpreter> default_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())(
+          &default_interpreter),
+      kTfLiteOk);
+
+  ASSERT_TRUE(delegate_interpreter);
+  ASSERT_TRUE(default_interpreter);
+
+  ASSERT_EQ(delegate_interpreter->inputs().size(), 1);
+  ASSERT_EQ(default_interpreter->inputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->outputs().size(), 1);
+  ASSERT_EQ(default_interpreter->outputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk);
+
+  ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk);
+
+  float* default_input_data = default_interpreter->typed_tensor<float>(
+      default_interpreter->inputs()[0]);
+  std::generate(default_input_data,
+                default_input_data + ComputeSize(InputShape()),
+                std::ref(input_rng));
+
+  float* delegate_input_data = delegate_interpreter->typed_tensor<float>(
+      delegate_interpreter->inputs()[0]);
+  std::copy(default_input_data, default_input_data + ComputeSize(InputShape()),
+            delegate_input_data);
+
+  ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk);
+  ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk);
+
+  float* default_output_data = default_interpreter->typed_tensor<float>(
+      default_interpreter->outputs()[0]);
+  float* delegate_output_data = delegate_interpreter->typed_tensor<float>(
+      delegate_interpreter->outputs()[0]);
+
+  for (size_t i = 0; i < ComputeSize(OutputShape()); i++) {
+    ASSERT_EQ(default_output_data[i], delegate_output_data[i]);
+  }
+}
+
+std::vector<char> PadTester::CreateTfLiteModel() const {
+  flatbuffers::FlatBufferBuilder builder;
+  flatbuffers::Offset<OperatorCode> operator_code =
+      CreateOperatorCode(builder, BuiltinOperator_PAD);
+
+  std::vector<int32_t> paddings(InputPrePaddings().size() +
+                                InputPostPaddings().size());
+  for (size_t i = 0; i < InputPrePaddings().size(); i++) {
+    paddings[i * 2] = InputPrePaddings()[i];
+    paddings[i * 2 + 1] = InputPostPaddings()[i];
+  }
+  const std::array<flatbuffers::Offset<Buffer>, 2> buffers{{
+      CreateBuffer(builder, builder.CreateVector({})),
+      CreateBuffer(builder,
+                   builder.CreateVector(
+                       reinterpret_cast<const uint8_t*>(paddings.data()),
+                       sizeof(float) * paddings.size())),
+  }};
+
+  const std::vector<int32_t> output_shape = OutputShape();
+  const std::array<int32_t, 2> paddings_shape{
+      {static_cast<int32_t>(InputPrePaddings().size()), 2}};
+  const std::array<flatbuffers::Offset<Tensor>, 3> tensors{{
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(InputShape().data(),
+                                                 InputShape().size()),
+                   TensorType_FLOAT32),
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(paddings_shape.data(),
+                                                 paddings_shape.size()),
+                   TensorType_INT32, /*buffer=*/1),
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(output_shape.data(),
+                                                 output_shape.size()),
+                   TensorType_FLOAT32),
+  }};
+
+  const std::array<int32_t, 2> op_inputs{{0, 1}};
+  const std::array<int32_t, 1> op_outputs{{2}};
+  flatbuffers::Offset<Operator> op = CreateOperator(
+      builder, /*opcode_index=*/0,
+      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+  const std::array<int32_t, 1> subgraph_inputs{{0}};
+  const std::array<int32_t, 1> subgraph_outputs{{2}};
+  flatbuffers::Offset<SubGraph> subgraph = CreateSubGraph(
+      builder, builder.CreateVector(tensors.data(), tensors.size()),
+      builder.CreateVector<int32_t>(subgraph_inputs.data(),
+                                    subgraph_inputs.size()),
+      builder.CreateVector<int32_t>(subgraph_outputs.data(),
+                                    subgraph_outputs.size()),
+      builder.CreateVector(&op, 1));
+
+  flatbuffers::Offset<flatbuffers::String> description =
+      builder.CreateString("Pad model");
+
+  flatbuffers::Offset<Model> model_buffer = CreateModel(
+      builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1),
+      builder.CreateVector(&subgraph, 1), description,
+      builder.CreateVector(buffers.data(), buffers.size()));
+
+  builder.Finish(model_buffer);
+
+  return std::vector<char>(builder.GetBufferPointer(),
+                           builder.GetBufferPointer() + builder.GetSize());
+}
+
+int32_t PadTester::ComputeSize(const std::vector<int32_t>& shape) {
+  return std::accumulate(shape.cbegin(), shape.cend(), 1,
+                         std::multiplies<int32_t>());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/pad_tester.h b/tensorflow/lite/delegates/xnnpack/pad_tester.h
new file mode 100644
index 00000000000..ffcd47e05e9
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/pad_tester.h
@@ -0,0 +1,89 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_
+
+#include <cstdint>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+namespace xnnpack {
+
+class PadTester {
+ public:
+  PadTester() = default;
+  PadTester(const PadTester&) = delete;
+  PadTester& operator=(const PadTester&) = delete;
+
+  inline PadTester& InputShape(std::initializer_list<int32_t> shape) {
+    for (auto it = shape.begin(); it != shape.end(); ++it) {
+      EXPECT_GT(*it, 0);
+    }
+    input_shape_ = std::vector<int32_t>(shape.begin(), shape.end());
+    return *this;
+  }
+
+  inline const std::vector<int32_t>& InputShape() const { return input_shape_; }
+
+  inline PadTester& InputPrePaddings(std::initializer_list<int32_t> paddings) {
+    for (auto it = paddings.begin(); it != paddings.end(); ++it) {
+      EXPECT_GE(*it, 0);
+    }
+    input_pre_paddings_ =
+        std::vector<int32_t>(paddings.begin(), paddings.end());
+    return *this;
+  }
+
+  inline const std::vector<int32_t> InputPrePaddings() const {
+    return input_pre_paddings_;
+  }
+
+  inline PadTester& InputPostPaddings(std::initializer_list<int32_t> paddings) {
+    for (auto it = paddings.begin(); it != paddings.end(); ++it) {
+      EXPECT_GE(*it, 0);
+    }
+    input_post_paddings_ =
+        std::vector<int32_t>(paddings.begin(), paddings.end());
+    return *this;
+  }
+
+  inline const std::vector<int32_t> InputPostPaddings() const {
+    return input_post_paddings_;
+  }
+
+  std::vector<int32_t> OutputShape() const;
+
+  void Test(TfLiteDelegate* delegate) const;
+
+ private:
+  std::vector<char> CreateTfLiteModel() const;
+
+  static int32_t ComputeSize(const std::vector<int32_t>& shape);
+
+  std::vector<int32_t> input_shape_;
+  std::vector<int32_t> input_pre_paddings_;
+  std::vector<int32_t> input_post_paddings_;
+};
+
+}  // namespace xnnpack
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_
diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index 6d9b4dac8f8..2beaa16255d 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 #include <algorithm>
+#include <array>
 #include <cstdint>
 #include <cstring>
 #include <limits>
@@ -120,9 +121,22 @@ class Subgraph {
         return nullptr;
       }
 
-      for (int k = 0; k < node->inputs->size; k++) {
-        const int t = node->inputs->data[k];
-        tensors[t] = t;
+      switch (registration->builtin_code) {
+        case kTfLiteBuiltinPad:
+          // Ignore the second input (static padding), because it is
+          // represented as parameters of the XNNPACK operator rather than
+          // extra input.
+          {
+            const int t = node->inputs->data[0];
+            tensors[t] = t;
+          }
+          break;
+        default:
+          // All other operators: process all inputs
+          for (int k = 0; k < node->inputs->size; k++) {
+            const int t = node->inputs->data[k];
+            tensors[t] = t;
+          }
       }
       for (int k = 0; k < node->outputs->size; k++) {
         const int t = node->outputs->data[k];
@@ -532,10 +546,11 @@ class Subgraph {
     return kTfLiteOk;
   }
 
-  static TfLiteStatus CheckTensorFloatType(TfLiteContext* context,
-                                           const TfLiteTensor& tensor,
-                                           int tensor_index, int node_index) {
-    if (tensor.type != kTfLiteFloat32) {
+  static TfLiteStatus CheckTensorType(TfLiteContext* context,
+                                      const TfLiteTensor& tensor,
+                                      TfLiteType expected_type,
+                                      int tensor_index, int node_index) {
+    if (tensor.type != expected_type) {
       TF_LITE_MAYBE_KERNEL_LOG(
           context, "unsupported type %s in tensor #%d in node #%d",
           TfLiteTypeGetName(tensor.type), tensor_index, node_index);
@@ -544,28 +559,64 @@ class Subgraph {
     return kTfLiteOk;
   }
 
+  static TfLiteStatus CheckTensorFloatType(TfLiteContext* context,
+                                           const TfLiteTensor& tensor,
+                                           int tensor_index, int node_index) {
+    return CheckTensorType(context, tensor, kTfLiteFloat32, tensor_index,
+                           node_index);
+  }
+
   static TfLiteStatus CheckTensorShape(TfLiteContext* context,
                                        const TfLiteTensor& tensor,
-                                       int expected_num_dims,
+                                       int min_num_dims, int max_num_dims,
                                        int tensor_index) {
-    if (tensor.dims->size != expected_num_dims) {
-      TF_LITE_MAYBE_KERNEL_LOG(
-          context,
-          "unexpected number of shape dimensions (%d != %d) in tensor #%d",
-          tensor.dims->size, expected_num_dims, tensor_index);
-      return kTfLiteError;
+    if (min_num_dims == max_num_dims) {
+      if (tensor.dims->size != min_num_dims) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            context,
+            "unsupported number of shape dimensions (%d) in tensor #%d: "
+            "%d dimensions expected",
+            tensor.dims->size, tensor_index, min_num_dims);
+        return kTfLiteError;
+      }
+    } else {
+      if (tensor.dims->size < min_num_dims) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            context,
+            "unsupported number of shape dimensions (%d) in tensor #%d: "
+            "at least %d dimensions expected",
+            tensor.dims->size, tensor_index, min_num_dims);
+        return kTfLiteError;
+      }
+      if (tensor.dims->size > max_num_dims) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            context,
+            "unsupported number of shape dimensions (%d) in tensor #%d: "
+            "at most %d dimensions expected",
+            tensor.dims->size, tensor_index, max_num_dims);
+        return kTfLiteError;
+      }
     }
     for (int i = 0; i < tensor.dims->size; i++) {
       if (tensor.dims->data[i] <= 0) {
         TF_LITE_MAYBE_KERNEL_LOG(context,
-                                 "invalid dimension #%d (%d) in tensor #%d", i,
-                                 tensor.dims->data[i], tensor_index);
+                                 "invalid num of elements (%d) in "
+                                 "dimension #%d in tensor #%d",
+                                 tensor.dims->data[i], i, tensor_index);
         return kTfLiteError;
       }
     }
     return kTfLiteOk;
   }
 
+  static TfLiteStatus CheckTensorShape(TfLiteContext* context,
+                                       const TfLiteTensor& tensor,
+                                       int expected_num_dims,
+                                       int tensor_index) {
+    return CheckTensorShape(context, tensor, expected_num_dims,
+                            expected_num_dims, tensor_index);
+  }
+
   static TfLiteStatus CheckSlopeTensorShape(TfLiteContext* context,
                                             const TfLiteTensor& tensor,
                                             int tensor_index, int node_index) {
@@ -592,6 +643,39 @@ class Subgraph {
     return kTfLiteOk;
   }
 
+  static TfLiteStatus CheckPaddingsTensorShape(TfLiteContext* context,
+                                               const TfLiteTensor& tensor,
+                                               int expected_rows,
+                                               int tensor_index,
+                                               int node_index) {
+    if (tensor.dims->size != 2) {
+      TF_LITE_MAYBE_KERNEL_LOG(context,
+                               "unexpected number of shape dimensions (%d) in "
+                               "padding tensor #%d in node #%d: "
+                               "expected a 2D tensor",
+                               tensor.dims->size, tensor_index, node_index);
+      return kTfLiteError;
+    }
+    if (tensor.dims->data[0] != expected_rows) {
+      TF_LITE_MAYBE_KERNEL_LOG(context,
+                               "unexpected number of rows (%d) in "
+                               "padding tensor #%d in node #%d: "
+                               "%d rows expected",
+                               tensor.dims->size, tensor_index, node_index,
+                               expected_rows);
+      return kTfLiteError;
+    }
+    if (tensor.dims->data[1] != 2) {
+      TF_LITE_MAYBE_KERNEL_LOG(context,
+                               "unexpected number of columns (%d) in "
+                               "padding tensor #%d in node #%d: "
+                               "2 columns expected",
+                               tensor.dims->size, tensor_index, node_index);
+      return kTfLiteError;
+    }
+    return kTfLiteOk;
+  }
+
   static TfLiteStatus CheckTensorNonDynamicAllocation(
       TfLiteContext* context, const TfLiteTensor& tensor, int tensor_index,
       int node_index) {
@@ -693,6 +777,9 @@ class Subgraph {
         return VisitMulNode(subgraph, logging_context, node_index, node,
                             context->tensors, mul_params, xnnpack_tensors);
       }
+      case kTfLiteBuiltinPad:
+        return VisitPadNode(subgraph, logging_context, node_index, node,
+                            context->tensors, xnnpack_tensors);
       case kTfLiteBuiltinPrelu:
         return VisitPreluNode(subgraph, logging_context, node_index, node,
                               context->tensors, xnnpack_tensors);
@@ -1565,6 +1652,86 @@ class Subgraph {
     return kTfLiteOk;
   }
 
+  static TfLiteStatus VisitPadNode(
+      xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index,
+      TfLiteNode* node, const TfLiteTensor* tensors,
+      const std::vector<uint32_t>& xnnpack_tensors) {
+    TF_LITE_ENSURE_STATUS(
+        CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index));
+
+    const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]];
+    TF_LITE_ENSURE_STATUS(CheckTensorFloatType(
+        logging_context, input_tensor, node->inputs->data[0], node_index));
+    TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 1,
+                                           XNN_MAX_TENSOR_DIMS,
+                                           node->inputs->data[0]));
+    TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
+        logging_context, input_tensor, node->inputs->data[0], node_index));
+
+    const TfLiteTensor& paddings_tensor = tensors[node->inputs->data[1]];
+    TF_LITE_ENSURE_STATUS(CheckTensorType(logging_context, paddings_tensor,
+                                          kTfLiteInt32, node->inputs->data[1],
+                                          node_index));
+    TF_LITE_ENSURE_STATUS(CheckPaddingsTensorShape(
+        logging_context, paddings_tensor, input_tensor.dims->size,
+        node->inputs->data[1], node_index));
+    TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation(
+        logging_context, paddings_tensor, node->inputs->data[1], node_index));
+
+    const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]];
+    TF_LITE_ENSURE_STATUS(CheckTensorFloatType(
+        logging_context, output_tensor, node->outputs->data[0], node_index));
+    TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 1,
+                                           XNN_MAX_TENSOR_DIMS,
+                                           node->outputs->data[0]));
+    TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation(
+        logging_context, output_tensor, node->outputs->data[0], node_index));
+
+    const int32_t* paddings_data =
+        reinterpret_cast<const int32_t*>(paddings_tensor.data.data);
+    for (int i = 0; i < paddings_tensor.dims->size; i++) {
+      const int32_t pre_padding = paddings_data[i * 2 + 0];
+      if (pre_padding < 0) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            logging_context,
+            "invalid pre-padding %d for dimension #%d in node %d", pre_padding,
+            i, node_index);
+        return kTfLiteError;
+      }
+
+      const int32_t post_padding = paddings_data[i * 2 + 1];
+      if (post_padding < 0) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            logging_context,
+            "invalid post-padding %d for dimension #%d in node %d", pre_padding,
+            i, node_index);
+        return kTfLiteError;
+      }
+    }
+
+    if (subgraph != nullptr) {
+      std::array<size_t, XNN_MAX_TENSOR_DIMS> pre_paddings{};
+      std::array<size_t, XNN_MAX_TENSOR_DIMS> post_paddings{};
+      for (int i = 0; i < paddings_tensor.dims->data[0]; i++) {
+        pre_paddings[i] = static_cast<size_t>(paddings_data[i * 2 + 0]);
+        post_paddings[i] = static_cast<size_t>(paddings_data[i * 2 + 1]);
+      }
+
+      const xnn_status status = xnn_define_static_constant_pad(
+          subgraph, pre_paddings.data(), post_paddings.data(),
+          /*padding_value=*/0.0f,
+          /*input_id=*/xnnpack_tensors[node->inputs->data[0]],
+          /*output_id=*/xnnpack_tensors[node->outputs->data[0]], /*flags=*/0);
+      if (status != xnn_status_success) {
+        TF_LITE_KERNEL_LOG(logging_context, "failed to delegate PAD node #%d",
+                           node_index);
+        return kTfLiteError;
+      }
+    }
+
+    return kTfLiteOk;
+  }
+
   static TfLiteStatus VisitPreluNode(
       xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index,
       TfLiteNode* node, const TfLiteTensor* tensors,
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b7682468998..d196675b518 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "0440d9ad632945f10992664be84eb0c0c76581f8474df3c124aa30350981126c",
-        strip_prefix = "XNNPACK-d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee",
+        sha256 = "589acbfe90093c690a2817068fadfd7868000509304b5316d5c8d692b605b379",
+        strip_prefix = "XNNPACK-f5c4625a40ee296d47be936ff5e7b0809858627b",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip",
-            "https://github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip",
+            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip",
+            "https://github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip",
         ],
     )