From 772433a2a2120d0aefc6c3628c6254d5a1aaf19d Mon Sep 17 00:00:00 2001
From: YoungSeok Yoon
Date: Fri, 19 Jun 2020 01:58:58 -0700
Subject: [PATCH] Add flag for using optimized TFLite CPU kernels on iOS

This adds new experimental flags to the interpreter options of the TFLite
Obj-C and Swift APIs, which can be used to opt in to a set of highly
optimized floating point kernels provided via the XNNPACK delegate. The
flags can be used as follows.

Obj-C:

    TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
    options.useXNNPACK = YES;
    NSError *error;
    TFLInterpreter *interpreter = [[TFLInterpreter alloc] initWithModelPath:@"model/path"
                                                                    options:options
                                                                      error:&error];

Swift:

    var options = InterpreterOptions()
    options.isXNNPackEnabled = true
    var interpreter = try Interpreter(modelPath: "model/path", options: options)

PiperOrigin-RevId: 317270012
Change-Id: I82aae43c3de13ab08af3c70513e2a458e807b0f1
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  4 ++
 tensorflow/lite/experimental/ios/BUILD.apple  | 18 +++++
 tensorflow/lite/experimental/objc/BUILD.apple |  1 +
 .../objc/TensorFlowLiteObjC-nightly.podspec   |  1 +
 .../objc/TensorFlowLiteObjC.podspec           |  1 +
 .../objc/TensorFlowLiteObjC.podspec.template  |  1 +
 .../objc/apis/TFLInterpreterOptions.h         | 21 ++++++
 .../objc/sources/TFLInterpreter.mm            | 15 +++++
 .../objc/tests/TFLInterpreterOptionsTests.m   |  9 +++
 .../swift/Sources/Interpreter.swift           | 67 ++++++++++++++++---
 .../swift/Tests/InterpreterTests.swift        | 62 +++++++++++------
 11 files changed, 171 insertions(+), 29 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 97e6aea2a6b..eaf7d8f6f03 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -14,6 +14,10 @@ EMSCRIPTEN_LINKOPTS = [
     "-s TOTAL_MEMORY=134217728",
 ]
 
+exports_files([
+    "xnnpack_delegate.h",
+])
+
 cc_library(
     name = "xnnpack_delegate",
     srcs = ["xnnpack_delegate.cc"],
diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple
index 1a85b604f9b..7a40ca7b8e7 100644
--- a/tensorflow/lite/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/experimental/ios/BUILD.apple
@@ -18,10 +18,26 @@ sh_binary(
     ],
 )
 
+# When the static framework is built with bazel, all the header files are moved
+# to the "Headers" directory with no header path prefixes. This auxiliary rule
+# is used to strip the path prefix from the "common.h" include in the
+# "xnnpack_delegate.h" header.
+genrule( + name = "strip_xnnpack_include_hdr", + srcs = ["//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h"], + outs = ["xnnpack_delegate.h"], + cmd = """ + sed 's|#include ".*common.h"|#include "common.h"|'\ + "$(location //tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h)"\ + > "$@" + """, +) + # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework tflite_ios_static_framework( name = "TensorFlowLiteC_framework", hdrs = [ + ":xnnpack_delegate.h", "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", ], @@ -105,6 +121,7 @@ cc_library( hdrs = [ "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h", ], tags = [ "nobuilder", @@ -112,6 +129,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:c_api", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", ], ) diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index 09d4547813a..d26d90c46a1 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -64,6 +64,7 @@ objc_library( visibility = ios_visibility_whitelist(), deps = [ "//tensorflow/lite/c:c_api", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", ], alwayslink = 1, ) diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec index e039fb57114..eed0f087f44 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', "~> #{s.version}" diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec index c673cfad759..5817619a58f 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', "#{s.version}" diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template index fc9e10e4a2c..4ab5753e016 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly' diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h index 6461fbf0178..d7dbb2bd970 100644 --- a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h +++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h @@ -25,6 +25,27 @@ NS_ASSUME_NONNULL_BEGIN */ 
 @property(nonatomic) NSUInteger numberOfThreads;
 
+/**
+ * Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK).
+ *
+ * When enabled, the interpreter uses a new, highly optimized set of CPU kernels provided via the
+ * XNNPACK delegate. Currently, this is restricted to a subset of floating point operations.
+ * Eventually, we plan to enable this by default, as it can provide significant performance benefits
+ * for many classes of floating point models. See
+ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+ * for more details.
+ *
+ * Things to keep in mind when enabling this flag:
+ *
+ * * Startup time and resize time may increase.
+ * * Baseline memory consumption may increase.
+ * * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+ * * Quantized models will not see any benefit.
+ *
+ * WARNING: This is an experimental interface that is subject to change.
+ */
+@property(nonatomic) BOOL useXNNPACK;
+
 /**
  * Initializes a new instance of `TFLInterpreterOptions`.
  *
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
index 94031ee5428..34dd119885d 100644
--- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -23,6 +23,7 @@
 #import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
 
 #include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -45,6 +46,9 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 /** TfLiteInterpreter backed by C API. */
 @property(nonatomic, nullable) TfLiteInterpreter *interpreter;
 
+/** TfLiteDelegate backed by C API. */
+@property(nonatomic, nullable) TfLiteDelegate *xnnpack_delegate;
+
 @end
 
 @implementation TFLInterpreter
@@ -53,6 +57,7 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 
 - (void)dealloc {
   TfLiteInterpreterDelete(_interpreter);
+  TfLiteXNNPackDelegateDelete(_xnnpack_delegate);
 }
 
 #pragma mark - Public
@@ -104,6 +109,16 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
     }
     TfLiteInterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
 
+    if (options.useXNNPACK) {
+      TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
+      if (options.numberOfThreads > 0) {
+        xnnpack_options.num_threads = (int32_t)options.numberOfThreads;
+      }
+
+      _xnnpack_delegate = TfLiteXNNPackDelegateCreate(&xnnpack_options);
+      TfLiteInterpreterOptionsAddDelegate(cOptions, _xnnpack_delegate);
+    }
+
     _interpreter = TfLiteInterpreterCreate(model, cOptions);
     if (_interpreter == nullptr) {
       [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
index 00b800d6af9..286cba98b49 100644
--- a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -32,6 +32,7 @@ NS_ASSUME_NONNULL_BEGIN
   TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
   XCTAssertNotNil(options);
   XCTAssertEqual(options.numberOfThreads, 0);
+  XCTAssertFalse(options.useXNNPACK);
 }
 
 - (void)testSetNumberOfThread {
@@ -44,6 +45,14 @@ NS_ASSUME_NONNULL_BEGIN
   XCTAssertEqual(options.numberOfThreads, 3);
 }
 
+- (void)testUseXNNPACK {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.useXNNPACK = YES;
+  XCTAssertTrue(options.useXNNPACK);
+  options.useXNNPACK = NO;
+  XCTAssertFalse(options.useXNNPACK);
+}
+
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
index b83c36c4e1d..3567822208d 100644
--- a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -39,6 +39,9 @@ public final class Interpreter {
   /// The underlying `TfLiteInterpreter` C pointer.
   private var cInterpreter: CInterpreter?
 
+  /// The underlying `TfLiteDelegate` C pointer for the XNNPACK delegate.
+  private var cXNNPackDelegate: Delegate.CDelegate?
+
   /// Creates a new instance with the given values.
   ///
   /// - Parameters:
@@ -78,6 +81,14 @@ public final class Interpreter {
       )
     }
     delegates?.forEach { TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, $0.cDelegate) }
+
+    // Configure the XNNPack delegate after any other delegates explicitly added by the user.
+    options.map {
+      if $0.isXNNPackEnabled {
+        configureXNNPack(options: $0, cInterpreterOptions: cInterpreterOptions)
+      }
+    }
+
     guard let cInterpreter = TfLiteInterpreterCreate(model.cModel, cInterpreterOptions) else {
       throw InterpreterError.failedToCreateInterpreter
     }
@@ -86,6 +97,7 @@ public final class Interpreter {
 
   deinit {
     TfLiteInterpreterDelete(cInterpreter)
+    TfLiteXNNPackDelegateDelete(cXNNPackDelegate)
   }
 
   /// Invokes the interpreter to perform inference from the loaded graph.
@@ -201,12 +213,13 @@ public final class Interpreter {
     guard case 0...maxIndex = index else {
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
-    guard TfLiteInterpreterResizeInputTensor(
-      cInterpreter,
-      Int32(index),
-      shape.int32Dimensions,
-      Int32(shape.rank)
-    ) == kTfLiteOk
+    guard
+      TfLiteInterpreterResizeInputTensor(
+        cInterpreter,
+        Int32(index),
+        shape.int32Dimensions,
+        Int32(shape.rank)
+      ) == kTfLiteOk
     else {
       throw InterpreterError.failedToResizeInputTensor(index: index)
     }
@@ -236,11 +249,11 @@ public final class Interpreter {
     }
 
     #if swift(>=5.0)
-    let status = data.withUnsafeBytes {
-      TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
-    }
+      let status = data.withUnsafeBytes {
+        TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
+      }
     #else
-    let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
+      let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
     #endif // swift(>=5.0)
     guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
     return try input(at: index)
@@ -256,6 +269,18 @@ public final class Interpreter {
       throw InterpreterError.failedToAllocateTensors
     }
   }
+
+  // MARK: - Private
+
+  private func configureXNNPack(options: Options, cInterpreterOptions: OpaquePointer) {
+    var cXNNPackOptions = TfLiteXNNPackDelegateOptionsDefault()
+    if let threadCount = options.threadCount, threadCount > 0 {
+      cXNNPackOptions.num_threads = Int32(threadCount)
+    }
+
+    cXNNPackDelegate = TfLiteXNNPackDelegateCreate(&cXNNPackOptions)
+    TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, cXNNPackDelegate)
+  }
 }
 
 extension Interpreter {
@@ -265,6 +290,28 @@ extension Interpreter {
     /// indicating that the `Interpreter` will decide the number of threads to use.
     public var threadCount: Int? = nil
 
+    /// Indicates whether an optimized set of floating point CPU kernels, provided by XNNPACK, is
+    /// enabled.
+    ///
+    /// - Experiment:
+    ///   When enabled, the interpreter uses a new, highly optimized set of CPU kernels provided
+    ///   via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
+    ///   operations. Eventually, we plan to enable this by default, as it can provide significant
+    ///   performance benefits for many classes of floating point models. See
+    ///   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+    ///   for more details.
+    ///
+    /// - Important:
+    ///   Things to keep in mind when enabling this flag:
+    ///
+    ///   * Startup time and resize time may increase.
+    ///   * Baseline memory consumption may increase.
+    ///   * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+    ///   * Quantized models will not see any benefit.
+    ///
+    /// - Warning: This is an experimental interface that is subject to change.
+    public var isXNNPackEnabled: Bool = false
+
     /// Creates a new instance with the default values.
public init() {} } diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift index 8d0140279af..67d8120df4d 100644 --- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift +++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift @@ -142,10 +142,12 @@ class InterpreterTests: XCTestCase { } func testResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() { - XCTAssertThrowsError(try interpreter.resizeInput( - at: AddModel.invalidIndex, - to: [2, 2, 3] - )) { error in + XCTAssertThrowsError( + try interpreter.resizeInput( + at: AddModel.invalidIndex, + to: [2, 2, 3] + ) + ) { error in let maxIndex = AddModel.inputTensorCount - 1 self.assertEqualErrors( actual: error, @@ -162,10 +164,12 @@ class InterpreterTests: XCTestCase { } func testCopyDataToInputTensorAtIndex_ThrowsInvalidIndex() { - XCTAssertThrowsError(try interpreter.copy( - AddModel.inputData, - toInputAt: AddModel.invalidIndex - )) { error in + XCTAssertThrowsError( + try interpreter.copy( + AddModel.inputData, + toInputAt: AddModel.invalidIndex + ) + ) { error in let maxIndex = AddModel.inputTensorCount - 1 self.assertEqualErrors( actual: error, @@ -178,10 +182,12 @@ class InterpreterTests: XCTestCase { try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape) try interpreter.allocateTensors() let invalidData = Data(count: AddModel.dataCount - 1) - XCTAssertThrowsError(try interpreter.copy( - invalidData, - toInputAt: AddModel.validIndex - )) { error in + XCTAssertThrowsError( + try interpreter.copy( + invalidData, + toInputAt: AddModel.validIndex + ) + ) { error in self.assertEqualErrors( actual: error, expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount) @@ -223,12 +229,20 @@ class InterpreterOptionsTests: XCTestCase { func testInitWithDefaultValues() { let options = Interpreter.Options() XCTAssertNil(options.threadCount) + XCTAssertFalse(options.isXNNPackEnabled) } func testInitWithCustomValues() { var options = Interpreter.Options() + options.threadCount = 2 XCTAssertEqual(options.threadCount, 2) + + options.isXNNPackEnabled = false + XCTAssertFalse(options.isXNNPackEnabled) + + options.isXNNPackEnabled = true + XCTAssertTrue(options.isXNNPackEnabled) } func testEquatable() { @@ -242,6 +256,15 @@ class InterpreterOptionsTests: XCTestCase { options2.threadCount = 3 XCTAssertNotEqual(options1, options2) + + options2.threadCount = 2 + XCTAssertEqual(options1, options2) + + options2.isXNNPackEnabled = true + XCTAssertNotEqual(options1, options2) + + options1.isXNNPackEnabled = true + XCTAssertEqual(options1, options2) } } @@ -326,14 +349,15 @@ extension Array { init?(unsafeData: Data) { guard unsafeData.count % MemoryLayout.stride == 0 else { return nil } #if swift(>=5.0) - self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) } + self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) } #else - self = unsafeData.withUnsafeBytes { - .init(UnsafeBufferPointer( - start: $0, - count: unsafeData.count / MemoryLayout.stride - )) - } + self = unsafeData.withUnsafeBytes { + .init( + UnsafeBufferPointer( + start: $0, + count: unsafeData.count / MemoryLayout.stride + )) + } #endif // swift(>=5.0) } }
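
Usage note: below is a minimal end-to-end sketch of the new Swift flag, not
part of the patch itself. It assumes the `TensorFlowLite` Swift module built
from this change; the model path and thread count are illustrative only. As in
configureXNNPack above, a positive threadCount is forwarded to the delegate's
num_threads.

    import TensorFlowLite

    var options = Interpreter.Options()
    // Also forwarded to the XNNPACK delegate's num_threads when positive.
    options.threadCount = 2
    // Opt in to the experimental XNNPACK CPU kernels (floating point models only).
    options.isXNNPackEnabled = true

    do {
      let interpreter = try Interpreter(modelPath: "model/path", options: options)
      try interpreter.allocateTensors()
      // Copy input data, run inference, and read outputs as usual, e.g.:
      //   try interpreter.copy(inputData, toInputAt: 0)
      //   try interpreter.invoke()
      //   let output = try interpreter.output(at: 0)
    } catch {
      print("Interpreter error: \(error)")
    }

Because the XNNPACK delegate is added after any delegates passed via the
`delegates:` initializer argument, explicitly supplied delegates keep priority
when claiming ops.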