Add flag for using optimized TFLite CPU kernels on iOS
This adds new experimental flags to the interpreter options of the TFLite Obj-C and Swift APIs, which can be used for opting in to a set of highly optimized floating-point kernels provided via the XNNPACK delegate. The flags can be used as follows.

Obj-C:

    TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
    options.useXNNPACK = YES;
    NSError *error;
    TFLInterpreter *interpreter = [[TFLInterpreter alloc] initWithModelPath:@"model/path"
                                                                    options:options
                                                                      error:&error];

Swift:

    var options = InterpreterOptions()
    options.isXNNPackEnabled = true
    var interpreter = try Interpreter(modelPath: "model/path", options: options)

PiperOrigin-RevId: 317270012
Change-Id: I82aae43c3de13ab08af3c70513e2a458e807b0f1
This commit is contained in:
parent
e51b17f458
commit
772433a2a2
@ -14,6 +14,10 @@ EMSCRIPTEN_LINKOPTS = [
|
||||
"-s TOTAL_MEMORY=134217728",
|
||||
]
|
||||
|
||||
exports_files([
|
||||
"xnnpack_delegate.h",
|
||||
])
|
||||
|
||||
cc_library(
|
||||
name = "xnnpack_delegate",
|
||||
srcs = ["xnnpack_delegate.cc"],
|
||||
|
@ -18,10 +18,26 @@ sh_binary(
|
||||
],
|
||||
)
|
||||
|
||||
# When the static framework is built with bazel, the all header files are moved
|
||||
# to the "Headers" directory with no header path prefixes. This auxiliary rule
|
||||
# is used for stripping the path prefix to the "common.h" file included by the
|
||||
# "xnnpack_delegate.h" header.
|
||||
genrule(
|
||||
name = "strip_xnnpack_include_hdr",
|
||||
srcs = ["//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h"],
|
||||
outs = ["xnnpack_delegate.h"],
|
||||
cmd = """
|
||||
sed 's|#include ".*common.h"|#include "common.h"|'\
|
||||
"$(location //tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h)"\
|
||||
> "$@"
|
||||
""",
|
||||
)
|
||||
|
||||
# bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework
|
||||
tflite_ios_static_framework(
|
||||
name = "TensorFlowLiteC_framework",
|
||||
hdrs = [
|
||||
":xnnpack_delegate.h",
|
||||
"//tensorflow/lite/c:c_api.h",
|
||||
"//tensorflow/lite/c:common.h",
|
||||
],
|
||||
@ -105,6 +121,7 @@ cc_library(
|
||||
hdrs = [
|
||||
"//tensorflow/lite/c:c_api.h",
|
||||
"//tensorflow/lite/c:common.h",
|
||||
"//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h",
|
||||
],
|
||||
tags = [
|
||||
"nobuilder",
|
||||
@ -112,6 +129,7 @@ cc_library(
|
||||
],
|
||||
deps = [
|
||||
"//tensorflow/lite/c:c_api",
|
||||
"//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -64,6 +64,7 @@ objc_library(
|
||||
visibility = ios_visibility_whitelist(),
|
||||
deps = [
|
||||
"//tensorflow/lite/c:c_api",
|
||||
"//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
@ -26,6 +26,7 @@ Pod::Spec.new do |s|
|
||||
objc_dir + '{apis,sources}/*.{h,m,mm}',
|
||||
tfl_dir + 'c/c_api.h',
|
||||
tfl_dir + 'c/common.h',
|
||||
tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
|
||||
]
|
||||
s.module_map = objc_dir + 'apis/framework.modulemap'
|
||||
s.dependency 'TensorFlowLiteC', "~> #{s.version}"
|
||||
|
@ -26,6 +26,7 @@ Pod::Spec.new do |s|
|
||||
objc_dir + '{apis,sources}/*.{h,m,mm}',
|
||||
tfl_dir + 'c/c_api.h',
|
||||
tfl_dir + 'c/common.h',
|
||||
tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
|
||||
]
|
||||
s.module_map = objc_dir + 'apis/framework.modulemap'
|
||||
s.dependency 'TensorFlowLiteC', "#{s.version}"
|
||||
|
@ -26,6 +26,7 @@ Pod::Spec.new do |s|
|
||||
objc_dir + '{apis,sources}/*.{h,m,mm}',
|
||||
tfl_dir + 'c/c_api.h',
|
||||
tfl_dir + 'c/common.h',
|
||||
tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
|
||||
]
|
||||
s.module_map = objc_dir + 'apis/framework.modulemap'
|
||||
s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly'
|
||||
|
@ -25,6 +25,27 @@ NS_ASSUME_NONNULL_BEGIN
|
||||
*/
|
||||
@property(nonatomic) NSUInteger numberOfThreads;
|
||||
|
||||
/**
|
||||
* Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK).
|
||||
*
|
||||
* Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided via the
|
||||
* XNNPACK delegate. Currently, this is restricted to a subset of floating point operations.
|
||||
* Eventually, we plan to enable this by default, as it can provide significant performance benefits
|
||||
* for many classes of floating point models. See
|
||||
* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
|
||||
* for more details.
|
||||
*
|
||||
* Things to keep in mind when enabling this flag:
|
||||
*
|
||||
* * Startup time and resize time may increase.
|
||||
* * Baseline memory consumption may increase.
|
||||
* * Compatibility with other delegates (e.g., GPU) has not been fully validated.
|
||||
* * Quantized models will not see any benefit.
|
||||
*
|
||||
* WARNING: This is an experimental interface that is subject to change.
|
||||
*/
|
||||
@property(nonatomic) BOOL useXNNPACK;
|
||||
|
||||
/**
|
||||
* Initializes a new instance of `TFLInterpreterOptions`.
|
||||
*
|
||||
|
@ -23,6 +23,7 @@
|
||||
#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
|
||||
|
||||
#include "tensorflow/lite/c/c_api.h"
|
||||
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
|
||||
|
||||
NS_ASSUME_NONNULL_BEGIN
|
||||
|
||||
@ -45,6 +46,9 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
|
||||
/** TfLiteInterpreter backed by C API. */
|
||||
@property(nonatomic, nullable) TfLiteInterpreter *interpreter;
|
||||
|
||||
/** TfLiteDelegate backed by C API. */
|
||||
@property(nonatomic, nullable) TfLiteDelegate *xnnpack_delegate;
|
||||
|
||||
@end
|
||||
|
||||
@implementation TFLInterpreter
|
||||
@ -53,6 +57,7 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
|
||||
|
||||
- (void)dealloc {
|
||||
TfLiteInterpreterDelete(_interpreter);
|
||||
TfLiteXNNPackDelegateDelete(_xnnpack_delegate);
|
||||
}
|
||||
|
||||
#pragma mark - Public
|
||||
@ -104,6 +109,16 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
|
||||
}
|
||||
TfLiteInterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
|
||||
|
||||
if (options.useXNNPACK) {
|
||||
TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
|
||||
if (options.numberOfThreads > 0) {
|
||||
xnnpack_options.num_threads = (int32_t)options.numberOfThreads;
|
||||
}
|
||||
|
||||
_xnnpack_delegate = TfLiteXNNPackDelegateCreate(&xnnpack_options);
|
||||
TfLiteInterpreterOptionsAddDelegate(cOptions, _xnnpack_delegate);
|
||||
}
|
||||
|
||||
_interpreter = TfLiteInterpreterCreate(model, cOptions);
|
||||
if (_interpreter == nullptr) {
|
||||
[TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
|
||||
|
@ -32,6 +32,7 @@ NS_ASSUME_NONNULL_BEGIN
|
||||
TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
|
||||
XCTAssertNotNil(options);
|
||||
XCTAssertEqual(options.numberOfThreads, 0);
|
||||
XCTAssertFalse(options.useXNNPACK);
|
||||
}
|
||||
|
||||
- (void)testSetNumberOfThread {
|
||||
@ -44,6 +45,14 @@ NS_ASSUME_NONNULL_BEGIN
|
||||
XCTAssertEqual(options.numberOfThreads, 3);
|
||||
}
|
||||
|
||||
- (void)testUseXNNPACK {
|
||||
TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
|
||||
options.useXNNPACK = YES;
|
||||
XCTAssertTrue(options.useXNNPACK);
|
||||
options.useXNNPACK = NO;
|
||||
XCTAssertFalse(options.useXNNPACK);
|
||||
}
|
||||
|
||||
@end
|
||||
|
||||
NS_ASSUME_NONNULL_END
|
||||
|
@ -39,6 +39,9 @@ public final class Interpreter {
|
||||
/// The underlying `TfLiteInterpreter` C pointer.
|
||||
private var cInterpreter: CInterpreter?
|
||||
|
||||
/// The underlying `TfLiteDelegate` C pointer for XNNPACK delegate.
|
||||
private var cXNNPackDelegate: Delegate.CDelegate?
|
||||
|
||||
/// Creates a new instance with the given values.
|
||||
///
|
||||
/// - Parameters:
|
||||
@ -78,6 +81,14 @@ public final class Interpreter {
|
||||
)
|
||||
}
|
||||
delegates?.forEach { TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, $0.cDelegate) }
|
||||
|
||||
// Configure the XNNPack delegate after the other delegates explicitly added by the user.
|
||||
options.map {
|
||||
if $0.isXNNPackEnabled {
|
||||
configureXNNPack(options: $0, cInterpreterOptions: cInterpreterOptions)
|
||||
}
|
||||
}
|
||||
|
||||
guard let cInterpreter = TfLiteInterpreterCreate(model.cModel, cInterpreterOptions) else {
|
||||
throw InterpreterError.failedToCreateInterpreter
|
||||
}
|
||||
@ -86,6 +97,7 @@ public final class Interpreter {
|
||||
|
||||
deinit {
|
||||
TfLiteInterpreterDelete(cInterpreter)
|
||||
TfLiteXNNPackDelegateDelete(cXNNPackDelegate)
|
||||
}
|
||||
|
||||
/// Invokes the interpreter to perform inference from the loaded graph.
|
||||
@ -201,12 +213,13 @@ public final class Interpreter {
|
||||
guard case 0...maxIndex = index else {
|
||||
throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
|
||||
}
|
||||
guard TfLiteInterpreterResizeInputTensor(
|
||||
cInterpreter,
|
||||
Int32(index),
|
||||
shape.int32Dimensions,
|
||||
Int32(shape.rank)
|
||||
) == kTfLiteOk
|
||||
guard
|
||||
TfLiteInterpreterResizeInputTensor(
|
||||
cInterpreter,
|
||||
Int32(index),
|
||||
shape.int32Dimensions,
|
||||
Int32(shape.rank)
|
||||
) == kTfLiteOk
|
||||
else {
|
||||
throw InterpreterError.failedToResizeInputTensor(index: index)
|
||||
}
|
||||
@ -236,11 +249,11 @@ public final class Interpreter {
|
||||
}
|
||||
|
||||
#if swift(>=5.0)
|
||||
let status = data.withUnsafeBytes {
|
||||
TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
|
||||
}
|
||||
let status = data.withUnsafeBytes {
|
||||
TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
|
||||
}
|
||||
#else
|
||||
let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
|
||||
let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
|
||||
#endif // swift(>=5.0)
|
||||
guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
|
||||
return try input(at: index)
|
||||
@ -256,6 +269,18 @@ public final class Interpreter {
|
||||
throw InterpreterError.failedToAllocateTensors
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Private
|
||||
|
||||
private func configureXNNPack(options: Options, cInterpreterOptions: OpaquePointer) {
|
||||
var cXNNPackOptions = TfLiteXNNPackDelegateOptionsDefault()
|
||||
if let threadCount = options.threadCount, threadCount > 0 {
|
||||
cXNNPackOptions.num_threads = Int32(threadCount)
|
||||
}
|
||||
|
||||
cXNNPackDelegate = TfLiteXNNPackDelegateCreate(&cXNNPackOptions)
|
||||
TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, cXNNPackDelegate)
|
||||
}
|
||||
}
|
||||
|
||||
extension Interpreter {
|
||||
@ -265,6 +290,28 @@ extension Interpreter {
|
||||
/// indicating that the `Interpreter` will decide the number of threads to use.
|
||||
public var threadCount: Int? = nil
|
||||
|
||||
/// Indicates whether an optimized set of floating point CPU kernels, provided by XNNPACK, is
|
||||
/// enabled.
|
||||
///
|
||||
/// - Experiment:
|
||||
/// Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided
|
||||
/// via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
|
||||
/// operations. Eventually, we plan to enable this by default, as it can provide significant
|
||||
/// performance benefits for many classes of floating point models. See
|
||||
/// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
|
||||
/// for more details.
|
||||
///
|
||||
/// - Important:
|
||||
/// Things to keep in mind when enabling this flag:
|
||||
///
|
||||
/// * Startup time and resize time may increase.
|
||||
/// * Baseline memory consumption may increase.
|
||||
/// * Compatibility with other delegates (e.g., GPU) has not been fully validated.
|
||||
/// * Quantized models will not see any benefit.
|
||||
///
|
||||
/// - Warning: This is an experimental interface that is subject to change.
|
||||
public var isXNNPackEnabled: Bool = false
|
||||
|
||||
/// Creates a new instance with the default values.
|
||||
public init() {}
|
||||
}
|
||||
|
@ -142,10 +142,12 @@ class InterpreterTests: XCTestCase {
|
||||
}
|
||||
|
||||
func testResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
|
||||
XCTAssertThrowsError(try interpreter.resizeInput(
|
||||
at: AddModel.invalidIndex,
|
||||
to: [2, 2, 3]
|
||||
)) { error in
|
||||
XCTAssertThrowsError(
|
||||
try interpreter.resizeInput(
|
||||
at: AddModel.invalidIndex,
|
||||
to: [2, 2, 3]
|
||||
)
|
||||
) { error in
|
||||
let maxIndex = AddModel.inputTensorCount - 1
|
||||
self.assertEqualErrors(
|
||||
actual: error,
|
||||
@ -162,10 +164,12 @@ class InterpreterTests: XCTestCase {
|
||||
}
|
||||
|
||||
func testCopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
|
||||
XCTAssertThrowsError(try interpreter.copy(
|
||||
AddModel.inputData,
|
||||
toInputAt: AddModel.invalidIndex
|
||||
)) { error in
|
||||
XCTAssertThrowsError(
|
||||
try interpreter.copy(
|
||||
AddModel.inputData,
|
||||
toInputAt: AddModel.invalidIndex
|
||||
)
|
||||
) { error in
|
||||
let maxIndex = AddModel.inputTensorCount - 1
|
||||
self.assertEqualErrors(
|
||||
actual: error,
|
||||
@ -178,10 +182,12 @@ class InterpreterTests: XCTestCase {
|
||||
try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
|
||||
try interpreter.allocateTensors()
|
||||
let invalidData = Data(count: AddModel.dataCount - 1)
|
||||
XCTAssertThrowsError(try interpreter.copy(
|
||||
invalidData,
|
||||
toInputAt: AddModel.validIndex
|
||||
)) { error in
|
||||
XCTAssertThrowsError(
|
||||
try interpreter.copy(
|
||||
invalidData,
|
||||
toInputAt: AddModel.validIndex
|
||||
)
|
||||
) { error in
|
||||
self.assertEqualErrors(
|
||||
actual: error,
|
||||
expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
|
||||
@ -223,12 +229,20 @@ class InterpreterOptionsTests: XCTestCase {
|
||||
func testInitWithDefaultValues() {
|
||||
let options = Interpreter.Options()
|
||||
XCTAssertNil(options.threadCount)
|
||||
XCTAssertFalse(options.isXNNPackEnabled)
|
||||
}
|
||||
|
||||
func testInitWithCustomValues() {
|
||||
var options = Interpreter.Options()
|
||||
|
||||
options.threadCount = 2
|
||||
XCTAssertEqual(options.threadCount, 2)
|
||||
|
||||
options.isXNNPackEnabled = false
|
||||
XCTAssertFalse(options.isXNNPackEnabled)
|
||||
|
||||
options.isXNNPackEnabled = true
|
||||
XCTAssertTrue(options.isXNNPackEnabled)
|
||||
}
|
||||
|
||||
func testEquatable() {
|
||||
@ -242,6 +256,15 @@ class InterpreterOptionsTests: XCTestCase {
|
||||
|
||||
options2.threadCount = 3
|
||||
XCTAssertNotEqual(options1, options2)
|
||||
|
||||
options2.threadCount = 2
|
||||
XCTAssertEqual(options1, options2)
|
||||
|
||||
options2.isXNNPackEnabled = true
|
||||
XCTAssertNotEqual(options1, options2)
|
||||
|
||||
options1.isXNNPackEnabled = true
|
||||
XCTAssertEqual(options1, options2)
|
||||
}
|
||||
}
|
||||
|
||||
@ -326,14 +349,15 @@ extension Array {
|
||||
init?(unsafeData: Data) {
|
||||
guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
|
||||
#if swift(>=5.0)
|
||||
self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
|
||||
self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
|
||||
#else
|
||||
self = unsafeData.withUnsafeBytes {
|
||||
.init(UnsafeBufferPointer<Element>(
|
||||
start: $0,
|
||||
count: unsafeData.count / MemoryLayout<Element>.stride
|
||||
))
|
||||
}
|
||||
self = unsafeData.withUnsafeBytes {
|
||||
.init(
|
||||
UnsafeBufferPointer<Element>(
|
||||
start: $0,
|
||||
count: unsafeData.count / MemoryLayout<Element>.stride
|
||||
))
|
||||
}
|
||||
#endif // swift(>=5.0)
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user