From 772433a2a2120d0aefc6c3628c6254d5a1aaf19d Mon Sep 17 00:00:00 2001
From: YoungSeok Yoon
Date: Fri, 19 Jun 2020 01:58:58 -0700
Subject: [PATCH] Add flag for using optimized TFLite CPU kernels on iOS

This adds new experimental flags to the interpreter options of the TFLite
Obj-C and Swift APIs, which can be used to opt in to a set of highly
optimized floating point kernels provided via the XNNPACK delegate. The
flags can be used as follows.

Obj-C:

    TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
    options.useXNNPACK = YES;
    NSError *error;
    TFLInterpreter *interpreter = [[TFLInterpreter alloc] initWithModelPath:@"model/path"
                                                                    options:options
                                                                      error:&error];

Swift:

    var options = InterpreterOptions()
    options.isXNNPackEnabled = true
    var interpreter = try Interpreter(modelPath: "model/path", options: options)

PiperOrigin-RevId: 317270012
Change-Id: I82aae43c3de13ab08af3c70513e2a458e807b0f1
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  4 ++
 tensorflow/lite/experimental/ios/BUILD.apple  | 18 +++++
 tensorflow/lite/experimental/objc/BUILD.apple |  1 +
 .../objc/TensorFlowLiteObjC-nightly.podspec   |  1 +
 .../objc/TensorFlowLiteObjC.podspec           |  1 +
 .../objc/TensorFlowLiteObjC.podspec.template  |  1 +
 .../objc/apis/TFLInterpreterOptions.h         | 21 ++++++
 .../objc/sources/TFLInterpreter.mm            | 15 +++++
 .../objc/tests/TFLInterpreterOptionsTests.m   |  9 +++
 .../swift/Sources/Interpreter.swift           | 67 ++++++++++++++++---
 .../swift/Tests/InterpreterTests.swift        | 62 +++++++++++------
 11 files changed, 171 insertions(+), 29 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 97e6aea2a6b..eaf7d8f6f03 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -14,6 +14,10 @@ EMSCRIPTEN_LINKOPTS = [
     "-s TOTAL_MEMORY=134217728",
 ]
 
+exports_files([
+    "xnnpack_delegate.h",
+])
+
 cc_library(
     name = "xnnpack_delegate",
     srcs = ["xnnpack_delegate.cc"],
diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple
index 1a85b604f9b..7a40ca7b8e7 100644
--- a/tensorflow/lite/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/experimental/ios/BUILD.apple
@@ -18,10 +18,26 @@ sh_binary(
     ],
 )
 
+# When the static framework is built with bazel, all the header files are moved
+# to the "Headers" directory with no header path prefixes. This auxiliary rule
+# is used to strip the path prefix from the "common.h" include in the
+# "xnnpack_delegate.h" header.
+genrule( + name = "strip_xnnpack_include_hdr", + srcs = ["//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h"], + outs = ["xnnpack_delegate.h"], + cmd = """ + sed 's|#include ".*common.h"|#include "common.h"|'\ + "$(location //tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h)"\ + > "$@" + """, +) + # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework tflite_ios_static_framework( name = "TensorFlowLiteC_framework", hdrs = [ + ":xnnpack_delegate.h", "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", ], @@ -105,6 +121,7 @@ cc_library( hdrs = [ "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h", ], tags = [ "nobuilder", @@ -112,6 +129,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:c_api", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", ], ) diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index 09d4547813a..d26d90c46a1 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -64,6 +64,7 @@ objc_library( visibility = ios_visibility_whitelist(), deps = [ "//tensorflow/lite/c:c_api", + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", ], alwayslink = 1, ) diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec index e039fb57114..eed0f087f44 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', "~> #{s.version}" diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec index c673cfad759..5817619a58f 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', "#{s.version}" diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template index fc9e10e4a2c..4ab5753e016 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template @@ -26,6 +26,7 @@ Pod::Spec.new do |s| objc_dir + '{apis,sources}/*.{h,m,mm}', tfl_dir + 'c/c_api.h', tfl_dir + 'c/common.h', + tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h', ] s.module_map = objc_dir + 'apis/framework.modulemap' s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly' diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h index 6461fbf0178..d7dbb2bd970 100644 --- a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h +++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h @@ -25,6 +25,27 @@ NS_ASSUME_NONNULL_BEGIN */ 
 @property(nonatomic) NSUInteger numberOfThreads;
 
+/**
+ * Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK).
+ *
+ * When enabled, the interpreter uses a new, highly optimized set of CPU kernels provided via the
+ * XNNPACK delegate. Currently, this is restricted to a subset of floating point operations.
+ * Eventually, we plan to enable this by default, as it can provide significant performance benefits
+ * for many classes of floating point models. See
+ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+ * for more details.
+ *
+ * Things to keep in mind when enabling this flag:
+ *
+ * * Startup time and resize time may increase.
+ * * Baseline memory consumption may increase.
+ * * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+ * * Quantized models will not see any benefit.
+ *
+ * WARNING: This is an experimental interface that is subject to change.
+ */
+@property(nonatomic) BOOL useXNNPACK;
+
 /**
  * Initializes a new instance of `TFLInterpreterOptions`.
  *
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
index 94031ee5428..34dd119885d 100644
--- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -23,6 +23,7 @@
 #import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
 
 #include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -45,6 +46,9 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 /** TfLiteInterpreter backed by C API. */
 @property(nonatomic, nullable) TfLiteInterpreter *interpreter;
 
+/** TfLiteDelegate backed by C API. */
+@property(nonatomic, nullable) TfLiteDelegate *xnnpack_delegate;
+
 @end
 
 @implementation TFLInterpreter
@@ -53,6 +57,7 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 
 - (void)dealloc {
   TfLiteInterpreterDelete(_interpreter);
+  TfLiteXNNPackDelegateDelete(_xnnpack_delegate);
 }
 
 #pragma mark - Public
@@ -104,6 +109,16 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
     }
     TfLiteInterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
 
+    if (options.useXNNPACK) {
+      TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
+      if (options.numberOfThreads > 0) {
+        xnnpack_options.num_threads = (int32_t)options.numberOfThreads;
+      }
+
+      _xnnpack_delegate = TfLiteXNNPackDelegateCreate(&xnnpack_options);
+      TfLiteInterpreterOptionsAddDelegate(cOptions, _xnnpack_delegate);
+    }
+
     _interpreter = TfLiteInterpreterCreate(model, cOptions);
     if (_interpreter == nullptr) {
       [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
index 00b800d6af9..286cba98b49 100644
--- a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -32,6 +32,7 @@ NS_ASSUME_NONNULL_BEGIN
   TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
   XCTAssertNotNil(options);
   XCTAssertEqual(options.numberOfThreads, 0);
+  XCTAssertFalse(options.useXNNPACK);
 }
 
 - (void)testSetNumberOfThread {
@@ -44,6 +45,14 @@ NS_ASSUME_NONNULL_BEGIN
   XCTAssertEqual(options.numberOfThreads, 3);
 }
 
+- (void)testUseXNNPACK {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.useXNNPACK = YES;
+  XCTAssertTrue(options.useXNNPACK);
+  options.useXNNPACK = NO;
+  XCTAssertFalse(options.useXNNPACK);
+}
+
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
index b83c36c4e1d..3567822208d 100644
--- a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -39,6 +39,9 @@ public final class Interpreter {
   /// The underlying `TfLiteInterpreter` C pointer.
   private var cInterpreter: CInterpreter?
 
+  /// The underlying `TfLiteDelegate` C pointer for the XNNPACK delegate.
+  private var cXNNPackDelegate: Delegate.CDelegate?
+
   /// Creates a new instance with the given values.
   ///
   /// - Parameters:
@@ -78,6 +81,14 @@ public final class Interpreter {
       )
     }
     delegates?.forEach { TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, $0.cDelegate) }
+
+    // Configure the XNNPack delegate after any other delegates explicitly added by the user.
+    options.map {
+      if $0.isXNNPackEnabled {
+        configureXNNPack(options: $0, cInterpreterOptions: cInterpreterOptions)
+      }
+    }
+
     guard let cInterpreter = TfLiteInterpreterCreate(model.cModel, cInterpreterOptions) else {
       throw InterpreterError.failedToCreateInterpreter
     }
@@ -86,6 +97,7 @@ public final class Interpreter {
 
   deinit {
     TfLiteInterpreterDelete(cInterpreter)
+    TfLiteXNNPackDelegateDelete(cXNNPackDelegate)
   }
 
   /// Invokes the interpreter to perform inference from the loaded graph.
@@ -201,12 +213,13 @@ public final class Interpreter {
     guard case 0...maxIndex = index else {
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
-    guard TfLiteInterpreterResizeInputTensor(
-      cInterpreter,
-      Int32(index),
-      shape.int32Dimensions,
-      Int32(shape.rank)
-    ) == kTfLiteOk
+    guard
+      TfLiteInterpreterResizeInputTensor(
+        cInterpreter,
+        Int32(index),
+        shape.int32Dimensions,
+        Int32(shape.rank)
+      ) == kTfLiteOk
     else {
       throw InterpreterError.failedToResizeInputTensor(index: index)
     }
@@ -236,11 +249,11 @@ public final class Interpreter {
     }
 
     #if swift(>=5.0)
-    let status = data.withUnsafeBytes {
-      TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
-    }
+      let status = data.withUnsafeBytes {
+        TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
+      }
     #else
-    let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
+      let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
     #endif // swift(>=5.0)
     guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
     return try input(at: index)
@@ -256,6 +269,18 @@ public final class Interpreter {
       throw InterpreterError.failedToAllocateTensors
     }
   }
+
+  // MARK: - Private
+
+  private func configureXNNPack(options: Options, cInterpreterOptions: OpaquePointer) {
+    var cXNNPackOptions = TfLiteXNNPackDelegateOptionsDefault()
+    if let threadCount = options.threadCount, threadCount > 0 {
+      cXNNPackOptions.num_threads = Int32(threadCount)
+    }
+
+    cXNNPackDelegate = TfLiteXNNPackDelegateCreate(&cXNNPackOptions)
+    TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, cXNNPackDelegate)
+  }
 }
 
 extension Interpreter {
@@ -265,6 +290,28 @@ extension Interpreter {
     /// indicating that the `Interpreter` will decide the number of threads to use.
     public var threadCount: Int? = nil
 
+    /// Indicates whether an optimized set of floating point CPU kernels, provided by XNNPACK, is
+    /// enabled.
+    ///
+    /// - Experiment:
+    ///   When enabled, the interpreter uses a new, highly optimized set of CPU kernels provided
+    ///   via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
+    ///   operations. Eventually, we plan to enable this by default, as it can provide significant
+    ///   performance benefits for many classes of floating point models. See
+    ///   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+    ///   for more details.
+    ///
+    /// - Important:
+    ///   Things to keep in mind when enabling this flag:
+    ///
+    ///   * Startup time and resize time may increase.
+    ///   * Baseline memory consumption may increase.
+    ///   * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+    ///   * Quantized models will not see any benefit.
+    ///
+    /// - Warning: This is an experimental interface that is subject to change.
+    public var isXNNPackEnabled: Bool = false
+
     /// Creates a new instance with the default values.
public init() {} } diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift index 8d0140279af..67d8120df4d 100644 --- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift +++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift @@ -142,10 +142,12 @@ class InterpreterTests: XCTestCase { } func testResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() { - XCTAssertThrowsError(try interpreter.resizeInput( - at: AddModel.invalidIndex, - to: [2, 2, 3] - )) { error in + XCTAssertThrowsError( + try interpreter.resizeInput( + at: AddModel.invalidIndex, + to: [2, 2, 3] + ) + ) { error in let maxIndex = AddModel.inputTensorCount - 1 self.assertEqualErrors( actual: error, @@ -162,10 +164,12 @@ class InterpreterTests: XCTestCase { } func testCopyDataToInputTensorAtIndex_ThrowsInvalidIndex() { - XCTAssertThrowsError(try interpreter.copy( - AddModel.inputData, - toInputAt: AddModel.invalidIndex - )) { error in + XCTAssertThrowsError( + try interpreter.copy( + AddModel.inputData, + toInputAt: AddModel.invalidIndex + ) + ) { error in let maxIndex = AddModel.inputTensorCount - 1 self.assertEqualErrors( actual: error, @@ -178,10 +182,12 @@ class InterpreterTests: XCTestCase { try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape) try interpreter.allocateTensors() let invalidData = Data(count: AddModel.dataCount - 1) - XCTAssertThrowsError(try interpreter.copy( - invalidData, - toInputAt: AddModel.validIndex - )) { error in + XCTAssertThrowsError( + try interpreter.copy( + invalidData, + toInputAt: AddModel.validIndex + ) + ) { error in self.assertEqualErrors( actual: error, expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount) @@ -223,12 +229,20 @@ class InterpreterOptionsTests: XCTestCase { func testInitWithDefaultValues() { let options = Interpreter.Options() XCTAssertNil(options.threadCount) + XCTAssertFalse(options.isXNNPackEnabled) } func testInitWithCustomValues() { var options = Interpreter.Options() + options.threadCount = 2 XCTAssertEqual(options.threadCount, 2) + + options.isXNNPackEnabled = false + XCTAssertFalse(options.isXNNPackEnabled) + + options.isXNNPackEnabled = true + XCTAssertTrue(options.isXNNPackEnabled) } func testEquatable() { @@ -242,6 +256,15 @@ class InterpreterOptionsTests: XCTestCase { options2.threadCount = 3 XCTAssertNotEqual(options1, options2) + + options2.threadCount = 2 + XCTAssertEqual(options1, options2) + + options2.isXNNPackEnabled = true + XCTAssertNotEqual(options1, options2) + + options1.isXNNPackEnabled = true + XCTAssertEqual(options1, options2) } } @@ -326,14 +349,15 @@ extension Array { init?(unsafeData: Data) { guard unsafeData.count % MemoryLayout.stride == 0 else { return nil } #if swift(>=5.0) - self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) } + self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) } #else - self = unsafeData.withUnsafeBytes { - .init(UnsafeBufferPointer( - start: $0, - count: unsafeData.count / MemoryLayout.stride - )) - } + self = unsafeData.withUnsafeBytes { + .init( + UnsafeBufferPointer( + start: $0, + count: unsafeData.count / MemoryLayout.stride + )) + } #endif // swift(>=5.0) } }
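
Usage note: below is a minimal end-to-end sketch of the new Swift flag, not
part of the patch itself. It assumes the `TensorFlowLite` Swift module built
from this change; the model path and thread count are illustrative only. As in
configureXNNPack above, a positive threadCount is forwarded to the delegate's
num_threads.

    import TensorFlowLite

    var options = Interpreter.Options()
    // Also forwarded to the XNNPACK delegate's num_threads when positive.
    options.threadCount = 2
    // Opt in to the experimental XNNPACK CPU kernels (floating point models only).
    options.isXNNPackEnabled = true

    do {
      let interpreter = try Interpreter(modelPath: "model/path", options: options)
      try interpreter.allocateTensors()
      // Copy input data, run inference, and read outputs as usual, e.g.:
      //   try interpreter.copy(inputData, toInputAt: 0)
      //   try interpreter.invoke()
      //   let output = try interpreter.output(at: 0)
    } catch {
      print("Interpreter error: \(error)")
    }

Because the XNNPACK delegate is added after any delegates passed via the
`delegates:` initializer argument, explicitly supplied delegates keep priority
when claiming ops.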