Enable quantized models by default on iOS APIs. Also makes related changes to docs.
PiperOrigin-RevId: 337876402 Change-Id: I7abb19894297cfe2781997b3e3e3ba4074fbf7e4
This commit is contained in:
		
							parent
							
								
									f7f73a1ff8
								
							
						
					
					
						commit
						1dd24b74c2
					
				| @ -47,10 +47,15 @@ typedef struct { | ||||
|   bool allow_precision_loss; | ||||
|   TFLGpuDelegateWaitType wait_type; | ||||
|   // Allows execution of integer quantized models
 | ||||
| // Enabled by default as of b/169350710.
 | ||||
|   bool enable_quantization; | ||||
| } TFLGpuDelegateOptions; | ||||
| 
 | ||||
| // Populates TFLGpuDelegateOptions as follows:
 | ||||
| //   allow_precision_loss = false;
 | ||||
| //   wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
 | ||||
| //   enable_quantization = true;
 | ||||
| TFL_CAPI_EXPORT extern TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault(void); | ||||
| 
 | ||||
| // Creates a new delegate instance that need to be destroyed with
 | ||||
| // `TFLDeleteTfLiteGpuDelegate` when delegate is no longer used by TFLite.
 | ||||
| // When `options` is set to `nullptr`, the following default values are used:
 | ||||
|  | ||||
| @ -177,10 +177,7 @@ class Delegate { | ||||
|     if (options) { | ||||
|       options_ = *options; | ||||
|     } else { | ||||
|       // Default options. | ||||
|       options_.allow_precision_loss = false; | ||||
|       options_.enable_quantization = false; | ||||
|       options_.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive; | ||||
|       options_ = TFLGpuDelegateOptionsDefault(); | ||||
|     } | ||||
|     metal_device_ = MTLCreateSystemDefaultDevice(); | ||||
|     command_queue_ = [metal_device_ newCommandQueue]; | ||||
| @ -732,3 +729,12 @@ bool TFLGpuDelegateSetCommandEncoder( | ||||
|   metal_delegate->SetCommandEncoder(encoder, control_encoder); | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault() { | ||||
|   TFLGpuDelegateOptions options = { | ||||
|       .allow_precision_loss = false, | ||||
|       .wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive, | ||||
|       .enable_quantization = true, | ||||
|   }; | ||||
|   return options; | ||||
| } | ||||
|  | ||||
| @ -57,7 +57,7 @@ typedef NS_ENUM(NSUInteger, TFLMetalDelegateThreadWaitType) { | ||||
| 
 | ||||
| /**
 | ||||
|  * Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default is | ||||
|  * `false`. | ||||
|  * `true`. | ||||
|  */ | ||||
| @property(nonatomic, getter=isQuantizationEnabled) BOOL quantizationEnabled; | ||||
| 
 | ||||
|  | ||||
| @ -29,6 +29,7 @@ NS_ASSUME_NONNULL_BEGIN | ||||
| - (instancetype)init { | ||||
|   self = [super init]; | ||||
|   if (self != nil) { | ||||
|     _quantizationEnabled = true; | ||||
|     _waitType = TFLMetalDelegateThreadWaitTypePassive; | ||||
|   } | ||||
|   return self; | ||||
|  | ||||
| @ -62,8 +62,8 @@ extension MetalDelegate { | ||||
|     public var waitType: ThreadWaitType = .passive | ||||
| 
 | ||||
|     /// Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default | ||||
|     /// is `false`. | ||||
|     public var isQuantizationEnabled = false | ||||
|     /// is `true`. | ||||
|     public var isQuantizationEnabled = true | ||||
| 
 | ||||
|     /// Creates a new instance with the default values. | ||||
|     public init() {} | ||||
|  | ||||
| @ -52,8 +52,7 @@ Pod::Spec.new do |s| | ||||
|     metal.test_spec 'Tests' do |ts| | ||||
|       ts.source_files = swift_dir + 'Tests/{Interpreter,MetalDelegate}Tests.swift' | ||||
|       ts.resources = [ | ||||
|         tfl_dir + 'testdata/add.bin', | ||||
|         tfl_dir + 'testdata/add_quantized.bin', | ||||
|         tfl_dir + 'testdata/multi_add.bin', | ||||
|       ] | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @ -34,18 +34,26 @@ class MetalDelegateTests: XCTestCase { | ||||
|   } | ||||
| 
 | ||||
|   func testInitInterpreterWithDelegate() throws { | ||||
|     // If metal device is not available, skip. | ||||
|     if MTLCreateSystemDefaultDevice() == nil { | ||||
|       return | ||||
|     } | ||||
|     let metalDelegate = MetalDelegate() | ||||
|     let interpreter = try Interpreter(modelPath: AddQuantizedModel.path, delegates: [metalDelegate]) | ||||
|     let interpreter = try Interpreter(modelPath: MultiAddModel.path, delegates: [metalDelegate]) | ||||
|     XCTAssertEqual(interpreter.delegates?.count, 1) | ||||
|     XCTAssertNil(interpreter.options) | ||||
|   } | ||||
| 
 | ||||
|   func testInitInterpreterWithOptionsAndDelegate() throws { | ||||
|     // If metal device is not available, skip. | ||||
|     if MTLCreateSystemDefaultDevice() == nil { | ||||
|       return | ||||
|     } | ||||
|     var options = Interpreter.Options() | ||||
|     options.threadCount = 1 | ||||
|     let metalDelegate = MetalDelegate() | ||||
|     let interpreter = try Interpreter( | ||||
|       modelPath: AddQuantizedModel.path, | ||||
|       modelPath: MultiAddModel.path, | ||||
|       options: options, | ||||
|       delegates: [metalDelegate] | ||||
|     ) | ||||
| @ -91,3 +99,16 @@ class MetalDelegateOptionsTests: XCTestCase { | ||||
|     XCTAssertNotEqual(options1, options2) | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /// Values for the `multi_add.bin` model. | ||||
| enum MultiAddModel { | ||||
|   static let info = (name: "multi_add", extension: "bin") | ||||
| 
 | ||||
|   static var path: String = { | ||||
|     let bundle = Bundle(for: MetalDelegateTests.self) | ||||
|     guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" } | ||||
|     return path | ||||
|   }() | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -12,11 +12,9 @@ resulting in lower latency. In the best scenario, inference on the GPU may now | ||||
| run fast enough for previously not available real-time applications. | ||||
| 
 | ||||
| Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do | ||||
| not require quantization for optimal performance. | ||||
| 
 | ||||
| **NOTE:** The delegate does accept 8-bit quantized models on Android. Support on | ||||
| iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for | ||||
| details. | ||||
| not require quantization for optimal performance. The delegate does accept 8-bit | ||||
| quantized models, but the computation will be performed using floating-point | ||||
| numbers. Refer to the [advanced documentation](gpu_advanced.md) for details. | ||||
| 
 | ||||
| Another benefit with GPU inference is its power efficiency. GPUs carry out the | ||||
| computations in a very efficient and optimized manner, so that they consume less | ||||
|  | ||||
| @ -325,7 +325,6 @@ Android APIs support quantized models by default. To disable, do the following: | ||||
| **C++ API** | ||||
| 
 | ||||
| ```c++ | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); | ||||
| options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE; | ||||
| 
 | ||||
| @ -336,7 +335,6 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; | ||||
| **Java API** | ||||
| 
 | ||||
| ```java | ||||
| // NEW: Prepare GPU delegate with feature turned on. | ||||
| GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false)); | ||||
| 
 | ||||
| Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); | ||||
| @ -344,27 +342,21 @@ Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); | ||||
| 
 | ||||
| #### iOS | ||||
| 
 | ||||
| Support for quantized models on iOS APIs is experimental. To enable, do the | ||||
| following: | ||||
| iOS APIs support quantized models by default. To disable, do the following: | ||||
| 
 | ||||
| **Swift API** | ||||
| 
 | ||||
| ```swift | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| var options = MetalDelegate.Options() | ||||
| options.isQuantizationEnabled = true | ||||
| options.isQuantizationEnabled = false | ||||
| let delegate = MetalDelegate(options: options) | ||||
| ``` | ||||
| 
 | ||||
| **C API (also used for Objective-C)** | ||||
| 
 | ||||
| ```c | ||||
| 
 | ||||
| // THIS: | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| const TFLGpuDelegateOptions options = { | ||||
|   .enable_quantization = true, | ||||
| }; | ||||
| TFLGpuDelegateOptions options = TFLGpuDelegateOptionsDefault(); | ||||
| options.enable_quantization = false; | ||||
| 
 | ||||
| auto* delegate = TFLGpuDelegateCreate(options); | ||||
| ``` | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user