Enable quantized models by default on iOS APIs. Also makes related changes to docs.

PiperOrigin-RevId: 337876402 Change-Id: I7abb19894297cfe2781997b3e3e3ba4074fbf7e4
2020-10-19 09:50:16 -07:00 · 2020-10-19 09:50:16 -07:00 · 1dd24b74c2
commit 1dd24b74c2
parent f7f73a1ff8
9 changed files with 51 additions and 29 deletions
--- a/tensorflow/lite/delegates/gpu/metal_delegate.h
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.h
@ -47,10 +47,15 @@ typedef struct {
  bool allow_precision_loss;
  TFLGpuDelegateWaitType wait_type;
  // Allows execution of integer quantized models
-  // TODO(b/169350710): Enable by default.
  bool enable_quantization;
 } TFLGpuDelegateOptions;
 // Populates TFLGpuDelegateOptions as follows:
 //   allow_precision_loss = false;
 //   wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
 //   enable_quantization = true;
 TFL_CAPI_EXPORT extern TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault(void);
 // Creates a new delegate instance that needs to be destroyed with
 // `TFLDeleteTfLiteGpuDelegate` when the delegate is no longer used by TFLite.
 // When `options` is set to `nullptr`, the following default values are used:
--- a/tensorflow/lite/delegates/gpu/metal_delegate.mm
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.mm
@ -177,10 +177,7 @@ class Delegate {
    if (options) {
      options_ = *options;
    } else {
-      // Default options.
+      options_ = TFLGpuDelegateOptionsDefault();
-      options_.allow_precision_loss = false;
-      options_.enable_quantization = false;
-      options_.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
    }
    metal_device_ = MTLCreateSystemDefaultDevice();
    command_queue_ = [metal_device_ newCommandQueue];
@ -732,3 +729,12 @@ bool TFLGpuDelegateSetCommandEncoder(
  metal_delegate->SetCommandEncoder(encoder, control_encoder);
  return true;
 }
 // Builds the default GPU delegate options: precision loss disallowed, passive
 // wait type, and execution of integer quantized models enabled.
 TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault() {
  // Value-initialize first so that any field not assigned below is zeroed.
  TFLGpuDelegateOptions defaults = {};
  defaults.allow_precision_loss = false;
  defaults.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
  defaults.enable_quantization = true;
  return defaults;
 }
--- a/tensorflow/lite/experimental/objc/apis/TFLMetalDelegate.h
+++ b/tensorflow/lite/experimental/objc/apis/TFLMetalDelegate.h
@ -57,7 +57,7 @@ typedef NS_ENUM(NSUInteger, TFLMetalDelegateThreadWaitType) {
 /**
 * Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default is
- * `false`.
+ * `true`.
 */
@property(nonatomic, getter=isQuantizationEnabled) BOOL quantizationEnabled;
--- a/tensorflow/lite/experimental/objc/sources/TFLMetalDelegate.m
+++ b/tensorflow/lite/experimental/objc/sources/TFLMetalDelegate.m
@ -29,6 +29,7 @@ NS_ASSUME_NONNULL_BEGIN
 - (instancetype)init {
  self = [super init];
  if (self != nil) {
    _quantizationEnabled = true;
    _waitType = TFLMetalDelegateThreadWaitTypePassive;
  }
  return self;
--- a/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift
+++ b/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift
@ -62,8 +62,8 @@ extension MetalDelegate {
    public var waitType: ThreadWaitType = .passive
    /// Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default
-    /// is `false`.
+    /// is `true`.
-    public var isQuantizationEnabled = false
+    public var isQuantizationEnabled = true
    /// Creates a new instance with the default values.
    public init() {}
--- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template
+++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template
@ -52,8 +52,7 @@ Pod::Spec.new do |s|
    metal.test_spec 'Tests' do |ts|
      ts.source_files = swift_dir + 'Tests/{Interpreter,MetalDelegate}Tests.swift'
      ts.resources = [
-        tfl_dir + 'testdata/add.bin',
+        tfl_dir + 'testdata/multi_add.bin',
        tfl_dir + 'testdata/add_quantized.bin',
      ]
    end
  end
--- a/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift
+++ b/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift
@ -34,18 +34,26 @@ class MetalDelegateTests: XCTestCase {
  }
  func testInitInterpreterWithDelegate() throws {
    // If metal device is not available, skip.
    if MTLCreateSystemDefaultDevice() == nil {
      return
    }
    let metalDelegate = MetalDelegate()
-    let interpreter = try Interpreter(modelPath: AddQuantizedModel.path, delegates: [metalDelegate])
+    let interpreter = try Interpreter(modelPath: MultiAddModel.path, delegates: [metalDelegate])
    XCTAssertEqual(interpreter.delegates?.count, 1)
    XCTAssertNil(interpreter.options)
  }
  func testInitInterpreterWithOptionsAndDelegate() throws {
    // If metal device is not available, skip.
    if MTLCreateSystemDefaultDevice() == nil {
      return
    }
    var options = Interpreter.Options()
    options.threadCount = 1
    let metalDelegate = MetalDelegate()
    let interpreter = try Interpreter(
-      modelPath: AddQuantizedModel.path,
+      modelPath: MultiAddModel.path,
      options: options,
      delegates: [metalDelegate]
    )
@ -91,3 +99,16 @@ class MetalDelegateOptionsTests: XCTestCase {
    XCTAssertNotEqual(options1, options2)
  }
 }
 /// Describes the `multi_add.bin` model resource used by the Metal delegate tests.
 enum MultiAddModel {
  /// Resource name and file extension of the model.
  static let info = (name: "multi_add", extension: "bin")
  /// Path of the model inside the bundle that contains `MetalDelegateTests`, or an empty string
  /// when the resource cannot be found.
  static var path: String = {
    let testBundle = Bundle(for: MetalDelegateTests.self)
    return testBundle.path(forResource: info.name, ofType: info.extension) ?? ""
  }()
 }
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@ -12,11 +12,9 @@ resulting in lower latency. In the best scenario, inference on the GPU may now
 run fast enough for previously not available real-time applications.
 Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
-not require quantization for optimal performance.
+not require quantization for optimal performance. The delegate does accept 8-bit
-
+quantized models, but the calculation will be performed in floating point
-**NOTE:** The delegate does accept 8-bit quantized models on Android. Support on
+numbers. Refer to the [advanced documentation](gpu_advanced.md) for details.
-iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for
-details.
 Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume less
--- a/tensorflow/lite/g3doc/performance/gpu_advanced.md
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@ -325,7 +325,6 @@ Android APIs support quantized models by default. To disable, do the following:
 **C++ API**
 ```c++
 // NEW: Prepare custom options with feature enabled.
 TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
 options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;
@ -336,7 +335,6 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
 **Java API**
 ```java
 // NEW: Prepare GPU delegate with feature turned on.
 GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
 Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
@ -344,27 +342,21 @@ Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
 #### iOS
-Support for quantized models on iOS APIs is experimental. To enable, do the
+iOS APIs support quantized models by default. To disable, do the following:
-following:
 **Swift API**
 ```swift
 // NEW: Prepare custom options with feature enabled.
 var options = MetalDelegate.Options()
-options.isQuantizationEnabled = true
+options.isQuantizationEnabled = false
 let delegate = MetalDelegate(options: options)
 ```
 **C API (also used for Objective-C)**
 ```c
-
+TFLGpuDelegateOptions options = TFLGpuDelegateOptionsDefault();
-// THIS:
+options.enable_quantization = false;
-// NEW: Prepare custom options with feature enabled.
-const TFLGpuDelegateOptions options = {
-  .enable_quantization = true,
-};
 auto* delegate = TFLGpuDelegateCreate(options);
 ```