Enable quantized models by default on iOS APIs. Also makes related changes to docs.
PiperOrigin-RevId: 337876402 Change-Id: I7abb19894297cfe2781997b3e3e3ba4074fbf7e4
This commit is contained in:
		
							parent
							
								
									f7f73a1ff8
								
							
						
					
					
						commit
						1dd24b74c2
					
				| @ -47,10 +47,15 @@ typedef struct { | ||||
|   bool allow_precision_loss; | ||||
|   TFLGpuDelegateWaitType wait_type; | ||||
|   // Allows execution of integer quantized models
 | ||||
| // Enabled by default as of b/169350710.
 | ||||
|   bool enable_quantization; | ||||
| } TFLGpuDelegateOptions; | ||||
| 
 | ||||
| // Populates TFLGpuDelegateOptions as follows:
 | ||||
| //   allow_precision_loss = false;
 | ||||
| //   wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
 | ||||
| //   enable_quantization = true;
 | ||||
| TFL_CAPI_EXPORT extern TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault(void); | ||||
| 
 | ||||
| // Creates a new delegate instance that need to be destroyed with
 | ||||
| // `TFLDeleteTfLiteGpuDelegate` when delegate is no longer used by TFLite.
 | ||||
| // When `options` is set to `nullptr`, the following default values are used:
 | ||||
|  | ||||
| @ -177,10 +177,7 @@ class Delegate { | ||||
|     if (options) { | ||||
|       options_ = *options; | ||||
|     } else { | ||||
|       // Default options. | ||||
|       options_.allow_precision_loss = false; | ||||
|       options_.enable_quantization = false; | ||||
|       options_.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive; | ||||
|       options_ = TFLGpuDelegateOptionsDefault(); | ||||
|     } | ||||
|     metal_device_ = MTLCreateSystemDefaultDevice(); | ||||
|     command_queue_ = [metal_device_ newCommandQueue]; | ||||
| @ -732,3 +729,12 @@ bool TFLGpuDelegateSetCommandEncoder( | ||||
|   metal_delegate->SetCommandEncoder(encoder, control_encoder); | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| TFLGpuDelegateOptions TFLGpuDelegateOptionsDefault() { | ||||
|   TFLGpuDelegateOptions options = { | ||||
|       .allow_precision_loss = false, | ||||
|       .wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive, | ||||
|       .enable_quantization = true, | ||||
|   }; | ||||
|   return options; | ||||
| } | ||||
|  | ||||
| @ -57,7 +57,7 @@ typedef NS_ENUM(NSUInteger, TFLMetalDelegateThreadWaitType) { | ||||
| 
 | ||||
| /**
 | ||||
|  * Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default is | ||||
|  * `false`. | ||||
|  * `true`. | ||||
|  */ | ||||
| @property(nonatomic, getter=isQuantizationEnabled) BOOL quantizationEnabled; | ||||
| 
 | ||||
|  | ||||
| @ -29,6 +29,7 @@ NS_ASSUME_NONNULL_BEGIN | ||||
| - (instancetype)init { | ||||
|   self = [super init]; | ||||
|   if (self != nil) { | ||||
|     _quantizationEnabled = true; | ||||
|     _waitType = TFLMetalDelegateThreadWaitTypePassive; | ||||
|   } | ||||
|   return self; | ||||
|  | ||||
| @ -62,8 +62,8 @@ extension MetalDelegate { | ||||
|     public var waitType: ThreadWaitType = .passive | ||||
| 
 | ||||
|     /// Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default | ||||
|     /// is `false`. | ||||
|     public var isQuantizationEnabled = false | ||||
|     /// is `true`. | ||||
|     public var isQuantizationEnabled = true | ||||
| 
 | ||||
|     /// Creates a new instance with the default values. | ||||
|     public init() {} | ||||
|  | ||||
| @ -52,8 +52,7 @@ Pod::Spec.new do |s| | ||||
|     metal.test_spec 'Tests' do |ts| | ||||
|       ts.source_files = swift_dir + 'Tests/{Interpreter,MetalDelegate}Tests.swift' | ||||
|       ts.resources = [ | ||||
|         tfl_dir + 'testdata/add.bin', | ||||
|         tfl_dir + 'testdata/add_quantized.bin', | ||||
|         tfl_dir + 'testdata/multi_add.bin', | ||||
|       ] | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @ -34,18 +34,26 @@ class MetalDelegateTests: XCTestCase { | ||||
|   } | ||||
| 
 | ||||
|   func testInitInterpreterWithDelegate() throws { | ||||
|     // If metal device is not available, skip. | ||||
|     if MTLCreateSystemDefaultDevice() == nil { | ||||
|       return | ||||
|     } | ||||
|     let metalDelegate = MetalDelegate() | ||||
|     let interpreter = try Interpreter(modelPath: AddQuantizedModel.path, delegates: [metalDelegate]) | ||||
|     let interpreter = try Interpreter(modelPath: MultiAddModel.path, delegates: [metalDelegate]) | ||||
|     XCTAssertEqual(interpreter.delegates?.count, 1) | ||||
|     XCTAssertNil(interpreter.options) | ||||
|   } | ||||
| 
 | ||||
|   func testInitInterpreterWithOptionsAndDelegate() throws { | ||||
|     // If metal device is not available, skip. | ||||
|     if MTLCreateSystemDefaultDevice() == nil { | ||||
|       return | ||||
|     } | ||||
|     var options = Interpreter.Options() | ||||
|     options.threadCount = 1 | ||||
|     let metalDelegate = MetalDelegate() | ||||
|     let interpreter = try Interpreter( | ||||
|       modelPath: AddQuantizedModel.path, | ||||
|       modelPath: MultiAddModel.path, | ||||
|       options: options, | ||||
|       delegates: [metalDelegate] | ||||
|     ) | ||||
| @ -91,3 +99,16 @@ class MetalDelegateOptionsTests: XCTestCase { | ||||
|     XCTAssertNotEqual(options1, options2) | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /// Values for the `multi_add.bin` model. | ||||
| enum MultiAddModel { | ||||
|   static let info = (name: "multi_add", extension: "bin") | ||||
| 
 | ||||
|   static var path: String = { | ||||
|     let bundle = Bundle(for: MetalDelegateTests.self) | ||||
|     guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" } | ||||
|     return path | ||||
|   }() | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -12,11 +12,9 @@ resulting in lower latency. In the best scenario, inference on the GPU may now | ||||
| run fast enough for previously not available real-time applications. | ||||
| 
 | ||||
| Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do | ||||
| not require quantization for optimal performance. | ||||
| 
 | ||||
| **NOTE:** The delegate does accept 8-bit quantized models on Android. Support on | ||||
| iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for | ||||
| details. | ||||
| not require quantization for optimal performance. The delegate does accept 8-bit | ||||
| quantized models, but the computation will be performed using floating-point | ||||
| numbers. Refer to the [advanced documentation](gpu_advanced.md) for details. | ||||
| 
 | ||||
| Another benefit with GPU inference is its power efficiency. GPUs carry out the | ||||
| computations in a very efficient and optimized manner, so that they consume less | ||||
|  | ||||
| @ -325,7 +325,6 @@ Android APIs support quantized models by default. To disable, do the following: | ||||
| **C++ API** | ||||
| 
 | ||||
| ```c++ | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); | ||||
| options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE; | ||||
| 
 | ||||
| @ -336,7 +335,6 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; | ||||
| **Java API** | ||||
| 
 | ||||
| ```java | ||||
| // NEW: Prepare GPU delegate with feature turned on. | ||||
| GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false)); | ||||
| 
 | ||||
| Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); | ||||
| @ -344,27 +342,21 @@ Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); | ||||
| 
 | ||||
| #### iOS | ||||
| 
 | ||||
| Support for quantized models on iOS APIs is experimental. To enable, do the | ||||
| following: | ||||
| iOS APIs support quantized models by default. To disable, do the following: | ||||
| 
 | ||||
| **Swift API** | ||||
| 
 | ||||
| ```swift | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| var options = MetalDelegate.Options() | ||||
| options.isQuantizationEnabled = true | ||||
| options.isQuantizationEnabled = false | ||||
| let delegate = MetalDelegate(options: options) | ||||
| ``` | ||||
| 
 | ||||
| **C API (also used for Objective-C)** | ||||
| 
 | ||||
| ```c | ||||
| 
 | ||||
| // THIS: | ||||
| // NEW: Prepare custom options with feature enabled. | ||||
| const TFLGpuDelegateOptions options = { | ||||
|   .enable_quantization = true, | ||||
| }; | ||||
| TFLGpuDelegateOptions options = TFLGpuDelegateOptionsDefault(); | ||||
| options.enable_quantization = false; | ||||
| 
 | ||||
| auto* delegate = TFLGpuDelegateCreate(options); | ||||
| ``` | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user