Add quantization option to Metal delegate Swift API
Also updated outdated GPU delegate documentation. PiperOrigin-RevId: 315469909 Change-Id: I7b524373a397763c886905e83a2e8b75226d9471
This commit is contained in:
parent
c254833717
commit
7c1b0d0a37
|
@ -35,6 +35,7 @@ public final class MetalDelegate: Delegate {
|
|||
var delegateOptions = TFLGpuDelegateOptions()
|
||||
delegateOptions.allow_precision_loss = options.allowsPrecisionLoss
|
||||
delegateOptions.wait_type = options.waitType.cWaitType
|
||||
delegateOptions.enable_quantization = options.isQuantizationEnabled
|
||||
cDelegate = TFLGpuDelegateCreate(&delegateOptions)
|
||||
}
|
||||
|
||||
|
@ -54,6 +55,10 @@ extension MetalDelegate {
|
|||
/// default is `passive`.
|
||||
public var waitType: ThreadWaitType = .passive
|
||||
|
||||
/// Indicates whether the GPU delegate allows execution of an 8-bit quantized model. The default
|
||||
/// is `false`.
|
||||
public var isQuantizationEnabled = false
|
||||
|
||||
/// Creates a new instance with the default values.
|
||||
public init() {}
|
||||
}
|
||||
|
|
|
@ -179,6 +179,28 @@ delegate.close();
|
|||
|
||||
### iOS
|
||||
|
||||
#### Swift
|
||||
|
||||
Initialize TensorFlow Lite interpreter with the GPU delegate.
|
||||
|
||||
```swift
|
||||
import TensorFlowLite
|
||||
|
||||
// Load model ...
|
||||
|
||||
let delegate = MetalDelegate()
|
||||
|
||||
if let interpreter = try Interpreter(modelPath: modelPath,
|
||||
delegates: [delegate]) {
|
||||
// Run inference ...
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### Objective-C
|
||||
|
||||
Note: For Objective-C, the GPU delegate is provided via the C API.
|
||||
|
||||
In your application code, include the GPU delegate header and call the
|
||||
`Interpreter::ModifyGraphWithDelegate` function to register the GPU delegate to
|
||||
the interpreter:
|
||||
|
|
|
@ -126,10 +126,28 @@ bazel build -c opt --config android_arm64 tensorflow/lite/delegates/gpu:gl_deleg
|
|||
bazel build -c opt --config android_arm64 tensorflow/lite/delegates/gpu:libtensorflowlite_gpu_gl.so # for dynamic library
|
||||
```
|
||||
|
||||
### iOS (ObjC++)
|
||||
### iOS (Swift)
|
||||
|
||||
To use TensorFlow Lite on GPU, get the GPU delegate via `NewGpuDelegate()` and
|
||||
then pass it to `Interpreter::ModifyGraphWithDelegate()` (instead of calling
|
||||
Initialize TensorFlow Lite interpreter with the GPU delegate.
|
||||
|
||||
```swift
|
||||
import TensorFlowLite
|
||||
|
||||
let delegate = MetalDelegate()
|
||||
if let interpreter = try Interpreter(modelPath: modelPath,
|
||||
delegates: [delegate]) {
|
||||
|
||||
// Run inference ...
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### iOS (Objective-C)
|
||||
|
||||
Note: For Objective-C, the GPU delegate is provided via the C API.
|
||||
|
||||
To use TensorFlow Lite on GPU, get the GPU delegate via `TFLGpuDelegateCreate()`
|
||||
and then pass it to `Interpreter::ModifyGraphWithDelegate()` (instead of calling
|
||||
`Interpreter::AllocateTensors()`).
|
||||
|
||||
```c++
|
||||
|
@ -142,12 +160,7 @@ InterpreterBuilder(*model, op_resolver)(&interpreter);
|
|||
|
||||
// NEW: Prepare GPU delegate.
|
||||
|
||||
const GpuDelegateOptions options = {
|
||||
.allow_precision_loss = false,
|
||||
.wait_type = kGpuDelegateOptions::WaitType::Passive,
|
||||
};
|
||||
|
||||
auto* delegate = NewGpuDelegate(options);
|
||||
auto* delegate = TFLGpuDelegateCreate(/*default options=*/nullptr);
|
||||
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
|
||||
|
||||
// Run inference.
|
||||
|
@ -156,7 +169,7 @@ if (interpreter->Invoke() != kTfLiteOk) return false;
|
|||
ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
|
||||
|
||||
// Clean up.
|
||||
DeleteGpuDelegate(delegate);
|
||||
TFLGpuDelegateDelete(delegate);
|
||||
```
|
||||
|
||||
Note: When calling `Interpreter::ModifyGraphWithDelegate()` or
|
||||
|
@ -169,7 +182,54 @@ called.
|
|||
|
||||
## Advanced usage
|
||||
|
||||
### Running quantized models (Experimental, Android only)
|
||||
### Delegate Options for iOS
|
||||
|
||||
`TFLGpuDelegateCreate()` accepts a `struct` of options.
|
||||
([C API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/gpu/metal_delegate.h),
|
||||
[Swift API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift))
|
||||
|
||||
Passing `nullptr` (C API) or nothing (Swift API) to the initializer sets the
|
||||
default options (which are explained in the Basic Usage example above).
|
||||
|
||||
**Swift API**
|
||||
|
||||
```swift
|
||||
|
||||
// THIS:
|
||||
var options = MetalDelegate.Options()
|
||||
options.allowsPrecisionLoss = false
|
||||
options.waitType = .passive
|
||||
options.isQuantizationEnabled = false
|
||||
let delegate = MetalDelegate(options: options)
|
||||
|
||||
// IS THE SAME AS THIS:
|
||||
let delegate = MetalDelegate()
|
||||
|
||||
```
|
||||
|
||||
**C API (also used for Objective-C)**
|
||||
|
||||
```c++
|
||||
|
||||
// THIS:
|
||||
const TFLGpuDelegateOptions options = {
|
||||
.allow_precision_loss = false,
|
||||
.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive,
|
||||
.enable_quantization = false,
|
||||
};
|
||||
|
||||
auto* delegate = TFLGpuDelegateCreate(&options);
|
||||
|
||||
// IS THE SAME AS THIS:
|
||||
auto* delegate = TFLGpuDelegateCreate(nullptr);
|
||||
|
||||
```
|
||||
|
||||
While it is convenient to use `nullptr`, we recommend that you explicitly set
|
||||
the options, to avoid any unexpected behavior if default values are changed in
|
||||
the future.
|
||||
|
||||
### Running quantized models (Experimental)
|
||||
|
||||
The GPU delegate already supports
|
||||
[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
|
||||
|
@ -186,6 +246,8 @@ tensors.
|
|||
|
||||
This feature can be enabled using delegate options as follows:
|
||||
|
||||
#### Android
|
||||
|
||||
**C++ API**
|
||||
|
||||
```c++
|
||||
|
@ -206,51 +268,30 @@ GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedMod
|
|||
Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
|
||||
```
|
||||
|
||||
### Delegate Options for iOS
|
||||
#### iOS
|
||||
|
||||
`NewGpuDelegate()` accepts a `struct` of options.
|
||||
**Swift API**
|
||||
|
||||
```c++
|
||||
struct GpuDelegateOptions {
|
||||
// Allows to quantify tensors, downcast values, process in float16 etc.
|
||||
bool allow_precision_loss;
|
||||
|
||||
enum class WaitType {
|
||||
// waitUntilCompleted
|
||||
kPassive,
|
||||
// Minimize latency. It uses active spinning instead of mutex and consumes
|
||||
// additional CPU resources.
|
||||
kActive,
|
||||
// Useful when the output is used with GPU pipeline then or if external
|
||||
// command encoder is set
|
||||
kDoNotWait,
|
||||
};
|
||||
WaitType wait_type;
|
||||
};
|
||||
```swift
|
||||
// NEW: Prepare custom options with feature enabled.
|
||||
var options = MetalDelegate.Options()
|
||||
options.isQuantizationEnabled = true
|
||||
let delegate = MetalDelegate(options: options)
|
||||
```
|
||||
|
||||
Passing `nullptr` into `NewGpuDelegate()` sets the default options (which are
|
||||
explicated in the Basic Usage example above).
|
||||
**C API (also used for Objective-C)**
|
||||
|
||||
```c++
|
||||
```c
|
||||
|
||||
// THIS:
|
||||
const GpuDelegateOptions options = {
|
||||
.allow_precision_loss = false,
|
||||
.wait_type = kGpuDelegateOptions::WaitType::Passive,
|
||||
// NEW: Prepare custom options with feature enabled.
|
||||
const TFLGpuDelegateOptions options = {
|
||||
.enable_quantization = true,
|
||||
};
|
||||
|
||||
auto* delegate = NewGpuDelegate(options);
|
||||
|
||||
// IS THE SAME AS THIS:
|
||||
auto* delegate = NewGpuDelegate(nullptr);
|
||||
|
||||
auto* delegate = TFLGpuDelegateCreate(&options);
|
||||
```
|
||||
|
||||
While it is convenient to use `nullptr`, we recommend that you explicitly set
|
||||
the options, to avoid any unexpected behavior if default values are changed in
|
||||
the future.
|
||||
|
||||
### Input/Output Buffers (iOS only)
|
||||
|
||||
To do computation on the GPU, data must be made available to the GPU. This often
|
||||
|
@ -280,8 +321,13 @@ off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during
|
|||
initialization.
|
||||
|
||||
```c++
|
||||
#include "tensorflow/lite/delegates/gpu/metal_delegate.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal_delegate_internal.h"
|
||||
|
||||
// ...
|
||||
|
||||
// Prepare GPU delegate.
|
||||
auto* delegate = NewGpuDelegate(nullptr);
|
||||
auto* delegate = TFLGpuDelegateCreate(nullptr);
|
||||
interpreter->SetAllowBufferHandleOutput(true); // disable default gpu->cpu copy
|
||||
if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false;
|
||||
if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false;
|
||||
|
|
Loading…
Reference in New Issue