Enable quantized models by default in the GPU delegate's Android APIs, and make related changes to the docs.
PiperOrigin-RevId: 334403600 Change-Id: I4ffddeb457ddfef46171b840b99ecddb90bf2c27
@@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
       .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
       .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
       .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
-      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
+      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
       .max_delegated_partitions = 1,
   };
   return options;
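For reference, a minimal C++ sketch of what this default change means for callers: `TfLiteGpuDelegateOptionsV2Default()` now returns options with `TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT` already set, so float-only behavior requires clearing that bit (or assigning `TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE`, as the updated docs below do). The surrounding calls are illustrative boilerplate, not part of this change.

```c++
#include "tensorflow/lite/delegates/gpu/delegate.h"

// Rely on the new default: quantized-model support is already enabled.
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();

// Opt out, if needed, by clearing only the quantization bit so that any
// other experimental flags remain untouched.
options.experimental_flags &= ~TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;

// Create the delegate from the chosen options (pointer-taking API).
TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
```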
@@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority {
 enum TfLiteGpuExperimentalFlags {
   TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0,
   // Enables inference on quantized models with the delegate.
+  // NOTE: This is enabled in TfLiteGpuDelegateOptionsV2Default.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0,
   // Enforces execution with the provided backend.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1,
@@ -108,6 +109,8 @@ typedef struct {
 // priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
 // priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
 // priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
+// experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT
+// max_delegated_partitions = 1
 TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default();

 // Creates a new delegate instance that need to be destroyed with
@@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable {
      *
      * <p>WARNING: This is an experimental API and subject to change.
      *
-     * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models.
+     * @param quantizedModelsAllowed When {@code true} (default), the GPU may run quantized models.
      */
     public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) {
       this.quantizedModelsAllowed = quantizedModelsAllowed;
@@ -87,7 +87,7 @@ public class GpuDelegate implements Delegate, Closeable {
     }

     boolean precisionLossAllowed = true;
-    boolean quantizedModelsAllowed = false;
+    boolean quantizedModelsAllowed = true;
     int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
   }

@@ -47,6 +47,7 @@ typedef struct {
   bool allow_precision_loss;
   TFLGpuDelegateWaitType wait_type;
   // Allows execution of integer quantized models
+  // TODO(b/169350710): Enable by default.
   bool enable_quantization;
 } TFLGpuDelegateOptions;
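On the Metal (iOS) side, `enable_quantization` stays opt-in for now (see the TODO above). A minimal sketch of turning it on explicitly, assuming the existing `TFLGpuDelegateCreate()`/`TFLGpuDelegateDelete()` entry points, the `TFLGpuDelegateWaitTypePassive` enum value, and the header path shown in the include (all from the Metal delegate header, not introduced by this change):

```c++
#include "tensorflow/lite/delegates/gpu/metal_delegate.h"  // assumed path

// Fill in the three fields shown in the hunk above; quantized inference
// must still be requested explicitly on iOS.
TFLGpuDelegateOptions options;
options.allow_precision_loss = false;               // keep full float precision
options.wait_type = TFLGpuDelegateWaitTypePassive;  // usual synchronization mode
options.enable_quantization = true;                 // opt in (not yet the default, b/169350710)

TfLiteDelegate* delegate = TFLGpuDelegateCreate(&options);
// ... attach the delegate to the interpreter, run inference, then:
TFLGpuDelegateDelete(delegate);
```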
@@ -21,10 +21,11 @@ TensorFlow Lite provides the following delegates for hardware acceleration:

 *   **GPU delegate for cross platform acceleration** - The GPU delegate can be
     used on both Android and iOS. It is optimized to run 32-bit and 16-bit float
-    based models where a GPU is available. For an overview of the GPU delegate,
-    see [TensorFlow Lite on GPU](gpu_advanced.md). For step-by-step tutorials on
-    using the GPU delegate with Android and iOS, see
-    [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
+    based models where a GPU is available. It also supports 8-bit quantized
+    models and provides GPU performance on par with their float versions. For
+    details on the GPU delegate, see [TensorFlow Lite on GPU](gpu_advanced.md).
+    For step-by-step tutorials on using the GPU delegate with Android and iOS,
+    see [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
 *   **NNAPI delegate for newer Android devices** - The NNAPI delegate can be
     used to accelerate models on Android devices with GPU, DSP and / or NPU
     available. It is available in Android 8.1 (API 27+) or higher. For an
@@ -14,6 +14,10 @@ run fast enough for previously not available real-time applications.
 Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
 not require quantization for optimal performance.

+**NOTE:** The delegate does accept 8-bit quantized models on Android. Support on
+iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for
+details.
+
 Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume less
 power and generate less heat than when the same task is run on CPUs.
@@ -285,12 +285,10 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
 the options, to avoid any unexpected behavior if default values are changed in
 the future.

-### Running quantized models (Experimental)
+### Running quantized models on GPU

-The GPU delegate already supports
-[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
-models. There is experimental support on Android and iOS to run 8-bit quantized
-as well. This includes all flavors of quantization, including:
+This section explains how the GPU delegate accelerates 8-bit quantized models.
+This includes all flavors of quantization, including:

 *   Models trained with
     [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
@@ -322,12 +320,14 @@ This feature can be enabled using delegate options as follows:

 #### Android

+Android APIs support quantized models by default. To disable, do the following:
+
 **C++ API**

 ```c++
-// NEW: Prepare custom options with feature enabled.
 TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
-options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;

 auto* delegate = TfLiteGpuDelegateV2Create(&options);
 if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
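The doc snippet in this hunk stops at `ModifyGraphWithDelegate`. For completeness, a sketch of the surrounding delegate lifetime, assuming `interpreter` is the usual `std::unique_ptr<tflite::Interpreter>`: the delegate must outlive the interpreter and is released with `TfLiteGpuDelegateV2Delete`.

```c++
// Sketch of the full lifetime around the snippet above (not part of this change).
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;  // opt out of quantized models

TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

// ... run inference via interpreter->Invoke() ...

interpreter.reset();                  // destroy the interpreter first
TfLiteGpuDelegateV2Delete(delegate);  // then release the delegate
```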
@@ -337,13 +337,16 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

 ```java
-// NEW: Prepare GPU delegate with feature turned on.
-GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));

 Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
 ```

 #### iOS

+Support for quantized models on iOS APIs is experimental. To enable, do the
+following:
+
 **Swift API**

 ```swift
@@ -76,8 +76,8 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate =
-            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+    // Default behavior allows quantized models.
+    try (GpuDelegate delegate = new GpuDelegate();
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
@@ -98,12 +98,13 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate = new GpuDelegate();
+    try (GpuDelegate delegate =
+            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
       interpreter.run(img, output);
-      // Original execution plan remains since default behavior doesn't allow quantized models.
+      // Original execution plan remains since we disabled quantized models.
       assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31);
       assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
       assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});