Enable quantized models by default on Android APIs. Also makes related changes to docs.

PiperOrigin-RevId: 334403600
Change-Id: I4ffddeb457ddfef46171b840b99ecddb90bf2c27
Sachin Joglekar authored on 2020-09-29 10:04:38 -07:00; committed by TensorFlower Gardener
parent 07dd48763f
commit 4e68fdecae
8 changed files with 31 additions and 18 deletions
tensorflow/lite/delegates/gpu/delegate.cc
tensorflow/lite/delegates/gpu/delegate.h
tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
tensorflow/lite/delegates/gpu/metal_delegate.h
tensorflow/lite/g3doc/performance/delegates.md
tensorflow/lite/g3doc/performance/gpu.md
tensorflow/lite/g3doc/performance/gpu_advanced.md
tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java

tensorflow/lite/delegates/gpu/delegate.cc

@@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
- .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
+ .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
.max_delegated_partitions = 1,
};
return options;
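With this change, the default options already include ENABLE_QUANT, so opting out now means clearing the bit. A minimal sketch, not part of this commit; the helper name and include path are assumptions, while the option type, flag, and create/delete calls are the API shown in this diff:

```c++
#include "tensorflow/lite/delegates/gpu/delegate.h"

// Hypothetical helper: build a GPU delegate, optionally restoring the old
// (quantization-off) behavior now that ENABLE_QUANT is set by default.
TfLiteDelegate* MakeGpuDelegate(bool allow_quant) {
  TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
  if (!allow_quant) {
    // Clear the bit that the new default sets.
    options.experimental_flags &= ~TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
  }
  return TfLiteGpuDelegateV2Create(&options);  // Destroy with TfLiteGpuDelegateV2Delete.
}
```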

tensorflow/lite/delegates/gpu/delegate.h

@@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority {
enum TfLiteGpuExperimentalFlags {
TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0,
// Enables inference on quantized models with the delegate.
+ // NOTE: This is enabled in TfLiteGpuDelegateOptionsV2Default.
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0,
// Enforces execution with the provided backend.
TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1,
@@ -108,6 +109,8 @@ typedef struct {
// priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
// priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
// priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
+ // experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT
+ // max_delegated_partitions = 1
TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default();
// Creates a new delegate instance that needs to be destroyed with

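Since `experimental_flags` is a bitmask, the flag values above compose with bitwise OR. A brief sketch, not from this commit, assuming a caller who wants to keep the new quantization default while forcing the OpenCL backend; the function name is hypothetical:

```c++
#include "tensorflow/lite/delegates/gpu/delegate.h"

TfLiteGpuDelegateOptionsV2 ClOnlyQuantOptions() {
  TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
  // The default already contains ENABLE_QUANT (bit 0); OR in CL_ONLY (bit 1).
  options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY;
  return options;
}
```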
tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java

@@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable {
*
* <p>WARNING: This is an experimental API and subject to change.
*
- * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models.
+ * @param quantizedModelsAllowed When {@code true} (default), the GPU may run quantized models.
*/
public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) {
this.quantizedModelsAllowed = quantizedModelsAllowed;
@@ -87,7 +87,7 @@ public class GpuDelegate implements Delegate, Closeable {
}
boolean precisionLossAllowed = true;
- boolean quantizedModelsAllowed = false;
+ boolean quantizedModelsAllowed = true;
int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
}

tensorflow/lite/delegates/gpu/metal_delegate.h

@@ -47,6 +47,7 @@ typedef struct {
bool allow_precision_loss;
TFLGpuDelegateWaitType wait_type;
// Allows execution of integer quantized models
+ // TODO(b/169350710): Enable by default.
bool enable_quantization;
} TFLGpuDelegateOptions;
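Until that TODO is resolved, iOS callers opt in explicitly. A sketch of doing so through this struct, assuming the `TFLGpuDelegateCreate`/`TFLGpuDelegateDelete` functions declared in metal_delegate.h; the wait-type value is an assumed enum member and the field settings are illustrative:

```c++
#include "tensorflow/lite/delegates/gpu/metal_delegate.h"

// Illustrative only: enables quantized-model support on the Metal delegate,
// which (per the TODO above) is not yet on by default.
TfLiteDelegate* MakeMetalDelegate() {
  TFLGpuDelegateOptions options;
  options.allow_precision_loss = false;
  options.wait_type = TFLGpuDelegateWaitTypePassive;  // Assumed enum value.
  options.enable_quantization = true;
  return TFLGpuDelegateCreate(&options);  // Destroy with TFLGpuDelegateDelete.
}
```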

tensorflow/lite/g3doc/performance/delegates.md

@@ -21,10 +21,11 @@ TensorFlow Lite provides the following delegates for hardware acceleration:
* **GPU delegate for cross platform acceleration** - The GPU delegate can be
used on both Android and iOS. It is optimized to run 32-bit and 16-bit float
- based models where a GPU is available. For an overview of the GPU delegate,
- see [TensorFlow Lite on GPU](gpu_advanced.md). For step-by-step tutorials on
- using the GPU delegate with Android and iOS, see
- [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
+ based models where a GPU is available. It also supports 8-bit quantized
+ models and provides GPU performance on par with their float versions. For
+ details on the GPU delegate, see [TensorFlow Lite on GPU](gpu_advanced.md).
+ For step-by-step tutorials on using the GPU delegate with Android and iOS,
+ see [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
* **NNAPI delegate for newer Android devices** - The NNAPI delegate can be
used to accelerate models on Android devices with GPU, DSP and / or NPU
available. It is available in Android 8.1 (API 27+) or higher. For an

tensorflow/lite/g3doc/performance/gpu.md

@@ -14,6 +14,10 @@ run fast enough for previously not available real-time applications.
Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
not require quantization for optimal performance.
+ **NOTE:** The delegate does accept 8-bit quantized models on Android. Support on
+ iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for
+ details.
Another benefit with GPU inference is its power efficiency. GPUs carry out the
computations in a very efficient and optimized manner, so that they consume less
power and generate less heat than when the same task is run on CPUs.

tensorflow/lite/g3doc/performance/gpu_advanced.md

@@ -285,12 +285,10 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
the options, to avoid any unexpected behavior if default values are changed in
the future.
- ### Running quantized models (Experimental)
+ ### Running quantized models on GPU
- The GPU delegate already supports
- [float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
- models. There is experimental support on Android and iOS to run 8-bit quantized
- as well. This includes all flavors of quantization, including:
+ This section explains how the GPU delegate accelerates 8-bit quantized models.
+ This includes all flavors of quantization, including:
* Models trained with
[Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
@@ -322,12 +320,14 @@ This feature can be enabled using delegate options as follows:
#### Android
+ Android APIs support quantized models by default. To disable, do the following:
**C++ API**
```c++
- // NEW: Prepare custom options with feature enabled.
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
- options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+ options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;
auto* delegate = TfLiteGpuDelegateV2Create(options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
@@ -337,13 +337,16 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
```java
- // NEW: Prepare GPU delegate with feature turned on.
- GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+ GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
```
#### iOS
+ Support for quantized models on iOS APIs is experimental. To enable, do the
+ following:
**Swift API**
```swift

tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java

@@ -76,8 +76,8 @@ public final class GpuDelegateTest {
"tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");
Interpreter.Options options = new Interpreter.Options();
- try (GpuDelegate delegate =
-     new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+ // Default behavior allows quantized models.
+ try (GpuDelegate delegate = new GpuDelegate();
Interpreter interpreter =
new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
byte[][] output = new byte[1][1001];
@@ -98,12 +98,13 @@
"tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");
Interpreter.Options options = new Interpreter.Options();
- try (GpuDelegate delegate = new GpuDelegate();
+ try (GpuDelegate delegate =
+     new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
Interpreter interpreter =
new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
byte[][] output = new byte[1][1001];
interpreter.run(img, output);
- // Original execution plan remains since default behavior doesn't allow quantized models.
+ // Original execution plan remains since we disabled quantized models.
assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31);
assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});