From 4e68fdecae3b97e120636114c7d50fd331609f40 Mon Sep 17 00:00:00 2001
From: Sachin Joglekar <srjoglekar@google.com>
Date: Tue, 29 Sep 2020 10:04:38 -0700
Subject: [PATCH] Enable quantized models by default on Android APIs. Also
 makes related changes to docs.

PiperOrigin-RevId: 334403600
Change-Id: I4ffddeb457ddfef46171b840b99ecddb90bf2c27
---
 tensorflow/lite/delegates/gpu/delegate.cc      |  2 +-
 tensorflow/lite/delegates/gpu/delegate.h       |  3 +++
 .../org/tensorflow/lite/gpu/GpuDelegate.java   |  4 ++--
 tensorflow/lite/delegates/gpu/metal_delegate.h |  1 +
 tensorflow/lite/g3doc/performance/delegates.md |  9 +++++----
 tensorflow/lite/g3doc/performance/gpu.md       |  4 ++++
 .../lite/g3doc/performance/gpu_advanced.md     | 17 ++++++++++-------
 .../tensorflow/lite/gpu/GpuDelegateTest.java   |  9 +++++----
 8 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc
index bfc2b7f08c4..98303b51da8 100644
--- a/tensorflow/lite/delegates/gpu/delegate.cc
+++ b/tensorflow/lite/delegates/gpu/delegate.cc
@@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
       .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
       .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
       .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
-      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
+      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
       .max_delegated_partitions = 1,
   };
   return options;
diff --git a/tensorflow/lite/delegates/gpu/delegate.h b/tensorflow/lite/delegates/gpu/delegate.h
index 9af586bfd75..40a06bb4384 100644
--- a/tensorflow/lite/delegates/gpu/delegate.h
+++ b/tensorflow/lite/delegates/gpu/delegate.h
@@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority {
 enum TfLiteGpuExperimentalFlags {
   TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0,
   // Enables inference on quantized models with the delegate.
+  // NOTE: This is enabled in TfLiteGpuDelegateOptionsV2Default.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0,
   // Enforces execution with the provided backend.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1,
@@ -108,6 +109,8 @@ typedef struct {
 //   priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
 //   priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
 //   priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
+//   experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT
+//   max_delegated_partitions = 1
 TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default();

 // Creates a new delegate instance that needs to be destroyed with
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
index 78cab0d2cbf..5eb6881be88 100644
--- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
+++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
@@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable {
      *
      * <p>WARNING: This is an experimental API and subject to change.
      *
-     * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models.
+     * @param quantizedModelsAllowed When {@code true} (default), the GPU may run quantized models.
      */
     public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) {
       this.quantizedModelsAllowed = quantizedModelsAllowed;
@@ -87,7 +87,7 @@ public class GpuDelegate implements Delegate, Closeable {
     }

     boolean precisionLossAllowed = true;
-    boolean quantizedModelsAllowed = false;
+    boolean quantizedModelsAllowed = true;
     int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
   }
diff --git a/tensorflow/lite/delegates/gpu/metal_delegate.h b/tensorflow/lite/delegates/gpu/metal_delegate.h
index e4bdba36799..ea9da126954 100644
--- a/tensorflow/lite/delegates/gpu/metal_delegate.h
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.h
@@ -47,6 +47,7 @@ typedef struct {
   bool allow_precision_loss;
   TFLGpuDelegateWaitType wait_type;
   // Allows execution of integer quantized models
+  // TODO(b/169350710): Enable by default.
   bool enable_quantization;
 } TFLGpuDelegateOptions;
diff --git a/tensorflow/lite/g3doc/performance/delegates.md b/tensorflow/lite/g3doc/performance/delegates.md
index 14aeece21fa..6b233075398 100644
--- a/tensorflow/lite/g3doc/performance/delegates.md
+++ b/tensorflow/lite/g3doc/performance/delegates.md
@@ -21,10 +21,11 @@ TensorFlow Lite provides the following delegates for hardware acceleration:
 *   **GPU delegate for cross platform acceleration** - The GPU delegate can be
     used on both Android and iOS. It is optimized to run 32-bit and 16-bit float
-    based models where a GPU is available. For an overview of the GPU delegate,
-    see [TensorFlow Lite on GPU](gpu_advanced.md). For step-by-step tutorials on
-    using the GPU delegate with Android and iOS, see
-    [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
+    based models where a GPU is available. It also supports 8-bit quantized
+    models and runs them with performance on par with their float versions. For
+    details on the GPU delegate, see [TensorFlow Lite on GPU](gpu_advanced.md).
+    For step-by-step tutorials on using the GPU delegate with Android and iOS,
+    see [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
 *   **NNAPI delegate for newer Android devices** - The NNAPI delegate can be
     used to accelerate models on Android devices with GPU, DSP and / or NPU
     available. It is available in Android 8.1 (API 27+) or higher. For an
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
index 3d9989558f5..96e8aa6f9dc 100644
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -14,6 +14,10 @@ run fast enough for previously not available real-time applications.
 Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
 not require quantization for optimal performance.

+**NOTE:** The GPU delegate can also run 8-bit quantized models. Quantized
+inference is enabled by default on Android; support on iOS is experimental.
+Refer to the [advanced documentation](gpu_advanced.md) for details.
+
 Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume
 less power and generate less heat than when the same task is run on CPUs.
diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
index 652100ab850..71415693f86 100644
--- a/tensorflow/lite/g3doc/performance/gpu_advanced.md
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -285,12 +285,10 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
 the options, to avoid any unexpected behavior if default values are changed in
 the future.
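+
+For example, here is a minimal sketch that spells the defaults out explicitly
+(the values below mirror `TfLiteGpuDelegateOptionsV2Default()` as of this
+change; check `delegate.h` for the authoritative list of fields):
+
+```c++
+TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
+// State the settings you rely on instead of depending on the defaults.
+options.is_precision_loss_allowed = 0;  // Prefer full precision.
+options.inference_preference =
+    TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
+options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
+options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+options.max_delegated_partitions = 1;
+
+auto* delegate = TfLiteGpuDelegateV2Create(&options);
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+```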
 
-### Running quantized models (Experimental)
+### Running quantized models on GPU

-The GPU delegate already supports
-[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
-models. There is experimental support on Android and iOS to run 8-bit quantized
-as well. This includes all flavors of quantization, including:
+This section explains how the GPU delegate accelerates 8-bit quantized models.
+This covers all flavors of quantization, including:

 *   Models trained with
     [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
@@ -322,12 +320,14 @@ This feature can be enabled using delegate options as follows:

 #### Android

+Android APIs support quantized models by default. To disable, do the
+following:
+
 **C++ API**

 ```c++
-// NEW: Prepare custom options with feature enabled.
+// NEW: Prepare custom options with the feature disabled.
 TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
-options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;

 auto* delegate = TfLiteGpuDelegateV2Create(&options);
 if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
@@ -337,13 +337,16 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

 ```java
-// NEW: Prepare GPU delegate with feature turned on.
-GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+// NEW: Prepare GPU delegate with the feature turned off.
+GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));

 Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
 ```

 #### iOS

+Support for quantized models on iOS APIs is experimental. To enable, do the
+following:
+
 **Swift API**

 ```swift
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
index d92a7119aab..de320fd68d6 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
@@ -76,8 +76,8 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate =
-            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+    // Default behavior allows quantized models.
+    try (GpuDelegate delegate = new GpuDelegate();
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
@@ -98,12 +98,13 @@
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate = new GpuDelegate();
+    try (GpuDelegate delegate =
+            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
       interpreter.run(img, output);
-      // Original execution plan remains since default behavior doesn't allow quantized models.
+      // Original execution plan remains since we disabled quantized models.
       assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31);
       assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
       assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
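
For C API users, the practical effect of this change can be sketched as
follows (an illustrative snippet, not part of the patch; it assumes the usual
create/modify/delete flow from `delegate.h`). Quantized execution no longer
has to be requested, and opting out now means clearing the flag bit:

```c++
// After this change, the default options on Android already include
// TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT, so quantized models are
// delegated with no extra setup:
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();

// To opt out of quantized inference, clear just that bit. Unlike assigning
// TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE outright (as the docs snippet above
// does), this preserves any other flags that may be set, such as
// TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY.
options.experimental_flags &= ~TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;

auto* delegate = TfLiteGpuDelegateV2Create(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
// ... run inference ...
TfLiteGpuDelegateV2Delete(delegate);
```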