Enable quantized models by default in the GPU delegate's Android APIs, and make related changes to the docs.
PiperOrigin-RevId: 334403600 Change-Id: I4ffddeb457ddfef46171b840b99ecddb90bf2c27
@@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
       .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
       .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
       .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
-      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
+      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
       .max_delegated_partitions = 1,
   };
   return options;
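For reference, a minimal C++ sketch of what this default change means for callers: `TfLiteGpuDelegateOptionsV2Default()` now returns options with `TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT` already set, so float-only behavior requires clearing that bit (or assigning `TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE`, as the updated docs below do). The surrounding calls are illustrative boilerplate, not part of this change.

```c++
#include "tensorflow/lite/delegates/gpu/delegate.h"

// Rely on the new default: quantized-model support is already enabled.
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();

// Opt out, if needed, by clearing only the quantization bit so that any
// other experimental flags remain untouched.
options.experimental_flags &= ~TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;

// Create the delegate from the chosen options (pointer-taking API).
TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
```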
@@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority {
 enum TfLiteGpuExperimentalFlags {
   TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0,
   // Enables inference on quantized models with the delegate.
+  // NOTE: This is enabled in TfLiteGpuDelegateOptionsV2Default.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0,
   // Enforces execution with the provided backend.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1,
@@ -108,6 +109,8 @@ typedef struct {
 // priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
 // priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
 // priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
+// experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT
+// max_delegated_partitions = 1
 TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default();

 // Creates a new delegate instance that need to be destroyed with
@@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable {
      *
      * <p>WARNING: This is an experimental API and subject to change.
      *
-     * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models.
+     * @param quantizedModelsAllowed When {@code true} (default), the GPU may run quantized models.
      */
     public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) {
       this.quantizedModelsAllowed = quantizedModelsAllowed;
@@ -87,7 +87,7 @@ public class GpuDelegate implements Delegate, Closeable {
     }

     boolean precisionLossAllowed = true;
-    boolean quantizedModelsAllowed = false;
+    boolean quantizedModelsAllowed = true;
     int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
   }

@@ -47,6 +47,7 @@ typedef struct {
   bool allow_precision_loss;
   TFLGpuDelegateWaitType wait_type;
   // Allows execution of integer quantized models
+  // TODO(b/169350710): Enable by default.
   bool enable_quantization;
 } TFLGpuDelegateOptions;
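On the Metal (iOS) side, `enable_quantization` stays opt-in for now (see the TODO above). A minimal sketch of turning it on explicitly, assuming the existing `TFLGpuDelegateCreate()`/`TFLGpuDelegateDelete()` entry points, the `TFLGpuDelegateWaitTypePassive` enum value, and the header path shown in the include (all from the Metal delegate header, not introduced by this change):

```c++
#include "tensorflow/lite/delegates/gpu/metal_delegate.h"  // assumed path

// Fill in the three fields shown in the hunk above; quantized inference
// must still be requested explicitly on iOS.
TFLGpuDelegateOptions options;
options.allow_precision_loss = false;               // keep full float precision
options.wait_type = TFLGpuDelegateWaitTypePassive;  // usual synchronization mode
options.enable_quantization = true;                 // opt in (not yet the default, b/169350710)

TfLiteDelegate* delegate = TFLGpuDelegateCreate(&options);
// ... attach the delegate to the interpreter, run inference, then:
TFLGpuDelegateDelete(delegate);
```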
@@ -21,10 +21,11 @@ TensorFlow Lite provides the following delegates for hardware acceleration:

 *   **GPU delegate for cross platform acceleration** - The GPU delegate can be
     used on both Android and iOS. It is optimized to run 32-bit and 16-bit float
-    based models where a GPU is available. For an overview of the GPU delegate,
-    see [TensorFlow Lite on GPU](gpu_advanced.md). For step-by-step tutorials on
-    using the GPU delegate with Android and iOS, see
-    [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
+    based models where a GPU is available. It also supports 8-bit quantized
+    models and provides GPU performance on par with their float versions. For
+    details on the GPU delegate, see [TensorFlow Lite on GPU](gpu_advanced.md).
+    For step-by-step tutorials on using the GPU delegate with Android and iOS,
+    see [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
 *   **NNAPI delegate for newer Android devices** - The NNAPI delegate can be
     used to accelerate models on Android devices with GPU, DSP and / or NPU
     available. It is available in Android 8.1 (API 27+) or higher. For an
@@ -14,6 +14,10 @@ run fast enough for previously not available real-time applications.
 Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
 not require quantization for optimal performance.

+**NOTE:** The delegate does accept 8-bit quantized models on Android. Support on
+iOS is experimental. Refer to the [advanced documentation](gpu_advanced.md) for
+details.
+
 Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume less
 power and generate less heat than when the same task is run on CPUs.
@@ -285,12 +285,10 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
 the options, to avoid any unexpected behavior if default values are changed in
 the future.

-### Running quantized models (Experimental)
+### Running quantized models on GPU

-The GPU delegate already supports
-[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
-models. There is experimental support on Android and iOS to run 8-bit quantized
-as well. This includes all flavors of quantization, including:
+This section explains how the GPU delegate accelerates 8-bit quantized models.
+This includes all flavors of quantization, including:

 *   Models trained with
     [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
@@ -322,12 +320,14 @@ This feature can be enabled using delegate options as follows:

 #### Android

+Android APIs support quantized models by default. To disable, do the following:
+
 **C++ API**

 ```c++
-// NEW: Prepare custom options with feature enabled.
 TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
-options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;

 auto* delegate = TfLiteGpuDelegateV2Create(&options);
 if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
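The doc snippet in this hunk stops at `ModifyGraphWithDelegate`. For completeness, a sketch of the surrounding delegate lifetime, assuming `interpreter` is the usual `std::unique_ptr<tflite::Interpreter>`: the delegate must outlive the interpreter and is released with `TfLiteGpuDelegateV2Delete`.

```c++
// Sketch of the full lifetime around the snippet above (not part of this change).
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;  // opt out of quantized models

TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

// ... run inference via interpreter->Invoke() ...

interpreter.reset();                  // destroy the interpreter first
TfLiteGpuDelegateV2Delete(delegate);  // then release the delegate
```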
@@ -337,13 +337,16 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

 ```java
-// NEW: Prepare GPU delegate with feature turned on.
-GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));

 Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
 ```

 #### iOS

+Support for quantized models on iOS APIs is experimental. To enable, do the
+following:
+
 **Swift API**

 ```swift
@@ -76,8 +76,8 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate =
-            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+    // Default behavior allows quantized models.
+    try (GpuDelegate delegate = new GpuDelegate();
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
@@ -98,12 +98,13 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");

     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate = new GpuDelegate();
+    try (GpuDelegate delegate =
+            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
       interpreter.run(img, output);
-      // Original execution plan remains since default behavior doesn't allow quantized models.
+      // Original execution plan remains since we disabled quantized models.
       assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31);
       assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
       assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});