From 4e68fdecae3b97e120636114c7d50fd331609f40 Mon Sep 17 00:00:00 2001
From: Sachin Joglekar <srjoglekar@google.com>
Date: Tue, 29 Sep 2020 10:04:38 -0700
Subject: [PATCH] Enable quantized models by default in the Android GPU
 delegate APIs. Also update the related docs.

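With this change, TfLiteGpuDelegateOptionsV2Default() sets
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT and GpuDelegate.Options
defaults quantizedModelsAllowed to true, so 8-bit quantized models run
on the GPU delegate on Android with no extra configuration. A minimal
sketch of the resulting default usage, assuming an already-built
tflite::Interpreter named "interpreter" (name illustrative):

    // Quantized models are now delegated by default; no flag needed.
    TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
    auto* delegate = TfLiteGpuDelegateV2Create(&options);
    if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
      return false;  // Handle delegation failure.
    }
    // ... run inference via interpreter->Invoke() ...
    TfLiteGpuDelegateV2Delete(delegate);
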
PiperOrigin-RevId: 334403600
Change-Id: I4ffddeb457ddfef46171b840b99ecddb90bf2c27
---
 tensorflow/lite/delegates/gpu/delegate.cc       |  2 +-
 tensorflow/lite/delegates/gpu/delegate.h        |  5 ++++-
 .../org/tensorflow/lite/gpu/GpuDelegate.java    |  4 ++--
 tensorflow/lite/delegates/gpu/metal_delegate.h  |  1 +
 tensorflow/lite/g3doc/performance/delegates.md  |  9 +++++----
 tensorflow/lite/g3doc/performance/gpu.md        |  4 ++++
 .../lite/g3doc/performance/gpu_advanced.md      | 23 +++++++++++++----------
 .../tensorflow/lite/gpu/GpuDelegateTest.java    |  9 +++++----
 8 files changed, 35 insertions(+), 22 deletions(-)

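Because experimental_flags is a bitmask, an app that wants to disable only
quantized inference while preserving any other flags can clear the single bit
instead of resetting the whole field. A sketch of that variant (illustrative;
the doc examples below reset the field to TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE):

    TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
    // Clear only the quantization bit; other experimental flags are kept.
    options.experimental_flags &= ~TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
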
diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc
index bfc2b7f08c4..98303b51da8 100644
--- a/tensorflow/lite/delegates/gpu/delegate.cc
+++ b/tensorflow/lite/delegates/gpu/delegate.cc
@@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
       .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
       .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
       .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
-      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
+      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
       .max_delegated_partitions = 1,
   };
   return options;
diff --git a/tensorflow/lite/delegates/gpu/delegate.h b/tensorflow/lite/delegates/gpu/delegate.h
index 9af586bfd75..40a06bb4384 100644
--- a/tensorflow/lite/delegates/gpu/delegate.h
+++ b/tensorflow/lite/delegates/gpu/delegate.h
@@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority {
 enum TfLiteGpuExperimentalFlags {
   TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0,
   // Enables inference on quantized models with the delegate.
+  // NOTE: TfLiteGpuDelegateOptionsV2Default() enables this flag by default.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0,
   // Enforces execution with the provided backend.
   TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1,
@@ -108,6 +109,8 @@ typedef struct {
 //   priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
 //   priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
 //   priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO
+//   experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT
+//   max_delegated_partitions = 1
 TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default();
 
-// Creates a new delegate instance that need to be destroyed with
+// Creates a new delegate instance that needs to be destroyed with
diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
index 78cab0d2cbf..5eb6881be88 100644
--- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
+++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java
@@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable {
      *
      * <p>WARNING: This is an experimental API and subject to change.
      *
-     * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models.
+     * @param quantizedModelsAllowed When {@code true} (default), the GPU may run quantized models.
      */
     public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) {
       this.quantizedModelsAllowed = quantizedModelsAllowed;
@@ -87,7 +87,7 @@ public class GpuDelegate implements Delegate, Closeable {
     }
 
     boolean precisionLossAllowed = true;
-    boolean quantizedModelsAllowed = false;
+    boolean quantizedModelsAllowed = true;
     int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
   }
 
diff --git a/tensorflow/lite/delegates/gpu/metal_delegate.h b/tensorflow/lite/delegates/gpu/metal_delegate.h
index e4bdba36799..ea9da126954 100644
--- a/tensorflow/lite/delegates/gpu/metal_delegate.h
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.h
@@ -47,6 +47,7 @@ typedef struct {
   bool allow_precision_loss;
   TFLGpuDelegateWaitType wait_type;
   // Allows execution of integer quantized models
+  // TODO(b/169350710): Enable by default.
   bool enable_quantization;
 } TFLGpuDelegateOptions;
 
diff --git a/tensorflow/lite/g3doc/performance/delegates.md b/tensorflow/lite/g3doc/performance/delegates.md
index 14aeece21fa..6b233075398 100644
--- a/tensorflow/lite/g3doc/performance/delegates.md
+++ b/tensorflow/lite/g3doc/performance/delegates.md
@@ -21,10 +21,11 @@ TensorFlow Lite provides the following delegates for hardware acceleration:
 
 *   **GPU delegate for cross platform acceleration** - The GPU delegate can be
     used on both Android and iOS. It is optimized to run 32-bit and 16-bit float
-    based models where a GPU is available. For an overview of the GPU delegate,
-    see [TensorFlow Lite on GPU](gpu_advanced.md). For step-by-step tutorials on
-    using the GPU delegate with Android and iOS, see
-    [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
+    based models where a GPU is available. It also supports 8-bit quantized
+    models, with performance on par with their float versions. For details on
+    the GPU delegate, see [TensorFlow Lite on GPU](gpu_advanced.md). For
+    step-by-step tutorials on using the GPU delegate with Android and iOS,
+    see [TensorFlow Lite GPU Delegate Tutorial](gpu.md).
 *   **NNAPI delegate for newer Android devices** - The NNAPI delegate can be
     used to accelerate models on Android devices with GPU, DSP and / or NPU
     available. It is available in Android 8.1 (API 27+) or higher. For an
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
index 3d9989558f5..96e8aa6f9dc 100644
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -14,6 +14,10 @@ run fast enough for previously not available real-time applications.
 Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
 not require quantization for optimal performance.
 
+**NOTE:** The delegate accepts 8-bit quantized models by default on Android;
+support on iOS is experimental. Refer to the
+[advanced documentation](gpu_advanced.md) for details.
+
 Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume less
 power and generate less heat than when the same task is run on CPUs.
diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
index 652100ab850..71415693f86 100644
--- a/tensorflow/lite/g3doc/performance/gpu_advanced.md
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -285,12 +285,10 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
 the options, to avoid any unexpected behavior if default values are changed in
 the future.
 
-### Running quantized models (Experimental)
+### Running quantized models on GPU
 
-The GPU delegate already supports
-[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
-models. There is experimental support on Android and iOS to run 8-bit quantized
-as well. This includes all flavors of quantization, including:
+This section explains how the GPU delegate accelerates 8-bit quantized models.
+This covers all flavors of quantization, including:
 
 *   Models trained with
     [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
@@ -322,12 +320,14 @@ This feature can be enabled using delegate options as follows:
 
 #### Android
 
+Android APIs support quantized models by default. To disable, do the following:
+
 **C++ API**
 
 ```c++
-// NEW: Prepare custom options with feature enabled.
+// NEW: Prepare custom options with quantized inference disabled.
 TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
-options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;
 
-auto* delegate = TfLiteGpuDelegateV2Create(options);
+auto* delegate = TfLiteGpuDelegateV2Create(&options);
 if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
@@ -337,13 +337,16 @@ if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
 
 ```java
-// NEW: Prepare GPU delegate with feature turned on.
-GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+// NEW: Prepare GPU delegate with quantized inference disabled.
+GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
 
 Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
 ```
 
 #### iOS
 
+Support for quantized models on iOS APIs is experimental. To enable, do the
+following:
+
 **Swift API**
 
 ```swift
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
index d92a7119aab..de320fd68d6 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java
@@ -76,8 +76,8 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");
 
     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate =
-            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+    // Default behavior allows quantized models.
+    try (GpuDelegate delegate = new GpuDelegate();
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
@@ -98,12 +98,13 @@ public final class GpuDelegateTest {
             "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");
 
     Interpreter.Options options = new Interpreter.Options();
-    try (GpuDelegate delegate = new GpuDelegate();
+    try (GpuDelegate delegate =
+            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(false));
         Interpreter interpreter =
             new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) {
       byte[][] output = new byte[1][1001];
       interpreter.run(img, output);
-      // Original execution plan remains since default behavior doesn't allow quantized models.
+      // Original execution plan remains since we disabled quantized models.
       assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31);
       assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
       assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});