From 4001e3dad3c6340b0c2001d89b3954f189e9aeb5 Mon Sep 17 00:00:00 2001
From: Sachin Joglekar
Date: Mon, 18 May 2020 15:22:44 -0700
Subject: [PATCH] Updates GPU delegate documentation with experimental quant support

PiperOrigin-RevId: 312165090
Change-Id: I8fb624f71101fce6a379ed24f6002f8f4b60245d
---
 tensorflow/lite/g3doc/performance/gpu.md      |   2 +-
 .../lite/g3doc/performance/gpu_advanced.md    | 189 ++++++++----------
 .../g3doc/performance/model_optimization.md   |   6 +-
 3 files changed, 84 insertions(+), 113 deletions(-)

diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
index 8762afb4c83..b5abf46f845 100644
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -31,7 +31,7 @@ models.
For a step-by-step tutorial, watch the
[GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video.

-Note: This requires OpenGL ES 3.1 or higher.
+Note: This requires OpenCL or OpenGL ES (3.1 or higher).

#### Step 1. Clone the TensorFlow source code and open it in Android Studio

diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
index 9f47c2e55e8..dce3eb8db6b 100644
--- a/tensorflow/lite/g3doc/performance/gpu_advanced.md
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -1,9 +1,9 @@
# TensorFlow Lite on GPU

[TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several
-hardware accelerators. This document describes how to use the GPU backend using
-the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher)
-and iOS (requires iOS 8 or later).
+hardware accelerators. This document describes how to use the GPU backend using
+the TensorFlow Lite delegate APIs on Android (requires OpenCL or OpenGL ES 3.1
+or higher) and iOS (requires iOS 8 or later).

## Benefits of GPU Acceleration

@@ -35,25 +35,33 @@ power and generating less heat than the same task run on a CPU.

TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float
precision:

-* `ADD v1`
-* `AVERAGE_POOL_2D v1`
-* `CONCATENATION v1`
-* `CONV_2D v1`
-* `DEPTHWISE_CONV_2D v1-2`
-* `FULLY_CONNECTED v1`
-* `LOGISTIC v1`
-* `MAX_POOL_2D v1`
-* `MUL v1`
-* `PAD v1`
-* `PRELU v1`
-* `RELU v1`
-* `RELU6 v1`
-* `RESHAPE v1`
-* `RESIZE_BILINEAR v1`
-* `SOFTMAX v1`
-* `STRIDED_SLICE v1`
-* `SUB v1`
-* `TRANSPOSE_CONV v1`
+* `ADD`
+* `AVERAGE_POOL_2D`
+* `CONCATENATION`
+* `CONV_2D`
+* `DEPTHWISE_CONV_2D v1-2`
+* `EXP`
+* `FULLY_CONNECTED`
+* `LOGISTIC`
+* `LSTM v2 (Basic LSTM only)`
+* `MAX_POOL_2D`
+* `MAXIMUM`
+* `MINIMUM`
+* `MUL`
+* `PAD`
+* `PRELU`
+* `RELU`
+* `RELU6`
+* `RESHAPE`
+* `RESIZE_BILINEAR v1-3`
+* `SOFTMAX`
+* `STRIDED_SLICE`
+* `SUB`
+* `TRANSPOSE_CONV`
+
+By default, all ops are only supported at version 1. Enabling the
+[experimental quantization support](gpu_advanced.md#running-quantized-models-experimental-android-only)
+allows the appropriate higher versions; for example, ADD v2.

## Basic Usage

@@ -82,8 +90,8 @@ delegate.close();

### Android (C/C++)

For C/C++ usage of TensorFlow Lite GPU on Android, the GPU delegate can be
-created with `TfLiteGpuDelegateCreate()` and destroyed with
-`TfLiteGpuDelegateDelete()`.
+created with `TfLiteGpuDelegateV2Create()` and destroyed with
+`TfLiteGpuDelegateV2Delete()`.

```c++
// Set up interpreter.
@@ -94,15 +102,7 @@ std::unique_ptr<Interpreter> interpreter;
InterpreterBuilder(*model, op_resolver)(&interpreter);

// NEW: Prepare GPU delegate.
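// With the V2 API used below, passing nullptr selects the default options.
// To customize, initialize a TfLiteGpuDelegateOptionsV2 with
// TfLiteGpuDelegateOptionsV2Default(), adjust its fields as needed, and pass
// its address to TfLiteGpuDelegateV2Create() (see the quantization example
// later in this doc).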
-const TfLiteGpuDelegateOptions options = {
-  .metadata = NULL,
-  .compile_options = {
-    .precision_loss_allowed = 1,  // FP16
-    .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST,
-    .dynamic_batch_enabled = 0,   // Not fully functional yet
-  },
-};
-auto* delegate = TfLiteGpuDelegateCreate(&options);
+auto* delegate = TfLiteGpuDelegateV2Create(/*default options=*/nullptr);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;

// Run inference.
@@ -111,9 +111,13 @@ if (interpreter->Invoke() != kTfLiteOk) return false;
ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));

// NEW: Clean up.
-TfLiteGpuDelegateDelete(delegate);
+TfLiteGpuDelegateV2Delete(delegate);
```

+Use `TfLiteGpuDelegateOptionsV2` to create a delegate instance with custom
+options. You can initialize the default options with
+`TfLiteGpuDelegateOptionsV2Default()` and then modify them as necessary.
+
TFLite GPU for Android C/C++ uses the [Bazel](https://bazel.io) build system.
The delegate can be built, for example, using the following command:

@@ -165,6 +169,43 @@ called.

## Advanced Usage

+### Running quantized models (Experimental, Android only)
+
+The GPU delegate already supports
+[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant)
+models. There is experimental support on Android for running 8-bit quantized
+models as well. This includes all flavors of quantization:
+
+* Models trained with
+  [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization)
+* [Post-training dynamic-range quantization](https://www.tensorflow.org/lite/performance/post_training_quant)
+* [Post-training full-integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant)
+
+To optimize performance, use models that have floating-point input & output
+tensors.
+
+This feature can be enabled using delegate options as follows:
+
+**C++ API**
+
+```c++
+// NEW: Prepare custom options with the feature enabled.
+TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
+options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
+
+auto* delegate = TfLiteGpuDelegateV2Create(&options);
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+```
+
+**Java API**
+
+```java
+// NEW: Prepare GPU delegate with the feature turned on.
+GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
+
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+```
+
### Delegate Options for iOS

`NewGpuDelegate()` accepts a `struct` of options.

@@ -210,7 +251,7 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set
the options, to avoid any unexpected behavior if default values are changed in
the future.

-### Input/Output Buffers
+### Input/Output Buffers (iOS only)

To do computation on the GPU, data must be made available to the GPU. This
often requires performing a memory copy. It is desirable not to cross the CPU/GPU
@@ -229,80 +270,10 @@ To achieve best performance, TensorFlow Lite makes it possible for users to
directly read from and write to the TensorFlow hardware buffer and bypass
avoidable memory copies.

-#### Android
-
-Assuming the image input is in the GPU memory, it must first be converted to an
-OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to
-a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`.
Note that -`Interpreter.bindGlBufferToTensor()` must be called before -`Interpreter.modifyGraphWithDelegate()`. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create an SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int inputSsboId = id[0]; - -// Create interpreter. -Interpreter interpreter = new Interpreter(tfliteModel); -Tensor inputTensor = interpreter.getInputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null input argument indicates use of the bound buffer for input. -fillSsboWithCameraImageTexture(inputSsboId); -float[] outputArray = new float[outputSize]; -interpreter.runInference(null, outputArray); -``` - -A similar approach can be applied to the output tensor. In that case, -`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to -disable the default copying of the network's output from GPU memory to CPU -memory. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create a SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int outputSsboId = id[0]; - -// Create interpreter. -Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true); -Interpreter interpreter = new Interpreter(tfliteModel, options); -Tensor outputTensor = interpreter.getOutputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null output argument indicates use of the bound buffer for output. -ByteBuffer input = getCameraImageByteBuffer(); -interpreter.runInference(input, null); -renderOutputSsbo(outputSsboId); -``` - -#### iOS - Assuming the image input is in GPU memory, it must first be converted to a `MTLBuffer` object for Metal. You can associate a TfLiteTensor to a -user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that -`BindMetalBufferToTensor()` must be called before +user-prepared `MTLBuffer` with `TFLGpuDelegateBindMetalBufferToTensor()`. Note +that `TFLGpuDelegateBindMetalBufferToTensor()` must be called before `Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is, by default, copied from GPU memory to CPU memory. This behavior can be turned off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during @@ -312,8 +283,8 @@ initialization. // Prepare GPU delegate. 
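// Buffers must be bound with TFLGpuDelegateBindMetalBufferToTensor() before
// ModifyGraphWithDelegate() is called; SetAllowBufferHandleOutput(true) keeps
// the output in GPU memory instead of copying it back to the CPU.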
auto* delegate = NewGpuDelegate(nullptr); interpreter->SetAllowBufferHandleOutput(true); // disable default gpu->cpu copy -if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; -if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md index feb6cfecea6..c66b06f9b59 100644 --- a/tensorflow/lite/g3doc/performance/model_optimization.md +++ b/tensorflow/lite/g3doc/performance/model_optimization.md @@ -89,9 +89,9 @@ The following types of quantization are available in TensorFlow Lite: Technique | Data requirements | Size reduction | Accuracy | Supported hardware ------------------------------------------------------------------------------------------------------- | -------------------------------- | -------------- | --------------------------- | ------------------ [Post-training float16 quantization](post_training_float16_quant.ipynb) | No data | Up to 50% | Insignificant accuracy loss | CPU, GPU -[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU -[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, EdgeTPU, Hexagon DSP -[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, EdgeTPU, Hexagon DSP +[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU, GPU (Android) +[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP +[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP Below are the latency and accuracy results for post-training quantization and quantization-aware training on a few models. All latency numbers are measured on