/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/delegate.h"
|
|
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <thread> // NOLINT(build/c++11)
|
|
#include <vector>
|
|
|
|
#include "absl/container/flat_hash_map.h"
|
|
#include "absl/memory/memory.h"
|
|
#include "absl/types/span.h"
|
|
#include "tensorflow/lite/builtin_ops.h"
|
|
#include "tensorflow/lite/c/common.h"
|
|
#include "tensorflow/lite/delegates/gpu/api.h"
|
|
#include "tensorflow/lite/delegates/gpu/cl/api.h"
|
|
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
|
|
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
|
|
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
|
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
|
|
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
|
|
#include "tensorflow/lite/delegates/gpu/common/quantization_util.h"
|
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
|
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
|
|
#include "tensorflow/lite/minimal_logging.h"
|
|
|
|
#ifndef CL_DELEGATE_NO_GL
|
|
#include "tensorflow/lite/delegates/gpu/gl/api2.h"
|
|
#endif
|
|
|
|

namespace tflite {
namespace gpu {
namespace {

InferencePriority ToPriority(int32_t priority) {
  switch (priority) {
    case TFLITE_GPU_INFERENCE_PRIORITY_AUTO:
      return InferencePriority::AUTO;
    case TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
      return InferencePriority::MAX_PRECISION;
    case TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
      return InferencePriority::MIN_LATENCY;
    case TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE:
      return InferencePriority::MIN_MEMORY_USAGE;
  }
  return InferencePriority::UNKNOWN;
}

InferenceUsage ToUsage(int32_t usage) {
  switch (usage) {
    case TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER:
      return InferenceUsage::FAST_SINGLE_ANSWER;
    case TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED:
      return InferenceUsage::SUSTAINED_SPEED;
  }
  return InferenceUsage::UNKNOWN;
}

// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);

class Delegate {
 public:
  explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
      : num_delegate_kernels_(0) {
    options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
    if (options_.max_delegated_partitions <= 0) {
      options_.max_delegated_partitions = 1;
    }
  }

  TfLiteDelegate* tflite_delegate() { return &delegate_; }
  const TfLiteGpuDelegateOptionsV2& options() const { return options_; }

  bool IsQuantOpsAllowed() const {
    return options_.experimental_flags &
           TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
  }
  int MaxDelegatedPartitions() const {
    return options_.max_delegated_partitions;
  }
  int num_delegate_kernels() const { return num_delegate_kernels_; }

 private:
  TfLiteDelegate delegate_ = {
      .data_ = reinterpret_cast<void*>(this),
      .Prepare = DelegatePrepare,
      .CopyFromBufferHandle = nullptr,
      .CopyToBufferHandle = nullptr,
      .FreeBufferHandle = nullptr,
      .flags = kTfLiteDelegateFlagsNone,
  };

  TfLiteGpuDelegateOptionsV2 options_;
  int num_delegate_kernels_ = 0;

  friend class DelegateKernel;
};

// Represents the execution of a subset of nodes on GPU.
class DelegateKernel {
 public:
  explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
    ++delegate_->num_delegate_kernels_;
  }
  ~DelegateKernel() { --delegate_->num_delegate_kernels_; }

  absl::Status Prepare(TfLiteContext* context,
                       const TfLiteDelegateParams* delegate_params) {
    thread_id_prepare_ = std::this_thread::get_id();

    // Extract the TFLite delegate execution plan from the context and convert
    // it into a GraphFloat32.
    GraphFloat32 graph;
    std::vector<uint32_t> input_refs;
    std::vector<uint32_t> output_refs;
    RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph,
                                    &input_refs, &output_refs));

    std::unique_ptr<InferenceBuilder> builder;
    bool graph_is_destroyed;
    const int experimental_flags = delegate_->options().experimental_flags;
    if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
      RETURN_IF_ERROR(
          InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
    } else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
      RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
    } else {
      // By default, try OpenCL first and fall back to OpenGL if that fails.
      absl::Status status =
          InitializeOpenClApi(&graph, &builder, &graph_is_destroyed);
      if (!status.ok()) {
        TF_LITE_KERNEL_LOG(context, std::string(status.message()).c_str());
        TF_LITE_KERNEL_LOG(context, "Falling back to OpenGL");

        // The graph needs to be re-created because it was moved above.
        GraphFloat32 graph2;
        if (graph_is_destroyed) {
          RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph2,
                                          &input_refs, &output_refs));
        }
        RETURN_IF_ERROR(InitializeOpenGlApi(
            graph_is_destroyed ? &graph2 : &graph, &builder));
      }
    }

    // TFLite has not allocated tensors at this point, so collect the tensor
    // indices now and bind the actual input/output tensors later.
    input_indices_.reserve(input_refs.size());
    for (uint32_t tensor_index : input_refs) {
      const int64_t object_index = input_indices_.size();
      input_indices_.push_back(tensor_index);
      RETURN_IF_ERROR(
          builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
    }
    output_indices_.reserve(output_refs.size());
    for (uint32_t tensor_index : output_refs) {
      const int64_t object_index = output_indices_.size();
      output_indices_.push_back(tensor_index);
      RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
                                                  GetObjectDef(tensor_index)));
    }

    return builder->Build(&runner_);
  }

  // This directs the runtime to allocate memory for input/output temporary
  // tensors that require dequantization/quantization.
  absl::Status GetRequiredTemporaries(TfLiteContext* context, TfLiteNode* node,
                                      TfLiteIntArray** temporaries_array_ptr) {
    if (quant_conversion_map_.empty()) return absl::OkStatus();

    std::vector<int> temporary_tensors;
    for (auto index : input_indices_) {
      if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
        temporary_tensors.push_back(index);
      }
    }
    for (auto index : output_indices_) {
      if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
        temporary_tensors.push_back(index);
      }
    }
    *temporaries_array_ptr = TfLiteIntArrayCreate(temporary_tensors.size());
    for (int i = 0; i < temporary_tensors.size(); ++i) {
      (*temporaries_array_ptr)->data[i] = temporary_tensors[i];
    }
    return absl::OkStatus();
  }

  absl::Status Invoke(TfLiteContext* context) {
    if (thread_id_prepare_ != std::this_thread::get_id()) {
      TFLITE_LOG(tflite::TFLITE_LOG_WARNING,
                 "GpuDelegate invoke thread != prepare thread");
      if (enforce_same_thread_) {
        return absl::FailedPreconditionError(
            "GpuDelegate must run on the same thread where it was "
            "initialized.");
      }
    }

    const bool is_dequant_required = !quant_conversion_map_.empty();
    if (is_dequant_required) {
      RETURN_IF_ERROR(
          DequantizeInputs(context, input_indices_, quant_conversion_map_));
    }
    RETURN_IF_ERROR(SetInputsAndOutputs(context));
    RETURN_IF_ERROR(runner_->Run());
    if (is_dequant_required) {
      RETURN_IF_ERROR(
          QuantizeOutputs(context, output_indices_, quant_conversion_map_));
    }
    return absl::OkStatus();
  }

 private:
  absl::Status SetInputsAndOutputs(TfLiteContext* context) {
    for (int i = 0; i < input_indices_.size(); ++i) {
      RETURN_IF_ERROR(runner_->SetInputObject(
          i, GetTensorObject(input_indices_[i], context)));
    }
    for (int i = 0; i < output_indices_.size(); ++i) {
      RETURN_IF_ERROR(runner_->SetOutputObject(
          i, GetTensorObject(output_indices_[i], context)));
    }
    return absl::OkStatus();
  }

  ObjectDef GetObjectDef(int index) const {
    ObjectDef default_object_def;
    default_object_def.data_type = DataType::FLOAT32;
    default_object_def.data_layout = DataLayout::BHWC;
    default_object_def.object_type = ObjectType::CPU_MEMORY;
    default_object_def.user_provided = true;
    return default_object_def;
  }

  TensorObject GetTensorObject(int index, TfLiteContext* context) const {
    auto& tensor = context->tensors[index];
    return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
  }

 private:
  absl::Status InitializeGraph(TfLiteContext* context,
                               const TfLiteDelegateParams* delegate_params,
                               GraphFloat32* graph,
                               std::vector<uint32_t>* input_refs,
                               std::vector<uint32_t>* output_refs) {
    quant_conversion_map_.clear();
    if (delegate_->IsQuantOpsAllowed()) {
      RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
                                      &quant_conversion_map_));
    } else {
      RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph));
    }

    input_refs->clear();
    output_refs->clear();
    const auto inputs = graph->inputs();
    input_refs->reserve(inputs.size());
    for (const auto& input : inputs) {
      input_refs->push_back(input->tensor.ref);
    }
    const auto outputs = graph->outputs();
    output_refs->reserve(outputs.size());
    for (const auto& output : outputs) {
      output_refs->push_back(output->tensor.ref);
    }

    return absl::OkStatus();
  }

  absl::Status InitializeOpenClApi(GraphFloat32* graph,
                                   std::unique_ptr<InferenceBuilder>* builder,
                                   bool* graph_is_destroyed) {
    *graph_is_destroyed = false;
    cl::InferenceEnvironmentOptions env_options;
    cl::InferenceEnvironmentProperties properties;
    RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
                                                &properties));
    auto delegate_options = delegate_->options();
    cl::InferenceOptions options;
    // When is_precision_loss_allowed is -1 (unset), derive the behavior from
    // the inference priorities; otherwise honor the explicit setting.
    if (delegate_options.is_precision_loss_allowed == -1) {
      options.priority1 = ToPriority(delegate_options.inference_priority1);
      options.priority2 = ToPriority(delegate_options.inference_priority2);
      options.priority3 = ToPriority(delegate_options.inference_priority3);
    } else {
      // The user set is_precision_loss_allowed explicitly, so use it directly.
      if (delegate_options.is_precision_loss_allowed == 0) {
        options.priority1 = InferencePriority::MAX_PRECISION;
      } else {
        options.priority1 = InferencePriority::MIN_LATENCY;
      }
    }
    options.usage = ToUsage(delegate_options.inference_preference);
    *graph_is_destroyed = true;
    RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
        options, std::move(*graph), builder));
    TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                         "Initialized OpenCL-based API.");
    return absl::OkStatus();
  }

  absl::Status InitializeOpenGlApi(GraphFloat32* graph,
                                   std::unique_ptr<InferenceBuilder>* builder) {
#ifndef CL_DELEGATE_NO_GL
    gl::InferenceEnvironmentOptions env_options;
    gl::InferenceEnvironmentProperties properties;
    RETURN_IF_ERROR(
        NewInferenceEnvironment(env_options, &gl_environment_, &properties));
    auto delegate_options = delegate_->options();
    gl::InferenceOptions options;
    options.usage = ToUsage(delegate_options.inference_preference);
    options.priority1 = ToPriority(delegate_options.inference_priority1);
    options.priority2 = ToPriority(delegate_options.inference_priority2);
    options.priority3 = ToPriority(delegate_options.inference_priority3);
    RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
                                                         options, builder));
    enforce_same_thread_ = true;
    TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                         "Initialized OpenGL-based API.");
#endif
    return absl::OkStatus();
  }

  // The Delegate instance that's shared across all DelegateKernel instances.
  Delegate* const delegate_;  // Doesn't own the memory.
  std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
#ifndef CL_DELEGATE_NO_GL
  std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
#endif
  std::unique_ptr<InferenceRunner> runner_;
  std::vector<int64_t> input_indices_;
  std::vector<int64_t> output_indices_;
  // Whenever quantized inference is enabled, this maps the tensor index of
  // each originally quantized (8-bit) tensor to its float version added in
  // model_builder, and vice versa.
  absl::flat_hash_map<int, int> quant_conversion_map_;
  std::thread::id thread_id_prepare_;  // Thread id used for Prepare().
  bool enforce_same_thread_ = false;   // Enforce the same thread for Invoke().
};

inline DelegateKernel* GetDelegateKernel(TfLiteNode* node) {
  return reinterpret_cast<DelegateKernel*>(node->user_data);
}

inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
  return reinterpret_cast<Delegate*>(delegate->data_);
}

TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
  const TfLiteRegistration kRegistration = {
      // .init
      [](TfLiteContext* context, const char* buffer, size_t) -> void* {
        const auto* params =
            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
        auto* gpu_delegate = GetDelegate(params->delegate);
        // Everything below should ideally happen in the .prepare callback,
        // but TFLite does not allow that.
        auto gpu_delegate_kernel =
            absl::make_unique<DelegateKernel>(gpu_delegate);
        const auto status = gpu_delegate_kernel->Prepare(context, params);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Init: %s",
                             std::string(status.message()).c_str());
          return nullptr;
        }
        return gpu_delegate_kernel.release();
      },
      // .free
      [](TfLiteContext*, void* buffer) -> void {
        delete reinterpret_cast<DelegateKernel*>(buffer);
      },
      // .prepare
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        if (!node->user_data) {
          TF_LITE_KERNEL_LOG(
              context,
              "TfLiteGpuDelegate Prepare: delegate is not initialized");
          return kTfLiteError;
        }
        auto* gpu_delegate_kernel = GetDelegateKernel(node);
        const auto status = gpu_delegate_kernel->GetRequiredTemporaries(
            context, node, &node->temporaries);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
                             std::string(status.message()).c_str());
          return kTfLiteError;
        }
        // TODO(akulik): tflite tensors are not allocated here either. It would
        // be good to set inputs and outputs only once here instead of setting
        // them every time in .invoke.
        return kTfLiteOk;
      },
      // .invoke
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        const auto status = GetDelegateKernel(node)->Invoke(context);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
                             std::string(status.message()).c_str());
          return kTfLiteError;
        }
        return kTfLiteOk;
      },
      nullptr,                // .profiling_string
      0,                      // .builtin_code
      "TfLiteGpuDelegateV2",  // .custom_name
      1,                      // .version
  };

  auto* gpu_delegate = GetDelegate(delegate);
  TfLiteIntArray* ops_to_replace =
      GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
                      gpu_delegate->MaxDelegatedPartitions());
  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context, kRegistration, ops_to_replace, delegate);
  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
                  gpu_delegate->num_delegate_kernels());
  TfLiteIntArrayFree(ops_to_replace);
  return status;
}

}  // namespace
}  // namespace gpu
}  // namespace tflite

TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
  TfLiteGpuDelegateOptionsV2 options = {
      // Set it to -1 to detect whether it was later adjusted.
      .is_precision_loss_allowed = -1,
      .inference_preference =
          TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
      .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
      .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
      .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
      .max_delegated_partitions = 1,
  };
  return options;
}
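
// Minimal usage sketch for the public entry points below (illustrative only;
// it assumes an already built tflite::Interpreter named `interpreter`, which
// is not part of this file):
//
//   TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
//   TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
//   if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
//     // Handle the error; execution stays on the default (CPU) kernels.
//   }
//   // ... run inference via interpreter->Invoke() ...
//   TfLiteGpuDelegateV2Delete(delegate);  // After the interpreter is gone.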
TfLiteDelegate* TfLiteGpuDelegateV2Create(
    const TfLiteGpuDelegateOptionsV2* options) {
  auto* gpu_delegate = new tflite::gpu::Delegate(options);
  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                       "Created TensorFlow Lite delegate for GPU.");
  return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}

void TfLiteGpuDelegateV2Delete(TfLiteDelegate* delegate) {
  delete tflite::gpu::GetDelegate(delegate);
}