From 042f6923feb2a2bad2074931cd10ef0dad63787e Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Tue, 17 Nov 2020 16:45:27 -0800
Subject: [PATCH] Added support of Const tensors in OpenCL backend.

PiperOrigin-RevId: 342973472
Change-Id: I38932dda93dfb729cf4f14ff6310da57ad9b1e06
---
 .../delegates/gpu/cl/inference_context.cc     |  57 ++++++---
 .../lite/delegates/gpu/cl/inference_context.h |   9 +-
 .../lite/delegates/gpu/cl/serialization.cc    |  17 ++-
 .../lite/delegates/gpu/cl/serialization.fbs   |   1 +
 .../gpu/cl/serialization_generated.h          | 109 +++++++++++------
 .../delegates/gpu/common/task/tensor_desc.cc  |   6 +
 .../delegates/gpu/common/task/tensor_desc.h   |   1 +
 7 files changed, 141 insertions(+), 59 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index eb25cada84b..332de066bca 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -212,9 +212,7 @@ absl::Status InferenceContext::InitFromGraph(
       node.cl_operation.MoveObjectRefsFromGenericToCL();
     }
   }
-  for (auto& node : nodes_) {
-    node.cl_operation.GetGpuOperation().args_.ReleaseCPURepresentation();
-  }
+  ReleaseCPURepresentation();
   return absl::OkStatus();
 }
 
@@ -240,9 +238,7 @@ absl::Status InferenceContext::RestoreDeserialized(
     RETURN_IF_ERROR(node.cl_operation.CompileDeserialized(creation_context));
   }
   RETURN_IF_ERROR(UpdateParams());
-  for (auto& node : nodes_) {
-    node.cl_operation.GetGpuOperation().args_.ReleaseCPURepresentation();
-  }
+  ReleaseCPURepresentation();
   return absl::OkStatus();
 }
 
@@ -329,6 +325,16 @@ absl::Status InferenceContext::ConvertOperations(const GpuInfo& gpu_info,
     if (consumed_nodes.find(node.id) != consumed_nodes.end()) {
       continue;
     }
+    auto op_type = OperationTypeFromString(node.operation.type);
+    if (op_type == OperationType::CONST) {
+      auto attr =
+          absl::any_cast<ConstTensorAttributes>(node.operation.attributes);
+      auto outputs = graph.FindOutputs(node.id);
+      const_tensors_descs_[outputs[0]->id] =
+          tensor_reserver_.Get(outputs[0]->id).descriptor;
+      const_tensors_descs_[outputs[0]->id].UploadData(attr.tensor);
+      continue;
+    }
     std::string op_name = node.operation.type + " " + std::to_string(node.id);
     GPUOperationsSubgraph gpu_subgraph;
     if (hints.Check(ModelHints::kAllowSpecialKernels) &&
@@ -481,22 +487,34 @@ void InferenceContext::GetUsages(const std::function<bool(ValueId)>& functor,
 
 InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType(
     ValueId id) {
-  if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
-    return TensorMemoryType::VARIABLE;
+  if (const_tensors_.find(id) != const_tensors_.end()) {
+    return TensorMemoryType::kConst;
+  } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
+    return TensorMemoryType::kVariable;
   } else if (IsBufferBased(tensor_reserver_.Get(id).descriptor.storage_type)) {
-    return TensorMemoryType::BUFFER;
+    return TensorMemoryType::kBuffer;
   } else {
-    return TensorMemoryType::STRONG_SHAPE;
+    return TensorMemoryType::kStrongShape;
   }
 }
 
 absl::Status InferenceContext::AllocateMemory(CLContext* context) {
+  RETURN_IF_ERROR(AllocateMemoryForConstTensors(context));
   RETURN_IF_ERROR(AllocateMemoryForVariableTensors(context));
   RETURN_IF_ERROR(AllocateMemoryForBuffers(context));
   RETURN_IF_ERROR(AllocateMemoryForStrongShapes(context));
   return absl::OkStatus();
 }
 
+absl::Status InferenceContext::AllocateMemoryForConstTensors(
+    CLContext* context) {
+  for (auto& description : const_tensors_descs_) {
+    RETURN_IF_ERROR(const_tensors_[description.first].CreateFromDescriptor(
+        description.second, context));
+  }
+  return absl::OkStatus();
+}
+
 absl::Status InferenceContext::AllocateMemoryForVariableTensors(
     CLContext* context) {
   std::map<ValueId, int> ref_value_to_tensor_index;
@@ -520,7 +538,7 @@ absl::Status InferenceContext::AllocateMemoryForBuffers(CLContext* context) {
   std::map<ValueId, int2> buffer_usages;
   GetUsages(
       [this](ValueId id) {
-        return GetTensorMemoryType(id) == TensorMemoryType::BUFFER;
+        return GetTensorMemoryType(id) == TensorMemoryType::kBuffer;
       },
       &buffer_usages);
 
@@ -555,7 +573,7 @@ absl::Status InferenceContext::AllocateMemoryForBuffers(CLContext* context) {
   for (auto& node : nodes_) {
     auto tensors = GetCLNodeTensors(node);
     for (auto& t : tensors) {
-      if (GetTensorMemoryType(t.first) != TensorMemoryType::BUFFER) continue;
+      if (GetTensorMemoryType(t.first) != TensorMemoryType::kBuffer) continue;
       const int tensor_index = graph_ids_to_shared_buffer_tensors_[t.first];
       if (created_tensors[tensor_index]) continue;
       const auto& shape = tensor_reserver_.Get(t.first).shape;
@@ -574,7 +592,7 @@ absl::Status InferenceContext::AllocateMemoryForStrongShapes(
   std::map<ValueId, int2> usages;
   GetUsages(
       [this](ValueId id) {
-        return GetTensorMemoryType(id) == TensorMemoryType::STRONG_SHAPE;
+        return GetTensorMemoryType(id) == TensorMemoryType::kStrongShape;
       },
       &usages);
 
@@ -594,7 +612,7 @@ absl::Status InferenceContext::AllocateMemoryForStrongShapes(
   for (auto& node : nodes_) {
     auto tensors = GetCLNodeTensors(node);
     for (auto& t : tensors) {
-      if (GetTensorMemoryType(t.first) != TensorMemoryType::STRONG_SHAPE) {
+      if (GetTensorMemoryType(t.first) != TensorMemoryType::kStrongShape) {
         continue;
       }
       const auto& shape = tensor_reserver_.Get(t.first).shape;
@@ -696,7 +714,9 @@ uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
 }
 
 Tensor* InferenceContext::GetTensor(ValueId id) {
-  if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
+  if (const_tensors_.find(id) != const_tensors_.end()) {
+    return &const_tensors_[id];
+  } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
     return &variable_tensors_[variable_ids_and_refs_[id]];
   } else if (graph_ids_to_shared_buffer_tensors_.find(id) !=
              graph_ids_to_shared_buffer_tensors_.end()) {
@@ -724,6 +744,13 @@ absl::Status InferenceContext::GetOutputTensor(ValueId id,
   return gpu_tensor.ReadData(queue, result);
 }
 
+void InferenceContext::ReleaseCPURepresentation() {
+  for (auto& node : nodes_) {
+    node.cl_operation.GetGpuOperation().args_.ReleaseCPURepresentation();
+  }
+  const_tensors_descs_.clear();
+}
+
 absl::Status RunGraphTransforms(GraphFloat32* graph) {
   auto merge_padding_transform = NewMergePaddingWithAdd();
   auto add_bias_transform = NewAddBias();
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.h b/tensorflow/lite/delegates/gpu/cl/inference_context.h
index d2b6248aef3..0c0565e3ff6 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.h
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.h
@@ -102,7 +102,7 @@ class InferenceContext {
       const absl::Span<const uint8_t> serialized_model, Environment* env);
 
  private:
-  enum TensorMemoryType { STRONG_SHAPE = 0, BUFFER = 1, VARIABLE = 2 };
+  enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst };
 
   friend flatbuffers::Offset<data::InferenceContext> Encode(
       const InferenceContext& inference,
@@ -119,6 +119,8 @@ class InferenceContext {
   absl::Status Merge();
   absl::Status AllocateMemory(CLContext* context);
 
+  absl::Status AllocateMemoryForConstTensors(CLContext* context);
+
   absl::Status AllocateMemoryForVariableTensors(CLContext* context);
 
   absl::Status AllocateMemoryForBuffers(CLContext* context);
@@ -137,6 +139,8 @@ class InferenceContext {
       ProfilingCommandQueue* profiling_queue);
   absl::Status UpdateParams();
 
+  void ReleaseCPURepresentation();
+
   // performance hacks
   bool need_flush_ = false;
 
@@ -213,6 +217,9 @@ class InferenceContext {
   };
   TensorReserver tensor_reserver_;
 
+  absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
+  std::map<ValueId, Tensor> const_tensors_;
+
   std::map<ValueId, Tensor> variable_tensors_;
   std::vector<Buffer> shared_buffers_;
   std::vector<Tensor>
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.cc b/tensorflow/lite/delegates/gpu/cl/serialization.cc
index a7a7473c5f0..f8207a28ab8 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/serialization.cc
@@ -944,12 +944,19 @@ flatbuffers::Offset<data::InferenceContext> Encode(
 
   std::vector<flatbuffers::Offset<data::TensorDescWithId>> tensors_fb;
   auto tensors = inference.tensor_reserver_.GetTensorDescs();
-  for (auto& tensor : tensors) {
+  for (const auto& tensor : tensors) {
     auto tensor_fb = Encode(tensor.second, tensor.first, builder);
     tensors_fb.push_back(tensor_fb);
   }
   auto tensors_fb_vec = builder->CreateVector(tensors_fb);
 
+  std::vector<flatbuffers::Offset<data::TensorDescWithId>> const_tensors_fb;
+  for (const auto& tensor : inference.const_tensors_descs_) {
+    auto tensor_fb = Encode(tensor.second, tensor.first, builder);
+    const_tensors_fb.push_back(tensor_fb);
+  }
+  auto const_tensors_fb_vec = builder->CreateVector(const_tensors_fb);
+
   std::vector<flatbuffers::Offset<data::PairOfValueIds>>
       variable_ids_and_refs_fb;
   for (auto& pair : inference.variable_ids_and_refs_) {
@@ -970,6 +977,7 @@ flatbuffers::Offset<data::InferenceContext> Encode(
   inf_builder.add_storage_type(tflite::gpu::ToFB(inference.storage_type_));
   inf_builder.add_nodes(nodes_fb_vec);
   inf_builder.add_tensors(tensors_fb_vec);
+  inf_builder.add_const_tensors(const_tensors_fb_vec);
   inf_builder.add_input_ids(in_ids_fb);
   inf_builder.add_output_ids(out_ids_fb);
   inf_builder.add_variable_ids_and_refs(variable_ids_and_refs_fb_vec);
@@ -995,12 +1003,17 @@ absl::Status Decode(const data::InferenceContext* fb_inference,
   }
 
   std::vector<std::pair<ValueId, TensorDescriptor>> tensors;
-  for (auto tensor_fb : *fb_inference->tensors()) {
+  for (const auto& tensor_fb : *fb_inference->tensors()) {
     TensorDescriptor desc;
     Decode(tensor_fb->desc(), &desc);
     tensors.push_back({tensor_fb->id(), std::move(desc)});
   }
   inference->tensor_reserver_.Add(tensors);
+  for (const auto& tensor_fb : *fb_inference->const_tensors()) {
+    TensorDescriptor desc;
+    Decode(tensor_fb->desc(), &desc);
+    inference->const_tensors_descs_[tensor_fb->id()] = std::move(desc);
+  }
   for (auto in_fb : *fb_inference->input_ids()) {
     inference->input_ids_.push_back(in_fb);
   }
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.fbs b/tensorflow/lite/delegates/gpu/cl/serialization.fbs
index ecb2c3fa4a6..9d5cf5ed783 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization.fbs
+++ b/tensorflow/lite/delegates/gpu/cl/serialization.fbs
@@ -95,6 +95,7 @@ table InferenceContext {
   storage_type:tflite.gpu.data.TensorStorageType;
   nodes:[CLNode];
   tensors:[TensorDescWithId];
+  const_tensors:[TensorDescWithId];
  input_ids:[int32];
   variable_ids_and_refs:[PairOfValueIds];
   output_ids:[int32];
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization_generated.h b/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
index c564954e867..a3bc04e12ca 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
+++ b/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
@@ -748,11 +748,12 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_STORAGE_TYPE = 14,
     VT_NODES = 16,
     VT_TENSORS = 18,
-    VT_INPUT_IDS = 20,
-    VT_VARIABLE_IDS_AND_REFS = 22,
-    VT_OUTPUT_IDS = 24,
-    VT_INPUT_REFS = 26,
-    VT_OUTPUT_REFS = 28
+    VT_CONST_TENSORS = 20,
+    VT_INPUT_IDS = 22,
+    VT_VARIABLE_IDS_AND_REFS = 24,
+    VT_OUTPUT_IDS = 26,
+    VT_INPUT_REFS = 28,
+    VT_OUTPUT_REFS = 30
   };
   bool need_flush() const {
     return GetField<uint8_t>(VT_NEED_FLUSH, 0) != 0;
@@ -778,6 +779,13 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *tensors() const {
     return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *>(VT_TENSORS);
   }
+  const flatbuffers::Vector<
+      flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>
+      *const_tensors() const {
+    return GetPointer<const flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *>(
+        VT_CONST_TENSORS);
+  }
   const flatbuffers::Vector<int32_t> *input_ids() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUT_IDS);
   }
@@ -801,12 +809,14 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<uint8_t>(verifier, VT_NEED_MANUAL_RELEASE) &&
           VerifyField<int8_t>(verifier, VT_PRECISION) &&
            VerifyField<int8_t>(verifier, VT_STORAGE_TYPE) &&
-           VerifyOffset(verifier, VT_NODES) &&
-           verifier.VerifyVector(nodes()) &&
+           VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) &&
           verifier.VerifyVectorOfTables(nodes()) &&
           VerifyOffset(verifier, VT_TENSORS) &&
            verifier.VerifyVector(tensors()) &&
            verifier.VerifyVectorOfTables(tensors()) &&
+           VerifyOffset(verifier, VT_CONST_TENSORS) &&
+           verifier.VerifyVector(const_tensors()) &&
+           verifier.VerifyVectorOfTables(const_tensors()) &&
           VerifyOffset(verifier, VT_INPUT_IDS) &&
           verifier.VerifyVector(input_ids()) &&
           VerifyOffset(verifier, VT_VARIABLE_IDS_AND_REFS) &&
@@ -817,8 +827,7 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
           VerifyOffset(verifier, VT_INPUT_REFS) &&
           verifier.VerifyVector(input_refs()) &&
           VerifyOffset(verifier, VT_OUTPUT_REFS) &&
-           verifier.VerifyVector(output_refs()) &&
-           verifier.EndTable();
+           verifier.VerifyVector(output_refs()) && verifier.EndTable();
   }
 };
 
@@ -850,6 +859,12 @@ struct InferenceContextBuilder {
   void add_tensors(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>> tensors) {
     fbb_.AddOffset(InferenceContext::VT_TENSORS, tensors);
   }
+  void add_const_tensors(
+      flatbuffers::Offset<flatbuffers::Vector<
+          flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>>
+          const_tensors) {
+    fbb_.AddOffset(InferenceContext::VT_CONST_TENSORS, const_tensors);
+  }
   void add_input_ids(flatbuffers::Offset<flatbuffers::Vector<int32_t>> input_ids) {
     fbb_.AddOffset(InferenceContext::VT_INPUT_IDS, input_ids);
   }
@@ -877,17 +892,26 @@ struct InferenceContextBuilder {
 };
 
 inline flatbuffers::Offset<InferenceContext> CreateInferenceContext(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    bool need_flush = false,
-    bool flush_periodically = false,
-    int32_t flush_period = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, bool need_flush = false,
+    bool flush_periodically = false, int32_t flush_period = 0,
     bool need_manual_release = false,
-    tflite::gpu::cl::data::CalculationsPrecision precision = tflite::gpu::cl::data::CalculationsPrecision::F32,
-    tflite::gpu::data::TensorStorageType storage_type = tflite::gpu::data::TensorStorageType::UNKNOWN,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>> nodes = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>> tensors = 0,
+    tflite::gpu::cl::data::CalculationsPrecision precision =
+        tflite::gpu::cl::data::CalculationsPrecision::F32,
+    tflite::gpu::data::TensorStorageType storage_type =
+        tflite::gpu::data::TensorStorageType::UNKNOWN,
+    flatbuffers::Offset<
+        flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>>
+        nodes = 0,
+    flatbuffers::Offset<flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>>
+        tensors = 0,
+    flatbuffers::Offset<flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>>
+        const_tensors = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> input_ids = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::PairOfValueIds>>> variable_ids_and_refs = 0,
+    flatbuffers::Offset<flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::PairOfValueIds>>>
+        variable_ids_and_refs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> output_ids = 0,
     flatbuffers::Offset<flatbuffers::Vector<int64_t>> input_refs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int64_t>> output_refs = 0) {
@@ -897,6 +921,7 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContext(
   builder_.add_output_ids(output_ids);
   builder_.add_variable_ids_and_refs(variable_ids_and_refs);
   builder_.add_input_ids(input_ids);
+  builder_.add_const_tensors(const_tensors);
   builder_.add_tensors(tensors);
   builder_.add_nodes(nodes);
   builder_.add_flush_period(flush_period);
@@ -909,42 +934,44 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContext(
 }
 
 inline flatbuffers::Offset<InferenceContext> CreateInferenceContextDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    bool need_flush = false,
-    bool flush_periodically = false,
-    int32_t flush_period = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, bool need_flush = false,
+    bool flush_periodically = false, int32_t flush_period = 0,
     bool need_manual_release = false,
-    tflite::gpu::cl::data::CalculationsPrecision precision = tflite::gpu::cl::data::CalculationsPrecision::F32,
-    tflite::gpu::data::TensorStorageType storage_type = tflite::gpu::data::TensorStorageType::UNKNOWN,
-    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>> *nodes = nullptr,
-    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *tensors = nullptr,
+    tflite::gpu::cl::data::CalculationsPrecision precision =
+        tflite::gpu::cl::data::CalculationsPrecision::F32,
+    tflite::gpu::data::TensorStorageType storage_type =
+        tflite::gpu::data::TensorStorageType::UNKNOWN,
+    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>
+        *nodes = nullptr,
+    const std::vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *tensors =
+        nullptr,
+    const std::vector<flatbuffers::Offset<
+        tflite::gpu::cl::data::TensorDescWithId>> *const_tensors = nullptr,
     const std::vector<int32_t> *input_ids = nullptr,
-    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::PairOfValueIds>> *variable_ids_and_refs = nullptr,
+    const std::vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::PairOfValueIds>>
+        *variable_ids_and_refs = nullptr,
     const std::vector<int32_t> *output_ids = nullptr,
     const std::vector<int64_t> *input_refs = nullptr,
     const std::vector<int64_t> *output_refs = nullptr) {
   auto nodes__ = nodes ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>(*nodes) : 0;
   auto tensors__ = tensors ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>(*tensors) : 0;
+  auto const_tensors__ =
+      const_tensors
+          ? _fbb.CreateVector<
+                flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>(
+                *const_tensors)
+          : 0;
   auto input_ids__ = input_ids ? _fbb.CreateVector<int32_t>(*input_ids) : 0;
   auto variable_ids_and_refs__ = variable_ids_and_refs ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::cl::data::PairOfValueIds>>(*variable_ids_and_refs) : 0;
   auto output_ids__ = output_ids ? _fbb.CreateVector<int32_t>(*output_ids) : 0;
   auto input_refs__ = input_refs ? _fbb.CreateVector<int64_t>(*input_refs) : 0;
   auto output_refs__ = output_refs ? _fbb.CreateVector<int64_t>(*output_refs) : 0;
-  return tflite::gpu::cl::data::CreateInferenceContext(
-      _fbb,
-      need_flush,
-      flush_periodically,
-      flush_period,
-      need_manual_release,
-      precision,
-      storage_type,
-      nodes__,
-      tensors__,
-      input_ids__,
-      variable_ids_and_refs__,
-      output_ids__,
-      input_refs__,
-      output_refs__);
+  return tflite::gpu::cl::data::CreateInferenceContext(
+      _fbb, need_flush, flush_periodically, flush_period, need_manual_release,
+      precision, storage_type, nodes__, tensors__, const_tensors__, input_ids__,
+      variable_ids_and_refs__, output_ids__, input_refs__, output_refs__);
 }
 
 inline const tflite::gpu::cl::data::InferenceContext *GetInferenceContext(const void *buf) {
diff --git a/tensorflow/lite/delegates/gpu/common/task/tensor_desc.cc b/tensorflow/lite/delegates/gpu/common/task/tensor_desc.cc
index 7b5258d985c..3c02221c5cf 100644
--- a/tensorflow/lite/delegates/gpu/common/task/tensor_desc.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/tensor_desc.cc
@@ -744,6 +744,12 @@ AddressMode TensorDescriptor::AddressModeFromState() const {
   }
 }
 
+void TensorDescriptor::UploadData(
+    const tflite::gpu::Tensor<BHWC, DataType::FLOAT32>& src) {
+  shape = BHWDC(src.shape.b, src.shape.h, src.shape.w, 1, src.shape.c);
+  UploadData(absl::MakeConstSpan(src.data));
+}
+
 void TensorDescriptor::UploadData(
     const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src) {
   shape = BHWDC(1, src.shape.h, src.shape.w, 1, src.shape.c);
diff --git a/tensorflow/lite/delegates/gpu/common/task/tensor_desc.h b/tensorflow/lite/delegates/gpu/common/task/tensor_desc.h
index bdf3b1f692a..8f339ba64ae 100644
--- a/tensorflow/lite/delegates/gpu/common/task/tensor_desc.h
+++ b/tensorflow/lite/delegates/gpu/common/task/tensor_desc.h
@@ -75,6 +75,7 @@ struct TensorDescriptor : public GPUObjectDescriptor {
       const std::vector<std::string>& args, std::string* value_name,
       std::string* x_coord, std::string* y_coord, std::string* s_coord) const;
 
+  void UploadData(const tflite::gpu::Tensor<BHWC, DataType::FLOAT32>& src);
   void UploadData(const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src);
   void UploadData(const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src);
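
Note on the design, for readers of this patch: the const-tensor support is a
three-phase lifecycle. ConvertOperations folds each CONST node into a
TensorDescriptor via UploadData instead of emitting an operation,
AllocateMemoryForConstTensors later materializes a cl::Tensor from each stored
descriptor via CreateFromDescriptor, and ReleaseCPURepresentation drops the
host-side copies once everything is resident on the GPU. The standalone C++
sketch below mirrors that flow; all types in it (HostTensor, Descriptor,
GpuTensor) and the value id 7 are invented stand-ins for illustration, not the
TFLite GPU API.

// Illustrative sketch only, not part of the patch. HostTensor, Descriptor
// and GpuTensor stand in for tflite::gpu::Tensor, TensorDescriptor and
// cl::Tensor respectively.
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

using ValueId = uint32_t;

struct HostTensor {  // stand-in for tflite::gpu::Tensor<BHWC, FLOAT32>
  std::vector<float> data;
};

struct Descriptor {  // stand-in for TensorDescriptor
  std::vector<float> cpu_data;
  void UploadData(const HostTensor& src) { cpu_data = src.data; }
};

struct GpuTensor {  // stand-in for cl::Tensor
  std::vector<float> device_data;  // pretend device allocation
  bool CreateFromDescriptor(const Descriptor& desc) {
    device_data = desc.cpu_data;  // "host to device" copy
    return true;
  }
};

int main() {
  // 1) ConvertOperations: a CONST node becomes no operation at all; its
  //    payload is uploaded into a descriptor keyed by the output value id.
  std::map<ValueId, Descriptor> const_tensors_descs;
  HostTensor weights{{1.0f, 2.0f, 3.0f}};
  const_tensors_descs[7].UploadData(weights);

  // 2) AllocateMemoryForConstTensors: materialize one GPU tensor per
  //    descriptor, before variable/buffer/strong-shape allocation runs.
  std::map<ValueId, GpuTensor> const_tensors;
  for (auto& pair : const_tensors_descs) {
    if (!const_tensors[pair.first].CreateFromDescriptor(pair.second)) return 1;
  }

  // 3) ReleaseCPURepresentation: the host copies are no longer needed.
  const_tensors_descs.clear();

  // GetTensor(id) would now find id 7 in const_tensors first.
  std::cout << const_tensors[7].device_data.size() << " floats resident\n";
  return 0;
}

Keeping const_tensors_descs_ (descriptors carrying data) separate from
const_tensors_ (GPU objects) is also what makes serialization work here:
Encode writes only the descriptors into the new const_tensors field, and
after Decode the same AllocateMemoryForConstTensors path recreates the GPU
tensors on the target device.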