From 0d882ea469dcffc62f32b9e981e4602fb3b3c43a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 11 Dec 2020 17:01:08 -0800 Subject: [PATCH] Allow state tensors to use device memories in NNAPI delegate. PiperOrigin-RevId: 347102826 Change-Id: I9059612fd8019284416a814d240452b1f8f77b86 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 449 +++++------------- .../lite/delegates/nnapi/nnapi_delegate.h | 57 --- .../delegates/nnapi/nnapi_delegate_kernel.h | 29 -- .../delegates/nnapi/nnapi_delegate_test.cc | 143 ++---- 4 files changed, 149 insertions(+), 529 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index a73b44bfcbd..89846501789 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -449,78 +449,6 @@ ANeuralNetworksOperandType ConvertTensorTypeToNNType( return nn_operand_type; } -// Copy the CPU buffer of the input tensor to a shared memory address. Will -// apply data type conversion if needed. The returned tensor_size is the size -// after the potential data type conversion. -TfLiteStatus CopyOrConvertInputData(TfLiteContext* context, - TfLiteType ann_type_equivalent, - bool use_int8_asymm_signed, - TfLiteTensor* tensor, uint8_t* dst, - int* tensor_size) { - if (ann_type_equivalent != kTfLiteNoType) { - const auto num_elements = NumElements(tensor); - if (tensor->type == kTfLiteUInt8 && ann_type_equivalent == kTfLiteInt32) { - for (int i = 0; i < num_elements; ++i) { - reinterpret_cast(dst)[i] = - static_cast(tensor->data.uint8[i]); - } - } else if (tensor->type == kTfLiteInt8 && - ann_type_equivalent == kTfLiteUInt8) { - // Explicitly convert int8 values to uint8 values. 
- for (int i = 0; i < num_elements; ++i) { - dst[i] = static_cast( - static_cast(tensor->data.int8[i]) + 128); - } - } else if (tensor->type == kTfLiteInt8 && - ann_type_equivalent == kTfLiteInt32) { - if (use_int8_asymm_signed) { - for (int i = 0; i < num_elements; ++i) { - reinterpret_cast(dst)[i] = - static_cast(tensor->data.int8[i]); - } - } else { - for (int i = 0; i < num_elements; ++i) { - reinterpret_cast(dst)[i] = - static_cast(tensor->data.int8[i]) + 128; - } - } - } else { - TF_LITE_KERNEL_LOG( - context, - "NN API Delegate: unsupported tensor types conversion: " - "from type code %d to type code %d.\n", - tensor->type, ann_type_equivalent); - return kTfLiteError; - } - size_t type_size; - TF_LITE_ENSURE_OK(context, - GetSizeOfType(context, ann_type_equivalent, &type_size)); - *tensor_size = NumElements(tensor) * type_size; - } else { - // copy data to pre-allocated shared memory. - memcpy(dst, tensor->data.raw, tensor->bytes); - *tensor_size = tensor->bytes; - } - return kTfLiteOk; -} - -// Copy into the CPU buffer of the output tensor from a shared memory address. -// Will apply data type conversion if needed. -TfLiteStatus CopyOrConvertOutputData(TfLiteType ann_type_equivalent, - const uint8_t* src, TfLiteTensor* tensor) { - if (tensor->type == kTfLiteInt8 && ann_type_equivalent == kTfLiteUInt8) { - // Explicitly convert uint8 values to int8 values. 
- int8_t* output_ptr = tensor->data.int8; - const auto num_elements = NumElements(tensor); - for (int i = 0; i < num_elements; ++i) { - output_ptr[i] = static_cast(static_cast(src[i]) - 128); - } - } else { - memcpy(tensor->data.raw, src, tensor->bytes); - } - return kTfLiteOk; -} - constexpr size_t kDefaultByteAlignmentForNNAPI = 16; static size_t getNumPaddingBytes(size_t byte_size) { @@ -3714,7 +3642,6 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, return kTfLiteOk; } - const auto& delegate_data = StatefulNnApiDelegate::GetData(node->delegate); ANeuralNetworksCompilation* compilation = nullptr; if (!nnapi_devices_.empty()) { // Compile for the selected accelerator. @@ -3782,67 +3709,6 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, } RETURN_TFLITE_ERROR_IF_NN_ERROR(context, finish_result, "completing NNAPI compilation", nnapi_errno); - - const bool use_device_memory_for_state_tensors = - nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13 && - delegate_data.use_device_memory_for_state_tensors && - delegate_data.single_partition_delegated && - // State tensors with dynamic shapes are currently not supported. 
- std::all_of(model_state_tfl_inputs_.begin(), - model_state_tfl_inputs_.end(), [&context](int tfl_index) { - TfLiteTensor* tensor = &context->tensors[tfl_index]; - return !IsDynamicTensor(tensor); - }); - if (use_device_memory_for_state_tensors) { - for (int tfl_index : model_state_tfl_inputs_) { - auto& info = nn_state_tensor_info_map_.at(tfl_index); - - // prepare device memory descriptor - ANeuralNetworksMemoryDesc* desc = nullptr; - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, nnapi_->ANeuralNetworksMemoryDesc_create(&desc), - "creating device memory descriptor", nnapi_errno); - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksMemoryDesc_addInputRole( - desc, compilation, info.nn_input_index, 1.0f), - "adding input role to the device memory descriptor", nnapi_errno); - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksMemoryDesc_addOutputRole( - desc, compilation, info.nn_output_index, 1.0f), - "adding output role to the device memory descriptor", nnapi_errno); - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, nnapi_->ANeuralNetworksMemoryDesc_finish(desc), - "finishing device memory descriptor", nnapi_errno); - - // allocate two device memories for each state tensor - ANeuralNetworksMemory* state_input_memory = nullptr; - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksMemory_createFromDesc(desc, - &state_input_memory), - "creating input device memory from the descriptor", nnapi_errno); - info.nn_input_memory_handle.reset(state_input_memory); - - ANeuralNetworksMemory* state_output_memory = nullptr; - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksMemory_createFromDesc(desc, - &state_output_memory), - "creating output device memory from the descriptor", nnapi_errno); - info.nn_output_memory_handle.reset(state_output_memory); - nnapi_->ANeuralNetworksMemoryDesc_free(desc); - - // we need a temporary buffer to sync states to raw pointers - TfLiteTensor* tensor = 
&context->tensors[tfl_index]; - if (tensor->buffer_handle == kTfLiteNullBufferHandle) { - info.nn_temp_buffer.reset( - new NNMemory(nnapi_, "temp state tensor", info.tensor_size)); - } - } - } - nn_compilation_.reset(compilation); return kTfLiteOk; @@ -3904,7 +3770,6 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, // Set compilation timeout if applicable. const auto delegate_options = StatefulNnApiDelegate::GetOptions(node->delegate); - const auto& delegate_data = StatefulNnApiDelegate::GetData(node->delegate); if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13) { if (delegate_options.max_execution_timeout_duration_ns > 0) { RETURN_TFLITE_ERROR_IF_NN_ERROR( @@ -3970,24 +3835,14 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, } } - const bool use_device_memory_for_state_tensors = - nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13 && - delegate_data.use_device_memory_for_state_tensors && - // TODO(b/174612931): Even if the model is not fully supported, we can - // still use device memories for state tensors if they are only used in - // one single partition. - delegate_data.single_partition_delegated && - std::all_of(model_state_tfl_inputs_.begin(), - model_state_tfl_inputs_.end(), [&context](int tfl_index) { - TfLiteTensor* tensor = &context->tensors[tfl_index]; - return !IsDynamicTensor(tensor); - }); - // Set the input tensor buffers. Note: we access tflite tensors using // absolute indices but NN api indices inputs by relative indices. 
int relative_input_index = 0; - size_t input_offset_accumulator = 0; + const bool use_int8_asymm_signed = + target_sdk_version_ >= kMinSdkVersionForNNAPI13; + + size_t input_offset = 0; for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) { if (absolute_input_index == kTfLiteOptionalTensor) { continue; @@ -4005,58 +3860,90 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, input_nn_operand_type_ptr = &input_nn_operand_type; } if (tensor->allocation_type != kTfLiteMmapRo) { - ANeuralNetworksMemory* input_memory_handle = nullptr; - uint32_t input_offset = 0; - uint32_t input_length = 0; - const bool is_state_tensor = - nn_state_tensor_info_map_.count(absolute_input_index) > 0; - if (is_state_tensor && use_device_memory_for_state_tensors && - // If the client requests to sync states to device, we will use the - // shared memory directly as input instead of explicitly copying into - // the device memory. - !delegate_data.sync_states_to_device) { - const auto& state_tensor_info = - nn_state_tensor_info_map_.at(absolute_input_index); - input_memory_handle = state_tensor_info.nn_input_memory_handle.get(); - input_offset = 0; - input_length = 0; - } else if (tensor->buffer_handle != kTfLiteNullBufferHandle && - tensor->buffer_handle < tensor_memory_map_->size()) { - input_memory_handle = - tensor_memory_map_->at(tensor->buffer_handle).memory; - input_offset = 0; - input_length = tensor->bytes; - } else { - int tensor_size = 0; - // copy or convert tensor data to pre-allocated shared memory. 
- const bool use_int8_asymm_signed = - target_sdk_version_ >= kMinSdkVersionForNNAPI13; - TF_LITE_ENSURE_OK( + if (tensor->buffer_handle != kTfLiteNullBufferHandle && + tensor->buffer_handle < tensor_memory_map_->size()) { + RETURN_TFLITE_ERROR_IF_NN_ERROR_FOR_TENSOR( context, - CopyOrConvertInputData( - context, ann_type_equivalent, use_int8_asymm_signed, tensor, - nn_input_memory_->get_data_ptr() + input_offset_accumulator, - &tensor_size)); - input_memory_handle = nn_input_memory_->get_handle(); - input_offset = input_offset_accumulator; - input_length = tensor_size; - input_offset_accumulator += tensor_size; - input_offset_accumulator += getNumPaddingBytes(tensor_size); + nnapi_->ANeuralNetworksExecution_setInputFromMemory( + execution, relative_input_index, input_nn_operand_type_ptr, + tensor_memory_map_->at(tensor->buffer_handle).memory, 0, + tensor->bytes), + "associating NNAPI execution input with a memory object", tensor, + nnapi_errno); + relative_input_index++; + continue; } - RETURN_TFLITE_ERROR_IF_NN_ERROR_FOR_TENSOR( - context, - nnapi_->ANeuralNetworksExecution_setInputFromMemory( - execution, relative_input_index, input_nn_operand_type_ptr, - input_memory_handle, input_offset, input_length), - "associating NNAPI execution input with a memory object", tensor, - nnapi_errno); + int tensor_size = 0; + if (ann_type_equivalent != kTfLiteNoType) { + const auto num_elements = NumElements(tensor); + uint8_t* input_ptr = nn_input_memory_->get_data_ptr() + input_offset; + if (tensor->type == kTfLiteUInt8 && + ann_type_equivalent == kTfLiteInt32) { + for (int i = 0; i < num_elements; ++i) { + reinterpret_cast(input_ptr)[i] = + static_cast(tensor->data.uint8[i]); + } + } else if (tensor->type == kTfLiteInt8 && + ann_type_equivalent == kTfLiteUInt8) { + // Explicitly convert int8 values to uint8 values. 
+ for (int i = 0; i < num_elements; ++i) { + input_ptr[i] = static_cast( + static_cast(tensor->data.int8[i]) + 128); + } + } else if (tensor->type == kTfLiteInt8 && + ann_type_equivalent == kTfLiteInt32) { + if (use_int8_asymm_signed) { + for (int i = 0; i < num_elements; ++i) { + reinterpret_cast(input_ptr)[i] = + static_cast(tensor->data.int8[i]); + } + } else { + for (int i = 0; i < num_elements; ++i) { + reinterpret_cast(input_ptr)[i] = + static_cast(tensor->data.int8[i]) + 128; + } + } + } else { + context->ReportError( + context, + "NN API Delegate: unsupported tensor types conversion: " + "from type code %d to type code %d.\n", + tensor->type, ann_type_equivalent); + return kTfLiteError; + } + size_t type_size; + TF_LITE_ENSURE_OK( + context, GetSizeOfType(context, ann_type_equivalent, &type_size)); + tensor_size = NumElements(tensor) * type_size; + RETURN_TFLITE_ERROR_IF_NN_ERROR_FOR_TENSOR( + context, + nnapi_->ANeuralNetworksExecution_setInputFromMemory( + execution, relative_input_index, input_nn_operand_type_ptr, + nn_input_memory_->get_handle(), input_offset, tensor_size), + "associating NNAPI execution input with a memory object", tensor, + nnapi_errno); + } else { + // copy data to pre-allocated shared memory. + memcpy(nn_input_memory_->get_data_ptr() + input_offset, + tensor->data.raw, tensor->bytes); + RETURN_TFLITE_ERROR_IF_NN_ERROR_FOR_TENSOR( + context, + nnapi_->ANeuralNetworksExecution_setInputFromMemory( + execution, relative_input_index, input_nn_operand_type_ptr, + nn_input_memory_->get_handle(), input_offset, tensor->bytes), + "associating NNAPI execution input with a memory object", tensor, + nnapi_errno); + tensor_size = tensor->bytes; + } + input_offset += tensor_size; + input_offset += getNumPaddingBytes(tensor_size); relative_input_index++; } } // Set the output tensor buffers. 
int relative_output_index = 0; - size_t output_offset_accumulator = 0; + size_t output_offset = 0; for (auto output_index : TfLiteIntArrayView(node->outputs)) { // If the NNAPI implementation doesn't have some of the outputs // they are left unmapped and we should not try to read their value here @@ -4090,12 +3977,11 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, context, nnapi_->ANeuralNetworksExecution_setOutputFromMemory( execution, relative_output_index, output_nn_operand_type_ptr, - nn_output_memory_->get_handle(), output_offset_accumulator, - tensor->bytes), + nn_output_memory_->get_handle(), output_offset, tensor->bytes), "associating NNAPI execution output to a memory object", tensor, nnapi_errno); - output_offset_accumulator += tensor->bytes; - output_offset_accumulator += getNumPaddingBytes(tensor->bytes); + output_offset += tensor->bytes; + output_offset += getNumPaddingBytes(tensor->bytes); } relative_output_index++; } @@ -4104,27 +3990,16 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, // current invocation. for (size_t i = 0; i < model_state_tfl_inputs_.size(); i++) { int state_tensor_idx = model_state_tfl_inputs_[i]; - if (use_device_memory_for_state_tensors) { - auto* device_memory = nn_state_tensor_info_map_.at(state_tensor_idx) - .nn_output_memory_handle.get(); - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksExecution_setOutputFromMemory( - execution, relative_output_index, nullptr, device_memory, 0, 0), - "associating NNAPI execution output with a device memory object", - nnapi_errno); - } else { - TfLiteTensor* tensor = &context->tensors[state_tensor_idx]; - // Here we are using a deep copy for state_in tensors so that we are not - // reading and writing into the same buffer during a invocation. - // TODO(b/110369471): using double shared buffer to minimize the copies. 
- RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksExecution_setOutput( - execution, relative_output_index, nullptr, tensor->data.raw, - tensor->bytes), - "associating NNAPI execution output to a buffer", nnapi_errno); - } + TfLiteTensor* tensor = &context->tensors[state_tensor_idx]; + // Here we are using a deep copy for state_in tensors so that we are not + // reading and writing into the same buffer during a invocation. + // TODO(b/110369471): using double shared buffer to minimize the copies. + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksExecution_setOutput( + execution, relative_output_index, nullptr, tensor->data.raw, + tensor->bytes), + "associating NNAPI execution output to a buffer", nnapi_errno); relative_output_index++; } // Invoke ANN in blocking fashion. @@ -4147,70 +4022,39 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, } // copy results from shared memory to the destination. - output_offset_accumulator = 0; + output_offset = 0; for (auto output_index : TfLiteIntArrayView(node->outputs)) { TfLiteTensor* tensor = &context->tensors[output_index]; if (tensor->buffer_handle != kTfLiteNullBufferHandle) { continue; } - const TfLiteType ann_type_equivalent = + TfLiteType ann_type_equivalent = operand_mapping_.lite_index_to_ann_type_conversion(output_index); - TF_LITE_ENSURE_OK( - context, CopyOrConvertOutputData(ann_type_equivalent, - nn_output_memory_->get_data_ptr() + - output_offset_accumulator, - tensor)); - output_offset_accumulator += tensor->bytes; - output_offset_accumulator += getNumPaddingBytes(tensor->bytes); - } - - // sync state tensors from device memories - if (use_device_memory_for_state_tensors && - delegate_data.sync_states_from_device) { - for (auto& [tfl_index, info] : nn_state_tensor_info_map_) { - TfLiteTensor* tensor = &context->tensors[tfl_index]; - if (tensor->buffer_handle != kTfLiteNullBufferHandle && - tensor->buffer_handle < tensor_memory_map_->size()) { - 
RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, - nnapi_->ANeuralNetworksMemory_copy( - info.nn_output_memory_handle.get(), - tensor_memory_map_->at(tensor->buffer_handle).memory), - "syncing device memory from device", nnapi_errno); - } else { - // For pointer tensor data, we need to copy twice: - // 1. device memory -> shared memory - // 2. shared memory -> raw pointer - // The second copy may also need type conversion from uint8 -> int8. - RETURN_TFLITE_ERROR_IF_NN_ERROR(context, - nnapi_->ANeuralNetworksMemory_copy( - info.nn_output_memory_handle.get(), - info.nn_temp_buffer->get_handle()), - "syncing device memory from device", - nnapi_errno); - const TfLiteType ann_type_equivalent = - operand_mapping_.lite_index_to_ann_type_conversion(tfl_index); - TF_LITE_ENSURE_OK(context, - CopyOrConvertOutputData( - ann_type_equivalent, - info.nn_temp_buffer->get_data_ptr(), tensor)); + if (tensor->type == kTfLiteInt8 && ann_type_equivalent == kTfLiteUInt8) { + // Explicitly convert uint8 values to int8 values. 
+ uint8_t* output_ptr = reinterpret_cast( + nn_output_memory_->get_data_ptr() + output_offset); + const auto num_elements = NumElements(tensor); + for (int i = 0; i < num_elements; ++i) { + output_ptr[i] = + static_cast(static_cast(output_ptr[i]) - 128); } } - } - - // swap device memory handles so that the state output of the current - // invocation will be used as the state input of the next invocation - if (use_device_memory_for_state_tensors) { - for (auto& [tfl_index, info] : nn_state_tensor_info_map_) { - std::swap(info.nn_input_memory_handle, info.nn_output_memory_handle); - } + memcpy(tensor->data.raw, nn_output_memory_->get_data_ptr() + output_offset, + tensor->bytes); + output_offset += tensor->bytes; + output_offset += getNumPaddingBytes(tensor->bytes); } // copy output of all output tensors in feedback_loops_ into the // associated input - for (auto [output_tensor_idx, input_tensor_idx] : feedback_loops_) { + for (auto feedback_loop : feedback_loops_) { + int output_tensor_idx; + int input_tensor_idx; + std::tie(output_tensor_idx, input_tensor_idx) = feedback_loop; TfLiteTensor& src = context->tensors[output_tensor_idx]; TfLiteTensor& dest = context->tensors[input_tensor_idx]; + memcpy(dest.data.raw, src.data.raw, src.bytes); } @@ -4778,17 +4622,6 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( std::vector outputs; outputs.reserve(output_tensors->size); - for (int tfl_index : model_state_tfl_inputs_) { - NNStateTensorInfo info = { - .nn_input_memory_handle = - std::unique_ptr( - nullptr, NNFreeMemory(nnapi_)), - .nn_output_memory_handle = - std::unique_ptr( - nullptr, NNFreeMemory(nnapi_))}; - nn_state_tensor_info_map_.emplace(tfl_index, std::move(info)); - } - size_t total_input_byte_size = 0; // Make the TensorFlow Lite inputs and outputs to ann_indices. 
for (int i : TfLiteIntArrayView(input_tensors)) { @@ -4798,6 +4631,10 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( // The delegate might not have mapped this input (this can // happen if one tensor is split in several ones) operand_mapping_.lite_index_to_ann(i) != -1) { + inputs.push_back(operand_mapping_.lite_index_to_ann(i)); + if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) { + continue; + } const TfLiteType nn_type_conversion = operand_mapping_.lite_index_to_ann_type_conversion(i); int tensor_size = 0; @@ -4809,15 +4646,6 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( context, GetSizeOfType(context, nn_type_conversion, &type_size)); tensor_size = NumElements(&context->tensors[i]) * type_size; } - if (auto it = nn_state_tensor_info_map_.find(i); - it != nn_state_tensor_info_map_.end()) { - it->second.nn_input_index = inputs.size(); - it->second.tensor_size = tensor_size; - } - inputs.push_back(operand_mapping_.lite_index_to_ann(i)); - if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) { - continue; - } total_input_byte_size += tensor_size; total_input_byte_size += getNumPaddingBytes(tensor_size); } @@ -4838,11 +4666,8 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( } // Add state output tensors as model outputs. 
- for (int i = 0; i < model_state_outputs_.size(); i++) { - const int tfl_index = model_state_tfl_inputs_[i]; - const int nn_model_index = model_state_outputs_[i]; - nn_state_tensor_info_map_.at(tfl_index).nn_output_index = outputs.size(); - outputs.push_back(nn_model_index); + for (int i : model_state_outputs_) { + outputs.push_back(i); } // Tell ANN to declare inputs/outputs @@ -4947,8 +4772,6 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, if (nnapi->android_sdk_version >= kMinSdkVersionForNNAPI11) { delegate_data_.allow_dynamic_dimensions = options.allow_dynamic_dimensions; } - delegate_data_.use_device_memory_for_state_tensors = - options.use_device_memory_for_state_tensors; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); Prepare = DoPrepare; @@ -4991,17 +4814,9 @@ const StatefulNnApiDelegate::Options StatefulNnApiDelegate::GetOptions( options.max_execution_loop_timeout_duration_ns = delegate_data->max_execution_loop_timeout_duration_ns; options.allow_dynamic_dimensions = delegate_data->allow_dynamic_dimensions; - options.use_device_memory_for_state_tensors = - delegate_data->use_device_memory_for_state_tensors; return options; } -const StatefulNnApiDelegate::Data& StatefulNnApiDelegate::GetData( - TfLiteDelegate* delegate) { - auto* delegate_data = reinterpret_cast(delegate->data_); - return *delegate_data; -} - const std::vector& StatefulNnApiDelegate::GetTensorMemoryMap(TfLiteDelegate* delegate) { auto delegate_data = reinterpret_cast(delegate->data_); @@ -5062,24 +4877,6 @@ int StatefulNnApiDelegate::GetNnApiErrno() const { return delegate_data_.nnapi_errno; } -TfLiteStatus StatefulNnApiDelegate::SetSyncStatesToDevice( - bool sync_states_to_device) { - if (!delegate_data_.use_device_memory_for_state_tensors) { - return kTfLiteError; - } - delegate_data_.sync_states_to_device = sync_states_to_device; - return kTfLiteOk; -} - -TfLiteStatus StatefulNnApiDelegate::SetSyncStatesFromDevice( - bool 
sync_states_from_device) { - if (!delegate_data_.use_device_memory_for_state_tensors) { - return kTfLiteError; - } - delegate_data_.sync_states_from_device = sync_states_from_device; - return kTfLiteOk; -} - // static TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi, @@ -5111,9 +4908,9 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( supported_partition_nodes.begin(), supported_partition_nodes.end()); - bool single_partition_delegated = (supported_partition_nodes.size() == - partition_params.nodes_to_replace->size); - if (single_partition_delegated) { + bool model_fully_supported = (supported_partition_nodes.size() == + partition_params.nodes_to_replace->size); + if (model_fully_supported) { delegate_data->CacheDelegateKernel(&partition_params, kernel_state.release()); } @@ -5328,10 +5125,6 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, params_array, params_array + num_partitions), &nodes_to_delegate)); - if (!nodes_to_delegate.empty() && num_partitions == 1) { - delegate_data->single_partition_delegated = true; - } - if (nodes_to_delegate.empty()) { return kTfLiteOk; } else { diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index dbc92f7d5a4..4b12b0d0d18 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -125,33 +125,6 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // accelerator. This should only be enabled if the target device supports // dynamic dimensions of the model. bool allow_dynamic_dimensions = false; - - // When set to true, the delegate will allocate device memory for state - // tensors to reduce data copying and transformation overhead. 
In such a - // case, the user must explicitly specify whether they would like to sync - // states between host and device before and after each invocation by - // SetSyncStatesToDevice and SetSyncStatesFromDevice. The following code - // snippet demonstrates the usage: - // - // StatefulNnapiDelegate::Options options; - // options.use_device_memory_for_state_tensors = true; - // ... - // - // for (int i = 0; i < sequence_size; i++) { - // ... - // - // // Push initial states to the device before the first invocation. - // delegate->SetSyncStatesToDevice(i == 0); - // - // // Get states data back to the host CPU buffer after the final - // // invocation. - // delegate->SetSyncStatesFromDevice(i == sequence_size - 1); - // - // interpreter->Invoke(); - // } - // - // WARNING: This is an experimental interface that is subject to change. - bool use_device_memory_for_state_tensors = false; }; // Uses default options. @@ -213,23 +186,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // (i.e. when calling interpreter.ModifyGraphWithDelegate(delegate)). int GetNnApiErrno() const; - // Specifies whether the device memories should be initialized from the - // content of CPU buffers of state tensors before the execution or not. - // Will return an error if the delegate is not initialized with - // use_device_memory_for_state_tensors set to true. - // WARNING: This is an experimental interface that is subject to change. - TfLiteStatus SetSyncStatesToDevice(bool sync_states_to_device); - - // Specifies whether the device memories should be copied to the content of - // CPU buffers of state tensors after the execution or not. - // Will return an error if the delegate is not initialized with - // use_device_memory_for_state_tensors set to true. - // WARNING: This is an experimental interface that is subject to change. - TfLiteStatus SetSyncStatesFromDevice(bool sync_states_from_device); - private: - friend NNAPIDelegateKernel; - // Encapsulates all delegate data. 
struct Data { // Pointer to NNAPI implementation to be used by this delegate as @@ -278,17 +235,6 @@ class StatefulNnApiDelegate : public TfLiteDelegate { uint64_t max_execution_loop_timeout_duration_ns = 0; // Whether to allow dynamic dimension sizes without re-compilation. bool allow_dynamic_dimensions = false; - // When set to true, the delegate will allocate device memories for state - // tensors to reduce data copying and transformation overhead. - bool use_device_memory_for_state_tensors = false; - // When set to true, the device memories will be initialized from the - // content of CPU buffers of state tensors before the execution. - bool sync_states_to_device = false; - // When set to true, the device memories will be copied to the content of - // CPU buffers of state tensors after the execution. - bool sync_states_from_device = false; - // Whether the model is fully supported by the delegate. - bool single_partition_delegated = false; explicit Data(const NnApi* nnapi); ~Data(); @@ -302,9 +248,6 @@ class StatefulNnApiDelegate : public TfLiteDelegate { const TfLiteDelegateParams* delegate_params); }; - // Returns the delegate data. - static const Data& GetData(TfLiteDelegate* delegate); - // Implements TfLiteDelegate::Prepare. Please refer to TFLiteDelegate // documentation for more info. static TfLiteStatus DoPrepare(TfLiteContext* context, diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 60c32a1ef0f..36c1dd32efb 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 namespace tflite {
@@ -155,18 +154,6 @@ class NNFreeExecution {
   // NnApi instance to use. Not owned by this object.
   const NnApi* nnapi_;
 };
-
-// RAII NN API Memory Destructor for use with std::unique_ptr
-class NNFreeMemory {
- public:
-  explicit NNFreeMemory(const NnApi* nnapi) : nnapi_(nnapi) {}
-  void operator()(ANeuralNetworksMemory* memory) {
-    nnapi_->ANeuralNetworksMemory_free(memory);
-  }
-
- private:
-  // NnApi instance to use. Not owned by this object.
-  const NnApi* nnapi_;
-};
 
 // Manage NNAPI shared memory handle
 class NNMemory {
@@ -188,19 +175,6 @@ class NNMemory {
   ANeuralNetworksMemory* nn_memory_handle_ = nullptr;
 };
 
-// Basic info and NN device memory handles for state tensors.
-struct NNStateTensorInfo {
-  uint32_t nn_input_index = 0;
-  uint32_t nn_output_index = 0;
-  // The size of the NN state tensor after applying any potential data type
-  // conversion.
-  int tensor_size = 0;
-  std::unique_ptr<ANeuralNetworksMemory, NNFreeMemory> nn_input_memory_handle;
-  std::unique_ptr<ANeuralNetworksMemory, NNFreeMemory> nn_output_memory_handle;
-  // The shared memory used to sync the state from the device.
-  std::unique_ptr<NNMemory> nn_temp_buffer;
-};
-
 enum class NNAPIValidationFailureType : int {
   // The operator is not supported by either NNAPI or the NNAPI Delegate.
@@ -366,9 +340,6 @@ class NNAPIDelegateKernel {
   // data available for TFLite model users
   std::vector<std::tuple<int, int>> feedback_loops_;
 
-  // TfLite index -> state tensor info.
-  std::map<int, NNStateTensorInfo> nn_state_tensor_info_map_;
-
   std::unique_ptr<NNMemory> nn_input_memory_;
   std::unique_ptr<NNMemory> nn_output_memory_;
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index c1a3923de4d..16e7a260961 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -2718,15 +2718,24 @@ class RNNOpModel : public SingleOpModelWithNNAPI {
  public:
   RNNOpModel(int batches, int units, int size,
              const TensorType weights = TensorType_FLOAT32,
-             const TensorType recurrent_weights = TensorType_FLOAT32) {
-    Init(batches, units, size, weights, recurrent_weights);
-  }
-
-  RNNOpModel(const StatefulNnApiDelegate::Options& options, int batches,
-             int units, int size, const TensorType weights = TensorType_FLOAT32,
              const TensorType recurrent_weights = TensorType_FLOAT32)
-      : SingleOpModelWithNNAPI(options) {
-    Init(batches, units, size, weights, recurrent_weights);
+      : batches_(batches), units_(units), input_size_(size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(weights);
+    recurrent_weights_ = AddInput(recurrent_weights);
+    bias_ = AddInput(TensorType_FLOAT32);
+    hidden_state_ = AddVariableInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_RNN, BuiltinOptions_RNNOptions,
+        CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union());
+    BuildInterpreterWithNNAPI({
+        {batches_, input_size_},  // input tensor
+        {units_, input_size_},    // weights tensor
+        {units_, units_},         // recurrent weights tensor
+        {units_},                 // bias tensor
+        {batches_, units_}        // hidden state tensor
+    });
   }
 
   void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
@@ -2747,16 +2756,8 @@ class RNNOpModel : public SingleOpModelWithNNAPI {
     PopulateTensor(input_, offset, begin, end);
   }
 
-  void SetHiddenState(const std::vector<float>& data) {
-    PopulateTensor(hidden_state_, data);
-  }
-
   std::vector<float> GetOutput() {
return ExtractVector<float>(output_);
   }
 
-  std::vector<float> GetHiddenState() {
-    return ExtractVector<float>(hidden_state_);
-  }
-
   int input_size() { return input_size_; }
   int num_units() { return units_; }
   int num_batches() { return batches_; }
@@ -2772,50 +2773,8 @@ class RNNOpModel : public SingleOpModelWithNNAPI {
   int batches_;
   int units_;
   int input_size_;
-
- private:
-  // Performs initialization logic shared across all constructors.
-  void Init(int batches, int units, int size, const TensorType weights,
-            const TensorType recurrent_weights) {
-    batches_ = batches;
-    units_ = units;
-    input_size_ = size;
-    input_ = AddInput(TensorType_FLOAT32);
-    weights_ = AddInput(weights);
-    recurrent_weights_ = AddInput(recurrent_weights);
-    bias_ = AddInput(TensorType_FLOAT32);
-    hidden_state_ = AddVariableInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_RNN, BuiltinOptions_RNNOptions,
-        CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union());
-    BuildInterpreterWithNNAPI({
-        {batches_, input_size_},  // input tensor
-        {units_, input_size_},    // weights tensor
-        {units_, units_},         // recurrent weights tensor
-        {units_},                 // bias tensor
-        {batches_, units_}        // hidden state tensor
-    });
-  }
 };
 
-static void InvokeAndTestSingleRnnStep(int step_index, RNNOpModel* rnn) {
-  float* batch_start = rnn_input + step_index * rnn->input_size();
-  float* batch_end = batch_start + rnn->input_size();
-  rnn->SetInput(0, batch_start, batch_end);
-  rnn->SetInput(rnn->input_size(), batch_start, batch_end);
-
-  rnn->Invoke();
-
-  float* golden_start = rnn_golden_output + step_index * rnn->num_units();
-  float* golden_end = golden_start + rnn->num_units();
-  std::vector<float> expected;
-  expected.insert(expected.end(), golden_start, golden_end);
-  expected.insert(expected.end(), golden_start, golden_end);
-
-  EXPECT_THAT(rnn->GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
-}
-
 TEST(NNAPIDelegate, RnnBlackBoxTest) {
   RNNOpModel rnn(2, 16, 8);
rnn.SetWeights(rnn_weights);
@@ -2826,66 +2785,20 @@ TEST(NNAPIDelegate, RnnBlackBoxTest) {
                                   (rnn.input_size() * rnn.num_batches());
 
   for (int i = 0; i < input_sequence_size; i++) {
-    InvokeAndTestSingleRnnStep(i, &rnn);
-  }
-}
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    rnn.SetInput(0, batch_start, batch_end);
+    rnn.SetInput(rnn.input_size(), batch_start, batch_end);
 
-TEST(NNAPIDelegate, RnnDeviceMemoryBasicTest) {
-  StatefulNnApiDelegate::Options options;
-  options.use_device_memory_for_state_tensors = true;
+    rnn.Invoke();
 
-  RNNOpModel rnn(options, 2, 16, 8);
-  rnn.SetWeights(rnn_weights);
-  rnn.SetBias(rnn_bias);
-  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+    float* golden_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_end = golden_start + rnn.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
 
-  auto* delegate = rnn.GetDelegate();
-  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
-                                  (rnn.input_size() * rnn.num_batches());
-
-  // Only sync the state to device in the first invocation, all subsequent
-  // states are kept inside the driver.
-  for (int i = 0; i < input_sequence_size; i++) {
-    delegate->SetSyncStatesToDevice(i == 0);
-    InvokeAndTestSingleRnnStep(i, &rnn);
-  }
-}
-
-TEST(NNAPIDelegate, RnnDeviceMemorySyncTest) {
-  StatefulNnApiDelegate::Options options;
-  options.use_device_memory_for_state_tensors = true;
-
-  RNNOpModel rnn(options, 2, 16, 8);
-  rnn.SetWeights(rnn_weights);
-  rnn.SetBias(rnn_bias);
-  rnn.SetRecurrentWeights(rnn_recurrent_weights);
-
-  auto* delegate = rnn.GetDelegate();
-  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
-                                  (rnn.input_size() * rnn.num_batches());
-  const int sync_output_index = input_sequence_size / 2;
-
-  // The following steps test SetSyncStatesFromDevice and SetSyncStatesToDevice:
-  // 1. Invoke RNN sequence until sync_output_index;
-  // 2. Extract the hidden output state at sync_output_index by
-  //    SetSyncStatesFromDevice(true);
-  // 3. Continue RNN sequence until the end;
-  // 4. Reset the hidden state by SetSyncStatesToDevice(true), the state should
-  //    go back to sync_output_index;
-  // 5. Continue RNN sequence from sync_output_index + 1 until the end.
-  std::vector<float> hidden_state_data;
-  for (int i = 0; i < input_sequence_size; i++) {
-    delegate->SetSyncStatesToDevice(i == 0);
-    delegate->SetSyncStatesFromDevice(i == sync_output_index);
-    InvokeAndTestSingleRnnStep(i, &rnn);
-    if (i == sync_output_index) {
-      hidden_state_data = rnn.GetHiddenState();
-    }
-  }
-  rnn.SetHiddenState(hidden_state_data);
-  for (int i = sync_output_index + 1; i < input_sequence_size; i++) {
-    delegate->SetSyncStatesToDevice(i == (sync_output_index + 1));
-    InvokeAndTestSingleRnnStep(i, &rnn);
+    EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
   }
 }