From f781e5eb44d3ece0ca82627e7cf4c87ad95e4620 Mon Sep 17 00:00:00 2001
From: Stefano Galarraga
Date: Fri, 21 Feb 2020 03:54:58 -0800
Subject: [PATCH] Support partition limitation even when accelerator name is
 not specified

PiperOrigin-RevId: 296406206
Change-Id: I4bdd9d0e9b578401bb380828c628e14ef5aa711c
---
 tensorflow/lite/delegates/nnapi/BUILD         |   1 +
 .../lite/delegates/nnapi/nnapi_delegate.cc    |  32 ++--
 .../nnapi_delegate_device_selection_test.cc   | 152 ++++++++++++++----
 3 files changed, 142 insertions(+), 43 deletions(-)

diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index ee47ad0e24d..021f74b6ab0 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -179,6 +179,7 @@ cc_test(
         ":nnapi_delegate",
         ":nnapi_delegate_mock_test",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/kernels:test_util",
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index a3a4babd91f..84b3cfb2f15 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -4185,25 +4185,29 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context,
       .version = 1,
   };
 
-  std::vector<int>& nodes_to_delegate = supported_nodes;
+  std::vector<int> nodes_to_delegate;
+
+  int num_partitions;
+  TfLiteDelegateParams* params_array;
   if (is_accelerator_specified) {
-    std::vector<int> device_supported_nodes;
-    int num_partitions;
-    TfLiteDelegateParams* params_array;
-
+    // Filter out nodes not supported by the target accelerators.
     TF_LITE_ENSURE_STATUS(GetNodesSupportedByAccelerator(
-        context, delegate, nnapi, supported_nodes, &device_supported_nodes,
+        context, delegate, nnapi, supported_nodes, &nodes_to_delegate,
         &num_partitions, &params_array, nnapi_errno));
-
-    TF_LITE_ENSURE_STATUS(LimitDelegatedPartitions(
-        delegate_options.max_number_delegated_partitions,
-        std::vector<TfLiteDelegateParams>(params_array,
-                                          params_array + num_partitions),
-        &device_supported_nodes));
-
-    nodes_to_delegate = device_supported_nodes;
+  } else {
+    nodes_to_delegate = supported_nodes;
+    auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes);
+    TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning(
+        context, supported_nodes_int_array.get(), &params_array,
+        &num_partitions));
   }
 
+  TF_LITE_ENSURE_STATUS(
+      LimitDelegatedPartitions(delegate_options.max_number_delegated_partitions,
+                               std::vector<TfLiteDelegateParams>(
+                                   params_array, params_array + num_partitions),
+                               &nodes_to_delegate));
+
   if (nodes_to_delegate.empty()) {
     return kTfLiteOk;
   } else {
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc
index bf9e00bee69..d6183e63013 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include
 #include
+#include <unordered_set>
 #include
 #include
 #include
@@ -23,6 +24,7 @@ limitations under the License.
 #include
 #include
+#include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h"
@@ -545,13 +547,39 @@ TEST_F(UnsupportedOperationOnDeviceTest, ShouldCacheModelCompilation) {
 }
 
 // Model with a chain of no-op (add with zero operations)
+// interleaved with no-op custom nodes.
 class LongIdentityModel : public MultiOpModel, public AcceleratedModel {
  public:
   LongIdentityModel(const std::vector<int>& input_shape, int graph_size,
+                    const std::unordered_set<int>& custom_nodes_indexes,
                     const NnApi* nnapi, const std::string& accelerator_name,
                     int max_nnapi_partitions)
       : MultiOpModel(),
         AcceleratedModel(nnapi, accelerator_name, max_nnapi_partitions) {
+    Init(input_shape, graph_size, custom_nodes_indexes);
+  }
+
+  LongIdentityModel(const std::vector<int>& input_shape, int graph_size,
+                    const std::unordered_set<int>& custom_nodes_indexes,
+                    const NnApi* nnapi, int max_nnapi_partitions)
+      : MultiOpModel(), AcceleratedModel(nnapi, false, max_nnapi_partitions) {
+    Init(input_shape, graph_size, custom_nodes_indexes);
+  }
+
+  void SetInput(std::vector<float> value) { PopulateTensor(input_, value); }
+
+  int CountNnApiPartitions() {
+    return std::count_if(
+        std::begin(interpreter_->execution_plan()),
+        std::end(interpreter_->execution_plan()), [this](const int node_index) {
+          return interpreter_->node_and_registration(node_index)
+                     ->first.delegate != nullptr;
+        });
+  }
+
+ private:
+  void Init(const std::vector<int>& input_shape, int graph_size,
+            const std::unordered_set<int>& custom_nodes_indexes) {
     auto* delegate = GetDelegate();
     this->SetApplyDelegate([delegate](Interpreter* interpreter) {
       interpreter->ModifyGraphWithDelegate(delegate);
@@ -574,10 +602,15 @@ class LongIdentityModel : public MultiOpModel, public AcceleratedModel {
                  {intermediate_outputs[0]});
 
     for (int i = 0; i < intermediate_outputs.size() - 1; i++) {
-      AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
-                   CreateAddOptions(builder_).Union(),
-                   {intermediate_outputs[i], zero_input_},
-                   {intermediate_outputs[i + 1]});
+      if (custom_nodes_indexes.count(i + 1) == 1) {
+        AddCustomOp("custom_no_op", {}, [this]() { return CustomNoOpNode(); },
+                    {intermediate_outputs[i]}, {intermediate_outputs[i + 1]});
+      } else {
+        AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+                     CreateAddOptions(builder_).Union(),
+                     {intermediate_outputs[i], zero_input_},
+                     {intermediate_outputs[i + 1]});
+      }
     }
 
     AddBuiltinOp(
@@ -592,18 +625,42 @@ class LongIdentityModel : public MultiOpModel, public AcceleratedModel {
     PopulateTensor(zero_input_, zero);
   }
 
-  void SetInput(std::vector<float> value) { PopulateTensor(input_, value); }
+  // Returns the registration of a custom node that simply copies input to
+  // output.
+  TfLiteRegistration* CustomNoOpNode() {
+    static TfLiteRegistration no_op = {
+        .init = [](TfLiteContext* context, const char* buffer,
+                   size_t length) -> void* { return nullptr; },
 
-  int CountNnApiPartitions() {
-    return std::count_if(
-        std::begin(interpreter_->execution_plan()),
-        std::end(interpreter_->execution_plan()), [this](const int node_index) {
-          return interpreter_->node_and_registration(node_index)
-                     ->first.delegate != nullptr;
-        });
+        .free = [](TfLiteContext* context, void* buffer) -> void {},
+
+        .prepare = [](TfLiteContext* context,
+                      TfLiteNode* node) -> TfLiteStatus {
+          if (node->inputs->size != 1 || node->outputs->size != 1) {
+            return kTfLiteError;
+          }
+
+          return kTfLiteOk;
+        },
+
+        .invoke = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
+          auto input_tensor = context->tensors[node->inputs->data[0]];
+          auto output_tensor = context->tensors[node->outputs->data[0]];
+
+          std::copy(input_tensor.data.raw,
+                    input_tensor.data.raw + input_tensor.bytes,
+                    output_tensor.data.raw);
+
+          return kTfLiteOk;
+        },
+
+        .profiling_string = nullptr,
+        .builtin_code = kTfLiteBuiltinDelegate,
+        .custom_name = "NoOpTestDelegate",
+        .version = 1,
+    };
+
+    return &no_op;
   }
-
- private:
   int input_;
   int zero_input_;
   int output_;
@@ -643,7 +700,8 @@ class DelegatePartitionLimitTest
   // input_shape.
   void Init(int max_nnapi_partitions,
             const std::vector<int>& nnapi_partition_sizes,
-            const std::vector<int>& input_shape) {
+            const std::vector<int>& input_shape,
+            bool specify_accelerator = true) {
     // The graph will have as number of nodes the sum of nodes in the NNAPI
     // partitions plus nnapi_partition_sizes.size() - 1 nodes that will be
    // not supported by NNAPI and will cause the
@@ -658,20 +716,36 @@
       unsupported_ops_idxs.insert(partition_node_idx);
     }
 
-    DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes(
-        graph_size_, unsupported_ops_idxs);
+    if (specify_accelerator) {
+      // Build a model that initially contains a single partition and is then
+      // split into partitions based on the operations supported by the
+      // target accelerator.
+      // This is done because the stubbed GetSupportedOperationsForDevices API
+      // has no way to control the size of each partition.
+      DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes(
+          graph_size_, unsupported_ops_idxs);
 
-    nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
-        [](const ANeuralNetworksModel* model,
-           const ANeuralNetworksDevice* const* devices, uint32_t num_devices,
-           bool* supported_ops) -> int {
-          DelegatePartitionLimitTestNodeFilter()->SetNodeSupport(supported_ops);
-          return ANEURALNETWORKS_NO_ERROR;
-        });
+      nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
+          [](const ANeuralNetworksModel* model,
+             const ANeuralNetworksDevice* const* devices, uint32_t num_devices,
+             bool* supported_ops) -> int {
+            DelegatePartitionLimitTestNodeFilter()->SetNodeSupport(
+                supported_ops);
+            return ANEURALNETWORKS_NO_ERROR;
+          });
 
-    model_ = std::make_unique<LongIdentityModel>(
-        input_shape, graph_size_, nnapi_mock_->GetNnApi(),
-        /*accelerator_name=*/"test-device", max_nnapi_partitions);
+      model_ = std::make_unique<LongIdentityModel>(
+          input_shape, graph_size_,
+          /*custom_nodes_indexes=*/std::unordered_set<int>(),
+          nnapi_mock_->GetNnApi(),
+          /*accelerator_name=*/"test-device", max_nnapi_partitions);
+    } else {
+      // Build a model containing custom nodes that won't be supported by the
+      // delegate and will therefore generate the partitions.
+      model_ = std::make_unique<LongIdentityModel>(
+          input_shape, graph_size_, unsupported_ops_idxs,
+          nnapi_mock_->GetNnApi(), max_nnapi_partitions);
+    }
   }
 
   std::unique_ptr<LongIdentityModel> model_;
@@ -718,24 +792,44 @@ TEST_F(DelegatePartitionLimitTest,
 }
 
 TEST_F(DelegatePartitionLimitTest,
        ShouldDelegatePartitionWithHigherNodeCount) {
+  int kLargestModelSize = 3;
   Init(/*max_nnapi_partitions=*/1,
        /*nnapi_partition_sizes=*/{3, 2},
        /*input_shape=*/{1, 2, 2, 1});
 
   EXPECT_EQ(model_->CountNnApiPartitions(), 1);
-  EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 3);
+  EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(),
+            OriginalGraphSize() - kLargestModelSize);
 }
 
 TEST_F(DelegatePartitionLimitTest,
        ShouldDelegatePartitionsWithHigherNodeCount) {
+  int kLargestModelSize = 5;
+  int kSecondLargestModelSize = 4;
   Init(/*max_nnapi_partitions=*/2,
-       /*nnapi_partition_sizes=*/{1, 5, 2, 4},
+       /*nnapi_partition_sizes=*/
+       {1, kLargestModelSize, 2, kSecondLargestModelSize},
        /*input_shape=*/{1, 2, 2, 1});
 
   EXPECT_EQ(model_->CountNnApiPartitions(), 2);
   EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 9);
 }
 
+TEST_F(DelegatePartitionLimitTest,
+       ShouldLimitPartitionsEvenWithoutAcceleratorNameSpecified) {
+  int kLargestModelSize = 5;
+  int kSecondLargestModelSize = 4;
+  Init(/*max_nnapi_partitions=*/2,
+       /*nnapi_partition_sizes=*/
+       {1, kLargestModelSize, 2, kSecondLargestModelSize},
+       /*input_shape=*/{1, 2, 2, 1}, /*specify_accelerator=*/false);
+
+  EXPECT_EQ(model_->CountNnApiPartitions(), 2);
+  EXPECT_EQ(
+      model_->CountOpsExecutedByCpuKernel(),
+      OriginalGraphSize() - (kLargestModelSize + kSecondLargestModelSize));
+}
+
 }  // namespace
 }  // namespace tflite