Add an option to the GPU delegate to parameterize the number of partitions to delegate. The default value of this parameter is 1, matching the current behavior.
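For illustration, a minimal sketch of how a client could request more than one delegated partition once this option is available (assumes an already-built tflite::Interpreter named 'interpreter' and the existing TfLiteGpuDelegateV2Create/Delete entry points; not part of this change):

#include "tensorflow/lite/delegates/gpu/delegate.h"

// Start from the defaults populated by TfLiteGpuDelegateOptionsV2Default().
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
// Allow up to two graph partitions to be replaced with GPU delegate kernels.
options.max_delegated_partitions = 2;
TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
  // Delegation failed; the interpreter keeps running everything on the CPU.
}
// ... run inference ...
TfLiteGpuDelegateV2Delete(delegate);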

PiperOrigin-RevId: 307788075
Change-Id: I26bb65fcf049e82cc46e88f818b07ae245fbb1cc
Chao Mei 2020-04-22 04:10:11 -07:00 committed by TensorFlower Gardener
parent e97559bc40
commit 1912ef16d6
6 changed files with 254 additions and 43 deletions

View File

@@ -2602,7 +2602,8 @@ bool IsAllAllowedTensors(TfLiteContext* context, const TfLiteIntArray* array,
// TODO(impjdi): Check number of input/output tensors and their dimensions.
// TODO(impjdi): Check ops' parameters.
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops,
int max_delegated_partitions) {
delegates::IsNodeSupportedFn node_supported_fn =
[=](TfLiteContext* context, TfLiteNode* node,
TfLiteRegistration* registration,
@@ -2633,11 +2634,11 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
return TfLiteIntArrayCreate(0);
}
// We simply get 1st largest partition, but we could later explore whether
// getting more partitions could lead to better performance, i.e. by
// parameterizing '1' here.
// We simply get the single largest partition here, as
// 'max_delegated_partitions' is set to 1 by default.
std::vector<int> ops_to_replace =
partition_helper.GetNodesOfFirstNLargestPartitions(1);
partition_helper.GetNodesOfFirstNLargestPartitions(
max_delegated_partitions);
if (!unsupported_nodes_info.empty()) {
std::string unsupported = absl::StrJoin(unsupported_nodes_info, "\n");
@@ -2647,9 +2648,7 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
if (!ops_to_replace.empty()) {
absl::StrAppend(
&error_message, ops_to_replace.size(),
" operations will run on the GPU (first node: ",
ops_to_replace.front(), ", last node: ", ops_to_replace.back(),
"), and the remaining ",
" operations will run on the GPU, and the remaining ",
partition_helper.num_total_nodes() - ops_to_replace.size());
} else {
absl::StrAppend(&error_message,

View File

@@ -29,8 +29,12 @@ namespace gpu {
// Validates which operations are supported and returns array of operations to
// replace with GPU kernels. The caller must free the pointer on TfLiteIntArray.
// 'max_delegated_partitions' sets the maximum number of partitions to
// delegate, as a graph could have multiple partitions (each consisting of a
// subset of supported ops) that could be replaced.
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context,
bool allow_quant_ops = false);
bool allow_quant_ops = false,
int max_delegated_partitions = 1);
// Extracts TFLite delegate execution plan from the input TFLite context and
// converts it into generic graph format.
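To make the GetNodesOfFirstNLargestPartitions selection used above concrete, here is a small standalone sketch (a hypothetical helper for illustration, not the GraphPartitionHelper API itself) showing how picking the N largest supported partitions generalizes the old pick-the-single-largest behavior:

#include <algorithm>
#include <vector>

// Given the node ids of each supported partition, return the node ids of the
// 'n' largest partitions; n == 1 reproduces the previous default behavior.
std::vector<int> NodesOfFirstNLargestPartitions(
    std::vector<std::vector<int>> partitions, int n) {
  // Order partitions by size, largest first.
  std::sort(partitions.begin(), partitions.end(),
            [](const std::vector<int>& a, const std::vector<int>& b) {
              return a.size() > b.size();
            });
  std::vector<int> nodes;
  const int limit = std::min(n, static_cast<int>(partitions.size()));
  for (int i = 0; i < limit; ++i) {
    nodes.insert(nodes.end(), partitions[i].begin(), partitions[i].end());
  }
  return nodes;
}

// Example: NodesOfFirstNLargestPartitions({{0, 1}, {3}, {4, 5, 6}}, 2) returns
// {4, 5, 6, 0, 1}; with n == 1 it returns only {4, 5, 6}.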

View File

@@ -502,6 +502,187 @@ TEST(ModelBuilderTest, GetOpsToReplaceDoesNotPruneUint8) {
TfLiteIntArrayFree(ops_to_replace);
}
class Interpreter2Fp32 : public DelegatedInterpreter {
public:
Interpreter2Fp32() : DelegatedInterpreter(4) {
void* builtin_data = malloc(sizeof(int));
EXPECT_EQ(interpreter_.AddTensors(8), kTfLiteOk);
EXPECT_EQ(interpreter_.SetInputs({0, 2, 4, 6}), kTfLiteOk);
EXPECT_EQ(interpreter_.SetOutputs({7}), kTfLiteOk);
// Add a Dequantize Node with uint8 input.
const TfLiteRegistration reg_dequant = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/nullptr,
/*profiling_string=*/nullptr,
kTfLiteBuiltinDequantize};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{0}, /*outputs=*/{1}, /*init_data=*/nullptr,
/*init_data_size=*/0, /*builtin_data=*/nullptr,
/*registration=*/&reg_dequant),
kTfLiteOk);
// Add an ADD node that GPU delegate can parse.
const TfLiteRegistration reg_add0 = {
[](TfLiteContext* context, const char* buffer, size_t length) {
return reinterpret_cast<void*>(new int(1));
},
[](TfLiteContext* context, void* buffer) {
delete reinterpret_cast<int*>(buffer);
},
nullptr,
nullptr,
nullptr,
kTfLiteBuiltinAdd};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{1, 2}, /*outputs=*/{3}, /*init_data=*/nullptr,
/*init_data_size=*/0,
/*builtin_data=*/builtin_data,
/*registration=*/&reg_add0),
kTfLiteOk);
// Add a Pack Node that GPU delegate doesn't support
const TfLiteRegistration reg_pack = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/nullptr,
/*profiling_string=*/nullptr,
kTfLiteBuiltinPack};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{3, 4}, /*outputs=*/{5}, /*init_data=*/nullptr,
/*init_data_size=*/0, /*builtin_data=*/nullptr,
/*registration=*/&reg_pack),
kTfLiteOk);
const TfLiteRegistration reg_add1 = {
[](TfLiteContext* context, const char* buffer, size_t length) {
return reinterpret_cast<void*>(new int[2]);
},
[](TfLiteContext* context, void* buffer) {
delete reinterpret_cast<int*>(buffer);
},
nullptr,
nullptr,
nullptr,
kTfLiteBuiltinAdd};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{5, 6}, /*outputs=*/{7}, /*init_data=*/nullptr,
/*init_data_size=*/0,
/*builtin_data=*/builtin_data,
/*registration=*/&reg_add1),
kTfLiteOk);
std::vector<int> dims = {1};
TfLiteQuantization quantization;
quantization.type = kTfLiteNoQuantization;
EXPECT_EQ(interpreter_.SetTensorParametersReadWrite(
0, TfLiteType::kTfLiteUInt8, "t0", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
1, TfLiteType::kTfLiteFloat32, "t1", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
2, TfLiteType::kTfLiteFloat32, "t2", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
3, TfLiteType::kTfLiteFloat32, "t3", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
4, TfLiteType::kTfLiteFloat32, "t4", dims, quantization, false),
kTfLiteOk);
dims.push_back(2);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
5, TfLiteType::kTfLiteFloat32, "t5", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
6, TfLiteType::kTfLiteFloat32, "t6", dims, quantization, false),
kTfLiteOk);
exec_plan()->data[0] = 0;
exec_plan()->data[1] = 1;
exec_plan()->data[2] = 2;
exec_plan()->data[3] = 3;
}
};
Interpreter2Fp32* interpreter2_fp32 = new Interpreter2Fp32();
TEST(ModelBuilderTest, GetOpsToReplaceMultiplePartitions) {
// A graph in which a Dequant node with uint8 input and a Pack node are not
// pruned. As these ops are currently not supported on the GPU, they will be
// scheduled to run on the CPU while the remaining supported Add ops run on
// the GPU.
//
//   t0 (uint8) -> Dequant(0) -> t1 (FP32) -> Add(1) -> t3 (FP32) -> PACK (2)
//                               t2 (FP32) -/           t4 (FP32) -/
//
//   PACK (2) -> t5 (FP32) -> Add(3) -> t7
//               t6 (FP32) -/
//
TfLiteContext* context = interpreter2_fp32->context();
// These functions are meant to be called inside delegates. Swap out
// for similar functions to permit direct calling of GetOpsToReplace.
context->GetExecutionPlan = [](struct TfLiteContext* context,
TfLiteIntArray** execution_plan) {
*execution_plan = interpreter2_fp32->exec_plan();
return kTfLiteOk;
};
context->GetNodeAndRegistration = [](struct TfLiteContext*, int node_index,
TfLiteNode** node,
TfLiteRegistration** registration) {
auto& node_and_reg =
interpreter2_fp32->nodes_and_registration()[node_index];
*node = &node_and_reg.first;
*registration = &node_and_reg.second;
return kTfLiteOk;
};
context->PreviewDelegatePartitioning =
[](struct TfLiteContext* context, const TfLiteIntArray* nodes_to_replace,
TfLiteDelegateParams** partition_params_array, int* num_partitions) {
auto params = interpreter2_fp32->add_delegate_params();
params->nodes_to_replace = TfLiteIntArrayCreate(1);
params->nodes_to_replace->data[0] = 1;
params->input_tensors = TfLiteIntArrayCreate(2);
params->input_tensors->data[0] = 1;
params->input_tensors->data[1] = 2;
params->output_tensors = TfLiteIntArrayCreate(1);
params->output_tensors->data[0] = 3;
params = interpreter2_fp32->add_delegate_params();
params->nodes_to_replace = TfLiteIntArrayCreate(1);
params->nodes_to_replace->data[0] = 3;
params->input_tensors = TfLiteIntArrayCreate(2);
params->input_tensors->data[0] = 5;
params->input_tensors->data[1] = 6;
params->output_tensors = TfLiteIntArrayCreate(1);
params->output_tensors->data[0] = 7;
*partition_params_array = interpreter2_fp32->delegate_params();
*num_partitions = interpreter2_fp32->num_delegate_params();
return kTfLiteOk;
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(
context, /*allow_quant_ops=*/false, /*max_delegated_partitions=*/2);
// The Dequant op is not pruned, and the unsupported Pack op splits the two
// supported Add ops into two partitions, both of which are delegated here.
EXPECT_EQ(ops_to_replace->size, 2);
// ADD at index 1.
EXPECT_EQ(1, ops_to_replace->data[0]);
// ADD at index 3.
EXPECT_EQ(3, ops_to_replace->data[1]);
TfLiteIntArrayFree(ops_to_replace);
}
class InterpreterMultiNode : public DelegatedInterpreter {
public:
explicit InterpreterMultiNode(bool add_op_first = true)

View File

@@ -70,17 +70,25 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
class Delegate {
public:
explicit Delegate(const TfLiteGpuDelegateOptionsV2* options) {
explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
: num_delegate_kernels_(0) {
options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
if (options_.max_delegated_partitions <= 0) {
options_.max_delegated_partitions = 1;
}
}
TfLiteDelegate* tflite_delegate() { return &delegate_; }
const TfLiteGpuDelegateOptionsV2& options() const { return options_; }
bool IsQuantOpsAllowed() {
bool IsQuantOpsAllowed() const {
return options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
}
int MaxDelegatedPartitions() const {
return options_.max_delegated_partitions;
}
int num_delegate_kernels() const { return num_delegate_kernels_; }
private:
TfLiteDelegate delegate_ = {
@@ -93,13 +101,18 @@ class Delegate {
};
TfLiteGpuDelegateOptionsV2 options_;
int num_delegate_kernels_ = 0;
friend class DelegateKernel;
};
// Represent the execution of a subset of nodes on GPU.
class DelegateKernel {
public:
explicit DelegateKernel(const TfLiteGpuDelegateOptionsV2& options)
: options_(options) {}
explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
++delegate_->num_delegate_kernels_;
}
~DelegateKernel() { --delegate_->num_delegate_kernels_; }
absl::Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
@@ -115,11 +128,11 @@ class DelegateKernel {
std::unique_ptr<InferenceBuilder> builder;
bool graph_is_destroyed;
if (options_.experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
const int experimental_flags = delegate_->options().experimental_flags;
if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
RETURN_IF_ERROR(
InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
} else if (options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
} else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
} else {
// By default, we try CL first & fall back to GL if that fails.
@@ -241,8 +254,7 @@ class DelegateKernel {
std::vector<uint32_t>* input_refs,
std::vector<uint32_t>* output_refs) {
quant_conversion_map_.clear();
if (options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT) {
if (delegate_->IsQuantOpsAllowed()) {
RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
&quant_conversion_map_));
} else {
@@ -337,22 +349,23 @@ class DelegateKernel {
cl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
&properties));
auto delegate_options = delegate_->options();
cl::InferenceOptions options;
// If is_precision_loss_allowed == -1, then just use priorities instead
// of paying attention to is_precision_loss_allowed value.
if (options_.is_precision_loss_allowed == -1) {
options.priority1 = ToPriority(options_.inference_priority1);
options.priority2 = ToPriority(options_.inference_priority2);
options.priority3 = ToPriority(options_.inference_priority3);
if (delegate_options.is_precision_loss_allowed == -1) {
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
} else {
// Users set is_precision_loss_allowed explicitly, thus use it explicitly.
if (options_.is_precision_loss_allowed == 0) {
if (delegate_options.is_precision_loss_allowed == 0) {
options.priority1 = InferencePriority::MAX_PRECISION;
} else {
options.priority1 = InferencePriority::MIN_LATENCY;
}
}
options.usage = ToUsage(options_.inference_preference);
options.usage = ToUsage(delegate_options.inference_preference);
*graph_is_destroyed = true;
RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
options, std::move(*graph), builder));
@@ -367,11 +380,12 @@ class DelegateKernel {
gl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(
NewInferenceEnvironment(env_options, &gl_environment_, &properties));
auto delegate_options = delegate_->options();
gl::InferenceOptions options;
options.usage = ToUsage(options_.inference_preference);
options.priority1 = ToPriority(options_.inference_priority1);
options.priority2 = ToPriority(options_.inference_priority2);
options.priority3 = ToPriority(options_.inference_priority3);
options.usage = ToUsage(delegate_options.inference_preference);
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
options, builder));
enforce_same_thread_ = true;
@@ -380,9 +394,8 @@ class DelegateKernel {
return absl::OkStatus();
}
// Shared across all DelegateKernel instances, passed by the Delegate
// instance.
const TfLiteGpuDelegateOptionsV2& options_;
// The Delegate instance that's shared across all DelegateKernel instances.
Delegate* const delegate_; // doesn't own the memory.
std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
std::unique_ptr<InferenceRunner> runner_;
@@ -414,7 +427,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
// Everything below should happen in prepare function call, but TFLite
// for whatever reason forbids that.
auto gpu_delegate_kernel =
absl::make_unique<DelegateKernel>(gpu_delegate->options());
absl::make_unique<DelegateKernel>(gpu_delegate);
const auto status = gpu_delegate_kernel->Prepare(context, params);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Init: %s",
@@ -463,10 +476,15 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
"TfLiteGpuDelegateV2", // .custom_name
1, // .version
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(
context, /*allow_quant_ops=*/GetDelegate(delegate)->IsQuantOpsAllowed());
auto* gpu_delegate = GetDelegate(delegate);
TfLiteIntArray* ops_to_replace =
GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
gpu_delegate->MaxDelegatedPartitions());
const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
context, kRegistration, ops_to_replace, delegate);
TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
gpu_delegate->num_delegate_kernels());
TfLiteIntArrayFree(ops_to_replace);
return status;
}
@@ -476,15 +494,17 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
} // namespace tflite
TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
TfLiteGpuDelegateOptionsV2 options;
// set it to -1 to detect whether it was later adjusted.
options.is_precision_loss_allowed = -1;
options.inference_preference =
TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;
TfLiteGpuDelegateOptionsV2 options = {
// set it to -1 to detect whether it was later adjusted.
.is_precision_loss_allowed = -1,
.inference_preference =
TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
.max_delegated_partitions = 1,
};
return options;
}

View File

@@ -109,6 +109,11 @@ typedef struct {
// Bitmask flags. See the comments in TfLiteGpuExperimentalFlags.
int64_t experimental_flags;
// A graph could have multiple partitions that can be delegated to the GPU.
// This limits the maximum number of partitions to be delegated. By default,
// it's set to 1 in TfLiteGpuDelegateOptionsV2Default().
int32_t max_delegated_partitions;
} TfLiteGpuDelegateOptionsV2;
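As a usage note (a brief sketch, not normative documentation): callers only need to touch this field when they want more than one partition, since the Delegate constructor shown earlier treats non-positive values as 1:

TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
// options.max_delegated_partitions is 1 at this point; a non-positive value
// would also be treated as 1 by the delegate.
options.max_delegated_partitions = 3;  // delegate at most the 3 largest partitions.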
// Populates TfLiteGpuDelegateOptionsV2 as follows:

View File

@@ -129,6 +129,8 @@ TfLiteDelegatePtr GpuDelegateProvider::CreateTfLiteDelegate(
gpu_opts.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY;
}
}
gpu_opts.max_delegated_partitions =
params.Get<int>("max_delegated_partitions");
delegate = evaluation::CreateGPUDelegate(&gpu_opts);
#elif defined(REAL_IPHONE_DEVICE)
TFLGpuDelegateOptions gpu_opts = {0};