Add an option to the GPU delegate to parameterize the number of partitions to delegate. The default value of this parameter is 1, matching the current behavior.
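For illustration, a minimal sketch of how a client could request more than one delegated partition once this option is available (assumes an already-built tflite::Interpreter named 'interpreter' and the existing TfLiteGpuDelegateV2Create/Delete entry points; not part of this change):

#include "tensorflow/lite/delegates/gpu/delegate.h"

// Start from the defaults populated by TfLiteGpuDelegateOptionsV2Default().
TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
// Allow up to two graph partitions to be replaced with GPU delegate kernels.
options.max_delegated_partitions = 2;
TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
  // Delegation failed; the interpreter keeps running everything on the CPU.
}
// ... run inference ...
TfLiteGpuDelegateV2Delete(delegate);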

PiperOrigin-RevId: 307788075
Change-Id: I26bb65fcf049e82cc46e88f818b07ae245fbb1cc
Chao Mei 2020-04-22 04:10:11 -07:00 committed by TensorFlower Gardener
parent e97559bc40
commit 1912ef16d6
6 changed files with 254 additions and 43 deletions

View File

@@ -2602,7 +2602,8 @@ bool IsAllAllowedTensors(TfLiteContext* context, const TfLiteIntArray* array,
// TODO(impjdi): Check number of input/output tensors and their dimensions.
// TODO(impjdi): Check ops' parameters.
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops,
int max_delegated_partitions) {
delegates::IsNodeSupportedFn node_supported_fn =
[=](TfLiteContext* context, TfLiteNode* node,
TfLiteRegistration* registration,
@@ -2633,11 +2634,11 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
return TfLiteIntArrayCreate(0);
}
// We simply get 1st largest partition, but we could later explore whether
// getting more partitions could lead to better performance, i.e. by
// parameterizing '1' here.
// We simply get the single largest partition here, as
// 'max_delegated_partitions' is set to 1 by default.
std::vector<int> ops_to_replace =
partition_helper.GetNodesOfFirstNLargestPartitions(1);
partition_helper.GetNodesOfFirstNLargestPartitions(
max_delegated_partitions);
if (!unsupported_nodes_info.empty()) {
std::string unsupported = absl::StrJoin(unsupported_nodes_info, "\n");
@@ -2647,9 +2648,7 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context, bool allow_quant_ops) {
if (!ops_to_replace.empty()) {
absl::StrAppend(
&error_message, ops_to_replace.size(),
" operations will run on the GPU (first node: ",
ops_to_replace.front(), ", last node: ", ops_to_replace.back(),
"), and the remaining ",
" operations will run on the GPU, and the remaining ",
partition_helper.num_total_nodes() - ops_to_replace.size());
} else {
absl::StrAppend(&error_message,

View File

@@ -29,8 +29,12 @@ namespace gpu {
// Validates which operations are supported and returns array of operations to
// replace with GPU kernels. The caller must free the pointer on TfLiteIntArray.
// 'max_delegated_partitions' sets the maximum number of partitions to
// delegate, as a graph could have multiple partitions (each consisting of a
// subset of supported ops) that could be replaced.
TfLiteIntArray* GetOpsToReplace(TfLiteContext* context,
bool allow_quant_ops = false);
bool allow_quant_ops = false,
int max_delegated_partitions = 1);
// Extracts TFLite delegate execution plan from the input TFLite context and
// converts it into generic graph format.
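To make the GetNodesOfFirstNLargestPartitions selection used above concrete, here is a small standalone sketch (a hypothetical helper for illustration, not the GraphPartitionHelper API itself) showing how picking the N largest supported partitions generalizes the old pick-the-single-largest behavior:

#include <algorithm>
#include <vector>

// Given the node ids of each supported partition, return the node ids of the
// 'n' largest partitions; n == 1 reproduces the previous default behavior.
std::vector<int> NodesOfFirstNLargestPartitions(
    std::vector<std::vector<int>> partitions, int n) {
  // Order partitions by size, largest first.
  std::sort(partitions.begin(), partitions.end(),
            [](const std::vector<int>& a, const std::vector<int>& b) {
              return a.size() > b.size();
            });
  std::vector<int> nodes;
  const int limit = std::min(n, static_cast<int>(partitions.size()));
  for (int i = 0; i < limit; ++i) {
    nodes.insert(nodes.end(), partitions[i].begin(), partitions[i].end());
  }
  return nodes;
}

// Example: NodesOfFirstNLargestPartitions({{0, 1}, {3}, {4, 5, 6}}, 2) returns
// {4, 5, 6, 0, 1}; with n == 1 it returns only {4, 5, 6}.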

View File

@@ -502,6 +502,187 @@ TEST(ModelBuilderTest, GetOpsToReplaceDoesNotPruneUint8) {
TfLiteIntArrayFree(ops_to_replace);
}
class Interpreter2Fp32 : public DelegatedInterpreter {
public:
Interpreter2Fp32() : DelegatedInterpreter(4) {
void* builtin_data = malloc(sizeof(int));
EXPECT_EQ(interpreter_.AddTensors(8), kTfLiteOk);
EXPECT_EQ(interpreter_.SetInputs({0, 2, 4, 6}), kTfLiteOk);
EXPECT_EQ(interpreter_.SetOutputs({7}), kTfLiteOk);
// Add a Dequantize Node with uint8 input.
const TfLiteRegistration reg_dequant = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/nullptr,
/*profiling_string=*/nullptr,
kTfLiteBuiltinDequantize};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{0}, /*outputs=*/{1}, /*init_data=*/nullptr,
/*init_data_size=*/0, /*builtin_data=*/nullptr,
/*registration=*/&reg_dequant),
kTfLiteOk);
// Add an ADD node that GPU delegate can parse.
const TfLiteRegistration reg_add0 = {
[](TfLiteContext* context, const char* buffer, size_t length) {
return reinterpret_cast<void*>(new int(1));
},
[](TfLiteContext* context, void* buffer) {
delete reinterpret_cast<int*>(buffer);
},
nullptr,
nullptr,
nullptr,
kTfLiteBuiltinAdd};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{1, 2}, /*outputs=*/{3}, /*init_data=*/nullptr,
/*init_data_size=*/0,
/*builtin_data=*/builtin_data,
/*registration=*/&reg_add0),
kTfLiteOk);
// Add a Pack Node that GPU delegate doesn't support
const TfLiteRegistration reg_pack = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/nullptr,
/*profiling_string=*/nullptr,
kTfLiteBuiltinPack};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{3, 4}, /*outputs=*/{5}, /*init_data=*/nullptr,
/*init_data_size=*/0, /*builtin_data=*/nullptr,
/*registration=*/&reg_pack),
kTfLiteOk);
const TfLiteRegistration reg_add1 = {
[](TfLiteContext* context, const char* buffer, size_t length) {
return reinterpret_cast<void*>(new int[2]);
},
[](TfLiteContext* context, void* buffer) {
delete reinterpret_cast<int*>(buffer);
},
nullptr,
nullptr,
nullptr,
kTfLiteBuiltinAdd};
EXPECT_EQ(interpreter_.AddNodeWithParameters(
/*inputs=*/{5, 6}, /*outputs=*/{7}, /*init_data=*/nullptr,
/*init_data_size=*/0,
/*builtin_data=*/builtin_data,
/*registration=*/&reg_add1),
kTfLiteOk);
std::vector<int> dims = {1};
TfLiteQuantization quantization;
quantization.type = kTfLiteNoQuantization;
EXPECT_EQ(interpreter_.SetTensorParametersReadWrite(
0, TfLiteType::kTfLiteUInt8, "t0", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
1, TfLiteType::kTfLiteFloat32, "t1", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
2, TfLiteType::kTfLiteFloat32, "t2", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
3, TfLiteType::kTfLiteFloat32, "t3", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
4, TfLiteType::kTfLiteFloat32, "t4", dims, quantization, false),
kTfLiteOk);
dims.push_back(2);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
5, TfLiteType::kTfLiteFloat32, "t5", dims, quantization, false),
kTfLiteOk);
EXPECT_EQ(
interpreter_.SetTensorParametersReadWrite(
6, TfLiteType::kTfLiteFloat32, "t6", dims, quantization, false),
kTfLiteOk);
exec_plan()->data[0] = 0;
exec_plan()->data[1] = 1;
exec_plan()->data[2] = 2;
exec_plan()->data[3] = 3;
}
};
Interpreter2Fp32* interpreter2_fp32 = new Interpreter2Fp32();
TEST(ModelBuilderTest, GetOpsToReplaceMultiplePartitions) {
// A graph in which a Dequant node with uint8 input and a Pack node are not
// pruned. As these ops are currently not supported on the GPU, they will be
// scheduled to run on the CPU while the remaining supported Add ops run on
// the GPU.
//
//   t0 (uint8) -> Dequant(0) -> t1 (FP32) -> Add(1) -> t3 (FP32) -> PACK (2)
//                               t2 (FP32) -/           t4 (FP32) -/
//
//   PACK (2) -> t5 (FP32) -> Add(3) -> t7
//               t6 (FP32) -/
//
TfLiteContext* context = interpreter2_fp32->context();
// These functions are meant to be called inside delegates. Swap out
// for similar functions to permit direct calling of GetOpsToReplace.
context->GetExecutionPlan = [](struct TfLiteContext* context,
TfLiteIntArray** execution_plan) {
*execution_plan = interpreter2_fp32->exec_plan();
return kTfLiteOk;
};
context->GetNodeAndRegistration = [](struct TfLiteContext*, int node_index,
TfLiteNode** node,
TfLiteRegistration** registration) {
auto& node_and_reg =
interpreter2_fp32->nodes_and_registration()[node_index];
*node = &node_and_reg.first;
*registration = &node_and_reg.second;
return kTfLiteOk;
};
context->PreviewDelegatePartitioning =
[](struct TfLiteContext* context, const TfLiteIntArray* nodes_to_replace,
TfLiteDelegateParams** partition_params_array, int* num_partitions) {
auto params = interpreter2_fp32->add_delegate_params();
params->nodes_to_replace = TfLiteIntArrayCreate(1);
params->nodes_to_replace->data[0] = 1;
params->input_tensors = TfLiteIntArrayCreate(2);
params->input_tensors->data[0] = 1;
params->input_tensors->data[1] = 2;
params->output_tensors = TfLiteIntArrayCreate(1);
params->output_tensors->data[0] = 3;
params = interpreter2_fp32->add_delegate_params();
params->nodes_to_replace = TfLiteIntArrayCreate(1);
params->nodes_to_replace->data[0] = 3;
params->input_tensors = TfLiteIntArrayCreate(2);
params->input_tensors->data[0] = 5;
params->input_tensors->data[1] = 6;
params->output_tensors = TfLiteIntArrayCreate(1);
params->output_tensors->data[0] = 7;
*partition_params_array = interpreter2_fp32->delegate_params();
*num_partitions = interpreter2_fp32->num_delegate_params();
return kTfLiteOk;
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(
context, /*allow_quant_ops=*/false, /*max_delegated_partitions=*/2);
// The Dequant op is not pruned, and the unsupported Pack op splits the two
// supported Add ops into two partitions, both of which are delegated here.
EXPECT_EQ(ops_to_replace->size, 2);
// ADD at index 1.
EXPECT_EQ(1, ops_to_replace->data[0]);
// ADD at index 3.
EXPECT_EQ(3, ops_to_replace->data[1]);
TfLiteIntArrayFree(ops_to_replace);
}
class InterpreterMultiNode : public DelegatedInterpreter {
public:
explicit InterpreterMultiNode(bool add_op_first = true)

View File

@@ -70,17 +70,25 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
class Delegate {
public:
explicit Delegate(const TfLiteGpuDelegateOptionsV2* options) {
explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
: num_delegate_kernels_(0) {
options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
if (options_.max_delegated_partitions <= 0) {
options_.max_delegated_partitions = 1;
}
}
TfLiteDelegate* tflite_delegate() { return &delegate_; }
const TfLiteGpuDelegateOptionsV2& options() const { return options_; }
bool IsQuantOpsAllowed() {
bool IsQuantOpsAllowed() const {
return options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
}
int MaxDelegatedPartitions() const {
return options_.max_delegated_partitions;
}
int num_delegate_kernels() const { return num_delegate_kernels_; }
private:
TfLiteDelegate delegate_ = {
@@ -93,13 +101,18 @@ class Delegate {
};
TfLiteGpuDelegateOptionsV2 options_;
int num_delegate_kernels_ = 0;
friend class DelegateKernel;
};
// Represent the execution of a subset of nodes on GPU.
class DelegateKernel {
public:
explicit DelegateKernel(const TfLiteGpuDelegateOptionsV2& options)
: options_(options) {}
explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
++delegate_->num_delegate_kernels_;
}
~DelegateKernel() { --delegate_->num_delegate_kernels_; }
absl::Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
@@ -115,11 +128,11 @@ class DelegateKernel {
std::unique_ptr<InferenceBuilder> builder;
bool graph_is_destroyed;
if (options_.experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
const int experimental_flags = delegate_->options().experimental_flags;
if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
RETURN_IF_ERROR(
InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
} else if (options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
} else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
} else {
// By default, we try CL first & fall back to GL if that fails.
@@ -241,8 +254,7 @@ class DelegateKernel {
std::vector<uint32_t>* input_refs,
std::vector<uint32_t>* output_refs) {
quant_conversion_map_.clear();
if (options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT) {
if (delegate_->IsQuantOpsAllowed()) {
RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
&quant_conversion_map_));
} else {
@@ -337,22 +349,23 @@ class DelegateKernel {
cl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
&properties));
auto delegate_options = delegate_->options();
cl::InferenceOptions options;
// If is_precision_loss_allowed == -1, then just use priorities instead
// of paying attention to is_precision_loss_allowed value.
if (options_.is_precision_loss_allowed == -1) {
options.priority1 = ToPriority(options_.inference_priority1);
options.priority2 = ToPriority(options_.inference_priority2);
options.priority3 = ToPriority(options_.inference_priority3);
if (delegate_options.is_precision_loss_allowed == -1) {
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
} else {
// Users set is_precision_loss_allowed explicitly, thus use it explicitly.
if (options_.is_precision_loss_allowed == 0) {
if (delegate_options.is_precision_loss_allowed == 0) {
options.priority1 = InferencePriority::MAX_PRECISION;
} else {
options.priority1 = InferencePriority::MIN_LATENCY;
}
}
options.usage = ToUsage(options_.inference_preference);
options.usage = ToUsage(delegate_options.inference_preference);
*graph_is_destroyed = true;
RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
options, std::move(*graph), builder));
@@ -367,11 +380,12 @@ class DelegateKernel {
gl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(
NewInferenceEnvironment(env_options, &gl_environment_, &properties));
auto delegate_options = delegate_->options();
gl::InferenceOptions options;
options.usage = ToUsage(options_.inference_preference);
options.priority1 = ToPriority(options_.inference_priority1);
options.priority2 = ToPriority(options_.inference_priority2);
options.priority3 = ToPriority(options_.inference_priority3);
options.usage = ToUsage(delegate_options.inference_preference);
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
options, builder));
enforce_same_thread_ = true;
@@ -380,9 +394,8 @@ class DelegateKernel {
return absl::OkStatus();
}
// Shared across all DelegateKernel instances, passed by the Delegate
// instance.
const TfLiteGpuDelegateOptionsV2& options_;
// The Delegate instance that's shared across all DelegateKernel instances.
Delegate* const delegate_; // doesn't own the memory.
std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
std::unique_ptr<InferenceRunner> runner_;
@@ -414,7 +427,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
// Everything below should happen in prepare function call, but TFLite
// for whatever reason forbids that.
auto gpu_delegate_kernel =
absl::make_unique<DelegateKernel>(gpu_delegate->options());
absl::make_unique<DelegateKernel>(gpu_delegate);
const auto status = gpu_delegate_kernel->Prepare(context, params);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Init: %s",
@@ -463,10 +476,15 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
"TfLiteGpuDelegateV2", // .custom_name
1, // .version
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(
context, /*allow_quant_ops=*/GetDelegate(delegate)->IsQuantOpsAllowed());
auto* gpu_delegate = GetDelegate(delegate);
TfLiteIntArray* ops_to_replace =
GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
gpu_delegate->MaxDelegatedPartitions());
const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
context, kRegistration, ops_to_replace, delegate);
TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
gpu_delegate->num_delegate_kernels());
TfLiteIntArrayFree(ops_to_replace);
return status;
}
@@ -476,15 +494,17 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
} // namespace tflite
TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
TfLiteGpuDelegateOptionsV2 options;
// set it to -1 to detect whether it was later adjusted.
options.is_precision_loss_allowed = -1;
options.inference_preference =
TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE;
TfLiteGpuDelegateOptionsV2 options = {
// set it to -1 to detect whether it was later adjusted.
.is_precision_loss_allowed = -1,
.inference_preference =
TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE,
.max_delegated_partitions = 1,
};
return options;
}

View File

@@ -109,6 +109,11 @@ typedef struct {
// Bitmask flags. See the comments in TfLiteGpuExperimentalFlags.
int64_t experimental_flags;
// A graph could have multiple partitions that can be delegated to the GPU.
// This limits the maximum number of partitions to be delegated. By default,
// it's set to 1 in TfLiteGpuDelegateOptionsV2Default().
int32_t max_delegated_partitions;
} TfLiteGpuDelegateOptionsV2;
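As a usage note (a brief sketch, not normative documentation): callers only need to touch this field when they want more than one partition, since the Delegate constructor shown earlier treats non-positive values as 1:

TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
// options.max_delegated_partitions is 1 at this point; a non-positive value
// would also be treated as 1 by the delegate.
options.max_delegated_partitions = 3;  // delegate at most the 3 largest partitions.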
// Populates TfLiteGpuDelegateOptionsV2 as follows:

View File

@@ -129,6 +129,8 @@ TfLiteDelegatePtr GpuDelegateProvider::CreateTfLiteDelegate(
gpu_opts.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY;
}
}
gpu_opts.max_delegated_partitions =
params.Get<int>("max_delegated_partitions");
delegate = evaluation::CreateGPUDelegate(&gpu_opts);
#elif defined(REAL_IPHONE_DEVICE)
TFLGpuDelegateOptions gpu_opts = {0};