From dab856a93fdecb88880e08bc94928f3e0f141cf9 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 3 Aug 2020 17:10:40 -0700 Subject: [PATCH] Removed ElementwiseOperation. Simplified ex-ElementwiseOperations, ReLU, PReLU, etc. PiperOrigin-RevId: 324715510 Change-Id: I3d98cdbcc8075bb91f20e065b0aca2ab16a4e8e5 --- .../delegates/gpu/cl/inference_context.cc | 8 +- .../lite/delegates/gpu/cl/kernels/add.cc | 43 ++-- .../lite/delegates/gpu/cl/kernels/add.h | 22 +- .../lite/delegates/gpu/cl/kernels/add_test.cc | 6 +- .../delegates/gpu/cl/kernels/elementwise.cc | 215 ++++++------------ .../delegates/gpu/cl/kernels/elementwise.h | 91 ++------ .../gpu/cl/kernels/elementwise_test.cc | 62 +++-- .../delegates/gpu/cl/kernels/gpu_operation.cc | 158 +++++++------ .../delegates/gpu/cl/kernels/gpu_operation.h | 76 ++----- .../lite/delegates/gpu/cl/kernels/prelu.cc | 46 ++-- .../lite/delegates/gpu/cl/kernels/prelu.h | 41 +--- .../delegates/gpu/cl/kernels/prelu_test.cc | 4 +- .../gpu/cl/kernels/quantize_and_dequantize.cc | 66 ++---- .../gpu/cl/kernels/quantize_and_dequantize.h | 38 +--- .../kernels/quantize_and_dequantize_test.cc | 20 +- .../lite/delegates/gpu/cl/kernels/relu.cc | 42 ++-- .../lite/delegates/gpu/cl/kernels/relu.h | 22 +- .../delegates/gpu/cl/kernels/relu_test.cc | 8 +- .../gpu/cl/selectors/operation_selector.cc | 75 +++--- .../gpu/cl/selectors/simple_selectors.cc | 28 ++- .../gpu/cl/selectors/simple_selectors.h | 8 +- 21 files changed, 368 insertions(+), 711 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc index 3067c81ec94..8e23eb1bcee 100644 --- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc +++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc @@ -390,9 +390,7 @@ void InferenceContext::Merge() { continue; } auto& linkable_node = nodes_[next_nodes[0]]; - auto* elementwise = - dynamic_cast(linkable_node.operations[0].get()); - if (!elementwise || !elementwise->IsLinkable() || + if (!linkable_node.operations[0]->IsLinkable() || linkable_node.outputs.size() != 1 || !IsReady(ready_tensors, linkable_node)) { continue; @@ -410,9 +408,7 @@ void InferenceContext::Merge() { } for (auto& node : nodes_) { for (int j = 1; j < node.operations.size(); ++j) { - auto* elementwise = - dynamic_cast(node.operations[j].get()); - node.operations[0]->AddOperation(elementwise); + node.operations[0]->AddOperation(node.operations[j].get()); } } } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add.cc b/tensorflow/lite/delegates/gpu/cl/kernels/add.cc index 1d09e39b83b..1cb41e79d88 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/add.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/add.cc @@ -25,42 +25,29 @@ namespace tflite { namespace gpu { namespace cl { -Add::Add(const OperationDef& definition, const std::vector& channels, - int dst_channels) - : ElementwiseOperation(definition) { +GPUOperation CreateAdd(const OperationDef& definition, + const std::vector& channels, int dst_channels) { + GPUOperation add(definition); int dst_depth = DivideRoundUp(dst_channels, 4); int src0_depth = DivideRoundUp(channels[0], 4); - linkable_ = dst_depth == src0_depth; + add.elementwise_ = true; + add.linkable_ = dst_depth == src0_depth; if (src0_depth < dst_depth) { - check_src_channels_size_ = true; + add.check_src_channels_size_ = true; } - for (int i = 1; i < definition_.src_tensors.size(); ++i) { + for (int i = 1; i < definition.src_tensors.size(); ++i) { const std::string tensor_name = absl::StrCat("src_data_", i); - auto src_desc = definition_.src_tensors[i]; - if (definition_.IsBatchSupported()) { + auto src_desc = definition.src_tensors[i]; + if (definition.IsBatchSupported()) { src_desc.SetStateVar("BatchedWidth", "true"); } - AddSrcTensor(tensor_name, src_desc); - code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n"; - code_ += " in_out_value += args." + tensor_name + - ".Read(X_COORD, Y_COORD, S_COORD);\n"; - code_ += "}\n"; + add.AddSrcTensor(tensor_name, src_desc); + add.code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n"; + add.code_ += " in_out_value += args." + tensor_name + + ".Read(X_COORD, Y_COORD, S_COORD);\n"; + add.code_ += "}\n"; } -} - -Add::Add(Add&& operation) : ElementwiseOperation(std::move(operation)) {} - -Add& Add::operator=(Add&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -Add CreateAdd(const OperationDef& definition, const std::vector& channels, - int dst_channels) { - Add operation(definition, channels, dst_channels); - return operation; + return add; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add.h b/tensorflow/lite/delegates/gpu/cl/kernels/add.h index 81b2fed116f..0e9d7e0d333 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/add.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/add.h @@ -27,24 +27,10 @@ namespace tflite { namespace gpu { namespace cl { -// Add operation inherited from ElementwiseOperation, but it is more -// complicated than usual elementwise, that is why it has own versions for -// Compile. Add operation support not equal tensors on input (for possibility to -// remove Padding operation with zeroes in Z dimension) -class Add : public ElementwiseOperation { - public: - Add(const OperationDef& definition, const std::vector& channels, - int dst_channels); - - // Move only - Add(Add&& operation); - Add& operator=(Add&& operation); - Add(const Add&) = delete; - Add& operator=(const Add&) = delete; -}; - -Add CreateAdd(const OperationDef& definition, const std::vector& channels, - int dst_channels); +// Add operation supports not equal tensors on input (for possibility to +// remove Padding operation with zeroes in channels dimension) +GPUOperation CreateAdd(const OperationDef& definition, + const std::vector& channels, int dst_channels); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc index 1eccab87646..2856b37a497 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc @@ -49,7 +49,7 @@ TEST_F(OpenCLOperationTest, AddTwoEqualTensors) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - Add operation = CreateAdd(op_def, channels, channels[0]); + GPUOperation operation = CreateAdd(op_def, channels, channels[0]); ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -77,7 +77,7 @@ TEST_F(OpenCLOperationTest, AddFirstTensorHasMoreChannelsThanSecond) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - Add operation = CreateAdd(op_def, channels, channels[0]); + GPUOperation operation = CreateAdd(op_def, channels, channels[0]); ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation, BHWC(1, 2, 1, 6), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -107,7 +107,7 @@ TEST_F(OpenCLOperationTest, AddFirstTensorHasLessChannelsThanSecond) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - Add operation = CreateAdd(op_def, channels, 6); + GPUOperation operation = CreateAdd(op_def, channels, 6); ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation, BHWC(1, 2, 1, 6), &dst_tensor)); EXPECT_THAT(dst_tensor.data, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index 21866021e91..063b15c1b69 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -134,128 +134,33 @@ std::string GetTwoInputCode(const OperationType& op_type, } } // namespace -ElementwiseOneInput::ElementwiseOneInput(const OperationDef& definition, - const OperationType& op_type) - : ElementwiseOperation(definition) { - code_ = GetOneInputCode(op_type, definition.precision, "in_out_value"); +GPUOperation CreateElementwiseOneInput(const OperationDef& definition, + const OperationType& op_type) { + GPUOperation op(definition); + op.elementwise_ = true; + op.code_ = GetOneInputCode(op_type, definition.precision, "in_out_value"); + return op; } -ElementwiseOneInput::ElementwiseOneInput(ElementwiseOneInput&& operation) - : ElementwiseOperation(std::move(operation)) {} - -ElementwiseOneInput& ElementwiseOneInput::operator=( - ElementwiseOneInput&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition, - const OperationType& op_type) { - ElementwiseOneInput operation(definition, op_type); - return operation; -} - -ElementwiseOneRuntimeOneScalar::ElementwiseOneRuntimeOneScalar( - const OperationDef& definition, const OperationType& op_type, - float scalar_parameter, CalculationsPrecision scalar_precision) - : ElementwiseOperation(definition) { - if (definition.precision == CalculationsPrecision::F32) { - args_.AddFloat("scalar", scalar_parameter); - } else { - args_.AddHalf("scalar", half(scalar_parameter)); - } - code_ = GetTwoInputCode(op_type, "in_out_value", "args.scalar"); -} - -ElementwiseOneRuntimeOneScalar::ElementwiseOneRuntimeOneScalar( - ElementwiseOneRuntimeOneScalar&& operation) - : ElementwiseOperation(std::move(operation)) {} - -ElementwiseOneRuntimeOneScalar& ElementwiseOneRuntimeOneScalar::operator=( - ElementwiseOneRuntimeOneScalar&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar( +GPUOperation CreateElementwiseOneRuntimeOneScalar( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, float scalar_parameter) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - ElementwiseOneRuntimeOneScalar operation(definition, op_type, - scalar_parameter, scalar_precision); - return operation; -} - -ElementwiseTwoInput::ElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BroadcastSettings& broadcast) - : ElementwiseOperation(definition), - broadcast_(broadcast) { - auto src_desc = definition.src_tensors[1]; - if (definition.IsBatchSupported()) { - src_desc.SetStateVar("BatchedWidth", "true"); + GPUOperation op(definition); + op.elementwise_ = true; + if (definition.precision == CalculationsPrecision::F32) { + op.args_.AddFloat("scalar", scalar_parameter); + } else { + op.args_.AddHalf("scalar", half(scalar_parameter)); } - AddSrcTensor("second_tensor", src_desc); - const std::string x_coord = broadcast.width ? "0" : "X_COORD"; - const std::string y_coord = broadcast.height ? "0" : "Y_COORD"; - const std::string s_coord = broadcast.channels ? "0" : "S_COORD"; - code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord, - ", ", y_coord, ", ", s_coord, ");\n"); - if (broadcast.channels) { - code_ += " second_val.y = second_val.x;\n"; - code_ += " second_val.z = second_val.x;\n"; - code_ += " second_val.w = second_val.x;\n"; - } - code_ += GetTwoInputCode(op_type, "in_out_value", "second_val"); -} - -ElementwiseTwoInput::ElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BroadcastSettings& broadcast, - Tensor&& constant_tensor) - : ElementwiseOperation(definition), - broadcast_(broadcast) { - auto descriptor = constant_tensor.GetDescriptor(); - args_.AddObject("second_tensor", AccessType::READ, - absl::make_unique(std::move(constant_tensor)), - absl::make_unique(descriptor)); - const std::string x_coord = broadcast.width ? "0" : "X_COORD"; - const std::string y_coord = broadcast.height ? "0" : "Y_COORD"; - const std::string s_coord = broadcast.channels ? "0" : "S_COORD"; - code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord, - ", ", y_coord, ", ", s_coord, ");\n"); - if (broadcast.channels) { - code_ += " second_val.y = second_val.x;\n"; - code_ += " second_val.z = second_val.x;\n"; - code_ += " second_val.w = second_val.x;\n"; - } - code_ += GetTwoInputCode(op_type, "in_out_value", "second_val"); -} - -ElementwiseTwoInput::ElementwiseTwoInput(ElementwiseTwoInput&& operation) - : ElementwiseOperation(std::move(operation)), - broadcast_(operation.broadcast_) {} - -ElementwiseTwoInput& ElementwiseTwoInput::operator=( - ElementwiseTwoInput&& operation) { - if (this != &operation) { - broadcast_ = operation.broadcast_; - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; + op.code_ = GetTwoInputCode(op_type, "in_out_value", "args.scalar"); + return op; } absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const tflite::gpu::Tensor& constant_tensor, - ElementwiseTwoInput* result) { + GPUOperation* result) { const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v); TensorStorageType storage_type = SelectBestStorageType(*creation_context.context, *creation_context.device, @@ -268,12 +173,21 @@ absl::Status CreateElementwiseTwoInput( &gpu_tensor)); RETURN_IF_ERROR( gpu_tensor.WriteData(creation_context.queue, constant_tensor)); - BroadcastSettings broadcast; - broadcast.width = true; - broadcast.height = true; - broadcast.channels = shape.c == 1; - *result = ElementwiseTwoInput(definition, op_type, broadcast, - std::move(gpu_tensor)); + + *result = GPUOperation(definition); + result->elementwise_ = true; + result->args_.AddObject("second_tensor", AccessType::READ, + absl::make_unique(std::move(gpu_tensor)), + absl::make_unique(desc)); + const std::string s_coord = shape.c == 1 ? "0" : "S_COORD"; + result->code_ = absl::StrCat( + "FLT4 second_val = args.second_tensor.Read(0, 0, ", s_coord, ");\n"); + if (shape.c == 1) { + result->code_ += " second_val.y = second_val.x;\n"; + result->code_ += " second_val.z = second_val.x;\n"; + result->code_ += " second_val.w = second_val.x;\n"; + } + result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val"); return absl::OkStatus(); } @@ -281,7 +195,7 @@ absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const tflite::gpu::Tensor& constant_tensor, - ElementwiseTwoInput* result) { + GPUOperation* result) { const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w, constant_tensor.shape.c); TensorStorageType storage_type = @@ -295,34 +209,49 @@ absl::Status CreateElementwiseTwoInput( &gpu_tensor)); RETURN_IF_ERROR( gpu_tensor.WriteData(creation_context.queue, constant_tensor)); - BroadcastSettings broadcast; - broadcast.width = shape.w == 1; - broadcast.height = shape.h == 1; - broadcast.channels = shape.c == 1; - *result = ElementwiseTwoInput(definition, op_type, broadcast, - std::move(gpu_tensor)); + + *result = GPUOperation(definition); + result->elementwise_ = true; + result->args_.AddObject("second_tensor", AccessType::READ, + absl::make_unique(std::move(gpu_tensor)), + absl::make_unique(desc)); + const std::string x_coord = shape.w == 1 ? "0" : "X_COORD"; + const std::string y_coord = shape.h == 1 ? "0" : "Y_COORD"; + const std::string s_coord = shape.c == 1 ? "0" : "S_COORD"; + result->code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", + x_coord, ", ", y_coord, ", ", s_coord, ");\n"); + if (shape.c == 1) { + result->code_ += " second_val.y = second_val.x;\n"; + result->code_ += " second_val.z = second_val.x;\n"; + result->code_ += " second_val.w = second_val.x;\n"; + } + result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val"); + return absl::OkStatus(); } -ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BHWC& shape) { - BroadcastSettings broadcast; - broadcast.width = shape.w == 1; - broadcast.height = shape.h == 1; - broadcast.channels = shape.c == 1; - ElementwiseTwoInput operation(definition, op_type, broadcast); - return operation; -} - -ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type) { - BroadcastSettings broadcast; - broadcast.width = false; - broadcast.height = false; - broadcast.channels = false; - ElementwiseTwoInput operation(definition, op_type, broadcast); - return operation; +GPUOperation CreateElementwiseTwoInput(const OperationDef& definition, + const OperationType& op_type, + const BHWC& shape) { + GPUOperation op(definition); + op.elementwise_ = true; + auto src_desc = definition.src_tensors[1]; + if (definition.IsBatchSupported()) { + src_desc.SetStateVar("BatchedWidth", "true"); + } + op.AddSrcTensor("second_tensor", src_desc); + const std::string x_coord = shape.w == 1 ? "0" : "X_COORD"; + const std::string y_coord = shape.h == 1 ? "0" : "Y_COORD"; + const std::string s_coord = shape.c == 1 ? "0" : "S_COORD"; + op.code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord, + ", ", y_coord, ", ", s_coord, ");\n"); + if (shape.c == 1) { + op.code_ += " second_val.y = second_val.x;\n"; + op.code_ += " second_val.z = second_val.x;\n"; + op.code_ += " second_val.w = second_val.x;\n"; + } + op.code_ += GetTwoInputCode(op_type, "in_out_value", "second_val"); + return op; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h index 9712ee96b90..d03d535b39a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h @@ -26,93 +26,38 @@ namespace tflite { namespace gpu { namespace cl { -// Class for simple one input operations without any parameters, for example -// log, sin, cos and etc. -class ElementwiseOneInput : public ElementwiseOperation { - public: - ElementwiseOneInput(const OperationDef& definition, - const OperationType& op_type); +// Creates simple one input operation without any parameters, for example +// log, sin, cos, etc. +GPUOperation CreateElementwiseOneInput(const OperationDef& definition, + const OperationType& op_type); - // Move only - ElementwiseOneInput(ElementwiseOneInput&& operation); - ElementwiseOneInput& operator=(ElementwiseOneInput&& operation); - ElementwiseOneInput(const ElementwiseOneInput&) = delete; - ElementwiseOneInput& operator=(const ElementwiseOneInput&) = delete; -}; - -ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition, - const OperationType& op_type); - -// Class for simple two input (first input is runtime tensor and second input is -// scalar argument) operations without any parameters, for example sub, div and -// etc. -class ElementwiseOneRuntimeOneScalar : public ElementwiseOperation { - public: - ElementwiseOneRuntimeOneScalar(const OperationDef& definition, - const OperationType& op_type, - float scalar_parameter, - CalculationsPrecision scalar_precision); - - // Move only - ElementwiseOneRuntimeOneScalar(ElementwiseOneRuntimeOneScalar&& operation); - ElementwiseOneRuntimeOneScalar& operator=( - ElementwiseOneRuntimeOneScalar&& operation); - ElementwiseOneRuntimeOneScalar(const ElementwiseOneRuntimeOneScalar&) = - delete; - ElementwiseOneRuntimeOneScalar& operator=( - const ElementwiseOneRuntimeOneScalar&) = delete; -}; - -ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar( +// Creates simple two input (first input is runtime tensor and second input is +// scalar argument) operation, for example sub, div, pow, etc. +GPUOperation CreateElementwiseOneRuntimeOneScalar( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, float scalar_parameter); -struct BroadcastSettings { - bool width; - bool height; - bool channels; -}; - -// Class for simple two input(first input is runtime tensor and second input is -// runtime or constant tensor) operations without any parameters, for example -// sub, div and etc. -class ElementwiseTwoInput : public ElementwiseOperation { - public: - ElementwiseTwoInput() = default; - ElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BroadcastSettings& broadcast); - - ElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BroadcastSettings& broadcast, - Tensor&& constant_tensor); - - // Move only - ElementwiseTwoInput(ElementwiseTwoInput&& operation); - ElementwiseTwoInput& operator=(ElementwiseTwoInput&& operation); - ElementwiseTwoInput(const ElementwiseTwoInput&) = delete; - ElementwiseTwoInput& operator=(const ElementwiseTwoInput&) = delete; - - private: - BroadcastSettings broadcast_; -}; - +// Creates simple two input(first input is runtime tensor and second input is +// constant linear tensor) operation, for example sub, div and etc. absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const tflite::gpu::Tensor& constant_tensor, - ElementwiseTwoInput* result); + GPUOperation* result); +// Creates simple two input(first input is runtime tensor and second input is +// constant HWC tensor) operation, for example sub, div and etc. absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const tflite::gpu::Tensor& constant_tensor, - ElementwiseTwoInput* result); + GPUOperation* result); -ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BHWC& shape); +// Creates simple two input(2 runtime tensors) operation, for example +// sub, div and etc. +GPUOperation CreateElementwiseTwoInput(const OperationDef& definition, + const OperationType& op_type, + const BHWC& shape); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc index ac825c0cdfc..11a651df901 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc @@ -45,7 +45,7 @@ TEST_F(OpenCLOperationTest, Abs) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::ABS); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -70,7 +70,7 @@ TEST_F(OpenCLOperationTest, Cos) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::COS); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -95,7 +95,7 @@ TEST_F(OpenCLOperationTest, Copy) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::COPY); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -118,7 +118,7 @@ TEST_F(OpenCLOperationTest, Elu) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::ELU); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 1, 1, 7), &dst_tensor)); @@ -144,7 +144,7 @@ TEST_F(OpenCLOperationTest, Exp) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::EXP); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 1, 1, 7), &dst_tensor)); @@ -171,7 +171,7 @@ TEST_F(OpenCLOperationTest, HardSwish) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::HARD_SWISH); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, src_tensor.shape, &dst_tensor)); @@ -197,7 +197,7 @@ TEST_F(OpenCLOperationTest, Log) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::LOG); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -222,7 +222,7 @@ TEST_F(OpenCLOperationTest, Rsqrt) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::RSQRT); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -249,7 +249,7 @@ TEST_F(OpenCLOperationTest, Sigmoid) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::SIGMOID); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -273,7 +273,7 @@ TEST_F(OpenCLOperationTest, Sin) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::SIN); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -299,7 +299,7 @@ TEST_F(OpenCLOperationTest, Sqrt) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::SQRT); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -325,7 +325,7 @@ TEST_F(OpenCLOperationTest, Square) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::SQUARE); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -349,7 +349,7 @@ TEST_F(OpenCLOperationTest, Tanh) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseOneInput operation = + GPUOperation operation = CreateElementwiseOneInput(op_def, OperationType::TANH); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -378,7 +378,7 @@ TEST_F(OpenCLOperationTest, Sub) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::SUB, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -406,7 +406,7 @@ TEST_F(OpenCLOperationTest, SquaredDiff) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::SQUARED_DIFF, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -434,7 +434,7 @@ TEST_F(OpenCLOperationTest, Div) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::DIV, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -462,7 +462,7 @@ TEST_F(OpenCLOperationTest, Pow) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::POW, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -490,7 +490,7 @@ TEST_F(OpenCLOperationTest, Add) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::ADD, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -518,7 +518,7 @@ TEST_F(OpenCLOperationTest, Maximum) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::MAXIMUM, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -547,9 +547,8 @@ TEST_F(OpenCLOperationTest, MaximumWithScalar) { op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; const float* scalar = absl::get_if(&attr.param); - ElementwiseOneRuntimeOneScalar operation = - CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def, - OperationType::MAXIMUM, *scalar); + GPUOperation operation = CreateElementwiseOneRuntimeOneScalar( + creation_context_, op_def, OperationType::MAXIMUM, *scalar); ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, BHWC(1, 4, 1, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -578,7 +577,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation; + GPUOperation operation; ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, OperationType::MAXIMUM, linear_tensor, &operation)); @@ -608,7 +607,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation; + GPUOperation operation; ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, OperationType::MAXIMUM, hwc_tensor, &operation)); @@ -637,7 +636,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation; + GPUOperation operation; ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, OperationType::MAXIMUM, hwc_tensor, &operation)); @@ -666,7 +665,7 @@ TEST_F(OpenCLOperationTest, Minimum) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::MINIMUM, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -695,9 +694,8 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) { op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; const float* scalar = absl::get_if(&attr.param); - ElementwiseOneRuntimeOneScalar operation = - CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def, - OperationType::MINIMUM, *scalar); + GPUOperation operation = CreateElementwiseOneRuntimeOneScalar( + creation_context_, op_def, OperationType::MINIMUM, *scalar); ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, BHWC(1, 4, 1, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -723,7 +721,7 @@ TEST_F(OpenCLOperationTest, Mul) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -751,7 +749,7 @@ TEST_F(OpenCLOperationTest, MulBroadcastHW) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, @@ -779,7 +777,7 @@ TEST_F(OpenCLOperationTest, MulBroadcastChannels) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( + GPUOperation operation = CreateElementwiseTwoInput( op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc index beb62632099..7260048c6d3 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc @@ -49,6 +49,20 @@ std::string GetElementWiseCode(const OperationDef& op_def, return c; } +absl::Status MergeOperations(const std::vector& linked_ops, + Arguments* merged_args, std::string* merged_code) { + for (int i = 0; i < linked_ops.size(); ++i) { + std::string code = linked_ops[i]->code_; + std::string unique_postfix = absl::StrCat("_link", i + 1); + linked_ops[i]->args_.RenameArgs(unique_postfix, &code); + *merged_code += "{\n" + code + "\n}\n"; + RETURN_IF_ERROR( + merged_args->Merge(std::move(linked_ops[i]->args_), unique_postfix)); + linked_ops[i]->AddUniquePostfix(unique_postfix); + } + return absl::OkStatus(); +} + } // namespace DataType OperationDef::GetDataType() const { @@ -108,14 +122,17 @@ void GPUOperation::SetDst(Tensor* ptr, int index) { } GPUOperation::GPUOperation(GPUOperation&& operation) - : definition_(std::move(operation.definition_)), + : args_(std::move(operation.args_)), + code_(std::move(operation.code_)), + elementwise_(operation.elementwise_), + linkable_(operation.linkable_), + check_src_channels_size_(operation.check_src_channels_size_), + definition_(std::move(operation.definition_)), src_(std::move(operation.src_)), dst_(std::move(operation.dst_)), - args_(std::move(operation.args_)), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_), grid_size_(operation.grid_size_), - code_(std::move(operation.code_)), src_tensors_names_(std::move(operation.src_tensors_names_)), dst_tensors_names_(std::move(operation.dst_tensors_names_)), compiler_options_(std::move(operation.compiler_options_)), @@ -123,14 +140,17 @@ GPUOperation::GPUOperation(GPUOperation&& operation) GPUOperation& GPUOperation::operator=(GPUOperation&& operation) { if (this != &operation) { + args_ = std::move(operation.args_); + code_ = std::move(operation.code_); + elementwise_ = operation.elementwise_; + linkable_ = operation.linkable_; + check_src_channels_size_ = operation.check_src_channels_size_; definition_ = std::move(operation.definition_); src_ = std::move(operation.src_); dst_ = std::move(operation.dst_); - args_ = std::move(operation.args_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); std::swap(grid_size_, operation.grid_size_); - code_ = std::move(operation.code_); src_tensors_names_ = std::move(operation.src_tensors_names_); dst_tensors_names_ = std::move(operation.dst_tensors_names_); compiler_options_ = std::move(operation.compiler_options_); @@ -139,7 +159,7 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) { return *this; } -void GPUOperation::AddOperation(ElementwiseOperation* operation) { +void GPUOperation::AddOperation(GPUOperation* operation) { linked_operations_.push_back(operation); } @@ -183,73 +203,62 @@ absl::Status GPUOperation::UpdateParams() { } absl::Status GPUOperation::Compile(const CreationContext& creation_context) { - std::string element_wise_code; - RETURN_IF_ERROR( - MergeOperations(linked_operations_, &args_, &element_wise_code)); - RETURN_IF_ERROR(args_.TransformToCLCode( - creation_context.device->GetInfo(), - {{dst_tensors_names_[0], element_wise_code}}, &code_)); - RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( - code_, "main_function", compiler_options_, *creation_context.context, - *creation_context.device, &kernel_)); + if (elementwise_) { + auto src_desc = + absl::make_unique(definition_.src_tensors[0]); + if (definition_.IsBatchSupported()) { + src_desc->SetStateVar("BatchedWidth", "true"); + } + src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor"); + args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + + auto dst_desc = + absl::make_unique(definition_.dst_tensors[0]); + if (definition_.IsBatchSupported()) { + dst_desc->SetStateVar("BatchedWidth", "true"); + } + dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor"); + args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); + + std::string code = + GetElementWiseCode(definition_, check_src_channels_size_); + std::string element_wise_code; + element_wise_code += "{\n" + code_ + "\n}\n"; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode( + creation_context.device->GetInfo(), + {{dst_tensors_names_[0], element_wise_code}}, &code)); + code = absl::Substitute(code, args_.GetListOfArgs()); + RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( + code, "main_function", *creation_context.context, + *creation_context.device, &kernel_)); + } else { + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode( + creation_context.device->GetInfo(), + {{dst_tensors_names_[0], element_wise_code}}, &code_)); + RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( + code_, "main_function", compiler_options_, *creation_context.context, + *creation_context.device, &kernel_)); + } return PostCompileCheck(creation_context.device->GetInfo()); } -ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation) - : GPUOperation(std::move(operation)), - check_src_channels_size_(operation.check_src_channels_size_), - linkable_(operation.linkable_) {} - -ElementwiseOperation& ElementwiseOperation::operator=( - ElementwiseOperation&& operation) { - if (this != &operation) { - check_src_channels_size_ = operation.check_src_channels_size_; - linkable_ = operation.linkable_; - GPUOperation::operator=(std::move(operation)); +int3 GPUOperation::GetGridSize() const { + if (elementwise_) { + const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); + const int grid_y = dst_[0]->Height(); + const int grid_z = dst_[0]->Slices(); + return int3(grid_x, grid_y, grid_z); + } else { + return int3(0, 0, 0); } - return *this; } -int3 ElementwiseOperation::GetGridSize() const { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Slices(); - return int3(grid_x, grid_y, grid_z); -} - -absl::Status ElementwiseOperation::Compile( - const CreationContext& creation_context) { - auto src_desc = - absl::make_unique(definition_.src_tensors[0]); - if (definition_.IsBatchSupported()) { - src_desc->SetStateVar("BatchedWidth", "true"); - } - src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor"); - args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); - - auto dst_desc = - absl::make_unique(definition_.dst_tensors[0]); - if (definition_.IsBatchSupported()) { - dst_desc->SetStateVar("BatchedWidth", "true"); - } - dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor"); - args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); - - std::string code = GetElementWiseCode(definition_, check_src_channels_size_); - std::string element_wise_code; - element_wise_code += "{\n" + code_ + "\n}\n"; - RETURN_IF_ERROR( - MergeOperations(linked_operations_, &args_, &element_wise_code)); - RETURN_IF_ERROR(args_.TransformToCLCode( - creation_context.device->GetInfo(), - {{dst_tensors_names_[0], element_wise_code}}, &code)); - code = absl::Substitute(code, args_.GetListOfArgs()); - return creation_context.cache->GetOrCreateCLKernel( - code, "main_function", *creation_context.context, - *creation_context.device, &kernel_); -} - -void ElementwiseOperation::AddUniquePostfix(const std::string& unique_postfix) { +void GPUOperation::AddUniquePostfix(const std::string& unique_postfix) { for (int i = 0; i < src_tensors_names_.size(); ++i) { src_tensors_names_[i] += unique_postfix; } @@ -258,21 +267,6 @@ void ElementwiseOperation::AddUniquePostfix(const std::string& unique_postfix) { } } -absl::Status MergeOperations( - const std::vector& linked_ops, - Arguments* merged_args, std::string* merged_code) { - for (int i = 0; i < linked_ops.size(); ++i) { - std::string code = linked_ops[i]->GetCode(); - std::string unique_postfix = absl::StrCat("_link", i + 1); - auto&& link_args = linked_ops[i]->MoveArgs(); - link_args.RenameArgs(unique_postfix, &code); - *merged_code += "{\n" + code + "\n}\n"; - RETURN_IF_ERROR(merged_args->Merge(std::move(link_args), unique_postfix)); - linked_ops[i]->AddUniquePostfix(unique_postfix); - } - return absl::OkStatus(); -} - } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h index 01e11f3ea64..620883f26f4 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h @@ -59,18 +59,15 @@ struct OperationDef { bool IsBatchSupported() const; }; -class ElementwiseOperation; - // GPUOperation represents some implementation of neural network operation on -// GPU. GPUOperation can contain ElementwiseOperation operations, in this case, -// ElementwiseOperation still hold necessary data and should be alive. -// When GPUOperation contains ElementwiseOperations, this GPUoperation replaces -// some sequence of operations Op + el_op0 + el_op1 + ... +// GPU. GPUOperation can contain another GPU operations with flag elementwise_. +// When GPUOperation contains another GPU ops, this GPUoperation replaces +// some sequence of operations Op + op0 + op1 + ... // Because of this abilities of GPUOperation, usage scenario is next: // Create instance of GPUOperation. -// Create all instances of ElementwiseOperations that we will(probably) attach -// to GPUOperation. Attach all ElementwiseOperations to GPUOperation. Call -// GPUOperation.Compile(). Don't call ElementwiseOperation.Compile() if it +// Create all instances of GPUOperations that we will(probably) attach +// to GPUOperation. Attach all GPUOperations to GPUOperation. Call +// GPUOperation.Compile(). Don't call GPUOperations.Compile() if it // attached, it useless(and may be error) class GPUOperation { public: @@ -83,7 +80,7 @@ class GPUOperation { GPUOperation(const GPUOperation&) = delete; GPUOperation& operator=(const GPUOperation&) = delete; - void AddOperation(ElementwiseOperation* operation); + void AddOperation(GPUOperation* operation); void SetSrc(Tensor* ptr, int index = 0); void SetDst(Tensor* ptr, int index = 0); @@ -116,64 +113,37 @@ class GPUOperation { void AddDstTensor(const std::string& tensor_name, const TensorDescriptor& desc); + bool IsLinkable() const { return elementwise_ && linkable_; } + + // for linking + void AddUniquePostfix(const std::string& unique_postfix); + + Arguments args_; + std::string code_; + + bool elementwise_ = false; + // applicable only with elementwise_ = true; + bool linkable_ = true; // by default every elementwise is linkable + // applicable only with elementwise_ = true; + bool check_src_channels_size_ = false; + protected: virtual absl::Status BindArguments() { return absl::OkStatus(); } - virtual int3 GetGridSize() const = 0; + virtual int3 GetGridSize() const; // Defines operation calculation precision and format of src/dst tensors. OperationDef definition_; std::vector src_; std::vector dst_; - Arguments args_; CLKernel kernel_; int3 work_group_size_ = int3(8, 4, 1); int3 grid_size_ = int3(0, 0, 0); - std::string code_; std::vector src_tensors_names_; std::vector dst_tensors_names_; std::vector compiler_options_; - std::vector linked_operations_; + std::vector linked_operations_; }; -// ElementwiseOperation can be fused(linked) to another operation. -// field linked_ indicate about this -// link_index_ used mostly for generating of correct names for -// linked code variables -// link_index_ is number of operation in sequence of linked operations -// and should be unique in this sequence -// link_index_ = 0 is equivalent that operation not linked. -class ElementwiseOperation : public GPUOperation { - public: - ElementwiseOperation() {} - explicit ElementwiseOperation(const OperationDef& definition) - : GPUOperation(definition) {} - - virtual ~ElementwiseOperation() {} - - absl::Status Compile(const CreationContext& creation_context) override; - int3 GetGridSize() const override; - - // Move only - ElementwiseOperation(ElementwiseOperation&& operation); - ElementwiseOperation& operator=(ElementwiseOperation&& operation); - ElementwiseOperation(const ElementwiseOperation&) = delete; - ElementwiseOperation& operator=(const ElementwiseOperation&) = delete; - - Arguments&& MoveArgs() { return std::move(args_); } - std::string GetCode() const { return code_; } - void AddUniquePostfix(const std::string& unique_postfix); - - bool IsLinkable() const { return linkable_; } - - protected: - bool check_src_channels_size_ = false; - bool linkable_ = true; -}; - -absl::Status MergeOperations( - const std::vector& linked_ops, - Arguments* merged_args, std::string* merged_code); - } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc index 85c88f3b51b..1ca2e096a0e 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc @@ -24,47 +24,43 @@ namespace tflite { namespace gpu { namespace cl { -PReLU::PReLU(const OperationDef& definition, const PReLUAttributes& attr, - CalculationsPrecision scalar_precision) - : ElementwiseOperation(definition) { +absl::Status CreatePReLU(const CreationContext& creation_context, + const OperationDef& definition, + const PReLUAttributes& attr, GPUOperation* result) { + *result = GPUOperation(definition); + result->elementwise_ = true; if (attr.clip != 0) { if (definition.precision == CalculationsPrecision::F32) { - args_.AddFloat("clip", attr.clip); + result->args_.AddFloat("clip", attr.clip); } else { - args_.AddHalf("clip", half(attr.clip)); + result->args_.AddHalf("clip", half(attr.clip)); } - code_ = + result->code_ = "in_out_value = clamp(in_out_value, (FLT4)(0.0f), (FLT4)(args.clip)) + " "min((FLT4)(0.0f), in_out_value) * args.alpha.Read(S_COORD);"; } else { - code_ = + result->code_ = "in_out_value = max((FLT4)(0.0f), in_out_value) + min((FLT4)(0.0f), " "in_out_value) * args.alpha.Read(S_COORD);"; } -} -PReLU::PReLU(PReLU&& operation) : ElementwiseOperation(std::move(operation)) {} - -PReLU& PReLU::operator=(PReLU&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -absl::Status CreatePReLU(const CreationContext& creation_context, - const OperationDef& definition, - const PReLUAttributes& attr, PReLU* result) { auto alpha = absl::get_if>(&attr.alpha); if (!alpha) { return absl::InvalidArgumentError("Alpha is missing"); } - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - *result = PReLU(definition, attr, scalar_precision); - RETURN_IF_ERROR(result->UploadParameters(*alpha, creation_context.context)); + TensorLinearDescriptor desc; + desc.storage_type = + DeduceLinearStorageType(definition.GetPrimaryStorageType()); + desc.element_type = definition.GetPrimaryDataType(); + + LinearStorage lt; + RETURN_IF_ERROR( + CreateLinearStorage(desc, *alpha, creation_context.context, <)); + result->args_.AddObject("alpha", AccessType::READ, + absl::make_unique(std::move(lt)), + absl::make_unique(desc)); + return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h index e65559cf7c7..b673217c799 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h @@ -31,48 +31,9 @@ namespace tflite { namespace gpu { namespace cl { -class PReLU : public ElementwiseOperation { - public: - PReLU() = default; - // Move only - PReLU(PReLU&& operation); - PReLU& operator=(PReLU&& operation); - PReLU(const PReLU&) = delete; - PReLU& operator=(const PReLU&) = delete; - - friend absl::Status CreatePReLU(const CreationContext& creation_context, - const OperationDef& definition, - const PReLUAttributes& attr, PReLU* result); - - private: - PReLU(const OperationDef& definition, const PReLUAttributes& attr, - CalculationsPrecision scalar_precision); - - template - absl::Status UploadParameters( - const tflite::gpu::Tensor& parameters, CLContext* context); -}; - absl::Status CreatePReLU(const CreationContext& creation_context, const OperationDef& definition, - const PReLUAttributes& attr, PReLU* result); - -template -absl::Status PReLU::UploadParameters( - const tflite::gpu::Tensor& parameters, CLContext* context) { - TensorLinearDescriptor desc; - desc.storage_type = - DeduceLinearStorageType(definition_.GetPrimaryStorageType()); - desc.element_type = definition_.GetPrimaryDataType(); - - LinearStorage lt; - RETURN_IF_ERROR(CreateLinearStorage(desc, parameters, context, <)); - args_.AddObject("alpha", AccessType::READ, - absl::make_unique(std::move(lt)), - absl::make_unique(desc)); - - return absl::OkStatus(); -} + const PReLUAttributes& attr, GPUOperation* result); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc index 4b0006c7f32..06ff09ccca7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc @@ -52,7 +52,7 @@ TEST_F(OpenCLOperationTest, PReLUAlpha) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - PReLU operation; + GPUOperation operation; ASSERT_OK(CreatePReLU(creation_context_, op_def, attr, &operation)); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -83,7 +83,7 @@ TEST_F(OpenCLOperationTest, PReLUAlphaClip) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - PReLU operation; + GPUOperation operation; ASSERT_OK(CreatePReLU(creation_context_, op_def, attr, &operation)); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc index 957fc9bbb98..e0c44e1cda7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc @@ -25,59 +25,37 @@ limitations under the License. namespace tflite { namespace gpu { namespace cl { - -QuantizeAndDequantize::QuantizeAndDequantize( - const OperationDef& definition, const QuantizeAndDequantizeAttributes& attr, - CalculationsPrecision scalar_precision) - : ElementwiseOperation(definition) { - if (definition.precision == CalculationsPrecision::F32) { - args_.AddFloat("min", attr.min); - args_.AddFloat("max", attr.max); - args_.AddFloat("scale", attr.scale); - } else { - args_.AddHalf("min", half(attr.min)); - args_.AddHalf("max", half(attr.max)); - args_.AddHalf("scale", half(attr.scale)); - } - code_ = R"( -FLT4 clamped_value = min((FLT4)(args.max), max((FLT4)(args.min), in_out_value)); -FLT4 quantized_value = round((clamped_value - (FLT4)(args.min)) / (FLT4)(args.scale)); -FLT4 dequantized_value = quantized_value * (FLT4)(args.scale) + (FLT4)(args.min); -in_out_value = dequantized_value;)"; -} - -QuantizeAndDequantize::QuantizeAndDequantize(QuantizeAndDequantize&& operation) - : ElementwiseOperation(std::move(operation)) {} - -QuantizeAndDequantize& QuantizeAndDequantize::operator=( - QuantizeAndDequantize&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -absl::Status CreateQuantizeAndDequantize( +GPUOperation CreateQuantizeAndDequantize( const CreationContext& creation_context, const OperationDef& definition, - const QuantizeAndDequantizeAttributes& attr, - QuantizeAndDequantize* result) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; + const QuantizeAndDequantizeAttributes& attr) { + QuantizeAndDequantizeAttributes adjusted_attr = attr; const bool is_fp16 = definition.precision == CalculationsPrecision::F16 || definition.precision == CalculationsPrecision::F32_F16; if (is_fp16 && attr.scale < 0.000062f) { // The smallest positive normal number for Half-precision floating-point // format is 2^-14 ~ 0.000062f. Therefore, if the scale is lesser than this // number, we just reset it accordingly. - QuantizeAndDequantizeAttributes adjusted_attr = attr; adjusted_attr.scale = 0.000062f; - *result = - QuantizeAndDequantize(definition, adjusted_attr, scalar_precision); - } else { - *result = QuantizeAndDequantize(definition, attr, scalar_precision); } - return absl::OkStatus(); + + GPUOperation op(definition); + op.elementwise_ = true; + if (definition.precision == CalculationsPrecision::F32) { + op.args_.AddFloat("min", adjusted_attr.min); + op.args_.AddFloat("max", adjusted_attr.max); + op.args_.AddFloat("scale", adjusted_attr.scale); + } else { + op.args_.AddHalf("min", half(adjusted_attr.min)); + op.args_.AddHalf("max", half(adjusted_attr.max)); + op.args_.AddHalf("scale", half(adjusted_attr.scale)); + } + op.code_ = R"( +FLT4 clamped_value = min((FLT4)(args.max), max((FLT4)(args.min), in_out_value)); +FLT4 quantized_value = round((clamped_value - (FLT4)(args.min)) / (FLT4)(args.scale)); +FLT4 dequantized_value = quantized_value * (FLT4)(args.scale) + (FLT4)(args.min); +in_out_value = dequantized_value;)"; + + return op; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h index a40aa21d23c..6e028625852 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h @@ -43,43 +43,9 @@ namespace cl { // // NOTE: We do not need to nudge min/max values in this op, since they would // already be adjusted while generating the quantized model. -class QuantizeAndDequantize : public ElementwiseOperation { - public: - QuantizeAndDequantize() = default; - // Move only - QuantizeAndDequantize(QuantizeAndDequantize&& operation); - QuantizeAndDequantize& operator=(QuantizeAndDequantize&& operation); - QuantizeAndDequantize(const QuantizeAndDequantize&) = delete; - QuantizeAndDequantize& operator=(const QuantizeAndDequantize&) = delete; - - friend absl::Status CreateQuantizeAndDequantize( - const CreationContext& creation_context, const OperationDef& definition, - const QuantizeAndDequantizeAttributes& attr, - QuantizeAndDequantize* result); - - private: - QuantizeAndDequantize(const OperationDef& definition, - const QuantizeAndDequantizeAttributes& attr, - CalculationsPrecision scalar_precision); - - template - absl::Status UploadParameters( - const tflite::gpu::Tensor& parameters, CLContext* context); -}; - -absl::Status CreateQuantizeAndDequantize( +GPUOperation CreateQuantizeAndDequantize( const CreationContext& creation_context, const OperationDef& definition, - const QuantizeAndDequantizeAttributes& attr, QuantizeAndDequantize* result); - -template -absl::Status QuantizeAndDequantize::UploadParameters( - const tflite::gpu::Tensor& parameters, CLContext* context) { - LinearStorageCreateInfo create_info; - create_info.storage_type = - DeduceLinearStorageType(definition_.GetPrimaryStorageType()); - create_info.data_type = definition_.GetPrimaryDataType(); - return absl::OkStatus(); -} + const QuantizeAndDequantizeAttributes& attr); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc index 71d6d066b9b..43b5d69323d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc @@ -56,9 +56,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits8) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - QuantizeAndDequantize operation; - ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr, - &operation)); + GPUOperation operation = + CreateQuantizeAndDequantize(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 3, 2, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -92,9 +91,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits8_NegativeRange) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - QuantizeAndDequantize operation; - ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr, - &operation)); + GPUOperation operation = + CreateQuantizeAndDequantize(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 3, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -128,9 +126,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits16) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - QuantizeAndDequantize operation; - ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr, - &operation)); + GPUOperation operation = + CreateQuantizeAndDequantize(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 3, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -164,9 +161,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits16_NegativeRange) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - QuantizeAndDequantize operation; - ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr, - &operation)); + GPUOperation operation = + CreateQuantizeAndDequantize(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 3, 2, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc index 774c030545a..a80dccd6259 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc @@ -21,50 +21,36 @@ limitations under the License. namespace tflite { namespace gpu { namespace cl { +GPUOperation CreateReLU(const CreationContext& creation_context, + const OperationDef& definition, + const ReLUAttributes& attr) { + GPUOperation op(definition); + op.elementwise_ = true; -ReLU::ReLU(const OperationDef& definition, const ReLUAttributes& attr, - CalculationsPrecision scalar_precision) - : ElementwiseOperation(definition) { std::string min_func; if (attr.alpha != 0.0f) { min_func = "min(in_out_value * args.alpha, (FLT)(0.0f))"; if (definition.precision == CalculationsPrecision::F32) { - args_.AddFloat("alpha", attr.alpha); + op.args_.AddFloat("alpha", attr.alpha); } else { - args_.AddHalf("alpha", half(attr.alpha)); + op.args_.AddHalf("alpha", half(attr.alpha)); } } else { min_func = "(FLT)(0.0f)"; } if (attr.clip != 0.0f) { if (definition.precision == CalculationsPrecision::F32) { - args_.AddFloat("clip", attr.clip); + op.args_.AddFloat("clip", attr.clip); } else { - args_.AddHalf("clip", half(attr.clip)); + op.args_.AddHalf("clip", half(attr.clip)); } - code_ = absl::StrCat("in_out_value = clamp(in_out_value, " + min_func + - ", args.clip);"); + op.code_ = absl::StrCat("in_out_value = clamp(in_out_value, " + min_func + + ", args.clip);"); } else { - code_ = absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");"); + op.code_ = + absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");"); } -} - -ReLU::ReLU(ReLU&& operation) : ElementwiseOperation(std::move(operation)) {} - -ReLU& ReLU::operator=(ReLU&& operation) { - if (this != &operation) { - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -ReLU CreateReLU(const CreationContext& creation_context, - const OperationDef& definition, const ReLUAttributes& attr) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - ReLU operation(definition, attr, scalar_precision); - return operation; + return op; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu.h b/tensorflow/lite/delegates/gpu/cl/kernels/relu.h index ccb6f6ca37f..001e23da41c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/relu.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu.h @@ -25,25 +25,9 @@ namespace tflite { namespace gpu { namespace cl { -class ReLU : public ElementwiseOperation { - public: - // Move only - ReLU(ReLU&& operation); - ReLU& operator=(ReLU&& operation); - ReLU(const ReLU&) = delete; - ReLU& operator=(const ReLU&) = delete; - - friend ReLU CreateReLU(const CreationContext& creation_context, - const OperationDef& definition, - const ReLUAttributes& attr); - - private: - ReLU(const OperationDef& definition, const ReLUAttributes& attr, - CalculationsPrecision scalar_precision); -}; - -ReLU CreateReLU(const CreationContext& creation_context, - const OperationDef& definition, const ReLUAttributes& attr); +GPUOperation CreateReLU(const CreationContext& creation_context, + const OperationDef& definition, + const ReLUAttributes& attr); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc index cebc9886ba5..f741a408661 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc @@ -49,7 +49,7 @@ TEST_F(OpenCLOperationTest, ReLUNoClipNoAlpha) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ReLU operation = CreateReLU(creation_context_, op_def, attr); + GPUOperation operation = CreateReLU(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -76,7 +76,7 @@ TEST_F(OpenCLOperationTest, ReLUClip) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ReLU operation = CreateReLU(creation_context_, op_def, attr); + GPUOperation operation = CreateReLU(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -103,7 +103,7 @@ TEST_F(OpenCLOperationTest, ReLUAlpha) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ReLU operation = CreateReLU(creation_context_, op_def, attr); + GPUOperation operation = CreateReLU(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -130,7 +130,7 @@ TEST_F(OpenCLOperationTest, ReLUAlphaClip) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ReLU operation = CreateReLU(creation_context_, op_def, attr); + GPUOperation operation = CreateReLU(creation_context_, op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc index 088677ba7e2..f60af5f730d 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc @@ -144,9 +144,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, if (inputs.size() == 2 && (inputs[0]->tensor.shape.c == inputs[1]->tensor.shape.c || inputs[1]->tensor.shape.c == 1)) { - ElementwiseTwoInput operation = + GPUOperation operation = CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape); - *gpu_op = absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (inputs.size() >= 2) { auto output = outputs[0]; @@ -167,25 +167,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, absl::get_if>( &attr.param); if (scalar) { - ElementwiseOneRuntimeOneScalar operation = - CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, - op_type, *scalar); - *gpu_op = absl::make_unique( - std::move(operation)); + GPUOperation operation = CreateElementwiseOneRuntimeOneScalar( + creation_context, op_def, op_type, *scalar); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (linear_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *linear_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (hwc_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *hwc_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } } @@ -295,9 +291,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, } case OperationType::MUL: { if (inputs.size() == 2) { - ElementwiseTwoInput operation = + GPUOperation operation = CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape); - *gpu_op = absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (inputs.size() == 1 && node.operation.attributes.has_value()) { auto attr = @@ -310,25 +306,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, absl::get_if>( &attr.param); if (scalar) { - ElementwiseOneRuntimeOneScalar operation = - CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, - op_type, *scalar); - *gpu_op = absl::make_unique( - std::move(operation)); + GPUOperation operation = CreateElementwiseOneRuntimeOneScalar( + creation_context, op_def, op_type, *scalar); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (linear_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *linear_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (hwc_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *hwc_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } } @@ -353,8 +345,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, case OperationType::QUANTIZE_AND_DEQUANTIZE: { auto attr = absl::any_cast( node.operation.attributes); - return SelectQuantizeAndDequantize(attr, creation_context, op_def, - gpu_op); + SelectQuantizeAndDequantize(attr, creation_context, op_def, gpu_op); + return absl::OkStatus(); } case OperationType::RELU: { auto attr = absl::any_cast(node.operation.attributes); @@ -405,9 +397,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, case OperationType::SQRT: case OperationType::SQUARE: case OperationType::TANH: { - ElementwiseOneInput operation = - CreateElementwiseOneInput(op_def, op_type); - *gpu_op = absl::make_unique(std::move(operation)); + GPUOperation operation = CreateElementwiseOneInput(op_def, op_type); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } case OperationType::DIV: @@ -417,9 +408,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, case OperationType::SQUARED_DIFF: case OperationType::SUB: { if (inputs.size() == 2) { - ElementwiseTwoInput operation = + GPUOperation operation = CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape); - *gpu_op = absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (inputs.size() == 1 && node.operation.attributes.has_value()) { auto attr = @@ -432,25 +423,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, absl::get_if>( &attr.param); if (scalar) { - ElementwiseOneRuntimeOneScalar operation = - CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, - op_type, *scalar); - *gpu_op = absl::make_unique( - std::move(operation)); + GPUOperation operation = CreateElementwiseOneRuntimeOneScalar( + creation_context, op_def, op_type, *scalar); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (linear_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *linear_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else if (hwc_tensor) { - ElementwiseTwoInput operation; + GPUOperation operation; RETURN_IF_ERROR(CreateElementwiseTwoInput( creation_context, op_def, op_type, *hwc_tensor, &operation)); - *gpu_op = - absl::make_unique(std::move(operation)); + *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } } diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc index a32efd5dd2c..1c0bed74422 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc @@ -54,17 +54,17 @@ void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info, void SelectReLU(const CreationContext& creation_context, const ReLUAttributes& attr, const OperationDef& op_def, std::unique_ptr* ptr) { - ReLU relu = CreateReLU(creation_context, op_def, attr); - *ptr = absl::make_unique(std::move(relu)); + GPUOperation relu = CreateReLU(creation_context, op_def, attr); + *ptr = absl::make_unique(std::move(relu)); } absl::Status SelectPReLU(const PReLUAttributes& attr, const CreationContext& creation_context, const OperationDef& op_def, std::unique_ptr* ptr) { - PReLU operation; + GPUOperation operation; RETURN_IF_ERROR(CreatePReLU(creation_context, op_def, attr, &operation)); - *ptr = absl::make_unique(std::move(operation)); + *ptr = absl::make_unique(std::move(operation)); return absl::OkStatus(); } @@ -85,8 +85,8 @@ void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr, void SelectAdd(const OperationDef& op_def, const std::vector& channels, int dst_channels, std::unique_ptr* ptr) { - Add operation = CreateAdd(op_def, channels, dst_channels); - *ptr = absl::make_unique(std::move(operation)); + GPUOperation operation = CreateAdd(op_def, channels, dst_channels); + *ptr = absl::make_unique(std::move(operation)); } absl::Status SelectResize(const Resize2DAttributes& attr, @@ -203,15 +203,13 @@ absl::Status SelectWinograd36To4x4( return absl::OkStatus(); } -absl::Status SelectQuantizeAndDequantize( - const QuantizeAndDequantizeAttributes& attr, - const CreationContext& creation_context, const OperationDef& op_def, - std::unique_ptr* ptr) { - QuantizeAndDequantize operation; - RETURN_IF_ERROR( - CreateQuantizeAndDequantize(creation_context, op_def, attr, &operation)); - *ptr = absl::make_unique(std::move(operation)); - return absl::OkStatus(); +void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr, + const CreationContext& creation_context, + const OperationDef& op_def, + std::unique_ptr* ptr) { + GPUOperation operation = + CreateQuantizeAndDequantize(creation_context, op_def, attr); + *ptr = absl::make_unique(std::move(operation)); } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h index f266882a458..7133aa94502 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h @@ -97,10 +97,10 @@ absl::Status SelectWinograd36To4x4( const tflite::gpu::Tensor& biases, std::unique_ptr* ptr); -absl::Status SelectQuantizeAndDequantize( - const QuantizeAndDequantizeAttributes& attr, - const CreationContext& creation_context, const OperationDef& op_def, - std::unique_ptr* ptr); +void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr, + const CreationContext& creation_context, + const OperationDef& op_def, + std::unique_ptr* ptr); } // namespace cl } // namespace gpu