From dab856a93fdecb88880e08bc94928f3e0f141cf9 Mon Sep 17 00:00:00 2001
From: Raman Sarokin <sorokin@google.com>
Date: Mon, 3 Aug 2020 17:10:40 -0700
Subject: [PATCH] Removed ElementwiseOperation. Simplified
 ex-ElementwiseOperations, ReLU, PReLU, etc.

PiperOrigin-RevId: 324715510
Change-Id: I3d98cdbcc8075bb91f20e065b0aca2ab16a4e8e5
---
 .../delegates/gpu/cl/inference_context.cc     |   8 +-
 .../lite/delegates/gpu/cl/kernels/add.cc      |  43 ++--
 .../lite/delegates/gpu/cl/kernels/add.h       |  22 +-
 .../lite/delegates/gpu/cl/kernels/add_test.cc |   6 +-
 .../delegates/gpu/cl/kernels/elementwise.cc   | 215 ++++++------------
 .../delegates/gpu/cl/kernels/elementwise.h    |  91 ++------
 .../gpu/cl/kernels/elementwise_test.cc        |  62 +++--
 .../delegates/gpu/cl/kernels/gpu_operation.cc | 158 +++++++------
 .../delegates/gpu/cl/kernels/gpu_operation.h  |  76 ++-----
 .../lite/delegates/gpu/cl/kernels/prelu.cc    |  46 ++--
 .../lite/delegates/gpu/cl/kernels/prelu.h     |  41 +---
 .../delegates/gpu/cl/kernels/prelu_test.cc    |   4 +-
 .../gpu/cl/kernels/quantize_and_dequantize.cc |  66 ++----
 .../gpu/cl/kernels/quantize_and_dequantize.h  |  38 +---
 .../kernels/quantize_and_dequantize_test.cc   |  20 +-
 .../lite/delegates/gpu/cl/kernels/relu.cc     |  42 ++--
 .../lite/delegates/gpu/cl/kernels/relu.h      |  22 +-
 .../delegates/gpu/cl/kernels/relu_test.cc     |   8 +-
 .../gpu/cl/selectors/operation_selector.cc    |  75 +++---
 .../gpu/cl/selectors/simple_selectors.cc      |  28 ++-
 .../gpu/cl/selectors/simple_selectors.h       |   8 +-
 21 files changed, 368 insertions(+), 711 deletions(-)
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index 3067c81ec94..8e23eb1bcee 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -390,9 +390,7 @@ void InferenceContext::Merge() {
       continue;
     }
     auto& linkable_node = nodes_[next_nodes[0]];
-    auto* elementwise =
-        dynamic_cast<ElementwiseOperation*>(linkable_node.operations[0].get());
-    if (!elementwise || !elementwise->IsLinkable() ||
+    if (!linkable_node.operations[0]->IsLinkable() ||
         linkable_node.outputs.size() != 1 ||
         !IsReady(ready_tensors, linkable_node)) {
       continue;
@@ -410,9 +408,7 @@ void InferenceContext::Merge() {
   }
   for (auto& node : nodes_) {
     for (int j = 1; j < node.operations.size(); ++j) {
-      auto* elementwise =
-          dynamic_cast<ElementwiseOperation*>(node.operations[j].get());
-      node.operations[0]->AddOperation(elementwise);
+      node.operations[0]->AddOperation(node.operations[j].get());
     }
   }
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add.cc b/tensorflow/lite/delegates/gpu/cl/kernels/add.cc
index 1d09e39b83b..1cb41e79d88 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/add.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/add.cc
@@ -25,42 +25,29 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-Add::Add(const OperationDef& definition, const std::vector<int>& channels,
-         int dst_channels)
-    : ElementwiseOperation(definition) {
+GPUOperation CreateAdd(const OperationDef& definition,
+                       const std::vector<int>& channels, int dst_channels) {
+  GPUOperation add(definition);
   int dst_depth = DivideRoundUp(dst_channels, 4);
   int src0_depth = DivideRoundUp(channels[0], 4);
-  linkable_ = dst_depth == src0_depth;
+  add.elementwise_ = true;
+  add.linkable_ = dst_depth == src0_depth;
   if (src0_depth < dst_depth) {
-    check_src_channels_size_ = true;
+    add.check_src_channels_size_ = true;
   }
-  for (int i = 1; i < definition_.src_tensors.size(); ++i) {
+  for (int i = 1; i < definition.src_tensors.size(); ++i) {
     const std::string tensor_name = absl::StrCat("src_data_", i);
-    auto src_desc = definition_.src_tensors[i];
-    if (definition_.IsBatchSupported()) {
+    auto src_desc = definition.src_tensors[i];
+    if (definition.IsBatchSupported()) {
       src_desc.SetStateVar("BatchedWidth", "true");
     }
-    AddSrcTensor(tensor_name, src_desc);
-    code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n";
-    code_ += "  in_out_value += args." + tensor_name +
-             ".Read(X_COORD, Y_COORD, S_COORD);\n";
-    code_ += "}\n";
+    add.AddSrcTensor(tensor_name, src_desc);
+    add.code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n";
+    add.code_ += "  in_out_value += args." + tensor_name +
+                 ".Read(X_COORD, Y_COORD, S_COORD);\n";
+    add.code_ += "}\n";
   }
-}
-
-Add::Add(Add&& operation) : ElementwiseOperation(std::move(operation)) {}
-
-Add& Add::operator=(Add&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
-              int dst_channels) {
-  Add operation(definition, channels, dst_channels);
-  return operation;
+  return add;
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add.h b/tensorflow/lite/delegates/gpu/cl/kernels/add.h
index 81b2fed116f..0e9d7e0d333 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/add.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/add.h
@@ -27,24 +27,10 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-// Add operation inherited from ElementwiseOperation, but it is more
-// complicated than usual elementwise, that is why it has own versions for
-// Compile. Add operation support not equal tensors on input (for possibility to
-// remove Padding operation with zeroes in Z dimension)
-class Add : public ElementwiseOperation {
- public:
-  Add(const OperationDef& definition, const std::vector<int>& channels,
-      int dst_channels);
-
-  // Move only
-  Add(Add&& operation);
-  Add& operator=(Add&& operation);
-  Add(const Add&) = delete;
-  Add& operator=(const Add&) = delete;
-};
-
-Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
-              int dst_channels);
+// The Add operation supports input tensors with differing channel counts,
+// which allows removing a zero-Padding operation in the channels dimension.
+GPUOperation CreateAdd(const OperationDef& definition,
+                       const std::vector<int>& channels, int dst_channels);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc
index 1eccab87646..2856b37a497 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/add_test.cc
@@ -49,7 +49,7 @@ TEST_F(OpenCLOperationTest, AddTwoEqualTensors) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      Add operation = CreateAdd(op_def, channels, channels[0]);
+      GPUOperation operation = CreateAdd(op_def, channels, channels[0]);
       ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -77,7 +77,7 @@ TEST_F(OpenCLOperationTest, AddFirstTensorHasMoreChannelsThanSecond) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      Add operation = CreateAdd(op_def, channels, channels[0]);
+      GPUOperation operation = CreateAdd(op_def, channels, channels[0]);
       ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
                                     BHWC(1, 2, 1, 6), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -107,7 +107,7 @@ TEST_F(OpenCLOperationTest, AddFirstTensorHasLessChannelsThanSecond) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      Add operation = CreateAdd(op_def, channels, 6);
+      GPUOperation operation = CreateAdd(op_def, channels, 6);
       ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
                                     BHWC(1, 2, 1, 6), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
index 21866021e91..063b15c1b69 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
@@ -134,128 +134,33 @@ std::string GetTwoInputCode(const OperationType& op_type,
 }
 }  // namespace
 
-ElementwiseOneInput::ElementwiseOneInput(const OperationDef& definition,
-                                         const OperationType& op_type)
-    : ElementwiseOperation(definition) {
-  code_ = GetOneInputCode(op_type, definition.precision, "in_out_value");
+GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
+                                       const OperationType& op_type) {
+  GPUOperation op(definition);
+  op.elementwise_ = true;
+  op.code_ = GetOneInputCode(op_type, definition.precision, "in_out_value");
+  return op;
 }
 
-ElementwiseOneInput::ElementwiseOneInput(ElementwiseOneInput&& operation)
-    : ElementwiseOperation(std::move(operation)) {}
-
-ElementwiseOneInput& ElementwiseOneInput::operator=(
-    ElementwiseOneInput&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition,
-                                              const OperationType& op_type) {
-  ElementwiseOneInput operation(definition, op_type);
-  return operation;
-}
-
-ElementwiseOneRuntimeOneScalar::ElementwiseOneRuntimeOneScalar(
-    const OperationDef& definition, const OperationType& op_type,
-    float scalar_parameter, CalculationsPrecision scalar_precision)
-    : ElementwiseOperation(definition) {
-  if (definition.precision == CalculationsPrecision::F32) {
-    args_.AddFloat("scalar", scalar_parameter);
-  } else {
-    args_.AddHalf("scalar", half(scalar_parameter));
-  }
-  code_ = GetTwoInputCode(op_type, "in_out_value", "args.scalar");
-}
-
-ElementwiseOneRuntimeOneScalar::ElementwiseOneRuntimeOneScalar(
-    ElementwiseOneRuntimeOneScalar&& operation)
-    : ElementwiseOperation(std::move(operation)) {}
-
-ElementwiseOneRuntimeOneScalar& ElementwiseOneRuntimeOneScalar::operator=(
-    ElementwiseOneRuntimeOneScalar&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar(
+GPUOperation CreateElementwiseOneRuntimeOneScalar(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type, float scalar_parameter) {
-  const auto scalar_precision = creation_context.device->IsPowerVR()
-                                    ? CalculationsPrecision::F32
-                                    : definition.precision;
-  ElementwiseOneRuntimeOneScalar operation(definition, op_type,
-                                           scalar_parameter, scalar_precision);
-  return operation;
-}
-
-ElementwiseTwoInput::ElementwiseTwoInput(const OperationDef& definition,
-                                         const OperationType& op_type,
-                                         const BroadcastSettings& broadcast)
-    : ElementwiseOperation(definition),
-      broadcast_(broadcast) {
-  auto src_desc = definition.src_tensors[1];
-  if (definition.IsBatchSupported()) {
-    src_desc.SetStateVar("BatchedWidth", "true");
+  GPUOperation op(definition);
+  op.elementwise_ = true;
+  if (definition.precision == CalculationsPrecision::F32) {
+    op.args_.AddFloat("scalar", scalar_parameter);
+  } else {
+    op.args_.AddHalf("scalar", half(scalar_parameter));
   }
-  AddSrcTensor("second_tensor", src_desc);
-  const std::string x_coord = broadcast.width ? "0" : "X_COORD";
-  const std::string y_coord = broadcast.height ? "0" : "Y_COORD";
-  const std::string s_coord = broadcast.channels ? "0" : "S_COORD";
-  code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord,
-                       ", ", y_coord, ", ", s_coord, ");\n");
-  if (broadcast.channels) {
-    code_ += "  second_val.y = second_val.x;\n";
-    code_ += "  second_val.z = second_val.x;\n";
-    code_ += "  second_val.w = second_val.x;\n";
-  }
-  code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
-}
-
-ElementwiseTwoInput::ElementwiseTwoInput(const OperationDef& definition,
-                                         const OperationType& op_type,
-                                         const BroadcastSettings& broadcast,
-                                         Tensor&& constant_tensor)
-    : ElementwiseOperation(definition),
-      broadcast_(broadcast) {
-  auto descriptor = constant_tensor.GetDescriptor();
-  args_.AddObject("second_tensor", AccessType::READ,
-                  absl::make_unique<Tensor>(std::move(constant_tensor)),
-                  absl::make_unique<TensorDescriptor>(descriptor));
-  const std::string x_coord = broadcast.width ? "0" : "X_COORD";
-  const std::string y_coord = broadcast.height ? "0" : "Y_COORD";
-  const std::string s_coord = broadcast.channels ? "0" : "S_COORD";
-  code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord,
-                       ", ", y_coord, ", ", s_coord, ");\n");
-  if (broadcast.channels) {
-    code_ += "  second_val.y = second_val.x;\n";
-    code_ += "  second_val.z = second_val.x;\n";
-    code_ += "  second_val.w = second_val.x;\n";
-  }
-  code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
-}
-
-ElementwiseTwoInput::ElementwiseTwoInput(ElementwiseTwoInput&& operation)
-    : ElementwiseOperation(std::move(operation)),
-      broadcast_(operation.broadcast_) {}
-
-ElementwiseTwoInput& ElementwiseTwoInput::operator=(
-    ElementwiseTwoInput&& operation) {
-  if (this != &operation) {
-    broadcast_ = operation.broadcast_;
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
+  op.code_ = GetTwoInputCode(op_type, "in_out_value", "args.scalar");
+  return op;
 }
 
 absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
-    ElementwiseTwoInput* result) {
+    GPUOperation* result) {
   const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v);
   TensorStorageType storage_type =
       SelectBestStorageType(*creation_context.context, *creation_context.device,
@@ -268,12 +173,21 @@ absl::Status CreateElementwiseTwoInput(
                                &gpu_tensor));
   RETURN_IF_ERROR(
       gpu_tensor.WriteData(creation_context.queue, constant_tensor));
-  BroadcastSettings broadcast;
-  broadcast.width = true;
-  broadcast.height = true;
-  broadcast.channels = shape.c == 1;
-  *result = ElementwiseTwoInput(definition, op_type, broadcast,
-                                std::move(gpu_tensor));
+
+  *result = GPUOperation(definition);
+  result->elementwise_ = true;
+  result->args_.AddObject("second_tensor", AccessType::READ,
+                          absl::make_unique<Tensor>(std::move(gpu_tensor)),
+                          absl::make_unique<TensorDescriptor>(desc));
+  const std::string s_coord = shape.c == 1 ? "0" : "S_COORD";
+  result->code_ = absl::StrCat(
+      "FLT4 second_val = args.second_tensor.Read(0, 0, ", s_coord, ");\n");
+  if (shape.c == 1) {
+    result->code_ += "  second_val.y = second_val.x;\n";
+    result->code_ += "  second_val.z = second_val.x;\n";
+    result->code_ += "  second_val.w = second_val.x;\n";
+  }
+  result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
   return absl::OkStatus();
 }
 
@@ -281,7 +195,7 @@ absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
-    ElementwiseTwoInput* result) {
+    GPUOperation* result) {
   const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w,
                           constant_tensor.shape.c);
   TensorStorageType storage_type =
@@ -295,34 +209,49 @@ absl::Status CreateElementwiseTwoInput(
                                &gpu_tensor));
   RETURN_IF_ERROR(
       gpu_tensor.WriteData(creation_context.queue, constant_tensor));
-  BroadcastSettings broadcast;
-  broadcast.width = shape.w == 1;
-  broadcast.height = shape.h == 1;
-  broadcast.channels = shape.c == 1;
-  *result = ElementwiseTwoInput(definition, op_type, broadcast,
-                                std::move(gpu_tensor));
+
+  *result = GPUOperation(definition);
+  result->elementwise_ = true;
+  result->args_.AddObject("second_tensor", AccessType::READ,
+                          absl::make_unique<Tensor>(std::move(gpu_tensor)),
+                          absl::make_unique<TensorDescriptor>(desc));
+  const std::string x_coord = shape.w == 1 ? "0" : "X_COORD";
+  const std::string y_coord = shape.h == 1 ? "0" : "Y_COORD";
+  const std::string s_coord = shape.c == 1 ? "0" : "S_COORD";
+  result->code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(",
+                               x_coord, ", ", y_coord, ", ", s_coord, ");\n");
+  if (shape.c == 1) {
+    result->code_ += "  second_val.y = second_val.x;\n";
+    result->code_ += "  second_val.z = second_val.x;\n";
+    result->code_ += "  second_val.w = second_val.x;\n";
+  }
+  result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
+
   return absl::OkStatus();
 }
 
-ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition,
-                                              const OperationType& op_type,
-                                              const BHWC& shape) {
-  BroadcastSettings broadcast;
-  broadcast.width = shape.w == 1;
-  broadcast.height = shape.h == 1;
-  broadcast.channels = shape.c == 1;
-  ElementwiseTwoInput operation(definition, op_type, broadcast);
-  return operation;
-}
-
-ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition,
-                                              const OperationType& op_type) {
-  BroadcastSettings broadcast;
-  broadcast.width = false;
-  broadcast.height = false;
-  broadcast.channels = false;
-  ElementwiseTwoInput operation(definition, op_type, broadcast);
-  return operation;
+GPUOperation CreateElementwiseTwoInput(const OperationDef& definition,
+                                       const OperationType& op_type,
+                                       const BHWC& shape) {
+  GPUOperation op(definition);
+  op.elementwise_ = true;
+  auto src_desc = definition.src_tensors[1];
+  if (definition.IsBatchSupported()) {
+    src_desc.SetStateVar("BatchedWidth", "true");
+  }
+  op.AddSrcTensor("second_tensor", src_desc);
+  const std::string x_coord = shape.w == 1 ? "0" : "X_COORD";
+  const std::string y_coord = shape.h == 1 ? "0" : "Y_COORD";
+  const std::string s_coord = shape.c == 1 ? "0" : "S_COORD";
+  op.code_ = absl::StrCat("FLT4 second_val = args.second_tensor.Read(", x_coord,
+                          ", ", y_coord, ", ", s_coord, ");\n");
+  if (shape.c == 1) {
+    op.code_ += "  second_val.y = second_val.x;\n";
+    op.code_ += "  second_val.z = second_val.x;\n";
+    op.code_ += "  second_val.w = second_val.x;\n";
+  }
+  op.code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
+  return op;
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
index 9712ee96b90..d03d535b39a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
@@ -26,93 +26,38 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-// Class for simple one input operations without any parameters, for example
-// log, sin, cos and etc.
-class ElementwiseOneInput : public ElementwiseOperation {
- public:
-  ElementwiseOneInput(const OperationDef& definition,
-                      const OperationType& op_type);
+// Creates a simple one-input operation without any parameters, for example
+// log, sin, cos, etc.
+GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
+                                       const OperationType& op_type);
 
-  // Move only
-  ElementwiseOneInput(ElementwiseOneInput&& operation);
-  ElementwiseOneInput& operator=(ElementwiseOneInput&& operation);
-  ElementwiseOneInput(const ElementwiseOneInput&) = delete;
-  ElementwiseOneInput& operator=(const ElementwiseOneInput&) = delete;
-};
-
-ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition,
-                                              const OperationType& op_type);
-
-// Class for simple two input (first input is runtime tensor and second input is
-// scalar argument) operations without any parameters, for example sub, div and
-// etc.
-class ElementwiseOneRuntimeOneScalar : public ElementwiseOperation {
- public:
-  ElementwiseOneRuntimeOneScalar(const OperationDef& definition,
-                                 const OperationType& op_type,
-                                 float scalar_parameter,
-                                 CalculationsPrecision scalar_precision);
-
-  // Move only
-  ElementwiseOneRuntimeOneScalar(ElementwiseOneRuntimeOneScalar&& operation);
-  ElementwiseOneRuntimeOneScalar& operator=(
-      ElementwiseOneRuntimeOneScalar&& operation);
-  ElementwiseOneRuntimeOneScalar(const ElementwiseOneRuntimeOneScalar&) =
-      delete;
-  ElementwiseOneRuntimeOneScalar& operator=(
-      const ElementwiseOneRuntimeOneScalar&) = delete;
-};
-
-ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar(
+// Creates a simple two-input operation (first input is a runtime tensor, second
+// input is a scalar argument), for example sub, div, pow, etc.
+GPUOperation CreateElementwiseOneRuntimeOneScalar(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type, float scalar_parameter);
 
-struct BroadcastSettings {
-  bool width;
-  bool height;
-  bool channels;
-};
-
-// Class for simple two input(first input is runtime tensor and second input is
-// runtime or constant tensor) operations without any parameters, for example
-// sub, div and etc.
-class ElementwiseTwoInput : public ElementwiseOperation {
- public:
-  ElementwiseTwoInput() = default;
-  ElementwiseTwoInput(const OperationDef& definition,
-                      const OperationType& op_type,
-                      const BroadcastSettings& broadcast);
-
-  ElementwiseTwoInput(const OperationDef& definition,
-                      const OperationType& op_type,
-                      const BroadcastSettings& broadcast,
-                      Tensor&& constant_tensor);
-
-  // Move only
-  ElementwiseTwoInput(ElementwiseTwoInput&& operation);
-  ElementwiseTwoInput& operator=(ElementwiseTwoInput&& operation);
-  ElementwiseTwoInput(const ElementwiseTwoInput&) = delete;
-  ElementwiseTwoInput& operator=(const ElementwiseTwoInput&) = delete;
-
- private:
-  BroadcastSettings broadcast_;
-};
-
+// Creates a simple two-input operation (first input is a runtime tensor, second
+// input is a constant linear tensor), for example sub, div, etc.
 absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
-    ElementwiseTwoInput* result);
+    GPUOperation* result);
 
+// Creates a simple two-input operation (first input is a runtime tensor, second
+// input is a constant HWC tensor), for example sub, div, etc.
 absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
-    ElementwiseTwoInput* result);
+    GPUOperation* result);
 
-ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition,
-                                              const OperationType& op_type,
-                                              const BHWC& shape);
+// Creates a simple two-input operation (both inputs are runtime tensors), for
+// example sub, div, etc.
+GPUOperation CreateElementwiseTwoInput(const OperationDef& definition,
+                                       const OperationType& op_type,
+                                       const BHWC& shape);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
index ac825c0cdfc..11a651df901 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
@@ -45,7 +45,7 @@ TEST_F(OpenCLOperationTest, Abs) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::ABS);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -70,7 +70,7 @@ TEST_F(OpenCLOperationTest, Cos) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::COS);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -95,7 +95,7 @@ TEST_F(OpenCLOperationTest, Copy) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::COPY);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -118,7 +118,7 @@ TEST_F(OpenCLOperationTest, Elu) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::ELU);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 1, 1, 7), &dst_tensor));
@@ -144,7 +144,7 @@ TEST_F(OpenCLOperationTest, Exp) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::EXP);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 1, 1, 7), &dst_tensor));
@@ -171,7 +171,7 @@ TEST_F(OpenCLOperationTest, HardSwish) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::HARD_SWISH);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     src_tensor.shape, &dst_tensor));
@@ -197,7 +197,7 @@ TEST_F(OpenCLOperationTest, Log) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::LOG);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -222,7 +222,7 @@ TEST_F(OpenCLOperationTest, Rsqrt) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::RSQRT);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -249,7 +249,7 @@ TEST_F(OpenCLOperationTest, Sigmoid) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::SIGMOID);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -273,7 +273,7 @@ TEST_F(OpenCLOperationTest, Sin) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::SIN);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -299,7 +299,7 @@ TEST_F(OpenCLOperationTest, Sqrt) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::SQRT);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -325,7 +325,7 @@ TEST_F(OpenCLOperationTest, Square) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::SQUARE);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -349,7 +349,7 @@ TEST_F(OpenCLOperationTest, Tanh) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseOneInput operation =
+      GPUOperation operation =
           CreateElementwiseOneInput(op_def, OperationType::TANH);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -378,7 +378,7 @@ TEST_F(OpenCLOperationTest, Sub) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::SUB, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -406,7 +406,7 @@ TEST_F(OpenCLOperationTest, SquaredDiff) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::SQUARED_DIFF, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -434,7 +434,7 @@ TEST_F(OpenCLOperationTest, Div) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::DIV, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -462,7 +462,7 @@ TEST_F(OpenCLOperationTest, Pow) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::POW, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -490,7 +490,7 @@ TEST_F(OpenCLOperationTest, Add) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::ADD, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -518,7 +518,7 @@ TEST_F(OpenCLOperationTest, Maximum) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::MAXIMUM, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -547,9 +547,8 @@ TEST_F(OpenCLOperationTest, MaximumWithScalar) {
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
       const float* scalar = absl::get_if<float>(&attr.param);
-      ElementwiseOneRuntimeOneScalar operation =
-          CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def,
-                                               OperationType::MAXIMUM, *scalar);
+      GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
+          creation_context_, op_def, OperationType::MAXIMUM, *scalar);
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 4, 1, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -578,7 +577,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation;
+      GPUOperation operation;
       ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
                                           OperationType::MAXIMUM, linear_tensor,
                                           &operation));
@@ -608,7 +607,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation;
+      GPUOperation operation;
       ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
                                           OperationType::MAXIMUM, hwc_tensor,
                                           &operation));
@@ -637,7 +636,7 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation;
+      GPUOperation operation;
       ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
                                           OperationType::MAXIMUM, hwc_tensor,
                                           &operation));
@@ -666,7 +665,7 @@ TEST_F(OpenCLOperationTest, Minimum) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::MINIMUM, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -695,9 +694,8 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) {
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
       const float* scalar = absl::get_if<float>(&attr.param);
-      ElementwiseOneRuntimeOneScalar operation =
-          CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def,
-                                               OperationType::MINIMUM, *scalar);
+      GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
+          creation_context_, op_def, OperationType::MINIMUM, *scalar);
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 4, 1, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -723,7 +721,7 @@ TEST_F(OpenCLOperationTest, Mul) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::MUL, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -751,7 +749,7 @@ TEST_F(OpenCLOperationTest, MulBroadcastHW) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::MUL, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
@@ -779,7 +777,7 @@ TEST_F(OpenCLOperationTest, MulBroadcastChannels) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ElementwiseTwoInput operation = CreateElementwiseTwoInput(
+      GPUOperation operation = CreateElementwiseTwoInput(
           op_def, OperationType::MUL, src_tensor_1.shape);
       ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1},
                                     creation_context_, &operation,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
index beb62632099..7260048c6d3 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
@@ -49,6 +49,20 @@ std::string GetElementWiseCode(const OperationDef& op_def,
   return c;
 }
 
+absl::Status MergeOperations(const std::vector<GPUOperation*>& linked_ops,
+                             Arguments* merged_args, std::string* merged_code) {
+  for (int i = 0; i < linked_ops.size(); ++i) {
+    std::string code = linked_ops[i]->code_;
+    std::string unique_postfix = absl::StrCat("_link", i + 1);
+    linked_ops[i]->args_.RenameArgs(unique_postfix, &code);
+    *merged_code += "{\n" + code + "\n}\n";
+    RETURN_IF_ERROR(
+        merged_args->Merge(std::move(linked_ops[i]->args_), unique_postfix));
+    linked_ops[i]->AddUniquePostfix(unique_postfix);
+  }
+  return absl::OkStatus();
+}
+
 }  // namespace
 
 DataType OperationDef::GetDataType() const {
@@ -108,14 +122,17 @@ void GPUOperation::SetDst(Tensor* ptr, int index) {
 }
 
 GPUOperation::GPUOperation(GPUOperation&& operation)
-    : definition_(std::move(operation.definition_)),
+    : args_(std::move(operation.args_)),
+      code_(std::move(operation.code_)),
+      elementwise_(operation.elementwise_),
+      linkable_(operation.linkable_),
+      check_src_channels_size_(operation.check_src_channels_size_),
+      definition_(std::move(operation.definition_)),
       src_(std::move(operation.src_)),
       dst_(std::move(operation.dst_)),
-      args_(std::move(operation.args_)),
       kernel_(std::move(operation.kernel_)),
       work_group_size_(operation.work_group_size_),
       grid_size_(operation.grid_size_),
-      code_(std::move(operation.code_)),
       src_tensors_names_(std::move(operation.src_tensors_names_)),
       dst_tensors_names_(std::move(operation.dst_tensors_names_)),
       compiler_options_(std::move(operation.compiler_options_)),
@@ -123,14 +140,17 @@ GPUOperation::GPUOperation(GPUOperation&& operation)
 
 GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
   if (this != &operation) {
+    args_ = std::move(operation.args_);
+    code_ = std::move(operation.code_);
+    elementwise_ = operation.elementwise_;
+    linkable_ = operation.linkable_;
+    check_src_channels_size_ = operation.check_src_channels_size_;
     definition_ = std::move(operation.definition_);
     src_ = std::move(operation.src_);
     dst_ = std::move(operation.dst_);
-    args_ = std::move(operation.args_);
     kernel_ = std::move(operation.kernel_);
     std::swap(work_group_size_, operation.work_group_size_);
     std::swap(grid_size_, operation.grid_size_);
-    code_ = std::move(operation.code_);
     src_tensors_names_ = std::move(operation.src_tensors_names_);
     dst_tensors_names_ = std::move(operation.dst_tensors_names_);
     compiler_options_ = std::move(operation.compiler_options_);
@@ -139,7 +159,7 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
   return *this;
 }
 
-void GPUOperation::AddOperation(ElementwiseOperation* operation) {
+void GPUOperation::AddOperation(GPUOperation* operation) {
   linked_operations_.push_back(operation);
 }
 
@@ -183,73 +203,62 @@ absl::Status GPUOperation::UpdateParams() {
 }
 
 absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
-  std::string element_wise_code;
-  RETURN_IF_ERROR(
-      MergeOperations(linked_operations_, &args_, &element_wise_code));
-  RETURN_IF_ERROR(args_.TransformToCLCode(
-      creation_context.device->GetInfo(),
-      {{dst_tensors_names_[0], element_wise_code}}, &code_));
-  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
-      code_, "main_function", compiler_options_, *creation_context.context,
-      *creation_context.device, &kernel_));
+  if (elementwise_) {
+    auto src_desc =
+        absl::make_unique<TensorDescriptor>(definition_.src_tensors[0]);
+    if (definition_.IsBatchSupported()) {
+      src_desc->SetStateVar("BatchedWidth", "true");
+    }
+    src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor");
+    args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
+
+    auto dst_desc =
+        absl::make_unique<TensorDescriptor>(definition_.dst_tensors[0]);
+    if (definition_.IsBatchSupported()) {
+      dst_desc->SetStateVar("BatchedWidth", "true");
+    }
+    dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor");
+    args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
+
+    std::string code =
+        GetElementWiseCode(definition_, check_src_channels_size_);
+    std::string element_wise_code;
+    element_wise_code += "{\n" + code_ + "\n}\n";
+    RETURN_IF_ERROR(
+        MergeOperations(linked_operations_, &args_, &element_wise_code));
+    RETURN_IF_ERROR(args_.TransformToCLCode(
+        creation_context.device->GetInfo(),
+        {{dst_tensors_names_[0], element_wise_code}}, &code));
+    code = absl::Substitute(code, args_.GetListOfArgs());
+    RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
+        code, "main_function", *creation_context.context,
+        *creation_context.device, &kernel_));
+  } else {
+    std::string element_wise_code;
+    RETURN_IF_ERROR(
+        MergeOperations(linked_operations_, &args_, &element_wise_code));
+    RETURN_IF_ERROR(args_.TransformToCLCode(
+        creation_context.device->GetInfo(),
+        {{dst_tensors_names_[0], element_wise_code}}, &code_));
+    RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
+        code_, "main_function", compiler_options_, *creation_context.context,
+        *creation_context.device, &kernel_));
+  }
   return PostCompileCheck(creation_context.device->GetInfo());
 }
 
-ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation)
-    : GPUOperation(std::move(operation)),
-      check_src_channels_size_(operation.check_src_channels_size_),
-      linkable_(operation.linkable_) {}
-
-ElementwiseOperation& ElementwiseOperation::operator=(
-    ElementwiseOperation&& operation) {
-  if (this != &operation) {
-    check_src_channels_size_ = operation.check_src_channels_size_;
-    linkable_ = operation.linkable_;
-    GPUOperation::operator=(std::move(operation));
+int3 GPUOperation::GetGridSize() const {
+  if (elementwise_) {
+    const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
+    const int grid_y = dst_[0]->Height();
+    const int grid_z = dst_[0]->Slices();
+    return int3(grid_x, grid_y, grid_z);
+  } else {
+    return int3(0, 0, 0);
   }
-  return *this;
 }
 
-int3 ElementwiseOperation::GetGridSize() const {
-  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
-  const int grid_y = dst_[0]->Height();
-  const int grid_z = dst_[0]->Slices();
-  return int3(grid_x, grid_y, grid_z);
-}
-
-absl::Status ElementwiseOperation::Compile(
-    const CreationContext& creation_context) {
-  auto src_desc =
-      absl::make_unique<TensorDescriptor>(definition_.src_tensors[0]);
-  if (definition_.IsBatchSupported()) {
-    src_desc->SetStateVar("BatchedWidth", "true");
-  }
-  src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor");
-  args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
-
-  auto dst_desc =
-      absl::make_unique<TensorDescriptor>(definition_.dst_tensors[0]);
-  if (definition_.IsBatchSupported()) {
-    dst_desc->SetStateVar("BatchedWidth", "true");
-  }
-  dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor");
-  args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
-
-  std::string code = GetElementWiseCode(definition_, check_src_channels_size_);
-  std::string element_wise_code;
-  element_wise_code += "{\n" + code_ + "\n}\n";
-  RETURN_IF_ERROR(
-      MergeOperations(linked_operations_, &args_, &element_wise_code));
-  RETURN_IF_ERROR(args_.TransformToCLCode(
-      creation_context.device->GetInfo(),
-      {{dst_tensors_names_[0], element_wise_code}}, &code));
-  code = absl::Substitute(code, args_.GetListOfArgs());
-  return creation_context.cache->GetOrCreateCLKernel(
-      code, "main_function", *creation_context.context,
-      *creation_context.device, &kernel_);
-}
-
-void ElementwiseOperation::AddUniquePostfix(const std::string& unique_postfix) {
+void GPUOperation::AddUniquePostfix(const std::string& unique_postfix) {
   for (int i = 0; i < src_tensors_names_.size(); ++i) {
     src_tensors_names_[i] += unique_postfix;
   }
@@ -258,21 +267,6 @@ void ElementwiseOperation::AddUniquePostfix(const std::string& unique_postfix) {
   }
 }
 
-absl::Status MergeOperations(
-    const std::vector<ElementwiseOperation*>& linked_ops,
-    Arguments* merged_args, std::string* merged_code) {
-  for (int i = 0; i < linked_ops.size(); ++i) {
-    std::string code = linked_ops[i]->GetCode();
-    std::string unique_postfix = absl::StrCat("_link", i + 1);
-    auto&& link_args = linked_ops[i]->MoveArgs();
-    link_args.RenameArgs(unique_postfix, &code);
-    *merged_code += "{\n" + code + "\n}\n";
-    RETURN_IF_ERROR(merged_args->Merge(std::move(link_args), unique_postfix));
-    linked_ops[i]->AddUniquePostfix(unique_postfix);
-  }
-  return absl::OkStatus();
-}
-
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
index 01e11f3ea64..620883f26f4 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
@@ -59,18 +59,15 @@ struct OperationDef {
   bool IsBatchSupported() const;
 };
 
-class ElementwiseOperation;
-
 // GPUOperation represents some implementation of neural network operation on
-// GPU. GPUOperation can contain ElementwiseOperation operations, in this case,
-// ElementwiseOperation still hold necessary data and should be alive.
-// When GPUOperation contains ElementwiseOperations, this GPUoperation replaces
-// some sequence of operations Op + el_op0 + el_op1 + ...
+// GPU. GPUOperation can contain other GPU operations with flag elementwise_.
+// When GPUOperation contains other GPU ops, this GPUOperation replaces
+// some sequence of operations Op + op0 + op1 + ...
 // Because of this abilities of GPUOperation, usage scenario is next:
 // Create instance of GPUOperation.
-// Create all instances of ElementwiseOperations that we will(probably) attach
-// to GPUOperation. Attach all ElementwiseOperations to GPUOperation. Call
-// GPUOperation.Compile(). Don't call ElementwiseOperation.Compile() if it
+// Create all instances of GPUOperations that we will (probably) attach
+// to GPUOperation. Attach all GPUOperations to GPUOperation. Call
+// GPUOperation.Compile(). Don't call Compile() on a linked op once it is
 // attached, it useless(and may be error)
 class GPUOperation {
  public:
@@ -83,7 +80,7 @@ class GPUOperation {
   GPUOperation(const GPUOperation&) = delete;
   GPUOperation& operator=(const GPUOperation&) = delete;
 
-  void AddOperation(ElementwiseOperation* operation);
+  void AddOperation(GPUOperation* operation);
 
   void SetSrc(Tensor* ptr, int index = 0);
   void SetDst(Tensor* ptr, int index = 0);
@@ -116,64 +113,37 @@ class GPUOperation {
   void AddDstTensor(const std::string& tensor_name,
                     const TensorDescriptor& desc);
 
+  bool IsLinkable() const { return elementwise_ && linkable_; }
+
+  // for linking
+  void AddUniquePostfix(const std::string& unique_postfix);
+
+  Arguments args_;
+  std::string code_;
+
+  bool elementwise_ = false;
+  // applicable only with elementwise_ = true;
+  bool linkable_ = true;  // by default every elementwise is linkable
+  // applicable only with elementwise_ = true;
+  bool check_src_channels_size_ = false;
+
  protected:
   virtual absl::Status BindArguments() { return absl::OkStatus(); }
-  virtual int3 GetGridSize() const = 0;
+  virtual int3 GetGridSize() const;
 
   // Defines operation calculation precision and format of src/dst tensors.
   OperationDef definition_;
   std::vector<Tensor*> src_;
   std::vector<Tensor*> dst_;
-  Arguments args_;
   CLKernel kernel_;
   int3 work_group_size_ = int3(8, 4, 1);
   int3 grid_size_ = int3(0, 0, 0);
-  std::string code_;
   std::vector<std::string> src_tensors_names_;
   std::vector<std::string> dst_tensors_names_;
   std::vector<CompilerOptions> compiler_options_;
-  std::vector<ElementwiseOperation*> linked_operations_;
+  std::vector<GPUOperation*> linked_operations_;
 };
 
-// ElementwiseOperation can be fused(linked) to another operation.
-// field linked_ indicate about this
-// link_index_ used mostly for generating of correct names for
-//   linked code variables
-// link_index_ is number of operation in sequence of linked operations
-// and should be unique in this sequence
-// link_index_ = 0 is equivalent that operation not linked.
-class ElementwiseOperation : public GPUOperation {
- public:
-  ElementwiseOperation() {}
-  explicit ElementwiseOperation(const OperationDef& definition)
-      : GPUOperation(definition) {}
-
-  virtual ~ElementwiseOperation() {}
-
-  absl::Status Compile(const CreationContext& creation_context) override;
-  int3 GetGridSize() const override;
-
-  // Move only
-  ElementwiseOperation(ElementwiseOperation&& operation);
-  ElementwiseOperation& operator=(ElementwiseOperation&& operation);
-  ElementwiseOperation(const ElementwiseOperation&) = delete;
-  ElementwiseOperation& operator=(const ElementwiseOperation&) = delete;
-
-  Arguments&& MoveArgs() { return std::move(args_); }
-  std::string GetCode() const { return code_; }
-  void AddUniquePostfix(const std::string& unique_postfix);
-
-  bool IsLinkable() const { return linkable_; }
-
- protected:
-  bool check_src_channels_size_ = false;
-  bool linkable_ = true;
-};
-
-absl::Status MergeOperations(
-    const std::vector<ElementwiseOperation*>& linked_ops,
-    Arguments* merged_args, std::string* merged_code);
-
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc
index 85c88f3b51b..1ca2e096a0e 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.cc
@@ -24,47 +24,43 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-PReLU::PReLU(const OperationDef& definition, const PReLUAttributes& attr,
-             CalculationsPrecision scalar_precision)
-    : ElementwiseOperation(definition) {
+absl::Status CreatePReLU(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const PReLUAttributes& attr, GPUOperation* result) {
+  *result = GPUOperation(definition);
+  result->elementwise_ = true;
   if (attr.clip != 0) {
     if (definition.precision == CalculationsPrecision::F32) {
-      args_.AddFloat("clip", attr.clip);
+      result->args_.AddFloat("clip", attr.clip);
     } else {
-      args_.AddHalf("clip", half(attr.clip));
+      result->args_.AddHalf("clip", half(attr.clip));
     }
-    code_ =
+    result->code_ =
         "in_out_value = clamp(in_out_value, (FLT4)(0.0f), (FLT4)(args.clip)) + "
         "min((FLT4)(0.0f), in_out_value) * args.alpha.Read(S_COORD);";
   } else {
-    code_ =
+    result->code_ =
         "in_out_value = max((FLT4)(0.0f), in_out_value) + min((FLT4)(0.0f), "
         "in_out_value) * args.alpha.Read(S_COORD);";
   }
-}
 
-PReLU::PReLU(PReLU&& operation) : ElementwiseOperation(std::move(operation)) {}
-
-PReLU& PReLU::operator=(PReLU&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-absl::Status CreatePReLU(const CreationContext& creation_context,
-                         const OperationDef& definition,
-                         const PReLUAttributes& attr, PReLU* result) {
   auto alpha =
       absl::get_if<tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(&attr.alpha);
   if (!alpha) {
     return absl::InvalidArgumentError("Alpha is missing");
   }
-  const auto scalar_precision = creation_context.device->IsPowerVR()
-                                    ? CalculationsPrecision::F32
-                                    : definition.precision;
-  *result = PReLU(definition, attr, scalar_precision);
-  RETURN_IF_ERROR(result->UploadParameters(*alpha, creation_context.context));
+  TensorLinearDescriptor desc;
+  desc.storage_type =
+      DeduceLinearStorageType(definition.GetPrimaryStorageType());
+  desc.element_type = definition.GetPrimaryDataType();
+
+  LinearStorage lt;
+  RETURN_IF_ERROR(
+      CreateLinearStorage(desc, *alpha, creation_context.context, &lt));
+  result->args_.AddObject("alpha", AccessType::READ,
+                          absl::make_unique<LinearStorage>(std::move(lt)),
+                          absl::make_unique<TensorLinearDescriptor>(desc));
+
   return absl::OkStatus();
 }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h
index e65559cf7c7..b673217c799 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu.h
@@ -31,48 +31,9 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-class PReLU : public ElementwiseOperation {
- public:
-  PReLU() = default;
-  // Move only
-  PReLU(PReLU&& operation);
-  PReLU& operator=(PReLU&& operation);
-  PReLU(const PReLU&) = delete;
-  PReLU& operator=(const PReLU&) = delete;
-
-  friend absl::Status CreatePReLU(const CreationContext& creation_context,
-                                  const OperationDef& definition,
-                                  const PReLUAttributes& attr, PReLU* result);
-
- private:
-  PReLU(const OperationDef& definition, const PReLUAttributes& attr,
-        CalculationsPrecision scalar_precision);
-
-  template <DataType T>
-  absl::Status UploadParameters(
-      const tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context);
-};
-
 absl::Status CreatePReLU(const CreationContext& creation_context,
                          const OperationDef& definition,
-                         const PReLUAttributes& attr, PReLU* result);
-
-template <DataType T>
-absl::Status PReLU::UploadParameters(
-    const tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context) {
-  TensorLinearDescriptor desc;
-  desc.storage_type =
-      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
-  desc.element_type = definition_.GetPrimaryDataType();
-
-  LinearStorage lt;
-  RETURN_IF_ERROR(CreateLinearStorage(desc, parameters, context, &lt));
-  args_.AddObject("alpha", AccessType::READ,
-                  absl::make_unique<LinearStorage>(std::move(lt)),
-                  absl::make_unique<TensorLinearDescriptor>(desc));
-
-  return absl::OkStatus();
-}
+                         const PReLUAttributes& attr, GPUOperation* result);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc
index 4b0006c7f32..06ff09ccca7 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/prelu_test.cc
@@ -52,7 +52,7 @@ TEST_F(OpenCLOperationTest, PReLUAlpha) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      PReLU operation;
+      GPUOperation operation;
       ASSERT_OK(CreatePReLU(creation_context_, op_def, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
@@ -83,7 +83,7 @@ TEST_F(OpenCLOperationTest, PReLUAlphaClip) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      PReLU operation;
+      GPUOperation operation;
       ASSERT_OK(CreatePReLU(creation_context_, op_def, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc
index 957fc9bbb98..e0c44e1cda7 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.cc
@@ -25,59 +25,37 @@ limitations under the License.
 namespace tflite {
 namespace gpu {
 namespace cl {
-
-QuantizeAndDequantize::QuantizeAndDequantize(
-    const OperationDef& definition, const QuantizeAndDequantizeAttributes& attr,
-    CalculationsPrecision scalar_precision)
-    : ElementwiseOperation(definition) {
-  if (definition.precision == CalculationsPrecision::F32) {
-    args_.AddFloat("min", attr.min);
-    args_.AddFloat("max", attr.max);
-    args_.AddFloat("scale", attr.scale);
-  } else {
-    args_.AddHalf("min", half(attr.min));
-    args_.AddHalf("max", half(attr.max));
-    args_.AddHalf("scale", half(attr.scale));
-  }
-  code_ = R"(
-FLT4 clamped_value = min((FLT4)(args.max), max((FLT4)(args.min), in_out_value));
-FLT4 quantized_value = round((clamped_value - (FLT4)(args.min)) / (FLT4)(args.scale));
-FLT4 dequantized_value = quantized_value * (FLT4)(args.scale) + (FLT4)(args.min);
-in_out_value = dequantized_value;)";
-}
-
-QuantizeAndDequantize::QuantizeAndDequantize(QuantizeAndDequantize&& operation)
-    : ElementwiseOperation(std::move(operation)) {}
-
-QuantizeAndDequantize& QuantizeAndDequantize::operator=(
-    QuantizeAndDequantize&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-absl::Status CreateQuantizeAndDequantize(
+GPUOperation CreateQuantizeAndDequantize(
     const CreationContext& creation_context, const OperationDef& definition,
-    const QuantizeAndDequantizeAttributes& attr,
-    QuantizeAndDequantize* result) {
-  const auto scalar_precision = creation_context.device->IsPowerVR()
-                                    ? CalculationsPrecision::F32
-                                    : definition.precision;
+    const QuantizeAndDequantizeAttributes& attr) {
+  QuantizeAndDequantizeAttributes adjusted_attr = attr;
   const bool is_fp16 = definition.precision == CalculationsPrecision::F16 ||
                        definition.precision == CalculationsPrecision::F32_F16;
   if (is_fp16 && attr.scale < 0.000062f) {
     // The smallest positive normal number for Half-precision floating-point
     // format is 2^-14 ~ 0.000062f. Therefore, if the scale is lesser than this
     // number, we just reset it accordingly.
-    QuantizeAndDequantizeAttributes adjusted_attr = attr;
     adjusted_attr.scale = 0.000062f;
-    *result =
-        QuantizeAndDequantize(definition, adjusted_attr, scalar_precision);
-  } else {
-    *result = QuantizeAndDequantize(definition, attr, scalar_precision);
   }
-  return absl::OkStatus();
+
+  GPUOperation op(definition);
+  op.elementwise_ = true;
+  if (definition.precision == CalculationsPrecision::F32) {
+    op.args_.AddFloat("min", adjusted_attr.min);
+    op.args_.AddFloat("max", adjusted_attr.max);
+    op.args_.AddFloat("scale", adjusted_attr.scale);
+  } else {
+    op.args_.AddHalf("min", half(adjusted_attr.min));
+    op.args_.AddHalf("max", half(adjusted_attr.max));
+    op.args_.AddHalf("scale", half(adjusted_attr.scale));
+  }
+  op.code_ = R"(
+FLT4 clamped_value = min((FLT4)(args.max), max((FLT4)(args.min), in_out_value));
+FLT4 quantized_value = round((clamped_value - (FLT4)(args.min)) / (FLT4)(args.scale));
+FLT4 dequantized_value = quantized_value * (FLT4)(args.scale) + (FLT4)(args.min);
+in_out_value = dequantized_value;)";
+
+  return op;
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
index a40aa21d23c..6e028625852 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize.h
@@ -43,43 +43,9 @@ namespace cl {
 //
 // NOTE: We do not need to nudge min/max values in this op, since they would
 // already be adjusted while generating the quantized model.
-class QuantizeAndDequantize : public ElementwiseOperation {
- public:
-  QuantizeAndDequantize() = default;
-  // Move only
-  QuantizeAndDequantize(QuantizeAndDequantize&& operation);
-  QuantizeAndDequantize& operator=(QuantizeAndDequantize&& operation);
-  QuantizeAndDequantize(const QuantizeAndDequantize&) = delete;
-  QuantizeAndDequantize& operator=(const QuantizeAndDequantize&) = delete;
-
-  friend absl::Status CreateQuantizeAndDequantize(
-      const CreationContext& creation_context, const OperationDef& definition,
-      const QuantizeAndDequantizeAttributes& attr,
-      QuantizeAndDequantize* result);
-
- private:
-  QuantizeAndDequantize(const OperationDef& definition,
-                        const QuantizeAndDequantizeAttributes& attr,
-                        CalculationsPrecision scalar_precision);
-
-  template <DataType T>
-  absl::Status UploadParameters(
-      const tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context);
-};
-
-absl::Status CreateQuantizeAndDequantize(
+GPUOperation CreateQuantizeAndDequantize(
     const CreationContext& creation_context, const OperationDef& definition,
-    const QuantizeAndDequantizeAttributes& attr, QuantizeAndDequantize* result);
-
-template <DataType T>
-absl::Status QuantizeAndDequantize::UploadParameters(
-    const tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context) {
-  LinearStorageCreateInfo create_info;
-  create_info.storage_type =
-      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
-  create_info.data_type = definition_.GetPrimaryDataType();
-  return absl::OkStatus();
-}
+    const QuantizeAndDequantizeAttributes& attr);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc
index 71d6d066b9b..43b5d69323d 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/quantize_and_dequantize_test.cc
@@ -56,9 +56,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits8) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      QuantizeAndDequantize operation;
-      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
-                                            &operation));
+      GPUOperation operation =
+          CreateQuantizeAndDequantize(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 3, 2, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -92,9 +91,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits8_NegativeRange) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      QuantizeAndDequantize operation;
-      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
-                                            &operation));
+      GPUOperation operation =
+          CreateQuantizeAndDequantize(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 3, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -128,9 +126,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim3Bits16) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      QuantizeAndDequantize operation;
-      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
-                                            &operation));
+      GPUOperation operation =
+          CreateQuantizeAndDequantize(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 3, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -164,9 +161,8 @@ TEST_F(OpenCLOperationTest, QuantAndDequant_Dim2Bits16_NegativeRange) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      QuantizeAndDequantize operation;
-      ASSERT_OK(CreateQuantizeAndDequantize(creation_context_, op_def, attr,
-                                            &operation));
+      GPUOperation operation =
+          CreateQuantizeAndDequantize(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 3, 2, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc
index 774c030545a..a80dccd6259 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu.cc
@@ -21,50 +21,36 @@ limitations under the License.
 namespace tflite {
 namespace gpu {
 namespace cl {
+GPUOperation CreateReLU(const CreationContext& creation_context,
+                        const OperationDef& definition,
+                        const ReLUAttributes& attr) {
+  GPUOperation op(definition);
+  op.elementwise_ = true;
 
-ReLU::ReLU(const OperationDef& definition, const ReLUAttributes& attr,
-           CalculationsPrecision scalar_precision)
-    : ElementwiseOperation(definition) {
   std::string min_func;
   if (attr.alpha != 0.0f) {
     min_func = "min(in_out_value * args.alpha, (FLT)(0.0f))";
     if (definition.precision == CalculationsPrecision::F32) {
-      args_.AddFloat("alpha", attr.alpha);
+      op.args_.AddFloat("alpha", attr.alpha);
     } else {
-      args_.AddHalf("alpha", half(attr.alpha));
+      op.args_.AddHalf("alpha", half(attr.alpha));
     }
   } else {
     min_func = "(FLT)(0.0f)";
   }
   if (attr.clip != 0.0f) {
     if (definition.precision == CalculationsPrecision::F32) {
-      args_.AddFloat("clip", attr.clip);
+      op.args_.AddFloat("clip", attr.clip);
     } else {
-      args_.AddHalf("clip", half(attr.clip));
+      op.args_.AddHalf("clip", half(attr.clip));
     }
-    code_ = absl::StrCat("in_out_value = clamp(in_out_value, " + min_func +
-                         ", args.clip);");
+    op.code_ = absl::StrCat("in_out_value = clamp(in_out_value, " + min_func +
+                            ", args.clip);");
   } else {
-    code_ = absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");");
+    op.code_ =
+        absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");");
   }
-}
-
-ReLU::ReLU(ReLU&& operation) : ElementwiseOperation(std::move(operation)) {}
-
-ReLU& ReLU::operator=(ReLU&& operation) {
-  if (this != &operation) {
-    ElementwiseOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-ReLU CreateReLU(const CreationContext& creation_context,
-                const OperationDef& definition, const ReLUAttributes& attr) {
-  const auto scalar_precision = creation_context.device->IsPowerVR()
-                                    ? CalculationsPrecision::F32
-                                    : definition.precision;
-  ReLU operation(definition, attr, scalar_precision);
-  return operation;
+  return op;
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu.h b/tensorflow/lite/delegates/gpu/cl/kernels/relu.h
index ccb6f6ca37f..001e23da41c 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu.h
@@ -25,25 +25,9 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
-class ReLU : public ElementwiseOperation {
- public:
-  // Move only
-  ReLU(ReLU&& operation);
-  ReLU& operator=(ReLU&& operation);
-  ReLU(const ReLU&) = delete;
-  ReLU& operator=(const ReLU&) = delete;
-
-  friend ReLU CreateReLU(const CreationContext& creation_context,
-                         const OperationDef& definition,
-                         const ReLUAttributes& attr);
-
- private:
-  ReLU(const OperationDef& definition, const ReLUAttributes& attr,
-       CalculationsPrecision scalar_precision);
-};
-
-ReLU CreateReLU(const CreationContext& creation_context,
-                const OperationDef& definition, const ReLUAttributes& attr);
+GPUOperation CreateReLU(const CreationContext& creation_context,
+                        const OperationDef& definition,
+                        const ReLUAttributes& attr);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
index cebc9886ba5..f741a408661 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
@@ -49,7 +49,7 @@ TEST_F(OpenCLOperationTest, ReLUNoClipNoAlpha) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ReLU operation = CreateReLU(creation_context_, op_def, attr);
+      GPUOperation operation = CreateReLU(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -76,7 +76,7 @@ TEST_F(OpenCLOperationTest, ReLUClip) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ReLU operation = CreateReLU(creation_context_, op_def, attr);
+      GPUOperation operation = CreateReLU(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -103,7 +103,7 @@ TEST_F(OpenCLOperationTest, ReLUAlpha) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ReLU operation = CreateReLU(creation_context_, op_def, attr);
+      GPUOperation operation = CreateReLU(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -130,7 +130,7 @@ TEST_F(OpenCLOperationTest, ReLUAlphaClip) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      ReLU operation = CreateReLU(creation_context_, op_def, attr);
+      GPUOperation operation = CreateReLU(creation_context_, op_def, attr);
       ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
index 088677ba7e2..f60af5f730d 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@@ -144,9 +144,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
       if (inputs.size() == 2 &&
           (inputs[0]->tensor.shape.c == inputs[1]->tensor.shape.c ||
            inputs[1]->tensor.shape.c == 1)) {
-        ElementwiseTwoInput operation =
+        GPUOperation operation =
             CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape);
-        *gpu_op = absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
         return absl::OkStatus();
       } else if (inputs.size() >= 2) {
         auto output = outputs[0];
@@ -167,25 +167,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
             absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
                 &attr.param);
         if (scalar) {
-          ElementwiseOneRuntimeOneScalar operation =
-              CreateElementwiseOneRuntimeOneScalar(creation_context, op_def,
-                                                   op_type, *scalar);
-          *gpu_op = absl::make_unique<ElementwiseOneRuntimeOneScalar>(
-              std::move(operation));
+          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
+              creation_context, op_def, op_type, *scalar);
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (linear_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (hwc_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         }
       }
@@ -295,9 +291,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
     }
     case OperationType::MUL: {
       if (inputs.size() == 2) {
-        ElementwiseTwoInput operation =
+        GPUOperation operation =
             CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape);
-        *gpu_op = absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
         return absl::OkStatus();
       } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
         auto attr =
@@ -310,25 +306,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
             absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
                 &attr.param);
         if (scalar) {
-          ElementwiseOneRuntimeOneScalar operation =
-              CreateElementwiseOneRuntimeOneScalar(creation_context, op_def,
-                                                   op_type, *scalar);
-          *gpu_op = absl::make_unique<ElementwiseOneRuntimeOneScalar>(
-              std::move(operation));
+          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
+              creation_context, op_def, op_type, *scalar);
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (linear_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (hwc_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         }
       }
@@ -353,8 +345,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
     case OperationType::QUANTIZE_AND_DEQUANTIZE: {
       auto attr = absl::any_cast<QuantizeAndDequantizeAttributes>(
           node.operation.attributes);
-      return SelectQuantizeAndDequantize(attr, creation_context, op_def,
-                                         gpu_op);
+      SelectQuantizeAndDequantize(attr, creation_context, op_def, gpu_op);
+      return absl::OkStatus();
     }
     case OperationType::RELU: {
       auto attr = absl::any_cast<ReLUAttributes>(node.operation.attributes);
@@ -405,9 +397,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
     case OperationType::SQRT:
     case OperationType::SQUARE:
     case OperationType::TANH: {
-      ElementwiseOneInput operation =
-          CreateElementwiseOneInput(op_def, op_type);
-      *gpu_op = absl::make_unique<ElementwiseOneInput>(std::move(operation));
+      GPUOperation operation = CreateElementwiseOneInput(op_def, op_type);
+      *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
       return absl::OkStatus();
     }
     case OperationType::DIV:
@@ -417,9 +408,9 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
     case OperationType::SQUARED_DIFF:
     case OperationType::SUB: {
       if (inputs.size() == 2) {
-        ElementwiseTwoInput operation =
+        GPUOperation operation =
             CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape);
-        *gpu_op = absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
         return absl::OkStatus();
       } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
         auto attr =
@@ -432,25 +423,21 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
             absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
                 &attr.param);
         if (scalar) {
-          ElementwiseOneRuntimeOneScalar operation =
-              CreateElementwiseOneRuntimeOneScalar(creation_context, op_def,
-                                                   op_type, *scalar);
-          *gpu_op = absl::make_unique<ElementwiseOneRuntimeOneScalar>(
-              std::move(operation));
+          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
+              creation_context, op_def, op_type, *scalar);
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (linear_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         } else if (hwc_tensor) {
-          ElementwiseTwoInput operation;
+          GPUOperation operation;
           RETURN_IF_ERROR(CreateElementwiseTwoInput(
               creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op =
-              absl::make_unique<ElementwiseTwoInput>(std::move(operation));
+          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
           return absl::OkStatus();
         }
       }
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc
index a32efd5dd2c..1c0bed74422 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc
@@ -54,17 +54,17 @@ void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info,
 void SelectReLU(const CreationContext& creation_context,
                 const ReLUAttributes& attr, const OperationDef& op_def,
                 std::unique_ptr<GPUOperation>* ptr) {
-  ReLU relu = CreateReLU(creation_context, op_def, attr);
-  *ptr = absl::make_unique<ReLU>(std::move(relu));
+  GPUOperation relu = CreateReLU(creation_context, op_def, attr);
+  *ptr = absl::make_unique<GPUOperation>(std::move(relu));
 }
 
 absl::Status SelectPReLU(const PReLUAttributes& attr,
                          const CreationContext& creation_context,
                          const OperationDef& op_def,
                          std::unique_ptr<GPUOperation>* ptr) {
-  PReLU operation;
+  GPUOperation operation;
   RETURN_IF_ERROR(CreatePReLU(creation_context, op_def, attr, &operation));
-  *ptr = absl::make_unique<PReLU>(std::move(operation));
+  *ptr = absl::make_unique<GPUOperation>(std::move(operation));
   return absl::OkStatus();
 }
 
@@ -85,8 +85,8 @@ void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr,
 
 void SelectAdd(const OperationDef& op_def, const std::vector<int>& channels,
                int dst_channels, std::unique_ptr<GPUOperation>* ptr) {
-  Add operation = CreateAdd(op_def, channels, dst_channels);
-  *ptr = absl::make_unique<Add>(std::move(operation));
+  GPUOperation operation = CreateAdd(op_def, channels, dst_channels);
+  *ptr = absl::make_unique<GPUOperation>(std::move(operation));
 }
 
 absl::Status SelectResize(const Resize2DAttributes& attr,
@@ -203,15 +203,13 @@ absl::Status SelectWinograd36To4x4(
   return absl::OkStatus();
 }
 
-absl::Status SelectQuantizeAndDequantize(
-    const QuantizeAndDequantizeAttributes& attr,
-    const CreationContext& creation_context, const OperationDef& op_def,
-    std::unique_ptr<GPUOperation>* ptr) {
-  QuantizeAndDequantize operation;
-  RETURN_IF_ERROR(
-      CreateQuantizeAndDequantize(creation_context, op_def, attr, &operation));
-  *ptr = absl::make_unique<QuantizeAndDequantize>(std::move(operation));
-  return absl::OkStatus();
+void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
+                                 const CreationContext& creation_context,
+                                 const OperationDef& op_def,
+                                 std::unique_ptr<GPUOperation>* ptr) {
+  GPUOperation operation =
+      CreateQuantizeAndDequantize(creation_context, op_def, attr);
+  *ptr = absl::make_unique<GPUOperation>(std::move(operation));
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h
index f266882a458..7133aa94502 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h
@@ -97,10 +97,10 @@ absl::Status SelectWinograd36To4x4(
     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
     std::unique_ptr<GPUOperation>* ptr);
 
-absl::Status SelectQuantizeAndDequantize(
-    const QuantizeAndDequantizeAttributes& attr,
-    const CreationContext& creation_context, const OperationDef& op_def,
-    std::unique_ptr<GPUOperation>* ptr);
+void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
+                                 const CreationContext& creation_context,
+                                 const OperationDef& op_def,
+                                 std::unique_ptr<GPUOperation>* ptr);
 
 }  // namespace cl
 }  // namespace gpu