From d353f49989edd1e9e8cf41466fb2d46c226eea5d Mon Sep 17 00:00:00 2001
From: Raman Sarokin <sorokin@google.com>
Date: Mon, 3 Aug 2020 18:28:29 -0700
Subject: [PATCH] Extended support of SUB (and other elementwise ops). OpenCL
 delegate supports SUB with runtime tensor as second argument.

PiperOrigin-RevId: 324726289
Change-Id: If26a72a5214bffc7b664f1902344ab04038ed3f5
---
 .../delegates/gpu/cl/kernels/elementwise.cc   | 103 +++++++++++++-----
 .../delegates/gpu/cl/kernels/elementwise.h    |  26 +----
 .../gpu/cl/kernels/elementwise_test.cc        |  60 +++++++---
 .../gpu/cl/selectors/operation_selector.cc    |  99 ++---------------
 .../delegates/gpu/common/model_builder.cc     |   2 +
 .../lite/delegates/gpu/common/operations.h    |   4 +
 6 files changed, 143 insertions(+), 151 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
index 063b15c1b69..f735f1aa047 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
@@ -98,53 +98,51 @@ std::string GetOneInputCode(const OperationType& op_type,
 }
 
 std::string GetTwoInputCode(const OperationType& op_type,
+                            const std::string& result_var,
                             const std::string& input0,
-                            const std::string& input1) {
+                            const std::string& input1,
+                            bool swap_inputs = false) {
   std::string result;
   switch (op_type) {
     case OperationType::ADD:
-      result += "$0 += $1;\n";
+      result += "$0 = $1 + $2;\n";
       break;
     case OperationType::DIV:
-      result += "$0 /= $1;\n";
+      result += "$0 = $1 / $2;\n";
       break;
     case OperationType::MAXIMUM:
-      result += "$0 = max($0, $1);\n";
+      result += "$0 = max($1, $2);\n";
       break;
     case OperationType::MINIMUM:
-      result += "$0 = min($0, $1);\n";
+      result += "$0 = min($1, $2);\n";
       break;
     case OperationType::MUL:
-      result += "$0 *= $1;\n";
+      result += "$0 = $1 * $2;\n";
       break;
     case OperationType::POW:
-      result += "$0 = pow($0, $1);\n";
+      result += "$0 = pow($1, $2);\n";
       break;
     case OperationType::SQUARED_DIFF:
-      result += "$0 -= $1;\n";
-      result += "$0 *= $0;\n";
+      result += "$0 = ($1 - $2) * ($1 - $2);\n";
       break;
     case OperationType::SUB:
-      result += "$0 -= $1;\n";
+      result += "$0 = $1 - $2;\n";
       break;
     default:
       return "Unknown operation type;\n";
   }
-  return absl::Substitute(result, input0, input1);
-}
-}  // namespace
-
-GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
-                                       const OperationType& op_type) {
-  GPUOperation op(definition);
-  op.elementwise_ = true;
-  op.code_ = GetOneInputCode(op_type, definition.precision, "in_out_value");
-  return op;
+  if (swap_inputs) {
+    return absl::Substitute(result, result_var, input1, input0);
+  } else {
+    return absl::Substitute(result, result_var, input0, input1);
+  }
 }
 
+// Creates simple two input (runtime tensor and scalar argument, swapped when
+// swap_inputs is true) operation, for example sub, div, pow, etc.
 GPUOperation CreateElementwiseOneRuntimeOneScalar(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const OperationType& op_type, float scalar_parameter) {
+    const OperationDef& definition, const OperationType& op_type,
+    float scalar_parameter, bool swap_inputs) {
   GPUOperation op(definition);
   op.elementwise_ = true;
   if (definition.precision == CalculationsPrecision::F32) {
@@ -152,15 +150,21 @@ GPUOperation CreateElementwiseOneRuntimeOneScalar(
   } else {
     op.args_.AddHalf("scalar", half(scalar_parameter));
   }
-  op.code_ = GetTwoInputCode(op_type, "in_out_value", "args.scalar");
+  op.code_ =
+      "FLT4 second_val = (FLT4)(args.scalar, args.scalar, args.scalar, "
+      "args.scalar);\n";
+  op.code_ += GetTwoInputCode(op_type, "in_out_value", "in_out_value",
+                              "second_val", swap_inputs);
   return op;
 }
 
+// Creates simple two input (runtime tensor and constant linear tensor, swapped
+// when swap_inputs is true) operation, for example sub, div, etc.
 absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
-    GPUOperation* result) {
+    bool swap_inputs, GPUOperation* result) {
   const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v);
   TensorStorageType storage_type =
       SelectBestStorageType(*creation_context.context, *creation_context.device,
@@ -187,15 +191,18 @@ absl::Status CreateElementwiseTwoInput(
     result->code_ += "  second_val.z = second_val.x;\n";
     result->code_ += "  second_val.w = second_val.x;\n";
   }
-  result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
+  result->code_ += GetTwoInputCode(op_type, "in_out_value", "in_out_value",
+                                   "second_val", swap_inputs);
   return absl::OkStatus();
 }
 
+// Creates simple two input (runtime tensor and constant HWC tensor, swapped
+// when swap_inputs is true) operation, for example sub, div, etc.
 absl::Status CreateElementwiseTwoInput(
     const CreationContext& creation_context, const OperationDef& definition,
     const OperationType& op_type,
     const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
-    GPUOperation* result) {
+    bool swap_inputs, GPUOperation* result) {
   const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w,
                           constant_tensor.shape.c);
   TensorStorageType storage_type =
@@ -225,11 +232,50 @@ absl::Status CreateElementwiseTwoInput(
     result->code_ += "  second_val.z = second_val.x;\n";
     result->code_ += "  second_val.w = second_val.x;\n";
   }
-  result->code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
+  result->code_ += GetTwoInputCode(op_type, "in_out_value", "in_out_value",
+                                   "second_val", swap_inputs);
 
   return absl::OkStatus();
 }
 
+}  // namespace
+
+GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
+                                       const OperationType& op_type) {
+  GPUOperation op(definition);
+  op.elementwise_ = true;
+  op.code_ = GetOneInputCode(op_type, definition.precision, "in_out_value");
+  return op;
+}
+
+absl::Status CreateElementwise(const CreationContext& creation_context,
+                               const OperationDef& definition,
+                               const OperationType& op_type,
+                               const ElementwiseAttributes& attr,
+                               GPUOperation* result) {
+  const float* scalar = absl::get_if<float>(&attr.param);
+  const auto* linear_tensor =
+      absl::get_if<tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(&attr.param);
+  const auto* hwc_tensor =
+      absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(&attr.param);
+
+  if (scalar) {
+    *result = CreateElementwiseOneRuntimeOneScalar(
+        definition, op_type, *scalar, attr.runtime_tensor_is_second);
+    return absl::OkStatus();
+  } else if (linear_tensor) {
+    return CreateElementwiseTwoInput(creation_context, definition, op_type,
+                                     *linear_tensor,
+                                     attr.runtime_tensor_is_second, result);
+  } else if (hwc_tensor) {
+    return CreateElementwiseTwoInput(creation_context, definition, op_type,
+                                     *hwc_tensor, attr.runtime_tensor_is_second,
+                                     result);
+  }
+  return absl::UnimplementedError(
+      "No elementwise implementation for this case");
+}
+
 GPUOperation CreateElementwiseTwoInput(const OperationDef& definition,
                                        const OperationType& op_type,
                                        const BHWC& shape) {
@@ -250,7 +296,8 @@ GPUOperation CreateElementwiseTwoInput(const OperationDef& definition,
     op.code_ += "  second_val.z = second_val.x;\n";
     op.code_ += "  second_val.w = second_val.x;\n";
   }
-  op.code_ += GetTwoInputCode(op_type, "in_out_value", "second_val");
+  op.code_ += GetTwoInputCode(op_type, "in_out_value", "in_out_value",
+                              "second_val", false);
   return op;
 }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
index d03d535b39a..f841cdba9fb 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h
@@ -31,27 +31,13 @@ namespace cl {
 GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
                                        const OperationType& op_type);
 
-// Creates simple two input (first input is runtime tensor and second input is
-// scalar argument) operation, for example sub, div, pow, etc.
-GPUOperation CreateElementwiseOneRuntimeOneScalar(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const OperationType& op_type, float scalar_parameter);
-
 // Creates simple two input(first input is runtime tensor and second input is
-// constant linear tensor) operation, for example sub, div and etc.
-absl::Status CreateElementwiseTwoInput(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const OperationType& op_type,
-    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
-    GPUOperation* result);
-
-// Creates simple two input(first input is runtime tensor and second input is
-// constant HWC tensor) operation, for example sub, div and etc.
-absl::Status CreateElementwiseTwoInput(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const OperationType& op_type,
-    const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
-    GPUOperation* result);
+// scalar or constant linear/hwc tensor) operation, for example sub, div, etc.
+absl::Status CreateElementwise(const CreationContext& creation_context,
+                               const OperationDef& definition,
+                               const OperationType& op_type,
+                               const ElementwiseAttributes& attr,
+                               GPUOperation* result);
 
 // Creates simple two input(2 runtime tensors) operation, for example
 // sub, div and etc.
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
index 11a651df901..23ee6622e8c 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc
@@ -546,9 +546,9 @@ TEST_F(OpenCLOperationTest, MaximumWithScalar) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      const float* scalar = absl::get_if<float>(&attr.param);
-      GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
-          creation_context_, op_def, OperationType::MAXIMUM, *scalar);
+      GPUOperation operation;
+      ASSERT_OK(CreateElementwise(creation_context_, op_def,
+                                  OperationType::MAXIMUM, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 4, 1, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -578,9 +578,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) {
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
       GPUOperation operation;
-      ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
-                                          OperationType::MAXIMUM, linear_tensor,
-                                          &operation));
+      ASSERT_OK(CreateElementwise(creation_context_, op_def,
+                                  OperationType::MAXIMUM, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -597,6 +596,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) {
   ::tflite::gpu::Tensor<HWC, DataType::FLOAT32> hwc_tensor;
   hwc_tensor.shape = HWC(2, 1, 2);
   hwc_tensor.data = {0.5f, 2.0f, 0.7f, 4.7f};
+  ElementwiseAttributes attr;
+  attr.param = hwc_tensor;
 
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
@@ -608,9 +609,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) {
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
       GPUOperation operation;
-      ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
-                                          OperationType::MAXIMUM, hwc_tensor,
-                                          &operation));
+      ASSERT_OK(CreateElementwise(creation_context_, op_def,
+                                  OperationType::MAXIMUM, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -626,6 +626,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) {
   ::tflite::gpu::Tensor<HWC, DataType::FLOAT32> hwc_tensor;
   hwc_tensor.shape = HWC(2, 1, 1);
   hwc_tensor.data = {0.5f, 2.0f};
+  ElementwiseAttributes attr;
+  attr.param = hwc_tensor;
 
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
@@ -637,9 +639,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) {
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
       GPUOperation operation;
-      ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def,
-                                          OperationType::MAXIMUM, hwc_tensor,
-                                          &operation));
+      ASSERT_OK(CreateElementwise(creation_context_, op_def,
+                                  OperationType::MAXIMUM, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 2, 1, 2), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -693,9 +694,9 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) {
       op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
       op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
       TensorFloat32 dst_tensor;
-      const float* scalar = absl::get_if<float>(&attr.param);
-      GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
-          creation_context_, op_def, OperationType::MINIMUM, *scalar);
+      GPUOperation operation;
+      ASSERT_OK(CreateElementwise(creation_context_, op_def,
+                                  OperationType::MINIMUM, attr, &operation));
       ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
                                     BHWC(1, 4, 1, 1), &dst_tensor));
       EXPECT_THAT(dst_tensor.data,
@@ -788,6 +789,35 @@ TEST_F(OpenCLOperationTest, MulBroadcastChannels) {
   }
 }
 
+TEST_F(OpenCLOperationTest, SubWithScalarAtFirstPosition) {
+  TensorFloat32 src_tensor_0;
+  src_tensor_0.shape = BHWC(1, 4, 1, 1);
+  src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f};
+
+  ElementwiseAttributes attr;
+  attr.param = 4.0f;
+  attr.runtime_tensor_is_second = true;
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation;
+      ASSERT_OK(CreateElementwise(creation_context_, op_def, OperationType::SUB,
+                                  attr, &operation));
+      ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
+                                    BHWC(1, 4, 1, 1), &dst_tensor));
+      EXPECT_THAT(dst_tensor.data,
+                  Pointwise(FloatNear(eps), {4.0f, 10.2f, 2.0f, 7.0f}));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
index f60af5f730d..e1225e83e95 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@@ -159,31 +159,11 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
       } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
         auto attr =
             absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
-        const float* scalar = absl::get_if<float>(&attr.param);
-        const auto* linear_tensor =
-            absl::get_if<tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
-                &attr.param);
-        const auto* hwc_tensor =
-            absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
-                &attr.param);
-        if (scalar) {
-          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
-              creation_context, op_def, op_type, *scalar);
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (linear_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (hwc_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        }
+        GPUOperation operation;
+        RETURN_IF_ERROR(CreateElementwise(creation_context, op_def, op_type,
+                                          attr, &operation));
+        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
+        return absl::OkStatus();
       }
       return absl::UnimplementedError(absl::StrCat(
           "No support of ", node.operation.type, " with this parameters"));
@@ -289,44 +269,6 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
           absl::make_unique<MeanStdDevNormalization>(std::move(operation));
       return absl::OkStatus();
     }
-    case OperationType::MUL: {
-      if (inputs.size() == 2) {
-        GPUOperation operation =
-            CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape);
-        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-        return absl::OkStatus();
-      } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
-        auto attr =
-            absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
-        const float* scalar = absl::get_if<float>(&attr.param);
-        const auto* linear_tensor =
-            absl::get_if<tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
-                &attr.param);
-        const auto* hwc_tensor =
-            absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
-                &attr.param);
-        if (scalar) {
-          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
-              creation_context, op_def, op_type, *scalar);
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (linear_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (hwc_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        }
-      }
-      return absl::UnimplementedError(absl::StrCat(
-          "No support of ", node.operation.type, " with this parameters"));
-    }
     case OperationType::PAD: {
       auto attr = absl::any_cast<PadAttributes>(node.operation.attributes);
       SelectPadding(attr, op_def, gpu_op);
@@ -404,6 +346,7 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
     case OperationType::DIV:
     case OperationType::MAXIMUM:
     case OperationType::MINIMUM:
+    case OperationType::MUL:
     case OperationType::POW:
     case OperationType::SQUARED_DIFF:
     case OperationType::SUB: {
@@ -415,31 +358,11 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
       } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
         auto attr =
             absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
-        const float* scalar = absl::get_if<float>(&attr.param);
-        const auto* linear_tensor =
-            absl::get_if<tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
-                &attr.param);
-        const auto* hwc_tensor =
-            absl::get_if<tflite::gpu::Tensor<HWC, DataType::FLOAT32>>(
-                &attr.param);
-        if (scalar) {
-          GPUOperation operation = CreateElementwiseOneRuntimeOneScalar(
-              creation_context, op_def, op_type, *scalar);
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (linear_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *linear_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        } else if (hwc_tensor) {
-          GPUOperation operation;
-          RETURN_IF_ERROR(CreateElementwiseTwoInput(
-              creation_context, op_def, op_type, *hwc_tensor, &operation));
-          *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
-          return absl::OkStatus();
-        }
+        GPUOperation operation;
+        RETURN_IF_ERROR(CreateElementwise(creation_context, op_def, op_type,
+                                          attr, &operation));
+        *gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
+        return absl::OkStatus();
       }
       return absl::UnimplementedError(absl::StrCat(
           "No support of ", node.operation.type, " with this parameters"));
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index bf24e0d9eff..4c0fd827834 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -847,6 +847,8 @@ class ElementwiseOperationParser : public TFLiteOperationParser {
                                                         /*outputs=*/1));
       ElementwiseAttributes attr;
       RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param));
+      attr.runtime_tensor_is_second =
+          IsConstantTensor(reader->GetInputTensor(0));
       node->operation.attributes = std::move(attr);
     } else {
       return absl::InvalidArgumentError("Incorrect operation type passed");
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index 225165589ae..563dbdec96e 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -490,6 +490,10 @@ BHWC CalculateOutputShape(const BHWC& input, const MeanAttributes& attr);
 
 struct ElementwiseAttributes {
   TensorOrScalar param;
+  // For an elementwise operation with 2 inputs op(A, B),
+  // runtime_tensor_is_second is true when the runtime tensor is B (the second
+  // operand). This matters for non-commutative ops, for example subtract.
+  bool runtime_tensor_is_second = false;
 };
 
 struct ReshapeAttributes {