diff --git a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
index f35682dad3a..cb02a173dd9 100644
--- a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
+++ b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
@@ -38,6 +38,9 @@ namespace gpu {
 namespace cl {
 class ClOperation;
 }
+namespace metal {
+class ComputeTask;
+}
 
 // kCustom: default value
 // GPUOperation::GetGridSize must be overloaded
@@ -145,6 +148,7 @@ class GPUOperation {
 
  protected:
   friend class cl::ClOperation;
+  friend class metal::ComputeTask;
   friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
       const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
   friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,
diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD
index f46834398ec..eb179af712f 100644
--- a/tensorflow/lite/delegates/gpu/metal/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/BUILD
@@ -121,6 +121,7 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.cc b/tensorflow/lite/delegates/gpu/metal/compute_task.cc
index b76989f2094..9c0642bd36f 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.cc
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.cc
@@ -33,11 +33,43 @@ limitations under the License.
 namespace tflite {
 namespace gpu {
 namespace metal {
+namespace {
+int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
+                        const int3& work_group_size,
+                        const int3& work_group_launch_order) {
+  int3 work_groups_count;
+  if (grid_dimension == 1) {
+    work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    work_groups_count.y = 1;
+    work_groups_count.z = 1;
+  } else if (grid_dimension == 2) {
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = 1;
+  } else {  // grid_dimension == 3
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = wgs[work_group_launch_order[2]];
+  }
+  return work_groups_count;
+}
+}  // namespace
 
 void ComputeTask::Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc) {
   task_desc_ = std::move(task_desc);
 }
 
+void ComputeTask::Init(std::unique_ptr<GPUOperation>&& operation) {
+  operation_ = std::move(operation);
+}
+
 absl::Status ComputeTask::Compile(CalculationsPrecision precision,
                                   MetalDevice* device) {
   task_desc_->AssembleCode();
@@ -111,6 +143,71 @@ absl::Status ComputeTask::Compile(CalculationsPrecision precision,
   return absl::OkStatus();
 }
 
+absl::Status ComputeTask::CompileOp(MetalDevice* device) {
+  operation_->AssembleCode(device->GetInfo());
+  const std::map<std::string, std::string> linkables = {
+      {operation_->dst_tensors_names_[0], operation_->elementwise_code_}};
+  RETURN_IF_ERROR(metal_args_.Init(linkables, device, &operation_->args_,
+                                   &operation_->code_));
+
+  operation_->args_.ReleaseCPURepresentation();
+  NSString* storageType;
+  NSString* accumulatorType;
+  NSString* toAccumulatorType = @"";
+  NSString* toAccumulatorType2 = @"";
+  NSString* toAccumulatorType3 = @"";
+  NSString* toAccumulatorType4 = @"";
+  if (operation_->definition_.precision == CalculationsPrecision::F32) {
+    storageType = @"float";
+    accumulatorType = @"float";
+  } else {
+    // FP16
+    storageType = @"half";
+    if (operation_->definition_.precision == CalculationsPrecision::F32_F16) {
+      accumulatorType = @"float";
+      toAccumulatorType = @"float";
+      toAccumulatorType2 = @"float2";
+      toAccumulatorType3 = @"float3";
+      toAccumulatorType4 = @"float4";
+    } else {
+      accumulatorType = @"half";
+    }
+  }
+  NSDictionary<NSString*, NSString*>* macros = @{
+    @"FLT" : storageType,
+    @"FLT2" : [NSString stringWithFormat:@"%@2", storageType],
+    @"FLT3" : [NSString stringWithFormat:@"%@3", storageType],
+    @"FLT4" : [NSString stringWithFormat:@"%@4", storageType],
+    @"ACCUM_FLT" : accumulatorType,
+    @"ACCUM_FLT2" : [NSString stringWithFormat:@"%@2", accumulatorType],
+    @"ACCUM_FLT3" : [NSString stringWithFormat:@"%@3", accumulatorType],
+    @"ACCUM_FLT4" : [NSString stringWithFormat:@"%@4", accumulatorType],
+    @"TO_ACCUM_TYPE" : toAccumulatorType,
+    @"TO_ACCUM2_TYPE" : toAccumulatorType2,
+    @"TO_ACCUM3_TYPE" : toAccumulatorType3,
+    @"TO_ACCUM4_TYPE" : toAccumulatorType4,
+    @"MAIN_FUNCTION" : @"\"kernel void ComputeFunction\"",
+    @"GLOBAL_ID_0" : @"static_cast<int>(reserved_gid.x)",
+    @"GLOBAL_ID_1" : @"static_cast<int>(reserved_gid.y)",
+    @"GLOBAL_ID_2" : @"static_cast<int>(reserved_gid.z)",
+    @"INIT_FLT(value)" : [NSString stringWithFormat:@"%@(value)", storageType],
+    @"INIT_FLT4(value)" :
+        [NSString stringWithFormat:@"%@4(value)", storageType],
+  };
+
+  NSString* code =
+      [NSString stringWithCString:operation_->code_.c_str()
+                         encoding:[NSString defaultCStringEncoding]];
+  id<MTLComputePipelineState> program;
+  RETURN_IF_ERROR(CreateComputeProgram(device->device(), code,
+                                       @"ComputeFunction", macros, &program));
+  if (!program) {
+    return absl::InternalError("Unknown shader compilation error");
+  }
+  program_ = program;
+  return absl::OkStatus();
+}
+
 absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
                                        const std::vector<BHWC>& src_shapes,
                                        const std::vector<BHWC>& dst_shapes) {
@@ -137,6 +234,33 @@ absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
   return absl::OkStatus();
 }
 
+absl::Status ComputeTask::UpdateOpParams() {
+  for (int i = 0; i < operation_->src_tensors_names_.size(); ++i) {
+    const auto* metal_spatial_tensor =
+        dynamic_cast<const MetalSpatialTensor*>(operation_->src_[i]);
+    if (!metal_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
+    }
+    RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->src_tensors_names_[i],
+                                             *metal_spatial_tensor));
+  }
+  for (int i = 0; i < operation_->dst_tensors_names_.size(); ++i) {
+    const auto* metal_spatial_tensor =
+        dynamic_cast<const MetalSpatialTensor*>(operation_->dst_[i]);
+    if (!metal_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
+    }
+    RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->dst_tensors_names_[i],
+                                             *metal_spatial_tensor));
+  }
+  RETURN_IF_ERROR(operation_->BindArguments(&metal_args_));
+  operation_->grid_size_ = operation_->GetGridSize();
+  operation_->work_groups_count_ = GetWorkGroupsCount(
+      operation_->grid_dimension_, operation_->grid_size_,
+      operation_->work_group_size_, operation_->work_group_launch_order_);
+  return absl::OkStatus();
+}
+
 void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   // The dispatch call is intended to be skipped.
   if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
@@ -145,8 +269,7 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
 
   [encoder setComputePipelineState:program_];
 
-  int bindIndex = 0;
-  metal_args_.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, 0);
 
   MTLSize groupsCount =
       MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
@@ -155,6 +278,19 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
 }
 
+void ComputeTask::EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder) {
+  [encoder setComputePipelineState:program_];
+  metal_args_.Encode(encoder, 0);
+  MTLSize groupsCount, groupsSize;
+  groupsCount.width = operation_->work_groups_count_.x;
+  groupsCount.height = operation_->work_groups_count_.y;
+  groupsCount.depth = operation_->work_groups_count_.z;
+  groupsSize.width = operation_->work_group_size_.x;
+  groupsSize.height = operation_->work_group_size_.y;
+  groupsSize.depth = operation_->work_group_size_.z;
+  [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
+}
+
 void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
   auto status = metal_args_.SetObjectRef(task_desc_->src_tensors_names[index],
                                          tensor);
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.h b/tensorflow/lite/delegates/gpu/metal/compute_task.h
index cc31edea26f..c8912892662 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.h
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/metal/common.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
@@ -49,25 +50,35 @@ class ComputeTask {
 
   void Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc);
 
+  void Init(std::unique_ptr<GPUOperation>&& operation);
+
   ComputeTaskDescriptor& GetTaskDesc() { return *task_desc_; }
   const ComputeTaskDescriptor& GetTaskDesc() const { return *task_desc_; }
 
   /// Returns empty string or error if shader can't be compiled.
   absl::Status Compile(CalculationsPrecision precision, MetalDevice* device);
 
+  absl::Status CompileOp(MetalDevice* device);
+
   /// Updates parameters for inputs/outputs/intermediate tensors
   absl::Status UpdateParams(const GpuInfo& gpu_info,
                             const std::vector<BHWC>& src_shapes,
                             const std::vector<BHWC>& dst_shapes);
 
+  // Should be called after changes to inputs/outputs.
+  absl::Status UpdateOpParams();
+
   void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
 
+  void EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder);
+
   void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
   void SetDstTensor(const MetalSpatialTensor& tensor, int index);
 
  private:
   std::unique_ptr<ComputeTaskDescriptor> task_desc_;
+  std::unique_ptr<GPUOperation> operation_ = nullptr;
   id<MTLComputePipelineState> program_;
   MetalArguments metal_args_;
 
   uint3 groups_size_;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
index 2c81fdfbf6a..0ac07d2b202 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
@@ -64,6 +64,7 @@ objc_library(
     deps = [
        ":add",
        ":test_util",
+        "//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
     ],
 )
 
@@ -812,6 +813,8 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:tensor",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
+        "//tensorflow/lite/delegates/gpu/common/task:testing_util",
         "//tensorflow/lite/delegates/gpu/metal:common",
         "//tensorflow/lite/delegates/gpu/metal:inference_context",
         "//tensorflow/lite/delegates/gpu/metal:metal_device",
@@ -897,6 +900,7 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
         "//tensorflow/lite/delegates/gpu/metal:common",
         "//tensorflow/lite/delegates/gpu/metal:inference_context",
         "//tensorflow/lite/kernels/internal:quantization_util",
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
index e935a10eb56..c7ac3bd94e9 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
+#include "tensorflow/lite/delegates/gpu/common/tasks/add_test_util.h"
 
 using ::tflite::gpu::ElementwiseAttributes;
 using ::tflite::gpu::BHWC;
@@ -123,4 +124,22 @@ using ::tflite::gpu::metal::SingleOpModel;
   XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
 
+- (void)testAddTwoEqualTensors {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddTwoEqualTensorsTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
+- (void)testAddFirstTensorHasMoreChannelsThanSecond {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddFirstTensorHasMoreChannelsThanSecondTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
+- (void)testAddFirstTensorHasLessChannelsThanSecond {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddFirstTensorHasLessChannelsThanSecondTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
 @end
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
index 5fe993b55c0..d629ff40038 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
@@ -184,6 +184,72 @@ MetalExecutionEnvironment::GetSupportedStoragesWithHWZeroClampSupport() const {
   return {};
 }
 
+absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
+    const std::vector<TensorFloat32>& src_cpu,
+    std::unique_ptr<GPUOperation>&& operation,
+    const std::vector<BHWC>& dst_sizes,
+    const std::vector<TensorFloat32*>& dst_cpu) {
+  const OperationDef op_def = operation->GetDefinition();
+  std::vector<MetalSpatialTensor> src(src_cpu.size());
+  for (int i = 0; i < src_cpu.size(); ++i) {
+    auto src_shape = src_cpu[i].shape;
+    if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
+      return absl::InvalidArgumentError(
+          "Layout doesn't have Batch dimension, but shape.b != 1");
+    }
+    RETURN_IF_ERROR(CreateTensor(device_.device(), src_shape,
+                                 op_def.src_tensors[i], &src[i]));
+    RETURN_IF_ERROR(src[i].WriteData(src_cpu[i]));
+    operation->SetSrc(&src[i], i);
+  }
+
+  std::vector<MetalSpatialTensor> dst(dst_cpu.size());
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    auto dst_shape = dst_sizes[i];
+    if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
+      return absl::InvalidArgumentError(
+          "Layout doesn't have Batch dimension, but shape.b != 1");
+    }
+    RETURN_IF_ERROR(CreateTensor(device_.device(), dst_shape,
+                                 op_def.dst_tensors[i], &dst[i]));
+    operation->SetDst(&dst[i], i);
+  }
+
+  std::vector<BHWC> src_shapes;
+  std::vector<BHWC> dst_shapes;
+  std::vector<ValueId> src_ids;
+  std::vector<ValueId> dst_ids;
+  for (int i = 0; i < src_cpu.size(); ++i) {
+    src_ids.push_back(i);
+    src_shapes.push_back(src_cpu[i].shape);
+  }
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    dst_ids.push_back(src_cpu.size() + i);
+    dst_shapes.push_back(dst_sizes[i]);
+  }
+
+  ComputeTask gpu_task;
+  gpu_task.Init(std::move(operation));
+  RETURN_IF_ERROR(gpu_task.CompileOp(&device_));
+  RETURN_IF_ERROR(gpu_task.UpdateOpParams());
+
+  id<MTLCommandQueue> command_queue = [device_.device() newCommandQueue];
+  id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
+  id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+  gpu_task.EncodeOpWithEncoder(encoder);
+  [encoder endEncoding];
+  [command_buffer commit];
+  [command_buffer waitUntilCompleted];
+
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    dst_cpu[i]->shape = dst_sizes[i];
+    dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
+    RETURN_IF_ERROR(dst[i].ReadData(dst_cpu[i]));
+  }
+
+  return absl::OkStatus();
+}
+
 absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
     const std::vector<TensorFloat32>& src_cpu,
     std::unique_ptr<ComputeTaskDescriptor>&& operation,
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
index d9f46a4762e..e8ce371f178 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
@@ -24,6 +24,8 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/model.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
+#include "tensorflow/lite/delegates/gpu/common/task/testing_util.h"
 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
@@ -61,20 +63,24 @@ class SingleOpModel {
 absl::Status CompareVectors(const std::vector<float>& reference,
                             const std::vector<float>& output, float max_error);
 
-class MetalExecutionEnvironment {
+class MetalExecutionEnvironment : public TestExecutionEnvironment {
  public:
   MetalExecutionEnvironment() = default;
   ~MetalExecutionEnvironment() = default;
 
-  std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
-  std::vector<TensorStorageType> GetSupportedStorages() const;
-  // returns storage types that support zero clamping when reading OOB in HW
-  // (Height/Width) dimensions.
+  std::vector<CalculationsPrecision> GetSupportedPrecisions() const override;
+  std::vector<TensorStorageType> GetSupportedStorages() const override;
   std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport()
-      const;
+      const override;
 
   const GpuInfo& GetGpuInfo() const { return device_.GetInfo(); }
 
+  absl::Status ExecuteGPUOperation(
+      const std::vector<TensorFloat32>& src_cpu,
+      std::unique_ptr<GPUOperation>&& operation,
+      const std::vector<BHWC>& dst_sizes,
+      const std::vector<TensorFloat32*>& dst_cpu) override;
+
   absl::Status ExecuteGPUOperation(
       const std::vector<TensorFloat32>& src_cpu,
       std::unique_ptr<ComputeTaskDescriptor>&& operation,
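
Note on the work-group math: GetWorkGroupsCount ceil-divides each grid axis by the work-group size, then permutes the per-axis counts through work_group_launch_order before they reach dispatchThreadgroups. A minimal standalone C++ sketch of the same computation (not part of the patch; the grid and group sizes below are hypothetical):

#include <array>
#include <cstdio>

// Same rounding as tflite::gpu::DivideRoundUp.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  // Hypothetical 3D grid of 70x70x8 threads, 8x8x1 work groups,
  // launch order (2, 0, 1).
  const std::array<int, 3> grid_size = {70, 70, 8};
  const std::array<int, 3> work_group_size = {8, 8, 1};
  const std::array<int, 3> launch_order = {2, 0, 1};

  // Per-axis counts before reordering: ceil(70/8) = 9, ceil(70/8) = 9,
  // ceil(8/1) = 8.
  std::array<int, 3> wgs;
  for (int i = 0; i < 3; ++i) {
    wgs[i] = DivideRoundUp(grid_size[i], work_group_size[i]);
  }

  // The launch order decides which per-axis count lands in x/y/z:
  // (wgs[2], wgs[0], wgs[1]) = (8, 9, 9).
  std::array<int, 3> counts;
  for (int i = 0; i < 3; ++i) {
    counts[i] = wgs[launch_order[i]];
  }
  std::printf("work_groups_count = (%d, %d, %d)\n", counts[0], counts[1],
              counts[2]);
  return 0;
}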
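The NSDictionary of macros in CompileOp plays the role that preprocessor defines play in the OpenCL backend: it rewrites the backend-neutral FLT*/ACCUM_FLT*/TO_ACCUM*_TYPE tokens in GPUOperation::code_ into concrete MSL types selected by CalculationsPrecision. F32 stores and accumulates in float; F32_F16 stores in half but accumulates in float, which is why only that mode gets non-empty TO_ACCUM*_TYPE casts; F16 uses half throughout. A compact C++ sketch of that mapping, reduced to the scalar and 4-wide variants (not part of the patch):

#include <iostream>
#include <map>
#include <string>

enum class CalculationsPrecision { F32, F32_F16, F16 };

std::map<std::string, std::string> PrecisionMacros(CalculationsPrecision p) {
  const std::string storage =
      (p == CalculationsPrecision::F32) ? "float" : "half";
  const std::string accum =
      (p == CalculationsPrecision::F16) ? "half" : "float";
  // The TO_ACCUM*_TYPE macros are explicit casts only in mixed precision;
  // otherwise they expand to nothing.
  const bool mixed = (p == CalculationsPrecision::F32_F16);
  return {
      {"FLT", storage},
      {"FLT4", storage + "4"},
      {"ACCUM_FLT", accum},
      {"ACCUM_FLT4", accum + "4"},
      {"TO_ACCUM_TYPE", mixed ? "float" : ""},
      {"TO_ACCUM4_TYPE", mixed ? "float4" : ""},
  };
}

int main() {
  auto macros = PrecisionMacros(CalculationsPrecision::F32_F16);
  std::cout << "FLT4=" << macros["FLT4"]
            << " ACCUM_FLT4=" << macros["ACCUM_FLT4"] << "\n";
  // Prints: FLT4=half4 ACCUM_FLT4=float4
}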
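The new ExecuteGPUOperation overload is what lets the backend-neutral tests under common/tasks (such as the add_test_util functions wired up in the BUILD and add_test.mm changes above) drive the Metal backend through TestExecutionEnvironment. A hedged sketch of such a test follows. Only the ExecuteGPUOperation signature comes from this patch; CreateAdd, its channel arguments, and the OperationDef/TensorDescriptor setup are assumptions to verify against common/tasks/add.h:

// Sketch only; assumes the usual delegate headers plus absl/memory, and that
// CreateAdd has the signature
//   GPUOperation CreateAdd(const OperationDef&, const std::vector<int>&, int)
// as in common/tasks/add.h.
absl::Status AddTwoTensorsSketch(
    tflite::gpu::metal::MetalExecutionEnvironment* env) {
  using namespace tflite::gpu;  // NOLINT

  TensorFloat32 src0, src1;
  src0.shape = BHWC(1, 2, 2, 1);
  src0.data = {0.0f, 1.0f, 2.0f, 3.0f};
  src1.shape = src0.shape;
  src1.data = {4.0f, 5.0f, 6.0f, 7.0f};

  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  const TensorDescriptor desc{DataType::FLOAT32, TensorStorageType::BUFFER,
                              Layout::HWC};
  op_def.src_tensors = {desc, desc};
  op_def.dst_tensors = {desc};

  // Any GPUOperation factory works the same way; CreateAdd is an assumption.
  auto op = absl::make_unique<GPUOperation>(
      CreateAdd(op_def, /*channels=*/{1, 1}, /*dst_channels=*/1));

  TensorFloat32 dst;
  RETURN_IF_ERROR(env->ExecuteGPUOperation(
      {src0, src1}, std::move(op), {BHWC(1, 2, 2, 1)}, {&dst}));
  // Expected element-wise sums: {4, 6, 8, 10}.
  return absl::OkStatus();
}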