diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
index 5bfbf97ad26..17f8f246261 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
@@ -45,7 +45,6 @@ kernel void ComputeFunction($1
     return;
   }
   FLT4 value = args.src_tensor.Read(X, Y, Z);
-  $2
   args.dst_tensor.Write(value, X, Y, Z);
 }
 )";
@@ -148,7 +147,6 @@ void ComputeTaskDescriptor::AssembleCode() {
       return std::make_pair(groups_size, groups_count);
     };
   }
-  shader_source = absl::Substitute(shader_source, "$0", "$1", "");
 }
 
 }  // namespace metal
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/concat.cc b/tensorflow/lite/delegates/gpu/metal/kernels/concat.cc
index 295f1313fcc..4ce5a227791 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/concat.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/concat.cc
@@ -84,23 +84,14 @@ std::string GetConcatChannelsCode(const OperationDef& op_def,
         c += "  for (int i = 0; i < " + t_name + ".Slices(); i += 2) {\n";
         c += "    FLT4 result0 = " + t_name + ".Read(" + coords + ", i);\n";
         c += "    FLT4 result1 = " + t_name + ".Read(" + coords + ", i + 1);\n";
-        c += "    uint3 gid = uint3(ugid.x, ugid.y, uint(S));\n";
-        c += "    $2\n";
-        c += "    FLT4 value = result0;\n";
-        c += "    args.dst_tensor.Write(value, " + coords + ", S);\n";
-        c += "    gid = uint3(ugid.x, ugid.y, uint(S + 1));\n";
-        c += "    $2\n";
-        c += "    value = result1;\n";
-        c += "    args.dst_tensor.Write(value, " + coords + ", S + 1);\n";
+        c += "    args.dst_tensor.Write(result0, " + coords + ", S);\n";
+        c += "    args.dst_tensor.Write(result1, " + coords + ", S + 1);\n";
         c += "    S += 2;\n";
         c += "  }\n";
       } else {
         c += "  for (int i = 0; i < " + t_name + ".Slices(); ++i) {\n";
         c += "    FLT4 result = " + t_name + ".Read(" + coords + ", i);\n";
-        c += "    uint3 gid = uint3(ugid.x, ugid.y, uint(S));\n";
-        c += "    $2\n";
-        c += "    FLT4 value = result;\n";
-        c += "    args.dst_tensor.Write(value, " + coords + ", S);\n";
+        c += "    args.dst_tensor.Write(result, " + coords + ", S);\n";
         c += "    S++;\n";
         c += "  }\n";
       }
@@ -125,11 +116,6 @@ std::string GetConcatChannelsCode(const OperationDef& op_def,
           out_channel++;
           if (out_channel == 4) {
             out_channel = 0;
-            c += "  {\n";
-            c += "    uint3 gid = uint3(ugid.x, ugid.y, uint(" +
-                 std::to_string(z) + "));\n";
-            c += "    $2\n";
-            c += "  }\n";
             c += "  args.dst_tensor.Write(value, " + coords + ", " +
                  std::to_string(z) + ");\n";
             z++;
@@ -139,11 +125,6 @@ std::string GetConcatChannelsCode(const OperationDef& op_def,
       }
     }
     if (out_channel != 0) {
-      c += "  {\n";
-      c += "    uint3 gid = uint3(ugid.x, ugid.y, uint(" + std::to_string(z) +
-           "));\n";
-      c += "    $2\n";
-      c += "  }\n";
       c += "  args.dst_tensor.Write(value, " + coords + ", " +
            std::to_string(z) + ");\n";
     }
@@ -273,10 +254,6 @@ std::string GetConcatKernelCode(const OperationDef& op_def,
     c += "  } \n";
     c += "  coord -= " + field + ";\n";
   }
-  c += "  {\n";
-  c += "    uint3 gid = uint3(ugid.x, ugid.y, ugid.z);\n";
-  c += "    $2\n";
-  c += "  }\n";
   c += "  args.dst_tensor.Write(value, " + dst_coord + ");\n";
   c += "}\n";
   return c;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
index 9706a6cda8e..be5a9e254bb 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
@@ -549,9 +549,6 @@ kernel void ComputeFunction(
         c += "      FLT4 value = FLT4(r" + s_zyx + ");\n";
         c += "      int linear_index = offset_" + s_yx +
              " + args.dst_tensor.SliceStride() * " + s_z + ";\n";
-        c += "      uint3 gid = uint3(X + " + s_x + ", Y + " + s_y + ", Z + " +
-             s_z + ");\n";
-        c += "      $2\n";
         c += "      args.dst_tensor.Linking(value, X + " + s_x + ", Y + " +
              s_y + ", Z + " + s_z + ");\n";
         c += "      args.dst_tensor.WriteLinear(value, linear_index);\n";
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
index 22524b14bc0..7e7f9700544 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
@@ -160,26 +160,18 @@ kernel void ComputeFunction(
 
   if (y0_in && x0_in) {
     FLT4 value = FLT4(r0);
-    uint3 gid = uint3(gid_x, gid_y, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x, gid_y, gid_z);
   }
   if (y1_in && x0_in) {
     FLT4 value = FLT4(l0);
-    uint3 gid = uint3(gid_x, gid_y + 1, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x, gid_y + 1, gid_z);
   }
   if (y0_in && x1_in) {
     FLT4 value = FLT4(t0);
-    uint3 gid = uint3(gid_x + 1, gid_y, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x + 1, gid_y, gid_z);
   }
   if (y1_in && x1_in) {
     FLT4 value = FLT4(b0);
-    uint3 gid = uint3(gid_x + 1, gid_y + 1, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x + 1, gid_y + 1, gid_z);
   }
 }
@@ -330,14 +322,10 @@ kernel void ComputeFunction(
 
   if (y0_in) {
     FLT4 value = FLT4(r0);
-    uint3 gid = uint3(gid_x, gid_y, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x, gid_y, gid_z);
   }
   if (y1_in) {
     FLT4 value = FLT4(l0);
-    uint3 gid = uint3(gid_x, gid_y + 1, gid_z);
-    $2
     args.dst_tensor.Write(value, gid_x, gid_y + 1, gid_z);
   }
 }
@@ -454,9 +442,7 @@ kernel void ComputeFunction(
     }
   }
   FLT4 res = FLT4(sum0) + args.biases.Read(dst_z);
-  FLT4 value = res;
-  $2
-  args.dst_tensor.Write(value, dst_x, dst_y, dst_z);
+  args.dst_tensor.Write(res, dst_x, dst_y, dst_z);
 }
 )";
   ComputeTaskDescriptor desc(definition);
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
index ff9b0d3d20b..04fa7effa0c 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
@@ -104,8 +104,6 @@ std::string GetFullyConnectedCode(const GpuInfo& gpu_info, int src_channels,
   if (tid.y == 0 && tid.x % 4 == 0 && dst_s < args.dst_tensor.Slices()) {
     FLT4 value = FLT4(temp[tid.x][0], temp[tid.x + 1][0], temp[tid.x + 2][0], temp[tid.x + 3][0]) +
       args.bias.Read(dst_s);
-    uint3 gid = uint3(0u, 0u, uint(dst_s));
-    $$2
     args.dst_tensor.Write(value, 0, 0, dst_s);
   }
 }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.cc b/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.cc
index cb5dc973792..1b46b05f50c 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.cc
@@ -67,7 +67,6 @@ kernel void ComputeFunction(
   value.z = t_index == indexes.z ? src_color.z : 0.0;
   value.w = t_index == indexes.w ? src_color.w : 0.0;
 
-  $2
   args.dst_tensor.Write(value, X, Y, gid.z);
 }
   )";
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/mean.cc b/tensorflow/lite/delegates/gpu/metal/kernels/mean.cc
index 9a92a94593c..86d4ee7565f 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/mean.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/mean.cc
@@ -86,7 +86,6 @@ std::string GetMeanCode(const int3& work_group_size) {
   }
   c += "  FLT4 value = FLT4(sum * args.inv_multiplier_y);\n";
   c += R"(
-  $2
   args.dst_tensor.Write(value, 0, 0, gid.z);
 }
 )";
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/padding.cc b/tensorflow/lite/delegates/gpu/metal/kernels/padding.cc
index 1edff8be184..8ed67343b3d 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/padding.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/padding.cc
@@ -119,7 +119,6 @@ std::string GetPaddingCode(const PadAttributes& attr) {
     }
     code += "  }\n";
   }
-  code += "  $2\n";
   code += "  args.dst_tensor.Write(value, gid.x, gid.y, gid.z);\n";
   code += "}\n";
   return code;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc b/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc
index 65632482538..7cb42f0a3b3 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc
@@ -59,9 +59,7 @@ kernel void ComputeFunction(
       maximum = max(maximum, src_color);
     }
   }
-  FLT4 value = maximum;
-  $2
-  args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
+  args.dst_tensor.Write(maximum, gid.x, gid.y, gid.z);
 }
   )";
   return shader_source;
@@ -110,9 +108,7 @@ kernel void ComputeFunction(
       index_counter++;
     }
   }
-  args.dst_tensor.GetAddress(linear_index, gid.x, gid.y, gid.z);
   FLT4 value = static_cast<FLT4>(indexes);
-  $2
   args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
 }
   )";
@@ -147,11 +143,9 @@ kernel void ComputeFunction(
       sum += src_color;
     }
   }
-  args.dst_tensor.GetAddress(linear_index, gid.x, gid.y, gid.z);
   // If window_size==0, window covered nothing. This situation is a sign of
   // incorrectly constructed operation. NaNs are expected as output.
   FLT4 value = FLT4(sum / window_size);
-  $2
   args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
 }
 )";
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc
index 722e4c6590a..39709e310ac 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc
@@ -62,8 +62,6 @@ kernel void ComputeFunction(
       value[i] = args.src_tensor.Read(src_x, src_y, src_layer)[src_channel];
     }
   }
-
-  $2
   args.dst_tensor.Write(value, igid.x, igid.y, igid.z);
 })";
   return code;
@@ -91,7 +89,6 @@ kernel void ComputeFunction(
   int src_z = t0 - src_x * args.src_tensor.Slices();  // t0 % args.src_tensor.Slices();
 
   FLT4 value = args.src_tensor.Read(src_x, src_y, src_z);
-  $2
   args.dst_tensor.Write(value, X, Y, Z);
 })";
   return code;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/resize.cc b/tensorflow/lite/delegates/gpu/metal/kernels/resize.cc
index 89570667494..d3cb1405daf 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/resize.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/resize.cc
@@ -65,7 +65,6 @@ kernel void ComputeFunction(
   // bilinear interpolation
   FLT4 value = mix(mix(tex11, tex21, static_cast<FLT>(t.x)),
                    mix(tex12, tex22, static_cast<FLT>(t.x)), static_cast<FLT>(t.y));
-  $2
   args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
 }
 )";
@@ -106,8 +105,6 @@ kernel void ComputeFunction(
   c += "  coord.y = min(coord.y, args.src_tensor.Height() - 1);\n";
   c += R"(
   FLT4 value = args.src_tensor.Read(coord.x, coord.y, gid.z);
-  args.dst_tensor.GetAddress(linear_index, gid.x, gid.y, gid.z);
-  $2
   args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
 }
 )";
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/slice.cc b/tensorflow/lite/delegates/gpu/metal/kernels/slice.cc
index be226fbcb04..a55fe06550e 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/slice.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/slice.cc
@@ -134,9 +134,7 @@ kernel void ComputeFunction($1
       c += "  }\n";
     }
   }
-  c += "  FLT4 value = result;\n";
-  c += "  $2\n";
-  c += "  args.dst_tensor.Write(value, X, Y, Z);\n";
+  c += "  args.dst_tensor.Write(result, X, Y, Z);\n";
   c += "}\n";
   return c;
 }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc
index 6176229753f..bd1fc552f53 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc
@@ -117,8 +117,6 @@ kernel void ComputeFunction($1
   if (dst_s < args.src_tensor.Slices()) {
     float4 src = float4(args.src_tensor.Read(0, 0, dst_s)) - float4(maximum);
     FLT4 value = FLT4(exp(src) * sum);
-    uint3 gid = uint3(0, 0, dst_s);
-    $2
     args.dst_tensor.Write(value, 0, 0, dst_s);
   }
 })";
@@ -161,7 +159,6 @@ kernel void ComputeFunction(
   for (int d = 0; d < args.dst_tensor.Slices(); ++d) {
     float4 src = float4(args.src_tensor.Read(gid.x, gid.y, d)) - float4(maximum);
     FLT4 value = FLT4(exp(src) / sum);
-    $2
     args.dst_tensor.Write(value, gid.x, gid.y, d);
   }
 }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc
index f9eca52f1e7..ce4252d2ff7 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc
@@ -50,7 +50,6 @@ kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) {
     uint src_c = dst_c % args.src_tensor.Channels();
     value[i] = args.src_tensor.Read(src_x, src_y, src_c / 4)[src_c % 4];
   }
-  $2
   args.dst_tensor.Write(value, gid.x, gid.y, gid.z);
 })";
 
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
index adb454b780e..b413cb24bfa 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
@@ -103,8 +103,6 @@ std::string GetDeconvolution(const ConvolutionTransposedAttributes& attr) {
 
       for (short l = 0; l < dst_depth; ++l) {
         FLT4 value = FLT4(out[l * 4], out[l * 4 + 1], out[l * 4 + 2], out[l * 4 + 3]) + args.biases.Read(l);
-        uint3 gid = uint3(ugid.x, ugid.y, uint(l));
-        $$2
         args.dst_tensor.Write(value, ugid.x, ugid.y, l);
       }
     }
@@ -225,8 +223,6 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
 
       for (short l = 0; l < dst_depth; ++l) {
         FLT4 value = FLT4(out[l * 4], out[l * 4 + 1], out[l * 4 + 2], out[l * 4 + 3]) + args.biases.Read(l);
-        uint3 gid = uint3(ugid.x, ugid.y, uint(l));
-        $$2
         args.dst_tensor.Write(value, ugid.x, ugid.y, l);
       }
     }
@@ -400,8 +396,6 @@ kernel void ComputeFunction(
           c += "  if (" + x_check + " && " + y_check + ") {\n";
           c += "    FLT4 value = FLT4(" + R + ") + bias_val;\n";
           std::string dst_coords = dst_x + ", " + dst_y + ", Z";
-          c += "    uint3 gid = uint3(" + dst_coords + ");\n";
-          c += "    $2\n";
           c += "    args.dst_tensor.Write(value, " + dst_coords + ");\n";
           c += "  }\n";
         }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
index 022565af21b..c8ec883b84d 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
@@ -307,28 +307,20 @@ kernel void ComputeFunction($1
     FLT4 t1 = I[y][3] + I[y][4];
     if (tile_x < args.dst_tensor.Width()) {
       FLT4 value = I[y][0] + t0 + t1 + bias_val;
-      uint3 gid = uint3(tile_x, tile_y + y, global_ids.z);
-      $2
       args.dst_tensor.Write(value, tile_x, tile_y + y, global_ids.z);
     }
     FLT4 t2 = I[y][1] - I[y][2];
     FLT4 t3 = I[y][3] - I[y][4];
     if (tile_x + 1 < args.dst_tensor.Width()) {
       FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
-      uint3 gid = uint3(tile_x + 1, tile_y + y, global_ids.z);
-      $2
       args.dst_tensor.Write(value, tile_x + 1, tile_y + y, global_ids.z);
     }
     if (tile_x + 2 < args.dst_tensor.Width()) {
       FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
-      uint3 gid = uint3(tile_x + 2, tile_y + y, global_ids.z);
-      $2
       args.dst_tensor.Write(value, tile_x + 2, tile_y + y, global_ids.z);
     }
     if (tile_x + 3 < args.dst_tensor.Width()) {
       FLT4 value = t2 * At[19] + t3 * At[21] + I[y][5] + bias_val;
-      uint3 gid = uint3(tile_x + 3, tile_y + y, global_ids.z);
-      $2
       args.dst_tensor.Write(value, tile_x + 3, tile_y + y, global_ids.z);
     }
   }
@@ -411,28 +403,20 @@ kernel void ComputeFunction($1
   FLT4 bias_val = args.biases.Read(DST_Z);
   if (tile_x < args.dst_tensor.Width()) {
     FLT4 value = I0 + t0 + t1 + bias_val;
-    uint3 gid = uint3(tile_x, tile_y, global_ids.z);
-    $2;
     args.dst_tensor.Write(value, tile_x, tile_y, global_ids.z);
   }
   FLT4 t2 = I1 - I2;
   FLT4 t3 = I3 - I4;
   if (tile_x + 1 < args.dst_tensor.Width()) {
     FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
-    uint3 gid = uint3(tile_x + 1, tile_y, global_ids.z);
-    $2;
     args.dst_tensor.Write(value, tile_x + 1, tile_y, global_ids.z);
   }
   if (tile_x + 2 < args.dst_tensor.Width()) {
     FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
-    uint3 gid = uint3(tile_x + 2, tile_y, global_ids.z);
-    $2;
     args.dst_tensor.Write(value, tile_x + 2, tile_y, global_ids.z);
   }
   if (tile_x + 3 < args.dst_tensor.Width()) {
     FLT4 value = t2 * At[19] + t3 * At[21] + I5 + bias_val;
-    uint3 gid = uint3(tile_x + 3, tile_y, global_ids.z);
-    $2;
     args.dst_tensor.Write(value, tile_x + 3, tile_y, global_ids.z);
   }
 }