Added operation_selector for Metal backend.

This makes it similar to the gpu/common/ structure.

PiperOrigin-RevId: 348573563
Change-Id: I993f124c4d3fac14c397495aaf131e6614c5317f
Raman Sarokin 2020-12-21 20:56:32 -08:00 committed by TensorFlower Gardener
parent dde0d4f92c
commit 60325944ed
9 changed files with 681 additions and 597 deletions
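
For orientation, the per-node flow in Compile() after this change is roughly the sketch below (condensed from the compiled_model.cc hunk in this commit; error handling and the remapping of subgraph operations into NodeDescriptors are elided):

// Sketch of the new per-node selection flow (condensed, not verbatim).
for (const auto& node : graph.nodes()) {
  auto inputs = graph.FindInputs(node->id);
  auto outputs = graph.FindOutputs(node->id);

  // One BUFFER/HWC tensor descriptor per input and output, at the requested precision.
  DataType data_type = DeduceDataTypeFromPrecision(precision);
  TensorDescriptor tensor_descriptor =
      TensorDescriptor{data_type, TensorStorageType::BUFFER, Layout::HWC};
  OperationDef op_def;
  op_def.precision = precision;
  for (int j = 0; j < inputs.size(); ++j) op_def.src_tensors.push_back(tensor_descriptor);
  for (int j = 0; j < outputs.size(); ++j) op_def.dst_tensors.push_back(tensor_descriptor);

  // Selection now lives in metal/selectors/operation_selector.h; ops it does not
  // handle fall through to SelectDefault() in metal/selectors/default_selector.h.
  GPUOperationsSubgraph gpu_subgraph;
  RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, inputs, outputs,
                                       *node, &gpu_subgraph));
  // ...each subgraph operation then becomes a NodeDescriptor in the compiled model.
}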


@@ -35,7 +35,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common:winograd_util",
"//tensorflow/lite/delegates/gpu/metal/kernels",
"//tensorflow/lite/delegates/gpu/metal/kernels:custom_registry",
"//tensorflow/lite/delegates/gpu/metal/selectors:operation_selector",
"//tensorflow/lite/delegates/gpu/metal/selectors:subgraph",
],
)


@@ -28,500 +28,12 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
#include "tensorflow/lite/delegates/gpu/metal/compiled_model.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/concat.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/custom_registry.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/mean.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/padding.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/prelu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/relu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/reshape.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/resize.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/slice.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/softmax.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/winograd.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/subgraph.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
std::unique_ptr<ComputeTaskDescriptor> SelectDepthWiseConv(
const OperationDef& op_def, const DepthwiseConvolution2DAttributes& attr) {
if (CheckDepthWiseConv3x3Stride1x1Support(attr)) {
auto gpu_op = DepthWiseConv3x3Stride1x1(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else if (CheckDepthWiseConv3x3Stride2Support(attr)) {
auto gpu_op = DepthWiseConv3x3Stride2(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = DepthWiseConvolution(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectConvolutionTransposed(
const OperationDef& op_def, const ConvolutionTransposedAttributes& attr,
const GpuInfo& gpu_info) {
if (CheckConvolutionTransposed4x4Support(attr)) {
auto gpu_op = ConvolutionTransposed4x4(op_def, attr, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = ConvolutionTransposed(op_def, attr, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectQuantizeAndDequantize(
const OperationDef& op_def, const QuantizeAndDequantizeAttributes& attr) {
auto gpu_op = QuantizeAndDequantize(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectPReLU(
const OperationDef& op_def, const BHWC& src_shape,
const PReLUAttributes& attr) {
auto alpha = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&attr.alpha);
if (alpha) {
auto gpu_op = PReLU(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
auto alpha3d = absl::get_if<Tensor<HWC, DataType::FLOAT32>>(&attr.alpha);
if (!alpha3d) {
return {};
}
if (alpha3d->shape.h != src_shape.h || alpha3d->shape.w != src_shape.w ||
alpha3d->shape.c != src_shape.c) {
return {};
}
auto gpu_op = PReLUFull(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectReshape(
const OperationDef& op_def, const BHWC& src_shape,
const ReshapeAttributes& attr) {
if (src_shape.c % 4 == 0 && attr.new_shape.c % 4 == 0) {
auto gpu_op = Reshapex4(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Reshape(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectSoftmax(const OperationDef& op_def,
const BHWC& src_shape,
const GpuInfo& gpu_info) {
if (src_shape.w == 1 && src_shape.h == 1) {
auto gpu_op = Softmax1x1(op_def, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Softmax(op_def);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectSpaceToDepth(
const OperationDef& op_def, const SpaceToDepthAttributes& attr) {
auto gpu_op = SpaceToDepth(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectWinograd4x4To36(
const OperationDef& op_def, const Winograd4x4To36Attributes& attr,
const GpuInfo& gpu_info) {
if (gpu_info.IsApple()) {
auto gpu_op = Winograd4x4To36(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Winograd4x4To36TileX6(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectWinograd36To4x4(
const OperationDef& op_def, const Winograd36To4x4Attributes& attr,
const GpuInfo& gpu_info) {
if (gpu_info.IsApple()) {
auto gpu_op = Winograd36To4x4(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Winograd36To4x4Tile4x1(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
bool IsRecommendedForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const GpuInfo& gpu_info,
const BHWC& dst_shape) {
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int total_tiles = tiles_x * tiles_y;
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
int min_depth = 16;
const int min_tiles = 32;
if (total_tiles >= min_tiles * 8) {
min_depth /= 4;
min_depth = std::max(min_depth, 8);
} else if (total_tiles >= min_tiles * 4) {
min_depth /= 2;
min_depth = std::max(min_depth, 8);
}
const bool recommended_channels =
src_depth >= min_depth && dst_depth >= min_depth;
const bool recommended_hw = total_tiles >= min_tiles;
return recommended_channels && recommended_hw;
}
absl::Status WinogradFromNode(const GpuInfo& gpu_info,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const OperationDef& op_def,
const BHWC& input_shape, const BHWC& output_shape,
const Convolution2DAttributes& attr,
GPUOperationsSubgraph* gpu_subgraph) {
if (!IsSuitableForWinograd4x4To6x6(attr)) {
return absl::UnimplementedError("No implementation for this case.");
}
if (!IsRecommendedForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
return absl::UnimplementedError("Not recommended for this case.");
}
const int tiles_x = DivideRoundUp(output_shape.w, 4);
const int tiles_y = DivideRoundUp(output_shape.h, 4);
const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor tensor_desc = op_def.src_tensors[0];
gpu_subgraph->new_tensors = {{shape_0, tensor_desc}, {shape_1, tensor_desc}};
gpu_subgraph->operations.clear();
gpu_subgraph->operations.resize(3);
OperationDef winograd_up_def;
winograd_up_def.precision = op_def.precision;
winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
winograd_up_def.dst_tensors.push_back(op_def.src_tensors[0]);
auto& winograd_up = gpu_subgraph->operations[0];
Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding;
winograd_up.operation =
SelectWinograd4x4To36(winograd_up_def, wino_up_attr, gpu_info);
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
winograd_up.output_ids = {-1};
OperationDef conv_def;
conv_def.precision = op_def.precision;
conv_def.src_tensors.push_back(op_def.src_tensors[0]);
conv_def.dst_tensors.push_back(op_def.src_tensors[0]);
auto& conv = gpu_subgraph->operations[1];
conv.input_ids = {-1};
conv.output_ids = {-2};
auto gpu_op = ConvolutionWino4x4To6x6(conv_def, shape_1, attr, gpu_info);
conv.operation = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
OperationDef winograd_down_def;
winograd_down_def.precision = op_def.precision;
winograd_down_def.src_tensors.push_back(op_def.src_tensors[0]);
winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]);
auto& winograd_down = gpu_subgraph->operations[2];
winograd_down.input_ids = {-2};
winograd_down.output_ids = {static_cast<int>(outputs[0]->id)};
Winograd36To4x4Attributes wino_down_attr;
wino_down_attr.output_shape = outputs[0]->tensor.shape;
wino_down_attr.biases = attr.bias;
winograd_down.operation =
SelectWinograd36To4x4(winograd_down_def, wino_down_attr, gpu_info);
return absl::OkStatus();
}
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const Node& node,
GPUOperationsSubgraph* gpu_subgraph) {
std::unique_ptr<ComputeTaskDescriptor>* task =
InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
auto op_type = OperationTypeFromString(node.operation.type);
switch (op_type) {
case OperationType::ADD: {
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else { // more than 2 inputs
auto gpu_op = Add(op_def);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
}
case OperationType::CONCAT: {
std::vector<BHWC> input_shapes;
for (auto& input : inputs) {
input_shapes.push_back(input->tensor.shape);
}
auto gpu_op = Concat(
op_def, absl::any_cast<ConcatAttributes>(node.operation.attributes),
input_shapes);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::CONVOLUTION_2D: {
if (inputs.size() != 1) {
return absl::UnimplementedError(
"Convolution does not support more than 1 runtime tensor");
}
auto attr =
absl::any_cast<Convolution2DAttributes>(node.operation.attributes);
auto input_shape = inputs[0]->tensor.shape;
auto output_shape = outputs[0]->tensor.shape;
if (WinogradFromNode(gpu_info, inputs, outputs, op_def, input_shape,
output_shape, attr, gpu_subgraph)
.ok()) {
return absl::OkStatus();
} else {
auto gpu_op = ConvolutionGeneric(op_def, output_shape, attr, gpu_info);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
}
case OperationType::CONVOLUTION_TRANSPOSED:
if (inputs.size() != 1) {
return absl::UnimplementedError(
"Convolution Transposed does not support more than 1 runtime "
"tensor");
}
*task = SelectConvolutionTransposed(
op_def,
absl::any_cast<ConvolutionTransposedAttributes>(
node.operation.attributes),
gpu_info);
break;
case OperationType::DEPTHWISE_CONVOLUTION:
if (inputs.size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime "
"tensor");
}
*task = SelectDepthWiseConv(
op_def, absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes));
break;
case OperationType::FULLY_CONNECTED: {
auto gpu_op = FullyConnected(
op_def,
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes),
gpu_info);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MAX_UNPOOLING_2D: {
auto gpu_op = MaxUnpooling(
op_def,
absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean supports HW axis only in Metal");
}
auto gpu_op = Mean(op_def, attr);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MUL:
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
case OperationType::PAD: {
auto attr = absl::any_cast<PadAttributes>(node.operation.attributes);
if (attr.appended.b != 0 || attr.prepended.b != 0) {
return absl::UnimplementedError("Padding for BATCH is not supported.");
}
auto gpu_op = Padding(op_def, attr);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::POOLING_2D: {
auto attr =
absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
auto pooling_op_def = op_def;
pooling_op_def.dst_tensors = {op_def.dst_tensors[0]};
auto gpu_op = Pooling(op_def, attr, false);
gpu_subgraph->operations[0].operation =
absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
gpu_subgraph->operations[0].input_ids = {static_cast<int>(inputs[0]->id)};
gpu_subgraph->operations[0].output_ids = {
static_cast<int>(outputs[0]->id)};
if (attr.type == PoolingType::MAX && attr.output_indices) {
gpu_subgraph->operations.push_back({});
auto gpu_ind_op = Pooling(op_def, attr, true);
gpu_subgraph->operations[1].operation =
absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_ind_op));
gpu_subgraph->operations[1].input_ids = {
static_cast<int>(inputs[0]->id)};
gpu_subgraph->operations[1].output_ids = {
static_cast<int>(outputs[1]->id)};
}
break;
}
case OperationType::PRELU: {
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectPReLU(
op_def, src_shape,
absl::any_cast<PReLUAttributes>(node.operation.attributes));
break;
}
case OperationType::RELU: {
auto gpu_op = ReLU(
op_def, absl::any_cast<ReLUAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::QUANTIZE_AND_DEQUANTIZE:
*task = SelectQuantizeAndDequantize(
op_def, absl::any_cast<QuantizeAndDequantizeAttributes>(
node.operation.attributes));
break;
case OperationType::RESHAPE: {
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectReshape(
op_def, src_shape,
absl::any_cast<ReshapeAttributes>(node.operation.attributes));
break;
}
case OperationType::RESIZE: {
auto gpu_op =
Resize(op_def,
absl::any_cast<Resize2DAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::SLICE: {
auto gpu_op = Slice(
op_def, absl::any_cast<SliceAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::SOFTMAX: {
auto attr = absl::any_cast<SoftmaxAttributes>(node.operation.attributes);
if (attr.axis != Axis::CHANNELS) {
return absl::UnimplementedError(
"Softmax supports only CHANNELS dimension");
}
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectSoftmax(op_def, src_shape, gpu_info);
break;
}
case OperationType::SPACE_TO_DEPTH:
*task = SelectSpaceToDepth(op_def, absl::any_cast<SpaceToDepthAttributes>(
node.operation.attributes));
break;
case OperationType::ABS:
case OperationType::COPY:
case OperationType::COS:
case OperationType::ELU:
case OperationType::EXP:
case OperationType::HARD_SWISH:
case OperationType::LOG:
case OperationType::NEG:
case OperationType::RSQRT:
case OperationType::SIGMOID:
case OperationType::SIN:
case OperationType::SQRT:
case OperationType::SQUARE:
case OperationType::TANH: {
auto gpu_op = ElementwiseWithOneInput(op_def, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::DIV:
case OperationType::MAXIMUM:
case OperationType::MINIMUM:
case OperationType::POW:
case OperationType::SQUARED_DIFF:
case OperationType::SUB: {
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
} break;
case OperationType::BATCH_NORMALIZATION:
case OperationType::BATCH_TO_SPACE:
case OperationType::BATCHED_MATMUL:
case OperationType::CONST:
case OperationType::LSTM:
// TODO(b/162763635): implement MeanStddevNormalization for Metal.
case OperationType::MEAN_STDDEV_NORMALIZATION:
case OperationType::REDUCE_MAXIMUM:
case OperationType::REDUCE_MINIMUM:
case OperationType::REDUCE_PRODUCT:
case OperationType::REDUCE_SUM:
// comparison operations
case OperationType::LESS:
case OperationType::LESS_EQUAL:
case OperationType::EQUAL:
case OperationType::NOT_EQUAL:
case OperationType::GREATER:
case OperationType::GREATER_EQUAL:
case OperationType::SPACE_TO_BATCH:
case OperationType::TRANSPOSE:
case OperationType::UNKNOWN:
return absl::UnimplementedError("Unsupported op: " + node.operation.type);
}
return absl::OkStatus();
}
} // namespace
absl::Status Compile(const GraphFloat32& graph, const GpuInfo& gpu_info,
CalculationsPrecision precision,
@@ -537,81 +49,54 @@ absl::Status Compile(const GraphFloat32& graph, const GpuInfo& gpu_info,
}
int node_linear_id = 0;
for (const auto& node : graph.nodes()) {
std::vector<ValueId> inputs;
for (auto& input : graph.FindInputs(node->id)) {
inputs.push_back(static_cast<ValueId>(input->id));
auto inputs = graph.FindInputs(node->id);
auto outputs = graph.FindOutputs(node->id);
DataType data_type = DeduceDataTypeFromPrecision(precision);
TensorDescriptor tensor_descriptor =
TensorDescriptor{data_type, TensorStorageType::BUFFER, Layout::HWC};
OperationDef op_def;
op_def.precision = precision;
for (int j = 0; j < inputs.size(); ++j) {
op_def.src_tensors.push_back(tensor_descriptor);
}
std::vector<ValueId> outputs;
for (auto& output : graph.FindOutputs(node->id)) {
outputs.push_back(static_cast<ValueId>(output->id));
for (int j = 0; j < outputs.size(); ++j) {
op_def.dst_tensors.push_back(tensor_descriptor);
}
std::vector<NodeDescriptor> node_descs;
std::vector<ComputeTaskDescriptorPtr> custom_tasks;
auto custom_status = RegisterCustomOps(graph, node, inputs, outputs,
precision, &custom_tasks);
if (!custom_status.ok()) {
auto inputs = graph.FindInputs(node->id);
auto outputs = graph.FindOutputs(node->id);
DataType data_type = DeduceDataTypeFromPrecision(precision);
TensorDescriptor tensor_descriptor =
TensorDescriptor{data_type, TensorStorageType::BUFFER, Layout::HWC};
OperationDef op_def;
op_def.precision = precision;
for (int j = 0; j < inputs.size(); ++j) {
op_def.src_tensors.push_back(tensor_descriptor);
}
for (int j = 0; j < outputs.size(); ++j) {
op_def.dst_tensors.push_back(tensor_descriptor);
}
GPUOperationsSubgraph gpu_subgraph;
RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, inputs, outputs,
*node, &gpu_subgraph));
std::map<int, ValueId> mapping_to_global_ids;
for (int j = 0; j < gpu_subgraph.new_tensors.size(); ++j) {
const auto& t = gpu_subgraph.new_tensors[j];
last_value_id++;
compiled_model->tensor_shapes[last_value_id] = t.first;
mapping_to_global_ids[j] = last_value_id;
}
for (auto& gpu_op : gpu_subgraph.operations) {
NodeDescriptor metal_node;
metal_node.task = std::move(gpu_op.operation);
metal_node.src_tensors_ids.resize(gpu_op.input_ids.size());
for (int j = 0; j < gpu_op.input_ids.size(); ++j) {
int id = gpu_op.input_ids[j];
if (id >= 0) {
metal_node.src_tensors_ids[j] = id;
} else {
metal_node.src_tensors_ids[j] = mapping_to_global_ids[-(id + 1)];
}
GPUOperationsSubgraph gpu_subgraph;
RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, inputs, outputs,
*node, &gpu_subgraph));
std::map<int, ValueId> mapping_to_global_ids;
for (int j = 0; j < gpu_subgraph.new_tensors.size(); ++j) {
const auto& t = gpu_subgraph.new_tensors[j];
last_value_id++;
compiled_model->tensor_shapes[last_value_id] = t.first;
mapping_to_global_ids[j] = last_value_id;
}
for (auto& gpu_op : gpu_subgraph.operations) {
NodeDescriptor metal_node;
metal_node.task = std::move(gpu_op.operation);
metal_node.src_tensors_ids.resize(gpu_op.input_ids.size());
for (int j = 0; j < gpu_op.input_ids.size(); ++j) {
int id = gpu_op.input_ids[j];
if (id >= 0) {
metal_node.src_tensors_ids[j] = id;
} else {
metal_node.src_tensors_ids[j] = mapping_to_global_ids[-(id + 1)];
}
metal_node.dst_tensors_ids.resize(gpu_op.output_ids.size());
for (int j = 0; j < gpu_op.output_ids.size(); ++j) {
int id = gpu_op.output_ids[j];
if (id >= 0) {
metal_node.dst_tensors_ids[j] = id;
} else {
metal_node.dst_tensors_ids[j] = mapping_to_global_ids[-(id + 1)];
}
}
metal_node.dst_tensors_ids.resize(gpu_op.output_ids.size());
for (int j = 0; j < gpu_op.output_ids.size(); ++j) {
int id = gpu_op.output_ids[j];
if (id >= 0) {
metal_node.dst_tensors_ids[j] = id;
} else {
metal_node.dst_tensors_ids[j] = mapping_to_global_ids[-(id + 1)];
}
metal_node.description =
node->operation.type + " " + std::to_string(node->id);
node_descs.push_back(std::move(metal_node));
}
} else {
for (auto& custom_task : custom_tasks) {
NodeDescriptor node_desc;
node_desc.task = custom_task;
node_desc.description =
node->operation.type + "_" + std::to_string(node->id);
node_desc.src_tensors_ids = inputs;
node_desc.dst_tensors_ids = outputs;
node_descs.push_back(node_desc);
}
}
for (auto& node_desc : node_descs) {
node_desc.id = node_linear_id++;
compiled_model->nodes.push_back(node_desc);
metal_node.description =
node->operation.type + " " + std::to_string(node->id);
metal_node.id = node_linear_id++;
compiled_model->nodes.push_back(std::move(metal_node));
}
}
return absl::OkStatus();
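
A note on the id remapping in the loop above: operation input/output ids that are non-negative are graph ValueIds used directly, while negative ids refer to the subgraph's new_tensors (so -1 means new_tensors[0], -2 means new_tensors[1], and so on). A hypothetical helper showing the same arithmetic (ResolveTensorId is illustrative only, not part of this change):

// Illustrative only: mirrors mapping_to_global_ids[-(id + 1)] used above.
// mapping_to_global_ids[j] holds the global ValueId assigned to new_tensors[j].
ValueId ResolveTensorId(int id, const std::map<int, ValueId>& mapping_to_global_ids) {
  if (id >= 0) {
    return static_cast<ValueId>(id);           // an existing graph tensor
  }
  return mapping_to_global_ids.at(-(id + 1));  // -1 -> new_tensors[0], -2 -> new_tensors[1], ...
}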


@@ -168,18 +168,6 @@ macos_unit_test(
deps = [":conv_test_lib"],
)
cc_library(
name = "custom_registry",
srcs = ["custom_registry.cc"],
hdrs = ["custom_registry.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:precision",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
],
)
cc_library(
name = "depthwise_conv",
srcs = ["depthwise_conv.cc"],


@@ -3,6 +3,37 @@ package(
licenses = ["notice"], # Apache 2.0
)
cc_library(
name = "default_selector",
hdrs = ["default_selector.h"],
deps = [
":subgraph",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/metal/selectors/default:default_selector", # buildcleaner: keep
],
)
cc_library(
name = "operation_selector",
srcs = ["operation_selector.cc"],
hdrs = ["operation_selector.h"],
deps = [
":default_selector",
":subgraph",
"//tensorflow/lite/delegates/gpu/common:gpu_info",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:precision",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common:winograd_util",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
"//tensorflow/lite/delegates/gpu/metal/kernels",
],
)
cc_library(
name = "subgraph",
srcs = ["subgraph.cc"],


@@ -0,0 +1,16 @@
package(
default_visibility = ["//visibility:public"],
licenses = ["notice"], # Apache 2.0
)
cc_library(
name = "default_selector",
srcs = ["default_selector.cc"],
deps = [
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/metal/selectors:subgraph",
"@com_google_absl//absl/strings",
],
)


@@ -1,4 +1,4 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,25 +13,24 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/kernels/custom_registry.h"
#include <vector>
#include <memory>
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/subgraph.h"
namespace tflite {
namespace gpu {
namespace metal {
absl::Status RegisterCustomOps(const GraphFloat32& graph, const Node* node,
const std::vector<ValueId>& inputs,
const std::vector<ValueId>& outputs,
CalculationsPrecision precision,
std::vector<ComputeTaskDescriptorPtr>* tasks) {
return absl::UnimplementedError("Unsupported op: " + node->operation.type);
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph) {
return absl::UnimplementedError(
absl::StrCat("No selector for ", node.operation.type));
}
} // namespace metal


@@ -1,4 +1,4 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,29 +13,26 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CUSTOM_REGISTRY_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CUSTOM_REGISTRY_H_
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_DEFAULT_SELECTOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_DEFAULT_SELECTOR_H_
#include <vector>
#include <memory>
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/subgraph.h"
namespace tflite {
namespace gpu {
namespace metal {
// Registers custom operations.
absl::Status RegisterCustomOps(const GraphFloat32& graph, const Node* node,
const std::vector<ValueId>& inputs,
const std::vector<ValueId>& outputs,
CalculationsPrecision precision,
std::vector<ComputeTaskDescriptorPtr>* tasks);
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph);
} // namespace metal
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CUSTOM_REGISTRY_H_
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_DEFAULT_SELECTOR_H_


@@ -0,0 +1,529 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.h"
#include <vector>
#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/concat.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/mean.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/padding.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/prelu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/quantize_and_dequantize.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/relu.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/reshape.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/resize.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/slice.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/softmax.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/winograd.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/default_selector.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/subgraph.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
std::unique_ptr<ComputeTaskDescriptor> SelectDepthWiseConv(
const OperationDef& op_def, const DepthwiseConvolution2DAttributes& attr) {
if (CheckDepthWiseConv3x3Stride1x1Support(attr)) {
auto gpu_op = DepthWiseConv3x3Stride1x1(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else if (CheckDepthWiseConv3x3Stride2Support(attr)) {
auto gpu_op = DepthWiseConv3x3Stride2(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = DepthWiseConvolution(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectConvolutionTransposed(
const OperationDef& op_def, const ConvolutionTransposedAttributes& attr,
const GpuInfo& gpu_info) {
if (CheckConvolutionTransposed4x4Support(attr)) {
auto gpu_op = ConvolutionTransposed4x4(op_def, attr, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = ConvolutionTransposed(op_def, attr, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectQuantizeAndDequantize(
const OperationDef& op_def, const QuantizeAndDequantizeAttributes& attr) {
auto gpu_op = QuantizeAndDequantize(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectPReLU(
const OperationDef& op_def, const BHWC& src_shape,
const PReLUAttributes& attr) {
auto alpha = absl::get_if<Tensor<Linear, DataType::FLOAT32>>(&attr.alpha);
if (alpha) {
auto gpu_op = PReLU(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
auto alpha3d = absl::get_if<Tensor<HWC, DataType::FLOAT32>>(&attr.alpha);
if (!alpha3d) {
return {};
}
if (alpha3d->shape.h != src_shape.h || alpha3d->shape.w != src_shape.w ||
alpha3d->shape.c != src_shape.c) {
return {};
}
auto gpu_op = PReLUFull(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectReshape(
const OperationDef& op_def, const BHWC& src_shape,
const ReshapeAttributes& attr) {
if (src_shape.c % 4 == 0 && attr.new_shape.c % 4 == 0) {
auto gpu_op = Reshapex4(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Reshape(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectSoftmax(const OperationDef& op_def,
const BHWC& src_shape,
const GpuInfo& gpu_info) {
if (src_shape.w == 1 && src_shape.h == 1) {
auto gpu_op = Softmax1x1(op_def, gpu_info);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Softmax(op_def);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectSpaceToDepth(
const OperationDef& op_def, const SpaceToDepthAttributes& attr) {
auto gpu_op = SpaceToDepth(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
std::unique_ptr<ComputeTaskDescriptor> SelectWinograd4x4To36(
const OperationDef& op_def, const Winograd4x4To36Attributes& attr,
const GpuInfo& gpu_info) {
if (gpu_info.IsApple()) {
auto gpu_op = Winograd4x4To36(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Winograd4x4To36TileX6(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
std::unique_ptr<ComputeTaskDescriptor> SelectWinograd36To4x4(
const OperationDef& op_def, const Winograd36To4x4Attributes& attr,
const GpuInfo& gpu_info) {
if (gpu_info.IsApple()) {
auto gpu_op = Winograd36To4x4(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
auto gpu_op = Winograd36To4x4Tile4x1(op_def, attr);
return absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
}
bool IsRecommendedForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const GpuInfo& gpu_info,
const BHWC& dst_shape) {
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int total_tiles = tiles_x * tiles_y;
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
int min_depth = 16;
const int min_tiles = 32;
if (total_tiles >= min_tiles * 8) {
min_depth /= 4;
min_depth = std::max(min_depth, 8);
} else if (total_tiles >= min_tiles * 4) {
min_depth /= 2;
min_depth = std::max(min_depth, 8);
}
const bool recommended_channels =
src_depth >= min_depth && dst_depth >= min_depth;
const bool recommended_hw = total_tiles >= min_tiles;
return recommended_channels && recommended_hw;
}
absl::Status WinogradFromNode(const GpuInfo& gpu_info,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const OperationDef& op_def,
const BHWC& input_shape, const BHWC& output_shape,
const Convolution2DAttributes& attr,
GPUOperationsSubgraph* gpu_subgraph) {
if (!IsSuitableForWinograd4x4To6x6(attr)) {
return absl::UnimplementedError("No implementation for this case.");
}
if (!IsRecommendedForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
return absl::UnimplementedError("Not recommended for this case.");
}
const int tiles_x = DivideRoundUp(output_shape.w, 4);
const int tiles_y = DivideRoundUp(output_shape.h, 4);
const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor tensor_desc = op_def.src_tensors[0];
gpu_subgraph->new_tensors = {{shape_0, tensor_desc}, {shape_1, tensor_desc}};
gpu_subgraph->operations.clear();
gpu_subgraph->operations.resize(3);
OperationDef winograd_up_def;
winograd_up_def.precision = op_def.precision;
winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
winograd_up_def.dst_tensors.push_back(op_def.src_tensors[0]);
auto& winograd_up = gpu_subgraph->operations[0];
Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding;
winograd_up.operation =
SelectWinograd4x4To36(winograd_up_def, wino_up_attr, gpu_info);
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
winograd_up.output_ids = {-1};
OperationDef conv_def;
conv_def.precision = op_def.precision;
conv_def.src_tensors.push_back(op_def.src_tensors[0]);
conv_def.dst_tensors.push_back(op_def.src_tensors[0]);
auto& conv = gpu_subgraph->operations[1];
conv.input_ids = {-1};
conv.output_ids = {-2};
auto gpu_op = ConvolutionWino4x4To6x6(conv_def, shape_1, attr, gpu_info);
conv.operation = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
OperationDef winograd_down_def;
winograd_down_def.precision = op_def.precision;
winograd_down_def.src_tensors.push_back(op_def.src_tensors[0]);
winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]);
auto& winograd_down = gpu_subgraph->operations[2];
winograd_down.input_ids = {-2};
winograd_down.output_ids = {static_cast<int>(outputs[0]->id)};
Winograd36To4x4Attributes wino_down_attr;
wino_down_attr.output_shape = outputs[0]->tensor.shape;
wino_down_attr.biases = attr.bias;
winograd_down.operation =
SelectWinograd36To4x4(winograd_down_def, wino_down_attr, gpu_info);
return absl::OkStatus();
}
} // namespace
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const Node& node,
GPUOperationsSubgraph* gpu_subgraph) {
std::unique_ptr<ComputeTaskDescriptor>* task =
InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
auto op_type = OperationTypeFromString(node.operation.type);
switch (op_type) {
case OperationType::ADD: {
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else { // more than 2 inputs
auto gpu_op = Add(op_def);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
}
case OperationType::CONCAT: {
std::vector<BHWC> input_shapes;
for (auto& input : inputs) {
input_shapes.push_back(input->tensor.shape);
}
auto gpu_op = Concat(
op_def, absl::any_cast<ConcatAttributes>(node.operation.attributes),
input_shapes);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::CONVOLUTION_2D: {
if (inputs.size() != 1) {
return absl::UnimplementedError(
"Convolution does not support more than 1 runtime tensor");
}
auto attr =
absl::any_cast<Convolution2DAttributes>(node.operation.attributes);
auto input_shape = inputs[0]->tensor.shape;
auto output_shape = outputs[0]->tensor.shape;
if (WinogradFromNode(gpu_info, inputs, outputs, op_def, input_shape,
output_shape, attr, gpu_subgraph)
.ok()) {
return absl::OkStatus();
} else {
auto gpu_op = ConvolutionGeneric(op_def, output_shape, attr, gpu_info);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
}
case OperationType::CONVOLUTION_TRANSPOSED:
if (inputs.size() != 1) {
return absl::UnimplementedError(
"Convolution Transposed does not support more than 1 runtime "
"tensor");
}
*task = SelectConvolutionTransposed(
op_def,
absl::any_cast<ConvolutionTransposedAttributes>(
node.operation.attributes),
gpu_info);
break;
case OperationType::DEPTHWISE_CONVOLUTION:
if (inputs.size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime "
"tensor");
}
*task = SelectDepthWiseConv(
op_def, absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes));
break;
case OperationType::FULLY_CONNECTED: {
auto gpu_op = FullyConnected(
op_def,
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes),
gpu_info);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MAX_UNPOOLING_2D: {
auto gpu_op = MaxUnpooling(
op_def,
absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean supports HW axis only in Metal");
}
auto gpu_op = Mean(op_def, attr);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::MUL:
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
break;
case OperationType::PAD: {
auto attr = absl::any_cast<PadAttributes>(node.operation.attributes);
if (attr.appended.b != 0 || attr.prepended.b != 0) {
return absl::UnimplementedError("Padding for BATCH is not supported.");
}
auto gpu_op = Padding(op_def, attr);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::POOLING_2D: {
auto attr =
absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
auto pooling_op_def = op_def;
pooling_op_def.dst_tensors = {op_def.dst_tensors[0]};
auto gpu_op = Pooling(op_def, attr, false);
gpu_subgraph->operations[0].operation =
absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
gpu_subgraph->operations[0].input_ids = {static_cast<int>(inputs[0]->id)};
gpu_subgraph->operations[0].output_ids = {
static_cast<int>(outputs[0]->id)};
if (attr.type == PoolingType::MAX && attr.output_indices) {
gpu_subgraph->operations.push_back({});
auto gpu_ind_op = Pooling(op_def, attr, true);
gpu_subgraph->operations[1].operation =
absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_ind_op));
gpu_subgraph->operations[1].input_ids = {
static_cast<int>(inputs[0]->id)};
gpu_subgraph->operations[1].output_ids = {
static_cast<int>(outputs[1]->id)};
}
break;
}
case OperationType::PRELU: {
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectPReLU(
op_def, src_shape,
absl::any_cast<PReLUAttributes>(node.operation.attributes));
break;
}
case OperationType::RELU: {
auto gpu_op = ReLU(
op_def, absl::any_cast<ReLUAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::QUANTIZE_AND_DEQUANTIZE:
*task = SelectQuantizeAndDequantize(
op_def, absl::any_cast<QuantizeAndDequantizeAttributes>(
node.operation.attributes));
break;
case OperationType::RESHAPE: {
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectReshape(
op_def, src_shape,
absl::any_cast<ReshapeAttributes>(node.operation.attributes));
break;
}
case OperationType::RESIZE: {
auto gpu_op =
Resize(op_def,
absl::any_cast<Resize2DAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::SLICE: {
auto gpu_op = Slice(
op_def, absl::any_cast<SliceAttributes>(node.operation.attributes));
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::SOFTMAX: {
auto attr = absl::any_cast<SoftmaxAttributes>(node.operation.attributes);
if (attr.axis != Axis::CHANNELS) {
return absl::UnimplementedError(
"Softmax supports only CHANNELS dimension");
}
const auto src_shape = inputs[0]->tensor.shape;
*task = SelectSoftmax(op_def, src_shape, gpu_info);
break;
}
case OperationType::SPACE_TO_DEPTH:
*task = SelectSpaceToDepth(op_def, absl::any_cast<SpaceToDepthAttributes>(
node.operation.attributes));
break;
case OperationType::ABS:
case OperationType::COPY:
case OperationType::COS:
case OperationType::ELU:
case OperationType::EXP:
case OperationType::HARD_SWISH:
case OperationType::LOG:
case OperationType::NEG:
case OperationType::RSQRT:
case OperationType::SIGMOID:
case OperationType::SIN:
case OperationType::SQRT:
case OperationType::SQUARE:
case OperationType::TANH: {
auto gpu_op = ElementwiseWithOneInput(op_def, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
break;
}
case OperationType::DIV:
case OperationType::MAXIMUM:
case OperationType::MINIMUM:
case OperationType::POW:
case OperationType::SQUARED_DIFF:
case OperationType::SUB: {
if (inputs.size() == 1) {
if (node.operation.attributes.has_value()) {
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
auto gpu_op = ElementwiseWithOneInputAndConstantArguent(
op_def, op_type, attr.param);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
} else {
return absl::UnimplementedError(
"Missing attributes for single input op: " + node.operation.type);
}
} else if (inputs.size() == 2) {
auto gpu_op =
ElementwiseWithTwoInputs(op_def, inputs[1]->tensor.shape, op_type);
*task = absl::make_unique<ComputeTaskDescriptor>(std::move(gpu_op));
}
} break;
case OperationType::BATCH_NORMALIZATION:
case OperationType::BATCH_TO_SPACE:
case OperationType::BATCHED_MATMUL:
case OperationType::CONST:
case OperationType::LSTM:
// TODO(b/162763635): implement MeanStddevNormalization for Metal.
case OperationType::MEAN_STDDEV_NORMALIZATION:
case OperationType::REDUCE_MAXIMUM:
case OperationType::REDUCE_MINIMUM:
case OperationType::REDUCE_PRODUCT:
case OperationType::REDUCE_SUM:
// comparison operations
case OperationType::LESS:
case OperationType::LESS_EQUAL:
case OperationType::EQUAL:
case OperationType::NOT_EQUAL:
case OperationType::GREATER:
case OperationType::GREATER_EQUAL:
case OperationType::SPACE_TO_BATCH:
case OperationType::TRANSPOSE:
return absl::UnimplementedError("Unsupported op: " + node.operation.type);
default:
return SelectDefault(gpu_info, op_def, inputs, outputs, node,
gpu_subgraph);
}
return absl::OkStatus();
}
} // namespace metal
} // namespace gpu
} // namespace tflite


@@ -0,0 +1,39 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_OPERATION_SELECTOR_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_OPERATION_SELECTOR_H_
#include <memory>
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/metal/selectors/subgraph.h"
namespace tflite {
namespace gpu {
namespace metal {
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const Node& node,
GPUOperationsSubgraph* gpu_subgraph);
} // namespace metal
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_SELECTORS_OPERATION_SELECTOR_H_