Open-source OpenCL-based backend.

The next step is to wire it up with the OpenGL-based backend and provide a single GPU delegate.

PiperOrigin-RevId: 263822202
Authored by A. Unique TensorFlower on 2019-08-16 12:18:22 -07:00; committed by TensorFlower Gardener
parent 3a73493dfe
commit f1b58a9c2c
165 changed files with 24255 additions and 0 deletions


tensorflow/lite/delegates/gpu/cl/BUILD
@@ -0,0 +1,423 @@
load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
package(
default_visibility = ["//visibility:public"],
licenses = ["notice"], # Apache 2.0
)
cc_library(
name = "opencl_wrapper",
srcs = ["opencl_wrapper.cc"],
hdrs = ["opencl_wrapper.h"],
linkopts = select({
"//tensorflow:android": [
"-ldl", # opencl_wrapper calls dlopen()
"-lm",
],
"//conditions:default": ["-ldl"], # opencl_wrapper calls dlopen()
}),
deps = [
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@opencl_headers",
],
)
cc_library(
name = "cl_device",
srcs = ["cl_device.cc"],
hdrs = ["cl_device.h"],
deps = [
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_event",
srcs = ["cl_event.cc"],
hdrs = ["cl_event.h"],
deps = [
":opencl_wrapper",
],
)
cc_library(
name = "cl_context",
srcs = ["cl_context.cc"],
hdrs = ["cl_context.h"],
deps = [
":cl_device",
":cl_image_format",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_memory",
srcs = ["cl_memory.cc"],
hdrs = ["cl_memory.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:access_type",
"//tensorflow/lite/delegates/gpu/common:status",
],
)
cc_library(
name = "cl_command_queue",
srcs = ["cl_command_queue.cc"],
hdrs = ["cl_command_queue.h"],
deps = [
":cl_context",
":cl_device",
":cl_event",
":cl_kernel",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_image_format",
srcs = ["cl_image_format.cc"],
hdrs = ["cl_image_format.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "util",
srcs = ["util.cc"],
hdrs = ["util.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:util",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "cl_errors",
hdrs = ["cl_errors.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:status",
],
)
cc_library(
name = "cl_program",
srcs = ["cl_program.cc"],
hdrs = ["cl_program.h"],
deps = [
":cl_context",
":cl_device",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "gl_interop",
srcs = ["gl_interop.cc"],
hdrs = ["gl_interop.h"],
deps = [
":cl_context",
":cl_device",
":cl_errors",
":cl_event",
":cl_memory",
":egl_sync",
":environment",
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:access_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/gl:gl_call",
"//tensorflow/lite/delegates/gpu/gl:gl_sync",
"//tensorflow/lite/delegates/gpu/gl:portable",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "program_cache",
srcs = ["program_cache.cc"],
hdrs = ["program_cache.h"],
deps = [
":cl_context",
":cl_device",
":cl_kernel",
":cl_program",
":compiled_program_cache_cc_fbs",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/types:span",
"@farmhash_archive//:farmhash",
"@flatbuffers",
],
)
cc_library(
name = "precision",
srcs = ["precision.cc"],
hdrs = ["precision.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "cl_kernel",
srcs = ["cl_kernel.cc"],
hdrs = ["cl_kernel.h"],
deps = [
":cl_context",
":cl_device",
":cl_program",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/cl/kernels:flt_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "buffer",
srcs = ["buffer.cc"],
hdrs = ["buffer.h"],
deps = [
":cl_command_queue",
":cl_context",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "texture2d",
srcs = ["texture2d.cc"],
hdrs = ["texture2d.h"],
deps = [
":cl_command_queue",
":cl_context",
":opencl_wrapper",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "tensor",
srcs = ["tensor.cc"],
hdrs = ["tensor.h"],
deps = [
":cl_command_queue",
":cl_context",
":cl_device",
":cl_image_format",
":cl_memory",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "tensor_type",
srcs = ["tensor_type.cc"],
hdrs = ["tensor_type.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "tensor_type_util",
srcs = ["tensor_type_util.cc"],
hdrs = ["tensor_type_util.h"],
deps = [
":tensor_type",
"//tensorflow/lite/delegates/gpu:api",
],
)
cc_library(
name = "environment",
srcs = ["environment.cc"],
hdrs = ["environment.h"],
deps = [
":cl_command_queue",
":cl_context",
":cl_device",
":cl_kernel",
":precision",
":program_cache",
":tensor",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
],
)
cc_library(
name = "inference_context",
srcs = ["inference_context.cc"],
hdrs = ["inference_context.h"],
deps = [
":cl_command_queue",
":cl_device",
":environment",
":model_hints",
":opencl_wrapper",
":precision",
":tensor_type",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/selectors:operation_selector",
"//tensorflow/lite/delegates/gpu/common:memory_management",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:model_transformer",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common/transformations:add_bias",
"//tensorflow/lite/delegates/gpu/common/transformations:merge_padding_with",
],
)
cc_library(
name = "linear_storage",
srcs = ["linear_storage.cc"],
hdrs = ["linear_storage.h"],
deps = [
":buffer",
":opencl_wrapper",
":tensor_type",
":texture2d",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "model_hints",
hdrs = ["model_hints.h"],
)
cc_library(
name = "egl_sync",
srcs = ["egl_sync.cc"],
hdrs = ["egl_sync.h"],
defines = [
"EGL_EGLEXT_PROTOTYPES",
],
deps = [
":cl_device",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/gl:gl_call",
],
)
cc_library(
name = "api",
srcs = ["api.cc"],
hdrs = ["api.h"],
deps = [
":cl_command_queue",
":cl_errors",
":cl_event",
":egl_sync",
":environment",
":gl_interop",
":inference_context",
":opencl_wrapper",
":precision",
":tensor",
":tensor_type",
":tensor_type_util",
"//tensorflow/lite/delegates/gpu:api",
"//tensorflow/lite/delegates/gpu/cl/kernels:converter",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "gpu_api_delegate",
srcs = ["gpu_api_delegate.cc"],
hdrs = ["gpu_api_delegate.h"],
linkopts = select({
"//tensorflow:android": [
"-lEGL",
"-lGLESv3",
],
"//conditions:default": [],
}),
deps = [
":api",
":opencl_wrapper",
":tensor_type_util",
"//tensorflow/lite:kernel_api",
"//tensorflow/lite/c:c_api_internal",
"//tensorflow/lite/delegates/gpu:api",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:model_builder",
"//tensorflow/lite/delegates/gpu/common:model_transformer",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common/transformations:general_transformations",
"@com_google_absl//absl/types:span",
],
)
flatbuffer_cc_library(
name = "compiled_program_cache_cc_fbs",
srcs = ["compiled_program_cache.fbs"],
flatc_args = [
"--scoped-enums",
],
)
tflite_portable_test_suite()


tensorflow/lite/delegates/gpu/cl/api.cc
@@ -0,0 +1,790 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include <algorithm>
#include <cstring>
#include <EGL/eglext.h>
#include "absl/memory/memory.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// Connects tensor definition provided by a user (external) with tensor
// definition used by the inference engine (internal).
struct TensorTieDef {
ValueId id;
AccessType access_type;
TensorObjectDef internal_def;
TensorObjectDef external_def;
};
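// Illustration (not in the original code): if a caller supplies CPU memory in
// BHWC layout while the engine stores the tensor as an OpenCL texture, the
// external_def carries the CPU layout and the internal_def carries the texture
// layout; the TensorTie classes below bridge the two.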
// Connects an external tensor object to an internal tensor object and provides
// functionality to copy data between them.
class TensorTie {
public:
explicit TensorTie(const TensorTieDef& def) : def_(def) {}
virtual ~TensorTie() {}
virtual Status SetExternalObject(TensorObject obj) {
return InvalidArgumentError("Tensor object is readonly.");
}
virtual TensorObject GetExternalObject() = 0;
virtual Status CopyToExternalObject() = 0;
virtual Status CopyFromExternalObject() = 0;
const TensorTieDef& def() const { return def_; }
private:
const TensorTieDef def_;
};
// Both internal and external defs are identical, therefore nothing to connect
// here.
class NoopTensorTie : public TensorTie {
public:
NoopTensorTie(const TensorTieDef& def, TensorObject obj)
: TensorTie(def), obj_(obj) {}
static bool IsSupported(const TensorTieDef& def) {
return def.external_def == def.internal_def;
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
std::unique_ptr<TensorTie>* tie) {
*tie = absl::make_unique<NoopTensorTie>(def, internal_object);
return OkStatus();
}
TensorObject GetExternalObject() final { return obj_; }
Status CopyToExternalObject() final { return OkStatus(); }
Status CopyFromExternalObject() final { return OkStatus(); }
private:
TensorObject obj_;
};
// Does one-step conversion between internal and external objects.
// It may also allocate external objects if requested.
class DefaultTensorTie : public TensorTie {
public:
DefaultTensorTie(const TensorTieDef& def, TensorObject internal_obj)
: TensorTie(def), internal_obj_(internal_obj) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
auto object_type = def.external_def.object_def.object_type;
return (object_type == ObjectType::OPENCL_BUFFER ||
object_type == ObjectType::OPENCL_TEXTURE ||
object_type == ObjectType::CPU_MEMORY) &&
converter_builder->IsSupported(def.internal_def, def.external_def) &&
converter_builder->IsSupported(def.external_def, def.internal_def);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env, std::unique_ptr<TensorTie>* tie) {
auto tie_impl = absl::make_unique<DefaultTensorTie>(def, internal_object);
RETURN_IF_ERROR(tie_impl->Init(converter_builder, env));
*tie = std::move(tie_impl);
return OkStatus();
}
Status CopyToExternalObject() final {
if (!converter_to_) {
return UnavailableError("Conversion is not available");
}
return converter_to_->Convert(internal_obj_, GetExternalObject());
}
Status CopyFromExternalObject() final {
if (!converter_from_) {
return UnavailableError("Conversion is not available");
}
return converter_from_->Convert(GetExternalObject(), internal_obj_);
}
Status SetExternalObject(TensorObject obj) final {
if (!def().external_def.object_def.user_provided) {
return InvalidArgumentError("External object is read-only");
}
if (!IsValid(def().external_def, obj)) {
return InvalidArgumentError("Given object is not valid");
}
external_obj_ = obj;
return OkStatus();
}
TensorObject GetExternalObject() final { return external_obj_; }
private:
Status Init(TensorObjectConverterBuilder* converter_builder,
Environment* env) {
RETURN_IF_ERROR(converter_builder->MakeConverter(
def().internal_def, def().external_def, &converter_to_));
RETURN_IF_ERROR(converter_builder->MakeConverter(
def().external_def, def().internal_def, &converter_from_));
return MaybeAllocateExternalObject(env);
}
Status MaybeAllocateExternalObject(Environment* env) {
const TensorObjectDef& d = def().external_def;
if (d.object_def.user_provided) {
return OkStatus();
}
switch (d.object_def.object_type) {
case ObjectType::CPU_MEMORY: {
size_t bytes_size =
d.dimensions.product() * SizeOf(d.object_def.data_type);
cpu_memory_.resize(bytes_size);
external_obj_ = CpuMemory{cpu_memory_.data(), cpu_memory_.size()};
break;
}
case ObjectType::OPENCL_TEXTURE:
case ObjectType::OPENCL_BUFFER: {
auto& dims = d.dimensions;
RETURN_IF_ERROR(
AllocateTensorMemory(env->context(), env->device(), dims.w, dims.h,
dims.c, d.object_def.data_type,
ToTensorStorageType(d.object_def.object_type,
d.object_def.data_layout),
&cl_memory_));
if (d.object_def.object_type == ObjectType::OPENCL_TEXTURE) {
external_obj_ = OpenClTexture{cl_memory_.memory()};
} else {
external_obj_ = OpenClBuffer{cl_memory_.memory()};
}
break;
}
default:
return InternalError("Unexpected object type");
}
return OkStatus();
}
const TensorObject internal_obj_;
TensorObject external_obj_;
CLMemory cl_memory_;
std::vector<uint8_t> cpu_memory_;
std::unique_ptr<TensorObjectConverter> converter_to_;
std::unique_ptr<TensorObjectConverter> converter_from_;
};
// Copies data to an intermediate OpenCL buffer and then does a two-step
// conversion. It handles cases where one-step conversion is not supported,
// for example:
// - CPU BHWC -> CL buffer BHWC -> CL texture DHWC4.
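// Internally the tie is split into two DefaultTensorTies: an outer tie between
// the external object and an intermediate OpenCL buffer, and an inner tie
// between that buffer and the internal object (see MakeOuterInnerDefs below).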
class TwoStepTensorTie : public TensorTie {
public:
explicit TwoStepTensorTie(const TensorTieDef& def) : TensorTie(def) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
auto defs = MakeOuterInnerDefs(def);
return DefaultTensorTie::IsSupported(defs.first, converter_builder) &&
DefaultTensorTie::IsSupported(defs.second, converter_builder);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env, std::unique_ptr<TensorTie>* tie) {
auto tie_impl = absl::make_unique<TwoStepTensorTie>(def);
RETURN_IF_ERROR(tie_impl->Init(internal_object, converter_builder, env));
*tie = std::move(tie_impl);
return OkStatus();
}
Status CopyToExternalObject() final {
RETURN_IF_ERROR(inner_tie_->CopyToExternalObject());
return outer_tie_->CopyToExternalObject();
}
Status CopyFromExternalObject() final {
RETURN_IF_ERROR(outer_tie_->CopyFromExternalObject());
return inner_tie_->CopyFromExternalObject();
}
Status SetExternalObject(TensorObject obj) final {
return outer_tie_->SetExternalObject(obj);
}
TensorObject GetExternalObject() final {
return outer_tie_->GetExternalObject();
}
private:
static std::pair<TensorTieDef, TensorTieDef> MakeOuterInnerDefs(
const TensorTieDef& def) {
TensorTieDef outer_def;
outer_def.external_def = def.external_def;
outer_def.internal_def = def.external_def;
outer_def.internal_def.object_def.object_type = ObjectType::OPENCL_BUFFER;
outer_def.internal_def.object_def.user_provided = true;
TensorTieDef inner_def;
inner_def.external_def = outer_def.internal_def;
inner_def.external_def.object_def.user_provided = false;
inner_def.internal_def = def.internal_def;
return std::make_pair(outer_def, inner_def);
}
Status Init(TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env) {
auto defs = MakeOuterInnerDefs(def());
RETURN_IF_ERROR(DefaultTensorTie::New(defs.second, internal_object,
converter_builder, env, &inner_tie_));
return DefaultTensorTie::New(defs.first, inner_tie_->GetExternalObject(),
converter_builder, env, &outer_tie_);
}
std::unique_ptr<TensorTie> inner_tie_;
std::unique_ptr<TensorTie> outer_tie_;
};
// Captures GL object into CL context before performing a conversion.
class GlBufferHolder : public TensorTie {
public:
GlBufferHolder(const TensorTieDef& def, GlInteropFabric* gl_interop_fabric,
Environment* env)
: TensorTie(def),
gl_interop_fabric_(gl_interop_fabric),
environment_(env) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
if (!def.external_def.object_def.user_provided ||
def.external_def.object_def.object_type != ObjectType::OPENGL_SSBO) {
return false;
}
return DefaultTensorTie::IsSupported(MakeClDef(def), converter_builder);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
GlInteropFabric* gl_interop_fabric, Environment* env,
std::unique_ptr<TensorTie>* tie) {
auto tie_impl =
absl::make_unique<GlBufferHolder>(def, gl_interop_fabric, env);
RETURN_IF_ERROR(DefaultTensorTie::New(MakeClDef(def), internal_object,
converter_builder, env,
&tie_impl->tie_));
*tie = std::move(tie_impl);
return OkStatus();
}
Status SetExternalObject(TensorObject obj) final {
auto ssbo = absl::get_if<OpenGlBuffer>(&obj);
if (!ssbo) {
return InvalidArgumentError("Missing OpenGL SSBO");
}
auto old_ssbo = absl::get_if<OpenGlBuffer>(&external_obj_);
if (old_ssbo && ssbo->id == old_ssbo->id) {
return OkStatus();
}
if (cl_object_.memory()) {
gl_interop_fabric_->UnregisterMemory(cl_object_.memory());
}
RETURN_IF_ERROR(CreateClMemoryFromGlBuffer(
ssbo->id, def().access_type, &environment_->context(), &cl_object_));
external_obj_ = obj;
RETURN_IF_ERROR(tie_->SetExternalObject(OpenClBuffer{cl_object_.memory()}));
gl_interop_fabric_->RegisterMemory(cl_object_.memory());
return OkStatus();
}
TensorObject GetExternalObject() final { return external_obj_; }
Status CopyFromExternalObject() final {
return tie_->CopyFromExternalObject();
}
Status CopyToExternalObject() final { return tie_->CopyToExternalObject(); }
private:
static TensorTieDef MakeClDef(const TensorTieDef& def) {
auto cl_def = def;
cl_def.external_def.object_def.object_type = ObjectType::OPENCL_BUFFER;
cl_def.external_def.object_def.user_provided = true;
return cl_def;
}
CLMemory cl_object_;
GlInteropFabric* gl_interop_fabric_;
Environment* environment_;
std::unique_ptr<TensorTie> tie_;
TensorObject external_obj_;
};
TensorObject TensorToObj(const Tensor& tensor) {
if (tensor.StorageType() == TensorStorageType::BUFFER) {
return OpenClBuffer{tensor.GetMemoryPtr()};
}
return OpenClTexture{tensor.GetMemoryPtr()};
}
// Responsible for creating new tensor objects.
class TensorTieFactory {
public:
TensorTieFactory(Environment* env, InferenceContext* context,
GlInteropFabric* gl_interop_fabric)
: env_(*env),
context_(*context),
gl_interop_fabric_(gl_interop_fabric),
converter_builder_(NewConverterBuilder(env)) {}
bool IsSupported(const TensorTieDef& def) const {
auto converter = converter_builder_.get();
return IsValid(def.external_def.object_def) &&
(NoopTensorTie::IsSupported(def) ||
DefaultTensorTie::IsSupported(def, converter) ||
GlBufferHolder::IsSupported(def, converter) ||
TwoStepTensorTie::IsSupported(def, converter));
}
Status NewTensorTie(const TensorTieDef& def,
std::unique_ptr<TensorTie>* tie) {
TensorObject internal_object = TensorToObj(*context_.GetTensor(def.id));
auto converter = converter_builder_.get();
if (NoopTensorTie::IsSupported(def)) {
return NoopTensorTie::New(def, internal_object, tie);
}
if (DefaultTensorTie::IsSupported(def, converter)) {
return DefaultTensorTie::New(def, internal_object, converter, &env_, tie);
}
if (GlBufferHolder::IsSupported(def, converter)) {
if (!gl_interop_fabric_) {
return InvalidArgumentError(
"GL object is used but InferenceEnvironmentOptions does not have "
"EGL display and context set.");
}
return GlBufferHolder::New(def, internal_object, converter,
gl_interop_fabric_, &env_, tie);
}
if (TwoStepTensorTie::IsSupported(def, converter)) {
return TwoStepTensorTie::New(def, internal_object, converter, &env_, tie);
}
return UnimplementedError("Unsupported tensor tie definition.");
}
private:
Environment& env_;
InferenceContext& context_;
GlInteropFabric* gl_interop_fabric_;
std::unique_ptr<TensorObjectConverterBuilder> converter_builder_;
};
class InferenceRunnerImpl : public InferenceRunner {
public:
InferenceRunnerImpl(const InferenceEnvironmentOptions& env_options,
Environment* environment,
std::unique_ptr<InferenceContext> context,
std::unique_ptr<GlInteropFabric> gl_interop_fabric)
: env_options_(env_options),
environment_(environment),
context_(std::move(context)),
gl_interop_fabric_(std::move(gl_interop_fabric)) {}
Status Initialize(const std::vector<TensorTieDef>& inputs,
const std::vector<TensorTieDef>& outputs,
TensorTieFactory* factory) {
RETURN_IF_ERROR(LinkTensors(inputs, factory, &inputs_));
return LinkTensors(outputs, factory, &outputs_);
}
std::vector<TensorObjectDef> inputs() const override {
return GetExternalDefinitions(inputs_);
}
std::vector<TensorObjectDef> outputs() const override {
return GetExternalDefinitions(outputs_);
}
Status GetInputObject(int index, TensorObject* object) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
*object = inputs_[index]->GetExternalObject();
return OkStatus();
}
Status GetOutputObject(int index, TensorObject* object) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
*object = outputs_[index]->GetExternalObject();
return OkStatus();
}
Status SetInputObject(int index, TensorObject object) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return inputs_[index]->SetExternalObject(object);
}
Status SetOutputObject(int index, TensorObject object) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return outputs_[index]->SetExternalObject(object);
}
Status Run() override {
if (gl_interop_fabric_) {
RETURN_IF_ERROR(gl_interop_fabric_->Start());
}
for (auto& obj : inputs_) {
RETURN_IF_ERROR(obj->CopyFromExternalObject());
}
RETURN_IF_ERROR(context_->AddToQueue(environment_->queue()));
clFlush(environment_->queue()->queue());
for (auto& obj : outputs_) {
RETURN_IF_ERROR(obj->CopyToExternalObject());
}
if (gl_interop_fabric_) {
RETURN_IF_ERROR(gl_interop_fabric_->Finish());
}
return OkStatus();
}
private:
static Status LinkTensors(const std::vector<TensorTieDef>& defs,
TensorTieFactory* factory,
std::vector<std::unique_ptr<TensorTie>>* objects) {
objects->reserve(defs.size());
for (auto& def : defs) {
std::unique_ptr<TensorTie> object;
RETURN_IF_ERROR(factory->NewTensorTie(def, &object));
objects->push_back(std::move(object));
}
return OkStatus();
}
static std::vector<TensorObjectDef> GetExternalDefinitions(
const std::vector<std::unique_ptr<TensorTie>>& objects) {
std::vector<TensorObjectDef> defs;
defs.reserve(objects.size());
for (auto& obj : objects) {
defs.push_back(obj->def().external_def);
}
return defs;
}
const InferenceEnvironmentOptions env_options_;
Environment* environment_;
std::unique_ptr<InferenceContext> context_;
std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
std::vector<std::unique_ptr<TensorTie>> inputs_;
std::vector<std::unique_ptr<TensorTie>> outputs_;
};
TensorObjectDef TensorToDef(const Tensor& tensor) {
TensorObjectDef def;
def.dimensions.b = 1;
def.dimensions.h = tensor.Height();
def.dimensions.w = tensor.Width();
def.dimensions.c = tensor.Channels();
def.object_def.data_layout = ToDataLayout(tensor.StorageType());
def.object_def.data_type = tensor.DataType();
def.object_def.object_type = ToObjectType(tensor.StorageType());
def.object_def.user_provided = false;
return def;
}
class InferenceBuilderImpl : public InferenceBuilder {
public:
InferenceBuilderImpl(const InferenceOptions& options,
const InferenceEnvironmentOptions env_options,
const InferenceEnvironmentProperties properties,
Environment* environment,
std::unique_ptr<GraphFloat32> graph)
: options_(options),
env_options_(env_options),
properties_(properties),
environment_(environment),
graph_(std::move(graph)) {}
Status Initialize() {
// Select precision based on given options.
CalculationsPrecision precision = CalculationsPrecision::F32;
if (options_.allow_precision_loss) {
precision = options_.priority == InferencePriority::MAX_PRECISION
? CalculationsPrecision::F32_F16
: CalculationsPrecision::F16;
}
// Increase precision if not supported.
if (!environment_->IsSupported(precision)) {
precision = CalculationsPrecision::F32_F16;
if (!environment_->IsSupported(precision)) {
precision = CalculationsPrecision::F32;
}
}
context_ = absl::make_unique<InferenceContext>();
InferenceContext::CreateInferenceInfo create_info;
create_info.precision = precision;
create_info.storage_type = GetOptimalStorageType(environment_->device());
create_info.hints.Add(ModelHints::kReduceKernelsCount);
// TODO(sorokin) temporary hack to speed up init time in some cases.
// TODO(sorokin): move this check to the place where hint is applied.
if ((precision == CalculationsPrecision::F16 ||
precision == CalculationsPrecision::F32_F16) &&
create_info.storage_type == TensorStorageType::TEXTURE_ARRAY &&
environment_->device().IsAdreno6xxOrHigher()) {
create_info.hints.Add(ModelHints::kFastTuning);
}
RETURN_IF_ERROR(
context_->InitFromGraph(create_info, *graph_, environment_));
if (env_options_.IsGlAware()) {
gl_interop_fabric_ = absl::make_unique<GlInteropFabric>(
env_options_.egl_display, environment_);
}
tie_factory_ = absl::make_unique<TensorTieFactory>(
environment_, context_.get(), gl_interop_fabric_.get());
inputs_ = LinkTensors(graph_->inputs());
outputs_ = LinkTensors(graph_->outputs());
return OkStatus();
}
std::vector<TensorObjectDef> inputs() const override {
return GetExternalDefinitions(inputs_);
}
std::vector<TensorObjectDef> outputs() const override {
return GetExternalDefinitions(outputs_);
}
Status SetInputShape(int index, const Dimensions& dimensions) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return UnimplementedError("Changing input shapes is not supported");
}
Status SetInputObjectDef(int index, ObjectDef new_def) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
auto def = inputs_[index];
def.external_def.object_def = new_def;
if (!tie_factory_->IsSupported(def)) {
return InvalidArgumentError("New object definition is not supported.");
}
inputs_[index] = def;
return OkStatus();
}
Status SetOutputObjectDef(int index, ObjectDef new_def) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
auto def = outputs_[index];
def.external_def.object_def = new_def;
if (!tie_factory_->IsSupported(def)) {
return InvalidArgumentError("New object definition is not supported.");
}
outputs_[index] = def;
return OkStatus();
}
Status Build(std::unique_ptr<InferenceRunner>* runner) override {
if (gl_interop_fabric_ && !HasGlObjects()) {
// destroy interop layer when there are no GL objects to avoid
// extra synchronization cost.
gl_interop_fabric_.reset(nullptr);
}
auto runner_impl = absl::make_unique<InferenceRunnerImpl>(
env_options_, environment_, std::move(context_),
std::move(gl_interop_fabric_));
RETURN_IF_ERROR(
runner_impl->Initialize(inputs_, outputs_, tie_factory_.get()));
*runner = std::move(runner_impl);
return OkStatus();
}
private:
// Links internal tensors with external user-facing objects.
std::vector<TensorTieDef> LinkTensors(
const std::vector<Value<TensorRef<BHWC>>*>& values) {
std::vector<TensorTieDef> links;
links.reserve(values.size());
for (const auto& value : values) {
TensorObjectDef def = TensorToDef(*context_->GetTensor(value->id));
AccessType access = graph_->IsGraphInput(value->id) ? AccessType::READ
: AccessType::WRITE;
links.push_back({value->id, access, def, def});
}
return links;
}
bool HasGlObjects() const {
auto is_gl = [](ObjectType t) {
return t == ObjectType::OPENGL_SSBO || t == ObjectType::OPENGL_TEXTURE;
};
for (const TensorTieDef& def : inputs_) {
if (is_gl(def.external_def.object_def.object_type)) {
return true;
}
}
for (const TensorTieDef& def : outputs_) {
if (is_gl(def.external_def.object_def.object_type)) {
return true;
}
}
return false;
}
static std::vector<TensorObjectDef> GetExternalDefinitions(
const std::vector<TensorTieDef>& links) {
std::vector<TensorObjectDef> defs;
defs.reserve(links.size());
for (auto& desc : links) {
defs.push_back(desc.external_def);
}
return defs;
}
const InferenceOptions options_;
const InferenceEnvironmentOptions env_options_;
const InferenceEnvironmentProperties properties_;
std::unique_ptr<InferenceContext> context_;
std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
Environment* environment_;
std::unique_ptr<GraphFloat32> graph_;
std::vector<TensorTieDef> inputs_;
std::vector<TensorTieDef> outputs_;
std::unique_ptr<TensorTieFactory> tie_factory_;
};
class InferenceEnvironmentImpl : public InferenceEnvironment {
public:
explicit InferenceEnvironmentImpl(const InferenceEnvironmentOptions& options)
: options_(options) {}
Status Init() {
RETURN_IF_ERROR(LoadOpenCL());
properties_.is_opencl_available = true;
if (options_.IsGlAware()) {
RETURN_IF_ERROR(CreateGLCompatibleEnvironment(
reinterpret_cast<cl_context_properties>(options_.egl_context),
reinterpret_cast<cl_context_properties>(options_.egl_display),
&environment_));
} else {
RETURN_IF_ERROR(CreateEnvironment(&environment_));
}
auto& device = environment_.device();
properties_.is_gl_sharing_supported = IsGlSharingSupported(device);
properties_.is_gl_to_cl_fast_sync_supported =
IsClEventFromEglSyncSupported(device);
properties_.is_cl_to_gl_fast_sync_supported =
IsEglSyncFromClEventSupported();
if (options_.IsGlAware() && !properties_.is_gl_sharing_supported) {
return UnavailableError("GL sharing is not supported");
}
return OkStatus();
}
Status NewInferenceBuilder(const InferenceOptions& options,
const GraphFloat32& model,
std::unique_ptr<InferenceBuilder>* builder) final {
if (environment_.program_cache() &&
!options_.serialized_binary_cache.empty()) {
// Ignore returned error. Cache is discarded.
environment_.program_cache()
->AddSerializedCache(environment_.context(), environment_.device(),
options_.serialized_binary_cache)
.IgnoreError();
}
auto cl_graph = absl::make_unique<GraphFloat32>();
RETURN_IF_ERROR(model.MakeExactCopy(cl_graph.get()));
RETURN_IF_ERROR(RunGraphTransforms(cl_graph.get()));
auto builder_impl = absl::make_unique<InferenceBuilderImpl>(
options, options_, properties_, &environment_, std::move(cl_graph));
RETURN_IF_ERROR(builder_impl->Initialize());
*builder = std::move(builder_impl);
return OkStatus();
}
std::vector<uint8_t> GetSerializedBinaryCache() const final {
std::vector<uint8_t> data;
// If there was a problem, the data will be empty.
environment_.program_cache()
->GetSerializedCache(environment_.device(), &data)
.IgnoreError();
return data;
}
const InferenceEnvironmentProperties& properties() const {
return properties_;
}
private:
const InferenceEnvironmentOptions options_;
Environment environment_;
InferenceEnvironmentProperties properties_;
};
} // namespace
Status NewInferenceEnvironment(
const InferenceEnvironmentOptions& options,
std::unique_ptr<InferenceEnvironment>* environment,
InferenceEnvironmentProperties* properties) {
auto env_impl = absl::make_unique<InferenceEnvironmentImpl>(options);
Status status = env_impl->Init();
if (properties) {
*properties = env_impl->properties();
}
RETURN_IF_ERROR(status);
*environment = std::move(env_impl);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/api.h
@@ -0,0 +1,125 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
#include <cstdint>
#include <memory>
#include <EGL/egl.h>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
// Usage example:
//
// std::unique_ptr<InferenceEnvironment> env;
// RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &env, /*properties=*/nullptr));
//
// InferenceOptions options;
//
// std::unique_ptr<InferenceBuilder> builder;
// RETURN_IF_ERROR(env->NewInferenceBuilder(options, model, &builder));
// // now builder is ready to prepare inference runner.
//
// -----------------
// Supported formats
// -----------------
//
// OpenCL implementation uses 2D textures as the primary format.
// Tensor in HWDC4 layout is {TEXTURE_2D, RGBA, width := W*D, height := H}.
//
namespace tflite {
namespace gpu {
namespace cl {
enum class InferencePriority {
MIN_LATENCY,
MAX_PRECISION,
};
struct InferenceOptions {
bool allow_precision_loss = false;
InferencePriority priority = InferencePriority::MAX_PRECISION;
};
// Properties of the created OpenCL inference environment.
struct InferenceEnvironmentProperties {
bool is_opencl_available = false;
// GL objects (buffers and textures) could be shared with CL context.
bool is_gl_sharing_supported = false;
// Indicates whether fast GL->CL synchronization is supported.
bool is_gl_to_cl_fast_sync_supported = false;
// Indicates whether fast CL->GL synchronization is supported.
bool is_cl_to_gl_fast_sync_supported = false;
};
// Environment manages all resources that need to stay alive as long as any
// inference using the OpenCL backend is running.
class InferenceEnvironment {
public:
virtual ~InferenceEnvironment() {}
virtual Status NewInferenceBuilder(
const InferenceOptions& options, const GraphFloat32& model,
std::unique_ptr<InferenceBuilder>* builder) = 0;
// Returns an opaque binary blob that contains a collection of already compiled
// OpenCL kernels present in the cache. The returned data can be reused later
// to speed up compilation when a new environment is created for the same set
// of models.
// The data is valid only when used on the same device; otherwise it will not
// be compatible and will be discarded.
virtual std::vector<uint8_t> GetSerializedBinaryCache() const = 0;
};
struct InferenceEnvironmentOptions {
// Whenever the input and/or output is a GL object, the EGL display and context
// must be set to create a GL-aware OpenCL context. Do not set these variables
// when GL interoperability is not needed.
EGLDisplay egl_display = EGL_NO_DISPLAY;
EGLContext egl_context = EGL_NO_CONTEXT;
// Should contain data returned from the
// InferenceEnvironment::GetSerializedBinaryCache method.
// Invalid or incompatible data will be discarded. A compiled binary may become
// incompatible when the GPU driver is updated.
absl::Span<const uint8_t> serialized_binary_cache;
bool IsGlAware() const {
return egl_context != EGL_NO_CONTEXT && egl_display != EGL_NO_DISPLAY;
}
};
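// Illustrative round trip for the binary cache (not part of the original
// header): persist the blob returned by GetSerializedBinaryCache() after the
// first run and feed it back on the next start-up. SaveToDisk and LoadFromDisk
// are hypothetical helpers.
//
//   std::vector<uint8_t> blob = env->GetSerializedBinaryCache();
//   SaveToDisk(blob);
//   ...
//   std::vector<uint8_t> cached = LoadFromDisk();
//   InferenceEnvironmentOptions options;
//   options.serialized_binary_cache = absl::MakeConstSpan(cached);
//   RETURN_IF_ERROR(NewInferenceEnvironment(options, &env, &properties));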
// Creates new OpenCL environment that needs to stay around until all inference
// runners are destroyed.
Status NewInferenceEnvironment(
const InferenceEnvironmentOptions& options,
std::unique_ptr<InferenceEnvironment>* environment,
InferenceEnvironmentProperties* properties /* optional */);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
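
For reference, a minimal usage sketch that ties the API above together. It is not part of the commit: it assumes a populated GraphFloat32 named model, a surrounding function that returns Status (so RETURN_IF_ERROR applies), and placeholder input_data/input_size_bytes for a caller-owned CPU buffer.

std::unique_ptr<InferenceEnvironment> env;
InferenceEnvironmentProperties properties;
InferenceEnvironmentOptions env_options;  // no GL interop, no binary cache
RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &env, &properties));
InferenceOptions options;
options.allow_precision_loss = true;
options.priority = InferencePriority::MIN_LATENCY;
std::unique_ptr<InferenceBuilder> builder;
RETURN_IF_ERROR(env->NewInferenceBuilder(options, model, &builder));
// Request caller-provided CPU memory for input 0; outputs are configured
// symmetrically via SetOutputObjectDef.
ObjectDef cpu_def = builder->inputs()[0].object_def;
cpu_def.object_type = ObjectType::CPU_MEMORY;
cpu_def.user_provided = true;
RETURN_IF_ERROR(builder->SetInputObjectDef(0, cpu_def));
std::unique_ptr<InferenceRunner> runner;
RETURN_IF_ERROR(builder->Build(&runner));
RETURN_IF_ERROR(runner->SetInputObject(0, CpuMemory{input_data, input_size_bytes}));
RETURN_IF_ERROR(runner->Run());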


tensorflow/lite/delegates/gpu/cl/buffer.cc
@@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only, const void* data,
CLContext* context, Buffer* result) {
cl_mem_flags flags = gpu_read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
if (data != nullptr) {
flags |= CL_MEM_COPY_HOST_PTR;
}
cl_int error_code;
cl_mem buffer = clCreateBuffer(context->context(), flags, size_in_bytes,
const_cast<void*>(data), &error_code);
if (!buffer) {
return UnknownError(
absl::StrCat("Failed to allocate device memory with clCreateBuffer",
CLErrorCodeToString(error_code)));
}
*result = Buffer(buffer, size_in_bytes);
return OkStatus();
}
} // namespace
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
: buffer_(buffer), size_(size_in_bytes) {}
Buffer::Buffer(Buffer&& buffer) : buffer_(buffer.buffer_), size_(buffer.size_) {
buffer.buffer_ = nullptr;
buffer.size_ = 0;
}
Buffer& Buffer::operator=(Buffer&& buffer) {
if (this != &buffer) {
Release();
std::swap(size_, buffer.size_);
std::swap(buffer_, buffer.buffer_);
}
return *this;
}
Buffer::~Buffer() { Release(); }
void Buffer::Release() {
if (buffer_) {
clReleaseMemObject(buffer_);
buffer_ = nullptr;
size_ = 0;
}
}
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result) {
return CreateBuffer(size_in_bytes, true, nullptr, context, result);
}
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
CLContext* context, Buffer* result) {
return CreateBuffer(size_in_bytes, true, data, context, result);
}
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result) {
return CreateBuffer(size_in_bytes, false, nullptr, context, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/buffer.h
@@ -0,0 +1,99 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Buffer represents linear GPU data storage with an arbitrary data format.
// Buffer is movable but not copyable.
class Buffer {
public:
Buffer() {}  // Needed only to use Buffer as a class member.
Buffer(cl_mem buffer, size_t size_in_bytes);
// Move only
Buffer(Buffer&& buffer);
Buffer& operator=(Buffer&& buffer);
Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;
~Buffer();
cl_mem GetMemoryPtr() const { return buffer_; }
// Writes data to the buffer. The data must cover exactly size_in_bytes bytes
// (the constructor parameter).
template <typename T>
Status WriteData(CLCommandQueue* queue, const absl::Span<T> data);
// Reads data from Buffer into CPU memory.
template <typename T>
Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
private:
void Release();
cl_mem buffer_ = nullptr;
size_t size_ = 0;
};
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result);
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
CLContext* context, Buffer* result);
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result);
template <typename T>
Status Buffer::WriteData(CLCommandQueue* queue, const absl::Span<T> data) {
if (size_ != sizeof(T) * data.size()) {
return InvalidArgumentError(
"absl::Span<T> data size is different from buffer allocated size.");
}
RETURN_IF_ERROR(queue->EnqueueWriteBuffer(buffer_, size_, data.data()));
return OkStatus();
}
template <typename T>
Status Buffer::ReadData(CLCommandQueue* queue, std::vector<T>* result) const {
if (size_ % sizeof(T) != 0) {
return UnknownError("Wrong element size(typename T is not correct?");
}
const int elements_count = size_ / sizeof(T);
result->resize(elements_count);
return queue->EnqueueReadBuffer(buffer_, size_, result->data());
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
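
A short illustrative fragment (not part of the commit) showing the Buffer helpers above. It assumes a CLContext* context and a CLCommandQueue* queue are already available (for example from an Environment) and a surrounding function that returns Status.

std::vector<float> host_data = {1.0f, 2.0f, 3.0f, 4.0f};
Buffer buffer;
// Uploads host_data at creation time via CL_MEM_COPY_HOST_PTR.
RETURN_IF_ERROR(CreateReadOnlyBuffer(host_data.size() * sizeof(float),
                                     host_data.data(), context, &buffer));
// Reads the contents back; the element type must evenly divide the buffer size.
std::vector<float> readback;
RETURN_IF_ERROR(buffer.ReadData(queue, &readback));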


tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -0,0 +1,326 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include <vector>
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
CLCommandQueue::CLCommandQueue(cl_command_queue queue) : queue_(queue) {}
CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue) : queue_(queue.queue_) {
queue.queue_ = nullptr;
}
CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
if (this != &queue) {
Release();
std::swap(queue_, queue.queue_);
}
return *this;
}
CLCommandQueue::~CLCommandQueue() { Release(); }
void CLCommandQueue::Release() {
if (queue_) {
clReleaseCommandQueue(queue_);
queue_ = nullptr;
}
}
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size, CLEvent* event) {
std::vector<size_t> local(3);
std::vector<size_t> global(3);
for (int i = 0; i < 3; ++i) {
local[i] = work_group_size[i];
global[i] = AlignByN(grid[i], work_group_size[i]);
}
cl_event resulting_event;
const int error_code =
clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(),
local.data(), 0, nullptr, &resulting_event);
*event = CLEvent(resulting_event);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) {
std::vector<size_t> local(3);
std::vector<size_t> global(3);
for (int i = 0; i < 3; ++i) {
local[i] = work_group_size[i];
global[i] = AlignByN(grid[i], work_group_size[i]);
}
const int error_code =
clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(),
local.data(), 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
cl_event resulting_event;
const int error_code = clEnqueueMarker(queue_, &resulting_event);
*event = CLEvent(resulting_event);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
const void* data) {
const size_t origin[] = {0, 0, 0};
const size_t r[] = {static_cast<size_t>(region.x),
static_cast<size_t>(region.y),
static_cast<size_t>(region.z)};
auto error_code = clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0,
0, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
void* data) {
const size_t origin[] = {0, 0, 0};
const size_t r[] = {static_cast<size_t>(region.x),
static_cast<size_t>(region.y),
static_cast<size_t>(region.z)};
auto error_code = clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0,
data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
const void* data) {
auto error_code = clEnqueueWriteBuffer(
queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
void* data) {
auto error_code = clEnqueueReadBuffer(
queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::WaitForCompletion() {
auto error_code = clFinish(queue_);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
}
return OkStatus();
}
ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
: CLCommandQueue(queue) {
events_.reserve(128);
}
ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
: CLCommandQueue(std::move(queue)),
events_(std::move(queue.events_)),
current_label_(std::move(queue.current_label_)) {}
ProfilingCommandQueue& ProfilingCommandQueue::operator=(
ProfilingCommandQueue&& queue) {
if (this != &queue) {
events_ = std::move(queue.events_);
current_label_ = std::move(queue.current_label_);
CLCommandQueue::operator=(std::move(queue));
}
return *this;
}
void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
current_label_ = name;
}
void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
int3 grid,
int3 work_group_size) {
events_.push_back(CLEvent());
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_size, &events_[events_.size() - 1]));
events_.back().SetName(current_label_);
return OkStatus();
}
ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
ProfilingInfo result;
result.dispatches.resize(events_.size());
for (int i = 0; i < events_.size(); ++i) {
result.dispatches[i].label = events_[i].GetName();
result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
}
return result;
}
Status ProfilingCommandQueue::GetBestWorkGroupIndex(
const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
const std::vector<int3>& work_group_sizes, int* index) {
// Some Adreno 3xx GPUs can report wrong numbers for some events.
const bool possible_bug_with_events =
device_info.vendor == Vendor::QUALCOMM &&
device_info.adreno_info.gpu_version < 400;
events_.resize(work_group_sizes.size());
for (int i = 0; i < work_group_sizes.size(); ++i) {
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_sizes[i], &events_[i]));
// Reduces the rate of a memory leak on Mali for some kernels.
if (device_info.vendor == Vendor::MALI && i % 8 == 7) {
events_[i - 7].Wait();
}
if (possible_bug_with_events) {
// Try to increase the probability of a correct result.
RETURN_IF_ERROR(WaitForCompletion());
}
}
RETURN_IF_ERROR(WaitForCompletion());
// To release memory of some kernel pool on Mali.
if (device_info.vendor == Vendor::MALI) {
RETURN_IF_ERROR(kernel.ReInit());
}
int minimum_index = 0;
double minimum_time = std::numeric_limits<double>::max();
if (possible_bug_with_events) { // we will try to cut out suspicious results
double average_time = 0.0;
int average_samples_count = 0;
for (int i = 0; i < work_group_sizes.size(); ++i) {
if (events_[i].GetEventTimeMs() < 100 * 1000) { // 100 sec
average_time += events_[i].GetEventTimeMs();
average_samples_count++;
}
}
average_time /= average_samples_count;
for (int i = 0; i < work_group_sizes.size(); ++i) {
double time = events_[i].GetEventTimeMs();
if (time < minimum_time && time >= 0.1 * average_time) {
minimum_index = i;
minimum_time = time;
}
}
} else {
for (int i = 0; i < work_group_sizes.size(); ++i) {
double time = events_[i].GetEventTimeMs();
if (time < minimum_time) {
minimum_index = i;
minimum_time = time;
}
}
}
*index = minimum_index;
return OkStatus();
}
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
CLCommandQueue* result) {
int error_code;
cl_command_queue queue =
clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
if (!queue) {
return UnknownError(absl::StrCat("Failed to create a command queue - ",
CLErrorCodeToString(error_code)));
}
*result = CLCommandQueue(queue);
return OkStatus();
}
double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
const uint64_t start = events_.front().GetStartedTimeNs();
const uint64_t end = events_.back().GetFinishedTimeNs();
const uint64_t time_ns = (end - start);
return static_cast<double>(time_ns) / 1000000.0;
}
double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
double sum = 0.0;
for (int i = 0; i < events_.size(); ++i) {
sum += events_[i].GetEventTimeMs();
}
return sum;
}
Status CreateProfilingCommandQueue(const CLDevice& device,
const CLContext& context,
ProfilingCommandQueue* result) {
int error_code;
cl_command_queue queue = clCreateCommandQueue(
context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
if (!queue) {
return UnknownError(absl::StrCat("Failed to create a command queue - ",
CLErrorCodeToString(error_code)));
}
*result = ProfilingCommandQueue(queue);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -0,0 +1,136 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
#include <cstdint>
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
struct ProfilingInfo {
struct DispatchInfo {
std::string label;
uint64_t time_ns;
double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
};
std::vector<DispatchInfo> dispatches;
};
// A wrapper around an OpenCL command queue.
class CLCommandQueue {
public:
CLCommandQueue() {}
explicit CLCommandQueue(cl_command_queue queue);
// Move only
CLCommandQueue(CLCommandQueue&& queue);
CLCommandQueue& operator=(CLCommandQueue&& queue);
CLCommandQueue(const CLCommandQueue&) = delete;
CLCommandQueue& operator=(const CLCommandQueue&) = delete;
virtual ~CLCommandQueue();
cl_command_queue queue() const { return queue_; }
virtual Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size);
Status EnqueueEvent(CLEvent* event);
Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size, CLEvent* event);
Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
const void* data);
Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void* data);
Status WaitForCompletion();
protected:
void Release();
cl_command_queue queue_ = nullptr;
};
class ProfilingCommandQueue : public CLCommandQueue {
public:
ProfilingCommandQueue() {}
explicit ProfilingCommandQueue(cl_command_queue queue);
// Move only
ProfilingCommandQueue(ProfilingCommandQueue&& queue);
ProfilingCommandQueue& operator=(ProfilingCommandQueue&& queue);
ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) override;
// Writes the index of the fastest work group among work_group_sizes.
Status GetBestWorkGroupIndex(const CLKernel& kernel,
const DeviceInfo& device_info, const int3& grid,
const std::vector<int3>& work_group_sizes,
int* index);
// Call ResetMeasurements() to start a new series of measurements.
void ResetMeasurements();
double GetQueueExecutionTimeMs() const;
// Unlike GetQueueExecutionTimeMs, this value does not include the time spent
// between kernels on the GPU (kernel launches or preparation). It is usually
// 5-10% lower than GetQueueExecutionTimeMs, because that 5-10% is spent on
// something other than kernel execution (kernel launches or preparation).
double GetSumOfEventsTimeMs() const;
// This label will be used for all subsequent dispatches.
void SetEventsLabel(const std::string& name);
ProfilingInfo GetProfilingInfo() const;
private:
std::vector<CLEvent> events_;
std::string current_label_;
};
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
CLCommandQueue* result);
Status CreateProfilingCommandQueue(const CLDevice& device,
const CLContext& context,
ProfilingCommandQueue* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
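
For clarity (not part of the change itself), here is a minimal usage sketch of the profiling queue declared above. The kernel and the grid/work-group sizes are placeholders, and the helpers are assumed to come from this backend, inside the tflite::gpu::cl namespace:

```cpp
// Sketch: timing one kernel dispatch with ProfilingCommandQueue.
Status ProfileKernel(const CLDevice& device, const CLContext& context,
                     const CLKernel& kernel, double* total_time_ms) {
  ProfilingCommandQueue queue;
  RETURN_IF_ERROR(CreateProfilingCommandQueue(device, context, &queue));
  queue.SetEventsLabel("my_kernel");  // label attached to subsequent dispatches
  // Placeholder grid and work-group sizes.
  RETURN_IF_ERROR(queue.DispatchImplicit(kernel, {256, 256, 1}, {8, 8, 1}));
  RETURN_IF_ERROR(queue.WaitForCompletion());
  // Per-dispatch timings are also available via queue.GetProfilingInfo().
  *total_time_ms = queue.GetQueueExecutionTimeMs();
  return OkStatus();
}
```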

View File

@ -0,0 +1,123 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_image_format.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::vector<cl_image_format> GetSupportedImage2DFormats(cl_context context,
cl_mem_flags flags) {
cl_uint num_image_formats;
cl_int error = clGetSupportedImageFormats(
context, flags, CL_MEM_OBJECT_IMAGE2D, 0, nullptr, &num_image_formats);
if (error != CL_SUCCESS) {
return {};
}
std::vector<cl_image_format> result(num_image_formats);
error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D,
num_image_formats, &result[0], nullptr);
if (error != CL_SUCCESS) {
return {};
}
return result;
}
Status CreateCLContext(const CLDevice& device,
cl_context_properties* properties, CLContext* result) {
int error_code;
cl_device_id device_id = device.id();
cl_context context =
clCreateContext(properties, 1, &device_id, nullptr, nullptr, &error_code);
if (!context) {
return UnknownError(absl::StrCat("Failed to create a compute context - ",
CLErrorCodeToString(error_code)));
}
*result = CLContext(context);
return OkStatus();
}
} // namespace
CLContext::CLContext(cl_context context) : context_(context) {}
CLContext::CLContext(CLContext&& context) : context_(context.context_) {
context.context_ = nullptr;
}
CLContext& CLContext::operator=(CLContext&& context) {
if (this != &context) {
Release();
std::swap(context_, context.context_);
}
return *this;
}
CLContext::~CLContext() { Release(); }
void CLContext::Release() {
if (context_) {
clReleaseContext(context_);
context_ = nullptr;
}
}
bool CLContext::IsFloatTexture2DSupported(int num_channels, DataType data_type,
cl_mem_flags flags) const {
auto supported_formats = GetSupportedImage2DFormats(context_, flags);
for (auto format : supported_formats) {
if (format.image_channel_data_type == ToImageChannelType(data_type) &&
format.image_channel_order == ToChannelOrder(num_channels)) {
return true;
}
}
return false;
}
Status CreateCLContext(const CLDevice& device, CLContext* result) {
return CreateCLContext(device, nullptr, result);
}
Status CreateCLGLContext(const CLDevice& device,
cl_context_properties egl_context,
cl_context_properties egl_display, CLContext* result) {
if (!device.SupportsExtension("cl_khr_gl_sharing")) {
return UnavailableError("Device doesn't support CL-GL sharing.");
}
cl_context_properties platform =
reinterpret_cast<cl_context_properties>(device.platform());
cl_context_properties props[] = {CL_GL_CONTEXT_KHR,
egl_context,
CL_EGL_DISPLAY_KHR,
egl_display,
CL_CONTEXT_PLATFORM,
platform,
0};
return CreateCLContext(device, props, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// A RAII wrapper around an OpenCL context
class CLContext {
public:
CLContext() {}
explicit CLContext(cl_context context);
// Move only
CLContext(CLContext&& context);
CLContext& operator=(CLContext&& context);
CLContext(const CLContext&) = delete;
CLContext& operator=(const CLContext&) = delete;
~CLContext();
cl_context context() const { return context_; }
bool IsFloatTexture2DSupported(int num_channels, DataType data_type,
cl_mem_flags flags = CL_MEM_READ_WRITE) const;
private:
void Release();
cl_context context_ = nullptr;
};
Status CreateCLContext(const CLDevice& device, CLContext* result);
Status CreateCLGLContext(const CLDevice& device,
cl_context_properties egl_context,
cl_context_properties egl_display, CLContext* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
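
As an illustration only, the two creation paths above can be combined as in the following hedged sketch; the EGL handles are assumed to come from an already-initialized OpenGL ES environment:

```cpp
// Sketch: create a CL-GL shared context when the extension is available,
// otherwise fall back to a plain compute context.
Status CreateContextForDevice(const CLDevice& device,
                              cl_context_properties egl_context,
                              cl_context_properties egl_display,
                              CLContext* result) {
  if (device.SupportsExtension("cl_khr_gl_sharing")) {
    return CreateCLGLContext(device, egl_context, egl_display, result);
  }
  return CreateCLContext(device, result);
}
```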

View File

@ -0,0 +1,398 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include <algorithm>
#include <string>
#include <vector>
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
template <>
std::string GetDeviceInfo<std::string>(cl_device_id id, cl_device_info info) {
size_t size;
cl_int error = clGetDeviceInfo(id, info, 0, nullptr, &size);
if (error != CL_SUCCESS) {
return "";
}
std::string result(size - 1, 0);
error = clGetDeviceInfo(id, info, size, &result[0], nullptr);
if (error != CL_SUCCESS) {
return "";
}
return result;
}
namespace {
template <typename T>
T GetPlatformInfo(cl_platform_id id, cl_platform_info info) {
T result;
cl_int error = clGetPlatformInfo(id, info, sizeof(T), &result, nullptr);
if (error != CL_SUCCESS) {
return -1;
}
return result;
}
std::string GetPlatformInfo(cl_platform_id id, cl_platform_info info) {
size_t size;
cl_int error = clGetPlatformInfo(id, info, 0, nullptr, &size);
if (error != CL_SUCCESS) {
return "";
}
std::string result(size - 1, 0);
error = clGetPlatformInfo(id, info, size, &result[0], nullptr);
if (error != CL_SUCCESS) {
return "";
}
return result;
}
void GetDeviceWorkDimsSizes(cl_device_id id, int* result) {
int dims_count =
GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
if (dims_count < 3) {
return;
}
std::vector<size_t> limits(dims_count);
cl_int error =
clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * dims_count, limits.data(), nullptr);
if (error != CL_SUCCESS) {
return;
}
// dims_count must be at least 3 according to spec
result[0] = limits[0];
result[1] = limits[1];
result[2] = limits[2];
}
OpenCLVersion ParseCLVersion(const std::string& version) {
const auto first_dot_pos = version.find_first_of('.');
if (first_dot_pos == std::string::npos) {
return OpenCLVersion::CL_1_0;
}
const int major = version[first_dot_pos - 1] - '0';
const int minor = version[first_dot_pos + 1] - '0';
if (major == 1) {
if (minor == 2) {
return OpenCLVersion::CL_1_2;
} else if (minor == 1) {
return OpenCLVersion::CL_1_1;
} else {
return OpenCLVersion::CL_1_0;
}
} else {
return OpenCLVersion::CL_2_0;
}
}
Vendor ParseVendor(const std::string& device_name,
const std::string& vendor_name) {
std::string d_name = device_name;
std::string v_name = vendor_name;
std::transform(d_name.begin(), d_name.end(), d_name.begin(), ::tolower);
std::transform(v_name.begin(), v_name.end(), v_name.begin(), ::tolower);
if (d_name.find("qualcomm") != std::string::npos ||
v_name.find("qualcomm") != std::string::npos) {
return Vendor::QUALCOMM;
} else if (d_name.find("mali") != std::string::npos ||
v_name.find("mali") != std::string::npos) {
return Vendor::MALI;
} else if (d_name.find("power") != std::string::npos ||
v_name.find("power") != std::string::npos) {
return Vendor::POWERVR;
} else if (d_name.find("nvidia") != std::string::npos ||
v_name.find("nvidia") != std::string::npos) {
return Vendor::NVIDIA;
} else {
return Vendor::UNKNOWN;
}
}
// Checks that gpu_version belongs to the range [min_version, max_version);
// min_version is included and max_version is excluded.
bool isGPUVersionInRange(int gpu_version, int min_version, int max_version) {
return gpu_version >= min_version && gpu_version < max_version;
}
} // namespace
// There is no rule for gpu version encoding, but we found these samples:
// Version: OpenCL C 2.0 Adreno(TM) 540 // Pixel 2
// Version: OpenCL C 2.0 Adreno(TM) 630 // Sony Compact XZ2
// Version: OpenCL C 2.0 Adreno(TM) 630 // Pixel 3
// Version: OpenCL C 2.0 Adreno(TM) 540 // Samsung S8
// Version: OpenCL C 1.2 Adreno(TM) 430 // HTC One M9
// Version: OpenCL C 2.0 Adreno(TM) 530 // Samsung S7 Edge
// Version: OpenCL C 1.2 Adreno(TM) 405 // Motorola Moto G(4)
// After the version number, the string ends.
// It is assumed that the <vendor-specific information> for Adreno GPUs has
// the following format:
// <text?><space?>Adreno(TM)<space><text?><version>
// Returns -1 if the vendor-specific information cannot be parsed.
int GetAdrenoGPUVersion(const std::string& gpu_version) {
const std::string gpu = absl::AsciiStrToLower(gpu_version);
const std::vector<absl::string_view> words = absl::StrSplit(gpu, ' ');
int i = 0;
for (; i < words.size(); ++i) {
if (words[i].find("adreno") != words[i].npos) {
break;
}
}
i += 1;
for (; i < words.size(); ++i) {
int number;
bool is_number = absl::SimpleAtoi(words[i], &number);
// Adreno GPU numbers start from 2xx, but OpenCL support is expected only from 3xx.
if (is_number && number >= 300) {
return number;
}
}
return -1;
}
std::string VendorToString(Vendor v) {
switch (v) {
case Vendor::QUALCOMM:
return "Qualcomm";
case Vendor::MALI:
return "Mali";
case Vendor::POWERVR:
return "PowerVR";
case Vendor::NVIDIA:
return "NVIDIA";
case Vendor::UNKNOWN:
return "unknown vendor";
}
}
std::string OpenCLVersionToString(OpenCLVersion version) {
switch (version) {
case OpenCLVersion::CL_1_0:
return "1.0";
case OpenCLVersion::CL_1_1:
return "1.1";
case OpenCLVersion::CL_1_2:
return "1.2";
case OpenCLVersion::CL_2_0:
return "2.0";
}
}
AdrenoInfo::AdrenoInfo(const std::string& device_version)
: gpu_version(GetAdrenoGPUVersion(device_version)) {}
int AdrenoInfo::GetMaximumWavesCount() const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version >= 400 && gpu_version < 500) {
return -1; // Adreno 4xx does not support it currently
} else if (gpu_version >= 500 && gpu_version < 600) {
return -1; // Adreno 5xx does not support it currently
} else if (gpu_version >= 600 && gpu_version < 700) {
return gpu_version == 640 ? 30 : 16;
} else {
return -1;  // Adreno 7xx and higher do not exist yet
}
}
int AdrenoInfo::GetRegisterMemorySizePerComputeUnit() const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version >= 400 && gpu_version < 500) {
return -1; // Adreno 4xx does not support it currently
} else if (gpu_version >= 500 && gpu_version < 600) {
return -1; // Adreno 5xx does not support it currently
} else if (gpu_version >= 600 && gpu_version < 700) {
return gpu_version == 640 ? 128 * 144 * 16 : 128 * 96 * 16;
} else {
return -1;  // Adreno 7xx and higher do not exist yet
}
}
int AdrenoInfo::GetMaximumWavesCount(int register_footprint_per_tread,
bool full_wave) const {
const int register_usage_per_wave =
GetWaveSize(full_wave) * register_footprint_per_tread;
const int possible_waves_count =
GetRegisterMemorySizePerComputeUnit() / register_usage_per_wave;
return std::min(possible_waves_count, GetMaximumWavesCount());
}
int AdrenoInfo::GetWaveSize(bool full_wave) const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version < 600) {
return full_wave ? 64 : 32;
} else {
return full_wave ? 128 : 64;
}
}
DeviceInfo::DeviceInfo(cl_device_id id)
: adreno_info(GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION)) {
const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
vendor = ParseVendor(device_name, vendor_name);
cl_version = ParseCLVersion(
GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION));
extensions =
absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
supports_fp16 = false;
for (const auto& ext : extensions) {
if (ext == "cl_khr_fp16") {
supports_fp16 = true;
}
}
compute_units_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS);
image2d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH);
image2d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
if (cl_version >= OpenCLVersion::CL_1_2) {
image_buffer_max_size =
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE);
image_array_max_layers =
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE);
}
GetDeviceWorkDimsSizes(id, max_work_items_sizes);
}
bool DeviceInfo::SupportsTextureArray() const {
return cl_version >= OpenCLVersion::CL_1_2;
}
CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id)
: id_(id), platform_id_(platform_id), info_(id) {}
CLDevice::CLDevice(const CLDevice& device)
: id_(device.id_), platform_id_(device.platform_id_), info_(device.info_) {}
CLDevice& CLDevice::operator=(const CLDevice& device) {
if (this != &device) {
id_ = device.id_;
platform_id_ = device.platform_id_;
info_ = device.info_;
}
return *this;
}
CLDevice::CLDevice(CLDevice&& device)
: id_(device.id_),
platform_id_(device.platform_id_),
info_(std::move(device.info_)) {
device.id_ = nullptr;
device.platform_id_ = nullptr;
}
CLDevice& CLDevice::operator=(CLDevice&& device) {
if (this != &device) {
id_ = nullptr;
platform_id_ = nullptr;
std::swap(id_, device.id_);
std::swap(platform_id_, device.platform_id_);
info_ = std::move(device.info_);
}
return *this;
}
bool CLDevice::SupportsFP16() const { return info_.supports_fp16; }
bool CLDevice::SupportsExtension(const std::string& extension) const {
for (const auto& ext : info_.extensions) {
if (ext == extension) {
return true;
}
}
return false;
}
bool CLDevice::SupportsTextureArray() const {
return info_.SupportsTextureArray();
}
std::string CLDevice::GetPlatformVersion() const {
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
}
bool CLDevice::IsAdreno() const { return info_.vendor == Vendor::QUALCOMM; }
bool CLDevice::IsAdreno3xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 300, 400);
}
bool CLDevice::IsAdreno4xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 400, 500);
}
bool CLDevice::IsAdreno5xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 500, 600);
}
bool CLDevice::IsAdreno6xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 600, 700);
}
bool CLDevice::IsAdreno6xxOrHigher() const {
return IsAdreno() && info_.adreno_info.gpu_version >= 600;
}
bool CLDevice::SupportsOneLayerTextureArray() const {
return !IsAdreno() || info_.adreno_info.support_one_layer_texture_array;
}
void CLDevice::DisableOneLayerTextureArray() {
info_.adreno_info.support_one_layer_texture_array = false;
}
Status CreateDefaultGPUDevice(CLDevice* result) {
cl_uint num_platforms;
clGetPlatformIDs(0, nullptr, &num_platforms);
if (num_platforms == 0) {
return UnknownError("No supported OpenCL platform.");
}
std::vector<cl_platform_id> platforms(num_platforms);
clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
cl_uint num_devices;
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
if (num_devices == 0) {
return UnknownError("No GPU on current platform.");
}
std::vector<cl_device_id> devices(num_devices);
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices.data(),
nullptr);
*result = CLDevice(devices[0], platforms[0]);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,140 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
enum class Vendor { QUALCOMM, MALI, POWERVR, NVIDIA, UNKNOWN };
std::string VendorToString(Vendor v);
enum class OpenCLVersion { CL_1_0, CL_1_1, CL_1_2, CL_2_0 };
std::string OpenCLVersionToString(OpenCLVersion version);
// For use only in cl_device.cc, but placed here so that it can be tested.
int GetAdrenoGPUVersion(const std::string& gpu_version);
struct AdrenoInfo {
AdrenoInfo() = default;
explicit AdrenoInfo(const std::string& device_version);
int gpu_version = -1; // can be, for example, 405/430/540/530/630 etc.
// This function returns a physical parameter of Adreno 6xx GPUs that is not
// very well documented.
// We obtained it using the Snapdragon Profiler.
int GetMaximumWavesCount() const;
// Returns the amount of register memory per CU (Compute Unit) in bytes.
int GetRegisterMemorySizePerComputeUnit() const;
// Returns the maximum possible number of waves based on register usage.
int GetMaximumWavesCount(int register_footprint_per_tread,
bool full_wave = true) const;
int GetWaveSize(bool full_wave) const;
// Not supported on some Adreno devices with specific driver version.
// b/131099086
bool support_one_layer_texture_array = true;
};
struct DeviceInfo {
DeviceInfo() = default;
explicit DeviceInfo(cl_device_id id);
bool SupportsTextureArray() const;
std::vector<std::string> extensions;
bool supports_fp16;
Vendor vendor;
OpenCLVersion cl_version;
int compute_units_count;
int image2d_max_width;
int image2d_max_height;
int image_buffer_max_size;
int image_array_max_layers;
int max_work_items_sizes[3];
AdrenoInfo adreno_info;
};
// A wrapper around an OpenCL device id
class CLDevice {
public:
CLDevice() = default;
CLDevice(cl_device_id id, cl_platform_id platform_id);
CLDevice(CLDevice&& device);
CLDevice& operator=(CLDevice&& device);
CLDevice(const CLDevice&);
CLDevice& operator=(const CLDevice&);
~CLDevice() {}
cl_device_id id() const { return id_; }
cl_platform_id platform() const { return platform_id_; }
std::string GetPlatformVersion() const;
const DeviceInfo& GetInfo() const { return info_; }
const DeviceInfo* GetInfoPtr() const { return &info_; }
Vendor vendor() const { return info_.vendor; }
OpenCLVersion cl_version() const { return info_.cl_version; }
bool SupportsFP16() const;
bool SupportsTextureArray() const;
bool SupportsExtension(const std::string& extension) const;
bool IsAdreno() const;
bool IsAdreno3xx() const;
bool IsAdreno4xx() const;
bool IsAdreno5xx() const;
bool IsAdreno6xx() const;
bool IsAdreno6xxOrHigher() const;
// To track a bug on some Adreno devices. b/131099086
bool SupportsOneLayerTextureArray() const;
void DisableOneLayerTextureArray();
private:
cl_device_id id_ = nullptr;
cl_platform_id platform_id_ = nullptr;
DeviceInfo info_;
};
Status CreateDefaultGPUDevice(CLDevice* result);
template <typename T>
T GetDeviceInfo(cl_device_id id, cl_device_info info) {
T result;
cl_int error = clGetDeviceInfo(id, info, sizeof(T), &result, nullptr);
if (error != CL_SUCCESS) {
return -1;
}
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
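
A short usage sketch of the device helpers above (illustrative only; the capability check at the end is an assumption about how a caller might use the information):

```cpp
// Sketch: pick the default GPU and inspect a few of its properties.
Status DescribeDefaultGPU() {
  CLDevice device;
  RETURN_IF_ERROR(CreateDefaultGPUDevice(&device));
  // Raw OpenCL queries are available through the template helper.
  const cl_ulong global_mem_bytes =
      GetDeviceInfo<cl_ulong>(device.id(), CL_DEVICE_GLOBAL_MEM_SIZE);
  if (device.IsAdreno6xxOrHigher() && device.SupportsFP16()) {
    // A caller might, for example, prefer FP16 precision here.
  }
  (void)global_mem_bytes;
  return OkStatus();
}
```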

View File

@ -0,0 +1,41 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// @return OkStatus if error_code is CL_SUCCESS; otherwise translates the
// error code into an error message.
inline Status GetOpenCLError(cl_int error_code) {
if (error_code == CL_SUCCESS) {
return OkStatus();
}
return InternalError("OpenCL error: " + CLErrorCodeToString(error_code));
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
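
A typical use of this helper is to wrap a raw OpenCL call whose return code is the only error information; a minimal sketch with a hypothetical wrapper:

```cpp
// Sketch: turn the result of a raw OpenCL call into a Status.
Status Finish(cl_command_queue queue) {
  return GetOpenCLError(clFinish(queue));
}
```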

View File

@ -0,0 +1,81 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
namespace tflite {
namespace gpu {
namespace cl {
CLEvent::CLEvent(cl_event event) : event_(event) {}
CLEvent::CLEvent(CLEvent&& event)
: event_(event.event_), name_(std::move(event.name_)) {
event.event_ = nullptr;
}
CLEvent& CLEvent::operator=(CLEvent&& event) {
if (this != &event) {
Release();
std::swap(event_, event.event_);
name_ = std::move(event.name_);
}
return *this;
}
uint64_t CLEvent::GetStartedTimeNs() const {
cl_ulong time_ns;
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
&time_ns, nullptr);
return time_ns;
}
uint64_t CLEvent::GetFinishedTimeNs() const {
cl_ulong time_ns;
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
&time_ns, nullptr);
return time_ns;
}
double CLEvent::GetEventTimeMs() const {
const uint64_t start = GetStartedTimeNs();
const uint64_t end = GetFinishedTimeNs();
const uint64_t time_ns = (end - start);
return static_cast<double>(time_ns) * 1e-6;
}
uint64_t CLEvent::GetEventTimeNs() const {
return GetFinishedTimeNs() - GetStartedTimeNs();
}
void CLEvent::SetName(const std::string& name) { name_ = name; }
void CLEvent::Wait() const { clWaitForEvents(1, &event_); }
CLEvent::~CLEvent() { Release(); }
void CLEvent::Release() {
if (event_) {
clReleaseEvent(event_);
event_ = nullptr;
}
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,69 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_
#include <cstdint>
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
namespace tflite {
namespace gpu {
namespace cl {
// A RAII wrapper around an OpenCL event
class CLEvent {
public:
CLEvent() {}
explicit CLEvent(cl_event event);
// Move only
CLEvent(CLEvent&& event);
CLEvent& operator=(CLEvent&& event);
CLEvent(const CLEvent&) = delete;
CLEvent& operator=(const CLEvent&) = delete;
~CLEvent();
uint64_t GetStartedTimeNs() const;
uint64_t GetFinishedTimeNs() const;
double GetEventTimeMs() const;
uint64_t GetEventTimeNs() const;
void Wait() const;
cl_event event() const { return event_; }
bool is_valid() const { return event_ != nullptr; }
void SetName(const std::string& name);
std::string GetName() const { return name_; }
private:
void Release();
cl_event event_ = nullptr;
std::string name_; // optional, for profiling mostly
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_

View File

@ -0,0 +1,50 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_image_format.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_channel_order ToChannelOrder(int num_channels) {
switch (num_channels) {
case 1:
return CL_R;
case 2:
return CL_RG;
case 3:
return CL_RGB;
case 4:
return CL_RGBA;
default:
return -1;
}
}
cl_channel_type ToImageChannelType(DataType data_type) {
switch (data_type) {
case DataType::FLOAT32:
return CL_FLOAT;
case DataType::FLOAT16:
return CL_HALF_FLOAT;
default:
return -1;
}
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,34 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_channel_order ToChannelOrder(int num_channels);
cl_channel_type ToImageChannelType(DataType data_type);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
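
These converters are what CLContext::IsFloatTexture2DSupported matches against; a small illustrative sketch assembling a cl_image_format from them (the channel count and data type are placeholders):

```cpp
// Sketch: build an OpenCL image format for an RGBA FP16 texture.
cl_image_format MakeRGBAHalfFormat() {
  cl_image_format format;
  format.image_channel_order = ToChannelOrder(4);  // CL_RGBA
  format.image_channel_data_type =
      ToImageChannelType(DataType::FLOAT16);       // CL_HALF_FLOAT
  return format;
}
```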

View File

@ -0,0 +1,178 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id,
int* result) {
size_t max_work_group_size;
cl_int error_code =
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &max_work_group_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to get info CL_KERNEL_WORK_GROUP_SIZE ",
CLErrorCodeToString(error_code)));
}
*result = static_cast<int>(max_work_group_size);
return OkStatus();
}
Status GetKernelPrivateMemorySize(cl_kernel kernel, cl_device_id device_id,
int* result) {
cl_ulong private_mem_size;
cl_int error_code =
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof(cl_ulong), &private_mem_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to get info CL_KERNEL_PRIVATE_MEM_SIZE ",
CLErrorCodeToString(error_code)));
}
*result = static_cast<int>(private_mem_size);
return OkStatus();
}
} // namespace
CLKernel::CLKernel(CLKernel&& kernel)
: private_memory_size_(kernel.private_memory_size_),
max_work_group_size_(kernel.max_work_group_size_),
binding_counter_(kernel.binding_counter_),
function_name_(std::move(kernel.function_name_)),
program_(kernel.program_),
kernel_(kernel.kernel_) {
kernel.kernel_ = nullptr;
}
CLKernel& CLKernel::operator=(CLKernel&& kernel) {
if (this != &kernel) {
Release();
std::swap(private_memory_size_, kernel.private_memory_size_);
std::swap(max_work_group_size_, kernel.max_work_group_size_);
std::swap(binding_counter_, kernel.binding_counter_);
function_name_ = std::move(kernel.function_name_);
std::swap(program_, kernel.program_);
std::swap(kernel_, kernel.kernel_);
}
return *this;
}
CLKernel::~CLKernel() { Release(); }
Status CLKernel::ReInit() const {
clReleaseKernel(kernel_);
cl_kernel* kern_ptr = const_cast<cl_kernel*>(&kernel_);
int error_code;
*kern_ptr = clCreateKernel(program_, function_name_.c_str(), &error_code);
if (!kernel_ || error_code != CL_SUCCESS) {
*kern_ptr = nullptr;
return UnknownError(absl::StrCat("Failed to create ", function_name_,
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
void CLKernel::Release() {
if (kernel_) {
clReleaseKernel(kernel_);
clReleaseProgram(program_);
kernel_ = nullptr;
}
}
Status CLKernel::CreateFromProgram(const CLProgram& program,
const std::string& function_name) {
int error_code;
function_name_ = function_name;
kernel_ =
clCreateKernel(program.program(), function_name.c_str(), &error_code);
if (!kernel_ || error_code != CL_SUCCESS) {
kernel_ = nullptr;
return UnknownError(absl::StrCat("Failed to create ", function_name,
CLErrorCodeToString(error_code)));
}
program_ = program.program();
clRetainProgram(program_);
RETURN_IF_ERROR(GetKernelPrivateMemorySize(kernel_, program.GetDeviceId(),
&private_memory_size_));
RETURN_IF_ERROR(GetKernelMaxWorkGroupSize(kernel_, program.GetDeviceId(),
&max_work_group_size_));
return OkStatus();
}
Status CLKernel::SetMemory(int index, cl_mem memory) {
return SetBytes(index, &memory, sizeof(cl_mem));
}
Status CLKernel::SetMemoryAuto(cl_mem memory) {
return SetBytesAuto(&memory, sizeof(cl_mem));
}
Status CLKernel::SetBytes(int index, const void* ptr, int length) const {
const int error_code = clSetKernelArg(kernel_, index, length, ptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLKernel::SetBytesAuto(const void* ptr, int length) {
const int error_code = clSetKernelArg(kernel_, binding_counter_, length, ptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
CLErrorCodeToString(error_code),
"(at index - ", binding_counter_, ")"));
}
binding_counter_++;
return OkStatus();
}
template <>
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const {
return SetBytes(index, value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const {
return SetBytes(index, value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytesAuto<FLT>(const FLT& value) {
return SetBytesAuto(value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value) {
return SetBytesAuto(value.GetData(), value.GetSize());
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Argument binding to a CLKernel can be manual or automatic.
// In manual binding you specify the binding index explicitly.
// In automatic binding, the index is auto-incremented with every binding call.
// If you use automatic mode, you must call ResetBindingCounter before binding
// the parameters.
class CLKernel {
public:
CLKernel() {}
// Move only
CLKernel(CLKernel&& kernel);
CLKernel& operator=(CLKernel&& kernel);
CLKernel(const CLKernel&) = delete;
CLKernel& operator=(const CLKernel&) = delete;
~CLKernel();
cl_kernel kernel() const { return kernel_; }
Status CreateFromProgram(const CLProgram& program,
const std::string& function_name);
Status SetMemory(int index, cl_mem memory);
Status SetMemoryAuto(cl_mem memory);
template <typename T>
Status SetBytes(int index, const T& value) const {
return SetBytes(index, static_cast<const void*>(&value), sizeof(T));
}
template <typename T>
Status SetBytesAuto(const T& value) {
return SetBytesAuto(static_cast<const void*>(&value), sizeof(T));
}
int GetPrivateMemorySize() const { return private_memory_size_; }
int GetMaxWorkGroupSize() const { return max_work_group_size_; }
void ResetBindingCounter() { binding_counter_ = 0; }
// Do not use this function; it is a workaround for a memory leak on Mali.
Status ReInit() const;
private:
void Release();
Status SetBytes(int index, const void* ptr, int length) const;
Status SetBytesAuto(const void* ptr, int length);
int private_memory_size_;
int max_work_group_size_;
int binding_counter_ = -1;
std::string function_name_;
// Reference to the program from which the kernel was created.
cl_program program_ = nullptr;
cl_kernel kernel_ = nullptr;
};
template <>
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const;
template <>
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const;
template <>
Status CLKernel::SetBytesAuto<FLT>(const FLT& value);
template <>
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
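
To make the two binding modes concrete, here is a hedged sketch that binds the same arguments manually and then automatically (the argument layout is a placeholder):

```cpp
// Sketch: manual vs. automatic argument binding on a CLKernel.
Status BindArguments(CLKernel* kernel, cl_mem src, cl_mem dst, int size) {
  // Manual binding: indices are specified explicitly.
  RETURN_IF_ERROR(kernel->SetMemory(0, src));
  RETURN_IF_ERROR(kernel->SetMemory(1, dst));
  RETURN_IF_ERROR(kernel->SetBytes(2, size));

  // Automatic binding: reset the counter first, then bind in order.
  kernel->ResetBindingCounter();
  RETURN_IF_ERROR(kernel->SetMemoryAuto(src));
  RETURN_IF_ERROR(kernel->SetMemoryAuto(dst));
  RETURN_IF_ERROR(kernel->SetBytesAuto(size));
  return OkStatus();
}
```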

View File

@ -0,0 +1,37 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_memory.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_mem_flags ToClMemFlags(AccessType access_type) {
switch (access_type) {
case AccessType::READ:
return CL_MEM_READ_ONLY;
case AccessType::WRITE:
return CL_MEM_WRITE_ONLY;
case AccessType::READ_WRITE:
return CL_MEM_READ_WRITE;
}
return CL_MEM_READ_ONLY; // unreachable
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// RAII wrapper for OpenCL memory object.
//
// CLMemory is movable but not copyable.
class CLMemory {
public:
// Creates invalid object.
CLMemory() : CLMemory(nullptr, false) {}
CLMemory(cl_mem memory, bool has_ownership)
: memory_(memory), has_ownership_(has_ownership) {}
// Move-only
CLMemory(const CLMemory&) = delete;
CLMemory& operator=(const CLMemory&) = delete;
CLMemory(CLMemory&& image)
: memory_(image.memory_), has_ownership_(image.has_ownership_) {
image.memory_ = nullptr;
}
~CLMemory() { Invalidate(); }
CLMemory& operator=(CLMemory&& image) {
if (this != &image) {
Invalidate();
std::swap(memory_, image.memory_);
has_ownership_ = image.has_ownership_;
}
return *this;
}
cl_mem memory() const { return memory_; }
bool is_valid() const { return memory_ != nullptr; }
// @return true if this object actually owns the corresponding CL memory
// and manages its lifetime.
bool has_ownership() const { return has_ownership_; }
cl_mem Release() {
cl_mem to_return = memory_;
memory_ = nullptr;
return to_return;
}
private:
void Invalidate() {
if (memory_ && has_ownership_) {
clReleaseMemObject(memory_);
}
memory_ = nullptr;
}
cl_mem memory_ = nullptr;
bool has_ownership_ = false;
};
cl_mem_flags ToClMemFlags(AccessType access_type);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
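
As an illustration of the ownership flag, a hedged sketch that creates a buffer and hands its lifetime to CLMemory; it assumes the util.h and absl string helpers used by the neighboring files are available:

```cpp
// Sketch: create a read-only device buffer and let CLMemory own it.
Status CreateReadOnlyBuffer(const CLContext& context, size_t size_in_bytes,
                            CLMemory* result) {
  cl_int error_code;
  cl_mem buffer =
      clCreateBuffer(context.context(), ToClMemFlags(AccessType::READ),
                     size_in_bytes, nullptr, &error_code);
  if (!buffer) {
    return UnknownError(absl::StrCat("Failed to create buffer - ",
                                     CLErrorCodeToString(error_code)));
  }
  // has_ownership == true: the wrapper will call clReleaseMemObject.
  *result = CLMemory(buffer, /*has_ownership=*/true);
  return OkStatus();
}
```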

View File

@ -0,0 +1,186 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include <cstdint>
#include <cstring>
#include <vector>
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetProgramBuildInfo(cl_program program, cl_device_id id,
cl_program_build_info info) {
size_t size;
cl_int error_code =
clGetProgramBuildInfo(program, id, info, 0, nullptr, &size);
if (error_code != CL_SUCCESS) {
return absl::StrCat("Failed to GetProgramBuildInfo - ",
CLErrorCodeToString(error_code));
}
std::string result(size - 1, 0);
error_code =
clGetProgramBuildInfo(program, id, info, size, &result[0], nullptr);
if (error_code != CL_SUCCESS) {
return absl::StrCat("Failed to GetProgramBuildInfo - ",
CLErrorCodeToString(error_code));
}
return result;
}
Status GetBinarySize(cl_program program, size_t* binary_size) {
cl_int error_code = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t), binary_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to get program binary size - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status BuildProgram(cl_program program, const CLDevice& device,
const std::string& compiler_options) {
const int error_code = clBuildProgram(
program, 0, nullptr, compiler_options.c_str(), nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat(
"Failed to build program executable - ",
CLErrorCodeToString(error_code),
GetProgramBuildInfo(program, device.id(), CL_PROGRAM_BUILD_LOG)));
}
return OkStatus();
}
std::string CompilerOptionToString(const CLDevice& device,
CompilerOptions option) {
switch (option) {
case CompilerOptions::ADRENO_FULL_SIMD_LINE:
if (device.GetInfo().adreno_info.gpu_version < 500) {
return "-qcom-accelerate-16-bit";
} else {
return "-qcom-accelerate-16-bit=true";
}
}
}
} // namespace
std::string CompilerOptionsToString(
const CLDevice& device,
const std::vector<CompilerOptions>& compiler_options) {
std::string result;
for (auto option : compiler_options) {
absl::StrAppend(&result, CompilerOptionToString(device, option), " ");
}
return result;
}
CLProgram::CLProgram(cl_program program, cl_device_id device_id)
: program_(program), device_id_(device_id) {}
CLProgram::CLProgram(CLProgram&& program)
: program_(program.program_), device_id_(program.device_id_) {
program.program_ = nullptr;
}
CLProgram& CLProgram::operator=(CLProgram&& program) {
if (this != &program) {
Release();
std::swap(program_, program.program_);
std::swap(device_id_, program.device_id_);
}
return *this;
}
CLProgram::~CLProgram() { Release(); }
void CLProgram::Release() {
if (program_) {
clReleaseProgram(program_);
program_ = nullptr;
}
}
Status CLProgram::GetBinary(std::vector<uint8_t>* result) const {
size_t binary_size;
RETURN_IF_ERROR(GetBinarySize(program_, &binary_size));
result->resize(result->size() + binary_size);
uint8_t* binary_ptr = result->data() + result->size() - binary_size;
cl_int error_code = clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
binary_size, &binary_ptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to get program binary - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result) {
int error_code;
const char* source = code.c_str();
cl_program program = clCreateProgramWithSource(context.context(), 1, &source,
nullptr, &error_code);
if (!program || error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to create compute program - ",
CLErrorCodeToString(error_code)));
}
*result = CLProgram(program, device.id());
RETURN_IF_ERROR(BuildProgram(program, device, compiler_options));
return OkStatus();
}
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result) {
cl_int binary_status;
cl_int error_code;
cl_device_id devices_list[] = {device.id()};
size_t binary_size = binary.size();
const uint8_t* binary_pointer = binary.data();
cl_program program = clCreateProgramWithBinary(
context.context(), 1, devices_list, &binary_size, &binary_pointer,
&binary_status, &error_code);
if (binary_status != CL_SUCCESS) {
return UnknownError(absl::StrCat(
"Something wrong with binary after clCreateProgramWithBinary - ",
binary_status));
}
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to create program - ",
CLErrorCodeToString(error_code)));
}
*result = CLProgram(program, device.id());
return BuildProgram(program, device, "");
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
#include <cstdint>
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// ADRENO_FULL_SIMD_LINE:
// Adreno can have two SIMD sizes.
// On Adreno 4xx/5xx it is 32/64, on Adreno 6xx it is 64/128.
// Some of our algorithms rely on the exact size, for example on the full
// SIMD size, so we need this option.
// The option is actually -qcom-accelerate-16-bit, but it controls the SIMD size.
enum class CompilerOptions { ADRENO_FULL_SIMD_LINE };
std::string CompilerOptionsToString(
const CLDevice& device,
const std::vector<CompilerOptions>& compiler_options);
class CLProgram {
public:
CLProgram() {}
CLProgram(cl_program program, cl_device_id device_id);
// Move only
CLProgram(CLProgram&& program);
CLProgram& operator=(CLProgram&& program);
CLProgram(const CLProgram&) = delete;
CLProgram& operator=(const CLProgram&) = delete;
~CLProgram();
cl_program program() const { return program_; }
// Returns the cl_device_id associated with the program object.
// This can be the device associated with the context on which the program
// object was created, or the device that was specified when the program
// object was created with clCreateProgramWithBinary.
cl_device_id GetDeviceId() const { return device_id_; }
Status GetBinary(std::vector<uint8_t>* result) const;
private:
void Release();
cl_program program_ = nullptr;
// reference
cl_device_id device_id_ = nullptr;
};
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result);
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
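
The binary API above is intended for caching compiled programs between runs; a hedged sketch of the round trip (where the binary is persisted is left out):

```cpp
// Sketch: compile once, extract the binary, and recreate the program from it.
Status RoundTripProgram(const CLContext& context, const CLDevice& device,
                        const std::string& code) {
  CLProgram program;
  RETURN_IF_ERROR(CreateCLProgram(code, /*compiler_options=*/"", context,
                                  device, &program));
  std::vector<uint8_t> binary;
  RETURN_IF_ERROR(program.GetBinary(&binary));
  // Later (e.g. on the next application start) the stored binary can be
  // reused, skipping source compilation.
  CLProgram from_binary;
  return CreateCLProgramFromBinary(context, device,
                                   absl::MakeConstSpan(binary), &from_binary);
}
```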

View File

@ -0,0 +1,17 @@
namespace tflite.gpu.cl.data;
file_identifier "AFCM";
file_extension "jetbin";
table Program {
fingerprint:uint64;
binary:[ubyte];
}
table CompiledCache {
driver_version:string;
programs:[Program];
}
root_type CompiledCache;

View File

@ -0,0 +1,71 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
namespace tflite {
namespace gpu {
namespace cl {
Status EglSync::NewFence(EGLDisplay display, EglSync* sync) {
EGLSyncKHR egl_sync;
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglCreateSyncKHR, &egl_sync, display,
EGL_SYNC_FENCE_KHR, nullptr));
if (egl_sync == EGL_NO_SYNC_KHR) {
return InternalError("Returned empty KHR EGL sync");
}
*sync = EglSync(display, egl_sync);
return OkStatus();
}
EglSync& EglSync::operator=(EglSync&& sync) {
if (this != &sync) {
Invalidate();
std::swap(sync_, sync.sync_);
display_ = sync.display_;
}
return *this;
}
void EglSync::Invalidate() {
if (sync_ != EGL_NO_SYNC_KHR) {
eglDestroySyncKHR(display_, sync_);
sync_ = EGL_NO_SYNC_KHR;
}
}
Status EglSync::ServerWait() {
EGLint result;
RETURN_IF_ERROR(
TFLITE_GPU_CALL_EGL(eglWaitSyncKHR, &result, display_, sync_, 0));
return result == EGL_TRUE ? OkStatus() : InternalError("eglWaitSync failed");
}
Status EglSync::ClientWait() {
EGLint result;
// TODO(akulik): make it active wait for better performance
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglClientWaitSyncKHR, &result, display_,
sync_, EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
EGL_FOREVER_KHR));
return result == EGL_CONDITION_SATISFIED_KHR
? OkStatus()
: InternalError("eglClientWaitSync failed");
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,78 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// RAII wrapper for EGL sync object.
// EglSync is moveable but not copyable.
class EglSync {
public:
// Creates a fence in OpenGL command stream. This sync is enqueued and *not*
// flushed.
//
// Depends on EGL_KHR_fence_sync extension.
static Status NewFence(EGLDisplay display, EglSync* sync);
// Creates invalid object.
EglSync() : EglSync(EGL_NO_DISPLAY, EGL_NO_SYNC_KHR) {}
EglSync(EGLDisplay display, EGLSyncKHR sync)
: display_(display), sync_(sync) {}
// Move-only
EglSync(EglSync&& sync);
EglSync& operator=(EglSync&& sync);
EglSync(const EglSync&) = delete;
EglSync& operator=(const EglSync&) = delete;
~EglSync() { Invalidate(); }
// Causes GPU to block and wait until this sync has been signaled.
// This call does not block and returns immediately.
Status ServerWait();
// Causes CPU to block and wait until this sync has been signaled.
Status ClientWait();
// Returns the EGLDisplay on which this instance was created.
EGLDisplay display() const { return display_; }
// Returns the EGLSyncKHR wrapped by this instance.
EGLSyncKHR sync() const { return sync_; }
// Returns true if this instance wraps a valid EGLSync object.
bool is_valid() const { return sync_ != nullptr; }
private:
void Invalidate();
EGLDisplay display_;
EGLSyncKHR sync_;
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
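
A typical CL-GL synchronization pattern with this wrapper looks roughly like the following sketch; the display handle is a placeholder obtained from the GL side:

```cpp
// Sketch: insert a GL fence and wait for it before touching shared data.
Status SyncGlBeforeCl(EGLDisplay display) {
  EglSync sync;
  RETURN_IF_ERROR(EglSync::NewFence(display, &sync));
  // Block the calling thread until all GL commands issued before the fence
  // have completed; ServerWait() would instead block the GPU stream.
  return sync.ClientWait();
}
```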

View File

@ -0,0 +1,240 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
CalculationsPrecision GetPossiblePrecision(
const CLDevice& gpu, CalculationsPrecision desired_precision) {
if (!gpu.SupportsFP16() && desired_precision != CalculationsPrecision::F32) {
return CalculationsPrecision::F32;
}
return desired_precision;
}
std::string GetKernelOneLayerTextureArray() {
return R"(
__kernel void main_function(__write_only image2d_array_t dst) {
int X = (int)(get_global_id(0));
int Y = (int)(get_global_id(1));
write_imagef(dst, (int4)(X, Y, 0, 0), (float4)(2.0, 2.0, 2.0, 2.0));
}
)";
}
// Some Adreno < 600 devices have a bug with one-layer texture arrays. b/131099086
// If we have a one-layer texture array and write something from a kernel to
// this texture, we get zeroes instead of the actual values.
// The same kernel works if we use a texture array with more than one layer.
// This code lets us detect the bug.
Status CheckKernelSupportOfOneLayerTextureArray(Environment* env,
bool* result) {
// No bug on Adreno 6xx
if (env->device().GetInfo().adreno_info.gpu_version >= 600) {
*result = true;
return OkStatus();
}
CLKernel kernel;
RETURN_IF_ERROR(CreateKernel(GetKernelOneLayerTextureArray(), "main_function",
env, &kernel));
Tensor tensor;
RETURN_IF_ERROR(CreateTensor(env->context(), env->device(), 4, 4, 4,
DataType::FLOAT32,
TensorStorageType::TEXTURE_ARRAY, &tensor));
RETURN_IF_ERROR(kernel.SetMemory(0, tensor.GetMemoryPtr()));
RETURN_IF_ERROR(env->queue()->DispatchImplicit(kernel, {4, 4, 1}, {4, 4, 1}));
std::vector<float> cpu_data(64, 0.0f);
RETURN_IF_ERROR(tensor.ReadDataBHWC(absl::MakeSpan(cpu_data), env->queue()));
*result = true;
for (int i = 0; i < 64; ++i) {
if (cpu_data[i] != 2.0) {
*result = false;
break;
}
}
return OkStatus();
}
Status CreateEnvironment(Environment* result, bool shared,
cl_context_properties egl_context,
cl_context_properties egl_display) {
CLDevice gpu;
RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu));
CLContext context;
if (shared) {
RETURN_IF_ERROR(CreateCLGLContext(gpu, egl_context, egl_display, &context));
} else {
RETURN_IF_ERROR(CreateCLContext(gpu, &context));
}
CLCommandQueue queue;
RETURN_IF_ERROR(CreateCLCommandQueue(gpu, context, &queue));
ProfilingCommandQueue profiling_queue;
RETURN_IF_ERROR(CreateProfilingCommandQueue(gpu, context, &profiling_queue));
*result = Environment(std::move(gpu), std::move(context), std::move(queue),
std::move(profiling_queue));
if (result->device().IsAdreno() && result->device().SupportsTextureArray()) {
bool supports_one_layer;
RETURN_IF_ERROR(
CheckKernelSupportOfOneLayerTextureArray(result, &supports_one_layer));
if (!supports_one_layer) {
result->GetDevicePtr()->DisableOneLayerTextureArray();
}
}
return OkStatus();
}
} // namespace
Environment::Environment(CLDevice&& device, CLContext&& context,
CLCommandQueue&& queue,
ProfilingCommandQueue&& profiling_queue)
: device_(std::move(device)),
context_(std::move(context)),
queue_(std::move(queue)),
profiling_queue_(std::move(profiling_queue)) {}
Environment::Environment(Environment&& environment)
: device_(std::move(environment.device_)),
context_(std::move(environment.context_)),
queue_(std::move(environment.queue_)),
profiling_queue_(std::move(environment.profiling_queue_)),
program_cache_(std::move(environment.program_cache_)) {}
Environment& Environment::operator=(Environment&& environment) {
if (this != &environment) {
device_ = std::move(environment.device_);
context_ = std::move(environment.context_);
queue_ = std::move(environment.queue_);
profiling_queue_ = std::move(environment.profiling_queue_);
program_cache_ = std::move(environment.program_cache_);
}
return *this;
}
void Environment::SetHighPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
void Environment::SetDefaultPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
void Environment::SetLowPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
std::vector<CalculationsPrecision> Environment::GetSupportedPrecisions() const {
std::vector<CalculationsPrecision> precisions;
for (CalculationsPrecision precision :
{CalculationsPrecision::F32, CalculationsPrecision::F32_F16,
CalculationsPrecision::F16}) {
if (IsSupported(precision)) {
precisions.push_back(precision);
}
}
return precisions;
}
bool Environment::IsSupported(CalculationsPrecision precision) const {
switch (precision) {
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
return device_.SupportsFP16();
case CalculationsPrecision::F32:
return true;
}
}
std::vector<TensorStorageType> Environment::GetSupportedTextureStorages()
const {
std::vector<TensorStorageType> storage_types = {
TensorStorageType::TEXTURE_2D};
if (device_.SupportsTextureArray()) {
storage_types.push_back(TensorStorageType::TEXTURE_ARRAY);
}
return storage_types;
}
std::vector<TensorStorageType> Environment::GetSupportedStorages() const {
std::vector<TensorStorageType> storage_types = {TensorStorageType::TEXTURE_2D,
TensorStorageType::BUFFER};
if (device_.SupportsTextureArray()) {
storage_types.push_back(TensorStorageType::TEXTURE_ARRAY);
}
return storage_types;
}
TensorStorageType GetOptimalStorageType(const CLDevice& gpu) {
TensorStorageType storage_type;
if (gpu.vendor() != Vendor::QUALCOMM) {
storage_type = TensorStorageType::BUFFER;
} else {
if (gpu.IsAdreno6xxOrHigher()) {
storage_type = TensorStorageType::TEXTURE_ARRAY;
} else {
storage_type = TensorStorageType::TEXTURE_2D;
}
}
return storage_type;
}
Status CreateDefaultEnvironment(Environment* result) {
return CreateEnvironment(result, false, 0, 0);
}
Status CreateEnvironment(Environment* result) {
return CreateEnvironment(result, false, 0, 0);
}
Status CreateGLCompatibleEnvironment(cl_context_properties egl_context,
cl_context_properties egl_display,
Environment* result) {
return CreateEnvironment(result, true, egl_context, egl_display);
}
Status CreateKernel(const std::string& code, const std::string& function_name,
Environment* env, CLKernel* result) {
return CreateKernel(code, function_name, {}, env, result);
}
Status CreateKernel(const std::string& code, const std::string& function_name,
const std::vector<CompilerOptions>& compiler_options,
Environment* env, CLKernel* result) {
return env->program_cache()->GetOrCreateCLKernel(
code, function_name, compiler_options, env->context(), env->device(),
result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,93 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/program_cache.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
class Environment {
public:
Environment() = default;
explicit Environment(CLDevice&& device, CLContext&& context,
CLCommandQueue&& queue,
ProfilingCommandQueue&& profiling_queue);
// Move only
Environment(Environment&& environment);
Environment& operator=(Environment&& environment);
Environment(const Environment&) = delete;
Environment& operator=(const Environment&) = delete;
const CLDevice& device() const { return device_; }
CLDevice* GetDevicePtr() { return &device_; }
const CLDevice* GetDevicePtr() const { return &device_; }
CLContext& context() { return context_; }
CLCommandQueue* queue() { return &queue_; }
ProfilingCommandQueue* profiling_queue() { return &profiling_queue_; }
ProgramCache* program_cache() { return &program_cache_; }
const ProgramCache* program_cache() const { return &program_cache_; }
std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
bool IsSupported(CalculationsPrecision precision) const;
std::vector<TensorStorageType> GetSupportedTextureStorages() const;
std::vector<TensorStorageType> GetSupportedStorages() const;
void SetHighPerformance() const;
void SetDefaultPerformance() const;
void SetLowPerformance() const; // for energy saving
private:
CLDevice device_;
CLContext context_;
CLCommandQueue queue_;
ProfilingCommandQueue profiling_queue_;
ProgramCache program_cache_;
};
TensorStorageType GetOptimalStorageType(const CLDevice& gpu);
Status CreateDefaultEnvironment(Environment* result);
Status CreateEnvironment(Environment* result);
Status CreateGLCompatibleEnvironment(cl_context_properties egl_context,
cl_context_properties egl_display,
Environment* result);
Status CreateKernel(const std::string& code, const std::string& function_name,
Environment* env, CLKernel* result);
Status CreateKernel(const std::string& code, const std::string& function_name,
const std::vector<CompilerOptions>& compiler_options,
Environment* env, CLKernel* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
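A hedged usage sketch for the environment API above (illustrative only). It creates a default environment, checks FP16 support, and compiles a trivial kernel through the program cache; the kernel source and the "noop" name are assumptions, not part of the library.

#include <string>

#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper: compiles a do-nothing kernel via the program cache.
Status CompileNoopKernel(Environment* env, CLKernel* kernel) {
  // Pick a precision the device actually supports; this would normally feed
  // into an OperationDef or CreateInferenceInfo elsewhere.
  const CalculationsPrecision precision =
      env->IsSupported(CalculationsPrecision::F16)
          ? CalculationsPrecision::F16
          : CalculationsPrecision::F32;
  (void)precision;
  // Illustrative OpenCL C source; any valid kernel works here.
  const std::string code = "__kernel void noop() {}";
  return CreateKernel(code, "noop", env, kernel);
}

Status Example() {
  Environment env;
  RETURN_IF_ERROR(CreateEnvironment(&env));
  CLKernel kernel;
  return CompileNoopKernel(&env, &kernel);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite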

View File

@ -0,0 +1,259 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_sync.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// TODO(b/131897059): replace with the 64 version when EGL 1.5 is available.
// It should use the KHR_cl_event2 extension. More details are in b/129974818.
using PFNEGLCREATESYNCPROC = EGLSync(EGLAPIENTRYP)(
EGLDisplay dpy, EGLenum type, const EGLAttrib* attrib_list);
PFNEGLCREATESYNCPROC g_eglCreateSync = nullptr;
} // namespace
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync) {
if (!IsEglSyncFromClEventSupported()) {
return UnimplementedError("CreateEglSyncFromClEvent is not supported");
}
EGLSync egl_sync;
const EGLAttrib attributes[] = {EGL_CL_EVENT_HANDLE,
reinterpret_cast<EGLAttrib>(event), EGL_NONE};
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(g_eglCreateSync, &egl_sync, display,
EGL_SYNC_CL_EVENT, attributes));
if (egl_sync == EGL_NO_SYNC) {
return InternalError("Returned empty EGL sync");
}
*sync = EglSync(display, egl_sync);
return OkStatus();
}
bool IsEglSyncFromClEventSupported() {
// In C++11, static initializers are guaranteed to be evaluated only once.
static bool supported = []() -> bool {
// This function requires EGL 1.5 to work
g_eglCreateSync = reinterpret_cast<PFNEGLCREATESYNCPROC>(
eglGetProcAddress("eglCreateSync"));
// eglQueryString accepts EGL_NO_DISPLAY only starting EGL 1.5
if (!eglQueryString(EGL_NO_DISPLAY, EGL_EXTENSIONS)) {
g_eglCreateSync = nullptr;
}
return (g_eglCreateSync != nullptr);
}();
return supported;
}
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event) {
cl_int error_code;
cl_event new_event = clCreateEventFromEGLSyncKHR(
context, egl_sync.sync(), egl_sync.display(), &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to create CL sync from EGL sync. ",
CLErrorCodeToString(error_code)));
}
*event = CLEvent(new_event);
return OkStatus();
}
bool IsClEventFromEglSyncSupported(const CLDevice& device) {
return device.SupportsExtension("cl_khr_egl_event");
}
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory) {
cl_int error_code;
auto mem = clCreateFromGLBuffer(context->context(), ToClMemFlags(access_type),
gl_ssbo_id, &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to acquire CL buffer from GL buffer. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return OkStatus();
}
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory) {
cl_int error_code;
auto mem =
clCreateFromGLTexture(context->context(), ToClMemFlags(access_type),
texture_target, 0, texture_id, &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to create CL buffer from GL texture. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return OkStatus();
}
bool IsGlSharingSupported(const CLDevice& device) {
return clCreateFromGLBuffer && clCreateFromGLTexture &&
device.SupportsExtension("cl_khr_gl_sharing");
}
AcquiredGlObjects::~AcquiredGlObjects() { Release({}, nullptr).IgnoreError(); }
Status AcquiredGlObjects::Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event,
AcquiredGlObjects* objects) {
if (!memory.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueAcquireGLObjects(
queue, memory.size(), memory.data(), wait_events.size(),
wait_events.data(), acquire_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return InternalError(absl::StrCat("Unable to acquire GL object. ",
CLErrorCodeToString(error_code)));
}
if (acquire_event) {
*acquire_event = CLEvent(new_event);
}
clFlush(queue);
}
*objects = AcquiredGlObjects(memory, queue);
return OkStatus();
}
Status AcquiredGlObjects::Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event) {
if (queue_ && !memory_.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueReleaseGLObjects(
queue_, memory_.size(), memory_.data(), wait_events.size(),
wait_events.data(), release_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return InternalError(absl::StrCat("Unable to release GL object. ",
CLErrorCodeToString(error_code)));
}
if (release_event) {
*release_event = CLEvent(new_event);
}
clFlush(queue_);
queue_ = nullptr;
}
return OkStatus();
}
GlInteropFabric::GlInteropFabric(EGLDisplay egl_display,
Environment* environment)
: is_egl_sync_supported_(true),
is_egl_to_cl_mapping_supported_(
IsClEventFromEglSyncSupported(environment->device())),
is_cl_to_egl_mapping_supported_(IsEglSyncFromClEventSupported()),
egl_display_(egl_display),
context_(environment->context().context()),
queue_(environment->queue()->queue()) {}
void GlInteropFabric::RegisterMemory(cl_mem memory) {
memory_.push_back(memory);
}
void GlInteropFabric::UnregisterMemory(cl_mem memory) {
auto it = std::find(memory_.begin(), memory_.end(), memory);
if (it != memory_.end()) {
memory_.erase(it);
}
}
Status GlInteropFabric::Start() {
if (!is_enabled()) {
return OkStatus();
}
// In GL-CL interoperability, we need to make sure GL has finished processing
// all commands that might affect GL objects. There are a few ways to do so:
//   a) glFinish
//      slow, but portable
//   b) EglSync + ClientWait
//      a faster alternative to glFinish, but still slow as it stalls the GPU
//      pipeline.
//   c) EglSync->CLEvent or GlSync->CLEvent mapping
//      fast, as it maps the sync to a CL event and uses it as a dependency
//      later without stalling the GPU pipeline.
if (is_egl_sync_supported_) {
EglSync sync;
RETURN_IF_ERROR(EglSync::NewFence(egl_display_, &sync));
if (is_egl_to_cl_mapping_supported_) {
// (c) EglSync->CLEvent or GlSync->CLEvent mapping
glFlush();
RETURN_IF_ERROR(
CreateClEventFromEglSync(context_, sync, &inbound_event_));
} else {
// (b) EglSync + ClientWait
RETURN_IF_ERROR(sync.ClientWait());
}
} else {
// (a) glFinish / GL fence sync
RETURN_IF_ERROR(gl::GlActiveSyncWait());
}
// Acquire all GL objects needed while processing.
auto make_acquire_wait = [&]() -> std::vector<cl_event> {
if (inbound_event_.is_valid()) {
return {inbound_event_.event()};
}
return {};
};
return AcquiredGlObjects::Acquire(memory_, queue_, make_acquire_wait(),
nullptr, &gl_objects_);
}
Status GlInteropFabric::Finish() {
if (!is_enabled()) {
return OkStatus();
}
RETURN_IF_ERROR(gl_objects_.Release({}, &outbound_event_));
// if (is_egl_sync_supported_ && is_cl_to_egl_mapping_supported_) {
// EglSync egl_outbound_sync;
// RETURN_IF_ERROR(CreateEglSyncFromClEvent(outbound_event_.event(),
// egl_display_,
// &egl_outbound_sync));
// // Instruct GL pipeline to wait until corresponding CL event is signaled.
// RETURN_IF_ERROR(egl_outbound_sync.ServerWait());
// glFlush();
// } else {
// // Slower option if proper sync is not supported. It is equivalent to
// // clFinish, but, hopefully, faster.
// outbound_event_.Wait();
// }
// This slow sync is the only working solution right now. We have to debug why
// the version above does not work fast and reliably.
outbound_event_.Wait();
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,144 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
#include <vector>
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_memory.h"
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
namespace tflite {
namespace gpu {
namespace cl {
// Creates an EglSync from an OpenCL event. The source event does not need to
// outlive the returned sync and can be safely destroyed.
//
// Depends on EGL 1.5.
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync);
// Returns true if 'CreateEglSyncFromClEvent' is supported.
bool IsEglSyncFromClEventSupported();
// Creates CL event from EGL sync.
// The created event can only be consumed by an AcquiredGlObjects::Acquire call
// as a 'wait_event'.
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event);
// Returns true if 'CreateClEventFromEglSync' is supported.
bool IsClEventFromEglSyncSupported(const CLDevice& device);
// Creates new CL memory object from OpenGL buffer.
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory);
// Creates new CL memory object from OpenGL texture.
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory);
// Returns true if GL objects could be shared with OpenCL context.
bool IsGlSharingSupported(const CLDevice& device);
// RAII-wrapper for GL objects acquired into CL context.
class AcquiredGlObjects {
public:
static bool IsSupported(const CLDevice& device);
AcquiredGlObjects() : AcquiredGlObjects({}, nullptr) {}
// Quietly releases OpenGL objects. It is recommended to call Release()
// explicitly to properly handle potential errors.
~AcquiredGlObjects();
// Acquires memory from the OpenGL context. Memory must be created by either
// CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture calls.
// If 'acquire_event' is not nullptr, it will be signaled once acquisition is
// complete.
static Status Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event /* optional */,
AcquiredGlObjects* objects);
// Releases OpenCL memory back to the OpenGL context. If 'release_event' is not
// nullptr, it will be signaled once the release is complete.
Status Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event /* optional */);
private:
AcquiredGlObjects(const std::vector<cl_mem>& memory, cl_command_queue queue)
: memory_(memory), queue_(queue) {}
std::vector<cl_mem> memory_;
cl_command_queue queue_;
};
// Encapsulates all complicated GL-CL synchronization. It manages the lifetime
// of all appropriate events to ensure fast synchronization whenever possible.
class GlInteropFabric {
public:
GlInteropFabric(EGLDisplay egl_display, Environment* environment);
// Ensures proper GL->CL synchronization is in place before
// GL objects that are mapped to CL objects are used.
Status Start();
// Puts appropriate CL->GL synchronization after all work is complete.
Status Finish();
// Registers memory to be used from the GL context. Such a CL memory object
// must be created with a CreateClMemoryFromGlBuffer or
// CreateClMemoryFromGlTexture call.
void RegisterMemory(cl_mem memory);
// Unregisters memory registered with RegisterMemory call.
void UnregisterMemory(cl_mem memory);
private:
bool is_enabled() const { return egl_display_ && !memory_.empty(); }
bool is_egl_sync_supported_;
bool is_egl_to_cl_mapping_supported_;
bool is_cl_to_egl_mapping_supported_;
const EGLDisplay egl_display_;
cl_context context_;
cl_command_queue queue_;
CLEvent inbound_event_;
CLEvent outbound_event_;
std::vector<cl_mem> memory_;
AcquiredGlObjects gl_objects_; // transient during Start/Finish calls.
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
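A hedged sketch of the Start()/Finish() bracket implemented above (illustrative, not part of the original file). It assumes `env` was created with CreateGLCompatibleEnvironment and `shared_memory` is a raw cl_mem previously obtained through CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture; the actual OpenCL work is elided.

#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper showing the intended bracket around GPU work.
Status RunWithSharedGlObject(EGLDisplay display, Environment* env,
                             cl_mem shared_memory) {
  GlInteropFabric fabric(display, env);
  fabric.RegisterMemory(shared_memory);
  // Puts GL->CL synchronization in place and acquires registered GL objects.
  RETURN_IF_ERROR(fabric.Start());
  // ... enqueue OpenCL work that reads/writes `shared_memory` here ...
  // Releases the objects back to GL and performs CL->GL synchronization.
  return fabric.Finish();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite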

View File

@ -0,0 +1,367 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h"
#include <cstdint>
#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
InferencePriority ToPriority(int32_t priority) {
switch (priority) {
case TfLiteGpuInferencePriority::
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
return InferencePriority::MAX_PRECISION;
case TfLiteGpuInferencePriority::TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
return InferencePriority::MIN_LATENCY;
}
return InferencePriority::MAX_PRECISION;
}
DataType ToDataType(TfLiteType data_type) {
switch (data_type) {
case kTfLiteFloat16:
return DataType::FLOAT16;
case kTfLiteFloat32:
return DataType::FLOAT32;
default:
return DataType::UNKNOWN;
}
}
DataLayout ToDataLayoutFromTFL(TfLiteGpuDataLayout data_layout) {
switch (data_layout) {
case TFLITE_GPU_DATA_LAYOUT_BHWC:
return DataLayout::BHWC;
case TFLITE_GPU_DATA_LAYOUT_DHWC4:
return DataLayout::DHWC4;
default:
return DataLayout::UNKNOWN;
}
}
class Delegate {
public:
explicit Delegate(const TfLiteGpuDelegateOptions_New* options) {
if (options) {
options_ = *options;
} else {
// Default options.
options_.compile_options.precision_loss_allowed = 0;
options_.compile_options.inference_priority = TfLiteGpuInferencePriority::
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
options_.egl_display = eglGetCurrentDisplay();
options_.egl_context = eglGetCurrentContext();
options_.serialized_binary_cache_data = nullptr;
options_.serialized_binary_cache_size = 0;
}
}
Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
// Extract the TFLite delegate execution plan from the context and convert it
// into a GraphFloat32.
GraphFloat32 graph;
RETURN_IF_ERROR(BuildModel(context, delegate_params, &graph));
// Apply general transformations on the graph.
NullTransformationReporter reporter;
ModelTransformer transformer(&graph, &reporter);
if (!ApplyGeneralTransformations(&transformer)) {
return InternalError("Graph general transformations failed");
}
InferenceEnvironmentOptions env_options;
env_options.egl_context = options_.egl_context;
env_options.egl_display = options_.egl_display;
env_options.serialized_binary_cache = {
options_.serialized_binary_cache_data,
options_.serialized_binary_cache_size};
InferenceEnvironmentProperties properties;
Status status =
NewInferenceEnvironment(env_options, &environment_, &properties);
if (!properties.is_opencl_available) {
context->ReportError(context,
"TfLiteGpuDelegate: OpenCL is not available");
}
if (!properties.is_gl_sharing_supported) {
context->ReportError(context,
"TfLiteGpuDelegate: GL sharing is not supported");
}
if (!properties.is_cl_to_gl_fast_sync_supported) {
context->ReportError(
context, "TfLiteGpuDelegate: fast CL to GL sync is not supported");
}
if (!properties.is_gl_to_cl_fast_sync_supported) {
context->ReportError(
context, "TfLiteGpuDelegate: fast GL to CL sync is not supported");
}
RETURN_IF_ERROR(status);
InferenceOptions options;
options.priority = ToPriority(options_.compile_options.inference_priority);
options.allow_precision_loss =
options_.compile_options.precision_loss_allowed != 0;
std::unique_ptr<InferenceBuilder> builder;
RETURN_IF_ERROR(
environment_->NewInferenceBuilder(options, graph, &builder));
// At this point TFLite has not allocated tensors yet; therefore, collect
// indices and set all input and output tensors from TFLite later.
auto inputs = graph.inputs();
input_indices_.reserve(inputs.size());
for (auto input : inputs) {
auto tensor_index = input->tensor.ref;
int object_index = input_indices_.size();
input_indices_.push_back(tensor_index);
RETURN_IF_ERROR(
builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
}
auto outputs = graph.outputs();
output_indices_.reserve(outputs.size());
for (auto output : outputs) {
auto tensor_index = output->tensor.ref;
int object_index = output_indices_.size();
output_indices_.push_back(tensor_index);
RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
GetObjectDef(tensor_index)));
}
return builder->Build(&runner_);
}
Status SetInputsAndOutputs(TfLiteContext* context) {
int i = 0;
for (auto index : input_indices_) {
RETURN_IF_ERROR(
runner_->SetInputObject(i++, GetTensorObject(index, context)));
}
i = 0;
for (auto index : output_indices_) {
RETURN_IF_ERROR(
runner_->SetOutputObject(i++, GetTensorObject(index, context)));
}
return OkStatus();
}
Status Invoke(TfLiteContext* context) {
RETURN_IF_ERROR(SetInputsAndOutputs(context));
return runner_->Run();
}
void BindGlBufferToTensor(GLuint buffer_id, int tensor_index,
DataType data_type, DataLayout data_layout) {
// At this point the delegate hasn't seen a model yet, so just record which
// object gets assigned.
if (tensor_index >= tensors_.size()) {
tensors_.resize(tensor_index + 1);
}
TensorObjectDef def;
def.object_def.data_type = data_type;
def.object_def.data_layout = data_layout;
def.object_def.object_type = ObjectType::OPENGL_SSBO;
def.object_def.user_provided = true;
def.dimensions = Dimensions(0, 0, 0, 0);
OpenGlBuffer buffer;
buffer.id = buffer_id;
TensorObject obj = buffer;
tensors_[tensor_index] = std::make_pair(obj, def);
}
ObjectDef GetObjectDef(int index) const {
if (index < tensors_.size() && IsValid(tensors_[index].second)) {
return tensors_[index].second.object_def;
}
ObjectDef default_object_def;
default_object_def.data_type = DataType::FLOAT32;
default_object_def.data_layout = DataLayout::BHWC;
default_object_def.object_type = ObjectType::CPU_MEMORY;
default_object_def.user_provided = true;
return default_object_def;
}
TensorObject GetTensorObject(int index, TfLiteContext* context) const {
if (index < tensors_.size() &&
IsValid(tensors_[index].second, tensors_[index].first)) {
return tensors_[index].first;
}
auto& tensor = context->tensors[index];
return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
}
TfLiteDelegate* tflite_delegate() { return &delegate_; }
bool SupportsGlObjects() const {
return options_.egl_context != EGL_NO_CONTEXT &&
options_.egl_display != EGL_NO_DISPLAY;
}
absl::Span<const uint8_t> GetSerializedBinaryCache() {
binary_cache_ = environment_->GetSerializedBinaryCache();
return binary_cache_;
}
private:
TfLiteDelegate delegate_ = {
reinterpret_cast<void*>(this), // .data_
DelegatePrepare, // .Prepare
nullptr, // .CopyFromBufferHandle
nullptr, // .CopyToBufferHandle
nullptr, // .FreeBufferHandle
kTfLiteDelegateFlagsNone, // .flags
};
TfLiteGpuDelegateOptions_New options_;
std::unique_ptr<InferenceEnvironment> environment_;
std::unique_ptr<InferenceRunner> runner_;
std::vector<int64_t> input_indices_;
std::vector<int64_t> output_indices_;
std::vector<uint8_t> binary_cache_;
std::vector<std::pair<TensorObject, TensorObjectDef>> tensors_;
};
inline Delegate* GetDelegate(TfLiteNode* node) {
return reinterpret_cast<Delegate*>(node->user_data);
}
inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
return reinterpret_cast<Delegate*>(delegate->data_);
}
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
const TfLiteRegistration kRegistration = {
// .init
[](TfLiteContext* context, const char* buffer, size_t) -> void* {
const auto* params =
reinterpret_cast<const TfLiteDelegateParams*>(buffer);
auto* gpu_delegate = GetDelegate(params->delegate);
// Everything below should happen in the prepare function call, but TFLite
// for whatever reason forbids that.
const auto status = gpu_delegate->Prepare(context, params);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Init: %s",
status.error_message().c_str());
return nullptr;
}
return gpu_delegate;
},
// .free
[](TfLiteContext*, void* buffer) -> void {},
// .prepare
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
if (!node->user_data) {
context->ReportError(
context,
"TfLiteGpuDelegate Prepare: delegate is not initialized");
return kTfLiteError;
}
// TODO(akulik): tflite tensors are not allocated here either. It would
// be good to set inputs and outputs only once here instead of setting
// them every time in .invoke.
return kTfLiteOk;
},
// .invoke
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
const auto status = GetDelegate(node)->Invoke(context);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Invoke: %s",
status.error_message().c_str());
return kTfLiteError;
}
return kTfLiteOk;
},
nullptr, // .profiling_string
0, // .builtin_code
"TfLiteGpuDelegate_New", // .custom_name
1, // .version
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
context, kRegistration, ops_to_replace, delegate);
TfLiteIntArrayFree(ops_to_replace);
return status;
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
TfLiteDelegate* TfLiteGpuDelegateCreate_New(
const TfLiteGpuDelegateOptions_New* options) {
auto* gpu_delegate = new tflite::gpu::cl::Delegate(options);
return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}
void TfLiteGpuDelegateDelete_New(TfLiteDelegate* delegate) {
delete tflite::gpu::cl::GetDelegate(delegate);
}
TFL_CAPI_EXPORT TfLiteStatus TfLiteGpuDelegateBindGlBufferToTensor(
TfLiteDelegate* delegate, GLuint buffer_id, int tensor_index,
TfLiteType data_type, TfLiteGpuDataLayout data_layout) {
auto* gpu_delegate = tflite::gpu::cl::GetDelegate(delegate);
if (!gpu_delegate) {
return kTfLiteError;
}
if (!gpu_delegate->SupportsGlObjects()) {
return kTfLiteError;
}
auto type = tflite::gpu::cl::ToDataType(data_type);
if (type == tflite::gpu::DataType::UNKNOWN) {
return kTfLiteError;
}
auto layout = tflite::gpu::cl::ToDataLayoutFromTFL(data_layout);
if (layout == tflite::gpu::DataLayout::UNKNOWN) {
return kTfLiteError;
}
gpu_delegate->BindGlBufferToTensor(buffer_id, tensor_index, type, layout);
return kTfLiteOk;
}
bool TfLiteGpuDelegateGetSerializedBinaryCache(TfLiteDelegate* delegate,
size_t* size,
const uint8_t** data) {
*size = 0;
auto* gpu_delegate = tflite::gpu::cl::GetDelegate(delegate);
if (!gpu_delegate) {
return false;
}
auto cache = gpu_delegate->GetSerializedBinaryCache();
if (cache.empty()) {
return false;
}
*size = cache.size();
*data = cache.data();
return true;
}

View File

@ -0,0 +1,120 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
#include <stdint.h>
#include <EGL/egl.h>
#include <GLES3/gl31.h>
#include "tensorflow/lite/c/c_api_internal.h"
#ifdef SWIG
#define TFL_CAPI_EXPORT
#else
#if defined(_WIN32)
#ifdef TF_COMPILE_LIBRARY
#define TFL_CAPI_EXPORT __declspec(dllexport)
#else
#define TFL_CAPI_EXPORT __declspec(dllimport)
#endif // TF_COMPILE_LIBRARY
#else
#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
#endif // _WIN32
#endif // SWIG
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
enum TfLiteGpuInferencePriority {
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION = 0,
TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY = 1,
};
// Shader compilation options.
struct TFL_CAPI_EXPORT TfLiteGpuCompileOptions_New {
// When set to zero, computations are carried out in 32-bit floating point.
// Otherwise, the GPU may quantize tensors, downcast values, and process in
// FP16 (recommended).
int32_t precision_loss_allowed;
// Priority is defined in TfLiteGpuInferencePriority.
int32_t inference_priority;
};
struct TFL_CAPI_EXPORT TfLiteGpuDelegateOptions_New {
TfLiteGpuCompileOptions_New compile_options;
// [Optional]
// Whenever the EGL display and EGL context are set, the corresponding OpenCL
// context will be created.
// These variables are required when using GL objects as inputs or outputs.
EGLDisplay egl_display;
EGLContext egl_context;
// [Optional]
// Contains data returned from a TfLiteGpuDelegateGetSerializedBinaryCache
// call. Invalid or incompatible data will be discarded. A compiled binary may
// become incompatible when the GPU driver is updated.
const uint8_t* serialized_binary_cache_data;
size_t serialized_binary_cache_size;
};
// Creates a new delegate instance that needs to be destroyed with
// TfLiteGpuDelegateDelete_New when the delegate is no longer used by TFLite.
// When `options` is set to `nullptr`, the following default values are used:
// .compile_options = {
// .precision_loss_allowed = false,
// }
// .egl_display = eglGetCurrentDisplay(),
// .egl_context = eglGetCurrentContext();
TFL_CAPI_EXPORT TfLiteDelegate* TfLiteGpuDelegateCreate_New(
const TfLiteGpuDelegateOptions_New* options);
// Destroys a delegate created with `TfLiteGpuDelegateCreate_New` call.
TFL_CAPI_EXPORT void TfLiteGpuDelegateDelete_New(TfLiteDelegate* delegate);
enum TfLiteGpuDataLayout {
TFLITE_GPU_DATA_LAYOUT_BHWC = 0,
TFLITE_GPU_DATA_LAYOUT_DHWC4 = 1,
};
// Binds a GL shader storage buffer object to an input or an output tensor in
// the initialized delegate. The bound buffer should have sufficient storage to
// accommodate all elements of the tensor.
//
// Supports data of kTfLiteFloat16 or kTfLiteFloat32 types in BHWC or DHWC4
// data layouts.
//
// *** Must be called *before* `Interpreter::ModifyGraphWithDelegate`. ***
TFL_CAPI_EXPORT TfLiteStatus TfLiteGpuDelegateBindGlBufferToTensor(
TfLiteDelegate* delegate, GLuint buffer_id, int tensor_index,
TfLiteType data_type, TfLiteGpuDataLayout data_layout);
// Returns an opaque binary blob that contains a collection of cached OpenCL
// binaries. The returned data can be reused later to speed up initialization
// time when a new delegate is created for the same model.
// The returned data is valid only when used on the same device; otherwise it
// will not be compatible and will be discarded.
TFL_CAPI_EXPORT bool TfLiteGpuDelegateGetSerializedBinaryCache(
TfLiteDelegate* delegate, size_t* size, const uint8_t** data);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
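A hedged sketch of the documented call order for this C API (illustrative, not part of the original file). It assumes an already-built tflite::Interpreter and an existing GL SSBO sized for the input tensor; persisting the binary cache is left to the application.

#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h"
#include "tensorflow/lite/interpreter.h"

// Illustrative helper: applies the GPU delegate with a GL-backed input tensor.
// The delegate must outlive the interpreter; destroy it with
// TfLiteGpuDelegateDelete_New only after the interpreter is gone.
bool ApplyGpuDelegate(tflite::Interpreter* interpreter, GLuint input_ssbo,
                      int input_tensor_index, TfLiteDelegate** out_delegate) {
  // Passing nullptr selects the defaults documented above (current EGL
  // display/context, no precision loss).
  TfLiteDelegate* delegate = TfLiteGpuDelegateCreate_New(nullptr);
  if (!delegate) return false;

  // Must be called *before* ModifyGraphWithDelegate, as noted above.
  if (TfLiteGpuDelegateBindGlBufferToTensor(
          delegate, input_ssbo, input_tensor_index, kTfLiteFloat32,
          TFLITE_GPU_DATA_LAYOUT_BHWC) != kTfLiteOk ||
      interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
    TfLiteGpuDelegateDelete_New(delegate);
    return false;
  }

  // Optionally persist compiled OpenCL binaries to speed up the next startup.
  size_t size = 0;
  const uint8_t* data = nullptr;
  if (TfLiteGpuDelegateGetSerializedBinaryCache(delegate, &size, &data)) {
    // ... write data[0..size) to the application's cache storage ...
  }
  *out_delegate = delegate;
  return true;
}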

View File

@ -0,0 +1,419 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
#include <algorithm>
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.h"
#include "tensorflow/lite/delegates/gpu/common/memory_management.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsReady(const std::unordered_set<ValueId>& ready_tensors,
const CLNode& node) {
for (const ValueId in_id : node.inputs) {
if (ready_tensors.find(in_id) == ready_tensors.end()) {
return false;
}
}
return true;
}
std::vector<std::pair<ValueId, TensorDescriptor>> GetCLNodeTensors(
const CLNode& node) {
std::vector<std::pair<ValueId, TensorDescriptor>> result;
for (int i = 0; i < node.operations.size(); ++i) {
const OperationDef op_def = node.operations[i]->GetDefinition();
const auto& first_range = node.ranges[0];
for (int k = first_range.x; k < first_range.y; ++k) {
result.push_back({node.inputs[k], op_def.src_tensors[k - first_range.x]});
}
for (int j = 1; j < node.ranges.size(); ++j) {
const auto& range = node.ranges[j];
for (int k = range.x; k < range.y; ++k) {
result.push_back({node.inputs[k], op_def.src_tensors[k - range.x + 1]});
}
}
for (int j = 0; j < node.outputs.size(); ++j) {
result.push_back({node.outputs[j], op_def.dst_tensors[j]});
}
}
return result;
}
void MergeCLNodes(CLNode* src, CLNode* dst) {
int offset = dst->inputs.size();
for (int j = 0; j < src->inputs.size(); ++j) {
if (src->inputs[j] != dst->outputs[0]) {
dst->inputs.push_back(src->inputs[j]);
}
}
auto first_range = src->ranges[0];
dst->ranges.push_back(
int2(first_range.x + offset, first_range.y - 1 + offset));
for (int i = 1; i < src->ranges.size(); ++i) {
auto range = src->ranges[i];
dst->ranges.push_back(int2(range.x + offset, range.y + offset));
}
dst->outputs[0] = src->outputs[0];
for (int i = 0; i < src->operations.size(); ++i) {
dst->operations.push_back(std::move(src->operations[i]));
}
dst->name += " linked : " + src->name;
}
void AddUsage(ValueId id, int task_index,
std::map<ValueId, int2>* usage_records) {
auto it = usage_records->find(id);
if (it == usage_records->end()) {
(*usage_records)[id].x = task_index;
(*usage_records)[id].y = task_index;
} else {
(*usage_records)[id].y = task_index;
}
}
} // namespace
CLNode::CLNode(CLNode&& node)
: operations(std::move(node.operations)),
inputs(std::move(node.inputs)),
outputs(std::move(node.outputs)),
ranges(std::move(node.ranges)),
name(std::move(node.name)) {}
CLNode& CLNode::operator=(CLNode&& node) {
if (this != &node) {
operations = std::move(node.operations);
inputs = std::move(node.inputs);
outputs = std::move(node.outputs);
ranges = std::move(node.ranges);
name = std::move(node.name);
}
return *this;
}
Status InferenceContext::InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph,
Environment* env) {
precision_ = create_info.precision;
storage_type_ = create_info.storage_type;
if (env->device().vendor() == Vendor::MALI) {
need_flush_ = true;
need_manual_release_ = true;
}
CopyInAndOutIds(graph);
CreationContext creation_context;
creation_context.device = env->GetDevicePtr();
creation_context.context = &env->context();
creation_context.queue = env->queue();
creation_context.cache = env->program_cache();
RETURN_IF_ERROR(
ConvertOperations(creation_context, graph, create_info.hints));
Merge();
RETURN_IF_ERROR(
AllocateMemory(graph, env->device(), creation_context.context));
BindMemoryToOperations();
RETURN_IF_ERROR(Compile(creation_context));
TuningParameters tuning_parameters;
tuning_parameters.queue = env->profiling_queue();
tuning_parameters.info = env->device().GetInfoPtr();
if (create_info.hints.Check(ModelHints::kFastTuning)) {
tuning_parameters.tuning_type = TuningType::FAST;
}
RETURN_IF_ERROR(Tune(tuning_parameters));
return OkStatus();
}
Status InferenceContext::InitFromGraphWithTransforms(
const CreateInferenceInfo& create_info, GraphFloat32* graph,
Environment* env) {
RETURN_IF_ERROR(RunGraphTransforms(graph));
RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env));
return OkStatus();
}
void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
const auto inputs = graph.inputs();
for (const auto& input : inputs) {
input_ids_.push_back(input->id);
}
const auto outputs = graph.outputs();
for (const auto& output : outputs) {
output_ids_.push_back(output->id);
}
}
Status InferenceContext::ConvertOperations(
const CreationContext& creation_context, const GraphFloat32& graph,
ModelHints hints) {
std::vector<Node*> graph_nodes = graph.nodes();
for (int i = 0; i < graph_nodes.size(); ++i) {
const Node& node = *graph_nodes[i];
auto inputs = graph.FindInputs(node.id);
auto outputs = graph.FindOutputs(node.id);
OperationDef op_def;
op_def.precision = precision_;
auto data_type = DeduceDataTypeFromPrecision(precision_);
for (int j = 0; j < inputs.size(); ++j) {
op_def.src_tensors.push_back({data_type, storage_type_});
}
for (int j = 0; j < outputs.size(); ++j) {
op_def.dst_tensors.push_back({data_type, storage_type_});
}
std::unique_ptr<GPUOperation> gpu_op;
RETURN_IF_ERROR(GPUOperationFromNode(creation_context, op_def, hints, graph,
node, &gpu_op));
CLNode cl_node;
cl_node.operations.push_back(std::move(gpu_op));
cl_node.ranges.push_back(int2(0, static_cast<int>(inputs.size())));
cl_node.inputs.resize(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
cl_node.inputs[j] = inputs[j]->id;
}
cl_node.outputs.resize(outputs.size());
for (int j = 0; j < outputs.size(); ++j) {
cl_node.outputs[j] = outputs[j]->id;
}
cl_node.name = node.operation.type + " " + std::to_string(node.id) + " " +
std::to_string(i);
nodes_.push_back(std::move(cl_node));
}
return OkStatus();
}
void InferenceContext::Merge() {
std::unordered_set<ValueId> ready_tensors;
for (const auto& input_id : input_ids_) {
ready_tensors.insert(input_id);
}
for (int i = 0; i < nodes_.size(); ++i) {
auto& node = nodes_[i];
for (const auto& out_id : node.outputs) {
ready_tensors.insert(out_id);
}
if (node.outputs.size() != 1) {
continue;
}
std::vector<int> next_nodes;
for (int j = i + 1; j < nodes_.size(); ++j) {
for (int k = 0; k < nodes_[j].inputs.size(); ++k) {
if (nodes_[j].inputs[k] == node.outputs[0]) {
next_nodes.push_back(j);
}
}
}
if (next_nodes.size() != 1) {
continue;
}
auto& linkable_node = nodes_[next_nodes[0]];
auto* elementwise =
dynamic_cast<ElementwiseOperation*>(linkable_node.operations[0].get());
if (!elementwise || linkable_node.outputs.size() != 1 ||
!IsReady(ready_tensors, linkable_node)) {
continue;
}
MergeCLNodes(&linkable_node, &node);
nodes_.erase(nodes_.begin() + next_nodes[0]);
i -= 1;
}
for (auto& node : nodes_) {
for (int j = 1; j < node.operations.size(); ++j) {
auto* elementwise =
dynamic_cast<ElementwiseOperation*>(node.operations[j].get());
node.operations[0]->AddOperation(elementwise);
}
}
}
Status InferenceContext::AllocateMemory(const GraphFloat32& graph,
const CLDevice& device,
CLContext* context) {
std::map<ValueId, int2> usages;
for (int op_index = 0; op_index < nodes_.size(); ++op_index) {
auto tensors = GetCLNodeTensors(nodes_[op_index]);
for (auto& tensor : tensors) {
AddUsage(tensor.first, op_index, &usages);
}
}
std::vector<TensorUsageRecord<BHWC>> usage_records;
std::map<ValueId, ValueId> remap_from_graph_ids;
for (auto& usage : usages) {
const auto& shape = graph.GetValue(usage.first)->tensor.shape;
remap_from_graph_ids[usage.first] = usage_records.size();
usage_records.push_back({shape, static_cast<TaskId>(usage.second.x),
static_cast<TaskId>(usage.second.y)});
}
ObjectsAssignment<BHWC> assignment;
RETURN_IF_ERROR(AssignObjectsToTensors(
usage_records, MemoryStrategy::EQUALITY, &assignment));
for (auto& node : nodes_) {
for (auto& id : node.inputs) {
ValueId new_id = assignment.object_ids[remap_from_graph_ids[id]];
remap_from_graph_ids_to_shared_[id] = new_id;
id = new_id;
}
for (auto& id : node.outputs) {
ValueId new_id = assignment.object_ids[remap_from_graph_ids[id]];
remap_from_graph_ids_to_shared_[id] = new_id;
id = new_id;
}
}
for (auto& node : nodes_) {
auto tensors = GetCLNodeTensors(node);
for (auto& tensor : tensors) {
const auto& it = tensors_.find(tensor.first);
if (it == tensors_.end()) {
const auto& shape = assignment.object_sizes[tensor.first];
Tensor* t = &tensors_[tensor.first];
RETURN_IF_ERROR(CreateTensor(*context, device, shape.w, shape.h,
shape.c, tensor.second.data_type,
tensor.second.storage_type, t));
}
}
}
return OkStatus();
}
void InferenceContext::BindMemoryToOperations() {
for (auto& node : nodes_) {
const auto& first_range = node.ranges[0];
for (int k = first_range.x; k < first_range.y; ++k) {
auto id = node.inputs[k];
const auto& it = tensors_.find(id);
node.operations[0]->SetSrc(&it->second, k - first_range.x);
}
for (int i = 1; i < node.ranges.size(); ++i) {
const auto& range = node.ranges[i];
for (int k = range.x; k < range.y; ++k) {
auto id = node.inputs[k];
const auto& it = tensors_.find(id);
node.operations[i]->SetSrc(&it->second, k - range.x + 1);
}
}
for (int i = 0; i < node.outputs.size(); ++i) {
auto id = node.outputs[i];
const auto& it = tensors_.find(id);
node.operations[0]->SetDst(&it->second, i);
}
}
}
Status InferenceContext::Compile(const CreationContext& creation_context) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Compile(creation_context));
}
return OkStatus();
}
Status InferenceContext::Tune(const TuningParameters& tuning_parameters) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Tune(tuning_parameters));
}
return OkStatus();
}
Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
if (need_manual_release_) {
if (prev_enqueue_start_point_.is_valid()) {
prev_enqueue_start_point_.Wait();
}
RETURN_IF_ERROR(queue->EnqueueEvent(&prev_enqueue_start_point_));
}
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
}
if (need_flush_) {
clFlush(queue->queue());
}
return OkStatus();
}
Status InferenceContext::Profile(ProfilingCommandQueue* queue,
ProfilingInfo* result) {
queue->ResetMeasurements();
for (auto& node : nodes_) {
queue->SetEventsLabel(node.name);
RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
}
RETURN_IF_ERROR(queue->WaitForCompletion());
*result = queue->GetProfilingInfo();
return OkStatus();
}
Tensor* InferenceContext::GetTensor(ValueId id) {
return &tensors_[remap_from_graph_ids_to_shared_[id]];
}
Status InferenceContext::SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue) {
return GetTensor(id)->WriteData(queue, tensor);
}
Status InferenceContext::GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result) {
const auto& gpu_tensor = *GetTensor(id);
const int4 dst_size = gpu_tensor.GetSizeWithDepth();
const auto dst_shape = BHWC(1, dst_size.y, dst_size.x, dst_size.z);
result->id = id;
result->shape = dst_shape;
result->data.resize(dst_shape.DimensionsProduct());
return gpu_tensor.ReadData(queue, result);
}
Status RunGraphTransforms(GraphFloat32* graph) {
auto merge_padding_transform = NewMergePaddingWithAdd();
auto add_bias_transform = NewAddBias();
ModelTransformer transformer(graph, /*reporter=*/nullptr);
if (!transformer.Apply("add_bias", add_bias_transform.get())) {
return InternalError("Invalid add_bias transform");
}
if (!transformer.Apply("merge_padding", merge_padding_transform.get())) {
return InternalError("Invalid merge_padding transform");
}
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,131 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#include <cstdint>
#include <map>
#include <memory>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
struct CLNode {
std::vector<std::unique_ptr<GPUOperation>> operations;
std::vector<ValueId> inputs;
std::vector<ValueId> outputs;
// Since a CLNode can contain several operations, ranges keep the range of
// input ids for every operation.
std::vector<int2> ranges;
// Mostly for debug purposes.
std::string name;
CLNode() = default;
CLNode(CLNode&& node);
CLNode& operator=(CLNode&& node);
CLNode(const CLNode&) = delete;
CLNode& operator=(const CLNode&) = delete;
};
class InferenceContext {
public:
struct CreateInferenceInfo {
CalculationsPrecision precision;
TensorStorageType storage_type;
ModelHints hints;
};
Status InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph, Environment* env);
// Applies OpenCL-specific transformations to the graph before the
// initialization. These transformations are either impossible or useless in
// other backends.
Status InitFromGraphWithTransforms(const CreateInferenceInfo& create_info,
GraphFloat32* graph, Environment* env);
Status AddToQueue(CLCommandQueue* queue);
Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue);
// This works only with input/output tensor ids; for all other ids there are
// no guarantees.
Tensor* GetTensor(ValueId id);
Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result);
private:
void CopyInAndOutIds(const GraphFloat32& graph);
Status ConvertOperations(const CreationContext& creation_context,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void Merge();
Status AllocateMemory(const GraphFloat32& graph, const CLDevice& device,
CLContext* context);
void BindMemoryToOperations();
Status Compile(const CreationContext& creation_context);
Status Tune(const TuningParameters& tuning_parameters);
// performance hacks
bool need_flush_ = false;
// In order to reduce memory leaks on Mali, the pipeline needs to be
// synchronized with the CPU to prevent the internal global OpenCL kernel pool
// from growing. One trick is to enqueue an event from a previous run. Most of
// the time it should already have executed on the GPU and should not stall
// the pipeline.
bool need_manual_release_ = false;
CLEvent prev_enqueue_start_point_;
CalculationsPrecision precision_;
TensorStorageType storage_type_;
// Nodes directly mapped from the graph, though some of them are inactive due
// to fusion (inactive = fused).
// Memory is allocated only once, in ConvertOperations, and is not modified
// anywhere.
std::vector<CLNode> nodes_;
std::map<ValueId, Tensor> tensors_;
std::map<ValueId, ValueId> remap_from_graph_ids_to_shared_;
std::vector<ValueId> input_ids_;
std::vector<ValueId> output_ids_;
};
// Runs OpenCL-specific transforms for the graph.
Status RunGraphTransforms(GraphFloat32* graph);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
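A hedged sketch of driving the InferenceContext above end to end (illustrative, not part of the original file). It assumes a fully built GraphFloat32 together with valid input/output ValueIds and an Environment created elsewhere; F16 is chosen only when the device reports support for it.

#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper: initializes the context, uploads one input, runs the
// graph once, and downloads one output.
Status RunOnce(GraphFloat32* graph, Environment* env, ValueId input_id,
               const TensorFloat32& input, ValueId output_id,
               TensorFloat32* output) {
  InferenceContext::CreateInferenceInfo create_info;
  create_info.precision = env->IsSupported(CalculationsPrecision::F16)
                              ? CalculationsPrecision::F16
                              : CalculationsPrecision::F32;
  create_info.storage_type = GetOptimalStorageType(env->device());

  InferenceContext context;
  RETURN_IF_ERROR(
      context.InitFromGraphWithTransforms(create_info, graph, env));
  RETURN_IF_ERROR(context.SetInputTensor(input_id, input, env->queue()));
  RETURN_IF_ERROR(context.AddToQueue(env->queue()));
  return context.GetOutputTensor(output_id, env->queue(), output);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite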

File diff suppressed because it is too large

View File

@ -0,0 +1,48 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/abs.h"
#include <string>
#include "absl/strings/str_cat.h"
namespace tflite {
namespace gpu {
namespace cl {
Abs::Abs(Abs&& operation) : ElementwiseOperation(std::move(operation)) {}
Abs& Abs::operator=(Abs&& operation) {
if (this != &operation) {
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
std::string Abs::GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const {
return absl::StrCat(src, " = fabs(", src, ");\n");
}
Abs CreateAbs(const OperationDef& definition) {
Abs operation(definition);
operation.SetLinkIndex(0);
return operation;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,48 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
namespace tflite {
namespace gpu {
namespace cl {
class Abs : public ElementwiseOperation {
public:
explicit Abs(const OperationDef& definition)
: ElementwiseOperation(definition) {}
// Move only
Abs(Abs&& operation);
Abs& operator=(Abs&& operation);
Abs(const Abs&) = delete;
Abs& operator=(const Abs&) = delete;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
};
Abs CreateAbs(const OperationDef& definition);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_

View File

@ -0,0 +1,60 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/abs.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, Abs) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Abs operation = CreateAbs(op_def);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(0.0f), {half(0.0f), half(1.0f),
half(0.05f), half(0.045f)}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,193 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool HasTexture2DStorageType(const OperationDef& def) {
for (auto& src_tensor : def.src_tensors) {
if (src_tensor.storage_type == TensorStorageType::TEXTURE_2D) {
return true;
}
}
return false;
}
} // namespace
std::string Add::GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ);
c += GetArgsDeclaration();
c += ::tflite::gpu::cl::GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
c += " return; \n";
c += " } \n";
c += " FLT4 src = (FLT4)(0.0);\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
if (src_depthes_[0] != dst_depth_) {
c += " if (Z < " + std::to_string(src_depthes_[0]) + ") {\n";
if (src_descriptor.storage_type == TensorStorageType::TEXTURE_2D) {
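// For TEXTURE_2D storage the slice index is folded into the texture y
// coordinate (y_tex = Y * depth + Z). Because this source has a different
// depth than dst, recover Y as (address.y - Z) * inv_divisor (inv_divisor is
// 1 / dst_depth, bound in BindArguments) and rebuild the address using this
// tensor's own depth.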
c += " float t_y = address.y - Z; \n";
c += " int ti_y = (t_y + 0.5) * " + inv_divisor_name_ + "; \n";
c += " int2 tmp_add = (int2)(address.x, ti_y * " +
std::to_string(src_depthes_[0]) + " + Z);\n";
c += " src += " + src_tensor.Read3D("tmp_add") + ";\n";
} else {
c += " src += " + src_tensor.Read3D("address") + ";\n";
}
c += " }\n";
} else {
c += " src += " + src_tensor.Read3D("address") + ";\n";
}
c += " " + GetCoreCode("src", "Z", "address");
c += PostProcess(linked_operations, "src", "Z", "address");
c += " " + dst_tensor.Write3D("src", "address") + "\n";
c += "} \n";
return c;
}
Add::Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels)
: ElementwiseOperation(definition),
dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
src_depthes_.resize(channels.size());
for (int i = 0; i < channels.size(); ++i) {
src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
}
}
Add::Add(Add&& operation)
: ElementwiseOperation(std::move(operation)),
link_index_(operation.link_index_),
inv_divisor_name_(std::move(operation.inv_divisor_name_)),
src_depthes_(std::move(operation.src_depthes_)),
dst_depth_(operation.dst_depth_) {}
Add& Add::operator=(Add&& operation) {
if (this != &operation) {
link_index_ = operation.link_index_;
inv_divisor_name_ = std::move(operation.inv_divisor_name_);
src_depthes_ = std::move(operation.src_depthes_);
dst_depth_ = operation.dst_depth_;
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
void Add::SetLinkIndex(int index) {
inv_divisor_name_ = absl::StrCat("inv_divisor_", index);
link_index_ = index;
}
std::string Add::GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const {
std::string result;
for (int i = 1; i < src_depthes_.size(); ++i) {
const std::string tensor_name =
absl::StrCat("src_data_", link_index_, "_", i);
TensorCodeGenerator src_tensor(tensor_name, "", definition_.src_tensors[i]);
if (src_depthes_[i] != dst_depth_) {
absl::StrAppend(&result, " if (", z_coord, " < ", src_depthes_[i],
") {\n");
if (definition_.src_tensors[i].storage_type ==
TensorStorageType::TEXTURE_2D) {
absl::StrAppend(&result, " float t_y = ", address, ".y - ", z_coord,
";\n");
absl::StrAppend(&result, " int ti_y = (t_y + 0.5) * ",
inv_divisor_name_, ";\n");
absl::StrAppend(&result, " int2 tmp_add = (int2)(", address,
".x, ti_y * ", src_depthes_[i], " + ", z_coord, ");\n");
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D("tmp_add"), ";\n");
} else {
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D(address), ";\n");
}
absl::StrAppend(&result, " }\n");
} else {
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D(address) + ";\n");
}
}
return result;
}
std::string Add::GetArgsDeclaration() const {
std::string args;
for (int i = 1; i < src_depthes_.size(); ++i) {
const std::string tensor_name =
absl::StrCat("src_data_", link_index_, "_", i);
TensorCodeGenerator src_tensor(tensor_name, "", definition_.src_tensors[i]);
absl::StrAppend(&args, ",\n", src_tensor.GetDeclaration(AccessType::READ));
}
if (HasTexture2DStorageType(definition_)) {
absl::StrAppend(&args, ",\n float ", inv_divisor_name_);
}
return args;
}
Status Add::BindArguments(CLKernel* kernel) {
for (int i = 1; i < src_depthes_.size(); ++i) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
if (HasTexture2DStorageType(definition_)) {
float inv_divisor = 1.0f / static_cast<float>(dst_depth_);
RETURN_IF_ERROR(kernel->SetBytesAuto(inv_divisor));
}
return OkStatus();
}
Status Add::Compile(const CreationContext& creation_context) {
const auto code =
GetElementWiseCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels) {
Add operation(definition, channels, dst_channels);
operation.SetLinkIndex(0);
return operation;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,72 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Add is derived from ElementwiseOperation, but it is more complicated than a
// usual elementwise op, which is why it has its own Compile. Add supports
// inputs with different channel counts (this makes it possible to remove a
// Padding operation that pads with zeroes in the Z dimension).
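// Example (illustrative): CreateAdd(def, /*channels=*/{6, 2}, /*dst_channels=*/6)
// adds a 2-channel tensor into the first two channels of a 6-channel tensor,
// leaving the remaining channels of the first input unchanged.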
class Add : public ElementwiseOperation {
public:
Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels);
Status Compile(const CreationContext& creation_context) override;
// Move only
Add(Add&& operation);
Add& operator=(Add&& operation);
Add(const Add&) = delete;
Add& operator=(const Add&) = delete;
void SetLinkIndex(int index) override;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
std::string GetArgsDeclaration() const override;
Status BindArguments(CLKernel* kernel) override;
private:
std::string GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations);
int link_index_;
std::string inv_divisor_name_;
std::vector<int> src_depthes_;
int dst_depth_;
};
Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_

View File

@ -0,0 +1,124 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, AddTwoEqualTensors) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {0.0f, -1.0f, -0.05f, 0.045f};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {2, 2};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, channels[0]);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, -0.1f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, AddFirstTensorHasMoreChannelsThanSecond) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 6);
src0.data = {0.0f, -1.0f, -0.05f, 0.045f, 1.0f, -2.0f,
-1.05f, 1.045f, 2.0f, -3.0f, -2.05f, 2.045f};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {6, 2};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, channels[0]);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, -0.05f, 0.045f, 1.0f, -2.0f, -1.1f,
1.0f, 2.0f, -3.0f, -2.05f, 2.045f}));
}
}
}
TEST_F(OpenCLOperationTest, AddFirstTensorHasLessChannelsThanSecond) {
TensorFloat32 src0, src1;
src1.shape = BHWC(1, 2, 1, 6);
src1.data = {0.0f, -1.0f, -0.05f, 0.045f, 1.0f, -2.0f,
-1.05f, 1.045f, 2.0f, -3.0f, -2.05f, 2.045f};
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {2, 6};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, 6);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, -0.05f, 0.045f, 1.0f, -2.0f, -1.1f,
1.0f, 2.0f, -3.0f, -2.05f, 2.045f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,136 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/apply_mask.h"
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetApplyMaskKernelCode(
const OperationDef& definition,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src("src_data", "src_size", definition.src_tensors[0]);
TensorCodeGenerator mask("src_mask", "src_size_1", definition.src_tensors[1]);
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string c = GetCommonDefines(definition.precision);
c += "__kernel void main_function(\n";
c += src.GetDeclaration(AccessType::READ) + ",\n";
c += mask.GetDeclaration(AccessType::READ) + ",\n";
c += dst.GetDeclaration(AccessType::WRITE);
c += GetArgsDeclaration(linked_operations);
c += " int apply_mask_type,\n";
c += " int4 src_size,\n";
c += " int4 src_size_1,\n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
c += " FLT4 result = " + src.Read3D("X", "Y", "Z") + ";\n";
c += " if (apply_mask_type == 1) {\n";
c += " result *= " + mask.Read3D("X", "Y", "Z") + ";\n";
c += " } else if (apply_mask_type == 2) {\n";
c += " result *= " + mask.Read3D("0", "0", "Z") + ";\n";
c += " } else {\n";
c += " result *= " + mask.Read3D("X", "Y", "0") + ".x;\n";
c += " }\n";
c += " " + dst.GetAddress("dst_adr", "X", "Y", "Z");
c += PostProcess(linked_operations, "result", "Z", "dst_adr");
c += " " + dst.Write3D("result", "dst_adr");
c += "}\n";
return c;
}
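// Selects how the mask is broadcast by the kernel generated above:
//   0 - single-channel mask: mask(X, Y, 0).x is applied to every channel;
//   1 - mask has the same spatial size as src: applied elementwise at (X, Y, Z);
//   2 - 1x1xC mask: mask(0, 0, Z) is broadcast over all spatial positions.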
int GetMaskType(int4 src_size, int4 mask_size) {
if (mask_size.z == 1) {
return 0;
} else if (src_size.x == mask_size.x && src_size.y == mask_size.y) {
return 1;
} else {
return 2;
}
}
} // namespace
ApplyMask::ApplyMask(ApplyMask&& operation)
: GPUOperation(std::move(operation)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ApplyMask& ApplyMask::operator=(ApplyMask&& operation) {
if (this != &operation) {
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ApplyMask::Compile(const CreationContext& creation_context) {
const auto code = GetApplyMaskKernelCode(definition_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ApplyMask::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(
GetMaskType(src_[0]->GetSizeWithDepth(), src_[1]->GetSizeWithDepth()))));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[1]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ApplyMask::GetGridSize() const {
return int3(dst_[0]->Width(), dst_[0]->Height(), dst_[0]->Depth());
}
Status ApplyMask::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ApplyMask::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ApplyMask CreateApplyMask(const OperationDef& definition) {
return ApplyMask(definition);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,58 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
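// Multiplies a tensor by a mask. The mask may have the same shape as the
// source, a single channel (broadcast across channels) or shape 1x1xC
// (broadcast across spatial positions); the mode is chosen automatically from
// the tensor sizes (see GetMaskType in the .cc file).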
class ApplyMask : public GPUOperation {
public:
explicit ApplyMask(const OperationDef& definition)
: GPUOperation(definition) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ApplyMask(ApplyMask&& operation);
ApplyMask& operator=(ApplyMask&& operation);
ApplyMask(const ApplyMask&) = delete;
ApplyMask& operator=(const ApplyMask&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ApplyMask CreateApplyMask(const OperationDef& definition);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_

View File

@ -0,0 +1,124 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/apply_mask.h"
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ApplyMaskOneChannel) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 2, 2, 1);
mask_tensor.data = {2.0f, 0.5f, 1.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -6.0f, -0.5f, 0.0f, 1.0f,
3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ApplyMaskEqualSizes) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 2, 2, 2);
mask_tensor.data = {2.0f, 0.5f, 1.0f, 0.0f, 2.0f, 0.5f, 1.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -1.5f, -1.0f, 0.0f, 2.0f,
1.5f, 4.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ApplyMaskVector) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 1, 1, 2);
mask_tensor.data = {2.0f, 0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -1.5f, -2.0f, 0.0f, 2.0f,
1.5f, 8.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,83 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
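// Uploads src_cpu into GPU tensors, compiles and runs the operation once,
// waits for completion, then reads the results back into dst_cpu.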
const OperationDef& op_def = operation->GetDefinition();
std::vector<Tensor> src(src_cpu.size());
for (int i = 0; i < src_cpu.size(); ++i) {
auto src_shape = src_cpu[i].shape;
RETURN_IF_ERROR(CreateTensor(
*creation_context.context, *creation_context.device, src_shape.w,
src_shape.h, src_shape.c, op_def.src_tensors[0].data_type,
op_def.src_tensors[0].storage_type, &src[i]));
RETURN_IF_ERROR(src[i].WriteData(creation_context.queue, src_cpu[i]));
operation->SetSrc(&src[i], i);
}
std::vector<Tensor> dst(dst_cpu.size());
for (int i = 0; i < dst_cpu.size(); ++i) {
auto dst_shape = dst_sizes[i];
RETURN_IF_ERROR(CreateTensor(
*creation_context.context, *creation_context.device, dst_shape.w,
dst_shape.h, dst_shape.c, op_def.dst_tensors[0].data_type,
op_def.dst_tensors[0].storage_type, &dst[i]));
operation->SetDst(&dst[i], i);
}
RETURN_IF_ERROR(operation->Compile(creation_context));
RETURN_IF_ERROR(operation->AddToQueue(creation_context.queue));
RETURN_IF_ERROR(creation_context.queue->WaitForCompletion());
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_cpu[i]->shape = dst_sizes[i];
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
RETURN_IF_ERROR(dst[i].ReadData(creation_context.queue, dst_cpu[i]));
}
return OkStatus();
}
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(
std::vector<TensorFloat32>{src_cpu}, creation_context, operation,
std::vector<BHWC>{dst_size}, std::vector<TensorFloat32*>{result});
}
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(std::vector<TensorFloat32>{src_cpu},
creation_context, operation, dst_size, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,73 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
#ifndef ASSERT_OK
#define ASSERT_OK(x) ASSERT_TRUE(x.ok());
#endif
class OpenCLOperationTest : public ::testing::Test {
public:
void SetUp() override {
ASSERT_OK(LoadOpenCL());
ASSERT_OK(CreateDefaultEnvironment(&env_));
creation_context_.device = env_.GetDevicePtr();
creation_context_.context = &env_.context();
creation_context_.queue = env_.queue();
creation_context_.cache = env_.program_cache();
}
protected:
Environment env_;
CreationContext creation_context_;
};
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_

View File

@ -0,0 +1,170 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConcatWidth) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
src1.shape = BHWC(1, 2, 2, 2);
src1.data = {half(1.0f), half(-1.2f), half(-0.45f), half(1.045f),
half(1.1f), half(-1.3f), half(-0.55f), half(2.045f)};
ConcatAttributes attr;
attr.axis = Axis::WIDTH;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatXY operation = CreateConcatXY(op_def, attr, 2);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 3, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(0.0f), half(-1.0f), half(1.0f), half(-1.2f),
half(-0.45f), half(1.045f), half(-0.05f), half(0.045f),
half(1.1f), half(-1.3f), half(-0.55f), half(2.045f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatHeight) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
src1.shape = BHWC(1, 1, 1, 2);
src1.data = {half(1.0f), half(-1.2f)};
ConcatAttributes attr;
attr.axis = Axis::HEIGHT;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatXY operation = CreateConcatXY(op_def, attr, 2);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f), {half(0.0f), half(-1.0f), half(-0.05f),
half(0.045f), half(1.0f), half(-1.2f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatChannels) {
TensorFloat32 src0, src1, src2;
src0.shape = BHWC(1, 2, 1, 1);
src0.data = {half(0.0f), half(-1.0f)};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {half(1.0f), half(2.0f), half(3.0f), half(4.0f)};
src2.shape = BHWC(1, 2, 1, 3);
src2.data = {half(5.0f), half(6.0f), half(7.0f),
half(8.0f), half(9.0), half(10.0f)};
ConcatAttributes attr;
attr.axis = Axis::CHANNELS;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatZ operation = CreateConcatZ(op_def, {1, 2, 3});
ASSERT_OK(ExecuteGPUOperation({src0, src1, src2}, creation_context_,
&operation, BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(0.0f), half(1.0f), half(2.0f), half(5.0f),
half(6.0f), half(7.0f), half(-1.0f), half(3.0f),
half(4.0f), half(8.0f), half(9.0), half(10.0f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatChannelsAlignedx4) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 4);
src0.data = {half(-1.0f), half(-2.0f), half(-3.0f), half(-4.0f),
half(1.0f), half(2.0f), half(3.0f), half(4.0f)};
src1.shape = BHWC(1, 2, 1, 4);
src1.data = {half(5.0f), half(6.0f), half(7.0f), half(8.0f),
half(-5.0f), half(-6.0f), half(-7.0f), half(-8.0f)};
ConcatAttributes attr;
attr.axis = Axis::CHANNELS;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatZ operation = CreateConcatZ(op_def, {4, 4});
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 8), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(-1.0f), half(-2.0f), half(-3.0f), half(-4.0f),
half(5.0f), half(6.0f), half(7.0f), half(8.0f), half(1.0f),
half(2.0f), half(3.0f), half(4.0f), half(-5.0f),
half(-6.0f), half(-7.0f), half(-8.0f)}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,164 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetConcatKernelCode(
const OperationDef& definition, int tensors_count,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::vector<std::shared_ptr<TensorCodeGenerator>> srcs(tensors_count);
for (int i = 0; i < tensors_count; ++i) {
const std::string tensor_name = "src_data_" + std::to_string(i);
const std::string uniform_name = "src_size_" + std::to_string(i);
srcs[i] = std::shared_ptr<TensorCodeGenerator>(new TensorCodeGenerator(
tensor_name, uniform_name, definition.src_tensors[i]));
}
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string c = GetCommonDefines(definition.precision);
c += "__kernel void main_function(\n";
for (const auto& src : srcs) {
c += src->GetDeclaration(AccessType::READ) + ",\n";
}
c += dst.GetDeclaration(AccessType::WRITE);
c += GetArgsDeclaration(linked_operations);
for (int i = 0; i < tensors_count; ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
c += " int4 " + uniform_name + ",\n";
}
for (int i = 0; i < tensors_count; ++i) {
const std::string uniform_name = "dst_offset_" + std::to_string(i);
c += " int2 " + uniform_name + ",\n";
}
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
for (int i = 0; i < tensors_count; ++i) {
const std::string offset_name = "dst_offset_" + std::to_string(i);
const std::string size_name = "src_size_" + std::to_string(i);
c += " if (X < " + size_name + ".x && Y < " + size_name + ".y) { \n";
c += " FLT4 result = " + srcs[i]->Read3D("X", "Y", "Z") + ";\n";
c += " int dst_x = X + " + offset_name + ".x;\n";
c += " int dst_y = Y + " + offset_name + ".y;\n";
c += " " + dst.GetAddress("dst_adr", "dst_x", "dst_y", "Z");
c += PostProcess(linked_operations, "result", "Z", "dst_adr");
c += " " + dst.Write3D("result", "dst_adr");
c += " } \n";
}
c += "}\n";
return c;
}
} // namespace
ConcatXY::ConcatXY(ConcatXY&& operation)
: GPUOperation(std::move(operation)),
attr_(operation.attr_),
tensors_count_(operation.tensors_count_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConcatXY& ConcatXY::operator=(ConcatXY&& operation) {
if (this != &operation) {
attr_ = operation.attr_;
tensors_count_ = operation.tensors_count_;
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConcatXY::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, tensors_count_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConcatXY::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
int max_src_width = 0;
int max_src_height = 0;
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[i]->GetSizeWithDepth()));
max_src_width = std::max(max_src_width, src_[i]->Width());
max_src_height = std::max(max_src_height, src_[i]->Height());
}
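// Each source is written at a running offset along the concatenation axis;
// the offset along the other axis stays zero.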
int x_offset = 0;
int y_offset = 0;
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(x_offset, y_offset)));
x_offset += attr_.axis == Axis::WIDTH ? src_[i]->Width() : 0;
y_offset += attr_.axis == Axis::HEIGHT ? src_[i]->Height() : 0;
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConcatXY::GetGridSize() const {
int max_src_width = 0;
int max_src_height = 0;
for (int i = 0; i < tensors_count_; ++i) {
max_src_width = std::max(max_src_width, src_[i]->Width());
max_src_height = std::max(max_src_height, src_[i]->Height());
}
const int grid_x = max_src_width;
const int grid_y = max_src_height;
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConcatXY::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConcatXY::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ConcatXY CreateConcatXY(const OperationDef& definition,
const ConcatAttributes& attr, int tensors_count) {
return ConcatXY(definition, attr, tensors_count);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConcatXY : public GPUOperation {
public:
ConcatXY(const OperationDef& definition, const ConcatAttributes& attr,
int tensors_count)
: GPUOperation(definition), attr_(attr), tensors_count_(tensors_count) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConcatXY(ConcatXY&& operation);
ConcatXY& operator=(ConcatXY&& operation);
ConcatXY(const ConcatXY&) = delete;
ConcatXY& operator=(const ConcatXY&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
ConcatAttributes attr_;
int tensors_count_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ConcatXY CreateConcatXY(const OperationDef& definition,
const ConcatAttributes& attr, int tensors_count);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_

View File

@ -0,0 +1,216 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsAllChannelsX4(const std::vector<int>& channels) {
for (int channel : channels) {
if (channel % 4 != 0) {
return false;
}
}
return true;
}
std::string GetConcatKernelCode(
const OperationDef& definition, const std::vector<int>& channels,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::vector<std::shared_ptr<TensorCodeGenerator>> srcs(channels.size());
for (int i = 0; i < channels.size(); ++i) {
const std::string tensor_name = "src_data_" + std::to_string(i);
const std::string uniform_name = "src_size_" + std::to_string(i);
srcs[i] = std::shared_ptr<TensorCodeGenerator>(new TensorCodeGenerator(
tensor_name, uniform_name, definition.src_tensors[i]));
}
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string code = GetCommonDefines(definition.precision);
const std::string postfix[] = {".x", ".y", ".z", ".w"};
code += "__kernel void main_function(\n";
for (const auto& src : srcs) {
code += src->GetDeclaration(AccessType::READ) + ",\n";
}
code += dst.GetDeclaration(AccessType::WRITE);
code += GetArgsDeclaration(linked_operations);
for (int i = 0; i < channels.size(); ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
code += " int4 " + uniform_name + ",\n";
}
code += " int4 dst_size\n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
code += " return; \n";
code += " } \n";
if (IsAllChannelsX4(channels)) {
// When all channels % 4 == 0 we can read/assign/write whole FLT4 elements.
// It also lets us emit a loop instead of fully unrolled code, which keeps
// the generated kernel short.
code += " int Z = 0;\n";
for (int i = 0; i < channels.size(); ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
const int depth = IntegralDivideRoundUp(channels[i], 4);
if (depth % 2 == 0) {
// When depth % 2 == 0 we can read two slices per loop iteration, which
// helps hide read latency.
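// Rendered OpenCL (schematically):
//   for (int i = 0; i < src_size_k.w; i += 2) {
//     FLT4 result0 = <read slice i>;  FLT4 result1 = <read slice i + 1>;
//     <post-process and write at Z and Z + 1>;  Z += 2;
//   }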
code += " for (int i = 0; i < " + uniform_name + ".w; i += 2) {\n";
code += " FLT4 result0 = " + srcs[i]->Read3D("X", "Y", "i") + ";\n";
code +=
" FLT4 result1 = " + srcs[i]->Read3D("X", "Y", "i + 1") + ";\n";
code += " " + dst.GetAddress("dst_adr0", "X", "Y", "Z") + "\n";
code += " " + dst.GetAddress("dst_adr1", "X", "Y", "Z + 1") + "\n";
code += PostProcess(linked_operations, "result0", "Z", "dst_adr0");
code += PostProcess(linked_operations, "result1", "Z + 1", "dst_adr1");
code += " " + dst.Write3D("result0", "dst_adr0");
code += " " + dst.Write3D("result1", "dst_adr1");
code += " Z += 2;\n";
code += " }\n";
} else {
code += " for (int i = 0; i < " + uniform_name + ".w; ++i) {\n";
code += " FLT4 result = " + srcs[i]->Read3D("X", "Y", "i") + ";\n";
code += " " + dst.GetAddress("dst_adr", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " Z++;\n";
code += " }\n";
}
}
} else {
code += " FLT4 result = (FLT4)(0.0);\n";
int out_channel = 0;
int read_index = 0;
int z = 0;
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4);
const std::string temp_name = "t" + std::to_string(read_index);
code += " FLT4 " + temp_name + " = ";
code += srcs[i]->Read3D("X", "Y", std::to_string(d)) + ";\n";
for (int c = 0; c < channels_in_group; ++c) {
code += " result" + postfix[out_channel] + " = ";
code += temp_name + postfix[c] + ";\n";
out_channel++;
if (out_channel == 4) {
out_channel = 0;
code += " {\n";
code += " " +
dst.GetAddress("dst_adr", "X", "Y", std::to_string(z)) +
"\n";
code += PostProcess(linked_operations, "result", std::to_string(z),
"dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " }\n";
z++;
}
}
read_index++;
}
}
if (out_channel != 0) {
code += " {\n";
code +=
" " + dst.GetAddress("dst_adr", "X", "Y", std::to_string(z)) + "\n";
code += PostProcess(linked_operations, "result", std::to_string(z),
"dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " }\n";
}
}
code += "}\n";
return code;
}
} // namespace
ConcatZ::ConcatZ(ConcatZ&& kernel)
: GPUOperation(std::move(kernel)),
channels_(std::move(kernel.channels_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConcatZ& ConcatZ::operator=(ConcatZ&& kernel) {
if (this != &kernel) {
channels_ = std::move(kernel.channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConcatZ::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, channels_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConcatZ::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < channels_.size(); ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
for (int i = 0; i < channels_.size(); ++i) {
int4 size(src_[i]->Width(), src_[i]->Height(), channels_[i],
IntegralDivideRoundUp(channels_[i], 4));
RETURN_IF_ERROR(kernel_.SetBytesAuto(size));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConcatZ::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConcatZ::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConcatZ::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ConcatZ CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels) {
return ConcatZ(definition, channels);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,63 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConcatZ : public GPUOperation {
public:
ConcatZ(const OperationDef& definition, const std::vector<int>& channels)
: GPUOperation(definition), channels_(channels) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConcatZ(ConcatZ&& kernel);
ConcatZ& operator=(ConcatZ&& kernel);
ConcatZ(const ConcatZ&) = delete;
ConcatZ& operator=(const ConcatZ&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
std::vector<int> channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ConcatZ CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_

View File

@ -0,0 +1,281 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvBuffer(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int x_elements, int y_elements,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f0.s0123; \\\n";
c += "R += S.y * f0.s4567; \\\n";
c += "R += S.z * f0.s89ab; \\\n";
c += "R += S.w * f0.scdef; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
"f0.s89ab + S.w * f0.scdef);\n";
break;
}
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global FLT16* filters_buffer, \n";
c += " __global FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " int2 kernel_size, \n";
c += " int2 dillation, \n";
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * " + std::to_string(x_elements) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " __global FLT16* temp = filters_buffer + Z * src_size.w * "
"kernel_size.x * kernel_size.y;\n";
c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n";
for (int i = 0; i < x_elements * y_elements; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int xc" + x_s + " = (X + " + x_s + ") * stride.x + padding.x;\n";
}
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int yc" + y_s + " = (Y + " + y_s + ") * stride.y + padding.y;\n";
}
c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int c" + y_s + "y = y * dillation.y + yc" + y_s + ";\n";
c += " bool y" + y_s + "_in = c" + y_s + "y >= 0 && c" + y_s +
"y < src_size.y;\n";
c += " c" + y_s + "y = clamp(c" + y_s + "y, 0, src_size.y - 1);\n";
}
c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int c" + x_s + "x = x * dillation.x + xc" + x_s + ";\n";
c += " bool x" + x_s + "_in = c" + x_s + "x >= 0 && c" + x_s +
"x < src_size.x;\n";
c += " c" + x_s + "x = clamp(c" + x_s + "x, 0, src_size.x - 1);\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " int src_addr_" + i_s + " = c" + y_s + "y * src_size.x + c" + x_s +
"x;\n";
}
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " FLT4 s" + i_s + " = src_data[src_addr_" + i_s + "] * (FLT)(y" +
y_s + "_in && x" + x_s + "_in);\n";
}
}
c += " FLT16 f0 = temp[0];\n";
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " CONV(r" + i_s + ", s" + i_s + ");\n";
}
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " src_addr_" + i_s + " += src_size.z;\n";
}
c += " temp += 1;\n";
c += " }\n"; // src_size.w - SRC_DEPTH
c += " }\n"; // kernel_size.x
c += " }\n"; // kernel_size.y
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " if (X + " + x_s + " < dst_size.x && Y + " + y_s +
" < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + i_s + ");\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + x_s, "Y + " + y_s, "Z") +
"\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
} // namespace
ConvBuffer::ConvBuffer(const OperationDef& definition,
const Convolution2DAttributes& attr, int x_elements,
int y_elements)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
x_elements_(x_elements),
y_elements_(y_elements),
work_group_size_(4, 4, 4) {}
ConvBuffer::ConvBuffer(ConvBuffer&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
x_elements_(operation.x_elements_),
y_elements_(operation.y_elements_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvBuffer& ConvBuffer::operator=(ConvBuffer&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(x_elements_, operation.x_elements_);
std::swap(y_elements_, operation.y_elements_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvBuffer::Compile(const CreationContext& creation_context) {
std::string code = GenerateConvBuffer(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, x_elements_, y_elements_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvBuffer::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
int4 src_size = int4(src_[0]->Width(), src_[0]->Height(),
src_[0]->Width() * src_[0]->Height(), src_[0]->Depth());
int4 dst_size = int4(dst_[0]->Width(), dst_[0]->Height(),
dst_[0]->Width() * dst_[0]->Height(), dst_[0]->Depth());
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
return OkStatus();
}
int3 ConvBuffer::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), x_elements_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), y_elements_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConvBuffer::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvBuffer::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result) {
int x_elements = 2;
int y_elements = 1;
if (definition.precision != CalculationsPrecision::F16) {
x_elements = 1;
y_elements = 1;
}
*result = ConvBuffer(definition, attr, x_elements, y_elements);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
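// Worked example (illustrative, not part of this commit) of how the dispatch
// grid above scales with the per-thread tile chosen in CreateConvBuffer. The
// 640x480x32 output shape is hypothetical; Depth() counts 4-channel slices.
const int x_elements = 2;                                   // F16 path above
const int y_elements = 1;
const int grid_x = IntegralDivideRoundUp(640, x_elements);  // 320
const int grid_y = IntegralDivideRoundUp(480, y_elements);  // 480
const int grid_z = IntegralDivideRoundUp(32, 4);            // 8 depth slices
// Each work item therefore computes a 2x1 patch of one depth slice.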

View File

@ -0,0 +1,116 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvBuffer : public GPUOperation {
public:
ConvBuffer() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvBuffer(ConvBuffer&& operation);
ConvBuffer& operator=(ConvBuffer&& operation);
ConvBuffer(const ConvBuffer&) = delete;
ConvBuffer& operator=(const ConvBuffer&) = delete;
private:
friend Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result);
ConvBuffer(const OperationDef& definition,
const Convolution2DAttributes& attr, int x_elements,
int y_elements);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int x_elements_;
int y_elements_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status ConvBuffer::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int float4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth * 4;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
}
}
Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
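// Back-of-the-envelope check (illustrative, not part of this commit) of the
// buffer size produced by ConvBuffer::UploadWeights for hypothetical
// OHWI(8, 3, 3, 16) weights at F32 precision:
const int dst_depth = IntegralDivideRoundUp(8, 4);             // 2
const int src_depth = IntegralDivideRoundUp(16, 4);            // 4
const int elements_count = 3 * 3 * src_depth * dst_depth * 4;  // 288 float4 values
const size_t bytes = elements_count * sizeof(float4);          // 288 * 16 = 4608 bytes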

View File

@ -0,0 +1,351 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include <array>
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// x_elements - number of elements processed per thread in the W dimension
// y_elements - number of elements processed per thread in the H dimension
// element_size must be 1, 2 or 4:
//   1 - FLT4
//   2 - FLT8
//   4 - FLT16
// This function generates code for the arithmetic part of the convolution.
std::string GetComputationPart(int x_elements, int y_elements, int element_size,
CalculationsPrecision precision) {
const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", "a", "b", "c", "d", "e", "f"};
std::string c;
for (int y = 0; y < y_elements; ++y) {
for (int x = 0; x < x_elements; ++x) {
std::string s_index = std::to_string(y * x_elements + x);
for (int e = 0; e < element_size; ++e) {
std::string r_index =
std::to_string((y * x_elements + x) * element_size + e);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += " r" + r_index + " += f0.s0123 * s" + s_index + ".s" +
hexes[e * 4 + 0] + ";\n";
c += " r" + r_index + " += f0.s4567 * s" + s_index + ".s" +
hexes[e * 4 + 1] + ";\n";
c += " r" + r_index + " += f0.s89ab * s" + s_index + ".s" +
hexes[e * 4 + 2] + ";\n";
c += " r" + r_index + " += f0.scdef * s" + s_index + ".s" +
hexes[e * 4 + 3] + ";\n";
break;
case CalculationsPrecision::F32_F16:
c += " r" + r_index + " += convert_float4(f0.s0123 * s" +
s_index + ".s" + hexes[e * 4 + 0] + " + f0.s4567 * s" +
s_index + ".s" + hexes[e * 4 + 1] + " + f0.s89ab * s" +
s_index + ".s" + hexes[e * 4 + 2] + " + f0.scdef * s" +
s_index + ".s" + hexes[e * 4 + 3] + ");\n";
break;
}
}
}
}
return c;
}
std::string GetShiftFromElementSize(int element_size) {
if (element_size == 4) {
return " >> 2";
} else if (element_size == 2) {
return " >> 1";
} else {
return "";
}
}
std::string GenerateConvBuffer1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int x_elements, int y_elements, int element_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT8 float8\n";
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT8 half8\n";
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += " __global FLT" + std::to_string(element_size * 4) + "* src_data,\n";
c += " __global FLT16* filters_buffer, \n";
c += " __global FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0) * " +
std::to_string(x_elements * element_size) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " __global FLT16* temp = filters_buffer + Z * src_size.w;\n";
c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n";
for (int i = 0; i < x_elements * element_size * y_elements; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int xc" + x_s + " = min(X + " + std::to_string(x * element_size) +
", src_size.x - 1);\n";
}
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int yc" + y_s + " = min(Y + " + y_s + ", src_size.y - 1);\n";
}
std::string shift = GetShiftFromElementSize(element_size);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements + x);
c += " int src_addr_" + i_s + " = ((yc" + y_s + ") * src_size.x + (xc" +
x_s + "))" + shift + ";\n";
}
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements + x);
c += " FLT" + std::to_string(element_size * 4) + " s" + i_s +
" = src_data[src_addr_" + i_s + "];\n";
}
}
c += " FLT16 f0 = temp[0];\n";
c += GetComputationPart(x_elements, y_elements, element_size, precision);
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " src_addr_" + i_s + " += src_size.z;\n";
}
c += " temp += 1;\n";
c += " }\n"; // src_size.w = SRC_DEPTH
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements * element_size; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements * element_size + x);
c += " if (X + " + x_s + " < dst_size.x && Y + " + y_s +
" < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + i_s + ");\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + x_s, "Y + " + y_s, "Z") +
"\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
int GetGridWidth(int width) {
if (width % 2 == 0) { // using kernel_flt8_
return width / 2;
} else { // using kernel_flt4_
return width;
}
}
} // namespace
ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
const Convolution2DAttributes& attr,
int flt4_x_count, int flt4_y_count,
int flt8_x_count, int flt8_y_count)
: GPUOperation(definition),
flt4_x_count_(flt4_x_count),
flt4_y_count_(flt4_y_count),
flt8_x_count_(flt8_x_count),
flt8_y_count_(flt8_y_count),
work_group_size_(2, 4, 1) {}
ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
kernel_flt4_(std::move(operation.kernel_flt4_)),
flt4_x_count_(operation.flt4_x_count_),
flt4_y_count_(operation.flt4_y_count_),
kernel_flt8_(std::move(operation.kernel_flt8_)),
flt8_x_count_(operation.flt8_x_count_),
flt8_y_count_(operation.flt8_y_count_),
work_group_size_(operation.work_group_size_) {}
ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
kernel_flt4_ = std::move(operation.kernel_flt4_);
std::swap(flt4_x_count_, operation.flt4_x_count_);
std::swap(flt4_y_count_, operation.flt4_y_count_);
kernel_flt8_ = std::move(operation.kernel_flt8_);
std::swap(flt8_x_count_, operation.flt8_x_count_);
std::swap(flt8_y_count_, operation.flt8_y_count_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
std::string code_flt4 = GenerateConvBuffer1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, flt4_x_count_, flt4_y_count_, 1,
linked_operations_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_flt4, "main_function", *creation_context.context,
*creation_context.device, &kernel_flt4_));
std::string code_flt8 = GenerateConvBuffer1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, flt8_x_count_, flt8_y_count_, 2,
linked_operations_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_flt8, "main_function", *creation_context.context,
*creation_context.device, &kernel_flt8_));
return OkStatus();
}
CLKernel* ConvBuffer1x1::GetKernel(int width) {
if (width % 2 == 0) {
return &kernel_flt8_;
} else {
return &kernel_flt4_;
}
}
Status ConvBuffer1x1::BindArguments() {
CLKernel* kernel = GetKernel(src_[0]->Width());
kernel->ResetBindingCounter();
RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel->SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel->SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(kernel, linked_operations_));
RETURN_IF_ERROR(kernel->SetMemoryAuto(dst_[0]->GetMemoryPtr()));
int4 src_size = int4(src_[0]->Width(), src_[0]->Height(),
GetGridWidth(src_[0]->Width()) * src_[0]->Height(),
src_[0]->Depth());
int4 dst_size = int4(dst_[0]->Width(), dst_[0]->Height(),
dst_[0]->Width() * dst_[0]->Height(), dst_[0]->Depth());
RETURN_IF_ERROR(kernel->SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel->SetBytesAuto(dst_size));
return OkStatus();
}
int3 ConvBuffer1x1::GetGridSize() const {
if (src_[0]->Width() % 2 == 0) { // using kernel_flt8_
const int grid_x =
IntegralDivideRoundUp(GetGridWidth(dst_[0]->Width()), flt8_x_count_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), flt8_y_count_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
} else { // using kernel_flt4_
const int grid_x =
IntegralDivideRoundUp(GetGridWidth(dst_[0]->Width()), flt4_x_count_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), flt4_y_count_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
}
Status ConvBuffer1x1::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, *GetKernel(src_[0]->Width()),
GetGridSize(), &work_group_size_);
}
Status ConvBuffer1x1::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(*GetKernel(src_[0]->Width()), GetGridSize(),
work_group_size_);
}
bool IsConvBuffer1x1Supported(const OperationDef& definition,
const Convolution2DAttributes& attr) {
auto src_storage_type = definition.src_tensors[0].storage_type;
return src_storage_type == TensorStorageType::BUFFER &&
attr.weights.shape.w == 1 && attr.weights.shape.h == 1 &&
attr.dilations.w == 1 && attr.dilations.h == 1 &&
attr.strides.w == 1 && attr.strides.h == 1 &&
attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result) {
if (!IsConvBuffer1x1Supported(definition, attr)) {
return InvalidArgumentError("ConvBuffer1x1 is not supported");
}
int flt4_x_count = 1;
int flt4_y_count = 1;
int flt8_x_count = 1;
int flt8_y_count = 1;
if (creation_context.device->vendor() == Vendor::MALI) {
if (definition.precision == CalculationsPrecision::F16 &&
creation_context.device->GetInfo().compute_units_count <= 4) {
flt4_x_count = 2;
flt8_x_count = 2;
}
}
*result = ConvBuffer1x1(definition, attr, flt4_x_count, flt4_y_count,
flt8_x_count, flt8_y_count);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
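// Worked example (illustrative, not part of this commit): for a hypothetical
// width of 10, GetKernel() above picks kernel_flt8_ (even width), so the X
// dimension shrinks twice: once because each column covers a FLT8 pair, and
// once because of flt8_x_count_.
const int width = 10;
const int flt8_x_count = 2;                                          // hypothetical tuning value
const int grid_width = width / 2;                                    // GetGridWidth() -> 5 FLT8 columns
const int grid_x = IntegralDivideRoundUp(grid_width, flt8_x_count);  // 3 work items along X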

View File

@ -0,0 +1,123 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvBuffer1x1 : public GPUOperation {
public:
ConvBuffer1x1() = default;
// Move only
ConvBuffer1x1(ConvBuffer1x1&& operation);
ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
ConvBuffer1x1(const ConvBuffer1x1&) = delete;
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
private:
friend Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result);
ConvBuffer1x1(const OperationDef& definition,
const Convolution2DAttributes& attr, int flt4_x_count,
int flt4_y_count, int flt8_x_count, int flt8_y_count);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status BindArguments();
int3 GetGridSize() const;
CLKernel* GetKernel(int width);
Buffer weights_;
LinearStorage biases_;
CLKernel kernel_flt4_;
int flt4_x_count_;
int flt4_y_count_;
CLKernel kernel_flt8_;
int flt8_x_count_;
int flt8_y_count_;
int3 work_group_size_;
};
template <DataType T>
Status ConvBuffer1x1::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int float4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth * 4;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
}
}
bool IsConvBuffer1x1Supported(const OperationDef& definition,
const Convolution2DAttributes& attr);
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_

View File

@ -0,0 +1,103 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 1, 1, 4);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {6.0f, 6.0f, 22.0f, 22.0f}));
}
}
TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(4, 1, 1, 4);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(4);
attr.bias.data = {0.5f, -0.5f, 0.5f, -0.5f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 4), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {20.5f, 43.5f, 68.5f, 91.5f, 60.5f,
147.5f, 236.5f, 323.5f}));
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
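// CPU reference (illustrative, not part of this commit) for the expected
// values in ConvBuffer1x1SimpleWeights: a 1x1 convolution is a per-pixel dot
// product over the 4 input channels, all weights are 1.0f and the bias is 0.
const float pixel0 = 0.0f + 1.0f + 2.0f + 3.0f;  // 6.0f for both output channels
const float pixel1 = 4.0f + 5.0f + 6.0f + 7.0f;  // 22.0f for both output channels
// which matches the expected {6.0f, 6.0f, 22.0f, 22.0f} above.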

View File

@ -0,0 +1,103 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvBufferSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer operation;
ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
TEST_F(OpenCLOperationTest, ConvBuffer) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer operation;
ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f, 60.5f,
235.5f, 20.5f, 123.5f}));
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
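// CPU reference (illustrative, not part of this commit) for the expected
// values in ConvBufferSimpleWeights: with all-ones 2x2x2 weights, zero bias
// and appended padding HW(1, 1), each output is the sum of the input values
// that still fall inside its 2x2 window.
const float out00 = 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7;  // 28.0f, full window
const float out01 = 2 + 3 + 6 + 7;                  // 18.0f, right column padded
const float out10 = 4 + 5 + 6 + 7;                  // 22.0f, bottom row padded
const float out11 = 6 + 7;                          // 13.0f, only the last pixel left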

View File

@ -0,0 +1,294 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionConstantCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const int2& kernel_size, const int2& dilation, int src_channels,
int dst_channels,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
const int out_z = IntegralDivideRoundUp(dst_channels, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV4(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \\\n";
c += " R += SRC.w * F[i + 3]; \n";
c += "#define CONV3(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \n";
c += "#define CONV2(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \n";
c += "#define CONV1(R, SRC, F, i) \\\n";
c += " R += SRC * F[i + 0]; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV4(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += " + SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
c += "#define CONV3(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += " + SRC.z * F[i + 2]);\n";
c += "#define CONV2(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]);\n";
c += "#define CONV1(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC * F[i + 0]);\n";
break;
}
const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters, \n";
c += " __constant FLT4* biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
c += " int start_x = X * stride.x - padding.x;\n";
c += " int start_y = Y * stride.y - padding.y;\n";
c += " ACCUM_FLT4 r[" + kOutZ + "];\n";
c += " for (int i = 0; i < " + kOutZ + "; ++i) {\n";
c += " r[i] = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " }\n";
int filters_counter = 0;
for (int s = 0; s < src_depth; ++s) {
const int ch_count = std::min(4, src_channels - s * 4);
const std::string s_conv = "CONV" + std::to_string(ch_count);
const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count);
const std::string s_type = absl::StrCat("FLT", s_count);
const std::string s_postfix = postfixes[ch_count - 1];
for (int ky = 0; ky < kernel_size.y; ++ky) {
std::string s_y = absl::StrCat("(start_y + ", ky * dilation.y, ")");
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " {\n";
c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= src_size.y;\n";
}
for (int kx = 0; kx < kernel_size.x; ++kx) {
c += " {\n";
std::string s_x = absl::StrCat("(start_x + ", kx * dilation.x, ")");
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= src_size.x;\n";
c += " " + s_type + " src = x_out || y_out ?";
c += "(" + s_type + ")(0.0) : ";
c += src_tensor.Read3D(s_x, s_y, std::to_string(s)) + s_postfix +
";\n";
} else {
c += " " + s_type +
" src = " + src_tensor.Read3D(s_x, s_y, std::to_string(s)) +
s_postfix + ";\n";
}
for (int d = 0; d < out_z; ++d) {
c += " " + s_conv + "(r[" + std::to_string(d) + "], src, filters,";
c += " " + std::to_string(filters_counter) + ");\n";
filters_counter += ch_count;
}
c += " }\n";
}
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " }\n";
}
}
}
for (int i = 0; i < out_z; ++i) {
std::string s_i = std::to_string(i);
c += " {\n";
c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + biases[" + s_i + "];\n";
c += " " + dst_tensor.GetAddress("dst_adr", "X", "Y", s_i) + "\n";
c += PostProcess(linked_operations, "res", s_i, "dst_adr");
c += " " + dst_tensor.Write3D("res", "dst_adr");
c += " }\n";
}
c += "}\n";
return c;
}
// Adreno can provide up to ~3-4KB of constant memory, but in some cases using
// even 3KB can hurt performance badly.
int GetAdrenoOptimalMaxConstantSize(int gpu_version) {
if (gpu_version < 600) {
return 256 * 10; // 2.5KB
} else {
return 256 * 14; // 3.5KB
}
}
int GetOptimalMaxConstantSize(const DeviceInfo& info) {
if (info.vendor != Vendor::QUALCOMM) {
// In general we do not expect this kernel to be used on non-Adreno GPUs,
// since it is tuned for Adreno's dedicated constant memory.
return 256 * 16; // 4KB
} else {
return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version);
}
}
} // namespace
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
biases_(std::move(kernel.biases_)),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
dilation_(kernel.dilation_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
biases_ = std::move(kernel.biases_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(dilation_, kernel.dilation_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConvConstants::Compile(const CreationContext& creation_context) {
const auto code = GenerateConvolutionConstantCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, kernel_size_, dilation_, src_channels_,
dst_channels_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvConstants::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvConstants::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
return int3(grid_x, grid_y, 1);
}
Status ConvConstants::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvConstants::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvConstantsSupported(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
if (!device.IsAdreno()) {
return false;
}
const auto& w_shape = attr.weights.shape;
const int dst_channels = AlignByN(w_shape.o, 4);
const int filters_count = w_shape.i * dst_channels * w_shape.h * w_shape.w;
const int float_size = definition.precision == CalculationsPrecision::F32
? sizeof(float)
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result) {
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return InvalidArgumentError("ConvConstants is not supported");
}
*result = ConvConstants(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
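// Worked example (illustrative, not part of this commit) of the admission
// check in IsConvConstantsSupported for hypothetical OHWI(8, 3, 3, 4) weights
// at F16 precision on an Adreno 5xx (constant budget 256 * 10 = 2560 bytes):
const int dst_channels = AlignByN(8, 4);                          // 8
const int filters_count = 4 * dst_channels * 3 * 3;               // 288 weights
const size_t filters_buffer_size = filters_count * sizeof(half);  // 576 bytes <= 2560
const int flt4_registers = IntegralDivideRoundUp(8, 4);           // 2 <= 8 -> supported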

View File

@ -0,0 +1,169 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvConstants(ConvConstants&& kernel);
ConvConstants& operator=(ConvConstants&& kernel);
ConvConstants(const ConvConstants&) = delete;
ConvConstants& operator=(const ConvConstants&) = delete;
private:
friend Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
explicit ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvConstants::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int float_size =
definition_.precision == CalculationsPrecision::F32 ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(),
context, &weights_);
}
}
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int d = 0; d < dst_depth; ++d) {
const int channels_count = std::min(4, src_channels_ - s * 4);
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < channels_count; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = d * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
T filters_new[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
filters_new[i][j] = filters[j][i];
}
}
for (int i = 0; i < channels_count; ++i) {
dst[counter++] = filters_new[i];
}
}
}
}
}
}
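// Layout note (informal, not part of this commit): after this rearrangement,
// each (src slice s, ky, kx, dst slice d) position stores one FLT4 per active
// input channel of slice s, whose four components are the weights to the four
// output channels of slice d. For example, with 6 input channels, at most 4
// output channels (dst_depth = 1) and a 1x1 kernel, entries 0..3 hold input
// channels 0..3 and entries 4..5 hold input channels 4..5. This matches the
// order in which the generated CONV4/CONV3/CONV2/CONV1 macros walk `filters`
// via `filters_counter`.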
bool IsConvConstantsSupported(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_

View File

@ -0,0 +1,109 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvConstants) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,312 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
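// The generated kernel computes a 2x2 spatial block for two consecutive
// destination slices per work item, i.e. eight accumulators r0..r7. Filter
// data is split across four textures (filters0..filters3), one per
// input-channel lane, addressed by (dst_slice, tap_index * src_depth + s),
// so CONV1/CONV2 accumulate a full FLT4 of output channels per chain of
// multiply-adds.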
std::string GenerateConvCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
bool is1x1, bool adreno4xx_optimization,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV1(R, S) \\\n";
c += "R += S.x * f0; \\\n";
c += "R += S.y * f1; \\\n";
c += "R += S.z * f2; \\\n";
c += "R += S.w * f3; \n";
c += "#define CONV2(R, S) \\\n";
c += "R += S.x * f4; \\\n";
c += "R += S.y * f5; \\\n";
c += "R += S.z * f6; \\\n";
c += "R += S.w * f7; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV1(R, S) \\\n";
c += "R += convert_float4(S.x * f0 + S.y * f1 + S.z * f2 + S.w * f3);\n";
c += "#define CONV2(R, S) \\\n";
c += "R += convert_float4(S.x * f4 + S.y * f5 + S.z * f6 + S.w * f7);\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters0, \n";
c += " __read_only image2d_t filters1, \n";
c += " __read_only image2d_t filters2, \n";
c += " __read_only image2d_t filters3, \n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
if (!is1x1) {
c += " int2 kernel_size, \n";
c += " int2 dillation, \n";
}
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * 2;\n";
c += " int Y = get_global_id(1) * 2;\n";
c += " int Z = get_global_id(2) * 2;\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " int xc0 = X * stride.x + padding.x;\n";
c += " int xc1 = (X + 1) * stride.x + padding.x;\n";
c += " int yc0 = Y * stride.y + padding.y;\n";
c += " int yc1 = (Y + 1) * stride.y + padding.y;\n";
for (int i = 0; i < 8; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) +
" = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
std::string f_y = is1x1 ? "s" : "filter_offset";
std::string s_x0 = is1x1 ? "xc0" : "c0.x";
std::string s_x1 = is1x1 ? "xc1" : "c1.x";
std::string s_y0 = is1x1 ? "yc0" : "c0.y";
std::string s_y1 = is1x1 ? "yc1" : "c1.y";
if (!is1x1) {
c += " int2 c0;\n";
c += " int2 c1;\n";
c += " int filter_offset = 0;\n";
c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
c += " c0.y = y * dillation.y + yc0;\n";
c += " c1.y = y * dillation.y + yc1;\n";
c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
c += " c0.x = x * dillation.x + xc0;\n";
c += " c1.x = x * dillation.x + xc1;\n";
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
std::string fc0 = "(int2)(Z, " + f_y + ")";
std::string fc1 = "(int2)(Z + 1, " + f_y + ")";
c += " FLT4 f0 = READ_IMAGE(filters0, smp_none, " + fc0 + ");\n";
c += " FLT4 f1 = READ_IMAGE(filters1, smp_none, " + fc0 + ");\n";
c += " FLT4 f2 = READ_IMAGE(filters2, smp_none, " + fc0 + ");\n";
c += " FLT4 f3 = READ_IMAGE(filters3, smp_none, " + fc0 + ");\n";
c += " FLT4 f4 = READ_IMAGE(filters0, smp_none, " + fc1 + ");\n";
c += " FLT4 f5 = READ_IMAGE(filters1, smp_none, " + fc1 + ");\n";
c += " FLT4 f6 = READ_IMAGE(filters2, smp_none, " + fc1 + ");\n";
c += " FLT4 f7 = READ_IMAGE(filters3, smp_none, " + fc1 + ");\n";
c += " FLT4 src0 =" + src_tensor.Read3D(s_x0, s_y0, "s") + ";\n";
c += " FLT4 src1 =" + src_tensor.Read3D(s_x1, s_y0, "s") + ";\n";
c += " FLT4 src2 =" + src_tensor.Read3D(s_x0, s_y1, "s") + ";\n";
c += " FLT4 src3 =" + src_tensor.Read3D(s_x1, s_y1, "s") + ";\n";
for (int i = 0; i < 4; ++i) {
c += " CONV1(r" + std::to_string(i) + ", src" + std::to_string(i) +
");\n";
}
for (int i = 0; i < 4; ++i) {
c += " CONV2(r" + std::to_string(i + 4) + ", src" + std::to_string(i) +
");\n";
}
if (!is1x1) {
c += " filter_offset++;\n";
}
c += " }\n"; // src_size.w
if (!is1x1) {
c += " }\n"; // kernel_size.x
c += " }\n"; // kernel_size.y
}
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
c += " if (Z < dst_size.w) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
for (int i = 0; i < 4; ++i) {
c += " {\n";
c += " int xc = " + dst_x + " + " + std::to_string(i % 2) + ";\n";
c += " int yc = " + dst_y + " + " + std::to_string(i / 2) + ";\n";
c += " if (xc < dst_size.x && yc < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + std::to_string(i) + ") + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "xc", "yc", "Z") + "\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
c += " }\n";
}
c += " }\n";
c += " Z++;\n";
c += " if (Z < dst_size.w) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
for (int i = 0; i < 4; ++i) {
c += " {\n";
c += " int xc = " + dst_x + " + " + std::to_string(i % 2) + ";\n";
c += " int yc = " + dst_y + " + " + std::to_string(i / 2) + ";\n";
c += " if (xc < dst_size.x && yc < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + std::to_string(i + 4) + ") + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "xc", "yc", "Z") + "\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
c += " }\n";
}
c += " }\n";
c += "}\n";
return c;
}
bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
bool kernel1x1) {
if (!device.IsAdreno()) {
return false;
}
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device.IsAdreno3xx() && kernel1x1;
}
return false;
}
} // namespace
ConvTexture::ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
work_group_size_(4, 4, 2) {}
ConvTexture::ConvTexture(ConvTexture&& operation)
: GPUOperation(std::move(operation)),
weights_0_(std::move(operation.weights_0_)),
weights_1_(std::move(operation.weights_1_)),
weights_2_(std::move(operation.weights_2_)),
weights_3_(std::move(operation.weights_3_)),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
if (this != &operation) {
weights_0_ = std::move(operation.weights_0_);
weights_1_ = std::move(operation.weights_1_);
weights_2_ = std::move(operation.weights_2_);
weights_3_ = std::move(operation.weights_3_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvTexture::Compile(const CreationContext& creation_context) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
creation_context.device->IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
std::string code = GenerateConvCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, is1x1, adreno4xx_optimization, linked_operations_);
std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvTexture::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_0_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_1_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_2_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_3_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
return OkStatus();
}
int3 ConvTexture::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Depth(), 2);
return int3(grid_x, grid_y, grid_z);
}
Status ConvTexture::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvTexture::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
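Because each work item covers a 2x2 spatial block over two destination slices, GetGridSize above divides every output dimension by two with rounding up. A minimal standalone sketch of the same arithmetic, assuming IntegralDivideRoundUp is plain ceiling division and using hypothetical tensor sizes:

#include <cstdio>

// Ceiling division, matching the semantics assumed for IntegralDivideRoundUp.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  // Hypothetical output tensor: 65x33 spatial, 20 channels -> 5 depth slices.
  const int width = 65, height = 33, channels = 20;
  const int depth = DivideRoundUp(channels, 4);
  const int grid_x = DivideRoundUp(width, 2);   // 33
  const int grid_y = DivideRoundUp(height, 2);  // 17
  const int grid_z = DivideRoundUp(depth, 2);   // 3
  std::printf("grid = %d x %d x %d\n", grid_x, grid_y, grid_z);
  return 0;
}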

View File

@ -0,0 +1,193 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
// This convolution processes a 2x2x2 (XxYxZ) block of FLT4 values per thread.
class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvTexture(ConvTexture&& operation);
ConvTexture& operator=(ConvTexture&& operation);
ConvTexture(const ConvTexture&) = delete;
ConvTexture& operator=(const ConvTexture&) = delete;
private:
friend Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst_0, absl::Span<T> dst_1,
absl::Span<T> dst_2, absl::Span<T> dst_3);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_0_;
Texture2D weights_1_;
Texture2D weights_2_;
Texture2D weights_3_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status ConvTexture::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
const int dst_depth = AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), 2);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = dst_depth;
int texture_height = src_depth * kernel_size_.x * kernel_size_.y;
DataType data_type = definition_.GetDataType();
const int elements_count = texture_width * texture_height;
if (data_type == DataType::FLOAT32) {
std::vector<float4> gpu_data_0(elements_count);
std::vector<float4> gpu_data_1(elements_count);
std::vector<float4> gpu_data_2(elements_count);
std::vector<float4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2_));
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
gpu_data_3.data(), context, &weights_3_);
} else {
std::vector<half4> gpu_data_0(elements_count);
std::vector<half4> gpu_data_1(elements_count);
std::vector<half4> gpu_data_2(elements_count);
std::vector<half4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2_));
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
gpu_data_3.data(), context, &weights_3_);
}
}
template <DataType S, typename T>
void ConvTexture::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
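// Layout: dst_k holds, for input-channel lane k, the FLT4 of four output
// channels of every filter tap. Texture x = destination slice (d * 2 + sub_d),
// texture y = (ky * kernel_x + kx) * src_depth + s, which matches how the
// generated kernel reads filters0..filters3.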
const int dst_depth = AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), 2);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = dst_depth;
for (int d = 0; d < dst_depth / 2; ++d) {
for (int y = 0; y < kernel_size_.y; ++y) {
for (int x = 0; x < kernel_size_.x; ++x) {
for (int s = 0; s < src_depth; ++s) {
for (int sub_d = 0; sub_d < 2; ++sub_d) {
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = (d * 2 + sub_d) * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
filters[j][i] = 0.0f;
}
}
}
int x_coord = d * 2 + sub_d;
int y_coord = (y * kernel_size_.x + x) * src_depth + s;
int offset = y_coord * texture_width + x_coord;
dst_0[offset] = filters[0];
dst_1[offset] = filters[1];
dst_2[offset] = filters[2];
dst_3[offset] = filters[3];
}
}
}
}
}
}
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_

View File

@ -0,0 +1,107 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,471 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
#include <algorithm>
#include <array>
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
class OpenClConverterImpl : public TensorObjectConverter {
public:
virtual Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) = 0;
protected:
Status DispatchKernel(cl_mem input, cl_mem output) {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(input));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(output));
int3 grid = int3(dims_.w, dims_.h, dims_.d());
int4 size = int4(dims_.w, dims_.h, dims_.c, dims_.d());
RETURN_IF_ERROR(kernel_.SetBytesAuto(size));
return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
}
Dimensions dims_;
CLKernel kernel_;
CLCommandQueue* queue_ = nullptr;
};
bool IsSupportedDataType(DataType type) {
return type == DataType::FLOAT16 || type == DataType::FLOAT32;
}
// Implements conversion from OpenCL-specific tensor layout to BHWC.
class FromTensorConverter : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return IsSupportedDataType(input.data_type) &&
IsSupportedDataType(output.data_type) &&
// Output is always Buffer/BHWC
output.object_type == ObjectType::OPENCL_BUFFER &&
(output.data_layout == DataLayout::BHWC ||
output.data_layout == DataLayout::DHWC4) &&
// Texture2D/HDWC4 ->
((input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::HDWC4) ||
// SingleTextureArray/BHWC ->
(input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::BHWC) ||
// TextureArray/DHWC4 ->
(input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::DHWC4) ||
// Buffer/DHWC4 ->
(input.object_type == ObjectType::OPENCL_BUFFER &&
input.data_layout == DataLayout::DHWC4));
}
std::pair<std::string, std::string> GetToDhwc4Kernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType4(output_def.object_def.data_type) + "* dst",
"dst[(d * size.y + y) * size.x + x] = " +
(output_def.object_def.data_type == input_def.object_def.data_type
? "input;"
: "convert_" + GetDataType4(output_def.object_def.data_type) +
"(input);"));
}
std::pair<std::string, std::string> GetToBhwcKernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType(output_def.object_def.data_type) + "* dst",
R"(
int c = d * 4;
int index = (y * size.x + x) * size.z + c;
dst[index] = input.x;
if (c + 1 < size.z) {
dst[index + 1] = input.y;
}
if (c + 2 < size.z) {
dst[index + 2] = input.z;
}
if (c + 3 < size.z) {
dst[index + 3] = input.w;
})");
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = output_def.object_def.data_layout == DataLayout::BHWC
? GetToBhwcKernel(input_def, output_def)
: GetToDhwc4Kernel(input_def, output_def);
TensorStorageType src_tensor_type = ToTensorStorageType(
input_def.object_def.object_type, input_def.object_def.data_layout);
TensorDescriptor src_descr;
src_descr.storage_type = src_tensor_type;
src_descr.data_type = input_def.object_def.data_type;
TensorCodeGenerator src_tensor("src", "size", src_descr);
std::string shader_src =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
const sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void from_tensor()" +
GetTensorDeclaration(src_tensor_type, AccessType::READ,
input_def.object_def.data_type) +
" src, " + params_kernel.first + R"(, int4 size) {
int x = get_global_id(0);
int y = get_global_id(1);
int d = get_global_id(2);
if (x >= size.x || y >= size.y || d >= size.w) return;
)" + GetDataType4(input_def.object_def.data_type) +
" input = " + src_tensor.Read3D("x", "y", "d") + ";\n" +
params_kernel.second + "\n}";
queue_ = environment->queue();
dims_ = input_def.dimensions;
return CreateKernel(shader_src, "from_tensor", environment, &kernel_);
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto output = absl::get_if<OpenClBuffer>(&output_obj);
if (!output || !output->memobj) {
return InvalidArgumentError("Missing output in to_bhwc converter");
}
auto input_texture = absl::get_if<OpenClTexture>(&input_obj);
if (input_texture && input_texture->memobj) {
return DispatchKernel(input_texture->memobj, output->memobj);
}
auto input_buffer = absl::get_if<OpenClBuffer>(&input_obj);
if (input_buffer && input_buffer->memobj) {
return DispatchKernel(input_buffer->memobj, output->memobj);
}
return InvalidArgumentError("Missing input in to_bhwc converter");
}
};
// Implements conversion from BHWC to OpenCL-specific tensor layout.
class ToTensorConverter : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return IsSupportedDataType(input.data_type) &&
IsSupportedDataType(output.data_type) &&
// Input is always Buffer/BHWC
input.object_type == ObjectType::OPENCL_BUFFER &&
(input.data_layout == DataLayout::BHWC ||
input.data_layout == DataLayout::DHWC4) &&
// -> Texture2D/HDWC4
((output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::HDWC4) ||
// -> TextureArray/DHWC4
(output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::DHWC4) ||
// -> SingleTextureArray/BHWC
(output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::BHWC) ||
// -> Buffer/DHWC4
(output.object_type == ObjectType::OPENCL_BUFFER &&
output.data_layout == DataLayout::DHWC4));
}
std::pair<std::string, std::string> GetFromDhwc4Kernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType4(input_def.object_def.data_type) + "* src",
output_def.object_def.data_type == input_def.object_def.data_type
? "result = src[(d * size.y + y) * size.x + x];"
: "result = convert_" +
GetDataType4(output_def.object_def.data_type) +
"(src[(d * size.y + y) * size.x + x]);");
}
std::pair<std::string, std::string> GetFromBhwcKernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType(input_def.object_def.data_type) + "* src",
R"(int c = d * 4;
int index = (y * size.x + x) * size.z + c;
result.x = src[index];
result.y = c + 1 < size.z ? src[index + 1] : 1;
result.z = c + 2 < size.z ? src[index + 2] : 2;
result.w = c + 3 < size.z ? src[index + 3] : 3;
)");
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = input_def.object_def.data_layout == DataLayout::BHWC
? GetFromBhwcKernel(input_def, output_def)
: GetFromDhwc4Kernel(input_def, output_def);
TensorStorageType dst_tensor_type = ToTensorStorageType(
output_def.object_def.object_type, output_def.object_def.data_layout);
TensorDescriptor dst_descr;
dst_descr.storage_type = dst_tensor_type;
dst_descr.data_type = output_def.object_def.data_type;
TensorCodeGenerator dst_tensor("dst", "size", dst_descr);
std::string shader_src =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void to_tensor()" +
params_kernel.first + ", " +
GetTensorDeclaration(dst_tensor_type, AccessType::WRITE,
output_def.object_def.data_type) +
R"( dst, int4 size) {
int x = get_global_id(0);
int y = get_global_id(1);
int d = get_global_id(2);
if (x >= size.x || y >= size.y || d >= size.w) return;
)" + GetDataType4(output_def.object_def.data_type) +
" result;\n" + params_kernel.second + "\n " +
dst_tensor.Write3D("result", "x", "y", "d") + ";\n}";
queue_ = environment->queue();
dims_ = output_def.dimensions;
return CreateKernel(shader_src, "to_tensor", environment, &kernel_);
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto input = absl::get_if<OpenClBuffer>(&input_obj);
if (!input || !input->memobj) {
return InvalidArgumentError("Missing input in from_bhwc converter");
}
auto output_texture = absl::get_if<OpenClTexture>(&output_obj);
if (output_texture && output_texture->memobj) {
return DispatchKernel(input->memobj, output_texture->memobj);
}
auto output_buffer = absl::get_if<OpenClBuffer>(&output_obj);
if (output_buffer && output_buffer->memobj) {
return DispatchKernel(input->memobj, output_buffer->memobj);
}
return InvalidArgumentError("Missing input in from_bhwc converter");
}
};
std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef& def) {
const auto& dims = def.dimensions;
std::array<size_t, 3> region = {0, 0, 1};
switch (ToTensorStorageType(def.object_def.object_type,
def.object_def.data_layout)) {
case TensorStorageType::SINGLE_TEXTURE_2D:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h);
break;
case TensorStorageType::TEXTURE_2D:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h * dims.d());
break;
case TensorStorageType::TEXTURE_ARRAY:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h);
region[2] = static_cast<size_t>(dims.d());
break;
default:
break;
}
return region;
}
// Copies data from one object of the same type and layout to another object.
class TrivialCopier : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return input.data_type == output.data_type &&
input.object_type == output.object_type &&
input.data_layout == output.data_layout;
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
dims_ = input_def.dimensions;
data_type_ = input_def.object_def.data_type;
queue_ = environment->queue();
region_ = CalculateTextureRegion(output_def);
return OkStatus();
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
if (texture_input && texture_output) {
return Copy(*texture_input, *texture_output);
}
auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
if (buffer_input && buffer_output) {
return Copy(*buffer_input, *buffer_output);
}
return UnimplementedError("Unsupported conversion");
}
Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
if (input.memobj == output.memobj) {
return OkStatus();
}
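// The buffer holds DHWC4 data: w * h * d() pixels with four channels each.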
return GetOpenCLError(clEnqueueCopyBuffer(
queue_->queue(), input.memobj, output.memobj, 0, 0,
SizeOf(data_type_) * dims_.w * dims_.h * dims_.d() * 4, 0, nullptr,
nullptr));
}
Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
if (input.memobj == output.memobj) {
return OkStatus();
}
size_t origin[3] = {0, 0, 0};
return GetOpenCLError(
clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin,
origin, region_.data(), 0, nullptr, nullptr));
}
private:
DataType data_type_ = DataType::UNKNOWN;
std::array<size_t, 3> region_;
};
static bool IsOpenClTextureOrBuffer(ObjectType type) {
return type == ObjectType::OPENCL_BUFFER ||
type == ObjectType::OPENCL_TEXTURE;
}
// Copies data from/to CPU into a tensor.
class CpuCopier : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return input.data_type == output.data_type &&
input.data_layout == output.data_layout &&
((input.object_type == ObjectType::CPU_MEMORY &&
IsOpenClTextureOrBuffer(output.object_type)) ||
(output.object_type == ObjectType::CPU_MEMORY &&
IsOpenClTextureOrBuffer(input.object_type)));
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
region_ = CalculateTextureRegion(
input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def
: input_def);
queue_ = environment->queue();
return OkStatus();
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto cpu_input = absl::get_if<CpuMemory>(&input_obj);
auto cpu_output = absl::get_if<CpuMemory>(&output_obj);
if (cpu_input) {
auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
if (texture_output) {
return queue_->EnqueueWriteImage(
texture_output->memobj, int3(region_[0], region_[1], region_[2]),
cpu_input->data);
}
auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
if (buffer_output) {
return queue_->EnqueueWriteBuffer(
buffer_output->memobj, cpu_input->size_bytes, cpu_input->data);
}
} else if (cpu_output) {
auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
if (texture_input) {
return queue_->EnqueueReadImage(
texture_input->memobj, int3(region_[0], region_[1], region_[2]),
cpu_output->data);
}
auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
if (buffer_input) {
return queue_->EnqueueReadBuffer(
buffer_input->memobj, cpu_output->size_bytes, cpu_output->data);
}
}
return UnimplementedError("Unsupported conversion");
}
private:
std::array<size_t, 3> region_;
};
class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
public:
explicit OpenClTensorConverterBuilder(Environment* environment)
: environment_(environment) {}
bool IsSupported(const TensorObjectDef& input,
const TensorObjectDef& output) final {
const auto& input_def = input.object_def;
const auto& output_def = output.object_def;
return input.dimensions == output.dimensions &&
(TrivialCopier::IsSupported(input_def, output_def) ||
CpuCopier::IsSupported(input_def, output_def) ||
FromTensorConverter::IsSupported(input_def, output_def) ||
ToTensorConverter::IsSupported(input_def, output_def));
}
Status MakeConverter(
const TensorObjectDef& input, const TensorObjectDef& output,
std::unique_ptr<TensorObjectConverter>* converter) final {
std::unique_ptr<OpenClConverterImpl> impl;
const auto& input_def = input.object_def;
const auto& output_def = output.object_def;
if (TrivialCopier::IsSupported(input_def, output_def)) {
impl = absl::make_unique<TrivialCopier>();
} else if (CpuCopier::IsSupported(input_def, output_def)) {
impl = absl::make_unique<CpuCopier>();
} else if (FromTensorConverter::IsSupported(input_def, output_def)) {
impl = absl::make_unique<FromTensorConverter>();
} else if (ToTensorConverter::IsSupported(input_def, output_def)) {
impl = absl::make_unique<ToTensorConverter>();
} else {
return UnimplementedError("Unsupported conversion");
}
RETURN_IF_ERROR(impl->Init(input, output, environment_));
*converter = std::move(impl);
return OkStatus();
}
Environment* environment_;
};
} // namespace
std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
Environment* environment) {
return absl::make_unique<OpenClTensorConverterBuilder>(environment);
}
} // namespace cl
} // namespace gpu
} // namespace tflite
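The generated to_tensor/from_tensor kernels map between a flat BHWC buffer and DHWC4 slices of four channels: channel c lives in slice d = c / 4, and each slice pixel sits at element (d * height + y) * width + x of the packed storage. A minimal CPU sketch of that packing, assuming batch 1 and zero fill for the channel remainder (sizes and the function name are illustrative only):

#include <cstdio>
#include <vector>

// Packs a BHWC (batch == 1) float buffer into DHWC4 slices of 4 floats each.
// Index arithmetic mirrors the generated kernels:
//   bhwc index  = (y * width + x) * channels + c
//   dhwc4 index = ((d * height + y) * width + x) * 4 + (c % 4),  d = c / 4
std::vector<float> PackBhwcToDhwc4(const std::vector<float>& src, int height,
                                   int width, int channels) {
  const int depth = (channels + 3) / 4;
  std::vector<float> dst(depth * height * width * 4, 0.0f);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      for (int c = 0; c < channels; ++c) {
        const int d = c / 4;
        dst[((d * height + y) * width + x) * 4 + (c % 4)] =
            src[(y * width + x) * channels + c];
      }
    }
  }
  return dst;
}

int main() {
  // 1x2x2x3 example tensor with values 0..11.
  std::vector<float> bhwc(12);
  for (int i = 0; i < 12; ++i) bhwc[i] = static_cast<float>(i);
  std::vector<float> dhwc4 =
      PackBhwcToDhwc4(bhwc, /*height=*/2, /*width=*/2, /*channels=*/3);
  for (float v : dhwc4) std::printf("%.0f ", v);
  std::printf("\n");  // prints 0 1 2 0 3 4 5 0 6 7 8 0 9 10 11 0
  return 0;
}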

View File

@ -0,0 +1,49 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_
#include <memory>
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/spi.h"
namespace tflite {
namespace gpu {
namespace cl {
class TensorObjectConverterBuilder {
public:
virtual ~TensorObjectConverterBuilder() = default;
virtual bool IsSupported(const TensorObjectDef& input,
const TensorObjectDef& output) = 0;
virtual Status MakeConverter(
const TensorObjectDef& input, const TensorObjectDef& output,
std::unique_ptr<TensorObjectConverter>* converter) = 0;
};
// Supports conversions between BHWC and the internal OpenCL tensor
// representations, for both F16 and F32 data (including conversion between
// the two precisions).
std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
Environment* environment);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_

View File

@ -0,0 +1,282 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
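// The generated kernel produces one output pixel for one destination slice
// per work item. Instead of scattering from the input, it gathers: for the
// output position it derives which kernel taps line up with the stride grid
// (via f_offset and inner_size) and accumulates the matching source pixels.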
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f0.s0123; \\\n";
c += "R += S.y * f0.s4567; \\\n";
c += "R += S.z * f0.s89ab; \\\n";
c += "R += S.w * f0.scdef; \n";
} else {
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f[0]; \\\n";
c += "R += S.y * f[1]; \\\n";
c += "R += S.z * f[2]; \\\n";
c += "R += S.w * f[3]; \n";
}
break;
case CalculationsPrecision::F32_F16:
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
"f0.s89ab + S.w * f0.scdef);\n";
} else {
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f[0] + S.y * f[1]";
c += "+ S.z * f[2] + S.w * f[3]);\n";
}
break;
}
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " __global FLT16* filters, \n";
c += " __global FLT4* biases";
} else {
c += " __read_only image2d_t filters, \n";
c += " __read_only image2d_t biases";
}
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 kernel_size, \n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int2 k_offset, \n";
c += " int2 inner_size, \n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int f_base = Z * src_size.w * kernel_size.x * kernel_size.y;\n";
}
c += " int2 offset = (int2)(X, Y) + padding - k_offset;\n";
c += " offset.x = offset.x % stride.x;\n";
c += " offset.y = offset.y % stride.y;\n";
c += " offset += stride;\n";
c += " offset.x = offset.x % stride.x;\n";
c += " offset.y = offset.y % stride.y;\n";
c += " int2 f_offset;\n";
c += " f_offset.x = offset.x == 0 ? 0 : stride.x - offset.x;\n";
c += " f_offset.y = offset.y == 0 ? 0 : stride.y - offset.y;\n";
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " for (int ky = 0; ky < inner_size.y; ++ky) {\n";
c += " int index_y = ky * stride.y + f_offset.y;\n";
c += " bool inside_y = index_y < kernel_size.y;\n";
c += " int s_y = (Y + index_y + padding.y - k_offset.y) / stride.y;\n";
c += " index_y = kernel_size.y - 1 - index_y;\n";
c += " bool out_y = s_y < 0 || s_y >= src_size.y;\n";
c += " for (int kx = 0; kx < inner_size.x; ++kx) {\n";
c += " int index_x = kx * stride.x + f_offset.x;\n";
c += " bool inside_kernel = index_x < kernel_size.x && inside_y;\n";
c += " int s_x = (X + index_x + padding.x - k_offset.x) / stride.x;\n";
c += " index_x = kernel_size.x - 1 - index_x;\n";
c += " bool out_x = s_x < 0 || s_x >= src_size.x;\n";
c += " int kernel_index = index_y * kernel_size.x + index_x;\n";
c += " if (inside_kernel && !(out_x || out_y)) {\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int f_offset = f_base + kernel_index * src_size.w;\n";
} else {
c += " int x_c = kernel_index * src_size.w * 4;\n";
}
c += " for (int l = 0; l < src_size.w; ++l) {\n";
c += " FLT4 src =" + src_tensor.Read3D("s_x", "s_y", "l") + ";\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " FLT16 f0 = filters[f_offset]; f_offset++;\n";
} else {
c += " FLT4 f[4];\n";
c += " f[0] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[1] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[2] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[3] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
}
c += " CONV(r0, src);\n";
c += " }\n";
c += " }\n";
c += " }\n";
c += " }\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4("Z") + ";\n";
c += " FLT4 res0 = TO_FLT4(r0) + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += PostProcess(linked_operations, "res0", "Z", "address");
c += " " + dst_tensor.Write3D("res0", "address") + "\n";
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.stride.w, attr.stride.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
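// inner_size_ = ceil(kernel_size / stride): the number of kernel taps that
// can hit a single output pixel along each axis. kernel_offset_ is
// kernel_size - 1, used by the generated code to apply the flipped kernel as
// a gather over the input.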
const int inner_size_x = (kernel_size_.x - 1) / stride_.x + 1;
const int inner_size_y = (kernel_size_.y - 1) / stride_.y + 1;
inner_size_ = int2(inner_size_x, inner_size_y);
kernel_offset_ = int2(kernel_size_.x - 1, kernel_size_.y - 1);
}
ConvolutionTransposed::ConvolutionTransposed(ConvolutionTransposed&& kernel)
: GPUOperation(std::move(kernel)),
biases_(std::move(kernel.biases_)),
weights_tex2d_(std::move(kernel.weights_tex2d_)),
weights_buf_(std::move(kernel.weights_buf_)),
weights_(kernel.weights_),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_offset_(kernel.kernel_offset_),
inner_size_(kernel.inner_size_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConvolutionTransposed& ConvolutionTransposed::operator=(
ConvolutionTransposed&& kernel) {
if (this != &kernel) {
biases_ = std::move(kernel.biases_);
weights_tex2d_ = std::move(kernel.weights_tex2d_);
weights_buf_ = std::move(kernel.weights_buf_);
std::swap(weights_, kernel.weights_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_offset_, kernel.kernel_offset_);
std::swap(inner_size_, kernel.inner_size_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConvolutionTransposed::Compile(const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposed::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_offset_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(inner_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvolutionTransposed::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposed::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvolutionTransposed::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvolutionTransposed(const CreationContext& creation_context,
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result) {
*result = ConvolutionTransposed(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,190 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposed(ConvolutionTransposed&& kernel);
ConvolutionTransposed& operator=(ConvolutionTransposed&& kernel);
ConvolutionTransposed(const ConvolutionTransposed&) = delete;
ConvolutionTransposed& operator=(const ConvolutionTransposed&) = delete;
private:
friend Status CreateConvolutionTransposed(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result);
explicit ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
LinearStorage biases_;
Texture2D weights_tex2d_;
Buffer weights_buf_;
cl_mem weights_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 kernel_offset_;
int2 inner_size_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(16, 8, 1);
};
template <DataType T>
Status ConvolutionTransposed::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int elements_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
bool is_buffer_storage =
definition_.GetPrimaryStorageType() == TensorStorageType::BUFFER;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), src_depth * kernel_x * kernel_y * 4,
dst_depth, gpu_data.data(), context, &weights_tex2d_));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), src_depth * kernel_x * kernel_y * 4,
dst_depth, gpu_data.data(), context, &weights_tex2d_));
}
}
if (is_buffer_storage) {
weights_ = weights_buf_.GetMemoryPtr();
} else {
weights_ = weights_tex2d_.GetMemoryPtr();
}
return OkStatus();
}
template <DataType S, typename T>
void ConvolutionTransposed::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
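// Weights are laid out tap by tap in (dst_slice, ky, kx, src_slice) order,
// four FLT4s per tap. Each 4x4 block is stored transposed so that entry i
// holds the four output channels multiplied by input lane i, which is what
// lets the generated CONV macro compute R += S.x * f[0] + ... directly.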
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int counter = 0;
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int s = 0; s < src_depth; ++s) {
T filters[4];
for (int j = 0; j < 4; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + j;
const int d_ch = d * 4 + i;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
T filters_new[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
filters_new[i][j] = filters[j][i];
}
}
dst[counter++] = filters_new[0];
dst[counter++] = filters_new[1];
dst[counter++] = filters_new[2];
dst[counter++] = filters_new[3];
}
}
}
}
}
Status CreateConvolutionTransposed(const CreationContext& creation_context,
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_

View File

@ -0,0 +1,254 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases, int src_depth, int dst_depth,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
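// CONV accumulates a 4x4 matrix-vector product (four filter vectors per source
// value). For F32/F16 the accumulation happens directly in ACCUM_FLT4; for
// F32_F16 the products are computed in half precision and the partial sum is
// converted to float before being added to the accumulator.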
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \\\n";
c += " R += SRC.w * F[i + 3]; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += "+ SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters, \n";
c += biases.GetDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= src_size.x || Y >= src_size.y) return;\n";
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
c += " ACCUM_FLT4 r" + layer + "[2][2];\n";
c += " r" + layer + "[0][0] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[0][1] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[1][0] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[1][1] = (ACCUM_FLT4)(0.0f);\n";
}
int filters_index = 0;
for (int s = 0; s < src_depth; ++s) {
const std::string z = std::to_string(s);
c += " {\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool x_in = X + 1 < src_size.x;\n";
c += " bool y_in = Y + 1 < src_size.y;\n";
c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
c += " FLT4 src1 = (FLT4)(0.0);\n";
c += " FLT4 src2 = (FLT4)(0.0);\n";
c += " FLT4 src3 = (FLT4)(0.0);\n";
c += " if (x_in) {\n";
c += " src1 = " + src_tensor.Read3D("X + 1", "Y", z) + ";\n";
c += " }\n";
c += " if (y_in) {\n";
c += " src2 = " + src_tensor.Read3D("X", "Y + 1", z) + ";\n";
c += " }\n";
c += " if (x_in && y_in) {\n";
c += " src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
c += " }\n";
} else {
c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
c += " FLT4 src1 = " + src_tensor.Read3D("X + 1", "Y", z) + ";\n";
c += " FLT4 src2 = " + src_tensor.Read3D("X", "Y + 1", z) + ";\n";
c += " FLT4 src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
}
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
const std::string f_offset = std::to_string(filters_index);
filters_index++;
c += " {\n";
c += " __constant FLT4* L0 = filters + 36 * " + f_offset + ";\n";
c += " CONV(r" + layer + "[0][0], src0, L0, 0);\n";
c += " CONV(r" + layer + "[0][1], src0, L0, 4);\n";
c += " CONV(r" + layer + "[0][1], src1, L0, 8);\n";
c += " CONV(r" + layer + "[1][0], src0, L0, 12);\n";
c += " CONV(r" + layer + "[1][0], src2, L0, 16);\n";
c += " CONV(r" + layer + "[1][1], src0, L0, 20);\n";
c += " CONV(r" + layer + "[1][1], src1, L0, 24);\n";
c += " CONV(r" + layer + "[1][1], src2, L0, 28);\n";
c += " CONV(r" + layer + "[1][1], src3, L0, 32);\n";
c += " }\n";
}
c += " }\n";
}
c += " X *= 2;\n";
c += " Y *= 2;\n";
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
c += " {\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4(layer) + ";\n";
for (int y = 0; y < 2; ++y) {
for (int x = 0; x < 2; ++x) {
c += " {\n";
c += " FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
"][" + std::to_string(x) + "]) + bias_val;\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + std::to_string(x),
"Y + " + std::to_string(y), layer) +
"\n";
c += PostProcess(linked_operations, "result", layer, "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
}
}
c += " }\n";
}
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
ConvolutionTransposed3x3Thin&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
ConvolutionTransposed3x3Thin&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, IntegralDivideRoundUp(src_channels_, 4),
IntegralDivideRoundUp(dst_channels_, 4), linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposed3x3Thin::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
const int grid_x = src_[0]->Width();
const int grid_y = src_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposed3x3Thin::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvolutionTransposed3x3Thin::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvolutionTransposed3x3ThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
return device.IsAdreno() && attr.weights.shape.o <= 8 &&
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.stride.w == 2 && attr.stride.h == 2 &&
attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}
Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result) {
if (!IsConvolutionTransposed3x3ThinSupported(*creation_context.device,
attr)) {
return InvalidArgumentError(
"ConvolutionTransposed3x3Thin doesn't support this attributes");
}
*result = ConvolutionTransposed3x3Thin(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,162 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposed3x3Thin : public GPUOperation {
public:
ConvolutionTransposed3x3Thin() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposed3x3Thin(ConvolutionTransposed3x3Thin&& operation);
ConvolutionTransposed3x3Thin& operator=(
ConvolutionTransposed3x3Thin&& operation);
ConvolutionTransposed3x3Thin(const ConvolutionTransposed3x3Thin&) = delete;
ConvolutionTransposed3x3Thin& operator=(const ConvolutionTransposed3x3Thin&) =
delete;
private:
friend Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvolutionTransposed3x3Thin::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;  // This operation supports only 3x3 kernels.
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
const int flt4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
}
}
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;
const int kernel_y = 3;
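// remap reorders the nine 3x3 taps into the order in which the generated
// kernel consumes them (one CONV block per tap in
// GenerateConvolutionTransposedCode).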
const int remap[9] = {4, 5, 3, 7, 1, 8, 6, 2, 0};
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
const int kernel_index = remap[y * kernel_x + x];
const int kernel_index_x = kernel_index % kernel_x;
const int kernel_index_y = kernel_index / kernel_x;
T filters[4];
for (int j = 0; j < 4; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = d * 4 + j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index = weights.shape.LinearIndex(
{d_ch, kernel_index_y, kernel_index_x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
dst[counter++] = filters[2];
dst[counter++] = filters[3];
}
}
}
}
}
bool IsConvolutionTransposed3x3ThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr);
Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_

View File

@ -0,0 +1,110 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3ThinSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 3, 3, 1);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation;
ASSERT_OK(CreateConvolutionTransposed3x3Thin(creation_context_, op_def,
attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 1.0f, 1.0f, 1.0f, 2.0f, 6.0f, 4.0f, 4.0f,
2.0f, 5.0f, 3.0f, 3.0f, 2.0f, 5.0f, 3.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3Thin) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 3, 3, 1);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation;
ASSERT_OK(CreateConvolutionTransposed3x3Thin(creation_context_, op_def,
attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{0.5f, 4.5f, 5.5f, 6.5f, 4.5f, 16.5f, 14.5f, 18.5f, 10.5f,
24.5f, 15.5f, 18.5f, 16.5f, 39.5f, 24.5f, 27.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,114 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposedSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation;
ASSERT_OK(CreateConvolutionTransposed(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposed) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation;
ASSERT_OK(CreateConvolutionTransposed(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{2.5f, 4.5f, 8.5f, 18.5f, 6.5f, 8.5f, 28.5f, 38.5f, 14.5f,
32.5f, 20.5f, 46.5f, 50.5f, 68.5f, 72.5f, 98.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,247 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int src_depth, int dst_channels, const int2& kernel_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
const std::string channel_x = dst_channels == 1 ? "" : ".x";
const std::vector<std::string> channel = {channel_x, ".y", ".z", ".w"};
const std::string type_postfix =
dst_channels == 1 ? "" : std::to_string(dst_channels);
std::string accum_type;
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
accum_type = "float" + type_postfix;
break;
case CalculationsPrecision::F16:
accum_type = "half" + type_postfix;
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " FLT4 bias_value \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= src_size.x || Y >= src_size.y) return;\n";
c += " " + accum_type + " r[" + std::to_string(kernel_size.y) + "][" +
std::to_string(kernel_size.x) + "];\n";
c += " {\n";
c += " FLT4 src = " + src_tensor.Read3D("X", "Y", "0") + ";\n";
int index = 0;
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
std::string r_s =
" r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
for (int d = 0; d < dst_channels; ++d) {
c += r_s + channel[d] + " = TO_ACCUM_FLT(dot(src, filters[" +
std::to_string(index) + "]));\n";
index++;
}
}
}
c += " }\n";
for (int i = 1; i < src_depth; ++i) {
if (precision != CalculationsPrecision::F32_F16) {
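// This guard is always true (X < src_size.x is ensured by the early return
// above); it is presumably kept as a compiler hint, e.g. to limit register
// pressure.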
c += " if (X < src_size.x + " + std::to_string(i + 1) + ") {\n";
} else {
c += " {\n";
}
c += " FLT4 src = " + src_tensor.Read3D("X", "Y", std::to_string(i)) +
";\n";
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
std::string r_s =
" r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
for (int d = 0; d < dst_channels; ++d) {
c += r_s + channel[d] + " += TO_ACCUM_FLT(dot(src, filters[" +
std::to_string(index) + "]));\n";
index++;
}
}
}
c += " }\n";
}
c += " X *= " + std::to_string(kernel_size.x) + ";\n";
c += " Y *= " + std::to_string(kernel_size.x) + ";\n";
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
if (precision != CalculationsPrecision::F32_F16) {
c += " if (X + " + std::to_string(x) + " < dst_size.x && ";
c += "Y + " + std::to_string(y) + " < dst_size.y) {\n";
} else {
c += " {\n";
}
c += " FLT4 result = bias_value;\n";
for (int d = 0; d < dst_channels; ++d) {
c += " result" + channel[d] + " += r[" + std::to_string(y) + "][" +
std::to_string(x) + "]" + channel[d] + ";\n";
}
c += " " +
dst_tensor.GetAddress("address", "X + " + std::to_string(x),
"Y + " + std::to_string(y), "0") +
"\n";
c += PostProcess(linked_operations, "result", "0", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
float4 bias_value(0.0f);
for (int i = 0; i < attr.weights.shape.o; ++i) {
bias_value[i] = attr.bias.data[i];
}
bias_value_ = FLT4(definition_.precision, bias_value);
}
ConvolutionTransposedThin::ConvolutionTransposedThin(
ConvolutionTransposedThin&& operation)
: GPUOperation(std::move(operation)),
weights_buf_(std::move(operation.weights_buf_)),
bias_value_(std::move(operation.bias_value_)),
kernel_size_(operation.kernel_size_),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
ConvolutionTransposedThin&& operation) {
if (this != &operation) {
weights_buf_ = std::move(operation.weights_buf_);
bias_value_ = std::move(operation.bias_value_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, IntegralDivideRoundUp(src_channels_, 4),
dst_channels_, kernel_size_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposedThin::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(bias_value_));
return OkStatus();
}
int3 ConvolutionTransposedThin::GetGridSize() const {
const int grid_x = src_[0]->Width();
const int grid_y = src_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposedThin::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvolutionTransposedThin::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvolutionTransposedThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
return device.IsAdreno() && attr.weights.shape.o <= 4 &&
attr.weights.shape.w == attr.stride.w &&
attr.weights.shape.h == attr.stride.h &&
attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0;
}
Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result) {
if (!IsConvolutionTransposedThinSupported(*creation_context.device, attr)) {
return InvalidArgumentError(
"ConvolutionTransposedThin doesn't support this attributes");
}
*result = ConvolutionTransposedThin(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,148 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposedThin : public GPUOperation {
public:
ConvolutionTransposedThin() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposedThin(ConvolutionTransposedThin&& operation);
ConvolutionTransposedThin& operator=(ConvolutionTransposedThin&& operation);
ConvolutionTransposedThin(const ConvolutionTransposedThin&) = delete;
ConvolutionTransposedThin& operator=(const ConvolutionTransposedThin&) =
delete;
private:
friend Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_buf_;
FLT4 bias_value_;
int2 kernel_size_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvolutionTransposedThin::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int elements_count =
kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_buf_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_buf_);
}
}
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int counter = 0;
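// For every (src slice, y, x) tap, one vector is written per output channel;
// its lanes hold the weights coming from the four source channels of the
// slice, zero-padded past src_channels_.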
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
std::vector<T> filters(dst_channels_);
for (int j = 0; j < dst_channels_; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
filters[j][i] = 0.0f;
}
}
}
for (int j = 0; j < dst_channels_; ++j) {
dst[counter++] = filters[j];
}
}
}
}
}
bool IsConvolutionTransposedThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr);
Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_

View File

@ -0,0 +1,114 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposedThinSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation;
ASSERT_OK(CreateConvolutionTransposedThin(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposedThin) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation;
ASSERT_OK(CreateConvolutionTransposedThin(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{2.5f, 4.5f, 8.5f, 18.5f, 6.5f, 8.5f, 28.5f, 38.5f, 14.5f,
32.5f, 20.5f, 46.5f, 50.5f, 68.5f, 72.5f, 98.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,257 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsSpecializedCase(int channel_multiplier) {
return channel_multiplier == 1 || channel_multiplier == 2 ||
channel_multiplier == 4;
}
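// GetSrcValue emits the code that loads the source value feeding output
// slice Z: for multiplier 1 the slices match directly, for 2 and 4 the loaded
// channels are replicated into the lanes, and the generic path computes the
// lane mapping at run time from channel_multiplier.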
std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
int channel_multiplier) {
std::string c;
if (channel_multiplier == 1) {
c +=
" FLT4 src_final =" + src_tensor.Read3D("x_c", "y_c", "Z") + ";\n";
} else if (channel_multiplier == 2) {
c += " int z_layer = Z / 2;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " FLT2 t0 = Z % 2 == 0 ? src.xy : src.zw;\n";
c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n";
} else if (channel_multiplier == 4) {
c += " int z_layer = Z / 4;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " FLT t0 = src.x;\n";
c += " int reminder = Z % 4;\n";
c += " if (reminder == 1) t0 = src.y;\n";
c += " if (reminder == 2) t0 = src.z;\n";
c += " if (reminder == 3) t0 = src.w;\n";
c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n";
} else {
c += " int z_layer = Z / channel_multiplier;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " int z_offset = (Z % channel_multiplier) * 4;\n";
c += " FLT4 src_final;\n";
c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
c += " src_final.x = temp_arr[(z_offset + 0) / channel_multiplier];\n";
c += " src_final.y = temp_arr[(z_offset + 1) / channel_multiplier];\n";
c += " src_final.z = temp_arr[(z_offset + 2) / channel_multiplier];\n";
c += " src_final.w = temp_arr[(z_offset + 3) / channel_multiplier];\n";
}
return c;
}
std::string GenerateDepthWiseConvolutionCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases, int channel_multiplier,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " __global FLT4* filters, \n";
} else {
c += " __read_only image2d_t filters, \n";
}
c += biases.GetDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 kernel_size, \n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int2 dilation, \n";
if (!IsSpecializedCase(channel_multiplier)) {
c += " int channel_multiplier, \n";
}
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " int x_offseted = X * stride.x - padding.x;\n";
c += " int y_offseted = Y * stride.y - padding.y;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int fx_c = Z * kernel_size.x * kernel_size.y;\n";
} else {
c += " int fx_c = 0;\n";
}
c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
c += " int y_c = y_offseted + ky * dilation.y;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
}
c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
c += " int x_c = x_offseted + kx * dilation.x;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
c += " if (!outside_x && !outside_y) {\n";
c += GetSrcValue(src_tensor, channel_multiplier);
c += " FLT4 f = filters[fx_c];\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " };\n";
c += " fx_c++;\n";
} else {
c += GetSrcValue(src_tensor, channel_multiplier);
c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z)); "
"fx_c++;\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
}
c += " }\n";
c += " }\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4("Z") + ";\n";
c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += PostProcess(linked_operations, "res0", "Z", "address");
c += " " + dst_tensor.Write3D("res0", "address") + "\n";
c += "}\n";
return c;
}
} // namespace
DepthWiseConvolution::DepthWiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
channel_multiplier_(attr.weights.shape.o),
work_group_size_(8, 8, 1) {}
DepthWiseConvolution::DepthWiseConvolution(DepthWiseConvolution&& operation)
: GPUOperation(std::move(operation)),
weights_tex2d_(std::move(operation.weights_tex2d_)),
weights_buf_(std::move(operation.weights_buf_)),
weights_(operation.weights_),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
channel_multiplier_(operation.channel_multiplier_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
DepthWiseConvolution& DepthWiseConvolution::operator=(
DepthWiseConvolution&& operation) {
if (this != &operation) {
weights_tex2d_ = std::move(operation.weights_tex2d_);
weights_buf_ = std::move(operation.weights_buf_);
std::swap(weights_, operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(channel_multiplier_, operation.channel_multiplier_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status DepthWiseConvolution::Compile(const CreationContext& creation_context) {
const auto code = GenerateDepthWiseConvolutionCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, channel_multiplier_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status DepthWiseConvolution::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
if (!IsSpecializedCase(channel_multiplier_)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_)));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 DepthWiseConvolution::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status DepthWiseConvolution::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status DepthWiseConvolution::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result) {
*result = DepthWiseConvolution(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,175 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class DepthWiseConvolution : public GPUOperation {
public:
DepthWiseConvolution() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
DepthWiseConvolution(DepthWiseConvolution&& operation);
DepthWiseConvolution& operator=(DepthWiseConvolution&& operation);
DepthWiseConvolution(const DepthWiseConvolution&) = delete;
DepthWiseConvolution& operator=(const DepthWiseConvolution&) = delete;
private:
friend Status CreateDepthWiseConvolution(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result);
explicit DepthWiseConvolution(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_tex2d_;
Buffer weights_buf_;
cl_mem weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int channel_multiplier_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status DepthWiseConvolution::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int elements_count = kernel_x * kernel_y * dst_depth;
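// One float4/half4 per group of four depthwise output channels per (y, x)
// tap; depending on the primary storage type the data goes into a
// (kernel_x * kernel_y) x dst_depth texture or a flat read-only buffer.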
bool is_buffer_storage =
definition_.GetPrimaryStorageType() == TensorStorageType::BUFFER;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_depth,
gpu_data.data(), context, &weights_tex2d_));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_depth,
gpu_data.data(), context, &weights_tex2d_));
}
}
if (is_buffer_storage) {
weights_ = weights_buf_.GetMemoryPtr();
} else {
weights_ = weights_tex2d_.GetMemoryPtr();
}
return OkStatus();
}
template <DataType S, typename T>
void DepthWiseConvolution::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
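// Each written vector packs the weights of four consecutive depthwise output
// channels for one (y, x) tap; channels beyond dst_channels are zero-filled.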
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
T filter_val;
for (int i = 0; i < 4; ++i) {
const int d_ch = d * 4 + i;
if (d_ch < dst_channels) {
const int f_index = weights.shape.LinearIndex(
{d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
filter_val[i] = weights.data[f_index];
} else {
filter_val[i] = 0.0f;
}
}
dst[counter++] = filter_val;
}
}
}
}
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_

View File

@ -0,0 +1,249 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3x3_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateDepthWiseConvCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters\n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0) * 2;\n";
c += " int Y = get_global_id(1) * 2;\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
c += " FLT4 f0 = READ_IMAGE(filters, smp_none, (int2)(0, Z));\n";
c += " FLT4 f1 = READ_IMAGE(filters, smp_none, (int2)(1, Z));\n";
c += " FLT4 f2 = READ_IMAGE(filters, smp_none, (int2)(2, Z));\n";
c += " FLT4 f3 = READ_IMAGE(filters, smp_none, (int2)(3, Z));\n";
c += " FLT4 f4 = READ_IMAGE(filters, smp_none, (int2)(4, Z));\n";
c += " FLT4 f5 = READ_IMAGE(filters, smp_none, (int2)(5, Z));\n";
c += " FLT4 f6 = READ_IMAGE(filters, smp_none, (int2)(6, Z));\n";
c += " FLT4 f7 = READ_IMAGE(filters, smp_none, (int2)(7, Z));\n";
c += " FLT4 f8 = READ_IMAGE(filters, smp_none, (int2)(8, Z));\n";
c += " \n";
c += " FLT4 s0;\n";
c += " FLT4 s1;\n";
c += " FLT4 s2;\n";
c += " FLT4 s3;\n";
c += " \n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y - 1", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y - 1", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y - 1", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y - 1", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f0 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f1 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f0 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f2 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f1 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f2 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f3 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f0 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f4 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f3 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f1 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f0 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f5 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f4 * s2);\n";
c += " r2 += TO_ACCUM_TYPE(f2 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f1 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f5 * s3);\n";
c += " r3 += TO_ACCUM_TYPE(f2 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y + 1", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y + 1", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y + 1", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y + 1", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f6 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f3 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f7 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f6 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f4 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f3 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f8 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f7 * s2);\n";
c += " r2 += TO_ACCUM_TYPE(f5 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f4 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f8 * s3);\n";
c += " r3 += TO_ACCUM_TYPE(f5 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y + 2", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y + 2", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y + 2", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y + 2", "Z") + ";\n";
c += " r2 += TO_ACCUM_TYPE(f6 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f7 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f6 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f8 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f7 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f8 * s3);\n";
c += " }\n";
c += " FLT4 bias = READ_IMAGE(filters, smp_none, (int2)(9, Z));\n";
c += " r0 += TO_ACCUM_TYPE(bias);\n";
c += " r1 += TO_ACCUM_TYPE(bias);\n";
c += " r2 += TO_ACCUM_TYPE(bias);\n";
c += " r3 += TO_ACCUM_TYPE(bias);\n";
c += " if(X + 0 < dst_size.x && Y + 0 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r0);\n";
c += " " + dst_tensor.GetAddress("address", "X + 0", "Y + 0", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 1 < dst_size.x && Y + 0 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r1);\n";
c += " " + dst_tensor.GetAddress("address", "X + 1", "Y + 0", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 0 < dst_size.x && Y + 1 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r2);\n";
c += " " + dst_tensor.GetAddress("address", "X + 0", "Y + 1", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 1 < dst_size.x && Y + 1 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r3);\n";
c += " " + dst_tensor.GetAddress("address", "X + 1", "Y + 1", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " }\n";
return c;
}
} // namespace
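// Editor's illustrative sketch (not part of the original file): a scalar
// reference of the 2x2 output tile computed per work item by the generated
// kernel above. f[9] are the filter taps f0..f8, in[4][4] is the input patch
// whose top-left corner is (X - 1, Y - 1), and out[dy][dx] corresponds to the
// accumulators r0, r1, r2, r3 (bias not included). The function name is
// hypothetical.
inline void DepthWise3x3TileReferenceSketch(const float f[9],
                                            const float in[4][4],
                                            float out[2][2]) {
  for (int dy = 0; dy < 2; ++dy) {
    for (int dx = 0; dx < 2; ++dx) {
      float acc = 0.0f;
      for (int ky = 0; ky < 3; ++ky) {
        for (int kx = 0; kx < 3; ++kx) {
          acc += f[ky * 3 + kx] * in[dy + ky][dx + kx];
        }
      }
      out[dy][dx] = acc;
    }
  }
}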
DepthWiseConv3x3Texture::DepthWiseConv3x3Texture(const OperationDef& definition)
: GPUOperation(definition) {}
DepthWiseConv3x3Texture::DepthWiseConv3x3Texture(
DepthWiseConv3x3Texture&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
DepthWiseConv3x3Texture& DepthWiseConv3x3Texture::operator=(
DepthWiseConv3x3Texture&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status DepthWiseConv3x3Texture::Compile(
const CreationContext& creation_context) {
std::string code = GenerateDepthWiseConvCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status DepthWiseConv3x3Texture::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 DepthWiseConv3x3Texture::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status DepthWiseConv3x3Texture::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status DepthWiseConv3x3Texture::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsDepthWiseConv3x3TextureSupported(
const DepthwiseConvolution2DAttributes& attr) {
return attr.weights.shape.o == 1 && attr.dilations.w == 1 &&
attr.dilations.h == 1 && attr.weights.shape.w == 3 &&
attr.weights.shape.h == 3 && attr.strides.w == 1 &&
attr.strides.h == 1 && attr.padding.prepended.w == 1 &&
attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
attr.padding.appended.h == 1;
}
Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result) {
if (!IsDepthWiseConv3x3TextureSupported(attr)) {
return InvalidArgumentError(
"DepthWiseConv3x3Texture doesn't support this attributes");
}
*result = DepthWiseConv3x3Texture(definition);
RETURN_IF_ERROR(result->UploadWeightsAndBiases(attr.weights, attr.bias,
creation_context.context));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,145 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_
#include <memory>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class DepthWiseConv3x3Texture : public GPUOperation {
public:
DepthWiseConv3x3Texture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
DepthWiseConv3x3Texture(DepthWiseConv3x3Texture&& kernel);
DepthWiseConv3x3Texture& operator=(DepthWiseConv3x3Texture&& kernel);
DepthWiseConv3x3Texture(const DepthWiseConv3x3Texture&) = delete;
DepthWiseConv3x3Texture& operator=(const DepthWiseConv3x3Texture&) = delete;
private:
explicit DepthWiseConv3x3Texture(const OperationDef& definition);
template <DataType T>
Status UploadWeightsAndBiases(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
friend Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result);
template <DataType S, typename T>
void RearrangeWeightsAndBiasesData(
const ::tflite::gpu::Tensor<OHWI, S>& weights,
const ::tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status DepthWiseConv3x3Texture::UploadWeightsAndBiases(
const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = 10; // 3x3 kernel + 1 bias
int texture_height = src_depth;
const int elements_count = texture_width * texture_height;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(definition_.GetDataType(), texture_width,
texture_height, gpu_data.data(), context,
&weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(definition_.GetDataType(), texture_width,
texture_height, gpu_data.data(), context,
&weights_);
}
}
template <DataType S, typename T>
void DepthWiseConv3x3Texture::RearrangeWeightsAndBiasesData(
const ::tflite::gpu::Tensor<OHWI, S>& weights,
const ::tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < 3; ++y) {
for (int x = 0; x < 3; ++x) {
T filter_val;
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
if (s_ch < weights.shape.i) {
const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
filter_val[i] = weights.data[f_index];
} else {
filter_val[i] = 0.0f;
}
}
dst[counter++] = filter_val;
}
}
T bias_val;
for (int i = 0; i < 4; ++i) {
const int dst_ch = s * 4 + i;
bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
}
dst[counter++] = bias_val;
}
}
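// Editor's illustrative sketch (not part of the original header): the texel
// coordinates implied by the packing above, assuming the 10-texel-wide layout
// created in UploadWeightsAndBiases (9 filter taps followed by 1 bias per
// source slice). Function names are hypothetical.
inline int2 FilterTapTexelSketch(int src_slice, int ky, int kx) {
  return int2(ky * 3 + kx, src_slice);  // taps occupy columns 0..8
}
inline int2 BiasTexelSketch(int src_slice) {
  return int2(9, src_slice);  // the bias is the 10th texel in the row
}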
bool IsDepthWiseConv3x3TextureSupported(
const DepthwiseConvolution2DAttributes& attr);
Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_

View File

@ -0,0 +1,111 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3x3_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, DepthWiseConv3x3TextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 3, 2);
attr.weights.data = {0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConv3x3Texture operation;
ASSERT_OK(CreateDepthWiseConv3x3Texture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {6.0f, 16.0f, 8.0f, 16.0f, 10.0f,
16.0f, 12.0f, 16.0f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConv3x3Texture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 3, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f,
3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConv3x3Texture operation;
ASSERT_OK(CreateDepthWiseConv3x3Texture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {40.5f, 67.5f, 16.5f, 35.5f, 40.5f,
67.5f, 16.5f, 35.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,148 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, DepthWiseConvSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 1, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {4.0f, 6.0f, 8.0f, 10.0f, 4.0f,
6.0f, 8.0f, 10.0f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConvNoMultiplier) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 1, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {16.5f, 27.5f, 28.5f, 43.5f, 8.5f,
15.5f, 12.5f, 23.5f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConvMultiplier2) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 3, 1, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
attr.bias.shape = Linear(4);
attr.bias.data = {0.5f, -0.5f, 1.0f, -1.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 4), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{16.5f, 39.5f, 29.0f, 63.0f, 28.5f, 75.5f, 45.0f, 103.0f,
8.5f, 31.5f, 17.0f, 51.0f, 12.5f, 59.5f, 25.0f, 83.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,64 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "absl/strings/str_cat.h"
namespace tflite {
namespace gpu {
namespace cl {
FLT::FLT(CalculationsPrecision precision, float value)
: f32_(precision == CalculationsPrecision::F32), active_(true) {
if (f32_) {
f_value_ = value;
} else {
h_value_ = half(value);
}
}
const void* FLT::GetData() const {
return f32_ ? static_cast<const void*>(&f_value_)
: static_cast<const void*>(&h_value_);
}
std::string FLT::GetDeclaration() const {
const std::string type = f32_ ? "float" : "half";
return absl::StrCat(type, " ", name_);
}
FLT4::FLT4(CalculationsPrecision precision, const float4& value)
: f32_(precision == CalculationsPrecision::F32), active_(true) {
if (f32_) {
f_value_ = value;
} else {
h_value_ = half4(value);
}
}
const void* FLT4::GetData() const {
return f32_ ? static_cast<const void*>(&f_value_)
: static_cast<const void*>(&h_value_);
}
std::string FLT4::GetDeclaration() const {
const std::string type = f32_ ? "float4" : "half4";
return absl::StrCat(type, " ", name_);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,72 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class FLT {
public:
FLT() = default;
FLT(CalculationsPrecision precision, float value);
const void* GetData() const;
size_t GetSize() const { return f32_ ? sizeof(float) : sizeof(half); }
bool Active() const { return active_; }
std::string GetDeclaration() const;
std::string GetName() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
private:
float f_value_;
half h_value_;
bool f32_;
bool active_ = false;
std::string name_;
};
class FLT4 {
public:
FLT4() {}
FLT4(CalculationsPrecision precision, const float4& value);
const void* GetData() const;
size_t GetSize() const { return f32_ ? sizeof(float4) : sizeof(half4); }
bool Active() const { return active_; }
std::string GetDeclaration() const;
std::string GetName() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
private:
float4 f_value_;
half4 h_value_;
bool f32_;
bool active_ = false;
std::string name_;
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_

View File

@ -0,0 +1,189 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// We split the vec-vec dot product (each thread computes a vec-vec dot in the
// basic vec-mat multiplication) into 4 parts to create more threads: the
// thread with a given tid.y processes every 4th element of the dot product.
// This gives good results for ~1024 x 1024 sizes; for other sizes more
// optimized shaders could be written. A plain C++ sketch of this split follows
// GetFullyConnectedKernelCode below.
std::string GetFullyConnectedKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations,
const int3& work_group_size) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
switch (precision) {
case CalculationsPrecision::F32:
c += "#define READ_IMAGE read_imagef\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define READ_IMAGE read_imageh\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters,\n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " int src_depth_x4 \n";
c += ") {\n";
c += " int gid = get_global_id(0);\n";
c += " int2 tid = (int2)(get_local_id(0), get_local_id(1));\n";
c += " ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f);\n";
c += " uint c = tid.y;\n"; // vector coord for every thread
c += " uint c2 = tid.y * 2;\n"; // it should be * 4, so as we have FLT4
// but we keep half8 in float4 so, we have * 2 y_coord for texture
c += " for (int i = 0; i < src_depth_x4; ++i, c += 4, c2 += 8) {\n";
c += " FLT4 v = " + src_tensor.Read3D("0", "0", "c") + ";\n";
if (precision != CalculationsPrecision::F32) {
c += " half8 m0 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
"c2+0)));\n";
c += " half8 m1 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
"c2+1)));\n";
c += " s.x += (v.x * m0.s0 + v.y * m0.s1 + v.z * m0.s2 + v.w * m0.s3);\n";
c += " s.y += (v.x * m0.s4 + v.y * m0.s5 + v.z * m0.s6 + v.w * m0.s7);\n";
c += " s.z += (v.x * m1.s0 + v.y * m1.s1 + v.z * m1.s2 + v.w * m1.s3);\n";
c += " s.w += (v.x * m1.s4 + v.y * m1.s5 + v.z * m1.s6 + v.w * m1.s7);\n";
} else {
c += " float4 m0 = read_imagef(filters, smp_none, (int2)(gid * 4 + 0, "
"c));\n";
c += " float4 m1 = read_imagef(filters, smp_none, (int2)(gid * 4 + 1, "
"c));\n";
c += " float4 m2 = read_imagef(filters, smp_none, (int2)(gid * 4 + 2, "
"c));\n";
c += " float4 m3 = read_imagef(filters, smp_none, (int2)(gid * 4 + 3, "
"c));\n";
c += " s.x += (v.x * m0.s0 + v.y * m0.s1 + v.z * m0.s2 + v.w * m0.s3);\n";
c += " s.y += (v.x * m1.s0 + v.y * m1.s1 + v.z * m1.s2 + v.w * m1.s3);\n";
c += " s.z += (v.x * m2.s0 + v.y * m2.s1 + v.z * m2.s2 + v.w * m2.s3);\n";
c += " s.w += (v.x * m3.s0 + v.y * m3.s1 + v.z * m3.s2 + v.w * m3.s3);\n";
}
c += " }\n";
c += " __local ACCUM_FLT4 temp[" + std::to_string(work_group_size.x) + "][" +
std::to_string(work_group_size.y) + "];\n";
c += " temp[tid.x][tid.y] = s;\n";
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
c += " if (tid.y == 0 && gid < dst_size.w) {\n";
c += " s += temp[tid.x][1];\n";
c += " s += temp[tid.x][2];\n";
c += " s += temp[tid.x][3];\n";
c += " FLT4 r0 = TO_FLT4(s) + READ_IMAGE(biases, smp_none, (int2)(gid, "
"0));\n";
c += " " + dst_tensor.GetAddress("dst_adr", "0", "0", "gid") + "\n";
c += PostProcess(linked_operations, "r0", "gid", "dst_adr");
c += " " + dst_tensor.Write3D("r0", "dst_adr") + "\n";
c += " }\n";
c += "}\n";
return c;
}
} // namespace
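// Editor's illustrative sketch (not part of the original file): the split of
// the vec-vec dot product described in the comment above, written as plain,
// single-threaded C++. Four "threads" (tid_y = 0..3) each accumulate every
// 4th element of the dot product; their partial sums are then reduced, which
// is what the local-memory/barrier section of the generated OpenCL code does.
// The function name is hypothetical.
inline float SplitDotProductSketch(const float* vec, const float* mat_row,
                                   int size) {
  float partial[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  for (int tid_y = 0; tid_y < 4; ++tid_y) {
    for (int i = tid_y; i < size; i += 4) {
      partial[tid_y] += vec[i] * mat_row[i];
    }
  }
  return partial[0] + partial[1] + partial[2] + partial[3];
}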
FullyConnectedTexture::FullyConnectedTexture(const OperationDef& definition)
: GPUOperation(definition) {}
FullyConnectedTexture::FullyConnectedTexture(FullyConnectedTexture&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
biases_(std::move(kernel.biases_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
FullyConnectedTexture& FullyConnectedTexture::operator=(
FullyConnectedTexture&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
biases_ = std::move(kernel.biases_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status FullyConnectedTexture::Compile(const CreationContext& creation_context) {
int wg_width = 32;
int wg_height = 4;
int work_items;
do {
work_group_size_ = {wg_width, wg_height, 1};
wg_width /= 2;
const auto code = GetFullyConnectedKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_, work_group_size_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
} while (work_items > kernel_.GetMaxWorkGroupSize());
return OkStatus();
}
Status FullyConnectedTexture::AddToQueue(CLCommandQueue* queue) {
const int src_depth_x4 = IntegralDivideRoundUp(src_[0]->Depth(), 4);
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_depth_x4));
return queue->DispatchImplicit(kernel_, {dst_[0]->Depth(), 1, 1},
work_group_size_);
}
Status CreateFullyConnectedTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnectedTexture* result) {
*result = FullyConnectedTexture(definition);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,179 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class FullyConnectedTexture : public GPUOperation {
public:
FullyConnectedTexture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
FullyConnectedTexture(FullyConnectedTexture&& kernel);
FullyConnectedTexture& operator=(FullyConnectedTexture&& kernel);
FullyConnectedTexture(const FullyConnectedTexture&) = delete;
FullyConnectedTexture& operator=(const FullyConnectedTexture&) = delete;
private:
explicit FullyConnectedTexture(const OperationDef& definition);
friend Status CreateFullyConnectedTexture(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, FullyConnectedTexture* result);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType T>
void RearrangeWeightsFP16(const ::tflite::gpu::Tensor<OHWI, T>& weights,
absl::Span<half4> dst);
template <DataType T>
void RearrangeWeightsFP32(const ::tflite::gpu::Tensor<OHWI, T>& weights,
absl::Span<float4> dst);
Texture2D weights_;
LinearStorage biases_;
CLKernel kernel_;
int3 work_group_size_ = int3(0, 0, 0);
};
template <DataType T>
Status FullyConnectedTexture::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
RearrangeWeightsFP32(weights, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(DataType::FLOAT32, dst_depth * 4, src_depth,
gpu_data.data(), context, &weights_);
} else {
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
RearrangeWeightsFP16(weights, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(DataType::FLOAT32, dst_depth, src_depth * 2,
gpu_data.data(), context, &weights_);
}
}
template <DataType T>
void FullyConnectedTexture::RearrangeWeightsFP16(
const ::tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<half4> dst) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
half4 filters[2];
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
}
for (int d = 0; d < dst_depth; ++d) {
half4 filters[2];
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + 2 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
}
}
}
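// Editor's illustrative sketch (not part of the original header): in the F16
// layout produced by RearrangeWeightsFP16 above, every float4 texel packs a
// half8. The texel at x = dst_slice, y = 2 * src_slice + r holds the weights
// of output channels dst_slice * 4 + 2 * r and dst_slice * 4 + 2 * r + 1 over
// input channels src_slice * 4 .. src_slice * 4 + 3, matching the
// read_imagef(filters, (int2)(gid, c2 + r)) reads in the generated kernel.
// The function name is hypothetical.
inline int2 Fp16WeightTexelSketch(int dst_slice, int src_slice, int r) {
  return int2(dst_slice, 2 * src_slice + r);
}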
template <DataType T>
void FullyConnectedTexture::RearrangeWeightsFP32(
const ::tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<float4> dst) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
float4 filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
dst[counter++] = filters[2];
dst[counter++] = filters[3];
}
}
}
Status CreateFullyConnectedTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnectedTexture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_

View File

@ -0,0 +1,67 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, FullyConnectedTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 1, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
FullyConnectedAttributes attr;
attr.weights.shape = OHWI(2, 1, 1, 4);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
FullyConnectedTexture operation;
ASSERT_OK(CreateFullyConnectedTexture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,192 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ElementwiseOperation& op,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ);
c += op.GetArgsDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
c += " return; \n";
c += " } \n";
c += " " + src_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += " FLT4 src = " + src_tensor.Read3D("address") + ";\n";
c += " " + op.GetCoreCode("src", "Z", "address");
c += PostProcess(linked_operations, "src", "Z", "address");
c += " " + dst_tensor.Write3D("src", "address") + "\n";
c += "} \n";
return c;
}
} // namespace
DataType OperationDef::GetDataType() const {
return DeduceDataTypeFromPrecision(precision);
}
DataType OperationDef::GetPrimaryDataType() const {
return src_tensors[0].data_type;
}
TensorStorageType OperationDef::GetPrimaryStorageType() const {
return src_tensors[0].storage_type;
}
GPUOperation::GPUOperation(const OperationDef& definition)
: definition_(definition) {}
void GPUOperation::SetSrc(Tensor* ptr, int index) {
if (index >= src_.size()) {
src_.resize(index + 1, nullptr);
}
src_[index] = ptr;
}
void GPUOperation::SetDst(Tensor* ptr, int index) {
if (index >= dst_.size()) {
dst_.resize(index + 1, nullptr);
}
dst_[index] = ptr;
}
GPUOperation::GPUOperation(GPUOperation&& operation)
: definition_(std::move(operation.definition_)),
src_(std::move(operation.src_)),
dst_(std::move(operation.dst_)),
linked_operations_(std::move(operation.linked_operations_)) {}
GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
if (this != &operation) {
definition_ = std::move(operation.definition_);
src_ = std::move(operation.src_);
dst_ = std::move(operation.dst_);
linked_operations_ = std::move(operation.linked_operations_);
}
return *this;
}
void GPUOperation::AddOperation(ElementwiseOperation* operation) {
linked_operations_.push_back(operation);
operation->SetLinkIndex(linked_operations_.size());
}
ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation)
: GPUOperation(std::move(operation)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ElementwiseOperation& ElementwiseOperation::operator=(
ElementwiseOperation&& operation) {
if (this != &operation) {
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ElementwiseOperation::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArguments(&kernel_));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ElementwiseOperation::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ElementwiseOperation::Compile(const CreationContext& creation_context) {
const auto code =
GetElementWiseCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, *this, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ElementwiseOperation::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status ElementwiseOperation::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
std::string GetArgsDeclaration(
const std::vector<ElementwiseOperation*>& linked_ops) {
std::string code;
for (auto linked_op : linked_ops) {
code += linked_op->GetArgsDeclaration();
}
code += ",\n";
return code;
}
std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
const std::string& var_name, const std::string& z_coord,
const std::string& global_address) {
std::string code;
for (auto linked_op : linked_ops) {
code += linked_op->GetCoreCode(var_name, z_coord, global_address);
}
return code;
}
Status BindArgs(CLKernel* kernel,
const std::vector<ElementwiseOperation*>& linked_ops) {
for (auto linked_op : linked_ops) {
RETURN_IF_ERROR(linked_op->BindArguments(kernel));
}
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,180 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_
#include <memory>
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/program_cache.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
struct CreationContext {
const CLDevice* device;
CLContext* context;
CLCommandQueue* queue;
ProgramCache* cache;
};
struct OperationDef {
CalculationsPrecision precision;
std::vector<TensorDescriptor> src_tensors;
std::vector<TensorDescriptor> dst_tensors;
// returns FLOAT32 for F32 precision and FLOAT16 for F16 precision
DataType GetDataType() const;
// "Primary" means the first src tensor, because the first tensor usually
// defines the structure of the kernel and the types of all other resources
// (biases, etc.).
DataType GetPrimaryDataType() const;
TensorStorageType GetPrimaryStorageType() const;
};
class ElementwiseOperation;
// GPUOperation represents an implementation of a neural network operation on
// the GPU. A GPUOperation can contain ElementwiseOperation operations; in that
// case each ElementwiseOperation still holds its own data and must stay alive.
// When a GPUOperation contains ElementwiseOperations, the GPUOperation
// replaces a sequence of operations Op + el_op0 + el_op1 + ...
// Because of this, the usage scenario is:
//   1. Create an instance of GPUOperation.
//   2. Create all instances of ElementwiseOperations that will (probably) be
//      attached to it, and attach them with AddOperation.
//   3. Call GPUOperation::Compile(). Do not call ElementwiseOperation::Compile()
//      on an attached operation; it is unnecessary (and may be an error).
// A usage sketch follows the GPUOperation class definition below.
class GPUOperation {
public:
GPUOperation() = default;
explicit GPUOperation(const OperationDef& definition);
virtual ~GPUOperation() = default;
// Move only
GPUOperation(GPUOperation&& operation);
GPUOperation& operator=(GPUOperation&& operation);
GPUOperation(const GPUOperation&) = delete;
GPUOperation& operator=(const GPUOperation&) = delete;
void AddOperation(ElementwiseOperation* operation);
void SetSrc(Tensor* ptr, int index = 0);
void SetDst(Tensor* ptr, int index = 0);
virtual Status AddToQueue(CLCommandQueue* queue) { return OkStatus(); }
virtual Status Tune(const TuningParameters& params) { return OkStatus(); }
virtual Status Compile(const CreationContext& creation_context) {
return OkStatus();
}
const OperationDef& GetDefinition() const { return definition_; }
protected:
// Defines operation calculation precision and format of src/dst tensors.
OperationDef definition_;
std::vector<Tensor*> src_;
std::vector<Tensor*> dst_;
std::vector<ElementwiseOperation*> linked_operations_;
};
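// Editor's illustrative usage sketch (not part of the original header) of the
// scenario described in the comment above GPUOperation: link the elementwise
// operation before compiling, then bind tensors and enqueue. `op` is any
// concrete GPUOperation and `el_op` any concrete ElementwiseOperation; both
// must outlive the compiled operation. The function name is hypothetical.
inline Status RunFusedOperationSketch(GPUOperation* op,
                                      ElementwiseOperation* el_op,
                                      const CreationContext& creation_context,
                                      Tensor* src, Tensor* dst,
                                      CLCommandQueue* queue) {
  op->AddOperation(el_op);                         // attach before Compile()
  RETURN_IF_ERROR(op->Compile(creation_context));  // el_op is NOT compiled
  op->SetSrc(src);
  op->SetDst(dst);
  return op->AddToQueue(queue);
}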
// An ElementwiseOperation can be fused (linked) to another operation.
// The linked_ field indicates whether this is the case.
// link_index_ is used mostly to generate correct names for linked code
// variables. It is the position of the operation in the sequence of linked
// operations and must be unique within that sequence.
// link_index_ = 0 means the operation is not linked.
class ElementwiseOperation : public GPUOperation {
public:
ElementwiseOperation() {}
explicit ElementwiseOperation(const OperationDef& definition)
: GPUOperation(definition) {}
virtual ~ElementwiseOperation() {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ElementwiseOperation(ElementwiseOperation&& operation);
ElementwiseOperation& operator=(ElementwiseOperation&& operation);
ElementwiseOperation(const ElementwiseOperation&) = delete;
ElementwiseOperation& operator=(const ElementwiseOperation&) = delete;
// We need this function to resolve naming conflicts.
// Unfortunately we don't know upfront (at creation time) whether the operation
// will be linked or not. The operation should be created and SetLinkIndex(0)
// called to initialize its link-specific info; index 0 means the operation is
// not linked. If we later decide to link it, we must update the link info and
// name the kernel arguments according to this index (it is the responsibility
// of each particular ElementwiseOperation implementation to generate the right
// names).
virtual void SetLinkIndex(int index) {}
virtual std::string GetCoreCode(const std::string& src,
const std::string& z_coord,
const std::string& address) const = 0;
virtual std::string GetArgsDeclaration() const { return ""; }
virtual Status BindArguments(CLKernel* kernel) { return OkStatus(); }
protected:
Status BindArguments();
int3 GetGridSize() const;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
// Generates the argument declarations string for the elementwise operations
// in linked_ops.
// Every ElementwiseOperation can generate its argument declarations.
std::string GetArgsDeclaration(
const std::vector<ElementwiseOperation*>& linked_ops);
// Generates shader code for every elementwise operation in linked_ops.
// linked_ops - vector of operation pointers
// var_name - name of the shader variable that we update/change
// z_coord - name of the shader variable holding the currently processed Z
// coordinate of the tensor's 3D grid (WHC/XYZ); this coordinate is in
// layer/slice space (groups of 4 channels), not in channels.
// global_address - name of the variable holding the tensor's coordinates in
// the 3D grid (WHC/XYZ); different tensor layouts encode this address
// differently.
std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
const std::string& var_name, const std::string& z_coord,
const std::string& global_address);
// Binds the arguments of the elementwise operations in linked_ops to the
// given kernel.
// Every ElementwiseOperation can bind its own arguments.
Status BindArgs(CLKernel* kernel,
const std::vector<ElementwiseOperation*>& linked_ops);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_

View File

@ -0,0 +1,63 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_
#include <memory>
#include <string>
#include "absl/memory/memory.h"
#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
namespace tflite {
namespace gpu {
namespace cl {
class HardSwish : public ElementwiseOperation {
public:
static std::unique_ptr<HardSwish> Create(const OperationDef& op_def) {
auto h_swish = absl::make_unique<HardSwish>(op_def);
h_swish->SetLinkIndex(0);
return h_swish;
}
HardSwish() = delete;
explicit HardSwish(const OperationDef& op_def)
: ElementwiseOperation(op_def) {}
HardSwish(const HardSwish&) = delete;
HardSwish(HardSwish&& h_swish) : ElementwiseOperation(std::move(h_swish)) {}
HardSwish& operator=(const HardSwish&) = delete;
HardSwish& operator=(HardSwish&& h_swish) {
if (this != &h_swish) ElementwiseOperation::operator=(std::move(h_swish));
return *this;
}
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override {
return absl::Substitute(
"$0 *= clamp($0 / 6.0f + (FLT4)(0.5f), (FLT4)(0.0f), (FLT4)(1.0f));\n",
src);
}
};
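// Editor's illustrative scalar reference (not part of the original header) of
// the hard-swish computed by GetCoreCode above: x * clamp(x / 6 + 0.5, 0, 1).
// The function name is hypothetical.
inline float HardSwishScalarSketch(float x) {
  float gate = x / 6.0f + 0.5f;
  if (gate < 0.0f) gate = 0.0f;
  if (gate > 1.0f) gate = 1.0f;
  return x * gate;
}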
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_

View File

@ -0,0 +1,60 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/hard_swish.h"
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, HardSwish) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 1, 1, 7);
src_tensor.data = {-4.5f, -3.0f, -1.5f, 0.0f, 1.5f, 3.0f, 4.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
auto h_swish = HardSwish::Create(op_def);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_,
h_swish.get(), src_tensor.shape,
&dst_tensor));
EXPECT_THAT(
dst_tensor.data,
testing::Pointwise(testing::FloatNear(eps),
{0.0f, 0.0f, -0.375f, 0.0f, 1.125f, 3.f, 4.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,164 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
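// Generates a kernel that, for every output element, locates the pooling
// window cell it maps to (t_index) and copies the source value for a channel
// only if the stored index for that channel matches; all other positions are
// written as 0.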
std::string GetMaxUnpoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& src_ind_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src("src_data", "src_size", src_descriptor);
TensorCodeGenerator src_ind("src_data_indices", "src_size",
src_ind_descriptor);
TensorCodeGenerator dst("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src.GetDeclaration(AccessType::READ) + ",\n";
code += src_ind.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " int src_x = (X + padding.x) / stride.x;\n";
code += " int src_y = (Y + padding.y) / stride.y;\n";
code += " " + src.GetAddress("src_adr", "src_x", "src_y", "Z") + "\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
code += " bool outside = src_x < 0 || src_y < 0 ||";
code += " src_x >= src_size.x || src_y >= src_size.y;\n";
code += " FLT4 src = (FLT4)(0.0f);\n";
code += " int4 ind = (int4)(0);\n";
code += " if (!outside) {\n";
code += " src = " + src.Read3D("src_adr") + ";\n";
code += " ind = convert_int4(" + src_ind.Read3D("src_adr") + ");\n";
code += " }\n";
  } else {  // no boundary checks are needed for textures
code += " FLT4 src = " + src.Read3D("src_adr") + ";\n";
code += " int4 ind = convert_int4(" + src_ind.Read3D("src_adr") + ");\n";
}
code += " int t_x = X - (src_x * stride.x - padding.x);\n";
code += " int t_y = Y - (src_y * stride.y - padding.y);\n";
code += " int t_index = t_y * kernel_size.x + t_x;\n";
code += " FLT4 result;\n";
const std::string channels[] = {".x", ".y", ".z", ".w"};
for (int i = 0; i < 4; ++i) {
const auto& s = channels[i];
code += " result" + s + "= t_index == ind" + s + "? src" + s + ": 0.0f;\n";
}
code += " " + dst.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst.Write3D("result", "address");
code += "}\n";
return code;
}
} // namespace
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.appended.w, attr.padding.appended.h),
kernel_size_(attr.kernel.w, attr.kernel.h) {}
MaxUnpooling::MaxUnpooling(MaxUnpooling&& kernel)
: GPUOperation(std::move(kernel)),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_size_(kernel.kernel_size_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
if (this != &kernel) {
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_size_, kernel.kernel_size_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status MaxUnpooling::Compile(const CreationContext& creation_context) {
  const auto code = GetMaxUnpoolingKernelCode(
definition_.src_tensors[0], definition_.src_tensors[1],
definition_.dst_tensors[0], definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status MaxUnpooling::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
return OkStatus();
}
int3 MaxUnpooling::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status MaxUnpooling::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status MaxUnpooling::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr) {
return MaxUnpooling(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class MaxUnpooling : public GPUOperation {
public:
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
MaxUnpooling(MaxUnpooling&& kernel);
MaxUnpooling& operator=(MaxUnpooling&& kernel);
MaxUnpooling(const MaxUnpooling&) = delete;
MaxUnpooling& operator=(const MaxUnpooling&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
int2 stride_;
int2 padding_;
int2 kernel_size_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_

View File

@ -0,0 +1,73 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, MaxUnpooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
TensorFloat32 src_ind_tensor;
src_ind_tensor.shape = BHWC(1, 2, 2, 1);
src_ind_tensor.data = {0.1f, 1.1f, 2.1f, 3.1f};
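  // Indices are stored as floats with a +0.1 bias; convert_int4() in the
  // kernel truncates them back to the integer indices {0, 1, 2, 3}.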
MaxUnpooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation({src_tensor, src_ind_tensor},
creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,166 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
MultiplyAdd::MultiplyAdd(MultiplyAdd&& operation)
: ElementwiseOperation(std::move(operation)),
mul_vec_(std::move(operation.mul_vec_)),
add_vec_(std::move(operation.add_vec_)),
use_mul_vec_(operation.use_mul_vec_),
use_add_vec_(operation.use_add_vec_),
scalar_mul_(std::move(operation.scalar_mul_)),
scalar_add_(std::move(operation.scalar_add_)) {}
MultiplyAdd& MultiplyAdd::operator=(MultiplyAdd&& operation) {
if (this != &operation) {
mul_vec_ = std::move(operation.mul_vec_);
add_vec_ = std::move(operation.add_vec_);
use_mul_vec_ = operation.use_mul_vec_;
use_add_vec_ = operation.use_add_vec_;
scalar_mul_ = std::move(operation.scalar_mul_);
scalar_add_ = std::move(operation.scalar_add_);
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
void MultiplyAdd::SetLinkIndex(int index) {
scalar_mul_.SetName(absl::StrCat("mad_scalar_mul_", index));
scalar_add_.SetName(absl::StrCat("mad_scalar_add_", index));
mul_vec_.SetName(absl::StrCat("mad_mul_", index));
add_vec_.SetName(absl::StrCat("mad_add_", index));
}
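// Schematically, with all four parts active, the emitted line is:
//   src = src * mul_vec[Z] * scalar_mul + add_vec[Z] + scalar_add;
// Inactive parts are simply omitted.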
std::string MultiplyAdd::GetCoreCode(const std::string& src,
const std::string& z_coord,
const std::string& address) const {
std::string result = absl::StrCat(src, " = ", src);
if (use_mul_vec_) {
result = absl::StrCat(result, " * ", mul_vec_.ReadLinearFLT4(z_coord));
}
if (scalar_mul_.Active()) {
absl::StrAppend(&result, " * ", scalar_mul_.GetName());
}
if (use_add_vec_) {
result = absl::StrCat(result, " + ", add_vec_.ReadLinearFLT4(z_coord));
}
if (scalar_add_.Active()) {
absl::StrAppend(&result, " + ", scalar_add_.GetName());
}
return absl::StrCat(result, ";\n");
}
std::string MultiplyAdd::GetArgsDeclaration() const {
std::string args;
if (use_mul_vec_) {
args = absl::StrCat(args, ",\n ", mul_vec_.GetDeclaration());
}
if (use_add_vec_) {
args = absl::StrCat(args, ",\n ", add_vec_.GetDeclaration());
}
if (scalar_mul_.Active()) {
absl::StrAppend(&args, ",\n ", scalar_mul_.GetDeclaration());
}
if (scalar_add_.Active()) {
absl::StrAppend(&args, ",\n ", scalar_add_.GetDeclaration());
}
return args;
}
Status MultiplyAdd::BindArguments(CLKernel* kernel) {
if (use_mul_vec_) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(mul_vec_.GetMemoryPtr()));
}
if (use_add_vec_) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(add_vec_.GetMemoryPtr()));
}
if (scalar_mul_.Active()) {
RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_mul_));
}
if (scalar_add_.Active()) {
RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_add_));
}
return OkStatus();
}
Status MultiplyAdd::UploadMul(const MultiplyScalarAttributes& attr,
CLContext* context) {
auto mul = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
&attr.param);
auto mul_scalar = absl::get_if<float>(&attr.param);
if (mul) {
RETURN_IF_ERROR(UploadMul(*mul, context));
} else {
scalar_mul_ = FLT(definition_.precision, *mul_scalar);
}
return OkStatus();
}
Status MultiplyAdd::UploadAdd(const AddAttributes& attr, CLContext* context) {
auto add = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
&attr.param);
auto add_scalar = absl::get_if<float>(&attr.param);
if (add) {
RETURN_IF_ERROR(UploadAdd(*add, context));
} else {
scalar_add_ = FLT(definition_.precision, *add_scalar);
}
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadMul(attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr, MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadAdd(attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr, MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadMul(mul_attr, creation_context.context));
RETURN_IF_ERROR(result->UploadAdd(add_attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,132 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class MultiplyAdd : public ElementwiseOperation {
public:
// Move only
MultiplyAdd() = default;
MultiplyAdd(MultiplyAdd&& operation);
MultiplyAdd& operator=(MultiplyAdd&& operation);
MultiplyAdd(const MultiplyAdd&) = delete;
MultiplyAdd& operator=(const MultiplyAdd&) = delete;
Status UploadMul(const MultiplyScalarAttributes& attr, CLContext* context);
Status UploadAdd(const AddAttributes& attr, CLContext* context);
template <DataType T>
Status UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
CLContext* context);
template <DataType T>
Status UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
CLContext* context);
void SetLinkIndex(int index) override;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
std::string GetArgsDeclaration() const override;
Status BindArguments(CLKernel* kernel) override;
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result);
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr,
MultiplyAdd* result);
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr,
MultiplyAdd* result);
private:
explicit MultiplyAdd(const OperationDef& definition)
: ElementwiseOperation(definition),
use_mul_vec_(false),
use_add_vec_(false) {}
LinearStorage mul_vec_;
LinearStorage add_vec_;
bool use_mul_vec_;
bool use_add_vec_;
FLT scalar_mul_;
FLT scalar_add_;
};
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result);
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr, MultiplyAdd* result);
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr, MultiplyAdd* result);
template <DataType T>
Status MultiplyAdd::UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
CLContext* context) {
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition_.GetPrimaryStorageType());
create_info.data_type = definition_.GetDataType();
RETURN_IF_ERROR(CreateLinearStorage(create_info, mul, context, &mul_vec_));
use_mul_vec_ = true;
return OkStatus();
}
template <DataType T>
Status MultiplyAdd::UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
CLContext* context) {
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition_.GetPrimaryStorageType());
create_info.data_type = definition_.GetDataType();
RETURN_IF_ERROR(CreateLinearStorage(create_info, add, context, &add_vec_));
use_add_vec_ = true;
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_

View File

@ -0,0 +1,187 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, MultiplyAddVectorMul) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 2.0f, 1.0f, 6.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddVectorAdd) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
AddAttributes attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.5f, 3.0f, 2.5f, 5.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddScalarMul) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes attr;
attr.param = 0.5f;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.5f, 1.0f, 1.5f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddScalarAdd) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
AddAttributes attr;
attr.param = -0.5f;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-0.5f, 0.5f, 1.5f, 2.5f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddVectorMad) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes mul_attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
mul_attr.param = parameters;
AddAttributes add_attr;
parameters.data = {-0.5f, 0.5f};
add_attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, mul_attr, add_attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-0.5f, 2.5f, 0.5f, 6.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,152 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
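// Generates a kernel where each work item produces one FLT4 group of four
// output channels: positions that map back inside the source (after removing
// the prepended padding) gather the corresponding source channel, and
// everything else keeps the zero-initialized value.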
std::string GetPaddingCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
const std::string channels[] = {".x", ".y", ".z", ".w"};
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int4 prepended \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " FLT4 result = (FLT4)(0.0);\n";
code += " int s_x = X - prepended.x;\n";
code += " int s_y = Y - prepended.y;\n";
code += " bool inside_x = s_x >= 0 && s_x < src_size.x;\n";
code += " bool inside_y = s_y >= 0 && s_y < src_size.y;\n";
code += " if (inside_x && inside_y) {\n";
code += " int start_channel = Z * 4;\n";
for (int i = 0; i < 4; ++i) {
const auto& s = channels[i];
code += " {\n";
code += " int channel = start_channel + " + std::to_string(i) + ";\n";
code += " int s_z = channel - prepended.z;\n";
code += " if (s_z >= 0 && s_z < src_size.z) {\n";
code +=
" FLT4 t = " + src_tensor.Read3D("s_x", "s_y", "s_z / 4") + ";\n";
code += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n";
code += " result" + s + " = t_ar[s_z % 4];\n";
code += " }\n";
code += " }\n";
}
code += " }\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst_tensor.Write3D("result", "address");
code += "}\n";
return code;
}
} // namespace
Padding::Padding(const OperationDef& definition, const PadAttributes& attr)
: GPUOperation(definition) {
SetPrepended(int3(attr.prepended.w, attr.prepended.h, attr.prepended.c));
}
Padding::Padding(Padding&& kernel)
: GPUOperation(std::move(kernel)),
prepended_(kernel.prepended_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
Padding& Padding::operator=(Padding&& kernel) {
if (this != &kernel) {
std::swap(prepended_, kernel.prepended_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
void Padding::SetPrepended(const int3& prepended) {
prepended_.x = prepended.x;
prepended_.y = prepended.y;
prepended_.z = prepended.z;
prepended_.w = 0;
}
Status Padding::Compile(const CreationContext& creation_context) {
const auto code =
GetPaddingCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status Padding::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(prepended_));
return OkStatus();
}
int3 Padding::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status Padding::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status Padding::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Padding CreatePadding(const OperationDef& definition,
const PadAttributes& attr) {
return Padding(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,59 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class Padding : public GPUOperation {
public:
Padding(const OperationDef& definition, const PadAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
Padding(Padding&& kernel);
Padding& operator=(Padding&& kernel);
Padding(const Padding&) = delete;
Padding& operator=(const Padding&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
void SetPrepended(const int3& prepended);
int4 prepended_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
Padding CreatePadding(const OperationDef& definition,
const PadAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_

View File

@ -0,0 +1,236 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, PaddingAppendWidth) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(0, 1, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependWidth) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 1, 0);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingAppendHeight) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(1, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependHeight) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(1, 0, 0);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingAppendChannels) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(0, 0, 1);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 1.0f, 0.0f, 2.0f, 3.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependChannels) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 1);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, 1.0f, 0.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingComplex) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 1, 1);
attr.appended = HWC(1, 1, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 3, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,255 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
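// Average pooling accumulates only in-bounds taps and divides by the number
// of taps that actually fell inside the source, so padded positions do not
// dilute the average (see the AveragePoolingNonEmptyPadding test).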
std::string GetAveragePoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " float4 r = (float4)(0.0f);\n";
code += " float window_size = 0.0;\n";
code += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
code += " int y_c = Y * stride.y - padding.y + ky;\n";
code += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
code += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
code += " int x_c = X * stride.x - padding.x + kx;\n";
code += " bool outside = outside_y || x_c < 0 || x_c >= src_size.x;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
code += " r += !outside ? " +
src_tensor.ReadAsFloat3D("x_c", "y_c", "Z") +
" : (float4)(0.0f);\n";
} else {
code += " r += " + src_tensor.ReadAsFloat3D("x_c", "y_c", "Z") + ";\n";
}
code += " window_size += !outside ? 1.0 : 0.0;\n";
code += " }\n";
code += " }\n";
  // If window_size == 0, the window covered nothing. This is a sign of an
  // incorrectly constructed operation; NaNs are expected as output.
code += " FLT4 result = TO_FLT4(r / window_size);\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst_tensor.Write3D("result", "address");
code += "}\n";
return code;
}
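// Max pooling tracks the per-channel maximum over the window; when
// output_indices is set it also records, per channel, the linear index of
// the winning tap inside the window into a second output tensor.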
std::string GetMaxPoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations,
bool output_indices) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
TensorCodeGenerator indices_tensor("dst_indices", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
if (output_indices) {
code += indices_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
}
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " FLT4 maximum = (FLT4)(-10000.0f);\n";
if (output_indices) {
code += " int4 indexes = (int4)(0);\n";
code += " int index_counter = 0;\n";
}
code += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
code += " int y_c = Y * stride.y - padding.y + ky;\n";
code += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
code += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
code += " int x_c = X * stride.x - padding.x + kx;\n";
code += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
code += " if (!outside_x && !outside_y) {\n";
code += " FLT4 src = " + src_tensor.Read3D("x_c", "y_c", "Z") + ";\n";
if (output_indices) {
code += " if (src.x > maximum.x) {\n";
code += " indexes.x = index_counter;\n";
code += " maximum.x = src.x;\n";
code += " }\n";
code += " if (src.y > maximum.y) {\n";
code += " indexes.y = index_counter;\n";
code += " maximum.y = src.y;\n";
code += " }\n";
code += " if (src.z > maximum.z) {\n";
code += " indexes.z = index_counter;\n";
code += " maximum.z = src.z;\n";
code += " }\n";
code += " if (src.w > maximum.w) {\n";
code += " indexes.w = index_counter;\n";
code += " maximum.w = src.w;\n";
code += " }\n";
code += " index_counter++;\n";
}
code += " maximum = max(src, maximum);\n";
code += " };\n";
code += " }\n";
code += " }\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "maximum", "Z", "address");
code += " " + dst_tensor.Write3D("maximum", "address");
if (output_indices) {
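    // The indices are written as floats with a +0.1 bias; a consumer such as
    // MaxUnpooling (above) recovers the integer window index via
    // convert_int4() truncation.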
code += " FLT4 result_value = TO_FLT4(indexes) + (FLT4)(0.1);\n";
code += " " + indices_tensor.Write3D("result_value", "address");
}
code += "}\n";
return code;
}
} // namespace
Pooling::Pooling(const OperationDef& definition,
const Pooling2DAttributes& attr)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
kernel_size_(attr.kernel.w, attr.kernel.h),
type_(attr.type),
output_indices_(attr.output_indices) {}
Pooling::Pooling(Pooling&& kernel)
: GPUOperation(std::move(kernel)),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_size_(kernel.kernel_size_),
type_(kernel.type_),
output_indices_(kernel.output_indices_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
Pooling& Pooling::operator=(Pooling&& kernel) {
if (this != &kernel) {
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(type_, kernel.type_);
std::swap(output_indices_, kernel.output_indices_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status Pooling::Compile(const CreationContext& creation_context) {
std::string code;
switch (type_) {
case PoolingType::AVERAGE:
code = GetAveragePoolingKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
break;
case PoolingType::MAX:
code = GetMaxPoolingKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_, output_indices_);
break;
default:
return InvalidArgumentError(
"You should create another kernel with this params");
break;
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status Pooling::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
if (output_indices_) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[1]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
return OkStatus();
}
int3 Pooling::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status Pooling::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status Pooling::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr) {
return Pooling(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,66 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class Pooling : public GPUOperation {
public:
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
Pooling(Pooling&& kernel);
Pooling& operator=(Pooling&& kernel);
Pooling(const Pooling&) = delete;
Pooling& operator=(const Pooling&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
int2 stride_;
int2 padding_;
int2 kernel_size_;
PoolingType type_;
bool output_indices_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_

View File

@ -0,0 +1,162 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, AveragePooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::AVERAGE;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {3.0f, 4.0f}));
}
}
}
TEST_F(OpenCLOperationTest, AveragePoolingNonEmptyPadding) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.kernel = HW(2, 2);
attr.type = PoolingType::AVERAGE;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {1.5f, 2.0f, 2.5f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MaxPooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::MAX;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MaxPoolingIndices) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::MAX;
attr.output_indices = true;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
TensorFloat32 dst_tensor_ind;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation,
{BHWC(1, 1, 1, 2), BHWC(1, 1, 1, 2)},
{&dst_tensor, &dst_tensor_ind}));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
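      // Strip the +0.1 bias the kernel adds to the stored indices before
      // comparing against the expected integer positions.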
for (auto& v : dst_tensor_ind.data) {
v = static_cast<int>(v);
}
EXPECT_THAT(dst_tensor_ind.data, Pointwise(FloatNear(eps), {0.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

Some files were not shown because too many files have changed in this diff.