From c1a32fd496228b8dd021d719ebce27b9b4b791e5 Mon Sep 17 00:00:00 2001
From: Raman Sarokin <sorokin@google.com>
Date: Tue, 18 Aug 2020 13:40:57 -0700
Subject: [PATCH] Added CPU representation for Tensor.

PiperOrigin-RevId: 327297658
Change-Id: Iff651c9c21df506cf6a968d8c5000707d9bcf4cf
---
 .../delegates/gpu/cl/kernels/elementwise.cc   |  22 +-
 tensorflow/lite/delegates/gpu/cl/tensor.cc    | 417 +++++++++---------
 tensorflow/lite/delegates/gpu/cl/tensor.h     |  34 +-
 .../lite/delegates/gpu/cl/tensor_type.cc      | 147 ++++++
 .../lite/delegates/gpu/cl/tensor_type.h       |  26 ++
 5 files changed, 380 insertions(+), 266 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
index edd6dee7fc0..d433006ac4b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc
@@ -170,17 +170,12 @@ absl::Status CreateElementwiseTwoInput(
       creation_context.device->info_, shape, definition.GetPrimaryStorageType(),
       definition.GetDataType(), Layout::HWC);
   TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
-  Tensor gpu_tensor;
-  RETURN_IF_ERROR(
-      CreateTensor(*creation_context.context, shape, desc, &gpu_tensor));
-  RETURN_IF_ERROR(
-      gpu_tensor.WriteData(creation_context.queue, constant_tensor));
+  desc.UploadData(constant_tensor);
 
   *result = GPUOperation(definition);
   result->elementwise_ = true;
-  result->args_.AddObject("second_tensor", AccessType::READ,
-                          absl::make_unique<Tensor>(std::move(gpu_tensor)),
-                          absl::make_unique<TensorDescriptor>(desc));
+  result->args_.AddObject("second_tensor",
+                          absl::make_unique<TensorDescriptor>(std::move(desc)));
   const std::string s_coord = shape.c == 1 ? "0" : "S_COORD";
   result->code_ = absl::StrCat(
       "FLT4 second_val = args.second_tensor.Read(0, 0, ", s_coord, ");\n");
@@ -207,17 +202,12 @@ absl::Status CreateElementwiseTwoInput(
       creation_context.device->info_, shape, definition.GetPrimaryStorageType(),
       definition.GetDataType(), Layout::HWC);
   TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
-  Tensor gpu_tensor;
-  RETURN_IF_ERROR(
-      CreateTensor(*creation_context.context, shape, desc, &gpu_tensor));
-  RETURN_IF_ERROR(
-      gpu_tensor.WriteData(creation_context.queue, constant_tensor));
+  desc.UploadData(constant_tensor);
 
   *result = GPUOperation(definition);
   result->elementwise_ = true;
-  result->args_.AddObject("second_tensor", AccessType::READ,
-                          absl::make_unique<Tensor>(std::move(gpu_tensor)),
-                          absl::make_unique<TensorDescriptor>(desc));
+  result->args_.AddObject("second_tensor",
+                          absl::make_unique<TensorDescriptor>(std::move(desc)));
   const std::string x_coord = shape.w == 1 ? "0" : "X_COORD";
   const std::string y_coord = shape.h == 1 ? "0" : "Y_COORD";
   const std::string s_coord = shape.c == 1 ? "0" : "S_COORD";
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc
index 9fd9778a17f..72c53c5b1ac 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor.cc
+++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc
@@ -28,6 +28,164 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 namespace {
+// Allocates the OpenCL buffer or image that backs a tensor of `shape`, using
+// the storage type and data type of `descriptor`. A non-null `data_ptr` is
+// passed to OpenCL as initial contents. On success *result wraps the cl_mem.
+absl::Status AllocateTensorMemory(const CLContext& context, const BHWDC& shape,
+                                  const TensorDescriptor& descriptor,
+                                  const void* data_ptr, CLMemory* result) {
+  const int slices = DivideRoundUp(shape.c, 4);
+  // CL_MEM_COPY_HOST_PTR asks OpenCL to copy data_ptr into the new allocation.
+  const cl_mem_flags mem_flags =
+      CL_MEM_READ_WRITE | (data_ptr ? CL_MEM_COPY_HOST_PTR : 0);
+  switch (descriptor.storage_type) {
+    case TensorStorageType::BUFFER:
+    case TensorStorageType::IMAGE_BUFFER: {
+      const size_t data_size = shape.b * shape.w * shape.h * shape.d * slices *
+                               4 * SizeOf(descriptor.data_type);
+      cl_int error_code;
+      cl_mem memory = clCreateBuffer(context.context(), mem_flags, data_size,
+                                     const_cast<void*>(data_ptr), &error_code);
+      if (!memory) {
+        return absl::UnknownError(
+            absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
+                         CLErrorCodeToString(error_code)));
+      }
+      *result = CLMemory(memory, true);
+      return absl::OkStatus();
+    }
+    case TensorStorageType::TEXTURE_2D: {
+      cl_image_desc desc;
+      desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+      desc.image_width = shape.w * shape.b * shape.d;  // X packs W, B and D.
+      desc.image_height = shape.h * slices;            // Y packs H and slices.
+      desc.image_depth = 0;
+      desc.image_row_pitch = 0;
+      desc.image_slice_pitch = 0;
+      desc.num_mip_levels = 0;
+      desc.num_samples = 0;
+      desc.buffer = nullptr;
+
+      cl_image_format format;
+      format.image_channel_order = CL_RGBA;
+      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+      cl_int error_code;
+      cl_mem memory =
+          CreateImage2DLegacy(context.context(), mem_flags, &format, &desc,
+                              const_cast<void*>(data_ptr), &error_code);
+      if (error_code != CL_SUCCESS) {
+        return absl::UnknownError(
+            absl::StrCat("Failed to create 2D texture (clCreateImage): ",
+                         CLErrorCodeToString(error_code)));
+      }
+
+      *result = CLMemory(memory, true);
+      return absl::OkStatus();
+    }
+    case TensorStorageType::TEXTURE_3D: {
+      cl_image_desc desc;
+      desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+      desc.image_width = shape.w * shape.b;  // X packs W and B.
+      desc.image_height = shape.h;
+      desc.image_depth = slices * shape.d;  // Z packs slices and D.
+      desc.image_row_pitch = 0;
+      desc.image_slice_pitch = 0;
+      desc.num_mip_levels = 0;
+      desc.num_samples = 0;
+      desc.buffer = nullptr;
+
+      cl_image_format format;
+      format.image_channel_order = CL_RGBA;
+      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+      cl_int error_code;
+      cl_mem memory =
+          CreateImage3DLegacy(context.context(), mem_flags, &format, &desc,
+                              const_cast<void*>(data_ptr), &error_code);
+      if (error_code != CL_SUCCESS) {
+        return absl::UnknownError(
+            absl::StrCat("Failed to create 3D texture (clCreateImage): ",
+                         CLErrorCodeToString(error_code)));
+      }
+
+      *result = CLMemory(memory, true);
+      return absl::OkStatus();
+    }
+    case TensorStorageType::TEXTURE_ARRAY: {
+      cl_image_desc desc;
+      desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+      desc.image_width = shape.w * shape.b;  // X packs W and B.
+      desc.image_height = shape.h;
+      desc.image_depth = 0;
+      desc.image_array_size = slices * shape.d;  // Layers pack slices and D.
+      desc.image_row_pitch = 0;
+      desc.image_slice_pitch = 0;
+      desc.num_mip_levels = 0;
+      desc.num_samples = 0;
+      desc.buffer = nullptr;
+
+      cl_image_format format;
+      format.image_channel_order = CL_RGBA;
+      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+      cl_int error_code;
+      cl_mem memory =
+          clCreateImage(context.context(), mem_flags, &format, &desc,
+                        const_cast<void*>(data_ptr), &error_code);
+      if (error_code != CL_SUCCESS) {
+        return absl::UnknownError(
+            absl::StrCat("Failed to create 2D texture array (clCreateImage): ",
+                         CLErrorCodeToString(error_code)));
+      }
+
+      *result = CLMemory(memory, true);
+      return absl::OkStatus();
+    }
+
+    case TensorStorageType::SINGLE_TEXTURE_2D: {
+      if (slices != 1) {
+        return absl::InvalidArgumentError(absl::StrCat(
+            "SINGLE_TEXTURE_2D support only channels in range [1-4], but ",
+            shape.c, " was provided"));
+      }
+      if (!context.IsFloatTexture2DSupported(shape.c, descriptor.data_type)) {
+        return absl::InvalidArgumentError(absl::StrCat(
+            "This device doesn't support ", shape.c, "-channel textures."));
+      }
+      cl_image_desc desc;
+      desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+      desc.image_width = shape.w * shape.b * shape.d;  // X packs W, B and D.
+      desc.image_height = shape.h;
+      desc.image_depth = 0;
+      desc.image_row_pitch = 0;
+      desc.image_slice_pitch = 0;
+      desc.num_mip_levels = 0;
+      desc.num_samples = 0;
+      desc.buffer = nullptr;
+
+      cl_image_format format;
+      format.image_channel_order = ToChannelOrder(shape.c);
+      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+      cl_int error_code;
+      cl_mem memory =
+          CreateImage2DLegacy(context.context(), mem_flags, &format, &desc,
+                              const_cast<void*>(data_ptr), &error_code);
+      if (error_code != CL_SUCCESS) {
+        return absl::UnknownError(
+            absl::StrCat("Failed to create single 2D texture (clCreateImage): ",
+                         CLErrorCodeToString(error_code)));
+      }
+
+      *result = CLMemory(memory, true);
+      return absl::OkStatus();
+    }
+
+    default:
+      return absl::InternalError("Unsupported tensor storage type");
+  }
+}
 
 absl::Status CreateImageBufferFromBuffer(const CLContext& context,
                                          cl_mem memory, DataType data_type,
@@ -59,7 +217,8 @@ absl::Status CreateTensor(const CLContext& context, const BHWDC& shape,
   const bool memory_owner = memory == nullptr;
   if (memory_owner) {
     CLMemory mem;
-    RETURN_IF_ERROR(AllocateTensorMemory(context, shape, descriptor, &mem));
+    RETURN_IF_ERROR(
+        AllocateTensorMemory(context, shape, descriptor, nullptr, &mem));
     memory = mem.Release();
   }
   if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
@@ -94,6 +253,14 @@ absl::Status CreateTensorShared(const CLContext& context, const BHWDC& shape,
 
 }  // namespace
 
+absl::Status TensorDescriptor::CreateGPUObject(CLContext* context,
+                                               GPUObjectPtr* result) const {
+  auto tensor = absl::make_unique<Tensor>();  // Heap-allocate up front.
+  RETURN_IF_ERROR(tensor->CreateFromDescriptor(*this, context));
+  *result = std::move(tensor);  // Only reached when creation succeeded.
+  return absl::OkStatus();
+}
+
 Tensor::Tensor(cl_mem memory, bool memory_owner, const BHWC& shape,
                const TensorDescriptor& descriptor)
     : memory_(memory),
@@ -279,12 +446,6 @@ absl::Status Tensor::IsValid(const BHWDC& shape) const {
   return absl::OkStatus();
 }
 
-int Tensor::GetChannelsAlignment() const {
-  return descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D
-             ? shape_.c
-             : 4;
-}
-
 int Tensor::GetAlignedChannels() const {
   return descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D
              ? shape_.c
@@ -329,11 +490,13 @@ absl::Status Tensor::WriteDataBHWDC(absl::Span<const float> in,
   if (descriptor_.data_type == DataType::FLOAT32) {
     data_f.resize(elements_count);
     data_ptr = data_f.data();
-    DataFromBHWDC(in, absl::MakeSpan(data_f.data(), data_f.size()));
+    DataFromBHWDC(in, shape_, descriptor_,
+                  absl::MakeSpan(data_f.data(), data_f.size()));
   } else {
     data_h.resize(elements_count);
     data_ptr = data_h.data();
-    DataFromBHWDC(in, absl::MakeSpan(data_h.data(), data_h.size()));
+    DataFromBHWDC(in, shape_, descriptor_,
+                  absl::MakeSpan(data_h.data(), data_h.size()));
   }
 
   switch (descriptor_.storage_type) {
@@ -413,9 +576,11 @@ absl::Status Tensor::ReadDataBHWDC(absl::Span<float> out,
   }
 
   if (descriptor_.data_type == DataType::FLOAT32) {
-    DataToBHWDC(absl::MakeConstSpan(data_f.data(), data_f.size()), out);
+    DataToBHWDC(absl::MakeConstSpan(data_f.data(), data_f.size()), shape_,
+                descriptor_, out);
   } else {
-    DataToBHWDC(absl::MakeConstSpan(data_h.data(), data_h.size()), out);
+    DataToBHWDC(absl::MakeConstSpan(data_h.data(), data_h.size()), shape_,
+                descriptor_, out);
   }
 
   return absl::OkStatus();
@@ -432,6 +597,26 @@ absl::Status Tensor::ReadData(CLCommandQueue* queue,
   return ReadDataBHWDC(absl::MakeSpan(dst->data), queue);
 }
 
+absl::Status Tensor::CreateFromDescriptor(const TensorDescriptor& desc,
+                                          CLContext* context) {
+  shape_ = desc.shape;  // Metadata is copied field by field; desc.data is not.
+  descriptor_.data_type = desc.data_type;
+  descriptor_.storage_type = desc.storage_type;
+  descriptor_.layout = desc.layout;
+  memory_owner_ = true;
+  const void* src = desc.data.empty() ? nullptr : desc.data.data();
+  CLMemory mem;  // src stays null for an empty desc.data (data() may not be).
+  RETURN_IF_ERROR(AllocateTensorMemory(*context, shape_, desc, src, &mem));
+  memory_ = mem.Release();
+  if (desc.storage_type == TensorStorageType::IMAGE_BUFFER) {
+    RETURN_IF_ERROR(CreateImageBufferFromBuffer(
+        *context, memory_, desc.data_type,
+        shape_.b * shape_.w * shape_.h * shape_.d * DivideRoundUp(shape_.c, 4),
+        &image_buffer_memory_));
+  }
+  return absl::OkStatus();
+}
+
 absl::Status CreateTensor(const CLContext& context, const BHWC& shape,
                           const TensorDescriptor& descriptor, Tensor* result) {
   const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
@@ -462,221 +647,15 @@ absl::Status AllocateTensorMemory(const CLContext& context, const BHWC& shape,
                                   const TensorDescriptor& descriptor,
                                   CLMemory* result) {
   const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
-  return AllocateTensorMemory(context, shape5D, descriptor, result);
+  return AllocateTensorMemory(context, shape5D, descriptor, nullptr, result);
 }
 
 absl::Status AllocateTensorMemory(const CLContext& context, const BHWDC& shape,
                                   const TensorDescriptor& descriptor,
                                   CLMemory* result) {
-  const int slices = DivideRoundUp(shape.c, 4);
-  switch (descriptor.storage_type) {
-    case TensorStorageType::BUFFER:
-    case TensorStorageType::IMAGE_BUFFER: {
-      const size_t data_size = shape.b * shape.w * shape.h * shape.d * slices *
-                               4 * SizeOf(descriptor.data_type);
-      cl_int error_code;
-      cl_mem memory = clCreateBuffer(context.context(), CL_MEM_READ_WRITE,
-                                     data_size, nullptr, &error_code);
-      if (!memory) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
-                         CLErrorCodeToString(error_code)));
-      }
-      *result = CLMemory(memory, true);
-      return absl::OkStatus();
-    }
-    case TensorStorageType::TEXTURE_2D: {
-      cl_image_desc desc;
-      desc.image_type = CL_MEM_OBJECT_IMAGE2D;
-      desc.image_width = shape.w * shape.b * shape.d;
-      desc.image_height = shape.h * slices;
-      desc.image_depth = 0;
-      desc.image_row_pitch = 0;
-      desc.image_slice_pitch = 0;
-      desc.num_mip_levels = 0;
-      desc.num_samples = 0;
-      desc.buffer = nullptr;
-
-      cl_image_format format;
-      format.image_channel_order = CL_RGBA;
-      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
-
-      cl_int error_code;
-      cl_mem memory = CreateImage2DLegacy(context.context(), CL_MEM_READ_WRITE,
-                                          &format, &desc, nullptr, &error_code);
-      if (error_code != CL_SUCCESS) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to create 2D texture (clCreateImage): ",
-                         CLErrorCodeToString(error_code)));
-      }
-
-      *result = CLMemory(memory, true);
-      return absl::OkStatus();
-    }
-    case TensorStorageType::TEXTURE_3D: {
-      cl_image_desc desc;
-      desc.image_type = CL_MEM_OBJECT_IMAGE3D;
-      desc.image_width = shape.w * shape.b;
-      desc.image_height = shape.h;
-      desc.image_depth = slices * shape.d;
-      desc.image_row_pitch = 0;
-      desc.image_slice_pitch = 0;
-      desc.num_mip_levels = 0;
-      desc.num_samples = 0;
-      desc.buffer = nullptr;
-
-      cl_image_format format;
-      format.image_channel_order = CL_RGBA;
-      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
-
-      cl_int error_code;
-      cl_mem memory = CreateImage3DLegacy(context.context(), CL_MEM_READ_WRITE,
-                                          &format, &desc, nullptr, &error_code);
-      if (error_code != CL_SUCCESS) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to create 3D texture (clCreateImage): ",
-                         CLErrorCodeToString(error_code)));
-      }
-
-      *result = CLMemory(memory, true);
-      return absl::OkStatus();
-    }
-    case TensorStorageType::TEXTURE_ARRAY: {
-      cl_image_desc desc;
-      desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-      desc.image_width = shape.w * shape.b;
-      desc.image_height = shape.h;
-      desc.image_depth = 0;
-      desc.image_array_size = slices * shape.d;
-      desc.image_row_pitch = 0;
-      desc.image_slice_pitch = 0;
-      desc.num_mip_levels = 0;
-      desc.num_samples = 0;
-      desc.buffer = nullptr;
-
-      cl_image_format format;
-      format.image_channel_order = CL_RGBA;
-      format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
-
-      cl_int error_code;
-      cl_mem memory = clCreateImage(context.context(), CL_MEM_READ_WRITE,
-                                    &format, &desc, nullptr, &error_code);
-      if (error_code != CL_SUCCESS) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to create 2D texture array (clCreateImage): ",
-                         CLErrorCodeToString(error_code)));
-      }
-
-      *result = CLMemory(memory, true);
-      return absl::OkStatus();
-    }
-
-    case TensorStorageType::SINGLE_TEXTURE_2D: {
-      if (slices != 1) {
-        return absl::InvalidArgumentError(absl::StrCat(
-            "SINGLE_TEXTURE_2D support only channels in range [1-4], but ",
-            shape.c, "was provided"));
-      }
-      cl_image_desc desc;
-      desc.image_type = CL_MEM_OBJECT_IMAGE2D;
-      desc.image_width = shape.w * shape.b * shape.d;
-      desc.image_height = shape.h;
-      desc.image_depth = 0;
-      desc.image_row_pitch = 0;
-      desc.image_slice_pitch = 0;
-      desc.num_mip_levels = 0;
-      desc.num_samples = 0;
-      desc.buffer = nullptr;
-
-      cl_image_format format;
-      if (context.IsFloatTexture2DSupported(shape.c, descriptor.data_type)) {
-        format.image_channel_order = ToChannelOrder(shape.c);
-        format.image_channel_data_type =
-            ToImageChannelType(descriptor.data_type);
-      } else {
-        return absl::InvalidArgumentError(absl::StrCat(
-            "This device doesn't support ", shape.c, "-channel textures."));
-      }
-
-      cl_int error_code;
-      cl_mem memory = CreateImage2DLegacy(context.context(), CL_MEM_READ_WRITE,
-                                          &format, &desc, nullptr, &error_code);
-      if (error_code != CL_SUCCESS) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to create 2D texture (clCreateImage): ",
-                         CLErrorCodeToString(error_code)));
-      }
-
-      *result = CLMemory(memory, true);
-      return absl::OkStatus();
-    }
-
-    default:
-      return absl::InternalError("Unsupported tensor storage type");
-  }
+  return AllocateTensorMemory(context, shape, descriptor, nullptr, result);
 }
 
-template <typename T>
-void Tensor::DataFromBHWDC(absl::Span<const float> src,
-                           absl::Span<T> dst) const {
-  const int channels_batch = GetChannelsAlignment();
-  for (int b = 0; b < shape_.b; ++b) {
-    for (int s = 0; s < Slices(); ++s) {
-      for (int y = 0; y < shape_.h; ++y) {
-        for (int x = 0; x < shape_.w; ++x) {
-          for (int d = 0; d < shape_.d; ++d) {
-            for (int c = 0; c < channels_batch; ++c) {
-              float value;
-              if (s * 4 + c < shape_.c) {
-                const int cpu_index =
-                    shape_.LinearIndex({b, y, x, d, s * 4 + c});
-                value = src[cpu_index];
-              } else {
-                value = 0.0f;
-              }
-              const int gpu_index = GetLinearIndex(b, x, y, d, s, c);
-              dst[gpu_index] = value;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template void Tensor::DataFromBHWDC<float>(absl::Span<const float> src,
-                                           absl::Span<float> dst) const;
-template void Tensor::DataFromBHWDC<half>(absl::Span<const float> src,
-                                          absl::Span<half> dst) const;
-
-template <typename T>
-void Tensor::DataToBHWDC(absl::Span<const T> src, absl::Span<float> dst) const {
-  const int channels_batch = GetChannelsAlignment();
-  for (int b = 0; b < shape_.b; ++b) {
-    for (int s = 0; s < Slices(); ++s) {
-      for (int y = 0; y < shape_.h; ++y) {
-        for (int x = 0; x < shape_.w; ++x) {
-          for (int d = 0; d < shape_.d; ++d) {
-            for (int c = 0; c < channels_batch; ++c) {
-              if (s * 4 + c >= shape_.c) {
-                continue;
-              }
-              const int cpu_index = shape_.LinearIndex({b, y, x, d, s * 4 + c});
-              const int gpu_index = GetLinearIndex(b, x, y, d, s, c);
-              dst[cpu_index] = src[gpu_index];
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template void Tensor::DataToBHWDC<float>(absl::Span<const float> src,
-                                         absl::Span<float> dst) const;
-template void Tensor::DataToBHWDC<half>(absl::Span<const half> src,
-                                        absl::Span<float> dst) const;
-
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h
index 1e02c77fd13..c6056dbbbec 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor.h
+++ b/tensorflow/lite/delegates/gpu/cl/tensor.h
@@ -92,6 +92,9 @@ class Tensor : public GPUObject {
   absl::Status ReadData(CLCommandQueue* queue, TensorFloat32* dst) const;
   absl::Status ReadData(CLCommandQueue* queue, Tensor5DFloat32* dst) const;
 
+  absl::Status CreateFromDescriptor(const TensorDescriptor& desc,
+                                    CLContext* context);
+
  private:
   absl::Status IsValid(const BHWC& shape) const;
   absl::Status IsValid(const BHWDC& shape) const;
@@ -104,37 +107,6 @@ class Tensor : public GPUObject {
   absl::Status ReadDataBHWDC(absl::Span<float> out,
                              CLCommandQueue* queue) const;
 
-  template <typename T>
-  void DataFromBHWDC(absl::Span<const float> src, absl::Span<T> dst) const;
-  template <typename T>
-  void DataToBHWDC(absl::Span<const T> src, absl::Span<float> dst) const;
-
-  // TODO(sorokin) might be bad performance
-  int GetLinearIndex(int b, int x, int y, int d, int s, int sub_c) const {
-    switch (descriptor_.storage_type) {
-      case TensorStorageType::BUFFER:
-      case TensorStorageType::IMAGE_BUFFER:
-      case TensorStorageType::TEXTURE_ARRAY:
-      case TensorStorageType::TEXTURE_3D:
-        return ((((d * Slices() + s) * shape_.h + y) * shape_.w + x) *
-                    shape_.b +
-                b) *
-                   4 +
-               sub_c;  // DSHWBC4
-      case TensorStorageType::TEXTURE_2D:
-        return ((((y * Slices() + s) * shape_.w + x) * shape_.b + b) *
-                    shape_.d +
-                d) *
-                   4 +
-               sub_c;  // HSWBDC4
-      case TensorStorageType::SINGLE_TEXTURE_2D:
-        return (((y * shape_.w + x) * shape_.b + b) * shape_.d + d) * shape_.c +
-               sub_c;  // HWBDC
-      case TensorStorageType::UNKNOWN:
-        return -1;
-    }
-  }
-
   int3 GetFullTensorRegion() const;
   void Release();
 
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc
index e19de02d59d..7bd5de6e31e 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc
+++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc
@@ -73,6 +73,25 @@ std::string ToString(TensorStorageType type) {
   }
 }
 
+TensorDescriptor::TensorDescriptor(TensorDescriptor&& desc)
+    : GPUObjectDescriptor(std::move(desc)),  // Moves the base-class part.
+      data_type(desc.data_type),             // The four metadata fields are
+      storage_type(desc.storage_type),       // copied from desc, not moved.
+      layout(desc.layout),
+      shape(desc.shape),
+      data(std::move(desc.data)) {}  // Takes over desc's byte buffer.
+TensorDescriptor& TensorDescriptor::operator=(TensorDescriptor&& desc) {
+  if (this == &desc) return *this;  // Self-assignment leaves *this untouched.
+  // Metadata fields are exchanged with desc; the byte buffer is moved over.
+  std::swap(data_type, desc.data_type);
+  std::swap(storage_type, desc.storage_type);
+  std::swap(layout, desc.layout);
+  std::swap(shape, desc.shape);
+  data = std::move(desc.data);
+  GPUObjectDescriptor::operator=(std::move(desc));  // Base part goes last.
+  return *this;
+}
+
 GPUResources TensorDescriptor::GetGPUResources() const {
   GPUResources resources;
   if (HasAxis(Axis::WIDTH)) {
@@ -725,6 +744,134 @@ TextureAddressMode TensorDescriptor::ModeFromState() const {
   }
 }
 
+void TensorDescriptor::UploadData(
+    const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src) {
+  shape = BHWDC(1, src.shape.h, src.shape.w, 1, src.shape.c);  // b = d = 1.
+  UploadData(absl::MakeConstSpan(src.data));  // Fills `data` from src.data.
+}
+
+void TensorDescriptor::UploadData(
+    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src) {
+  shape = BHWDC(1, 1, 1, 1, src.shape.v);  // All v values become channels.
+  UploadData(absl::MakeConstSpan(src.data));  // Fills `data` from src.data.
+}
+
+void TensorDescriptor::UploadData(absl::Span<const float> src) {
+  // SINGLE_TEXTURE_2D keeps shape.c channels; other storages align them by 4.
+  const bool exact = storage_type == TensorStorageType::SINGLE_TEXTURE_2D;
+  const int channels = exact ? shape.c : AlignByN(shape.c, 4);
+  const int count = shape.b * shape.w * shape.h * shape.d * channels;
+  data.resize(count * SizeOf(data_type));
+  if (data_type != DataType::FLOAT32) {
+    auto* packed = reinterpret_cast<half*>(data.data());
+    DataFromBHWDC(src, shape, *this, absl::MakeSpan(packed, count));
+    return;
+  }
+  auto* packed = reinterpret_cast<float*>(data.data());
+  DataFromBHWDC(src, shape, *this, absl::MakeSpan(packed, count));
+}
+
+namespace {
+int GetLinearIndex(const TensorDescriptor& desc, const BHWDC& shape, int b,
+                   int x, int y, int d, int s, int sub_c) {
+  const int slices = DivideRoundUp(shape.c, 4);
+  switch (desc.storage_type) {
+    case TensorStorageType::BUFFER:
+    case TensorStorageType::IMAGE_BUFFER:
+    case TensorStorageType::TEXTURE_ARRAY:
+    case TensorStorageType::TEXTURE_3D:
+      return ((((d * slices + s) * shape.h + y) * shape.w + x) * shape.b + b) *
+                 4 +
+             sub_c;  // DSHWBC4
+    case TensorStorageType::TEXTURE_2D:
+      return ((((y * slices + s) * shape.w + x) * shape.b + b) * shape.d + d) *
+                 4 +
+             sub_c;  // HSWBDC4
+    case TensorStorageType::SINGLE_TEXTURE_2D:
+      return (((y * shape.w + x) * shape.b + b) * shape.d + d) * shape.c +
+             sub_c;  // HWBDC
+    case TensorStorageType::UNKNOWN: break;  // No layout; handled below.
+  }
+  return -1;  // Every path returns, even for an unexpected enum value.
+}
+
+int GetChannelsAlignment(const TensorDescriptor& desc, const BHWDC& shape) {
+  const bool exact = desc.storage_type == TensorStorageType::SINGLE_TEXTURE_2D;
+  return exact ? shape.c : 4;  // Only SINGLE_TEXTURE_2D uses shape.c itself.
+}
+}  // namespace
+
+// Repacks `src` (logical BHWDC order, one float per element) into `dst` at the
+// offsets GetLinearIndex() yields for desc.storage_type. Channel slots beyond
+// shape.c are written as 0.
+// T is the element type of the packed buffer.
+template <typename T>
+void DataFromBHWDC(absl::Span<const float> src, const BHWDC& shape,
+                   const TensorDescriptor& desc, absl::Span<T> dst) {
+  const int lanes = GetChannelsAlignment(desc, shape);
+  const int slice_count = DivideRoundUp(shape.c, 4);
+  for (int b = 0; b < shape.b; ++b) {
+    for (int s = 0; s < slice_count; ++s) {
+      for (int y = 0; y < shape.h; ++y) {
+        for (int x = 0; x < shape.w; ++x) {
+          for (int d = 0; d < shape.d; ++d) {
+            for (int c = 0; c < lanes; ++c) {
+              const int ch = s * 4 + c;  // Logical channel; may be >= shape.c.
+              float value = 0.0f;
+              if (ch < shape.c) {
+                value = src[shape.LinearIndex({b, y, x, d, ch})];
+              }
+              dst[GetLinearIndex(desc, shape, b, x, y, d, s, c)] = value;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template void DataFromBHWDC<float>(absl::Span<const float> src,
+                                   const BHWDC& shape,
+                                   const TensorDescriptor& desc,
+                                   absl::Span<float> dst);  // For FLOAT32 data.
+template void DataFromBHWDC<half>(absl::Span<const float> src,
+                                  const BHWDC& shape,
+                                  const TensorDescriptor& desc,
+                                  absl::Span<half> dst);  // For half data.
+
+// Unpacks storage-ordered `src` into logical BHWDC `dst`; padding is skipped.
+template <typename T>
+void DataToBHWDC(absl::Span<const T> src, const BHWDC& shape,
+                 const TensorDescriptor& desc, absl::Span<float> dst) {
+  const int lanes = GetChannelsAlignment(desc, shape);
+  const int slice_count = DivideRoundUp(shape.c, 4);
+  for (int b = 0; b < shape.b; ++b) {
+    for (int s = 0; s < slice_count; ++s) {
+      for (int y = 0; y < shape.h; ++y) {
+        for (int x = 0; x < shape.w; ++x) {
+          for (int d = 0; d < shape.d; ++d) {
+            for (int c = 0; c < lanes; ++c) {
+              const int ch = s * 4 + c;
+              if (ch < shape.c) {  // Lanes at or past shape.c hold padding.
+                dst[shape.LinearIndex({b, y, x, d, ch})] =
+                    src[GetLinearIndex(desc, shape, b, x, y, d, s, c)];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template void DataToBHWDC<float>(absl::Span<const float> src,
+                                 const BHWDC& shape,
+                                 const TensorDescriptor& desc,
+                                 absl::Span<float> dst);  // For FLOAT32 data.
+template void DataToBHWDC<half>(absl::Span<const half> src, const BHWDC& shape,
+                                const TensorDescriptor& desc,
+                                absl::Span<float> dst);  // For half data.
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h
index 73b15ca322d..094e3905966 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h
+++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h
@@ -49,6 +49,11 @@ struct TensorDescriptor : public GPUObjectDescriptor {
   TensorDescriptor(DataType dt, TensorStorageType st, Layout l)
       : data_type(dt), storage_type(st), layout(l) {}
 
+  TensorDescriptor(const TensorDescriptor&) = default;
+  TensorDescriptor& operator=(const TensorDescriptor&) = default;
+  TensorDescriptor(TensorDescriptor&& desc);
+  TensorDescriptor& operator=(TensorDescriptor&& desc);
+
   bool operator==(const TensorDescriptor& d) const {
     return data_type == d.data_type && storage_type == d.storage_type &&
            layout == d.layout;
@@ -63,6 +68,10 @@ struct TensorDescriptor : public GPUObjectDescriptor {
 
   GPUResources GetGPUResources() const override;
 
+  absl::Status CreateGPUObject(CLContext* context,
+                               GPUObjectPtr* result) const override;
+  void Release() override { std::vector<uint8_t>().swap(data); }  // Frees heap.
+
   bool HasAxis(Axis axis) const;
   void SetTextureAddressMode(TextureAddressMode mode);
 
@@ -70,6 +79,9 @@ struct TensorDescriptor : public GPUObjectDescriptor {
       const std::vector<std::string>& args, std::string* value_name,
       std::string* x_coord, std::string* y_coord, std::string* s_coord) const;
 
+  void UploadData(const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src);
+  void UploadData(const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src);
+
   DataType data_type = DataType::UNKNOWN;
   TensorStorageType storage_type = TensorStorageType::UNKNOWN;
   // This field describes logical layout, actual(physical) GPU layout can be
@@ -77,6 +89,10 @@ struct TensorDescriptor : public GPUObjectDescriptor {
   Layout layout =
       Layout::UNKNOWN;  // Supported layouts is HWC, BHWC, HWDC, BHWDC
 
+  // optional
+  BHWDC shape;
+  std::vector<uint8_t> data;
+
  private:
   absl::Status PerformReadSelector(
       const std::vector<std::string>& args,
@@ -145,8 +161,18 @@ struct TensorDescriptor : public GPUObjectDescriptor {
   bool ParseCoordsFromArgs(const std::vector<std::string>& args, int offset,
                            std::string* xc, std::string* yc, std::string* zc,
                            std::string* sc, std::string* bc) const;
+
+  void UploadData(absl::Span<const float> src);
 };
 
+template <typename T>  // T: float or half; fills dst in desc's storage order.
+void DataFromBHWDC(absl::Span<const float> src, const BHWDC& shape,
+                   const TensorDescriptor& desc, absl::Span<T> dst);
+
+template <typename T>  // T: float or half; fills dst in logical BHWDC order.
+void DataToBHWDC(absl::Span<const T> src, const BHWDC& shape,
+                 const TensorDescriptor& desc, absl::Span<float> dst);
+
 std::string ToString(TensorStorageType type);
 
 }  // namespace cl