From d80fda0877194b4fe0b294d64a2251bddd7af9b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 8 Jan 2020 23:06:14 -0800
Subject: [PATCH] MaxUnpooling3D for OpenCL backend.

PiperOrigin-RevId: 288841849
Change-Id: I558f27ebb7f9de122c784e393e293b976f7d836e
---
 .../delegates/gpu/cl/kernels/max_unpooling.cc | 167 ++++++++++++++++++
 .../delegates/gpu/cl/kernels/max_unpooling.h  |  30 ++++
 2 files changed, 197 insertions(+)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
index 89a7b9ca84f..44f475dadd9 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
@@ -101,6 +101,94 @@ std::string GetMaxUnoolingKernelCode(
 
   return c;
 }
+
+std::string GetMaxUnooling3DKernelCode(
+    const OperationDef& op_def, const CLDevice& device,
+    const std::vector<ElementwiseOperation*>& linked_operations) {
+  TensorCodeGenerator src(
+      "src_data",
+      WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"},
+      op_def.src_tensors[0]);
+  TensorCodeGenerator src_ind(
+      "src_data_indices",
+      WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"},
+      op_def.src_tensors[1]);
+  TensorCodeGenerator dst(
+      "dst_data",
+      WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"},
+      op_def.dst_tensors[0]);
+
+  const auto address_mode = GetFastestZeroMode(device);
+
+  std::string c = GetCommonDefines(op_def.precision);
+
+  c += "__kernel void main_function(\n";
+  c += src.GetDeclaration(AccessType::READ) + ",\n";
+  c += src_ind.GetDeclaration(AccessType::READ);
+  c += GetArgsDeclaration(linked_operations);
+  c += dst.GetDeclaration(AccessType::WRITE) + ",\n";
+  c += "    int4 src_size,      \n";
+  c += "    int4 dst_size,      \n";
+  if (op_def.batch_support) {
+    c += "    int batch_size,          \n";
+  }
+  c += "    int4 kernel_size,   \n";
+  c += "    int4 padding,       \n";
+  c += "    int4 stride         \n";
+  c += ") {\n";
+  c += "  int X = get_global_id(0);\n";
+  c += "  int Y = get_global_id(1);\n";
+  c += "  int linear_id_z = get_global_id(2);\n";
+  c += "  int S = linear_id_z % dst_size.w;\n";
+  c += "  int Z = linear_id_z / dst_size.w;\n";
+  c += "  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n";
+  if (op_def.batch_support) {
+    c += "  int linear_id = get_global_id(0);\n";
+    c += "  int X0 = linear_id / batch_size;\n";
+    c += "  int B = linear_id % batch_size;\n";
+    c += "  int src_x0 = (X0 + padding.x) / stride.x;\n";
+    c += "  int src_x = src_x0 * batch_size + B;\n";
+  } else {
+    c += "  int src_x = (X + padding.x) / stride.x;\n";
+  }
+  c += "  int src_y = (Y + padding.y) / stride.y;\n";
+  c += "  int src_z = (Z + padding.z) / stride.z;\n";
+  c += "  " + src.GetAddressWHDS("src_adr", "src_x", "src_y", "src_z", "S") +
+       "\n";
+  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
+    c += "  bool outside = src_x < 0 || src_y < 0 || src_z < 0 || ";
+    c += "  src_x >= src_size.x || src_y >= src_size.y || src_z >= "
+         "src_size.z;\n";
+    c += "  FLT4 src = (FLT4)(0.0f);\n";
+    c += "  int4 ind = (int4)(0);\n";
+    c += "  if (!outside) {\n";
+    c += "    src = " + src.Read("src_adr") + ";\n";
+    c += "    ind = convert_int4(" + src_ind.Read("src_adr") + ");\n";
+    c += "  }\n";
+  } else {
+    c += "  FLT4 src = " + src.Read("src_adr", address_mode) + ";\n";
+    c += "  int4 ind = convert_int4(" + src_ind.Read("src_adr", address_mode) +
+         ");\n";
+  }
+  if (op_def.batch_support) {
+    c += "  int t_x = X0 - (src_x0 * stride.x - padding.x);\n";
+  } else {
+    c += "  int t_x = X - (src_x * stride.x - padding.x);\n";
+  }
+  c += "  int t_y = Y - (src_y * stride.y - padding.y);\n";
+  c += "  int t_z = Z - (src_z * stride.z - padding.z);\n";
+  c += "  int t_index = (t_y * kernel_size.x + t_x) * kernel_size.z + t_z;\n";
+  c += "  FLT4 result;\n";
+  const std::string channels[] = {".x", ".y", ".z", ".w"};
+  for (int i = 0; i < 4; ++i) {
+    const auto& s = channels[i];
+    c += "  result" + s + " = t_index == ind" + s + " ? src" + s + ": 0.0f;\n";
+  }
+  c += PostProcess(linked_operations, {"result", "X", "Y", "S"});
+  c += "  " + dst.WriteWHDS("result", "X", "Y", "Z", "S");
+  c += "}\n";
+  return c;
+}
 }  // namespace
 
 MaxUnpooling::MaxUnpooling(const OperationDef& definition,
@@ -175,6 +263,85 @@ MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
   return MaxUnpooling(definition, attr);
 }
 
+MaxUnpooling3D::MaxUnpooling3D(const OperationDef& definition,
+                               const MaxUnpooling3DAttributes& attr)
+    : GPUOperation(definition),
+      stride_(attr.strides.w, attr.strides.h, attr.strides.d),
+      padding_(attr.padding.appended.w, attr.padding.appended.h,
+               attr.padding.appended.d),
+      kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d) {}
+
+MaxUnpooling3D::MaxUnpooling3D(MaxUnpooling3D&& kernel)
+    : GPUOperation(std::move(kernel)),
+      stride_(kernel.stride_),
+      padding_(kernel.padding_),
+      kernel_size_(kernel.kernel_size_),
+      kernel_(std::move(kernel.kernel_)),
+      work_group_size_(kernel.work_group_size_) {}
+
+MaxUnpooling3D& MaxUnpooling3D::operator=(MaxUnpooling3D&& kernel) {
+  if (this != &kernel) {
+    std::swap(stride_, kernel.stride_);
+    std::swap(padding_, kernel.padding_);
+    std::swap(kernel_size_, kernel.kernel_size_);
+    kernel_ = std::move(kernel.kernel_);
+    std::swap(work_group_size_, kernel.work_group_size_);
+    GPUOperation::operator=(std::move(kernel));
+  }
+  return *this;
+}
+
+Status MaxUnpooling3D::Compile(const CreationContext& creation_context) {
+  const auto code = GetMaxUnooling3DKernelCode(
+      definition_, *creation_context.device, linked_operations_);
+  return creation_context.cache->GetOrCreateCLKernel(
+      code, "main_function", *creation_context.context,
+      *creation_context.device, &kernel_);
+}
+
+Status MaxUnpooling3D::BindArguments() {
+  kernel_.ResetBindingCounter();
+  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
+  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
+  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
+  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
+  if (definition_.batch_support) {
+    RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
+  }
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(
+      int4(kernel_size_.x, kernel_size_.y, kernel_size_.z, 1)));
+  RETURN_IF_ERROR(
+      kernel_.SetBytesAuto(int4(padding_.x, padding_.y, padding_.z, 1)));
+  RETURN_IF_ERROR(
+      kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1)));
+
+  return OkStatus();
+}
+
+int3 MaxUnpooling3D::GetGridSize() const {
+  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
+  const int grid_y = dst_[0]->Height();
+  const int grid_z = dst_[0]->Slices() * dst_[0]->Depth();
+  return int3(grid_x, grid_y, grid_z);
+}
+
+Status MaxUnpooling3D::Tune(const TuningParameters& params) {
+  RETURN_IF_ERROR(BindArguments());
+  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
+}
+
+Status MaxUnpooling3D::AddToQueue(CLCommandQueue* queue) {
+  RETURN_IF_ERROR(BindArguments());
+  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
+}
+
+MaxUnpooling3D CreateMaxUnpooling3D(const OperationDef& definition,
+                                    const MaxUnpooling3DAttributes& attr) {
+  return MaxUnpooling3D(definition, attr);
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
index 2af3c5e3fe2..c7479acb728 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
@@ -55,6 +55,36 @@ class MaxUnpooling : public GPUOperation {
 MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
                                 const MaxUnpooling2DAttributes& attr);
 
+class MaxUnpooling3D : public GPUOperation {
+ public:
+  MaxUnpooling3D(const OperationDef& definition,
+                 const MaxUnpooling3DAttributes& attr);
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;
+
+  Status Compile(const CreationContext& creation_context) override;
+
+  // Move only
+  MaxUnpooling3D(MaxUnpooling3D&& kernel);
+  MaxUnpooling3D& operator=(MaxUnpooling3D&& kernel);
+  MaxUnpooling3D(const MaxUnpooling3D&) = delete;
+  MaxUnpooling3D& operator=(const MaxUnpooling3D&) = delete;
+
+ private:
+  Status BindArguments();
+  int3 GetGridSize() const;
+
+  int3 stride_;
+  int3 padding_;
+  int3 kernel_size_;
+
+  CLKernel kernel_;
+  int3 work_group_size_ = int3(8, 4, 1);
+};
+
+MaxUnpooling3D CreateMaxUnpooling3D(const OperationDef& definition,
+                                    const MaxUnpooling3DAttributes& attr);
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite