From 8f31b06f53b92cdd172587dc3300e23c846d1973 Mon Sep 17 00:00:00 2001
From: Raman Sarokin <sorokin@google.com>
Date: Tue, 26 May 2020 21:50:49 -0700
Subject: [PATCH] Added generic arguments to abstract int/float uniforms.
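
Arguments collects named int/float uniforms for an OpenCL kernel.
Generated kernel code refers to them as "args.<name>";
ResolveArgsPass() rewrites every such reference into a component of a
packed int4/float4 uniform and marks the value as active,
GetListOfArgs() returns the extra kernel parameters for the packed
vectors, and SetInt()/SetFloat() plus Bind() update and bind the packed
data at dispatch time. Uniforms that are never referenced stay inactive
and are not uploaded. Transpose is ported to the new mechanism as the
first user.

A rough sketch of the intended flow (the kernel body and the
identifiers "width", "dst", "dst_tensor", and "kernel" below are
illustrative; see the Transpose changes for real usage):

    Arguments args;
    args.AddInt("width");  // register a named int uniform
    std::string code =
        "__kernel void main_function(__global float4* dst$0) {\n"
        "  int X = get_global_id(0);\n"
        "  if (X >= args.width) return;\n"
        "  dst[X] = (float4)(0.0f);\n"
        "}\n";
    // "args.width" -> "shared_int4_0.x"; the uniform becomes active.
    args.ResolveArgsPass(&code);
    // GetListOfArgs() yields ",\n  int4 shared_int4_0", which is
    // substituted into the "$0" slot of the kernel signature.
    code = absl::Substitute(code, args.GetListOfArgs());
    // Later, when binding arguments for a dispatch:
    RETURN_IF_ERROR(args.SetInt("width", dst_tensor->Width()));
    RETURN_IF_ERROR(args.Bind(kernel.kernel(), kernel.GetBindingCounter()));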

PiperOrigin-RevId: 313327440
Change-Id: I12c82d0499b3ed9eb4f839cf8016a87bd0ea4807
---
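How the rewrite looks in the generated source: assuming dst_width and
dst_height end up as the first two active int uniforms, a bounds check
such as (illustrative, whitespace trimmed)

    if (X >= args.dst_width || Y >= args.dst_height) { return; }

becomes

    if (X >= shared_int4_0.x || Y >= shared_int4_0.y) { return; }

and ",\n  int4 shared_int4_0" is appended to the kernel parameter list
via the "$0" placeholder.
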
 tensorflow/lite/delegates/gpu/cl/BUILD        |  14 ++
 tensorflow/lite/delegates/gpu/cl/arguments.cc | 173 ++++++++++++++++++
 tensorflow/lite/delegates/gpu/cl/arguments.h  |  88 +++++++++
 tensorflow/lite/delegates/gpu/cl/cl_kernel.h  |   1 +
 .../lite/delegates/gpu/cl/kernels/BUILD       |   2 +
 .../delegates/gpu/cl/kernels/transpose.cc     |  72 +++++---
 .../lite/delegates/gpu/cl/kernels/transpose.h |   2 +
 7 files changed, 326 insertions(+), 26 deletions(-)
 create mode 100644 tensorflow/lite/delegates/gpu/cl/arguments.cc
 create mode 100644 tensorflow/lite/delegates/gpu/cl/arguments.h

diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index 2e686810767..c149479ae4c 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -38,6 +38,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "arguments",
+    srcs = ["arguments.cc"],
+    hdrs = ["arguments.h"],
+    deps = [
+        ":opencl_wrapper",
+        ":util",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/common:types",
+        "//tensorflow/lite/delegates/gpu/common:util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "buffer",
     srcs = ["buffer.cc"],
diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc
new file mode 100644
index 00000000000..26d9fc778b3
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc
@@ -0,0 +1,173 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/arguments.h"
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+namespace {
+std::string GetNextWord(const std::string& code, size_t first_position) {
+  size_t pos = first_position;
+  char t = code[pos];
+  while (absl::ascii_isalnum(t) || t == '_') {
+    pos++;
+    t = code[pos];
+  }
+  return code.substr(first_position, pos - first_position);
+}
+}  // namespace
+
+Arguments::Arguments(Arguments&& args)
+    : int_values_(std::move(args.int_values_)),
+      shared_int4s_data_(std::move(args.shared_int4s_data_)),
+      float_values_(std::move(args.float_values_)),
+      shared_float4s_data_(std::move(args.shared_float4s_data_)) {}
+Arguments& Arguments::operator=(Arguments&& args) {
+  if (this != &args) {
+    int_values_ = std::move(args.int_values_);
+    shared_int4s_data_ = std::move(args.shared_int4s_data_);
+    float_values_ = std::move(args.float_values_);
+    shared_float4s_data_ = std::move(args.shared_float4s_data_);
+  }
+  return *this;
+}
+
+void Arguments::AddFloat(const std::string& name, float value) {
+  float_values_[name].value = value;
+}
+void Arguments::AddInt(const std::string& name, int value) {
+  int_values_[name].value = value;
+}
+
+absl::Status Arguments::SetInt(const std::string& name, int value) {
+  auto ii = int_values_.find(name);
+  if (ii == int_values_.end()) {
+    return absl::NotFoundError(absl::StrCat("No argument with name - ", name));
+  }
+  ii->second.value = value;
+  if (ii->second.active) {
+    shared_int4s_data_[ii->second.offset] = value;
+  }
+  return absl::OkStatus();
+}
+
+absl::Status Arguments::SetFloat(const std::string& name, float value) {
+  auto fi = float_values_.find(name);
+  if (fi == float_values_.end()) {
+    return absl::NotFoundError(absl::StrCat("No argument with name - ", name));
+  }
+  fi->second.value = value;
+  if (fi->second.active) {
+    shared_float4s_data_[fi->second.offset] = value;
+  }
+  return absl::OkStatus();
+}
+
+std::string Arguments::GetListOfArgs() {
+  std::string result;
+  for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) {
+    absl::StrAppend(&result, ",\n  int4 shared_int4_", i);
+  }
+  for (int i = 0; i < shared_float4s_data_.size() / 4; ++i) {
+    absl::StrAppend(&result, ",\n  float4 shared_float4_", i);
+  }
+  return result;
+}
+
+absl::Status Arguments::Bind(cl_kernel kernel, int offset) {
+  for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) {
+    const int error_code = clSetKernelArg(kernel, offset, sizeof(int32_t) * 4,
+                                          &shared_int4s_data_[i * 4]);
+    if (error_code != CL_SUCCESS) {
+      return absl::UnknownError(absl::StrCat(
+          "Failed to set kernel arguments - ", CLErrorCodeToString(error_code),
+          "(at index - ", offset, ")"));
+    }
+    offset++;
+  }
+  for (int i = 0; i < shared_float4s_data_.size() / 4; ++i) {
+    const int error_code = clSetKernelArg(kernel, offset, sizeof(float) * 4,
+                                          &shared_float4s_data_[i * 4]);
+    if (error_code != CL_SUCCESS) {
+      return absl::UnknownError(absl::StrCat(
+          "Failed to set kernel arguments - ", CLErrorCodeToString(error_code),
+          "(at index - ", offset, ")"));
+    }
+    offset++;
+  }
+  return absl::OkStatus();
+}
+
+std::string Arguments::AddActiveArgument(const std::string& arg_name) {
+  if (auto it = int_values_.find(arg_name); it != int_values_.end()) {
+    int int_index;
+    if (it->second.active) {
+      int_index = it->second.offset;
+    } else {
+      it->second.active = true;
+      it->second.offset = shared_int4s_data_.size();
+      int_index = it->second.offset;
+      shared_int4s_data_.push_back(it->second.value);
+    }
+    std::string index = std::to_string(int_index / 4);
+    std::string postfixes[4] = {"x", "y", "z", "w"};
+    return "shared_int4_" + index + "." + postfixes[int_index % 4];
+  }
+  if (auto it = float_values_.find(arg_name); it != float_values_.end()) {
+    int float_index;
+    if (it->second.active) {
+      float_index = it->second.offset;
+    } else {
+      it->second.active = true;
+      it->second.offset = shared_float4s_data_.size();
+      float_index = it->second.offset;
+      shared_float4s_data_.push_back(it->second.value);
+    }
+    std::string index = std::to_string(float_index / 4);
+    std::string postfixes[4] = {"x", "y", "z", "w"};
+    return "shared_float4_" + index + "." + postfixes[float_index % 4];
+  }
+  return arg_name;
+}
+
+void Arguments::ResolveArgsPass(std::string* code) {
+  std::string result;
+  constexpr char kPrefix[] = "args.";
+  size_t position = 0;
+  size_t next_position = code->find(kPrefix);
+  while (next_position != std::string::npos) {
+    size_t arg_pos = next_position;
+    next_position += strlen(kPrefix);
+    std::string object_name = GetNextWord(*code, next_position);
+    std::string new_name = AddActiveArgument(object_name);
+    code->replace(arg_pos, object_name.size() + strlen(kPrefix), new_name);
+    position = arg_pos + new_name.size();
+    next_position = code->find(kPrefix, position);
+  }
+
+  int shared_int4s_aligned_size = AlignByN(shared_int4s_data_.size(), 4);
+  shared_int4s_data_.resize(shared_int4s_aligned_size);
+  int shared_float4s_aligned_size = AlignByN(shared_float4s_data_.size(), 4);
+  shared_float4s_data_.resize(shared_float4s_aligned_size);
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h
new file mode 100644
index 00000000000..274532d0199
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/cl/arguments.h
@@ -0,0 +1,88 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
+#include "tensorflow/lite/delegates/gpu/cl/util.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/types.h"
+#include "tensorflow/lite/delegates/gpu/common/util.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+class Arguments {
+ public:
+  Arguments() = default;
+  void AddFloat(const std::string& name, float value = 0.0f);
+  void AddInt(const std::string& name, int value = 0);
+
+  absl::Status SetInt(const std::string& name, int value);
+  absl::Status SetFloat(const std::string& name, float value);
+
+  std::string GetListOfArgs();
+
+  absl::Status Bind(cl_kernel kernel, int offset);
+
+  void ResolveArgsPass(std::string* code);
+
+  // Move only
+  Arguments(Arguments&& args);
+  Arguments& operator=(Arguments&& args);
+  Arguments(const Arguments&) = delete;
+  Arguments& operator=(const Arguments&) = delete;
+
+ private:
+  std::string AddActiveArgument(const std::string& arg_name);
+
+  struct IntValue {
+    int value;
+
+    // Many uniforms are generated automatically but never used; marking
+    // only referenced uniforms as active reduces the data transferred.
+    bool active = false;
+
+    // Offset into the shared uniform storage.
+    uint32_t offset = -1;
+  };
+  std::map<std::string, IntValue> int_values_;
+  std::vector<int32_t> shared_int4s_data_;
+
+  struct FloatValue {
+    float value;
+
+    // Many uniforms are generated automatically but never used; marking
+    // only referenced uniforms as active reduces the data transferred.
+    bool active = false;
+
+    // Offset into the shared uniform storage.
+    uint32_t offset = -1;
+  };
+  std::map<std::string, FloatValue> float_values_;
+  std::vector<float> shared_float4s_data_;
+};
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_kernel.h b/tensorflow/lite/delegates/gpu/cl/cl_kernel.h
index b575684d2b4..be9dc6dbf03 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_kernel.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_kernel.h
@@ -65,6 +65,7 @@ class CLKernel {
   int GetPrivateMemorySize() const { return private_memory_size_; }
   int GetMaxWorkGroupSize() const { return max_work_group_size_; }
 
+  int GetBindingCounter() const { return binding_counter_; }
   void ResetBindingCounter() { binding_counter_ = 0; }
 
   // Do not use this function
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
index ff6f06eeb68..b5510b3e8df 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@@ -1290,8 +1290,10 @@ cc_library(
         ":gpu_operation",
         ":util",
         ":work_group_picking",
+        "//tensorflow/lite/delegates/gpu/cl:arguments",
         "//tensorflow/lite/delegates/gpu/common:operations",
         "//tensorflow/lite/delegates/gpu/common:types",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
index 66a272fa2da..fc3efe32c3b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/substitute.h"
+#include "tensorflow/lite/delegates/gpu/cl/arguments.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 
@@ -27,37 +29,45 @@ namespace {
 
 std::string GetTransposeCode(
     const OperationDef& op_def, const TransposeAttributes& attr,
-    const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor(
-      "src_data",
-      WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"},
-      op_def.src_tensors[0]);
-  TensorCodeGenerator dst_tensor(
-      "dst_data",
-      WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"},
-      op_def.dst_tensors[0]);
+    const std::vector<ElementwiseOperation*>& linked_operations,
+    Arguments* args) {
+  TensorCodeGenerator src_tensor("src_data",
+                                 WHSBPoint{"args.src_width", "args.src_height",
+                                           "args.src_slices", "args.src_batch"},
+                                 op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data",
+                                 WHSBPoint{"args.dst_width", "args.dst_height",
+                                           "args.dst_slices", "args.dst_batch"},
+                                 op_def.dst_tensors[0]);
+
+  args->AddInt("src_width");
+  args->AddInt("src_height");
+  args->AddInt("src_slices");
+  args->AddInt("src_batch");
+  args->AddInt("dst_width");
+  args->AddInt("dst_height");
+  args->AddInt("dst_slices");
+  args->AddInt("dst_batch");
+  args->AddInt("dst_channels");
 
   const std::string batch_id = op_def.IsBatchSupported() ? "B" : "";
   std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
-  c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
-  c += "    int4 src_size,             \n";
-  c += "    int4 dst_size,             \n";
-  c += "    int src_channels,          \n";
-  c += "    int dst_channels           \n";
-  c += ") {\n";
+  c += dst_tensor.GetDeclaration(AccessType::WRITE);
+  c += "$0) {\n";
   if (op_def.IsBatchSupported()) {
     c += "  int linear_id = get_global_id(0);\n";
-    c += "  int X = linear_id / dst_size.w;\n";
-    c += "  int B = linear_id % dst_size.w;\n";
+    c += "  int X = linear_id / args.dst_batch;\n";
+    c += "  int B = linear_id % args.dst_batch;\n";
   } else {
     c += "  int X = get_global_id(0);\n";
   }
   c += "  int Y = get_global_id(1);\n";
   c += "  int Z = get_global_id(2);\n";
-  c += "  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) { \n";
+  c += "  if (X >= args.dst_width || Y >= args.dst_height || Z >= "
+       "args.dst_slices) { \n";
   c += "    return; \n";
   c += "  } \n";
   c += "  FLT temps[4];\n";
@@ -83,7 +93,7 @@ std::string GetTransposeCode(
   } else {
     c += "  for (int i = 0; i < 4; ++i) {\n";
     c += "    int dst_channel = Z * 4 + i;\n";
-    c += "    if (dst_channel < dst_channels) {;\n";
+    c += "    if (dst_channel < args.dst_channels) {;\n";
     const std::string bhwc[] = {"B", "Y", "X", "dst_channel"};
     std::string src_b = op_def.IsBatchSupported() ? bhwc[remap[0]] : "";
     c += "      int s_y = " + bhwc[remap[1]] + ";\n";
@@ -100,24 +110,27 @@ std::string GetTransposeCode(
   }
   c += "  FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n";
   std::string x_3dcoord =
-      op_def.IsBatchSupported() ? "X * dst_size.w + B" : "X";
+      op_def.IsBatchSupported() ? "X * args.dst_batch + B" : "X";
   const LinkingContext context{"result", x_3dcoord, "Y", "Z"};
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", batch_id);
   c += "}\n";
-  return c;
+  args->ResolveArgsPass(&c);
+  return absl::Substitute(c, args->GetListOfArgs());
 }
 }  // namespace
 
 Transpose::Transpose(Transpose&& operation)
     : GPUOperation(std::move(operation)),
       attr_(operation.attr_),
+      args_(std::move(operation.args_)),
       kernel_(std::move(operation.kernel_)),
       work_group_size_(operation.work_group_size_) {}
 
 Transpose& Transpose::operator=(Transpose&& operation) {
   if (this != &operation) {
     attr_ = operation.attr_;
+    args_ = std::move(operation.args_);
     kernel_ = std::move(operation.kernel_);
     std::swap(work_group_size_, operation.work_group_size_);
     GPUOperation::operator=(std::move(operation));
@@ -126,21 +139,28 @@ Transpose& Transpose::operator=(Transpose&& operation) {
 }
 
 absl::Status Transpose::Compile(const CreationContext& creation_context) {
-  const auto code = GetTransposeCode(definition_, attr_, linked_operations_);
+  const auto code =
+      GetTransposeCode(definition_, attr_, linked_operations_, &args_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
 }
 
 absl::Status Transpose::BindArguments() {
+  RETURN_IF_ERROR(args_.SetInt("src_width", src_[0]->Width()));
+  RETURN_IF_ERROR(args_.SetInt("src_height", src_[0]->Height()));
+  RETURN_IF_ERROR(args_.SetInt("src_slices", src_[0]->Slices()));
+  RETURN_IF_ERROR(args_.SetInt("src_batch", src_[0]->Batch()));
+  RETURN_IF_ERROR(args_.SetInt("dst_width", dst_[0]->Width()));
+  RETURN_IF_ERROR(args_.SetInt("dst_height", dst_[0]->Height()));
+  RETURN_IF_ERROR(args_.SetInt("dst_slices", dst_[0]->Slices()));
+  RETURN_IF_ERROR(args_.SetInt("dst_batch", dst_[0]->Batch()));
+  RETURN_IF_ERROR(args_.SetInt("dst_channels", dst_[0]->Channels()));
   kernel_.ResetBindingCounter();
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
   RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels()));
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Channels()));
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
   return absl::OkStatus();
 }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
index 61038b1e0ca..13f06281012 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_TRANSPOSE_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_TRANSPOSE_H_
 
+#include "tensorflow/lite/delegates/gpu/cl/arguments.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/types.h"
@@ -43,6 +44,7 @@ class Transpose : public GPUOperation {
   int3 GetGridSize() const;
 
   TransposeAttributes attr_;
+  Arguments args_;
   CLKernel kernel_;
   int3 work_group_size_;
 };