From 1219f682f7faa3619b58f41cc3f479445588cf24 Mon Sep 17 00:00:00 2001
From: Robert David <lrdx@google.com>
Date: Tue, 18 Aug 2020 13:55:38 -0700
Subject: [PATCH] Enable OpenCL 2.0 or 3.0 compilation when the device supports
 it.

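By default, OpenCL programs are compiled as OpenCL 1.x only. The
MeanStdDevNormalization kernel now adds -cl-std=CL3.0 or -cl-std=CL2.0
to its compiler options when the device reports that OpenCL version.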

PiperOrigin-RevId: 327300390
Change-Id: I7e31c3c0253bc9175f156614a47f5ef8dddf2147
---
 tensorflow/lite/delegates/gpu/cl/cl_program.cc     |  2 ++
 tensorflow/lite/delegates/gpu/cl/cl_program.h      |  3 ++-
 tensorflow/lite/delegates/gpu/cl/kernels/BUILD     |  2 ++
 .../gpu/cl/kernels/mean_stddev_normalization.cc    | 14 +++++++++++---
 .../gpu/cl/kernels/mean_stddev_normalization.h     |  6 ++++--
 .../gpu/cl/selectors/operation_selector.cc         |  3 ++-
 6 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.cc b/tensorflow/lite/delegates/gpu/cl/cl_program.cc
index fd29ebec2d7..a67ebae8ca3 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_program.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_program.cc
@@ -95,6 +95,8 @@ std::string CompilerOptionToString(const CLDevice& device,
       return "-cl-opt-disable";
     case CompilerOptions::CL_2_0:
       return "-cl-std=CL2.0";
+    case CompilerOptions::CL_3_0:
+      return "-cl-std=CL3.0";
   }
 }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.h b/tensorflow/lite/delegates/gpu/cl/cl_program.h
index 138b7d9fbd0..af8239ae7f5 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_program.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_program.h
@@ -41,7 +41,8 @@ enum class CompilerOptions {
   ADRENO_MORE_WAVES,
   POWERVR_FP16,
   CL_OPT_DISABLE,
-  CL_2_0
+  CL_2_0,
+  CL_3_0,
 };
 
 std::string CompilerOptionsToString(
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
index c8351304188..7e995e0062b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@@ -774,6 +774,8 @@ cc_library(
         ":gpu_operation",
         ":util",
         ":work_group_picking",
+        "//tensorflow/lite/delegates/gpu/cl:cl_program",
+        "//tensorflow/lite/delegates/gpu/cl:device_info",
         "//tensorflow/lite/delegates/gpu/cl:precision",
         "//tensorflow/lite/delegates/gpu/common:operations",
         "//tensorflow/lite/delegates/gpu/common:status",
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
index 0702f797d84..a89d7126b99 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
+#include "tensorflow/lite/delegates/gpu/cl/device_info.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
@@ -64,7 +66,8 @@ static inline float local_reduce(float input, __local float* tmp) {
 }
 }  // namespace
 
-MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
+MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
+                                                 const DeviceInfo& device_info)
     : GPUOperation(definition) {
   // The kernel code does not inherently need a fixed size, but in order to not
   // hardcode the __local array's size for the reductions, we would need to pass
@@ -74,6 +77,11 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
   work_group_size_.y = 1;  // Required
   work_group_size_.z = 1;  // Required
   code_ = GetNormalizationCode();
+  if (device_info.cl_version >= OpenCLVersion::CL_3_0) {
+    compiler_options_.push_back(CompilerOptions::CL_3_0);
+  } else if (device_info.cl_version >= OpenCLVersion::CL_2_0) {
+    compiler_options_.push_back(CompilerOptions::CL_2_0);
+  }
 }
 
 std::string MeanStdDevNormalization::GetNormalizationCode() {
@@ -145,8 +153,8 @@ int3 MeanStdDevNormalization::GetGridSize() const {
 }
 
 MeanStdDevNormalization CreateMeanStdDevNormalization(
-    const OperationDef& definition) {
-  return MeanStdDevNormalization(definition);
+    const OperationDef& definition, const DeviceInfo& device_info) {
+  return MeanStdDevNormalization(definition, device_info);
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
index 47cc7ff46d1..e898803e377 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_LSTM_NORMALIZATION_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_LSTM_NORMALIZATION_H_
 
+#include "tensorflow/lite/delegates/gpu/cl/device_info.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
@@ -28,7 +29,8 @@ namespace cl {
 // Implements tensor_utils::MeanStddevNormalization
 class MeanStdDevNormalization : public GPUOperation {
  public:
-  explicit MeanStdDevNormalization(const OperationDef& definition);
+  explicit MeanStdDevNormalization(const OperationDef& definition,
+                                   const DeviceInfo& device_info);
 
   void GetPossibleKernelWorkGroups(
       TuningType tuning_type, const DeviceInfo& device_info,
@@ -50,7 +52,7 @@ class MeanStdDevNormalization : public GPUOperation {
 };
 
 MeanStdDevNormalization CreateMeanStdDevNormalization(
-    const OperationDef& definition);
+    const OperationDef& definition, const DeviceInfo& device_info);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
index b257e5a85da..58c91ccf191 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@@ -262,7 +262,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
       return SelectMean(attr, op_def, creation_context.device->info_, gpu_op);
     }
     case OperationType::MEAN_STDDEV_NORMALIZATION: {
-      MeanStdDevNormalization operation = CreateMeanStdDevNormalization(op_def);
+      MeanStdDevNormalization operation =
+          CreateMeanStdDevNormalization(op_def, creation_context.device->info_);
       *gpu_op =
           absl::make_unique<MeanStdDevNormalization>(std::move(operation));
       return absl::OkStatus();