From 1219f682f7faa3619b58f41cc3f479445588cf24 Mon Sep 17 00:00:00 2001
From: Robert David <lrdx@google.com>
Date: Tue, 18 Aug 2020 13:55:38 -0700
Subject: [PATCH] Enable OpenCL 2.0 or 3.0 compilation when the device
 supports it.

By default OpenCL programs are compiled as 1.x only.

PiperOrigin-RevId: 327300390
Change-Id: I7e31c3c0253bc9175f156614a47f5ef8dddf2147
---
 tensorflow/lite/delegates/gpu/cl/cl_program.cc     |  2 ++
 tensorflow/lite/delegates/gpu/cl/cl_program.h      |  3 ++-
 tensorflow/lite/delegates/gpu/cl/kernels/BUILD     |  2 ++
 .../gpu/cl/kernels/mean_stddev_normalization.cc    | 14 +++++++++++---
 .../gpu/cl/kernels/mean_stddev_normalization.h     |  6 ++++--
 .../gpu/cl/selectors/operation_selector.cc         |  3 ++-
 6 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.cc b/tensorflow/lite/delegates/gpu/cl/cl_program.cc
index fd29ebec2d7..a67ebae8ca3 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_program.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_program.cc
@@ -95,6 +95,8 @@ std::string CompilerOptionToString(const CLDevice& device,
       return "-cl-opt-disable";
     case CompilerOptions::CL_2_0:
       return "-cl-std=CL2.0";
+    case CompilerOptions::CL_3_0:
+      return "-cl-std=CL3.0";
   }
 }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_program.h b/tensorflow/lite/delegates/gpu/cl/cl_program.h
index 138b7d9fbd0..af8239ae7f5 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_program.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_program.h
@@ -41,7 +41,8 @@ enum class CompilerOptions {
   ADRENO_MORE_WAVES,
   POWERVR_FP16,
   CL_OPT_DISABLE,
-  CL_2_0
+  CL_2_0,
+  CL_3_0,
 };
 
 std::string CompilerOptionsToString(
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
index c8351304188..7e995e0062b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@@ -774,6 +774,8 @@ cc_library(
         ":gpu_operation",
         ":util",
         ":work_group_picking",
+        "//tensorflow/lite/delegates/gpu/cl:cl_program",
+        "//tensorflow/lite/delegates/gpu/cl:device_info",
         "//tensorflow/lite/delegates/gpu/cl:precision",
         "//tensorflow/lite/delegates/gpu/common:operations",
         "//tensorflow/lite/delegates/gpu/common:status",
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
index 0702f797d84..a89d7126b99 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
+#include "tensorflow/lite/delegates/gpu/cl/device_info.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
@@ -64,7 +66,8 @@ static inline float local_reduce(float input, __local float* tmp) {
 }
 }  // namespace
 
-MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
+MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
+                                                 const DeviceInfo& device_info)
     : GPUOperation(definition) {
   // The kernel code does not inherently need a fixed size, but in order to not
   // hardcode the __local array's size for the reductions, we would need to pass
@@ -74,6 +77,11 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
   work_group_size_.y = 1;  // Required
   work_group_size_.z = 1;  // Required
   code_ = GetNormalizationCode();
+  if (device_info.cl_version >= OpenCLVersion::CL_3_0) {
+    compiler_options_.push_back(CompilerOptions::CL_3_0);
+  } else if (device_info.cl_version >= OpenCLVersion::CL_2_0) {
+    compiler_options_.push_back(CompilerOptions::CL_2_0);
+  }
 }
 
 std::string MeanStdDevNormalization::GetNormalizationCode() {
@@ -145,8 +153,8 @@ int3 MeanStdDevNormalization::GetGridSize() const {
 }
 
 MeanStdDevNormalization CreateMeanStdDevNormalization(
-    const OperationDef& definition) {
-  return MeanStdDevNormalization(definition);
+    const OperationDef& definition, const DeviceInfo& device_info) {
+  return MeanStdDevNormalization(definition, device_info);
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
index 47cc7ff46d1..e898803e377 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_LSTM_NORMALIZATION_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_LSTM_NORMALIZATION_H_
 
+#include "tensorflow/lite/delegates/gpu/cl/device_info.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
@@ -28,7 +29,8 @@ namespace cl {
 // Implements tensor_utils::MeanStddevNormalization
 class MeanStdDevNormalization : public GPUOperation {
  public:
-  explicit MeanStdDevNormalization(const OperationDef& definition);
+  explicit MeanStdDevNormalization(const OperationDef& definition,
+                                   const DeviceInfo& device_info);
 
   void GetPossibleKernelWorkGroups(
       TuningType tuning_type, const DeviceInfo& device_info,
@@ -50,7 +52,7 @@ class MeanStdDevNormalization : public GPUOperation {
 };
 
 MeanStdDevNormalization CreateMeanStdDevNormalization(
-    const OperationDef& definition);
+    const OperationDef& definition, const DeviceInfo& device_info);
 
 }  // namespace cl
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
index b257e5a85da..58c91ccf191 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@@ -262,7 +262,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
       return SelectMean(attr, op_def, creation_context.device->info_, gpu_op);
     }
     case OperationType::MEAN_STDDEV_NORMALIZATION: {
-      MeanStdDevNormalization operation = CreateMeanStdDevNormalization(op_def);
+      MeanStdDevNormalization operation =
+          CreateMeanStdDevNormalization(op_def, creation_context.device->info_);
       *gpu_op =
           absl::make_unique<MeanStdDevNormalization>(std::move(operation));
       return absl::OkStatus();
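
Note (illustrative sketch, not part of the patch): the constructor change in
mean_stddev_normalization.cc prefers -cl-std=CL3.0 when the device reports
OpenCL 3.0, falls back to -cl-std=CL2.0 on 2.x devices, and adds no -cl-std
option on 1.x-only devices, which matches the default compilation behavior
described in the commit message. The self-contained C++ sketch below mirrors
that selection logic; the OpenCLVersion ordering, the DeviceInfoStub struct,
and the CompilerOptionToFlag/SelectStdFlags helpers are assumptions made so
the example compiles on its own, not the real declarations from device_info.h
and cl_program.h.

#include <string>
#include <vector>

// Assumed, simplified stand-ins for the real enums in device_info.h and
// cl_program.h; only the values needed here, listed in ascending order so
// that the relational comparisons below behave as expected.
enum class OpenCLVersion { CL_1_0, CL_1_1, CL_1_2, CL_2_0, CL_3_0 };
enum class CompilerOptions { CL_2_0, CL_3_0 };

// Hypothetical stand-in for DeviceInfo with just the field this sketch uses.
struct DeviceInfoStub {
  OpenCLVersion cl_version;
};

// Mirrors the switch extended in cl_program.cc: compiler option -> build flag.
std::string CompilerOptionToFlag(CompilerOptions option) {
  switch (option) {
    case CompilerOptions::CL_2_0:
      return "-cl-std=CL2.0";
    case CompilerOptions::CL_3_0:
      return "-cl-std=CL3.0";
  }
  return "";
}

// Mirrors the constructor logic: prefer CL 3.0, fall back to CL 2.0, and add
// no -cl-std flag at all on devices that only support OpenCL 1.x.
std::vector<std::string> SelectStdFlags(const DeviceInfoStub& device) {
  std::vector<std::string> flags;
  if (device.cl_version >= OpenCLVersion::CL_3_0) {
    flags.push_back(CompilerOptionToFlag(CompilerOptions::CL_3_0));
  } else if (device.cl_version >= OpenCLVersion::CL_2_0) {
    flags.push_back(CompilerOptionToFlag(CompilerOptions::CL_2_0));
  }
  return flags;
}

In this sketch a device reporting only OpenCL 1.2 yields an empty flag list,
which corresponds to leaving compiler_options_ untouched in the actual
constructor so the program keeps compiling as OpenCL C 1.x by default.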