From 588854df78c92610b1d658fa060507efb0189c8e Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Fri, 12 Jun 2020 09:36:01 -0700 Subject: [PATCH] Added device info to TransformToCLCode function. Storing half parameters on PowerVR as float32 values. PiperOrigin-RevId: 316119180 Change-Id: I60e48cbd7e16cbb35b960acfd31e78d1dc379854 --- tensorflow/lite/delegates/gpu/cl/BUILD | 1 + tensorflow/lite/delegates/gpu/cl/arguments.cc | 37 ++++++++++++++----- tensorflow/lite/delegates/gpu/cl/arguments.h | 10 ++++- .../lite/delegates/gpu/cl/kernels/softmax.cc | 3 +- .../delegates/gpu/cl/kernels/transpose.cc | 5 ++- .../lite/delegates/gpu/cl/kernels/winograd.cc | 3 +- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 6730da1fdc6..e6e7e4747c4 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -43,6 +43,7 @@ cc_library( srcs = ["arguments.cc"], hdrs = ["arguments.h"], deps = [ + ":cl_device", ":gpu_object", ":opencl_wrapper", ":tensor_type", diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index 69616741966..afb58ba46ad 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -283,7 +283,11 @@ absl::Status Arguments::SetHalf(const std::string& name, half value) { } it->second.value = value; if (it->second.active) { - shared_half4s_data_[it->second.offset] = value; + if (it->second.store_as_f32) { + shared_float4s_data_[it->second.offset] = value; + } else { + shared_half4s_data_[it->second.offset] = value; + } } return absl::OkStatus(); } @@ -436,10 +440,11 @@ absl::Status Arguments::Merge(Arguments&& args, const std::string& postfix) { } absl::Status Arguments::TransformToCLCode( + const DeviceInfo& device_info, const std::map& linkables, std::string* code) { RETURN_IF_ERROR(AddObjectArgs()); RETURN_IF_ERROR(ResolveSelectorsPass(linkables, code)); - ResolveArgsPass(code); + ResolveArgsPass(device_info, code); return absl::OkStatus(); } @@ -568,7 +573,8 @@ absl::Status Arguments::Bind(cl_kernel kernel, int offset) { return absl::OkStatus(); } -std::string Arguments::AddActiveArgument(const std::string& arg_name) { +std::string Arguments::AddActiveArgument(const std::string& arg_name, + bool use_f32_for_halfs) { if (auto it = int_values_.find(arg_name); it != int_values_.end()) { int int_index; if (it->second.active) { @@ -603,26 +609,39 @@ std::string Arguments::AddActiveArgument(const std::string& arg_name) { half_index = it->second.offset; } else { it->second.active = true; - it->second.offset = shared_half4s_data_.size(); + if (use_f32_for_halfs) { + it->second.store_as_f32 = true; + it->second.offset = shared_float4s_data_.size(); + shared_float4s_data_.push_back(it->second.value); + } else { + it->second.offset = shared_half4s_data_.size(); + shared_half4s_data_.push_back(it->second.value); + } half_index = it->second.offset; - shared_half4s_data_.push_back(it->second.value); } std::string index = std::to_string(half_index / 4); std::string postfixes[4] = {"x", "y", "z", "w"}; - return "shared_half4_" + index + "." + postfixes[half_index % 4]; + if (it->second.store_as_f32) { + return "(half)(shared_float4_" + index + "." + postfixes[half_index % 4] + + ")"; + } else { + return "shared_half4_" + index + "." + postfixes[half_index % 4]; + } } return arg_name; } -void Arguments::ResolveArgsPass(std::string* code) { - std::string result; +void Arguments::ResolveArgsPass(const DeviceInfo& device_info, + std::string* code) { + bool use_f32_for_half_arguments = device_info.vendor == Vendor::POWERVR; size_t position = 0; size_t next_position = code->find(kArgsPrefix); while (next_position != std::string::npos) { size_t arg_pos = next_position; next_position += strlen(kArgsPrefix); std::string object_name = GetNextWord(*code, next_position); - std::string new_name = AddActiveArgument(object_name); + std::string new_name = + AddActiveArgument(object_name, use_f32_for_half_arguments); code->replace(arg_pos, object_name.size() + strlen(kArgsPrefix), new_name); position = arg_pos + new_name.size(); next_position = code->find(kArgsPrefix, position); diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h index 18681c8a473..edeab4a603b 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" @@ -69,6 +70,7 @@ class Arguments { absl::Status Merge(Arguments&& args, const std::string& postfix); absl::Status TransformToCLCode( + const DeviceInfo& device_info, const std::map& linkables, std::string* code); // Move only @@ -78,7 +80,8 @@ class Arguments { Arguments& operator=(const Arguments&) = delete; private: - std::string AddActiveArgument(const std::string& arg_name); + std::string AddActiveArgument(const std::string& arg_name, + bool use_f32_for_halfs); void AddGPUResources(const std::string& name, const GPUResources& resources); absl::Status SetGPUResources(const std::string& name, @@ -86,7 +89,7 @@ class Arguments { absl::Status AddObjectArgs(); - void ResolveArgsPass(std::string* code); + void ResolveArgsPass(const DeviceInfo& device_info, std::string* code); absl::Status ResolveSelectorsPass( const std::map& linkables, std::string* code); @@ -135,6 +138,9 @@ class Arguments { // to reduce amount of data transferred we adding this optimization bool active = false; + // some devices have issues with half parameters. + bool store_as_f32 = false; + // offset to shared uniform storage. uint32_t offset = -1; }; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc index b92bab4ed22..e5f0933401a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc @@ -91,7 +91,8 @@ Softmax& Softmax::operator=(Softmax&& kernel) { absl::Status Softmax::Compile(const CreationContext& creation_context) { std::string code = GetSoftmaxKernelCode(definition_, linked_operations_, &args_); - RETURN_IF_ERROR(args_.TransformToCLCode({}, &code)); + RETURN_IF_ERROR( + args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code)); code = absl::Substitute(code, args_.GetListOfArgs()); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index da4e1c0820d..78bc1b14abb 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -130,8 +130,9 @@ absl::Status Transpose::Compile(const CreationContext& creation_context) { element_wise_code += "{\n" + code + "\n}\n"; RETURN_IF_ERROR(args_.Merge(std::move(link_args), postfix)); } - RETURN_IF_ERROR( - args_.TransformToCLCode({{"dst_tensor", element_wise_code}}, &code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); code = absl::Substitute(code, args_.GetListOfArgs()); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index 20a7c0e9f61..aa47e3a1c24 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -391,7 +391,8 @@ absl::Status Winograd4x4To36::Compile(const CreationContext& creation_context) { RETURN_IF_ERROR(UploadBt(creation_context.context)); std::string code = GetWinograd4x4To36Code(definition_, linked_operations_, &args_); - RETURN_IF_ERROR(args_.TransformToCLCode({}, &code)); + RETURN_IF_ERROR( + args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code)); code = absl::Substitute(code, args_.GetListOfArgs()); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code, "main_function", options, *creation_context.context,