diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index ac975ac377d..5da82109829 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -328,6 +328,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common/task:weights_conversion", + "//tensorflow/lite/delegates/gpu/common/task:weights_layout", ], ) @@ -368,6 +369,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common/task:weights_conversion", + "//tensorflow/lite/delegates/gpu/common/task:weights_layout", ], ) @@ -407,6 +409,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common/task:weights_conversion", + "//tensorflow/lite/delegates/gpu/common/task:weights_layout", ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc index c4b8b2176ba..a57138c23ba 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc @@ -80,6 +80,19 @@ std::string ConvolutionTransposed3x3::GenerateConvolutionTransposedCode( } AddDstTensor("dst_tensor", dst_desc); + if (op_def.src_tensors.size() == 2) { + // dynamic weights + BufferDescriptor desc; + desc.element_type = op_def.src_tensors[1].data_type; + desc.element_size = 4; + desc.memory_type = + weights_upload_type == + ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM + ? 
MemoryType::CONSTANT + : MemoryType::GLOBAL; + AddSrcBuffer("weights", desc); + } + args_.AddInt("filter_offset"); args_.AddInt("padding_x"); args_.AddInt("padding_y"); @@ -389,6 +402,21 @@ ConvolutionTransposed3x3 CreateConvolutionTransposed3x3( return result; } +ConvolutionTransposed3x3 CreateConvolutionTransposed3x3DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr) { + const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h); + ConvolutionTransposed3x3 result(definition, gpu_info, padding); + + TensorLinearDescriptor desc; + desc.storage_type = LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + result.args_.AddObject( + "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); + return result; +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h index 69ba443db1b..ef381246ef5 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h @@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h" +#include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/types.h" @@ -52,6 +53,13 @@ class ConvolutionTransposed3x3 : public GPUOperation { ConvolutionTransposed3x3(const ConvolutionTransposed3x3&) = delete; ConvolutionTransposed3x3& operator=(const ConvolutionTransposed3x3&) = delete; + WeightsDescription GetWeightsDescription() const { + WeightsDescription desc; + desc.layout = WeightsLayout::kOICustomSSpatialI4O4; + desc.spatial_remap = GetSpatialWeightsRemap(); + return desc; + } + enum class WeightsUploadType { LOCAL_MEM_ASYNC, LOCAL_MEM_BY_THREADS, @@ -65,6 +73,10 @@ class ConvolutionTransposed3x3 : public GPUOperation { friend ConvolutionTransposed3x3 CreateConvolutionTransposed3x3( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); + friend ConvolutionTransposed3x3 CreateConvolutionTransposed3x3DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + template <DataType T> void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights); @@ -124,6 +136,10 @@ ConvolutionTransposed3x3 CreateConvolutionTransposed3x3( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); +ConvolutionTransposed3x3 CreateConvolutionTransposed3x3DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc index b01ef5c701b..ec58f362587 100644 ---
a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc @@ -53,6 +53,15 @@ std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode( AddSrcTensor("src_tensor", src_desc); AddDstTensor("dst_tensor", op_def.dst_tensors[0]); + if (op_def.src_tensors.size() == 2) { + // dynamic weights + BufferDescriptor desc; + desc.element_type = op_def.src_tensors[1].data_type; + desc.element_size = 4; + desc.memory_type = MemoryType::CONSTANT; + AddSrcBuffer("weights", desc); + } + const auto src_tensor_type = op_def.src_tensors[0].storage_type; std::string c = GetCommonDefines(op_def.precision); @@ -160,8 +169,7 @@ std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode( for (int d = 0; d < dst_depth; ++d) { const std::string layer = std::to_string(d); c += " {\n"; - c += " FLT4 bias_val = args.weights.Read(" + - std::to_string(36 * filters_index + d) + ");\n"; + c += " FLT4 bias_val = args.biases.Read(" + layer + ");\n"; for (int y = 0; y < 2; ++y) { for (int x = 0; x < 2; ++x) { const std::string x_coord = "X + " + std::to_string(x); @@ -205,7 +213,28 @@ ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr) { ConvolutionTransposed3x3Thin result(definition, attr); - result.UploadData(attr.weights, attr.bias); + result.UploadWeights(attr.weights); + + TensorLinearDescriptor desc; + desc.storage_type = LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + result.args_.AddObject( + "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); + return result; +} + +ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr) { + ConvolutionTransposed3x3Thin
result(definition, attr); + + TensorLinearDescriptor desc; + desc.storage_type = LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + result.args_.AddObject( + "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); return result; } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h index 606e33dc4c6..80712e6aa81 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h" +#include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/types.h" @@ -49,16 +50,28 @@ class ConvolutionTransposed3x3Thin : public GPUOperation { ConvolutionTransposed3x3Thin& operator=(const ConvolutionTransposed3x3Thin&) = delete; + WeightsDescription GetWeightsDescription() const { + WeightsDescription desc; + desc.layout = WeightsLayout::kOICustomSSpatialI4O4; + desc.spatial_remap = GetSpatialWeightsRemap(); + return desc; + } + private: - friend ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin( - const GpuInfo& gpu_info, const OperationDef& definition, - const ConvolutionTransposedAttributes& attr); explicit ConvolutionTransposed3x3Thin( const OperationDef& definition, const ConvolutionTransposedAttributes& attr); + + friend ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + friend ConvolutionTransposed3x3Thin +
CreateConvolutionTransposed3x3ThinDynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + template <DataType T> - void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights, - const tflite::gpu::Tensor<Linear, T>& biases); + void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights); std::vector<int> GetSpatialWeightsRemap() const; @@ -67,9 +80,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation { }; template <DataType T> -void ConvolutionTransposed3x3Thin::UploadData( - const tflite::gpu::Tensor<OHWI, T>& weights, - const tflite::gpu::Tensor<Linear, T>& biases) { +void ConvolutionTransposed3x3Thin::UploadWeights( + const tflite::gpu::Tensor<OHWI, T>& weights) { const int src_depth = DivideRoundUp(weights.shape.i, 4); const int dst_depth = DivideRoundUp(weights.shape.o, 4); const int kernel_x = 3; // This operation support only 3x3 kernel @@ -83,33 +95,17 @@ void ConvolutionTransposed3x3Thin::UploadData( desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; desc.element_size = 4; desc.memory_type = MemoryType::CONSTANT; - desc.size = flt4_size * (flt4_count + dst_depth); + desc.size = flt4_size * flt4_count; desc.data.resize(desc.size); if (f32_weights) { float4* gpu_data = reinterpret_cast<float4*>(desc.data.data()); RearrangeWeightsToOICustomSpatialI4O4(weights, GetSpatialWeightsRemap(), absl::MakeSpan(gpu_data, flt4_count)); - for (int i = 0; i < dst_depth; ++i) { - float4 bias_value(0.0f); - for (int c = 0; c < 4; ++c) { - int ch = i * 4 + c; - bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f; - } - gpu_data[flt4_count + i] = bias_value; - } } else { half4* gpu_data = reinterpret_cast<half4*>(desc.data.data()); RearrangeWeightsToOICustomSpatialI4O4(weights, GetSpatialWeightsRemap(), absl::MakeSpan(gpu_data, flt4_count)); - for (int i = 0; i < dst_depth; ++i) { - half4 bias_value(0.0f); - for (int c = 0; c < 4; ++c) { - int ch = i * 4 + c; - bias_value[c] = ch < weights.shape.o ?
biases.data[ch] : 0.0f; - } - gpu_data[flt4_count + i] = bias_value; - } } args_.AddObject("weights", @@ -123,6 +119,10 @@ ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); +ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc index d6ead75274d..6888c18e39b 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc @@ -25,24 +25,35 @@ limitations under the License. namespace tflite { namespace gpu { namespace cl { +namespace { +ConvolutionTransposed4x4::WeightsUploadType GetBestWeightsUploadType( + const GpuInfo& gpu_info) { + ConvolutionTransposed4x4::WeightsUploadType weights_upload_type = + ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM; + if (gpu_info.IsPowerVR()) { + weights_upload_type = + ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC; + } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) { + weights_upload_type = + ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS; + } else if (gpu_info.IsAMD()) { + weights_upload_type = + ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM; + } else { + weights_upload_type = + ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM; + } + return weights_upload_type; +} +} // namespace + ConvolutionTransposed4x4::ConvolutionTransposed4x4( - const OperationDef& definition, const GpuInfo& gpu_info, - const ConvolutionTransposedAttributes& attr) + const OperationDef& definition, const GpuInfo& gpu_info) : 
GPUOperation(definition) { work_group_size_ = int3(8, 4, 1); - WeightsUploadType weights_upload_type = WeightsUploadType::GLOBAL_MEM; - if (gpu_info.IsPowerVR()) { - weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC; - } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) { - weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; - } else if (gpu_info.IsAMD()) { - weights_upload_type = WeightsUploadType::CONSTANT_MEM; - } else { - weights_upload_type = WeightsUploadType::GLOBAL_MEM; - } - code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type); - UploadWeights(attr.weights, weights_upload_type); + code_ = GenerateConvolutionTransposedCode(definition_, + GetBestWeightsUploadType(gpu_info)); if (definition_.precision == CalculationsPrecision::F16 && gpu_info.IsPowerVR()) { compiler_options_.push_back(CompilerOptions::kClPowervrFp16); @@ -76,6 +87,19 @@ std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode( } AddDstTensor("dst_tensor", dst_desc); + if (op_def.src_tensors.size() == 2) { + // dynamic weights + BufferDescriptor desc; + desc.element_type = op_def.src_tensors[1].data_type; + desc.element_size = 4; + desc.memory_type = + weights_upload_type == + ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM + ? 
MemoryType::CONSTANT + : MemoryType::GLOBAL; + AddSrcBuffer("weights", desc); + } + args_.AddInt("filter_offset"); const bool need_local_mem = @@ -338,7 +362,22 @@ bool IsConvolutionTransposed4x4Supported( ConvolutionTransposed4x4 CreateConvolutionTransposed4x4( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr) { - ConvolutionTransposed4x4 result(definition, gpu_info, attr); + ConvolutionTransposed4x4 result(definition, gpu_info); + result.UploadWeights(attr.weights, GetBestWeightsUploadType(gpu_info)); + + TensorLinearDescriptor desc; + desc.storage_type = LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + result.args_.AddObject( + "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); + return result; +} + +ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr) { + ConvolutionTransposed4x4 result(definition, gpu_info); TensorLinearDescriptor desc; desc.storage_type = LinearStorageType::TEXTURE_2D; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h index ed481bf9c5d..19434fe1074 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h @@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h" +#include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/types.h" @@ -54,6 +55,13 @@ class ConvolutionTransposed4x4 : public GPUOperation { ConvolutionTransposed4x4(const ConvolutionTransposed4x4&) = delete; ConvolutionTransposed4x4& operator=(const ConvolutionTransposed4x4&) = delete; + WeightsDescription GetWeightsDescription() const { + WeightsDescription desc; + desc.layout = WeightsLayout::kOICustomSSpatialI4O4; + desc.spatial_remap = GetSpatialWeightsRemap(); + return desc; + } + enum class WeightsUploadType { LOCAL_MEM_ASYNC, LOCAL_MEM_BY_THREADS, @@ -63,11 +71,15 @@ class ConvolutionTransposed4x4 : public GPUOperation { private: ConvolutionTransposed4x4(const OperationDef& definition, - const GpuInfo& gpu_info, - const ConvolutionTransposedAttributes& attr); + const GpuInfo& gpu_info); + friend ConvolutionTransposed4x4 CreateConvolutionTransposed4x4( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); + friend ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + template <DataType T> void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights, WeightsUploadType weights_upload_type); @@ -124,6 +136,10 @@ ConvolutionTransposed4x4 CreateConvolutionTransposed4x4( const GpuInfo& gpu_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); +ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights( + const GpuInfo& gpu_info, const OperationDef& definition, + const ConvolutionTransposedAttributes& attr); + } // namespace cl } // namespace gpu } // namespace tflite