From 2cdb2b4d7619282a3c8787b38eceb8c37261e778 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Wed, 2 Sep 2020 16:49:34 -0700
Subject: [PATCH] Added new utility functions to TensorDescriptor for codegen
 simplification and generalization.

Used in ConvolutionTransposed generation.

PiperOrigin-RevId: 329814907
Change-Id: Ib32f3540de704dadb65705b704c37ecaf4e563bd
---
 .../gpu/cl/kernels/convolution_transposed.cc | 132 ++++++++++--------
 .../lite/delegates/gpu/cl/tensor_type.cc     |  40 ++++++
 .../lite/delegates/gpu/cl/tensor_type.h      |  10 ++
 3 files changed, 126 insertions(+), 56 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
index 8d4532e5d09..d22dbbd88cf 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
@@ -17,12 +17,14 @@ limitations under the License.
 
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "absl/strings/substitute.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
 
 namespace tflite {
@@ -123,13 +125,9 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   auto src_desc = op_def.src_tensors[0];
   src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
   AddSrcTensor("src_tensor", src_desc);
-
   AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
 
-  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
-  bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER;
-  bool manual_clamp =
-      image_buffer || src_tensor_type == TensorStorageType::BUFFER;
+  const auto& src_def = op_def.src_tensors[0];
 
   std::string c = GetCommonDefines(op_def.precision);
 
@@ -166,13 +164,13 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   auto generate_id = [&](const std::string& x, const std::string& y,
                          const std::string& z) {
     std::string id;
-    if (op_def.src_tensors[0].HasAxis(Axis::WIDTH)) {
+    if (src_def.HasAxis(Axis::WIDTH)) {
       id += "_w" + x;
     }
-    if (op_def.src_tensors[0].HasAxis(Axis::HEIGHT)) {
+    if (src_def.HasAxis(Axis::HEIGHT)) {
       id += "_h" + y;
     }
-    if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) {
+    if (src_def.HasAxis(Axis::DEPTH)) {
       id += "_d" + z;
     }
     return id;
   };
@@ -186,14 +184,18 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   auto generate_check = [&](const std::string& x, const std::string& y,
                             const std::string& z) {
     std::string check;
-    if (op_def.src_tensors[0].HasAxis(Axis::WIDTH)) {
-      check += "in_x" + x;
-    }
-    if (op_def.src_tensors[0].HasAxis(Axis::HEIGHT)) {
-      check += " && in_y" + y;
-    }
-    if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) {
-      check += " && in_z" + z;
+    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
+    const std::vector<std::string> names{"in_x", "in_y", "in_z"};
+    const std::vector<std::string> coords{x, y, z};
+    for (int i = 0; i < axes.size(); ++i) {
+      const auto& axis = axes[i];
+      if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
+          block_size[i] != 1) {
+        if (!check.empty()) {
+          check += " && ";
+        }
+        check += names[i] + coords[i];
+      }
     }
     return check;
   };
@@ -223,7 +225,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   c += "  int ceil_x = dst_x / args.stride_x;\n";
   c += "  dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
        " + rem_x;\n";
-  if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) {
+  if (src_def.HasAxis(Axis::DEPTH)) {
     c += "  int linear_id_y = get_global_id(1);\n";
     c += "  int dst_y = linear_id_y % args.grid_size_y;\n";
     c += "  int dst_z = linear_id_y / args.grid_size_y;\n";
@@ -247,7 +249,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   if (weights_are_buffer) {
     c += "  int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x "
          "* args.kernel_size_y";
-    if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) {
+    if (src_def.HasAxis(Axis::DEPTH)) {
       c += " * args.kernel_size_z";
     }
     c += ";\n";
@@ -280,7 +282,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   c += "  int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
       "offset_y;\n";
-  if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) {
+  if (src_def.HasAxis(Axis::DEPTH)) {
     c += "  int kernel_first_dst_z = dst_z + args.padding_z;\n";
     c += "  int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
     c += "  int offset_z = abs(args.padding_z);\n";
@@ -294,13 +296,16 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   for (int z = 0; z < block_size.z; ++z) {
     const std::string zindex = std::to_string(z);
     c += "  int sz" + zindex + " = src_z + " + zindex + ";\n";
-    if (src_tensor_type != TensorStorageType::TEXTURE_3D) {
+    if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
       c += "  bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" + zindex +
            " < args.src_tensor.Depth();\n";
+      if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
+        c += "  sz" + zindex + " = clamp(sz" + zindex +
+             ", 0, args.src_tensor.Depth() - 1);\n";
+      }
     }
   }
-  if (block_size.z == 1 &&
-      (src_tensor_type != TensorStorageType::TEXTURE_3D)) {
+  if (block_size.z == 1 && !src_def.SupportsZeroClamp(Axis::DEPTH)) {
     c += "  if (!in_z0) continue;\n";
   }
   c += "  int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
@@ -316,17 +321,20 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
   for (int y = 0; y < block_size.y; ++y) {
     const std::string yindex = std::to_string(y);
     const std::string src_y =
-        op_def.src_tensors[0].HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y";
+        src_def.HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y";
"src_y_copy" : "src_y"; c += " int sy" + yindex + " = " + src_y + " + " + yindex + ";\n"; - if (manual_clamp) { + if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) { c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" + yindex + " < args.src_tensor.Height();\n"; - if (!image_buffer) { + if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) { c += " sy" + yindex + " = clamp(sy" + yindex + ", 0, args.src_tensor.Height() - 1);\n"; } } } + if (block_size.y == 1 && !src_def.SupportsZeroClamp(Axis::HEIGHT)) { + c += " if (!in_y0) continue;\n"; + } c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n"; c += " int src_as_dst_x = src_x * args.stride_x;\n"; c += " int src_x_copy = src_x;\n"; @@ -336,15 +344,18 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( for (int x = 0; x < block_size.x; ++x) { const std::string xindex = std::to_string(x); c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n"; - if (manual_clamp) { + if (!src_def.SupportsZeroClamp(Axis::WIDTH)) { c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" + xindex + " < args.src_tensor.Width();\n"; - if (!image_buffer) { + if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) { c += " sx" + xindex + " = clamp(sx" + xindex + ", 0, args.src_tensor.Width() - 1);\n"; } } } + if (block_size.x == 1 && !src_def.SupportsZeroClamp(Axis::WIDTH)) { + c += " if (!in_x0) continue;\n"; + } for (int z = 0; z < block_size.z; ++z) { const std::string zind = std::to_string(z); for (int y = 0; y < block_size.y; ++y) { @@ -354,12 +365,14 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( const std::string id = generate_id(xind, yind, zind); const std::string check = generate_check(xind, yind, zind); std::string coords = "sx" + xind + ", sy" + yind; - if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { + if (src_def.HasAxis(Axis::DEPTH)) { coords += ", sz" + zind; } - c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + - ", 0);\n"; - if (image_buffer) { + if (src_def.IsLinear()) { + c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + + ", 0);\n"; + } + if (src_def.ReturnsZeroForNegOneRead()) { c += " addr" + id + " = select(-1, addr" + id + ", (" + check + "));\n"; c += " int ds" + id + @@ -369,14 +382,11 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( } } } - if (src_tensor_type == TensorStorageType::BUFFER) { + if (src_def.storage_type == TensorStorageType::BUFFER) { c += " int ds = args.src_tensor.SliceStride();\n"; } - if (block_size.x == 1 && block_size.y == 1 && manual_clamp) { - c += " if (!in_x0 || !in_y0) continue;\n"; - } c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n"; - if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { + if (src_def.HasAxis(Axis::DEPTH)) { c += " int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) " "* args.kernel_size_x + kernel_x;\n"; } else { @@ -398,26 +408,36 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( for (int x = 0; x < block_size.x; ++x) { const std::string xind = std::to_string(x); const std::string id = generate_id(xind, yind, zind); - const std::string check = generate_check(xind, yind, zind); - if (image_buffer) { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr" + id + - "); addr" + id + " += ds" + id + ";\n"; - } else if (manual_clamp) { - if (conditional_read) { - c += " FLT4 src" + id + " = " + check + - " ? 
args.src_tensor.Read(addr" + id + - ") : (FLT4)(0.0f); addr" + id + " += ds;\n"; - } else { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr" + id + - ") * (FLT)(" + check + "); addr" + id + " += ds;\n"; - } + std::string address; + if (src_def.IsLinear()) { + address = "addr" + id; } else { - std::string coords = "sx" + xind + ", sy" + yind; - if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { - coords += ", sz" + zind; + address = "sx" + xind + ", sy" + yind; + if (src_def.HasAxis(Axis::DEPTH)) { + address += ", sz" + zind; + } + address += ", s"; + } + if (src_def.ReturnsZeroForNegOneRead()) { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + address + + "); " + address + " += ds" + id + ";\n"; + } else { + const std::string check = generate_check(xind, yind, zind); + if (!check.empty()) { + if (conditional_read) { + c += " FLT4 src" + id + " = " + check + + " ? args.src_tensor.Read(" + address + ") : (FLT4)(0.0f);\n"; + } else { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + + address + ") * (FLT)(" + check + ");\n"; + } + } else { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + + address + ");\n"; + } + if (src_def.IsLinear()) { + c += " addr" + id + " += ds;\n"; } - c += " FLT4 src" + id + " = args.src_tensor.Read(" + coords + - ", s);\n"; } } } @@ -456,7 +476,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( c += " }\n"; c += " }\n"; c += " }\n"; - if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { + if (src_def.HasAxis(Axis::DEPTH)) { c += " }\n"; } for (int s = 0; s < block_size.w; ++s) { @@ -476,7 +496,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( c += " {\n"; c += " int xc = dst_x + args.stride_x * " + xind + ";\n"; c += " int yc = dst_y + args.stride_y * " + yind + ";\n"; - if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { + if (src_def.HasAxis(Axis::DEPTH)) { c += " int zc = dst_z + args.stride_z * " + zind + ";\n"; checks += " && zc < args.dst_tensor.Depth()"; coords += ", zc"; diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index 7bd5de6e31e..f31df43539e 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -771,6 +771,46 @@ void TensorDescriptor::UploadData(absl::Span src) { } } +bool TensorDescriptor::SupportsZeroClamp(const Axis& axis) const { + switch (storage_type) { + case TensorStorageType::UNKNOWN: + return false; + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return false; + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::SINGLE_TEXTURE_2D: + return axis == Axis::WIDTH || axis == Axis::HEIGHT; + case TensorStorageType::TEXTURE_3D: + return axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::DEPTH; + } +} + +bool TensorDescriptor::CanReadOutOfBorder(const Axis& axis) const { + switch (storage_type) { + case TensorStorageType::UNKNOWN: + return false; + case TensorStorageType::BUFFER: + return false; + case TensorStorageType::IMAGE_BUFFER: + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::TEXTURE_3D: + case TensorStorageType::SINGLE_TEXTURE_2D: + case TensorStorageType::TEXTURE_ARRAY: + return true; + } +} + +bool TensorDescriptor::IsLinear() const { + return storage_type == TensorStorageType::BUFFER || + storage_type == TensorStorageType::IMAGE_BUFFER; +} + +bool TensorDescriptor::ReturnsZeroForNegOneRead() const { + return storage_type == 
+}
+
 namespace {
 int GetLinearIndex(const TensorDescriptor& desc, const BHWDC& shape, int b,
                    int x, int y, int d, int s, int sub_c) {
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h
index 094e3905966..2157bf05543 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h
+++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h
@@ -82,6 +82,16 @@ struct TensorDescriptor : public GPUObjectDescriptor {
   void UploadData(const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src);
   void UploadData(const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src);
 
+  bool SupportsZeroClamp(const Axis& axis) const;
+  bool CanReadOutOfBorder(const Axis& axis) const;
+  bool IsLinear() const;
+
+  // Applicable only for storage types for which IsLinear() is true;
+  // the address is then a single 1D component, addr (int).
+  // Returns true if reading this linear storage type at addr == -1
+  // yields FLT4(0.0), and false otherwise.
+  bool ReturnsZeroForNegOneRead() const;
+
   DataType data_type = DataType::UNKNOWN;
   TensorStorageType storage_type = TensorStorageType::UNKNOWN;
   // This field describes logical layout, actual(physical) GPU layout can be
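
Note: a minimal sketch (not part of the patch) of how the new predicates are
meant to combine during code generation, assuming only the patched headers
above. The EmitBoundaryCheckedReadX helper is hypothetical and illustrates the
per-axis pattern used in GenerateConvolutionTransposedCode, not the exact
generated code:

  #include <string>

  #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"

  namespace tflite {
  namespace gpu {
  namespace cl {

  // Hypothetical helper: emits OpenCL source that reads src at coordinate
  // "sx" with the boundary handling the storage type requires.
  std::string EmitBoundaryCheckedReadX(const TensorDescriptor& src_def) {
    std::string c;
    if (src_def.SupportsZeroClamp(Axis::WIDTH)) {
      // Texture-backed storage (with a zero address mode) already returns
      // zero outside the border, so the kernel can read unconditionally.
      c += "  FLT4 v = args.src_tensor.Read(sx, sy, s);\n";
      return c;
    }
    // Otherwise the generated kernel must track validity itself.
    c += "  bool in_x = sx >= 0 && sx < args.src_tensor.Width();\n";
    if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
      // Out-of-border reads are unsafe (e.g. BUFFER): clamp the coordinate
      // first, then zero out the result with the validity flag.
      c += "  sx = clamp(sx, 0, args.src_tensor.Width() - 1);\n";
    }
    c += "  FLT4 v = args.src_tensor.Read(sx, sy, s) * (FLT)(in_x);\n";
    return c;
  }

  }  // namespace cl
  }  // namespace gpu
  }  // namespace tflite

IMAGE_BUFFER additionally reports ReturnsZeroForNegOneRead(), which is why the
patch can replace an explicit multiply-by-flag with a select() on the linear
address for that storage type.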