Added new utility functions to TensorDescriptor for codegen simplification and generalization.

Used in ConvolutionTransposed generation.

PiperOrigin-RevId: 329814907
Change-Id: Ib32f3540de704dadb65705b704c37ecaf4e563bd
This commit is contained in:
Raman Sarokin 2020-09-02 16:49:34 -07:00 committed by TensorFlower Gardener
parent d08e1a80eb
commit 2cdb2b4d76
3 changed files with 126 additions and 56 deletions

View File

@ -17,12 +17,14 @@ limitations under the License.
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector>
#include "absl/strings/substitute.h" #include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite { namespace tflite {
@ -123,13 +125,9 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
auto src_desc = op_def.src_tensors[0]; auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
AddSrcTensor("src_tensor", src_desc); AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]); AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
const auto src_tensor_type = op_def.src_tensors[0].storage_type; const auto& src_def = op_def.src_tensors[0];
bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER;
bool manual_clamp =
image_buffer || src_tensor_type == TensorStorageType::BUFFER;
std::string c = GetCommonDefines(op_def.precision); std::string c = GetCommonDefines(op_def.precision);
@ -166,13 +164,13 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
auto generate_id = [&](const std::string& x, const std::string& y, auto generate_id = [&](const std::string& x, const std::string& y,
const std::string& z) { const std::string& z) {
std::string id; std::string id;
if (op_def.src_tensors[0].HasAxis(Axis::WIDTH)) { if (src_def.HasAxis(Axis::WIDTH)) {
id += "_w" + x; id += "_w" + x;
} }
if (op_def.src_tensors[0].HasAxis(Axis::HEIGHT)) { if (src_def.HasAxis(Axis::HEIGHT)) {
id += "_h" + y; id += "_h" + y;
} }
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
id += "_d" + z; id += "_d" + z;
} }
return id; return id;
@ -186,14 +184,18 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
auto generate_check = [&](const std::string& x, const std::string& y, auto generate_check = [&](const std::string& x, const std::string& y,
const std::string& z) { const std::string& z) {
std::string check; std::string check;
if (op_def.src_tensors[0].HasAxis(Axis::WIDTH)) { const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
check += "in_x" + x; const std::vector<std::string> names{"in_x", "in_y", "in_z"};
} const std::vector<std::string> coords{x, y, z};
if (op_def.src_tensors[0].HasAxis(Axis::HEIGHT)) { for (int i = 0; i < axes.size(); ++i) {
check += " && in_y" + y; const auto& axis = axes[i];
} if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { block_size[i] != 1) {
check += " && in_z" + z; if (!check.empty()) {
check += " && ";
}
check += names[i] + coords[i];
}
} }
return check; return check;
}; };
@ -223,7 +225,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += " int ceil_x = dst_x / args.stride_x;\n"; c += " int ceil_x = dst_x / args.stride_x;\n";
c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) + c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
" + rem_x;\n"; " + rem_x;\n";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " int linear_id_y = get_global_id(1);\n"; c += " int linear_id_y = get_global_id(1);\n";
c += " int dst_y = linear_id_y % args.grid_size_y;\n"; c += " int dst_y = linear_id_y % args.grid_size_y;\n";
c += " int dst_z = linear_id_y / args.grid_size_y;\n"; c += " int dst_z = linear_id_y / args.grid_size_y;\n";
@ -247,7 +249,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
if (weights_are_buffer) { if (weights_are_buffer) {
c += " int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x " c += " int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x "
"* args.kernel_size_y"; "* args.kernel_size_y";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " * args.kernel_size_z"; c += " * args.kernel_size_z";
} }
c += ";\n"; c += ";\n";
@ -280,7 +282,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += c +=
" int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - " " int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
"offset_y;\n"; "offset_y;\n";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " int kernel_first_dst_z = dst_z + args.padding_z;\n"; c += " int kernel_first_dst_z = dst_z + args.padding_z;\n";
c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n"; c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
c += " int offset_z = abs(args.padding_z);\n"; c += " int offset_z = abs(args.padding_z);\n";
@ -294,13 +296,16 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
for (int z = 0; z < block_size.z; ++z) { for (int z = 0; z < block_size.z; ++z) {
const std::string zindex = std::to_string(z); const std::string zindex = std::to_string(z);
c += " int sz" + zindex + " = src_z + " + zindex + ";\n"; c += " int sz" + zindex + " = src_z + " + zindex + ";\n";
if (src_tensor_type != TensorStorageType::TEXTURE_3D) { if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" + c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" +
zindex + " < args.src_tensor.Depth();\n"; zindex + " < args.src_tensor.Depth();\n";
if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
c += " sz" + zindex + " = clamp(sz" + zindex +
", 0, args.src_tensor.Depth() - 1);\n";
}
} }
} }
if (block_size.z == 1 && if (block_size.z == 1 && !src_def.SupportsZeroClamp(Axis::DEPTH)) {
(src_tensor_type != TensorStorageType::TEXTURE_3D)) {
c += " if (!in_z0) continue;\n"; c += " if (!in_z0) continue;\n";
} }
c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n"; c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
@ -316,17 +321,20 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
const std::string yindex = std::to_string(y); const std::string yindex = std::to_string(y);
const std::string src_y = const std::string src_y =
op_def.src_tensors[0].HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y"; src_def.HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y";
c += " int sy" + yindex + " = " + src_y + " + " + yindex + ";\n"; c += " int sy" + yindex + " = " + src_y + " + " + yindex + ";\n";
if (manual_clamp) { if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) {
c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" + c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" +
yindex + " < args.src_tensor.Height();\n"; yindex + " < args.src_tensor.Height();\n";
if (!image_buffer) { if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
c += " sy" + yindex + " = clamp(sy" + yindex + c += " sy" + yindex + " = clamp(sy" + yindex +
", 0, args.src_tensor.Height() - 1);\n"; ", 0, args.src_tensor.Height() - 1);\n";
} }
} }
} }
if (block_size.y == 1 && !src_def.SupportsZeroClamp(Axis::HEIGHT)) {
c += " if (!in_y0) continue;\n";
}
c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n"; c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n";
c += " int src_as_dst_x = src_x * args.stride_x;\n"; c += " int src_as_dst_x = src_x * args.stride_x;\n";
c += " int src_x_copy = src_x;\n"; c += " int src_x_copy = src_x;\n";
@ -336,15 +344,18 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
const std::string xindex = std::to_string(x); const std::string xindex = std::to_string(x);
c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n"; c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n";
if (manual_clamp) { if (!src_def.SupportsZeroClamp(Axis::WIDTH)) {
c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" + c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" +
xindex + " < args.src_tensor.Width();\n"; xindex + " < args.src_tensor.Width();\n";
if (!image_buffer) { if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
c += " sx" + xindex + " = clamp(sx" + xindex + c += " sx" + xindex + " = clamp(sx" + xindex +
", 0, args.src_tensor.Width() - 1);\n"; ", 0, args.src_tensor.Width() - 1);\n";
} }
} }
} }
if (block_size.x == 1 && !src_def.SupportsZeroClamp(Axis::WIDTH)) {
c += " if (!in_x0) continue;\n";
}
for (int z = 0; z < block_size.z; ++z) { for (int z = 0; z < block_size.z; ++z) {
const std::string zind = std::to_string(z); const std::string zind = std::to_string(z);
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
@ -354,12 +365,14 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
const std::string id = generate_id(xind, yind, zind); const std::string id = generate_id(xind, yind, zind);
const std::string check = generate_check(xind, yind, zind); const std::string check = generate_check(xind, yind, zind);
std::string coords = "sx" + xind + ", sy" + yind; std::string coords = "sx" + xind + ", sy" + yind;
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
coords += ", sz" + zind; coords += ", sz" + zind;
} }
c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + if (src_def.IsLinear()) {
", 0);\n"; c += " args.src_tensor.GetAddress(addr" + id + ", " + coords +
if (image_buffer) { ", 0);\n";
}
if (src_def.ReturnsZeroForNegOneRead()) {
c += " addr" + id + " = select(-1, addr" + id + ", (" + check + c += " addr" + id + " = select(-1, addr" + id + ", (" + check +
"));\n"; "));\n";
c += " int ds" + id + c += " int ds" + id +
@ -369,14 +382,11 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
} }
} }
} }
if (src_tensor_type == TensorStorageType::BUFFER) { if (src_def.storage_type == TensorStorageType::BUFFER) {
c += " int ds = args.src_tensor.SliceStride();\n"; c += " int ds = args.src_tensor.SliceStride();\n";
} }
if (block_size.x == 1 && block_size.y == 1 && manual_clamp) {
c += " if (!in_x0 || !in_y0) continue;\n";
}
c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n"; c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) " c += " int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) "
"* args.kernel_size_x + kernel_x;\n"; "* args.kernel_size_x + kernel_x;\n";
} else { } else {
@ -398,26 +408,36 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
const std::string xind = std::to_string(x); const std::string xind = std::to_string(x);
const std::string id = generate_id(xind, yind, zind); const std::string id = generate_id(xind, yind, zind);
const std::string check = generate_check(xind, yind, zind); std::string address;
if (image_buffer) { if (src_def.IsLinear()) {
c += " FLT4 src" + id + " = args.src_tensor.Read(addr" + id + address = "addr" + id;
"); addr" + id + " += ds" + id + ";\n";
} else if (manual_clamp) {
if (conditional_read) {
c += " FLT4 src" + id + " = " + check +
" ? args.src_tensor.Read(addr" + id +
") : (FLT4)(0.0f); addr" + id + " += ds;\n";
} else {
c += " FLT4 src" + id + " = args.src_tensor.Read(addr" + id +
") * (FLT)(" + check + "); addr" + id + " += ds;\n";
}
} else { } else {
std::string coords = "sx" + xind + ", sy" + yind; address = "sx" + xind + ", sy" + yind;
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
coords += ", sz" + zind; address += ", sz" + zind;
}
address += ", s";
}
if (src_def.ReturnsZeroForNegOneRead()) {
c += " FLT4 src" + id + " = args.src_tensor.Read(" + address +
"); " + address + " += ds" + id + ";\n";
} else {
const std::string check = generate_check(xind, yind, zind);
if (!check.empty()) {
if (conditional_read) {
c += " FLT4 src" + id + " = " + check +
" ? args.src_tensor.Read(" + address + ") : (FLT4)(0.0f);\n";
} else {
c += " FLT4 src" + id + " = args.src_tensor.Read(" +
address + ") * (FLT)(" + check + ");\n";
}
} else {
c += " FLT4 src" + id + " = args.src_tensor.Read(" +
address + ");\n";
}
if (src_def.IsLinear()) {
c += " addr" + id + " += ds;\n";
} }
c += " FLT4 src" + id + " = args.src_tensor.Read(" + coords +
", s);\n";
} }
} }
} }
@ -456,7 +476,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += " }\n"; c += " }\n";
c += " }\n"; c += " }\n";
c += " }\n"; c += " }\n";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " }\n"; c += " }\n";
} }
for (int s = 0; s < block_size.w; ++s) { for (int s = 0; s < block_size.w; ++s) {
@ -476,7 +496,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += " {\n"; c += " {\n";
c += " int xc = dst_x + args.stride_x * " + xind + ";\n"; c += " int xc = dst_x + args.stride_x * " + xind + ";\n";
c += " int yc = dst_y + args.stride_y * " + yind + ";\n"; c += " int yc = dst_y + args.stride_y * " + yind + ";\n";
if (op_def.src_tensors[0].HasAxis(Axis::DEPTH)) { if (src_def.HasAxis(Axis::DEPTH)) {
c += " int zc = dst_z + args.stride_z * " + zind + ";\n"; c += " int zc = dst_z + args.stride_z * " + zind + ";\n";
checks += " && zc < args.dst_tensor.Depth()"; checks += " && zc < args.dst_tensor.Depth()";
coords += ", zc"; coords += ", zc";

View File

@ -771,6 +771,46 @@ void TensorDescriptor::UploadData(absl::Span<const float> src) {
} }
} }
// Returns true when out-of-bounds reads along |axis| are clamped to zero by
// the storage itself (texture sampler border behavior), so generated kernel
// code may skip the manual in-bounds check for that axis.
bool TensorDescriptor::SupportsZeroClamp(const Axis& axis) const {
  switch (storage_type) {
    case TensorStorageType::UNKNOWN:
      return false;
    case TensorStorageType::BUFFER:
    case TensorStorageType::IMAGE_BUFFER:
      // Linear storages have no sampler; out-of-range offsets are not clamped.
      return false;
    case TensorStorageType::TEXTURE_ARRAY:
    case TensorStorageType::TEXTURE_2D:
    case TensorStorageType::SINGLE_TEXTURE_2D:
      // 2D textures clamp only along their two spatial axes.
      return axis == Axis::WIDTH || axis == Axis::HEIGHT;
    case TensorStorageType::TEXTURE_3D:
      return axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::DEPTH;
  }
  // Unreachable for valid enum values; avoids falling off the end of a
  // non-void function (UB, -Wreturn-type).
  return false;
}
// Returns true when a read with an out-of-bounds coordinate along |axis| is
// safe (returns a defined value instead of invoking undefined behavior), as
// is the case for all texture-backed storages and image buffers.
bool TensorDescriptor::CanReadOutOfBorder(const Axis& axis) const {
  switch (storage_type) {
    case TensorStorageType::UNKNOWN:
      return false;
    case TensorStorageType::BUFFER:
      // Plain buffer reads outside the allocation are undefined.
      return false;
    case TensorStorageType::IMAGE_BUFFER:
    case TensorStorageType::TEXTURE_2D:
    case TensorStorageType::TEXTURE_3D:
    case TensorStorageType::SINGLE_TEXTURE_2D:
    case TensorStorageType::TEXTURE_ARRAY:
      return true;
  }
  // Unreachable for valid enum values; avoids falling off the end of a
  // non-void function (UB, -Wreturn-type).
  return false;
}
// A storage is "linear" when elements are addressed by a single 1D integer
// offset (BUFFER and IMAGE_BUFFER), as opposed to multi-dimensional texture
// coordinates.
bool TensorDescriptor::IsLinear() const {
  switch (storage_type) {
    case TensorStorageType::BUFFER:
    case TensorStorageType::IMAGE_BUFFER:
      return true;
    default:
      return false;
  }
}
// Applicable only to linear storages (IsLinear() == true), where the address
// is a single int. Returns true when reading at address -1 yields FLT4(0.0);
// only IMAGE_BUFFER has this property.
bool TensorDescriptor::ReturnsZeroForNegOneRead() const {
  const bool is_image_buffer =
      storage_type == TensorStorageType::IMAGE_BUFFER;
  return is_image_buffer;
}
namespace { namespace {
int GetLinearIndex(const TensorDescriptor& desc, const BHWDC& shape, int b, int GetLinearIndex(const TensorDescriptor& desc, const BHWDC& shape, int b,
int x, int y, int d, int s, int sub_c) { int x, int y, int d, int s, int sub_c) {

View File

@ -82,6 +82,16 @@ struct TensorDescriptor : public GPUObjectDescriptor {
void UploadData(const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src); void UploadData(const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& src);
void UploadData(const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src); void UploadData(const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& src);
bool SupportsZeroClamp(const Axis& axis) const;
bool CanReadOutOfBorder(const Axis& axis) const;
bool IsLinear() const;
// applicable only for types that: IsLinear -> true.
// In this case for address we have 1d component - addr (int)
// If for addr == -1 this linear storage type returns FLT4(0.0), this function
// returns true, otherwise false
bool ReturnsZeroForNegOneRead() const;
DataType data_type = DataType::UNKNOWN; DataType data_type = DataType::UNKNOWN;
TensorStorageType storage_type = TensorStorageType::UNKNOWN; TensorStorageType storage_type = TensorStorageType::UNKNOWN;
// This field describes logical layout, actual(physical) GPU layout can be // This field describes logical layout, actual(physical) GPU layout can be