ConvTexture converted to new style.

Texture2D implements GPUObject.

PiperOrigin-RevId: 317752276
Change-Id: Id1c66b2f6ca7b70475cc82abc422935fc3f1a251
This commit is contained in:
Raman Sarokin 2020-06-22 15:53:37 -07:00 committed by TensorFlower Gardener
parent 26421826a0
commit 23d5a2e00a
5 changed files with 218 additions and 135 deletions

View File

@ -523,6 +523,7 @@ cc_library(
deps = [ deps = [
":cl_command_queue", ":cl_command_queue",
":cl_context", ":cl_context",
":gpu_object",
":opencl_wrapper", ":opencl_wrapper",
":tensor_type", ":tensor_type",
":util", ":util",

View File

@ -31,18 +31,32 @@ namespace gpu {
namespace cl { namespace cl {
namespace { namespace {
std::string GenerateConvCode( std::string GenerateConvCode(const OperationDef& op_def, const int3& block_size,
const OperationDef& op_def, const int3& block_size, bool is1x1, bool is1x1, bool adreno4xx_optimization,
bool adreno4xx_optimization, bool stride_correction, bool stride_correction,
bool different_weights_for_height, const CLDevice& device, bool different_weights_for_height,
const std::vector<ElementwiseOperation*>& linked_operations) { const CLDevice& device, Arguments* args) {
std::string c = GetCommonDefines(op_def.precision); auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
TensorCodeGenerator src_tensor( src_desc->SetTextureAddressMode(GetFastestZeroMode(device));
"src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, if (op_def.IsBatchSupported()) {
op_def.src_tensors[0]); src_desc->SetStateVar("BatchedWidth", "true");
TensorCodeGenerator dst_tensor( }
"dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
op_def.dst_tensors[0]); auto dst_desc = absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]);
if (op_def.IsBatchSupported()) {
dst_desc->SetStateVar("BatchedWidth", "true");
}
args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
if (!is1x1) {
args->AddInt("kernel_size_x");
args->AddInt("kernel_size_y");
args->AddInt("dilation_x");
args->AddInt("dilation_y");
}
args->AddInt("stride_x");
args->AddInt("stride_y");
args->AddInt("padding_x");
args->AddInt("padding_y");
const auto src_tensor_type = op_def.src_tensors[0].storage_type; const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER || const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER ||
@ -63,6 +77,7 @@ std::string GenerateConvCode(
zs[z] = std::to_string(z); zs[z] = std::to_string(z);
} }
std::string c = GetCommonDefines(op_def.precision);
for (int z = 0; z < block_size.z; ++z) { for (int z = 0; z < block_size.z; ++z) {
const std::string f0 = std::to_string(z * 4 + 0); const std::string f0 = std::to_string(z * 4 + 0);
const std::string f1 = std::to_string(z * 4 + 1); const std::string f1 = std::to_string(z * 4 + 1);
@ -86,43 +101,29 @@ std::string GenerateConvCode(
} }
c += "__kernel void main_function(\n"; c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; c += "$0) {\n";
c += " __read_only image2d_t filters0, \n";
c += " __read_only image2d_t filters1, \n";
c += " __read_only image2d_t filters2, \n";
c += " __read_only image2d_t filters3, \n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
if (!is1x1) {
c += " int2 kernel_size, \n";
c += " int2 dilation, \n";
}
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n"; c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n"; c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n"; c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
"|| Z >= args.dst_tensor.Slices()) return;\n";
std::vector<std::string> s_x(block_size.x); std::vector<std::string> s_x(block_size.x);
std::vector<std::string> s_y(block_size.y); std::vector<std::string> s_y(block_size.y);
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
if (stride_correction) { if (stride_correction) {
c += " int xc" + xs[x] + " = " + c += " int xc" + xs[x] + " = " +
GetXStrideCorrected("X + " + xs[x], "src_size.w", "stride.x", GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()",
"padding.x") + "args.stride_x", "args.padding_x") +
";\n"; ";\n";
} else { } else {
c += " int xc" + xs[x] + " = (X +" + xs[x] + c += " int xc" + xs[x] + " = (X +" + xs[x] +
") * stride.x + padding.x;\n"; ") * args.stride_x + args.padding_x;\n";
} }
s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x]; s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x];
} }
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
c += " int yc" + ys[y] + " = (Y +" + ys[y] + ") * stride.y + padding.y;\n"; c += " int yc" + ys[y] + " = (Y +" + ys[y] +
") * args.stride_y + args.padding_y;\n";
s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y]; s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y];
} }
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) { for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
@ -131,7 +132,7 @@ std::string GenerateConvCode(
} }
std::string f_y = is1x1 ? "s" : "filter_offset"; std::string f_y = is1x1 ? "s" : "filter_offset";
if (different_weights_for_height) { if (different_weights_for_height) {
f_y = "Y * src_size.z + s"; f_y = "Y * args.src_tensor.Slices() + s";
} }
if (!is1x1) { if (!is1x1) {
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
@ -141,31 +142,31 @@ std::string GenerateConvCode(
c += " int cy" + ys[y] + ";\n"; c += " int cy" + ys[y] + ";\n";
} }
c += " int filter_offset = 0;\n"; c += " int filter_offset = 0;\n";
c += " for (int y = 0; y < kernel_size.y; ++y) {\n"; c += " for (int y = 0; y < args.kernel_size_y; ++y) {\n";
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
c += " cy" + ys[y] + " = y * dilation.y + yc" + ys[y] + ";\n"; c += " cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n";
} }
if (is_buffer) { if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] + c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] +
" < src_size.y;\n"; " < args.src_tensor.Height();\n";
if (src_tensor_type == TensorStorageType::BUFFER) { if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cy" + ys[y] + " = clamp(cy" + ys[y] + c += " cy" + ys[y] + " = clamp(cy" + ys[y] +
", 0, src_size.y - 1);\n"; ", 0, args.src_tensor.Height() - 1);\n";
} }
} }
} }
c += " for (int x = 0; x < kernel_size.x; ++x) {\n"; c += " for (int x = 0; x < args.kernel_size_x; ++x) {\n";
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
c += " cx" + xs[x] + " = x * dilation.x + xc" + xs[x] + ";\n"; c += " cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n";
} }
if (is_buffer) { if (is_buffer) {
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] + c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] +
" < src_size.x;\n"; " < args.src_tensor.Width();\n";
if (src_tensor_type == TensorStorageType::BUFFER) { if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cx" + xs[x] + " = clamp(cx" + xs[x] + c += " cx" + xs[x] + " = clamp(cx" + xs[x] +
", 0, src_size.x - 1);\n"; ", 0, args.src_tensor.Width() - 1);\n";
} }
} }
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
@ -173,90 +174,95 @@ std::string GenerateConvCode(
const std::string id = std::to_string(y * block_size.x + x); const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute( c += absl::Substitute(
" int addr_$0 = select(-1, cy$2 * src_size.x + cx$1, (in_x$1 " " int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + "
"cx$1, (in_x$1 "
"&& " "&& "
"in_y$2));\n", "in_y$2));\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
c += absl::Substitute( c += absl::Substitute(
" int dz_$0 = select(0, src_size.x * src_size.y, (in_x$1 && " " int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n", "in_y$2));\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
} else { } else {
c += absl::Substitute(" int addr_$0 = cy$2 * src_size.x + cx$1;\n", c += absl::Substitute(
" int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
} }
} }
} }
if (src_tensor_type == TensorStorageType::BUFFER) { if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = src_size.x * src_size.y;\n"; c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
} }
} }
} else if (is_buffer) { } else if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] + c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] +
" < src_size.y;\n"; " < args.src_tensor.Height();\n";
} }
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] + c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] +
" < src_size.x;\n"; " < args.src_tensor.Width();\n";
} }
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x); const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute( c += absl::Substitute(
" int addr_$0 = select(-1, yc$2 * src_size.x + xc$1, (in_x$1 && " " int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + "
"xc$1, (in_x$1 && "
"in_y$2));\n", "in_y$2));\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
c += absl::Substitute( c += absl::Substitute(
" int dz_$0 = select(0, src_size.x * src_size.y, (in_x$1 && " " int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n", "in_y$2));\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
} else { } else {
c += absl::Substitute(" int addr_$0 = yc$2 * src_size.x + xc$1;\n", c += absl::Substitute(
" int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n",
y * block_size.x + x, x, y); y * block_size.x + x, x, y);
} }
} }
} }
if (src_tensor_type == TensorStorageType::BUFFER) { if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = src_size.x * src_size.y;\n"; c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
} }
} }
c += " for (int s = 0; s < src_size.z; ++s) {\n"; c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
if (is_buffer) { if (is_buffer) {
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
for (int index = 0; index < block_size.x * block_size.y; ++index) { for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index); const std::string id = std::to_string(index);
c += c +=
" FLT4 src" + id + " = " + src_tensor.Read("addr_" + id) + ";\n"; " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n";
} }
} else { } else {
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x); const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = " + src_tensor.Read("addr_" + id) + c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
" * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id + ") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
" += dz;\n"; " += dz;\n";
} }
} }
} }
} }
for (int z = 0; z < block_size.z; ++z) { for (int z = 0; z < block_size.z; ++z) {
const std::string fc = "(int2)(Z + " + zs[z] + ", " + f_y + ")"; c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1);
c += absl::Substitute(R"( FLT4 f$1 = READ_IMAGE(filters0, smp_none, $0); FLT4 f$3 = args.weights1.Read($0, $1);
FLT4 f$2 = READ_IMAGE(filters1, smp_none, $0); FLT4 f$4 = args.weights2.Read($0, $1);
FLT4 f$3 = READ_IMAGE(filters2, smp_none, $0); FLT4 f$5 = args.weights3.Read($0, $1);
FLT4 f$4 = READ_IMAGE(filters3, smp_none, $0);
)", )",
fc, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3); "Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2,
z * 4 + 3);
} }
if (!is_buffer) { if (!is_buffer) {
const auto mode = GetFastestZeroMode(device);
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x); const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = " + c += " FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " +
src_tensor.ReadWHS(s_x[x], s_y[y], "s", mode) + ";\n"; s_y[y] + ", s);\n";
} }
} }
} }
@ -278,17 +284,17 @@ std::string GenerateConvCode(
} }
} }
} }
c += " }\n"; // src_size.z c += " }\n"; // args.src_tensor.Slices()
if (!is1x1) { if (!is1x1) {
c += " }\n"; // kernel_size.x c += " }\n"; // kernel_size_x
c += " }\n"; // kernel_size.y c += " }\n"; // kernel_size_y
} }
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y // when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X"; std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y"; std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
for (int z = 0; z < block_size.z; ++z) { for (int z = 0; z < block_size.z; ++z) {
c += " if (Z < dst_size.z) {\n"; c += " if (Z < args.dst_tensor.Slices()) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n"; c += " FLT4 bias_val = args.biases.Read(Z);\n";
for (int y = 0; y < block_size.y; ++y) { for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) { for (int x = 0; x < block_size.x; ++x) {
const std::string id = const std::string id =
@ -296,11 +302,10 @@ std::string GenerateConvCode(
c += " {\n"; c += " {\n";
c += " int xc = " + dst_x + " + " + xs[x] + ";\n"; c += " int xc = " + dst_x + " + " + xs[x] + ";\n";
c += " int yc = " + dst_y + " + " + ys[y] + ";\n"; c += " int yc = " + dst_y + " + " + ys[y] + ";\n";
c += " if (xc < dst_size.x && yc < dst_size.y) {\n"; c += " if (xc < args.dst_tensor.Width() && yc < "
"args.dst_tensor.Height()) {\n";
c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
const LinkingContext context{"res", "xc", "yc", "Z"}; c += " args.dst_tensor.Write(res, xc, yc, Z);\n";
c += PostProcess(linked_operations, context);
c += " " + dst_tensor.WriteWHS("res", "xc", "yc", "Z") + "\n";
c += " }\n"; c += " }\n";
c += " }\n"; c += " }\n";
} }
@ -350,11 +355,6 @@ ConvTexture::ConvTexture(const OperationDef& definition)
ConvTexture::ConvTexture(ConvTexture&& operation) ConvTexture::ConvTexture(ConvTexture&& operation)
: GPUOperation(std::move(operation)), : GPUOperation(std::move(operation)),
weights_0_(std::move(operation.weights_0_)),
weights_1_(std::move(operation.weights_1_)),
weights_2_(std::move(operation.weights_2_)),
weights_3_(std::move(operation.weights_3_)),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_), kernel_size_(operation.kernel_size_),
stride_(operation.stride_), stride_(operation.stride_),
padding_(operation.padding_), padding_(operation.padding_),
@ -366,11 +366,6 @@ ConvTexture::ConvTexture(ConvTexture&& operation)
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) { ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
if (this != &operation) { if (this != &operation) {
weights_0_ = std::move(operation.weights_0_);
weights_1_ = std::move(operation.weights_1_);
weights_2_ = std::move(operation.weights_2_);
weights_3_ = std::move(operation.weights_3_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_); std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_); std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_); std::swap(padding_, operation.padding_);
@ -395,10 +390,16 @@ absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
definition_.precision == CalculationsPrecision::F16; definition_.precision == CalculationsPrecision::F16;
const bool stride_correction = const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1; definition_.IsBatchSupported() && stride_.x != 1;
const std::string code = std::string code =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization, GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, different_weights_for_height_, stride_correction, different_weights_for_height_,
*creation_context.device, linked_operations_); *creation_context.device, &args_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options; std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) { if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
@ -409,25 +410,20 @@ absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
} }
absl::Status ConvTexture::BindArguments() { absl::Status ConvTexture::BindArguments() {
kernel_.ResetBindingCounter(); RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_0_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_1_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_2_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_3_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) { if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_)); RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(kernel_.SetBytesAuto( RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
int2(dilation_.x * src_[0]->Batch(), dilation_.y))); RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
} }
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_)); RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR( RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y))); RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
return absl::OkStatus(); return absl::OkStatus();
} }

View File

@ -92,12 +92,6 @@ class ConvTexture : public GPUOperation {
absl::Status BindArguments(); absl::Status BindArguments();
int3 GetGridSize() const; int3 GetGridSize() const;
Texture2D weights_0_;
Texture2D weights_1_;
Texture2D weights_2_;
Texture2D weights_3_;
LinearStorage biases_;
int2 kernel_size_; int2 kernel_size_;
int2 stride_; int2 stride_;
int2 padding_; int2 padding_;
@ -119,11 +113,16 @@ absl::Status ConvTexture::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights, const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) { const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context)); RETURN_IF_ERROR(UploadWeights(weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D; TensorLinearDescriptor desc;
create_info.data_type = definition_.GetDataType(); desc.storage_type = LinearStorageType::TEXTURE_2D;
create_info.aligned_size = weights.shape.o; desc.element_type = definition_.GetDataType();
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
LinearStorage lt;
RETURN_IF_ERROR(CreateLinearStorage(desc, biases, context, &lt));
args_.AddObject("biases", AccessType::READ,
absl::make_unique<LinearStorage>(std::move(lt)),
absl::make_unique<TensorLinearDescriptor>(desc));
return absl::OkStatus(); return absl::OkStatus();
} }
@ -135,14 +134,19 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights); RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
RETURN_IF_ERROR(UploadWeights(wino_weights, context)); RETURN_IF_ERROR(UploadWeights(wino_weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition_.GetDataType();
create_info.aligned_size = 1;
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias; tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(1); bias.shape = Linear(1);
bias.data = {0.0f}; bias.data = {0.0f};
return CreateLinearStorage(create_info, bias, context, &biases_); TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition_.GetDataType();
LinearStorage lt;
RETURN_IF_ERROR(CreateLinearStorage(desc, bias, context, &lt));
args_.AddObject("biases", AccessType::READ,
absl::make_unique<LinearStorage>(std::move(lt)),
absl::make_unique<TensorLinearDescriptor>(desc));
return absl::OkStatus();
} }
template <DataType T> template <DataType T>
@ -157,11 +161,20 @@ absl::Status ConvTexture::UploadWeights(
int texture_width = dst_depth; int texture_width = dst_depth;
int texture_height = src_depth * kernel_x * kernel_y; int texture_height = src_depth * kernel_x * kernel_y;
DataType data_type = definition_.GetDataType(); const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
const int elements_count = texture_width * texture_height; const int elements_count = texture_width * texture_height;
if (data_type == DataType::FLOAT32) { Texture2DDescriptor desc;
desc.element_type = data_type;
Texture2D weights_0;
Texture2D weights_1;
Texture2D weights_2;
Texture2D weights_3;
if (f32_weights) {
std::vector<float4> gpu_data_0(elements_count); std::vector<float4> gpu_data_0(elements_count);
std::vector<float4> gpu_data_1(elements_count); std::vector<float4> gpu_data_1(elements_count);
std::vector<float4> gpu_data_2(elements_count); std::vector<float4> gpu_data_2(elements_count);
@ -171,15 +184,16 @@ absl::Status ConvTexture::UploadWeights(
absl::MakeSpan(gpu_data_3)); absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(), texture_height, gpu_data_0.data(),
context, &weights_0_)); context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(), texture_height, gpu_data_1.data(),
context, &weights_1_)); context, &weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(), texture_height, gpu_data_2.data(),
context, &weights_2_)); context, &weights_2));
return CreateTexture2DRGBA(data_type, texture_width, texture_height, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
gpu_data_3.data(), context, &weights_3_); texture_height, gpu_data_3.data(),
context, &weights_3));
} else { } else {
std::vector<half4> gpu_data_0(elements_count); std::vector<half4> gpu_data_0(elements_count);
std::vector<half4> gpu_data_1(elements_count); std::vector<half4> gpu_data_1(elements_count);
@ -190,16 +204,31 @@ absl::Status ConvTexture::UploadWeights(
absl::MakeSpan(gpu_data_3)); absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(), texture_height, gpu_data_0.data(),
context, &weights_0_)); context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(), texture_height, gpu_data_1.data(),
context, &weights_1_)); context, &weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(), texture_height, gpu_data_2.data(),
context, &weights_2_)); context, &weights_2));
return CreateTexture2DRGBA(data_type, texture_width, texture_height, RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
gpu_data_3.data(), context, &weights_3_); texture_height, gpu_data_3.data(),
context, &weights_3));
} }
args_.AddObject("weights0", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_0)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights1", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_1)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights2", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_2)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights3", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_3)),
absl::make_unique<Texture2DDescriptor>(desc));
return absl::OkStatus();
} }
template <DataType S, typename T> template <DataType S, typename T>

View File

@ -59,6 +59,41 @@ absl::Status CreateTexture2D(int width, int height, cl_channel_type type,
} }
} // namespace } // namespace
GPUResources Texture2DDescriptor::GetGPUResources(
AccessType access_type) const {
GPUResources resources;
GPUImage2DDescriptor desc;
desc.data_type = element_type;
desc.access_type = access_type;
resources.images2d.push_back({"tex2d", desc});
return resources;
}
absl::Status Texture2DDescriptor::PerformSelector(
const std::string& selector, const std::vector<std::string>& args,
const std::vector<std::string>& template_args, std::string* result) const {
if (selector == "Read") {
return PerformReadSelector(args, result);
} else {
return absl::NotFoundError(absl::StrCat(
"TensorLinearDescriptor don't have selector with name - ", selector));
}
}
absl::Status Texture2DDescriptor::PerformReadSelector(
const std::vector<std::string>& args, std::string* result) const {
if (args.size() != 2) {
return absl::NotFoundError(
absl::StrCat("Texture2DDescriptor Read require one argument, but ",
args.size(), " was passed"));
}
const std::string read =
element_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
*result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0],
", " + args[1] + "))");
return absl::OkStatus();
}
Texture2D::Texture2D(cl_mem texture, int width, int height, Texture2D::Texture2D(cl_mem texture, int width, int height,
cl_channel_type type) cl_channel_type type)
: texture_(texture), width_(width), height_(height), channel_type_(type) {} : texture_(texture), width_(width), height_(height), channel_type_(type) {}
@ -95,6 +130,12 @@ void Texture2D::Release() {
} }
} }
GPUResourcesWithValue Texture2D::GetGPUResources(AccessType access_type) const {
GPUResourcesWithValue resources;
resources.images2d.push_back({"tex2d", texture_});
return resources;
}
// Creates new 4-channel 2D texture with f32 elements // Creates new 4-channel 2D texture with f32 elements
absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext* context, absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext* context,
Texture2D* result) { Texture2D* result) {

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "absl/types/span.h" #include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/cl/util.h"
@ -30,9 +31,22 @@ namespace tflite {
namespace gpu { namespace gpu {
namespace cl { namespace cl {
struct Texture2DDescriptor : public GPUObjectDescriptor {
DataType element_type; // FLOAT32 or FLOAT16
absl::Status PerformSelector(const std::string& selector,
const std::vector<std::string>& args,
const std::vector<std::string>& template_args,
std::string* result) const override;
GPUResources GetGPUResources(AccessType access_type) const override;
absl::Status PerformReadSelector(const std::vector<std::string>& args,
std::string* result) const;
};
// Texture2D represent formatted GPU data storage. // Texture2D represent formatted GPU data storage.
// Texture2D is moveable but not copyable. // Texture2D is moveable but not copyable.
class Texture2D { class Texture2D : public GPUObject {
public: public:
Texture2D() {} // just for using Texture2D as a class members Texture2D() {} // just for using Texture2D as a class members
Texture2D(cl_mem texture, int width, int height, cl_channel_type type); Texture2D(cl_mem texture, int width, int height, cl_channel_type type);
@ -56,6 +70,8 @@ class Texture2D {
template <typename T> template <typename T>
absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const; absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
GPUResourcesWithValue GetGPUResources(AccessType access_type) const override;
private: private:
void Release(); void Release();