ConvTexture converted to new style.
Texture2D implements GPUObject. PiperOrigin-RevId: 317752276 Change-Id: Id1c66b2f6ca7b70475cc82abc422935fc3f1a251
This commit is contained in:
parent
26421826a0
commit
23d5a2e00a
@ -523,6 +523,7 @@ cc_library(
|
||||
deps = [
|
||||
":cl_command_queue",
|
||||
":cl_context",
|
||||
":gpu_object",
|
||||
":opencl_wrapper",
|
||||
":tensor_type",
|
||||
":util",
|
||||
|
@ -31,18 +31,32 @@ namespace gpu {
|
||||
namespace cl {
|
||||
namespace {
|
||||
|
||||
std::string GenerateConvCode(
|
||||
const OperationDef& op_def, const int3& block_size, bool is1x1,
|
||||
bool adreno4xx_optimization, bool stride_correction,
|
||||
bool different_weights_for_height, const CLDevice& device,
|
||||
const std::vector<ElementwiseOperation*>& linked_operations) {
|
||||
std::string c = GetCommonDefines(op_def.precision);
|
||||
TensorCodeGenerator src_tensor(
|
||||
"src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"},
|
||||
op_def.src_tensors[0]);
|
||||
TensorCodeGenerator dst_tensor(
|
||||
"dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"},
|
||||
op_def.dst_tensors[0]);
|
||||
std::string GenerateConvCode(const OperationDef& op_def, const int3& block_size,
|
||||
bool is1x1, bool adreno4xx_optimization,
|
||||
bool stride_correction,
|
||||
bool different_weights_for_height,
|
||||
const CLDevice& device, Arguments* args) {
|
||||
auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
|
||||
src_desc->SetTextureAddressMode(GetFastestZeroMode(device));
|
||||
if (op_def.IsBatchSupported()) {
|
||||
src_desc->SetStateVar("BatchedWidth", "true");
|
||||
}
|
||||
args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
|
||||
auto dst_desc = absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]);
|
||||
if (op_def.IsBatchSupported()) {
|
||||
dst_desc->SetStateVar("BatchedWidth", "true");
|
||||
}
|
||||
args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
|
||||
if (!is1x1) {
|
||||
args->AddInt("kernel_size_x");
|
||||
args->AddInt("kernel_size_y");
|
||||
args->AddInt("dilation_x");
|
||||
args->AddInt("dilation_y");
|
||||
}
|
||||
args->AddInt("stride_x");
|
||||
args->AddInt("stride_y");
|
||||
args->AddInt("padding_x");
|
||||
args->AddInt("padding_y");
|
||||
|
||||
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
|
||||
const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER ||
|
||||
@ -63,6 +77,7 @@ std::string GenerateConvCode(
|
||||
zs[z] = std::to_string(z);
|
||||
}
|
||||
|
||||
std::string c = GetCommonDefines(op_def.precision);
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
const std::string f0 = std::to_string(z * 4 + 0);
|
||||
const std::string f1 = std::to_string(z * 4 + 1);
|
||||
@ -86,43 +101,29 @@ std::string GenerateConvCode(
|
||||
}
|
||||
|
||||
c += "__kernel void main_function(\n";
|
||||
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
|
||||
c += " __read_only image2d_t filters0, \n";
|
||||
c += " __read_only image2d_t filters1, \n";
|
||||
c += " __read_only image2d_t filters2, \n";
|
||||
c += " __read_only image2d_t filters3, \n";
|
||||
c += " __read_only image2d_t biases";
|
||||
c += GetArgsDeclaration(linked_operations);
|
||||
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
|
||||
c += " int4 src_size, \n";
|
||||
c += " int4 dst_size, \n";
|
||||
if (!is1x1) {
|
||||
c += " int2 kernel_size, \n";
|
||||
c += " int2 dilation, \n";
|
||||
}
|
||||
c += " int2 stride, \n";
|
||||
c += " int2 padding \n";
|
||||
c += ") {\n";
|
||||
c += "$0) {\n";
|
||||
c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
|
||||
c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
|
||||
c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
|
||||
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n";
|
||||
c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
|
||||
"|| Z >= args.dst_tensor.Slices()) return;\n";
|
||||
std::vector<std::string> s_x(block_size.x);
|
||||
std::vector<std::string> s_y(block_size.y);
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
if (stride_correction) {
|
||||
c += " int xc" + xs[x] + " = " +
|
||||
GetXStrideCorrected("X + " + xs[x], "src_size.w", "stride.x",
|
||||
"padding.x") +
|
||||
GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()",
|
||||
"args.stride_x", "args.padding_x") +
|
||||
";\n";
|
||||
} else {
|
||||
c += " int xc" + xs[x] + " = (X +" + xs[x] +
|
||||
") * stride.x + padding.x;\n";
|
||||
") * args.stride_x + args.padding_x;\n";
|
||||
}
|
||||
s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x];
|
||||
}
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
c += " int yc" + ys[y] + " = (Y +" + ys[y] + ") * stride.y + padding.y;\n";
|
||||
c += " int yc" + ys[y] + " = (Y +" + ys[y] +
|
||||
") * args.stride_y + args.padding_y;\n";
|
||||
s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y];
|
||||
}
|
||||
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
|
||||
@ -131,7 +132,7 @@ std::string GenerateConvCode(
|
||||
}
|
||||
std::string f_y = is1x1 ? "s" : "filter_offset";
|
||||
if (different_weights_for_height) {
|
||||
f_y = "Y * src_size.z + s";
|
||||
f_y = "Y * args.src_tensor.Slices() + s";
|
||||
}
|
||||
if (!is1x1) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
@ -141,31 +142,31 @@ std::string GenerateConvCode(
|
||||
c += " int cy" + ys[y] + ";\n";
|
||||
}
|
||||
c += " int filter_offset = 0;\n";
|
||||
c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
|
||||
c += " for (int y = 0; y < args.kernel_size_y; ++y) {\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
c += " cy" + ys[y] + " = y * dilation.y + yc" + ys[y] + ";\n";
|
||||
c += " cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n";
|
||||
}
|
||||
if (is_buffer) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] +
|
||||
" < src_size.y;\n";
|
||||
" < args.src_tensor.Height();\n";
|
||||
if (src_tensor_type == TensorStorageType::BUFFER) {
|
||||
c += " cy" + ys[y] + " = clamp(cy" + ys[y] +
|
||||
", 0, src_size.y - 1);\n";
|
||||
", 0, args.src_tensor.Height() - 1);\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
|
||||
c += " for (int x = 0; x < args.kernel_size_x; ++x) {\n";
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
c += " cx" + xs[x] + " = x * dilation.x + xc" + xs[x] + ";\n";
|
||||
c += " cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n";
|
||||
}
|
||||
if (is_buffer) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] +
|
||||
" < src_size.x;\n";
|
||||
" < args.src_tensor.Width();\n";
|
||||
if (src_tensor_type == TensorStorageType::BUFFER) {
|
||||
c += " cx" + xs[x] + " = clamp(cx" + xs[x] +
|
||||
", 0, src_size.x - 1);\n";
|
||||
", 0, args.src_tensor.Width() - 1);\n";
|
||||
}
|
||||
}
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
@ -173,90 +174,95 @@ std::string GenerateConvCode(
|
||||
const std::string id = std::to_string(y * block_size.x + x);
|
||||
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
|
||||
c += absl::Substitute(
|
||||
" int addr_$0 = select(-1, cy$2 * src_size.x + cx$1, (in_x$1 "
|
||||
" int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + "
|
||||
"cx$1, (in_x$1 "
|
||||
"&& "
|
||||
"in_y$2));\n",
|
||||
y * block_size.x + x, x, y);
|
||||
c += absl::Substitute(
|
||||
" int dz_$0 = select(0, src_size.x * src_size.y, (in_x$1 && "
|
||||
" int dz_$0 = select(0, args.src_tensor.Width() * "
|
||||
"args.src_tensor.Height(), (in_x$1 && "
|
||||
"in_y$2));\n",
|
||||
y * block_size.x + x, x, y);
|
||||
} else {
|
||||
c += absl::Substitute(" int addr_$0 = cy$2 * src_size.x + cx$1;\n",
|
||||
y * block_size.x + x, x, y);
|
||||
c += absl::Substitute(
|
||||
" int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n",
|
||||
y * block_size.x + x, x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (src_tensor_type == TensorStorageType::BUFFER) {
|
||||
c += " int dz = src_size.x * src_size.y;\n";
|
||||
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
|
||||
}
|
||||
}
|
||||
} else if (is_buffer) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] +
|
||||
" < src_size.y;\n";
|
||||
" < args.src_tensor.Height();\n";
|
||||
}
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] +
|
||||
" < src_size.x;\n";
|
||||
" < args.src_tensor.Width();\n";
|
||||
}
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
const std::string id = std::to_string(y * block_size.x + x);
|
||||
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
|
||||
c += absl::Substitute(
|
||||
" int addr_$0 = select(-1, yc$2 * src_size.x + xc$1, (in_x$1 && "
|
||||
" int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + "
|
||||
"xc$1, (in_x$1 && "
|
||||
"in_y$2));\n",
|
||||
y * block_size.x + x, x, y);
|
||||
c += absl::Substitute(
|
||||
" int dz_$0 = select(0, src_size.x * src_size.y, (in_x$1 && "
|
||||
" int dz_$0 = select(0, args.src_tensor.Width() * "
|
||||
"args.src_tensor.Height(), (in_x$1 && "
|
||||
"in_y$2));\n",
|
||||
y * block_size.x + x, x, y);
|
||||
} else {
|
||||
c += absl::Substitute(" int addr_$0 = yc$2 * src_size.x + xc$1;\n",
|
||||
y * block_size.x + x, x, y);
|
||||
c += absl::Substitute(
|
||||
" int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n",
|
||||
y * block_size.x + x, x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (src_tensor_type == TensorStorageType::BUFFER) {
|
||||
c += " int dz = src_size.x * src_size.y;\n";
|
||||
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
|
||||
}
|
||||
}
|
||||
c += " for (int s = 0; s < src_size.z; ++s) {\n";
|
||||
c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
|
||||
if (is_buffer) {
|
||||
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
|
||||
for (int index = 0; index < block_size.x * block_size.y; ++index) {
|
||||
const std::string id = std::to_string(index);
|
||||
c +=
|
||||
" FLT4 src" + id + " = " + src_tensor.Read("addr_" + id) + ";\n";
|
||||
" FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n";
|
||||
}
|
||||
} else {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
const std::string id = std::to_string(y * block_size.x + x);
|
||||
c += " FLT4 src" + id + " = " + src_tensor.Read("addr_" + id) +
|
||||
" * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
|
||||
c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
|
||||
") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
|
||||
" += dz;\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
const std::string fc = "(int2)(Z + " + zs[z] + ", " + f_y + ")";
|
||||
c += absl::Substitute(R"( FLT4 f$1 = READ_IMAGE(filters0, smp_none, $0);
|
||||
FLT4 f$2 = READ_IMAGE(filters1, smp_none, $0);
|
||||
FLT4 f$3 = READ_IMAGE(filters2, smp_none, $0);
|
||||
FLT4 f$4 = READ_IMAGE(filters3, smp_none, $0);
|
||||
c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1);
|
||||
FLT4 f$3 = args.weights1.Read($0, $1);
|
||||
FLT4 f$4 = args.weights2.Read($0, $1);
|
||||
FLT4 f$5 = args.weights3.Read($0, $1);
|
||||
)",
|
||||
fc, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3);
|
||||
"Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2,
|
||||
z * 4 + 3);
|
||||
}
|
||||
if (!is_buffer) {
|
||||
const auto mode = GetFastestZeroMode(device);
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
const std::string id = std::to_string(y * block_size.x + x);
|
||||
c += " FLT4 src" + id + " = " +
|
||||
src_tensor.ReadWHS(s_x[x], s_y[y], "s", mode) + ";\n";
|
||||
c += " FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " +
|
||||
s_y[y] + ", s);\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -278,17 +284,17 @@ std::string GenerateConvCode(
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " }\n"; // src_size.z
|
||||
c += " }\n"; // args.src_tensor.Slices()
|
||||
if (!is1x1) {
|
||||
c += " }\n"; // kernel_size.x
|
||||
c += " }\n"; // kernel_size.y
|
||||
c += " }\n"; // kernel_size_x
|
||||
c += " }\n"; // kernel_size_y
|
||||
}
|
||||
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
|
||||
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
|
||||
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
c += " if (Z < dst_size.z) {\n";
|
||||
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
|
||||
c += " if (Z < args.dst_tensor.Slices()) {\n";
|
||||
c += " FLT4 bias_val = args.biases.Read(Z);\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
const std::string id =
|
||||
@ -296,11 +302,10 @@ std::string GenerateConvCode(
|
||||
c += " {\n";
|
||||
c += " int xc = " + dst_x + " + " + xs[x] + ";\n";
|
||||
c += " int yc = " + dst_y + " + " + ys[y] + ";\n";
|
||||
c += " if (xc < dst_size.x && yc < dst_size.y) {\n";
|
||||
c += " if (xc < args.dst_tensor.Width() && yc < "
|
||||
"args.dst_tensor.Height()) {\n";
|
||||
c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
|
||||
const LinkingContext context{"res", "xc", "yc", "Z"};
|
||||
c += PostProcess(linked_operations, context);
|
||||
c += " " + dst_tensor.WriteWHS("res", "xc", "yc", "Z") + "\n";
|
||||
c += " args.dst_tensor.Write(res, xc, yc, Z);\n";
|
||||
c += " }\n";
|
||||
c += " }\n";
|
||||
}
|
||||
@ -350,11 +355,6 @@ ConvTexture::ConvTexture(const OperationDef& definition)
|
||||
|
||||
ConvTexture::ConvTexture(ConvTexture&& operation)
|
||||
: GPUOperation(std::move(operation)),
|
||||
weights_0_(std::move(operation.weights_0_)),
|
||||
weights_1_(std::move(operation.weights_1_)),
|
||||
weights_2_(std::move(operation.weights_2_)),
|
||||
weights_3_(std::move(operation.weights_3_)),
|
||||
biases_(std::move(operation.biases_)),
|
||||
kernel_size_(operation.kernel_size_),
|
||||
stride_(operation.stride_),
|
||||
padding_(operation.padding_),
|
||||
@ -366,11 +366,6 @@ ConvTexture::ConvTexture(ConvTexture&& operation)
|
||||
|
||||
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
|
||||
if (this != &operation) {
|
||||
weights_0_ = std::move(operation.weights_0_);
|
||||
weights_1_ = std::move(operation.weights_1_);
|
||||
weights_2_ = std::move(operation.weights_2_);
|
||||
weights_3_ = std::move(operation.weights_3_);
|
||||
biases_ = std::move(operation.biases_);
|
||||
std::swap(kernel_size_, operation.kernel_size_);
|
||||
std::swap(stride_, operation.stride_);
|
||||
std::swap(padding_, operation.padding_);
|
||||
@ -395,10 +390,16 @@ absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
|
||||
definition_.precision == CalculationsPrecision::F16;
|
||||
const bool stride_correction =
|
||||
definition_.IsBatchSupported() && stride_.x != 1;
|
||||
const std::string code =
|
||||
std::string code =
|
||||
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
|
||||
stride_correction, different_weights_for_height_,
|
||||
*creation_context.device, linked_operations_);
|
||||
*creation_context.device, &args_);
|
||||
std::string element_wise_code;
|
||||
RETURN_IF_ERROR(
|
||||
MergeOperations(linked_operations_, &args_, &element_wise_code));
|
||||
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
|
||||
{{"dst_tensor", element_wise_code}},
|
||||
&code));
|
||||
std::vector<CompilerOptions> options;
|
||||
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
|
||||
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
|
||||
@ -409,25 +410,20 @@ absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
|
||||
}
|
||||
|
||||
absl::Status ConvTexture::BindArguments() {
|
||||
kernel_.ResetBindingCounter();
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_0_.GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_1_.GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_2_.GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_3_.GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
|
||||
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
|
||||
RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
|
||||
RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
|
||||
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(
|
||||
int2(dilation_.x * src_[0]->Batch(), dilation_.y)));
|
||||
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
|
||||
RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
|
||||
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
|
||||
RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
|
||||
}
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
|
||||
RETURN_IF_ERROR(
|
||||
kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y)));
|
||||
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
|
||||
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
|
||||
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
|
||||
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
|
||||
RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
|
||||
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -92,12 +92,6 @@ class ConvTexture : public GPUOperation {
|
||||
absl::Status BindArguments();
|
||||
int3 GetGridSize() const;
|
||||
|
||||
Texture2D weights_0_;
|
||||
Texture2D weights_1_;
|
||||
Texture2D weights_2_;
|
||||
Texture2D weights_3_;
|
||||
LinearStorage biases_;
|
||||
|
||||
int2 kernel_size_;
|
||||
int2 stride_;
|
||||
int2 padding_;
|
||||
@ -119,11 +113,16 @@ absl::Status ConvTexture::UploadData(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
|
||||
RETURN_IF_ERROR(UploadWeights(weights, context));
|
||||
LinearStorageCreateInfo create_info;
|
||||
create_info.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
create_info.data_type = definition_.GetDataType();
|
||||
create_info.aligned_size = weights.shape.o;
|
||||
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
|
||||
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
desc.element_type = definition_.GetDataType();
|
||||
|
||||
LinearStorage lt;
|
||||
RETURN_IF_ERROR(CreateLinearStorage(desc, biases, context, <));
|
||||
args_.AddObject("biases", AccessType::READ,
|
||||
absl::make_unique<LinearStorage>(std::move(lt)),
|
||||
absl::make_unique<TensorLinearDescriptor>(desc));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
@ -135,14 +134,19 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
|
||||
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
|
||||
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
|
||||
|
||||
LinearStorageCreateInfo create_info;
|
||||
create_info.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
create_info.data_type = definition_.GetDataType();
|
||||
create_info.aligned_size = 1;
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
|
||||
bias.shape = Linear(1);
|
||||
bias.data = {0.0f};
|
||||
return CreateLinearStorage(create_info, bias, context, &biases_);
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
desc.element_type = definition_.GetDataType();
|
||||
|
||||
LinearStorage lt;
|
||||
RETURN_IF_ERROR(CreateLinearStorage(desc, bias, context, <));
|
||||
args_.AddObject("biases", AccessType::READ,
|
||||
absl::make_unique<LinearStorage>(std::move(lt)),
|
||||
absl::make_unique<TensorLinearDescriptor>(desc));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
@ -157,11 +161,20 @@ absl::Status ConvTexture::UploadWeights(
|
||||
int texture_width = dst_depth;
|
||||
int texture_height = src_depth * kernel_x * kernel_y;
|
||||
|
||||
DataType data_type = definition_.GetDataType();
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
|
||||
const int elements_count = texture_width * texture_height;
|
||||
|
||||
if (data_type == DataType::FLOAT32) {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = data_type;
|
||||
|
||||
Texture2D weights_0;
|
||||
Texture2D weights_1;
|
||||
Texture2D weights_2;
|
||||
Texture2D weights_3;
|
||||
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data_0(elements_count);
|
||||
std::vector<float4> gpu_data_1(elements_count);
|
||||
std::vector<float4> gpu_data_2(elements_count);
|
||||
@ -171,15 +184,16 @@ absl::Status ConvTexture::UploadWeights(
|
||||
absl::MakeSpan(gpu_data_3));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_0.data(),
|
||||
context, &weights_0_));
|
||||
context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_1.data(),
|
||||
context, &weights_1_));
|
||||
context, &weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_2.data(),
|
||||
context, &weights_2_));
|
||||
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
|
||||
gpu_data_3.data(), context, &weights_3_);
|
||||
context, &weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_3.data(),
|
||||
context, &weights_3));
|
||||
} else {
|
||||
std::vector<half4> gpu_data_0(elements_count);
|
||||
std::vector<half4> gpu_data_1(elements_count);
|
||||
@ -190,16 +204,31 @@ absl::Status ConvTexture::UploadWeights(
|
||||
absl::MakeSpan(gpu_data_3));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_0.data(),
|
||||
context, &weights_0_));
|
||||
context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_1.data(),
|
||||
context, &weights_1_));
|
||||
context, &weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_2.data(),
|
||||
context, &weights_2_));
|
||||
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
|
||||
gpu_data_3.data(), context, &weights_3_);
|
||||
context, &weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_3.data(),
|
||||
context, &weights_3));
|
||||
}
|
||||
|
||||
args_.AddObject("weights0", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_0)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights1", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_1)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights2", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_2)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights3", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_3)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType S, typename T>
|
||||
|
@ -59,6 +59,41 @@ absl::Status CreateTexture2D(int width, int height, cl_channel_type type,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
GPUResources Texture2DDescriptor::GetGPUResources(
|
||||
AccessType access_type) const {
|
||||
GPUResources resources;
|
||||
GPUImage2DDescriptor desc;
|
||||
desc.data_type = element_type;
|
||||
desc.access_type = access_type;
|
||||
resources.images2d.push_back({"tex2d", desc});
|
||||
return resources;
|
||||
}
|
||||
|
||||
absl::Status Texture2DDescriptor::PerformSelector(
|
||||
const std::string& selector, const std::vector<std::string>& args,
|
||||
const std::vector<std::string>& template_args, std::string* result) const {
|
||||
if (selector == "Read") {
|
||||
return PerformReadSelector(args, result);
|
||||
} else {
|
||||
return absl::NotFoundError(absl::StrCat(
|
||||
"TensorLinearDescriptor don't have selector with name - ", selector));
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status Texture2DDescriptor::PerformReadSelector(
|
||||
const std::vector<std::string>& args, std::string* result) const {
|
||||
if (args.size() != 2) {
|
||||
return absl::NotFoundError(
|
||||
absl::StrCat("Texture2DDescriptor Read require one argument, but ",
|
||||
args.size(), " was passed"));
|
||||
}
|
||||
const std::string read =
|
||||
element_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
|
||||
*result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0],
|
||||
", " + args[1] + "))");
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
Texture2D::Texture2D(cl_mem texture, int width, int height,
|
||||
cl_channel_type type)
|
||||
: texture_(texture), width_(width), height_(height), channel_type_(type) {}
|
||||
@ -95,6 +130,12 @@ void Texture2D::Release() {
|
||||
}
|
||||
}
|
||||
|
||||
GPUResourcesWithValue Texture2D::GetGPUResources(AccessType access_type) const {
|
||||
GPUResourcesWithValue resources;
|
||||
resources.images2d.push_back({"tex2d", texture_});
|
||||
return resources;
|
||||
}
|
||||
|
||||
// Creates new 4-channel 2D texture with f32 elements
|
||||
absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext* context,
|
||||
Texture2D* result) {
|
||||
|
@ -20,6 +20,7 @@ limitations under the License.
|
||||
#include "absl/types/span.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/util.h"
|
||||
@ -30,9 +31,22 @@ namespace tflite {
|
||||
namespace gpu {
|
||||
namespace cl {
|
||||
|
||||
struct Texture2DDescriptor : public GPUObjectDescriptor {
|
||||
DataType element_type; // FLOAT32 or FLOAT16
|
||||
|
||||
absl::Status PerformSelector(const std::string& selector,
|
||||
const std::vector<std::string>& args,
|
||||
const std::vector<std::string>& template_args,
|
||||
std::string* result) const override;
|
||||
|
||||
GPUResources GetGPUResources(AccessType access_type) const override;
|
||||
absl::Status PerformReadSelector(const std::vector<std::string>& args,
|
||||
std::string* result) const;
|
||||
};
|
||||
|
||||
// Texture2D represent formatted GPU data storage.
|
||||
// Texture2D is moveable but not copyable.
|
||||
class Texture2D {
|
||||
class Texture2D : public GPUObject {
|
||||
public:
|
||||
Texture2D() {} // just for using Texture2D as a class members
|
||||
Texture2D(cl_mem texture, int width, int height, cl_channel_type type);
|
||||
@ -56,6 +70,8 @@ class Texture2D {
|
||||
template <typename T>
|
||||
absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
|
||||
|
||||
GPUResourcesWithValue GetGPUResources(AccessType access_type) const override;
|
||||
|
||||
private:
|
||||
void Release();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user