ConvTexture replaced with ConvPowerVR.

PiperOrigin-RevId: 336742538
Change-Id: I78fdaaa2d98d59ededfec90cffe1cb53747137f4
Raman Sarokin 2020-10-12 14:22:01 -07:00 committed by TensorFlower Gardener
parent 1782a45957
commit b082980a84
8 changed files with 57 additions and 833 deletions

tensorflow/lite/delegates/gpu/cl/kernels/BUILD

@@ -239,50 +239,6 @@ cc_test(
],
)
-cc_library(
-name = "conv_texture",
-srcs = ["conv_texture.cc"],
-hdrs = ["conv_texture.h"],
-deps = [
-":gpu_operation",
-":util",
-":work_group_picking",
-"//tensorflow/lite/delegates/gpu/cl:cl_command_queue",
-"//tensorflow/lite/delegates/gpu/cl:cl_context",
-"//tensorflow/lite/delegates/gpu/cl:linear_storage",
-"//tensorflow/lite/delegates/gpu/cl:precision",
-"//tensorflow/lite/delegates/gpu/cl:tensor",
-"//tensorflow/lite/delegates/gpu/cl:tensor_type",
-"//tensorflow/lite/delegates/gpu/cl:texture2d",
-"//tensorflow/lite/delegates/gpu/cl:util",
-"//tensorflow/lite/delegates/gpu/common:data_type",
-"//tensorflow/lite/delegates/gpu/common:operations",
-"//tensorflow/lite/delegates/gpu/common:shape",
-"//tensorflow/lite/delegates/gpu/common:status",
-"//tensorflow/lite/delegates/gpu/common:tensor",
-"//tensorflow/lite/delegates/gpu/common:types",
-"//tensorflow/lite/delegates/gpu/common:winograd_util",
-"@com_google_absl//absl/strings",
-],
-)
-cc_test(
-name = "conv_texture_test",
-srcs = ["conv_texture_test.cc"],
-linkstatic = True,
-tags = tf_gpu_tests_tags() + [
-"linux",
-"local",
-],
-deps = [
-":cl_test",
-":conv_texture",
-"//tensorflow/lite/delegates/gpu/common:operations",
-"//tensorflow/lite/delegates/gpu/common:status",
-"@com_google_googletest//:gtest_main",
-],
-)
cc_library(
name = "conv_weights_converter",
srcs = ["conv_weights_converter.cc"],
@@ -1397,7 +1353,6 @@ test_suite(
"conv_buffer_1x1_test",
"conv_constants_test",
"conv_powervr_test",
"conv_texture_test",
"convolution_transposed_3x3_thin_test",
"convolution_transposed_4x4_test",
"convolution_transposed_test",

tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc

@@ -233,6 +233,16 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) {
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
+bool kernel_is_trivial =
+conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
+if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
+kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
+}
+if (device_info.IsAdreno3xx() &&
+definition_.precision == CalculationsPrecision::F16 &&
+kernel_is_trivial) {
+compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
+}
}
absl::Status ConvPowerVR::BindArguments() {
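This added block appears to port the UseFP16SIMD heuristic from the deleted conv_texture.cc (visible further down: Adreno 3xx, F16 precision, 1x1 kernel), generalized here to also require a trivial z kernel when the source tensor has a DEPTH axis.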
@@ -284,23 +294,34 @@ int3 ConvPowerVR::GetGridSize() const {
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_x *= task_size_z;
}
-wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
-return int3(
-wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+if (conv_params_.work_group_launch_order[0] == 0 &&
+conv_params_.work_group_launch_order[1] == 1) {
+return int3(grid_x, task_size_s, 1);
+} else {
+wg.x = DivideRoundUp(grid_x, work_group_size_.x);
+wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
+return int3(
+wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+}
} else {
int grid_y = task_size_y;
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_y *= task_size_z;
}
-wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
-wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
-return int3(
-wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
-wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+if (conv_params_.work_group_launch_order[0] == 0 &&
+conv_params_.work_group_launch_order[1] == 1 &&
+conv_params_.work_group_launch_order[2] == 2) {
+return int3(task_size_x, grid_y, task_size_s);
+} else {
+wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
+wg.y = DivideRoundUp(grid_y, work_group_size_.y);
+wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
+return int3(
+wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
+wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+}
}
}
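In effect, the new code returns the raw task sizes when the launch order is the identity permutation and only falls back to the rounded, reordered grid otherwise. A minimal standalone sketch of the 3D branch, assuming DivideRoundUp is integer ceil-division as in the TFLite GPU helpers; GetConvGrid and the free-standing int3 are illustrative stand-ins, not names from the source:

#include <array>

struct int3 {
  int x, y, z;
};

// Integer ceil-division, as in tflite/delegates/gpu/common/util.h.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Hypothetical mirror of the branch above: with an identity launch order
// the task sizes are returned directly (the runtime rounds the grid up to
// the work-group size later); otherwise each dimension is rounded up to a
// whole number of work groups and the counts are permuted by the order.
int3 GetConvGrid(int task_size_x, int grid_y, int task_size_s,
                 const int3& wg_size, const std::array<int, 3>& order) {
  if (order[0] == 0 && order[1] == 1 && order[2] == 2) {
    return {task_size_x, grid_y, task_size_s};
  }
  const std::array<int, 3> wg = {DivideRoundUp(task_size_x, wg_size.x),
                                 DivideRoundUp(grid_y, wg_size.y),
                                 DivideRoundUp(task_size_s, wg_size.z)};
  return {wg[order[0]] * wg_size.x, wg[order[1]] * wg_size.y,
          wg[order[2]] * wg_size.z};
}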
@@ -1190,7 +1211,16 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.fixed_work_group_size = false;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device_info.IsAdreno()) {
-conv_params.block_size = int4(2, 2, 1, 1);
+conv_params.block_size = int4(2, 2, 1, 2);
+if (device_info.IsAdreno3xx()) {
+if (definition.precision == CalculationsPrecision::F16) {
+conv_params.block_size = int4(2, 2, 1, 2);
+} else if (definition.precision == CalculationsPrecision::F32_F16) {
+conv_params.block_size = int4(2, 1, 1, 2);
+} else {  // F32
+conv_params.block_size = int4(2, 2, 1, 1);
+}
+}
work_group_size_ = int3(8, 2, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
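Assuming the int4 block-size lanes mean (x, y, z, output slices) of FLT4 results per thread, as elsewhere in ConvPowerVR, the Adreno 3xx choices trade spatial blocking against register pressure: F16 keeps 2 * 2 * 1 * 2 = 8 accumulators per thread, F32_F16 drops to 2 * 1 * 1 * 2 = 4, and full F32 uses 2 * 2 * 1 * 1 = 4 with wider accumulators.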

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc (deleted)

@@ -1,461 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision,
bool kernel1x1) {
if (!device_info.IsAdreno()) {
return false;
}
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device_info.IsAdreno3xx() && kernel1x1;
}
}
} // namespace
ConvTexture::ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
different_weights_for_height_(false),
block_size_(2, 2, 2) {
work_group_size_ = int3(4, 4, 2);
}
ConvTexture::ConvTexture(const OperationDef& definition)
: GPUOperation(definition),
kernel_size_(1, 1),
stride_(1, 1),
padding_(0, 0),
dilation_(1, 1),
different_weights_for_height_(false),
block_size_(4, 1, 2) {
work_group_size_ = int3(16, 1, 2);
}
ConvTexture::ConvTexture(ConvTexture&& operation)
: GPUOperation(std::move(operation)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
different_weights_for_height_(operation.different_weights_for_height_),
block_size_(operation.block_size_) {}
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
if (this != &operation) {
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(different_weights_for_height_,
operation.different_weights_for_height_);
std::swap(block_size_, operation.block_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
dst_desc.SetStateVar("BatchedWidth", "true");
}
AddDstTensor("dst_tensor", dst_desc);
if (!is1x1) {
args_.AddInt("kernel_size_x");
args_.AddInt("kernel_size_y");
args_.AddInt("dilation_x");
args_.AddInt("dilation_y");
}
args_.AddInt("stride_x");
args_.AddInt("stride_y");
args_.AddInt("padding_x");
args_.AddInt("padding_y");
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER ||
src_tensor_type == TensorStorageType::BUFFER;
std::vector<std::string> xs(block_size.x);
for (int x = 0; x < block_size.x; ++x) {
xs[x] = std::to_string(x);
}
std::vector<std::string> ys(block_size.y);
for (int y = 0; y < block_size.y; ++y) {
ys[y] = std::to_string(y);
}
std::vector<std::string> zs(block_size.z);
for (int z = 0; z < block_size.z; ++z) {
zs[z] = std::to_string(z);
}
std::string c = GetCommonDefines(op_def.precision);
for (int z = 0; z < block_size.z; ++z) {
const std::string f0 = std::to_string(z * 4 + 0);
const std::string f1 = std::to_string(z * 4 + 1);
const std::string f2 = std::to_string(z * 4 + 2);
const std::string f3 = std::to_string(z * 4 + 3);
switch (op_def.precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV" + zs[z] + "(R, S) \\\n";
c += "R += S.x * f" + f0 + "; \\\n";
c += "R += S.y * f" + f1 + "; \\\n";
c += "R += S.z * f" + f2 + "; \\\n";
c += "R += S.w * f" + f3 + "; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV" + zs[z] + "(R, S) \\\n";
c += "R += convert_float4(S.x * f" + f0 + " + S.y * f" + f1 +
" + S.z * f" + f2 + " + S.w * f" + f3 + ");\n";
break;
}
}
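// For reference, for z == 0 at F32 or F16 precision the concatenation
// above emits the following macro text (a reconstruction from the string
// literals, including the backslash line continuations):
//
//   #define CONV0(R, S) \
//   R += S.x * f0; \
//   R += S.y * f1; \
//   R += S.z * f2; \
//   R += S.w * f3;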
c += "__kernel void main_function(\n";
c += "$0) {\n";
c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
"|| Z >= args.dst_tensor.Slices()) return;\n";
std::vector<std::string> s_x(block_size.x);
std::vector<std::string> s_y(block_size.y);
for (int x = 0; x < block_size.x; ++x) {
if (stride_correction) {
c += " int xc" + xs[x] + " = " +
GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()",
"args.stride_x", "args.padding_x") +
";\n";
} else {
c += " int xc" + xs[x] + " = (X +" + xs[x] +
") * args.stride_x + args.padding_x;\n";
}
s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x];
}
for (int y = 0; y < block_size.y; ++y) {
c += " int yc" + ys[y] + " = (Y +" + ys[y] +
") * args.stride_y + args.padding_y;\n";
s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y];
}
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) +
" = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
std::string f_y = is1x1 ? "s" : "filter_offset";
if (different_weights_for_height) {
f_y = "Y * args.src_tensor.Slices() + s";
}
if (!is1x1) {
for (int x = 0; x < block_size.x; ++x) {
c += " int cx" + xs[x] + ";\n";
}
for (int y = 0; y < block_size.y; ++y) {
c += " int cy" + ys[y] + ";\n";
}
c += " int filter_offset = 0;\n";
c += " for (int y = 0; y < args.kernel_size_y; ++y) {\n";
for (int y = 0; y < block_size.y; ++y) {
c += " cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n";
}
if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] +
" < args.src_tensor.Height();\n";
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cy" + ys[y] + " = clamp(cy" + ys[y] +
", 0, args.src_tensor.Height() - 1);\n";
}
}
}
c += " for (int x = 0; x < args.kernel_size_x; ++x) {\n";
for (int x = 0; x < block_size.x; ++x) {
c += " cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n";
}
if (is_buffer) {
for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] +
" < args.src_tensor.Width();\n";
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cx" + xs[x] + " = clamp(cx" + xs[x] +
", 0, args.src_tensor.Width() - 1);\n";
}
}
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute(
" int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + "
"cx$1, (in_x$1 "
"&& "
"in_y$2));\n",
y * block_size.x + x, x, y);
c += absl::Substitute(
" int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
} else {
c += absl::Substitute(
" int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n",
y * block_size.x + x, x, y);
}
}
}
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
}
}
} else if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] +
" < args.src_tensor.Height();\n";
}
for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] +
" < args.src_tensor.Width();\n";
}
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute(
" int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + "
"xc$1, (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
c += absl::Substitute(
" int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
} else {
c += absl::Substitute(
" int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n",
y * block_size.x + x, x, y);
}
}
}
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
}
}
c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
if (is_buffer) {
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index);
c +=
" FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n";
}
} else {
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
" += dz;\n";
}
}
}
}
for (int z = 0; z < block_size.z; ++z) {
c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1);
FLT4 f$3 = args.weights1.Read($0, $1);
FLT4 f$4 = args.weights2.Read($0, $1);
FLT4 f$5 = args.weights3.Read($0, $1);
)",
"Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2,
z * 4 + 3);
}
if (!is_buffer) {
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " +
s_y[y] + ", s);\n";
}
}
}
for (int z = 0; z < block_size.z; ++z) {
for (int i = 0; i < block_size.x * block_size.y; ++i) {
c += " CONV" + zs[z] + "(r" +
std::to_string(i + z * block_size.x * block_size.y) + ", src" +
std::to_string(i) + ");\n";
}
}
if (!is1x1) {
c += " filter_offset++;\n";
}
if (is_buffer) {
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index);
c += " addr_" + id + " += dz_" + id + ";\n";
}
}
}
c += " }\n"; // args.src_tensor.Slices()
if (!is1x1) {
c += " }\n"; // kernel_size_x
c += " }\n"; // kernel_size_y
}
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
for (int z = 0; z < block_size.z; ++z) {
c += " if (Z < args.dst_tensor.Slices()) {\n";
c += " FLT4 bias_val = args.biases.Read(Z);\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string id =
std::to_string((z * block_size.y + y) * block_size.x + x);
c += " {\n";
c += " int xc = " + dst_x + " + " + xs[x] + ";\n";
c += " int yc = " + dst_y + " + " + ys[y] + ";\n";
c += " if (xc < args.dst_tensor.Width() && yc < "
"args.dst_tensor.Height()) {\n";
c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
c += " args.dst_tensor.Write(res, xc, yc, Z);\n";
c += " }\n";
c += " }\n";
}
}
c += " }\n";
c += " Z++;\n";
}
c += "}\n";
return c;
}
void ConvTexture::GenerateCode(const DeviceInfo& device_info) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
device_info.IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, different_weights_for_height_);
if (UseFP16SIMD(device_info, definition_.precision, is1x1)) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
absl::Status ConvTexture::BindArguments() {
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
}
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
return absl::OkStatus();
}
int3 ConvTexture::GetGridSize() const {
const int grid_x =
DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}
void ConvTexture::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
work_groups);
}
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition, attr);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
ConvTexture result(definition);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition);
result.different_weights_for_height_ = true;
result.block_size_ = {4, 1, 2};
result.GenerateCode(device_info);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h (deleted)

@@ -1,193 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
namespace tflite {
namespace gpu {
namespace cl {
// This convolution processes BLOCK_SIZE (X * Y * Z) FLT4 values per thread.
class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
// Move only
ConvTexture(ConvTexture&& operation);
ConvTexture& operator=(ConvTexture&& operation);
ConvTexture(const ConvTexture&) = delete;
ConvTexture& operator=(const ConvTexture&) = delete;
private:
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
friend ConvTexture CreateConvTextureWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr);
explicit ConvTexture(const OperationDef& definition);
template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
void UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
void GenerateCode(const DeviceInfo& device_info);
std::string GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height);
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
// By default, a 2D convolution uses the same weights for the W and H dims,
// but in some cases separate weights are needed per H position; the
// convolution kernel requires only small modifications to support this.
bool different_weights_for_height_;
int3 block_size_ = int3(2, 2, 2);
};
template <DataType T>
void ConvTexture::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition_.GetDataType();
desc.UploadLinearData(biases);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}
template <DataType T>
void ConvTexture::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
UploadWeights(wino_weights);
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(1);
bias.data = {0.0f};
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition_.GetDataType();
desc.UploadLinearData(bias);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}
template <DataType T>
void ConvTexture::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
const int elements_count = dst_depth * src_depth * kernel_x * kernel_y * 4;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
std::vector<uint8_t> data(float4_size * elements_count);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
absl::MakeSpan(ptr, elements_count));
}
const int texture_width = dst_depth;
const int texture_height = src_depth * kernel_x * kernel_y;
const int sub_size = float4_size * texture_width * texture_height;
for (int i = 0; i < 4; ++i) {
Texture2DDescriptor desc;
desc.element_type = data_type;
desc.size = int2(texture_width, texture_height);
desc.data.resize(sub_size);
memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
const std::string name = "weights" + std::to_string(i);
args_.AddObject(name,
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}
}
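// Illustrative sizing (hypothetical shape, not from this file): for weights
// of shape OHWI(8, 3, 3, 16) with block_size_.z == 2, dst_depth =
// DivideRoundUp(8, 4) = 2 (already aligned to 2) and src_depth =
// DivideRoundUp(16, 4) = 4, so elements_count = 2 * 4 * 3 * 3 * 4 = 288
// FLT4 values, split across four textures of 2 x 36 texels each.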
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc (deleted)

@@ -1,107 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
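// For reference, with all-one 2x2x2 weights these expected values are window
// sums over the zero-padded input: output(0,0) covers all four input pixels
// (0+1+2+3+4+5+6+7 = 28), output(0,1) covers pixels (0,1) and (1,1)
// (2+3+6+7 = 18), output(1,0) covers (1,0) and (1,1) (4+5+6+7 = 22), and
// output(1,1) covers only (1,1) (6+7 = 13).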
TEST_F(OpenCLOperationTest, ConvTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

tensorflow/lite/delegates/gpu/cl/selectors/BUILD

@@ -14,7 +14,6 @@ cc_library(
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_common",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_constants",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_weights_converter",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/kernels:work_group_picking",
@@ -82,7 +81,6 @@ cc_library(
deps = [
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
"//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/common:operations",

tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc

@@ -19,7 +19,6 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
@@ -38,8 +37,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
@@ -47,8 +46,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints) {
-ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsAdreno(

tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc

@@ -18,7 +18,6 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
@@ -31,8 +30,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedGeneric(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
@@ -43,8 +43,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedAdreno(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
@@ -71,8 +72,10 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedMali(
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv =
+CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);