ConvPowerVR improvements for FP16 mode.
PiperOrigin-RevId: 265767693
Commit: 7301b746f2
Parent: 28e24ad2f1
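The diff below replaces ConvPowerVR's single `block_size` parameter with a `ConvParams` struct (block size, work-group size, source-depth loop unrolling, explicit synchronization) and adds a `GuessBestParams` heuristic that tunes these values, most aggressively for FP16. The following standalone C++ sketch restates that selection logic outside the TFLite sources so it is easier to follow; `Int3`, `ConvParams`, and `GuessBestParamsSketch` are simplified stand-ins mirroring the names in the diff, not the upstream implementation.

```cpp
#include <algorithm>

// Simplified stand-ins for the TFLite GPU types used in the diff.
struct Int3 { int x, y, z; };
struct ConvParams {
  Int3 block_size{1, 1, 4};       // output elements computed per work-item
  Int3 work_group_size{8, 4, 1};  // OpenCL work-group shape
  int src_depth_loop_size = 1;    // unroll factor over source depth slices
  bool explicit_sync = true;      // emit barrier()/wait_group_events() calls
};

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Mirrors the selection logic of GuessBestParams in the diff (assumption:
// 'is_power_vr' and 'f16' replace the CLDevice / precision queries).
ConvParams GuessBestParamsSketch(bool is_power_vr, bool f16,
                                 int output_channels, int input_channels) {
  ConvParams p;
  p.explicit_sync = !is_power_vr;  // barriers can be elided on PowerVR
  const int dst_depth = DivideRoundUp(output_channels, 4);
  const int src_depth = DivideRoundUp(input_channels, 4);
  if (dst_depth % 8 == 0 || dst_depth >= 32) {
    p.block_size.z = 8;
  } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
    p.block_size.z = 4;
  } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
    p.block_size.z = 2;
  } else {
    p.block_size.z = dst_depth;
  }
  if (f16) {
    // FP16: smaller Z block, wider X block, unrolling over source depth.
    p.block_size.z = std::min(4, p.block_size.z);
    if (src_depth % 2 == 0) p.src_depth_loop_size = 2;
    if (src_depth % 4 == 0 && p.block_size.z <= 2) p.src_depth_loop_size = 4;
    if (p.block_size.z == 1) {
      if (src_depth % 8 == 0) p.src_depth_loop_size = 8;
      if (src_depth % 4 == 0) p.src_depth_loop_size = 4;
      if (src_depth % 2 == 0) p.src_depth_loop_size = 2;
      if (src_depth <= 8) p.src_depth_loop_size = src_depth;
    }
    p.block_size.x = 2;
    p.work_group_size = {4, 8, 1};
  }
  return p;
}
```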
@@ -190,6 +190,7 @@ cc_library(
":gpu_operation",
":util",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:cl_device",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:precision",
"//tensorflow/lite/delegates/gpu/cl:tensor",

@@ -15,6 +15,7 @@ limitations under the License.

#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"

#include <algorithm>
#include <string>
#include <utility>

@@ -27,155 +28,16 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace cl {
namespace {

std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const int3& block_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);

bool power_vr = true;
c += "#define SIMD_BARRIER " +
(power_vr ? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(power_vr ? std::string("") : std::string("wait_group_events(1, &E);")) +
"\n";
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" + std::to_string(block_size.z * 4) + "];\n";
|
||||
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
|
||||
"src_size.w;\n";
|
||||
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
|
||||
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
|
||||
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " int s = 0;\n";
|
||||
c += " do {\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
if (precision == CalculationsPrecision::F32_F16) {
|
||||
c += " ACCUM_FLT4 src" + id + " = convert_float4(src_data[src_a_" +
|
||||
id + "]);\n";
|
||||
} else {
|
||||
c += " FLT4 src" + id + " = src_data[src_a_" + id + "];\n";
|
||||
}
|
||||
c += " src_a_" + id + " += src_layer_offset;\n";
|
||||
} else {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
if (precision == CalculationsPrecision::F32_F16) {
|
||||
c += " ACCUM_FLT4 src" + id + " = " +
|
||||
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
|
||||
"Y + " + std::to_string(y), "s",
|
||||
TextureAddressMode::DONT_CARE) +
|
||||
";\n";
|
||||
} else {
|
||||
c += " FLT4 src" + id + " = " +
|
||||
src_tensor.Read3D("X + " + std::to_string(x),
|
||||
"Y + " + std::to_string(y), "s",
|
||||
TextureAddressMode::DONT_CARE) +
|
||||
";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, filters_loc, " +
|
||||
std::to_string(block_size.z * 4) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " s += 1;\n";
|
||||
const std::string channels[] = {"x", "y", "z", "w"};
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
for (int ch = 0; ch < 4; ++ch) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
c += " r" + std::to_string(z) + id + " += data[" +
|
||||
std::to_string(z * 4 + ch) + "] * src" + id + "." +
|
||||
channels[ch] + ";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " filters_loc += " + std::to_string(block_size.z * 4) + ";\n";
|
||||
c += " } while (s < src_size.w);\n";
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, biases + Z, " +
|
||||
std::to_string(block_size.z) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
|
||||
c += " return;\n";
|
||||
c += " }\n";
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
const std::string xs = "X + " + std::to_string(x);
|
||||
const std::string ys = "Y + " + std::to_string(y);
|
||||
const std::string zs = "Z + " + std::to_string(z);
|
||||
const std::string r_id =
|
||||
std::to_string(z) + std::to_string(y) + std::to_string(x);
|
||||
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
|
||||
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
|
||||
std::to_string(z) + "]);\n";
|
||||
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
|
||||
c += PostProcess(linked_operations, "res", zs, "address");
|
||||
c += " " + dst_tensor.Write3D("res", "address") + "\n";
|
||||
c += " }\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += "}\n";
|
||||
return c;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const int3& block_size)
const ConvParams& conv_params)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
block_size_(block_size),
work_group_size_(8, 4, 1) {}
conv_params_(conv_params) {}

ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
: GPUOperation(std::move(operation)),
@@ -185,9 +47,8 @@ ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
block_size_(operation.block_size_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
conv_params_(operation.conv_params_),
kernel_(std::move(operation.kernel_)) {}

ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
if (this != &operation) {
@@ -197,9 +58,8 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(block_size_, operation.block_size_);
std::swap(conv_params_, operation.conv_params_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
@@ -208,7 +68,7 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
Status ConvPowerVR::Compile(const CreationContext& creation_context) {
const std::string code = GenerateConvPowerVR1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, block_size_, linked_operations_);
definition_.precision, conv_params_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
@@ -232,19 +92,210 @@ Status ConvPowerVR::BindArguments() {
}

int3 ConvPowerVR::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), block_size_.x);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Depth(), block_size_.z);
const int wg_x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
const int wg_y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
const int wg_z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
return int3(wg_z * work_group_size_.x, wg_x * work_group_size_.y,
wg_y * work_group_size_.z);
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
const int wg_x =
IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
const int wg_y =
IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
const int wg_z =
IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg_z * conv_params_.work_group_size.x,
wg_x * conv_params_.work_group_size.y,
wg_y * conv_params_.work_group_size.z);
}

Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
return queue->DispatchImplicit(kernel_, GetGridSize(),
conv_params_.work_group_size);
}

std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvPowerVR::ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);

c += "#define SIMD_BARRIER " +
(!conv_params.explicit_sync
? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(!conv_params.explicit_sync ? std::string("")
: std::string("wait_group_events(1, &E);")) +
"\n";
const int3 work_group_size = conv_params.work_group_size;
const int3 block_size = conv_params.block_size;
c += "__attribute__((reqd_work_group_size(" +
std::to_string(work_group_size.x) + ", " +
std::to_string(work_group_size.y) + ", " +
std::to_string(work_group_size.z) + ")))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" +
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
"];\n";
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
"src_size.w;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
std::string id = std::to_string(y) + std::to_string(x);
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
}
}
}

auto declare_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " ACCUM_FLT4 src" + id + ";\n";
} else {
c += " FLT4 src" + id + ";\n";
}
}
}
};
auto read_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = convert_float4(src_data[src_a_" + id +
"]);\n";
} else {
c += " src" + id + " = src_data[src_a_" + id + "];\n";
}
c += " src_a_" + id + " += src_layer_offset;\n";
} else {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = " +
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
} else {
c += " src" + id + " = " +
src_tensor.Read3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
}
}
}
}
};
auto conv_core = [&]() {
const std::string channels[] = {"x", "y", "z", "w"};
for (int z = 0; z < block_size.z; ++z) {
for (int ch = 0; ch < 4; ++ch) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string id = std::to_string(y) + std::to_string(x);
c += " r" + std::to_string(z) + id + " += data[" +
std::to_string(z * 4 + ch) + "] * src" + id + "." +
channels[ch] + ";\n";
}
}
}
}
};

c += " int s = 0;\n";
|
||||
c += " do {\n";
|
||||
declare_src();
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, filters_loc, " +
|
||||
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
|
||||
", 0);\n";
|
||||
read_src();
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " s += 1;\n";
|
||||
conv_core();
|
||||
for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
|
||||
read_src();
|
||||
conv_core();
|
||||
c += " s += 1;\n";
|
||||
}
|
||||
c += " filters_loc += " +
|
||||
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
|
||||
";\n";
|
||||
c += " } while (s < src_size.w);\n";
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, biases + Z, " +
|
||||
std::to_string(block_size.z) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
|
||||
c += " return;\n";
|
||||
c += " }\n";
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
const std::string xs = "X + " + std::to_string(x);
|
||||
const std::string ys = "Y + " + std::to_string(y);
|
||||
const std::string zs = "Z + " + std::to_string(z);
|
||||
const std::string r_id =
|
||||
std::to_string(z) + std::to_string(y) + std::to_string(x);
|
||||
bool need_x_check = x != 0;
|
||||
bool need_y_check = y != 0;
|
||||
if (need_x_check && need_y_check) {
|
||||
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
|
||||
} else if (need_x_check && !need_y_check) {
|
||||
c += " if (" + xs + " < dst_size.x) {\n";
|
||||
} else if (!need_x_check && need_y_check) {
|
||||
c += " if (" + ys + " < dst_size.y) {\n";
|
||||
} else {
|
||||
c += " {\n";
|
||||
}
|
||||
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
|
||||
std::to_string(z) + "]);\n";
|
||||
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
|
||||
c += PostProcess(linked_operations, "res", zs, "address");
|
||||
c += " " + dst_tensor.Write3D("res", "address") + "\n";
|
||||
c += " }\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += "}\n";
|
||||
return c;
|
||||
}
|
||||
|
||||
bool IsConvPowerVRSupported(const OperationDef& definition,
@@ -255,25 +306,61 @@ bool IsConvPowerVRSupported(const OperationDef& definition,
attr.padding.appended == HW(0, 0);
}

ConvPowerVR::ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvPowerVR::ConvParams conv_params;
conv_params.block_size = int3(1, 1, 4);
conv_params.work_group_size = int3(8, 4, 1);
conv_params.src_depth_loop_size = 1;
conv_params.explicit_sync = !device.IsPowerVR();
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
conv_params.block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
conv_params.block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
conv_params.block_size.z = 2;
} else {
conv_params.block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
conv_params.block_size.z = std::min(4, conv_params.block_size.z);
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
conv_params.src_depth_loop_size = 4;
}
if (conv_params.block_size.z == 1) {
if (src_depth % 8 == 0) {
conv_params.src_depth_loop_size = 8;
}
if (src_depth % 4 == 0) {
conv_params.src_depth_loop_size = 4;
}
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth <= 8) {
conv_params.src_depth_loop_size = src_depth;
}
}
conv_params.block_size.x = 2;
conv_params.work_group_size = int3(4, 8, 1);
}

return conv_params;
}

Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result) {
int3 block_size = int3(1, 1, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
block_size.z = 2;
} else {
block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
block_size.y = 2;
}
*result = ConvPowerVR(definition, attr, block_size);
*result =
ConvPowerVR(definition, attr,
GuessBestParams(*creation_context.device, definition, attr));
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
@@ -19,6 +19,7 @@ limitations under the License.
#include <vector>

#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
@@ -49,12 +50,16 @@ class ConvPowerVR : public GPUOperation {
ConvPowerVR& operator=(const ConvPowerVR&) = delete;

private:
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);
struct ConvParams {
int3 block_size;
int3 work_group_size;
int src_depth_loop_size;
bool explicit_sync;
};

ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const int3& block_size);
const Convolution2DAttributes& attr,
const ConvParams& conv_params);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
@@ -62,6 +67,21 @@ class ConvPowerVR : public GPUOperation {
void RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);

friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);

friend std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations);

friend ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);

Status BindArguments();
int3 GetGridSize() const;

@@ -72,10 +92,9 @@ class ConvPowerVR : public GPUOperation {
int2 stride_;
int2 padding_;
int2 dilation_;
int3 block_size_;
ConvParams conv_params_;

CLKernel kernel_;
int3 work_group_size_;
};

template <DataType T>
@@ -87,7 +106,7 @@ Status ConvPowerVR::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const bool f32_weights = definition_.precision != CalculationsPrecision::F16;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

const int dst_depth_aligned = AlignByN(dst_depth, block_size_.z);
const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;

@@ -113,16 +132,17 @@ void ConvPowerVR::RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
const int kernel_y = weights.shape.h;

int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, block_size_.z); ++d) {
for (int d = 0;
d < IntegralDivideRoundUp(dst_depth, conv_params_.block_size.z); ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int s = 0; s < src_depth; ++s) {
for (int k = 0; k < block_size_.z; ++k) {
for (int k = 0; k < conv_params_.block_size.z; ++k) {
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = (d * block_size_.z + k) * 4 + i;
const int d_ch = (d * conv_params_.block_size.z + k) * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});