ConvPowerVR improvements for FP16 mode.

PiperOrigin-RevId: 265767693
A. Unique TensorFlower 2019-08-27 14:30:37 -07:00 committed by TensorFlower Gardener
parent 28e24ad2f1
commit 7301b746f2
3 changed files with 290 additions and 182 deletions

tensorflow/lite/delegates/gpu/cl/kernels/BUILD

@@ -190,6 +190,7 @@ cc_library(
":gpu_operation",
":util",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:cl_device",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:precision",
"//tensorflow/lite/delegates/gpu/cl:tensor",

tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc

@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
#include <algorithm>
#include <string>
#include <utility>
@@ -27,155 +28,16 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const int3& block_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
bool power_vr = true;
c += "#define SIMD_BARRIER " +
(power_vr ? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(power_vr ? std::string("") : std::string("wait_group_events(1, &E);")) +
"\n";
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" + std::to_string(block_size.z * 4) + "];\n";
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
"src_size.w;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
std::string id = std::to_string(y) + std::to_string(x);
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
}
}
}
c += " int s = 0;\n";
c += " do {\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " ACCUM_FLT4 src" + id + " = convert_float4(src_data[src_a_" +
id + "]);\n";
} else {
c += " FLT4 src" + id + " = src_data[src_a_" + id + "];\n";
}
c += " src_a_" + id + " += src_layer_offset;\n";
} else {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " ACCUM_FLT4 src" + id + " = " +
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
} else {
c += " FLT4 src" + id + " = " +
src_tensor.Read3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
}
}
}
}
c += " SIMD_BARRIER;\n";
c += " event_t e = async_work_group_copy(data, filters_loc, " +
std::to_string(block_size.z * 4) + ", 0);\n";
c += " SIMD_WAIT_EVENT(e);\n";
c += " s += 1;\n";
const std::string channels[] = {"x", "y", "z", "w"};
for (int z = 0; z < block_size.z; ++z) {
for (int ch = 0; ch < 4; ++ch) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string id = std::to_string(y) + std::to_string(x);
c += " r" + std::to_string(z) + id + " += data[" +
std::to_string(z * 4 + ch) + "] * src" + id + "." +
channels[ch] + ";\n";
}
}
}
}
c += " filters_loc += " + std::to_string(block_size.z * 4) + ";\n";
c += " } while (s < src_size.w);\n";
c += " SIMD_BARRIER;\n";
c += " event_t e = async_work_group_copy(data, biases + Z, " +
std::to_string(block_size.z) + ", 0);\n";
c += " SIMD_WAIT_EVENT(e);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
c += " return;\n";
c += " }\n";
for (int z = 0; z < block_size.z; ++z) {
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string xs = "X + " + std::to_string(x);
const std::string ys = "Y + " + std::to_string(y);
const std::string zs = "Z + " + std::to_string(z);
const std::string r_id =
std::to_string(z) + std::to_string(y) + std::to_string(x);
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
std::to_string(z) + "]);\n";
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
c += PostProcess(linked_operations, "res", zs, "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
}
c += "}\n";
return c;
}
} // namespace
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const int3& block_size)
const ConvParams& conv_params)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
block_size_(block_size),
work_group_size_(8, 4, 1) {}
conv_params_(conv_params) {}
ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
: GPUOperation(std::move(operation)),
@@ -185,9 +47,8 @@ ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
block_size_(operation.block_size_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
conv_params_(operation.conv_params_),
kernel_(std::move(operation.kernel_)) {}
ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
if (this != &operation) {
@@ -197,9 +58,8 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(block_size_, operation.block_size_);
std::swap(conv_params_, operation.conv_params_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
@@ -208,7 +68,7 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
Status ConvPowerVR::Compile(const CreationContext& creation_context) {
const std::string code = GenerateConvPowerVR1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, block_size_, linked_operations_);
definition_.precision, conv_params_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
@@ -232,19 +92,210 @@ Status ConvPowerVR::BindArguments() {
}
int3 ConvPowerVR::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), block_size_.x);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Depth(), block_size_.z);
const int wg_x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
const int wg_y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
const int wg_z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
return int3(wg_z * work_group_size_.x, wg_x * work_group_size_.y,
wg_y * work_group_size_.z);
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
const int wg_x =
IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
const int wg_y =
IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
const int wg_z =
IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg_z * conv_params_.work_group_size.x,
wg_x * conv_params_.work_group_size.y,
wg_y * conv_params_.work_group_size.z);
}
Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
return queue->DispatchImplicit(kernel_, GetGridSize(),
conv_params_.work_group_size);
}
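Note that the grid returned above is deliberately permuted: depth blocks ride on grid axis 0, which is why the generated kernel derives Z from get_group_id(0) and X/Y from axes 1 and 2. A worked example with hypothetical sizes (values invented, not taken from the diff):

#include <cstdio>
// Re-derivation of ConvPowerVR::GetGridSize for one invented shape, to show
// the axis permutation; DivideRoundUp mirrors IntegralDivideRoundUp.
static int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }
int main() {
  const int width = 33, height = 17, depth_slices = 20;  // dst tensor
  const int block[3] = {2, 1, 4};  // conv_params_.block_size
  const int wgs[3] = {4, 8, 1};    // conv_params_.work_group_size
  const int grid_x = DivideRoundUp(width, block[0]);         // 17
  const int grid_y = DivideRoundUp(height, block[1]);        // 17
  const int grid_z = DivideRoundUp(depth_slices, block[2]);  // 5
  const int wg_x = DivideRoundUp(grid_x, wgs[0]);            // 5
  const int wg_y = DivideRoundUp(grid_y, wgs[1]);            // 3
  const int wg_z = DivideRoundUp(grid_z, wgs[2]);            // 5
  // Same permutation as the diff: (wg_z * wgs.x, wg_x * wgs.y, wg_y * wgs.z).
  printf("grid = (%d, %d, %d)\n",
         wg_z * wgs[0], wg_x * wgs[1], wg_y * wgs[2]);  // grid = (20, 40, 3)
}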
std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvPowerVR::ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
c += "#define SIMD_BARRIER " +
(!conv_params.explicit_sync
? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(!conv_params.explicit_sync ? std::string("")
: std::string("wait_group_events(1, &E);")) +
"\n";
const int3 work_group_size = conv_params.work_group_size;
const int3 block_size = conv_params.block_size;
c += "__attribute__((reqd_work_group_size(" +
std::to_string(work_group_size.x) + ", " +
std::to_string(work_group_size.y) + ", " +
std::to_string(work_group_size.z) + ")))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" +
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
"];\n";
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
"src_size.w;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
std::string id = std::to_string(y) + std::to_string(x);
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
}
}
}
auto declare_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " ACCUM_FLT4 src" + id + ";\n";
} else {
c += " FLT4 src" + id + ";\n";
}
}
}
};
auto read_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = convert_float4(src_data[src_a_" + id +
"]);\n";
} else {
c += " src" + id + " = src_data[src_a_" + id + "];\n";
}
c += " src_a_" + id + " += src_layer_offset;\n";
} else {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = " +
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
} else {
c += " src" + id + " = " +
src_tensor.Read3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
}
}
}
}
};
auto conv_core = [&]() {
const std::string channels[] = {"x", "y", "z", "w"};
for (int z = 0; z < block_size.z; ++z) {
for (int ch = 0; ch < 4; ++ch) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string id = std::to_string(y) + std::to_string(x);
c += " r" + std::to_string(z) + id + " += data[" +
std::to_string(z * 4 + ch) + "] * src" + id + "." +
channels[ch] + ";\n";
}
}
}
}
};
c += " int s = 0;\n";
c += " do {\n";
declare_src();
c += " SIMD_BARRIER;\n";
c += " event_t e = async_work_group_copy(data, filters_loc, " +
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
", 0);\n";
read_src();
c += " SIMD_WAIT_EVENT(e);\n";
c += " s += 1;\n";
conv_core();
for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
read_src();
conv_core();
c += " s += 1;\n";
}
c += " filters_loc += " +
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
";\n";
c += " } while (s < src_size.w);\n";
c += " SIMD_BARRIER;\n";
c += " event_t e = async_work_group_copy(data, biases + Z, " +
std::to_string(block_size.z) + ", 0);\n";
c += " SIMD_WAIT_EVENT(e);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
c += " return;\n";
c += " }\n";
for (int z = 0; z < block_size.z; ++z) {
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string xs = "X + " + std::to_string(x);
const std::string ys = "Y + " + std::to_string(y);
const std::string zs = "Z + " + std::to_string(z);
const std::string r_id =
std::to_string(z) + std::to_string(y) + std::to_string(x);
bool need_x_check = x != 0;
bool need_y_check = y != 0;
if (need_x_check && need_y_check) {
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
} else if (need_x_check && !need_y_check) {
c += " if (" + xs + " < dst_size.x) {\n";
} else if (!need_x_check && need_y_check) {
c += " if (" + ys + " < dst_size.y) {\n";
} else {
c += " {\n";
}
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
std::to_string(z) + "]);\n";
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
c += PostProcess(linked_operations, "res", zs, "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
}
c += "}\n";
return c;
}
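For orientation, here is a hand-written approximation of the OpenCL source the generator above emits in the simplest configuration: block_size (1, 1, 1), src_depth_loop_size 1, F32 precision, BUFFER storage, no linked operations, explicit_sync off. This is a sketch, not literal output; the pieces produced by TensorCodeGenerator (declarations, addressing) are inlined by hand.

// Hand-written sketch of the emitted kernel under the assumptions above,
// stored the way the generator would return it: as one C++ string.
const char* kSketchOfGeneratedKernel = R"(
__attribute__((reqd_work_group_size(8, 4, 1)))
__kernel void main_function(
    __global float4* src_data,
    __global float4* filters_buffer,
    __global float4* biases,
    __global float4* dst_data,
    int4 src_size,
    int4 dst_size) {
  // The 8 and 4 below mirror the default work group size (8, 4, 1).
  int X = (get_group_id(1) * 8 + get_local_id(0)) * 1;
  int Y = (get_group_id(2) * 4 + get_local_id(1)) * 1;
  int Z = (get_group_id(0) * 1 + get_local_id(2)) * 1;
  float4 r000 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
  __local float4 data[4];
  __global float4* filters_loc = filters_buffer + Z * 4 * src_size.w;
  const int src_layer_offset = src_size.x * src_size.y;
  int src_a_00 = min(Y, src_size.y - 1) * src_size.x + min(X, src_size.x - 1);
  int s = 0;
  do {
    float4 src00;
    /* SIMD_BARRIER expands to nothing when explicit_sync is off (PowerVR);
       otherwise it is barrier(CLK_LOCAL_MEM_FENCE). */
    event_t e = async_work_group_copy(data, filters_loc, 4, 0);
    src00 = src_data[src_a_00];
    src_a_00 += src_layer_offset;
    /* SIMD_WAIT_EVENT(e) is likewise empty here, or wait_group_events. */
    s += 1;
    r000 += data[0] * src00.x;
    r000 += data[1] * src00.y;
    r000 += data[2] * src00.z;
    r000 += data[3] * src00.w;
    filters_loc += 4;
  } while (s < src_size.w);
  /* Bias fetch; again no wait needed within a PowerVR SIMD. */
  event_t e2 = async_work_group_copy(data, biases + Z, 1, 0);
  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;
  float4 res = r000 + data[0];
  dst_data[(Z * dst_size.y + Y) * dst_size.x + X] = res;
}
)";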
bool IsConvPowerVRSupported(const OperationDef& definition,
@@ -255,25 +306,61 @@ bool IsConvPowerVRSupported(const OperationDef& definition,
attr.padding.appended == HW(0, 0);
}
ConvPowerVR::ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvPowerVR::ConvParams conv_params;
conv_params.block_size = int3(1, 1, 4);
conv_params.work_group_size = int3(8, 4, 1);
conv_params.src_depth_loop_size = 1;
conv_params.explicit_sync = !device.IsPowerVR();
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
conv_params.block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
conv_params.block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
conv_params.block_size.z = 2;
} else {
conv_params.block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
conv_params.block_size.z = std::min(4, conv_params.block_size.z);
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
conv_params.src_depth_loop_size = 4;
}
if (conv_params.block_size.z == 1) {
if (src_depth % 8 == 0) {
conv_params.src_depth_loop_size = 8;
}
if (src_depth % 4 == 0) {
conv_params.src_depth_loop_size = 4;
}
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth <= 8) {
conv_params.src_depth_loop_size = src_depth;
}
}
conv_params.block_size.x = 2;
conv_params.work_group_size = int3(4, 8, 1);
}
return conv_params;
}
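Tracing the heuristic for one hypothetical layer makes the interplay of the branches clearer (all values invented for illustration):

// Hypothetical trace of GuessBestParams: 1x1 conv, 48 input channels,
// 96 output channels, F16 precision, PowerVR device.
//   dst_depth = ceil(96 / 4) = 24, src_depth = ceil(48 / 4) = 12
//   24 % 8 == 0            -> block_size.z = 8
//   F16                    -> block_size.z = min(4, 8) = 4
//   12 % 2 == 0            -> src_depth_loop_size = 2
//   12 % 4 == 0 but z > 2  -> loop size stays 2
//   z != 1                 -> the z == 1 refinements are skipped
//   F16 branch also sets      block_size.x = 2, work_group_size (4, 8, 1)
//   IsPowerVR()            -> explicit_sync = false
// Result: block_size (2, 1, 4), work_group_size (4, 8, 1),
// src_depth_loop_size 2, i.e. two filter slices per local-memory refill.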
Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result) {
int3 block_size = int3(1, 1, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
block_size.z = 2;
} else {
block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
block_size.y = 2;
}
*result = ConvPowerVR(definition, attr, block_size);
*result =
ConvPowerVR(definition, attr,
GuessBestParams(*creation_context.device, definition, attr));
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;

tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h

@@ -19,6 +19,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
@@ -49,12 +50,16 @@ class ConvPowerVR : public GPUOperation {
ConvPowerVR& operator=(const ConvPowerVR&) = delete;
private:
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);
struct ConvParams {
int3 block_size;
int3 work_group_size;
int src_depth_loop_size;
bool explicit_sync;
};
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const int3& block_size);
const Convolution2DAttributes& attr,
const ConvParams& conv_params);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
@@ -62,6 +67,21 @@ class ConvPowerVR : public GPUOperation {
void RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);
friend std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations);
friend ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);
Status BindArguments();
int3 GetGridSize() const;
@@ -72,10 +92,9 @@ class ConvPowerVR : public GPUOperation {
int2 stride_;
int2 padding_;
int2 dilation_;
int3 block_size_;
ConvParams conv_params_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
@@ -87,7 +106,7 @@ Status ConvPowerVR::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const bool f32_weights = definition_.precision != CalculationsPrecision::F16;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
const int dst_depth_aligned = AlignByN(dst_depth, block_size_.z);
const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
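As a sizing sanity check for the buffer this hunk allocates, a runnable trace with an invented layer (F16 weights, so float4_size is sizeof(half4) == 8; values not from the diff):

#include <cstdio>
// Invented example: 1x1 kernel, 48 input / 96 output channels,
// conv_params_.block_size.z == 4.
static int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }
static int AlignByN(int n, int align) { return DivideRoundUp(n, align) * align; }
int main() {
  const int o = 96, i = 48, h = 1, w = 1, block_z = 4;
  const int dst_depth = DivideRoundUp(o, 4);                   // 24
  const int src_depth = DivideRoundUp(i, 4);                   // 12
  const int dst_depth_aligned = AlignByN(dst_depth, block_z);  // 24
  const int elements_count = h * w * src_depth * dst_depth_aligned * 4;
  printf("%d float4 elements, %d bytes\n", elements_count,
         elements_count * 8);  // 1152 float4 elements, 9216 bytes
}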
@@ -113,16 +132,17 @@ void ConvPowerVR::RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
const int kernel_y = weights.shape.h;
int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, block_size_.z); ++d) {
for (int d = 0;
d < IntegralDivideRoundUp(dst_depth, conv_params_.block_size.z); ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int s = 0; s < src_depth; ++s) {
for (int k = 0; k < block_size_.z; ++k) {
for (int k = 0; k < conv_params_.block_size.z; ++k) {
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = (d * block_size_.z + k) * 4 + i;
const int d_ch = (d * conv_params_.block_size.z + k) * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});