Rename IntegralDivideRoundUp to DivideRoundUp.

PiperOrigin-RevId: 307447663
Change-Id: I1e0f6c9f058e3f0457a7522f1d10f7da8ab8610d
Juhyun Lee authored 2020-04-20 12:02:49 -07:00, committed by TensorFlower Gardener
parent 55c4d9e49c
commit b26b6b5669
80 changed files with 435 additions and 452 deletions
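The renamed helper is integer ceiling division. For reference, here is a minimal sketch of the intended semantics (illustrative only; the real utility lives in the TFLite GPU util headers touched by this change):

// Ceiling division: the smallest k such that k * divisor >= n, for n >= 0.
int DivideRoundUp(int n, int divisor) { return (n + divisor - 1) / divisor; }

// Examples matching the call sites below, where depth := DivideRoundUp(channels, 4):
//   DivideRoundUp(3, 4) == 1   (3 channels -> 1 slice of 4)
//   DivideRoundUp(8, 4) == 2
//   DivideRoundUp(9, 4) == 3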

View File

@@ -54,7 +54,7 @@ namespace gpu {
 // H - height
 // W - width
 // C - channels
-// D - depth := IntegralDivideRoundUp(C, 4)
+// D - depth := DivideRoundUp(C, 4)
 // C4 - is the constant = 4.
 enum class DataLayout {
   UNKNOWN,
@@ -164,7 +164,7 @@ struct Dimensions {
   Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
       : b(batch), h(height), w(width), c(channels) {}
-  int32_t d() const { return IntegralDivideRoundUp(c, 4); }
+  int32_t d() const { return DivideRoundUp(c, 4); }
   int32_t product() const { return b * h * w * c; }
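As a quick illustration of the accessor in the hunk above, a stripped-down stand-in for Dimensions (hypothetical, reusing the DivideRoundUp sketch from the top of this page; the real struct is the one shown in the diff):

struct Dimensions {
  int b, h, w, c;
  int d() const { return DivideRoundUp(c, 4); }   // number of 4-channel slices
  int product() const { return b * h * w * c; }
};

// Dimensions{1, 224, 224, 3}.d()  == 1   (3 channels pack into a single slice)
// Dimensions{1, 224, 224, 32}.d() == 8   (32 channels -> 8 slices)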

View File

@@ -70,10 +70,10 @@ std::string Add::GetElementWiseCode(
 Add::Add(const OperationDef& definition, const std::vector<int>& channels,
          int dst_channels)
     : ElementwiseOperation(definition),
-      dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
+      dst_depth_(DivideRoundUp(dst_channels, 4)) {
   src_depthes_.resize(channels.size());
   for (int i = 0; i < channels.size(); ++i) {
-    src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
+    src_depthes_[i] = DivideRoundUp(channels[i], 4);
   }
 }

View File

@@ -79,7 +79,7 @@ std::string GetConcatKernelCode(
   // generation.
   c += " int Z = 0;\n";
   for (int i = 0; i < channels.size(); ++i) {
-    const int depth = IntegralDivideRoundUp(channels[i], 4);
+    const int depth = DivideRoundUp(channels[i], 4);
     if (depth % 2 == 0) {
       // We can read more at once inside of loop in case depth % 2 == 0
       // it should be better for reading latency hiding
@@ -112,7 +112,7 @@ std::string GetConcatKernelCode(
   int read_index = 0;
   int z = 0;
   for (int i = 0; i < channels.size(); ++i) {
-    const int depth = IntegralDivideRoundUp(channels[i], 4);
+    const int depth = DivideRoundUp(channels[i], 4);
     for (int d = 0; d < depth; ++d) {
       const int channels_in_group = std::min(4, channels[i] - d * 4);
       const std::string temp_name = "t" + std::to_string(read_index);

View File

@@ -128,24 +128,24 @@ absl::Status Conv3D::BindArguments() {
     RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
   }
   RETURN_IF_ERROR(kernel_.SetBytesAuto(
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
   return absl::OkStatus();
 }
 
 int3 Conv3D::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
-                                           conv_params_.block_size.x);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                   conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
-      IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
+      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
   int3 wg;
-  wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
-  wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
-  wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
+  wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
+  wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
+  wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
   return int3(wg[conv_params_.work_group_launch_order[0]] *
                   conv_params_.work_group_size.x,
               wg[conv_params_.work_group_launch_order[1]] *
@@ -885,8 +885,8 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
 Conv3D::ConvParams Conv3D::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution3DAttributes& attr) const {
-  const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                              attr.dilations.w == 1 &&
                              attr.padding.prepended.w == 0 &&

View File

@@ -147,8 +147,8 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
                                    CLContext* context) {
   const int block_size = conv_params_.block_size.w;
   const int dst_slices =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;
@@ -219,8 +219,8 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
                                   absl::Span<T> dst) {
   const int block_size = conv_params_.block_size.w;
   const int dst_slices =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;

View File

@@ -307,7 +307,7 @@ absl::Status ConvBuffer1x1::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
   RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
-  const int src_width_elements = IntegralDivideRoundUp(
+  const int src_width_elements = DivideRoundUp(
       src_[0]->Width() * src_[0]->Batch(), (conv_params_.element_size / 4));
   int4 src_size = int4(src_width_elements, src_[0]->Height(), src_[0]->Slices(),
                        src_width_elements * src_[0]->Height());
@@ -317,14 +317,14 @@ absl::Status ConvBuffer1x1::BindArguments() {
 }
 
 int3 ConvBuffer1x1::GetGridSize() const {
-  const int dst_width_elements = IntegralDivideRoundUp(
+  const int dst_width_elements = DivideRoundUp(
       dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
   const int grid_x =
-      IntegralDivideRoundUp(dst_width_elements, conv_params_.block_size.x);
+      DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
   return int3(grid_x, grid_y, grid_z);
 }
@@ -358,8 +358,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
   if (!IsConvBuffer1x1Supported(definition, attr)) {
     return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
   }
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,
@@ -376,8 +376,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                  const OperationDef& definition,
                                  const FullyConnectedAttributes& attr,
                                  ConvBuffer1x1* result, const BHWC* shape) {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,
@@ -396,8 +396,8 @@ absl::Status CreateConvBuffer1x1Wino4x4To6x6(
     const CreationContext& creation_context, const OperationDef& definition,
     const Convolution2DAttributes& attr, ConvBuffer1x1* result,
     const BHWC* shape) {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,

View File

@@ -135,8 +135,8 @@ absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvBuffer1x1::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
   const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

View File

@@ -40,9 +40,9 @@ std::string GenerateConvolutionConstantCode(
   std::string c = GetCommonDefines(op_def.precision);
-  const int out_z = IntegralDivideRoundUp(dst_channels, 4);
+  const int out_z = DivideRoundUp(dst_channels, 4);
   const std::string kOutZ = std::to_string(out_z);
-  const int src_depth = IntegralDivideRoundUp(src_channels, 4);
+  const int src_depth = DivideRoundUp(src_channels, 4);
   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
   const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
@@ -290,7 +290,7 @@ bool IsConvConstantsSupported(const CLDevice& device,
                                   : sizeof(half);
   const int filters_buffer_size = filters_count * float_size;
   const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
-  const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
+  const int flt4_registers = DivideRoundUp(w_shape.o, 4);
   return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
 }

View File

@@ -88,7 +88,7 @@ class ConvConstants : public GPUOperation {
 template <DataType T>
 absl::Status ConvConstants::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -112,8 +112,8 @@ absl::Status ConvConstants::UploadWeights(
 template <DataType S, typename T>
 void ConvConstants::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -205,8 +205,8 @@ absl::Status ConvPowerVR::BindArguments() {
         kernel_dilation_.z * src_[0]->Batch(), kernel_dilation_.w)));
   }
   if (conv_params_.linear_hw) {
-    const int grid_x = IntegralDivideRoundUp(
-        dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
+    const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                     conv_params_.block_size.x);
     RETURN_IF_ERROR(kernel_.SetBytesAuto(grid_x));
   }
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
@@ -215,27 +215,26 @@ absl::Status ConvPowerVR::BindArguments() {
 }
 
 int3 ConvPowerVR::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
-                                           conv_params_.block_size.x);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                   conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
   int3 wg;
   if (conv_params_.linear_hw) {
-    wg.x =
-        IntegralDivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
-    wg.y = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.y);
+    wg.x = DivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
+    wg.y = DivideRoundUp(grid_z, conv_params_.work_group_size.y);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
                     conv_params_.work_group_size.y,
                 1);
   } else {
-    wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
-    wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
-    wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
+    wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
+    wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
+    wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
@@ -808,8 +807,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                              attr.dilations.w == 1 &&
                              attr.padding.prepended.w == 0 &&
@@ -825,8 +824,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const FullyConnectedAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, false, dst_shape);
   params.work_group_size.x *= params.work_group_size.y;
@@ -839,8 +838,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, true, dst_shape);
   params.block_size.x *= params.block_size.y;
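A worked example of the grid and work-group arithmetic in ConvPowerVR::GetGridSize() above, with made-up numbers; the block and work-group values below are hypothetical stand-ins for the conv_params_ fields, not values from this commit (non-linear_hw path shown):

#include <cstdio>

// Purely illustrative ceiling division, matching the renamed helper.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  const int width = 56, height = 56, slices = 10, batch = 1;
  const int block_x = 2, block_y = 1, block_z = 2;  // block_size stand-in
  const int wg_x = 8, wg_y = 4, wg_z = 1;           // work_group_size stand-in

  const int grid_x = DivideRoundUp(width * batch, block_x);  // 28
  const int grid_y = DivideRoundUp(height, block_y);         // 56
  const int grid_z = DivideRoundUp(slices, block_z);         // 5

  // Work groups per axis, before the launch-order permutation:
  std::printf("%d x %d x %d\n", DivideRoundUp(grid_x, wg_x),  // 4
              DivideRoundUp(grid_y, wg_y),                    // 14
              DivideRoundUp(grid_z, wg_z));                   // 5
  return 0;
}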

View File

@@ -188,8 +188,8 @@ absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvPowerVR::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
   const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

View File

@@ -433,9 +433,9 @@ absl::Status ConvTexture::BindArguments() {
 int3 ConvTexture::GetGridSize() const {
   const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
+      DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -148,9 +148,9 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvTexture::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  int dst_depth = DivideRoundUp(weights.shape.o, 4);
   dst_depth = AlignByN(dst_depth, block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -206,9 +206,9 @@ template <DataType S, typename T>
 void ConvTexture::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
     absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
-  int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  int dst_depth = DivideRoundUp(weights.shape.o, 4);
   dst_depth = AlignByN(dst_depth, block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -322,7 +322,7 @@ ConvolutionTransposed::ConvolutionTransposed(
       block_size_ = is_f16 ? int3(2, 2, 2) : int3(2, 2, 1);
     }
   }
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   if (dst_depth == 1 || dst_depth == 3) {
     if (!device.IsMali()) {
       block_size_.y *= block_size_.z;
@@ -406,10 +406,9 @@ absl::Status ConvolutionTransposed::BindArguments() {
 int3 ConvolutionTransposed::GetGridSize() const {
   const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
   const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
-  const int grid_x =
-      IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
+  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -91,8 +91,8 @@ template <DataType T>
 absl::Status ConvolutionTransposed::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   int texture_width = dst_depth;
@@ -160,8 +160,8 @@ template <DataType S, typename T>
 void ConvolutionTransposed::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   int texture_width = dst_depth;

View File

@@ -440,8 +440,8 @@ absl::Status ConvolutionTransposed3D::BindArguments() {
   if (definition_.IsBatchSupported()) {
     RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
   }
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(
-      IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w)));
+  RETURN_IF_ERROR(
+      kernel_.SetBytesAuto(DivideRoundUp(dst_[0]->Slices(), block_size_.w)));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHDS()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHDS()));
   return absl::OkStatus();
@@ -451,11 +451,10 @@ int3 ConvolutionTransposed3D::GetGridSize() const {
   const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
   const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
   const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
-  const int grid_x =
-      IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w) *
-                     IntegralDivideRoundUp(aligned_d, block_size_.z);
+  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
+                     DivideRoundUp(aligned_d, block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -91,8 +91,8 @@ template <DataType T>
 absl::Status ConvolutionTransposed3D::UploadWeights(
     const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;
@@ -162,8 +162,8 @@ template <DataType S, typename T>
 void ConvolutionTransposed3D::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.w);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;

View File

@@ -339,14 +339,13 @@ absl::Status ConvolutionTransposed3x3::BindArguments() {
 }
 
 int3 ConvolutionTransposed3x3::GetGridSize() const {
-  const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
   int3 wg;
-  wg.x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
-  wg.y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
-  wg.z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
+  wg.x = DivideRoundUp(grid_x, work_group_size_.x);
+  wg.y = DivideRoundUp(grid_y, work_group_size_.y);
+  wg.z = DivideRoundUp(grid_z, work_group_size_.z);
   return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
               wg[work_group_launch_order_[1]] * work_group_size_.y,
               wg[work_group_launch_order_[2]] * work_group_size_.z);

View File

@@ -84,8 +84,8 @@ class ConvolutionTransposed3x3 : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed3x3::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 3;  // This operation support only 3x3 kernel
   const int kernel_y = 3;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -109,8 +109,8 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed3x3::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 3;
   const int kernel_y = 3;

View File

@@ -224,8 +224,8 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
 absl::Status ConvolutionTransposed3x3Thin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
-      IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
+      definition_, biases_, DivideRoundUp(src_channels_, 4),
+      DivideRoundUp(dst_channels_, 4), *creation_context.device,
       linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,

View File

@@ -82,8 +82,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
-  const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
+  const int dst_depth = DivideRoundUp(dst_channels_, 4);
   const int kernel_x = 3;  // This operation support only 3x3 kernel
   const int kernel_y = 3;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -108,8 +108,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
-  const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
+  const int dst_depth = DivideRoundUp(dst_channels_, 4);
   const int kernel_x = 3;
   const int kernel_y = 3;

View File

@@ -332,9 +332,8 @@ absl::Status ConvolutionTransposed4x4::BindArguments() {
 }
 
 int3 ConvolutionTransposed4x4::GetGridSize() const {
-  const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height() + 2, 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
   const int grid_z = dst_[0]->Slices();
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -82,8 +82,8 @@ class ConvolutionTransposed4x4 : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed4x4::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 4;  // This operation support only 4x4 kernel
   const int kernel_y = 4;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -107,8 +107,8 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed4x4::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 4;
   const int kernel_y = 4;

View File

@@ -187,8 +187,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
 absl::Status ConvolutionTransposedThin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
-      kernel_size_, *creation_context.device, linked_operations_);
+      definition_, DivideRoundUp(src_channels_, 4), dst_channels_, kernel_size_,
+      *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&

View File

@@ -82,7 +82,7 @@ class ConvolutionTransposedThin : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposedThin::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
   const int elements_count =
       kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;
@@ -104,7 +104,7 @@ absl::Status ConvolutionTransposedThin::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposedThin::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;

View File

@@ -89,7 +89,7 @@ template <DataType T>
 absl::Status DepthwiseConvolution::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_depth = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -137,7 +137,7 @@ template <DataType S, typename T>
 void DepthwiseConvolution::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_depth = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -88,7 +88,7 @@ template <DataType T>
 absl::Status DepthwiseConvolution3D::UploadWeights(
     const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_slices = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
   const int kernel_z = weights.shape.d;
@@ -130,7 +130,7 @@ template <DataType S, typename T>
 void DepthwiseConvolution3D::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_slices = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
   const int kernel_z = weights.shape.d;

View File

@@ -323,8 +323,8 @@ absl::Status DepthwiseConv3x3::BindArguments() {
 }
 
 int3 DepthwiseConv3x3::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -83,7 +83,7 @@ template <DataType T>
 absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
     const tflite::gpu::Tensor<OHWI, T>& weights,
     const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   int texture_width = 10;  // 3x3 kernel + 1 bias
   int texture_height = src_depth;
   const int elements_count = texture_width * texture_height;
@@ -129,7 +129,7 @@ template <DataType S, typename T>
 void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
     const tflite::gpu::Tensor<OHWI, S>& weights,
     const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   int counter = 0;
   for (int s = 0; s < src_depth; ++s) {

View File

@@ -70,8 +70,8 @@ class FullyConnected : public GPUOperation {
 template <DataType T>
 absl::Status FullyConnected::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int elements_count = src_depth * dst_depth * 4;
   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
@@ -94,8 +94,8 @@ absl::Status FullyConnected::UploadWeights(
 template <DataType T, typename S>
 void FullyConnected::RearrangeWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<S> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   int counter = 0;
   for (int s = 0; s < src_depth; ++s) {

View File

@@ -129,8 +129,7 @@ absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   const int depth = src_[0]->Slices();
-  RETURN_IF_ERROR(
-      kernel_.SetBytesAuto(int2(depth, IntegralDivideRoundUp(depth, 32))));
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(depth, DivideRoundUp(depth, 32))));
   RETURN_IF_ERROR(
       kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels())));

View File

@@ -234,12 +234,12 @@ template <DataType S, typename T>
 void RearrangeWeightsToOHWIOGroupI4O4(
     const tflite::gpu::Tensor<OHWI, S>& weights, int out_group_size,
     absl::Span<T> dst) {
-  const int dst_slices = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
-  const int dst_groups = IntegralDivideRoundUp(dst_slices, out_group_size);
+  const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
   int counter = 0;
   for (int d = 0; d < dst_groups; ++d) {

View File

@@ -436,9 +436,9 @@ absl::Status Winograd4x4To36::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  const int tiles_x = IntegralDivideRoundUp(
+  const int tiles_x = DivideRoundUp(
       src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
-  const int tiles_y = IntegralDivideRoundUp(
+  const int tiles_y = DivideRoundUp(
       src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
   const int tiles_total = tiles_x * tiles_y;
   RETURN_IF_ERROR(
@@ -550,14 +550,14 @@ absl::Status Winograd36To4x4::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
+  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
   RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x));
   return absl::OkStatus();
 }
 
 int3 Winograd36To4x4::GetGridSize() const {
-  const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
-  const int tiles_y = IntegralDivideRoundUp(dst_[0]->Height(), 4);
+  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
+  const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
   const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
   const int grid_y = 4;
   const int grid_z = dst_[0]->Slices();

View File

@@ -187,7 +187,7 @@ int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
 int3 GetWorkGroup(const int3& grid, int max_size) {
   int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
   int wg_xy_size = max_size / wg_z;
-  int wg_x = std::min(IntegralDivideRoundUp(grid.x, 2), wg_xy_size);
+  int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
   int wg_y = std::min(wg_xy_size / wg_x, grid.y);
   return int3(wg_x, wg_y, wg_z);
 }
@@ -231,12 +231,12 @@ absl::Status GetBestWorkGroupXY128Linear(const TuningParameters& params,
 }
 
 bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
-  int planar_work_groups = IntegralDivideRoundUp(width * height, 128);
+  int planar_work_groups = DivideRoundUp(width * height, 128);
   auto base_work_groups = Get2DWorkgroupsEqualTo128();
   bool have_equal_work_groups = false;
   for (auto& work_group : base_work_groups) {
-    int x_groups = IntegralDivideRoundUp(width, work_group.x);
-    int y_groups = IntegralDivideRoundUp(height, work_group.y);
+    int x_groups = DivideRoundUp(width, work_group.x);
+    int y_groups = DivideRoundUp(height, work_group.y);
     int xy_groups = x_groups * y_groups;
     if (xy_groups == planar_work_groups) {
       have_equal_work_groups = true;

View File

@@ -101,7 +101,7 @@ absl::Status CreateLinearStorage(const LinearStorageCreateInfo& creation_info,
                                  CLContext* context, LinearStorage* result) {
   int size = creation_info.aligned_size != 0 ? creation_info.aligned_size
                                              : tensor.shape.v;
-  const int depth = IntegralDivideRoundUp(size, 4);
+  const int depth = DivideRoundUp(size, 4);
   if (creation_info.data_type == DataType::FLOAT32) {
     std::vector<float4> gpu_data(depth);
     CopyLinearFLT4(tensor, absl::MakeSpan(gpu_data));

View File

@@ -56,10 +56,10 @@ bool IsChannelsBroadcastedForSecondInput(const std::vector<Value*>& inputs) {
 bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
                                    const CLDevice& device,
                                    const BHWC& dst_shape) {
-  const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
-  const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const int tiles_x = DivideRoundUp(dst_shape.w, 4);
+  const int tiles_y = DivideRoundUp(dst_shape.h, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   const bool suitable_attributes =
       attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
       attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@@ -82,8 +82,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
     return absl::UnimplementedError("No implementation for this case.");
   }
-  const int tiles_x = IntegralDivideRoundUp(output_shape.w, 4);
-  const int tiles_y = IntegralDivideRoundUp(output_shape.h, 4);
+  const int tiles_x = DivideRoundUp(output_shape.w, 4);
+  const int tiles_y = DivideRoundUp(output_shape.h, 4);
   const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
   const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
   TensorDescriptor td_0;

View File

@@ -28,7 +28,7 @@ namespace cl {
 bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
                               const BHWDC& shape,
                               const TensorDescriptor& descriptor) {
-  const int slices = IntegralDivideRoundUp(shape.c, 4);
+  const int slices = DivideRoundUp(shape.c, 4);
   switch (descriptor.storage_type) {
     case TensorStorageType::BUFFER: {
       const int flt4_size =


@ -65,11 +65,10 @@ absl::Status CreateTensor(const CLContext& context, const CLDevice& device,
} }
if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) { if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
cl_mem image_memory; cl_mem image_memory;
RETURN_IF_ERROR(
    CreateImageBufferFromBuffer(context, memory, descriptor.data_type,
                                shape.b * shape.w * shape.h * shape.d *
                                    IntegralDivideRoundUp(shape.c, 4),
                                &image_memory));
RETURN_IF_ERROR(CreateImageBufferFromBuffer(
    context, memory, descriptor.data_type,
    shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4),
    &image_memory));
*result = Tensor(memory, memory_owner, image_memory, shape, descriptor); *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
} else { } else {
*result = Tensor(memory, memory_owner, shape, descriptor); *result = Tensor(memory, memory_owner, shape, descriptor);
@ -386,7 +385,7 @@ absl::Status AllocateTensorMemory(const CLContext& context,
const CLDevice& device, const BHWDC& shape, const CLDevice& device, const BHWDC& shape,
const TensorDescriptor& descriptor, const TensorDescriptor& descriptor,
CLMemory* result) { CLMemory* result) {
const int slices = IntegralDivideRoundUp(shape.c, 4); const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) { switch (descriptor.storage_type) {
case TensorStorageType::BUFFER: case TensorStorageType::BUFFER:
case TensorStorageType::IMAGE_BUFFER: { case TensorStorageType::IMAGE_BUFFER: {


@ -61,7 +61,7 @@ class Tensor {
int Height() const { return shape_.h; } int Height() const { return shape_.h; }
int Depth() const { return shape_.d; } int Depth() const { return shape_.d; }
int Channels() const { return shape_.c; } int Channels() const { return shape_.c; }
int Slices() const { return IntegralDivideRoundUp(shape_.c, 4); } int Slices() const { return DivideRoundUp(shape_.c, 4); }
int Batch() const { return shape_.b; } int Batch() const { return shape_.b; }
// returns int4(width * batch, height, slices, batch) // returns int4(width * batch, height, slices, batch)


@ -44,12 +44,11 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
} }
float* output = out.data(); float* output = out.data();
for (int p = 0; p < IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) { for (int p = 0; p < DivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) {
for (int h = 0; h < shape.h; ++h) { for (int h = 0; h < shape.h; ++h) {
for (int w = 0; w < shape.w; ++w) { for (int w = 0; w < shape.w; ++w) {
for (int c = 0; c < IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) { for (int c = 0; c < DivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) {
for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) { for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) {
for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) { for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) {
float value = 0; float value = 0;
@ -106,7 +105,7 @@ std::vector<float> ConvertToPHWO4I4Transposed(
uint3 Get3DSizeForPHWO4I4(const OHWI& shape) { uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
return uint3(AlignByN(shape.i, 4), shape.h * shape.w, return uint3(AlignByN(shape.i, 4), shape.h * shape.w,
IntegralDivideRoundUp(shape.o, 4)); DivideRoundUp(shape.o, 4));
} }
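
As a rough illustration (the filter shape is invented for the example), the PHWO4I4 size computed above packs padded input channels on X, the spatial kernel on Y, and output-channel groups of four on Z:

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}
int AlignByN(int number, int n) { return DivideRoundUp(number, n) * n; }

struct OHWI {
  int o, h, w, i;
};

int main() {
  // Hypothetical 3x3 convolution weights: 10 output, 7 input channels.
  const OHWI shape{10, 3, 3, 7};
  const int x = AlignByN(shape.i, 4);       // 8: input channels padded to 4
  const int y = shape.h * shape.w;          // 9
  const int z = DivideRoundUp(shape.o, 4);  // 3: output-channel groups of 4
  assert(x == 8 && y == 9 && z == 3);
  return 0;
}
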
// Layout is Po,H,W,OI4x4. // Layout is Po,H,W,OI4x4.
@ -123,8 +122,8 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
out.size(), " != ", GetElementsSizeForPHWO4I4(shape))); out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
} }
const int dst_depth = IntegralDivideRoundUp(shape.o, 4); const int dst_depth = DivideRoundUp(shape.o, 4);
const int src_depth = IntegralDivideRoundUp(shape.i, 4); const int src_depth = DivideRoundUp(shape.i, 4);
float* output = out.data(); float* output = out.data();
for (int f = 0; f < dst_depth; ++f) { for (int f = 0; f < dst_depth; ++f) {
@ -178,8 +177,7 @@ absl::Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
} }
int32_t output_channels = shape.o * shape.i; int32_t output_channels = shape.o * shape.i;
int32_t num_planes = IntegralDivideRoundUp(output_channels, kPiohw4ChannelsInPlane); int32_t num_planes = DivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
float* output = out.data(); float* output = out.data();
for (int p = 0; p < num_planes; ++p) { for (int p = 0; p < num_planes; ++p) {
for (int h = 0; h < shape.h; ++h) { for (int h = 0; h < shape.h; ++h) {
@ -232,7 +230,7 @@ absl::Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus(); return absl::OkStatus();
} }
// Layout is Pc,H,W,C4 where P - is a plane based on channels. // Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane; const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -281,7 +279,7 @@ absl::Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out)); RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
// Layout is Pc,H,W,C4 where P - is a plane based on channels. // Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane; const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -407,7 +405,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus(); return absl::OkStatus();
} }
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane; const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
@ -449,7 +447,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in, absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in,
const BHWC& shape, absl::Span<float> out) { const BHWC& shape, absl::Span<float> out) {
RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out)); RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane; const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.


@ -209,7 +209,7 @@ OperationType OperationTypeFromString(const std::string& name) {
namespace { namespace {
template <typename T> template <typename T>
T IntegralDivideRoundUp(T n, T divisor) { T DivideRoundUp(T n, T divisor) {
return (n - 1) / divisor + 1; return (n - 1) / divisor + 1;
} }
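
For reference, the file-local helper above uses the (n - 1) / divisor + 1 form, while the shared utility elsewhere in this change checks the remainder. A standalone sketch, with illustrative names, of where the two forms agree and where they differ:

#include <cassert>

// Illustrative re-implementations; the names are not from the diff.
int CeilDivByOffset(int n, int divisor) { return (n - 1) / divisor + 1; }
int CeilDivByRemainder(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  assert(CeilDivByOffset(257, 256) == 2);
  assert(CeilDivByRemainder(257, 256) == 2);
  // The forms differ at n == 0: C++ integer division truncates toward zero,
  // so (0 - 1) / 256 + 1 evaluates to 1, while the remainder form returns 0.
  assert(CeilDivByOffset(0, 256) == 1);
  assert(CeilDivByRemainder(0, 256) == 0);
  return 0;
}
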
@ -272,7 +272,7 @@ int32_t CalculateOutput(const BHWDC& input,
} }
inline int32_t StridedSize(int32_t size, int32_t stride) { inline int32_t StridedSize(int32_t size, int32_t stride) {
return stride == 0 ? -1 : IntegralDivideRoundUp(size, stride); return stride == 0 ? -1 : DivideRoundUp(size, stride);
} }
template <Axis AxisT, typename AttrT> template <Axis AxisT, typename AttrT>


@ -24,24 +24,23 @@ namespace gpu {
// @param n must be non negative // @param n must be non negative
// @param divisor must be greater than zero // @param divisor must be greater than zero
template <typename T, typename N> template <typename T, typename N>
T IntegralDivideRoundUp(T n, N divisor) { T DivideRoundUp(T n, N divisor) {
const T div = static_cast<T>(divisor); const T div = static_cast<T>(divisor);
const T q = n / div; const T q = n / div;
return n % div == 0 ? q : q + 1; return n % div == 0 ? q : q + 1;
} }
template <> template <>
inline uint3 IntegralDivideRoundUp(uint3 n, uint3 divisor) { inline uint3 DivideRoundUp(uint3 n, uint3 divisor) {
return uint3(IntegralDivideRoundUp(n.x, divisor.x),
             IntegralDivideRoundUp(n.y, divisor.y),
             IntegralDivideRoundUp(n.z, divisor.z));
return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y),
             DivideRoundUp(n.z, divisor.z));
} }
// @param number or its components must be greater than zero // @param number or its components must be greater than zero
// @param n must be greater than zero // @param n must be greater than zero
template <typename T, typename N> template <typename T, typename N>
T AlignByN(T number, N n) { T AlignByN(T number, N n) {
return IntegralDivideRoundUp(number, n) * n; return DivideRoundUp(number, n) * n;
} }
} // namespace gpu } // namespace gpu
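
A quick usage sketch of the renamed helpers (values illustrative): DivideRoundUp gives the number of 4-channel slices and AlignByN the padded channel count.

#include <cassert>
#include <cstdint>

template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}

template <typename T, typename N>
T AlignByN(T number, N n) {
  return DivideRoundUp(number, n) * n;
}

int main() {
  const int32_t channels = 19;
  assert(DivideRoundUp(channels, 4) == 5);  // 19 channels -> 5 FLT4 slices
  assert(AlignByN(channels, 4) == 20);      // padded channel count
  return 0;
}
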


@ -24,16 +24,16 @@ namespace {
using testing::Eq; using testing::Eq;
TEST(UtilTest, IntegralDivideRoundUp) { TEST(UtilTest, DivideRoundUp) {
EXPECT_THAT(IntegralDivideRoundUp(0, 256), Eq(0)); EXPECT_THAT(DivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(IntegralDivideRoundUp(2u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(2, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(257u, 256), Eq(2)); EXPECT_THAT(DivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(IntegralDivideRoundUp(257, 256), Eq(2)); EXPECT_THAT(DivideRoundUp(257, 256), Eq(2));
} }
TEST(UtilTest, AlignByN) { TEST(UtilTest, AlignByN) {


@ -34,9 +34,9 @@ void AddCornerCases(const T& grid, int max_work_group_total_size,
for (int x = 1; x <= 4; ++x) { for (int x = 1; x <= 4; ++x) {
for (int y = 1; y <= 4; ++y) { for (int y = 1; y <= 4; ++y) {
for (int z = 1; z <= 4; ++z) { for (int z = 1; z <= 4; ++z) {
int wg_x = IntegralDivideRoundUp(grid.x, x); int wg_x = DivideRoundUp(grid.x, x);
int wg_y = IntegralDivideRoundUp(grid.y, y); int wg_y = DivideRoundUp(grid.y, y);
int wg_z = IntegralDivideRoundUp(grid.z, z); int wg_z = DivideRoundUp(grid.z, z);
if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y || if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y ||
wg_z > max_work_group_sizes.z || wg_z > max_work_group_sizes.z ||
wg_x * wg_y * wg_z > max_work_group_total_size) { wg_x * wg_y * wg_z > max_work_group_total_size) {


@ -201,7 +201,7 @@ class CompiledModelImpl
ShaderCode code) { ShaderCode code) {
// Calculate workgroup size. // Calculate workgroup size.
uint3 workgroup_size = workgroup_calculator.Calculate(code); uint3 workgroup_size = workgroup_calculator.Calculate(code);
uint3 num_workgroups = IntegralDivideRoundUp(code.workload, workgroup_size); uint3 num_workgroups = DivideRoundUp(code.workload, workgroup_size);
for (const auto& object : code.objects) { for (const auto& object : code.objects) {
if (IsRef(object)) { if (IsRef(object)) {


@ -569,7 +569,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
} else { } else {
shader_index = it->second; shader_index = it->second;
} }
auto num_workgroups = IntegralDivideRoundUp(code.workload, workgroup); auto num_workgroups = DivideRoundUp(code.workload, workgroup);
return runtime_ptr->AddProgram(shaders[shader_index], code.parameters, return runtime_ptr->AddProgram(shaders[shader_index], code.parameters,
code.objects, num_workgroups); code.objects, num_workgroups);
})); }));


@ -190,8 +190,7 @@ class CompilerImpl : public Compiler {
"Workload uint3() requires all output sizes to match"); "Workload uint3() requires all output sizes to match");
} }
} }
attr.code.workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)); attr.code.workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
} }
int num_textures = 0; int num_textures = 0;


@ -88,8 +88,8 @@ absl::Status ConverterBhwcToPhwc4::Convert(const BHWC& shape,
return absl::UnimplementedError( return absl::UnimplementedError(
"BhwcToPhwc4: Batch size is not equal to 1."); "BhwcToPhwc4: Batch size is not equal to 1.");
} }
uint3 workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)); uint3 workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
RETURN_IF_ERROR(program_.SetParameter( RETURN_IF_ERROR(program_.SetParameter(
{"sizes_", {"sizes_",


@ -83,7 +83,7 @@ absl::Status ConverterPhwc4ToBhwc::Convert(const BHWC& shape,
} }
uint3 workload = uint3(shape.w, shape.h, shape.c); uint3 workload = uint3(shape.w, shape.h, shape.c);
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
// TODO(akulik): simply pass workload as soon as UniformParameter // TODO(akulik): simply pass workload as soon as UniformParameter
// supports uint3 // supports uint3


@ -102,7 +102,7 @@ class Add : public NodeShader {
// Declare workload explicitly because shader depends on gid.z. // Declare workload explicitly because shader depends on gid.z.
/*workload=*/ /*workload=*/
uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1], uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1],
IntegralDivideRoundUp(ctx.input_shapes[0][3], 4)), DivideRoundUp(ctx.input_shapes[0][3], 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/"value_0 += $add_buffer[gid.z]$;", /*source_code=*/"value_0 += $add_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO, /*input=*/IOStructure::AUTO,


@ -210,7 +210,7 @@ vec4 val = vec4(0.0f);
// * - you are going to write into these cells // * - you are going to write into these cells
// @ - you will fill these cells next cycles // @ - you will fill these cells next cycles
// ^ - first elem you start writing from // ^ - first elem you start writing from
int blocks_amount = IntegralDivideRoundUp<int>(in_ch, 4); int blocks_amount = DivideRoundUp<int>(in_ch, 4);
code += "// Aligned case\n"; code += "// Aligned case\n";
code += "// I'm going to make " + std::to_string(blocks_amount) + code += "// I'm going to make " + std::to_string(blocks_amount) +
" write(s)\n\n"; " write(s)\n\n";


@ -55,7 +55,7 @@ class Convolution : public NodeShader {
{"dilation_h", attr.dilations.h}, {"dilation_h", attr.dilations.h},
{"kernel_w", weights.w}, {"kernel_w", weights.w},
{"kernel_h", weights.h}, {"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
} else { } else {
@ -71,7 +71,7 @@ class Convolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count}, {"offsets_count", offsets_count},
{"offsets", offsets}, {"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
} }
@ -181,14 +181,14 @@ class Convolution1x1 : public NodeShader {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"src_depth", {"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)}, DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
}; };
std::vector<std::pair<std::string, Object>> objects = { std::vector<std::pair<std::string, Object>> objects = {
{"weights", MakeReadonlyObject(
                uint3(4, IntegralDivideRoundUp(attr.weights.shape.i, 4),
                      IntegralDivideRoundUp(attr.weights.shape.o, 4)),
                ConvertToPHWO4I4(attr.weights))}};
{"weights",
 MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
                          DivideRoundUp(attr.weights.shape.o, 4)),
                    ConvertToPHWO4I4(attr.weights))}};
std::string source; std::string source;
for (int i = 0; i < multiplier; i++) { for (int i = 0; i < multiplier; i++) {
absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n"); absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
@ -224,7 +224,7 @@ class Convolution1x1 : public NodeShader {
absl::StrAppend(&source, "value_0 = result0;\n"); absl::StrAppend(&source, "value_0 = result0;\n");
} }
auto dst_depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4); auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
uint3 workgroup = uint3(16, 16, 1); uint3 workgroup = uint3(16, 16, 1);
if (ctx.gpu_info->type == GpuType::ADRENO) { if (ctx.gpu_info->type == GpuType::ADRENO) {
if (dst_depth >= 2) { if (dst_depth >= 2) {
@ -265,7 +265,7 @@ class Convolution1x1 : public NodeShader {
/*shared_variables=*/{}, /*shared_variables=*/{},
/*workload=*/ /*workload=*/
uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1], uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
IntegralDivideRoundUp(ctx.output_shapes[0][3], 4)), DivideRoundUp(ctx.output_shapes[0][3], 4)),
/*workgroup=*/ /*workgroup=*/
GetIdealWorkgroupIfPossible( GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D, ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,


@ -64,7 +64,7 @@ class OpenGlConverterImpl : public TensorObjectConverter {
} }
absl::Status Dispatch(const uint3& workload) { absl::Status Dispatch(const uint3& workload) {
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
if (command_queue_) { if (command_queue_) {
return command_queue_->Dispatch(program_, num_workgroups); return command_queue_->Dispatch(program_, num_workgroups);
} }
@ -256,7 +256,7 @@ class ToTensorConverter : public OpenGlConverterImpl {
return absl::InvalidArgumentError( return absl::InvalidArgumentError(
"ToTensorConverter: output data size does not match expected size."); "ToTensorConverter: output data size does not match expected size.");
} }
auto d = IntegralDivideRoundUp(shape_.c, 4); auto d = DivideRoundUp(shape_.c, 4);
RETURN_IF_ERROR(program_.SetParameter( RETURN_IF_ERROR(program_.SetParameter(
{"sizes", {"sizes",
int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h), int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h),


@ -54,7 +54,7 @@ class DepthwiseConvolution : public NodeShader {
{"dilation_h", attr.dilations.h}, {"dilation_h", attr.dilations.h},
{"kernel_w", weights.w}, {"kernel_w", weights.w},
{"kernel_h", weights.h}, {"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o}, {"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
@ -71,7 +71,7 @@ class DepthwiseConvolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count}, {"offsets_count", offsets_count},
{"offsets", offsets}, {"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o}, {"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };


@ -39,8 +39,8 @@ class FullyConnectedBuffers : public NodeShader {
const auto& attr = const auto& attr =
absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr); absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
// This shader can work with any workgroup size, the values below work well // This shader can work with any workgroup size, the values below work well
// for OpenGL. // for OpenGL.


@ -105,10 +105,9 @@ absl::Status GenerateMultiplyScalarCode(
/*shared_variables=*/{}, /*shared_variables=*/{},
// Declare workload explicitly because shader depends on gid.z. // Declare workload explicitly because shader depends on gid.z.
/*workload=*/ /*workload=*/
uint3(
    static_cast<int>(ctx.input_shapes[0][2]),
    static_cast<int>(ctx.input_shapes[0][1]),
    IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
uint3(static_cast<int>(ctx.input_shapes[0][2]),
      static_cast<int>(ctx.input_shapes[0][1]),
      DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/"value_0 *= $mul_buffer[gid.z]$;", /*source_code=*/"value_0 *= $mul_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO, /*input=*/IOStructure::AUTO,


@ -98,8 +98,8 @@ class Pad : public NodeShader {
source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n"; source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n";
} else if (attr.prepended.c % 4 == 0) { } else if (attr.prepended.c % 4 == 0) {
parameters.push_back(
    {"src_slices", IntegralDivideRoundUp(
                       static_cast<int>(ctx.input_shapes[0][3]), 4)});
parameters.push_back(
    {"src_slices",
     DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)});
source += R"( source += R"(
int src_z = gid.z - $prepended.z$ / 4; int src_z = gid.z - $prepended.z$ / 4;
if (src_z >= 0 && src_z < $src_slices$) { if (src_z >= 0 && src_z < $src_slices$) {


@ -69,8 +69,8 @@ class PReLULinearAlpha : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)), DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, " "value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, "
@ -98,10 +98,10 @@ class PReLUFull : public NodeShader {
"Alpha shape does not match input shape."); "Alpha shape does not match input shape.");
} }
ObjectSize obj_size = uint3(
    static_cast<int>(ctx.output_shapes[0][2]),
    static_cast<int>(ctx.output_shapes[0][1]),
    IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
ObjectSize obj_size =
    uint3(static_cast<int>(ctx.output_shapes[0][2]),
          static_cast<int>(ctx.output_shapes[0][1]),
          DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
*generated_code = *generated_code =
attr.clip attr.clip
@ -116,8 +116,8 @@ class PReLUFull : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)), DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = clamp(value_0, 0.0, $clip$) + " "value_0 = clamp(value_0, 0.0, $clip$) + "
@ -136,8 +136,8 @@ class PReLUFull : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp( DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
static_cast<int>(ctx.output_shapes[0][3]), 4)), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ " "value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ "


@ -60,13 +60,13 @@ class Softmax : public NodeShader {
private: private:
absl::Status GenerateCodeFor1x1(const GenerationContext& ctx, absl::Status GenerateCodeFor1x1(const GenerationContext& ctx,
GeneratedCode* generated_code) const { GeneratedCode* generated_code) const {
const int depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4); const int depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
std::vector<Variable> shared_variables = { std::vector<Variable> shared_variables = {
{"partial_sum", std::vector<float4>(8)}, {"partial_sum", std::vector<float4>(8)},
}; };
std::vector<Variable> uniform_parameters = { std::vector<Variable> uniform_parameters = {
{"depth", depth}, {"depth", depth},
{"depth_div_32", IntegralDivideRoundUp(depth, 32)}, {"depth_div_32", DivideRoundUp(depth, 32)},
{"mask", GetMask(ctx.output_shapes[0][3])}, {"mask", GetMask(ctx.output_shapes[0][3])},
}; };
std::string source_code = R"( std::string source_code = R"(
@ -138,7 +138,7 @@ class Softmax : public NodeShader {
GeneratedCode* generated_code) const { GeneratedCode* generated_code) const {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"src_depth", {"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)}, DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
{"mask", GetMask(ctx.output_shapes[0][3])}, {"mask", GetMask(ctx.output_shapes[0][3])},
}; };


@ -44,7 +44,7 @@ class ConvolutionTransposedBuffers : public NodeShader {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])}, {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"kernel_size", int2(weights.w, weights.h)}, {"kernel_size", int2(weights.w, weights.h)},
{"stride", int2(attr.stride.w, attr.stride.h)}, {"stride", int2(attr.stride.w, attr.stride.h)},
{"padding", int2(weights.w - 1 - attr.padding.prepended.w, {"padding", int2(weights.w - 1 - attr.padding.prepended.w,


@ -153,17 +153,17 @@ inline Object MakeReadonlyBuffer(const ObjectSize& size,
inline Object MakeReadonlyObject(const std::vector<float>& data) { inline Object MakeReadonlyObject(const std::vector<float>& data) {
return MakeReadonlyObject( return MakeReadonlyObject(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
inline Object MakeReadonlyTexture(const std::vector<float>& data) { inline Object MakeReadonlyTexture(const std::vector<float>& data) {
return MakeReadonlyTexture( return MakeReadonlyTexture(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
inline Object MakeReadonlyBuffer(const std::vector<float>& data) { inline Object MakeReadonlyBuffer(const std::vector<float>& data) {
return MakeReadonlyBuffer( return MakeReadonlyBuffer(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
// TODO(akulik): find better place for functions below. // TODO(akulik): find better place for functions below.
@ -172,7 +172,7 @@ inline uint3 GetPHWC4Size(const BHWC& shape) {
uint3 size; uint3 size;
size.x = shape.w; size.x = shape.w;
size.y = shape.h; size.y = shape.h;
size.z = shape.b * IntegralDivideRoundUp(shape.c, 4); size.z = shape.b * DivideRoundUp(shape.c, 4);
return size; return size;
} }
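
A minimal sketch (shape invented) of the PHWC4 size above: Z packs batch times the number of 4-channel slices, so the padded element count is x * y * z * 4.

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

struct BHWC {
  int b, h, w, c;
};

int main() {
  // Hypothetical tensor: batch 2, 8x8 spatial, 6 channels.
  const BHWC shape{2, 8, 8, 6};
  const int size_x = shape.w;                               // 8
  const int size_y = shape.h;                               // 8
  const int size_z = shape.b * DivideRoundUp(shape.c, 4);   // 2 * 2 = 4
  // Each PHWC4 "pixel" holds 4 channel values, so the padded element count is:
  assert(size_x * size_y * size_z * 4 == 1024);
  return 0;
}
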


@ -167,10 +167,10 @@ std::vector<ComputeTaskDescriptorPtr> SelectWinograd36To4x4(
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr, bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes = const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 && attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1); attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@ -229,8 +229,8 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
auto attr = auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes); absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) { if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) {
int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); int tiles_x = DivideRoundUp(dst_shape.w, 4);
int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); int tiles_y = DivideRoundUp(dst_shape.h, 4);
Winograd4x4To36Attributes wino_up_attr; Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding; wino_up_attr.padding = attr.padding;


@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h" #include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h" #include "tensorflow/lite/delegates/gpu/metal/common.h"
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::BHWC; using ::tflite::gpu::BHWC;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::metal::CreateComputeProgram; using ::tflite::gpu::metal::CreateComputeProgram;
@implementation TFLBufferConvert { @implementation TFLBufferConvert {
@ -102,10 +102,10 @@ using ::tflite::gpu::metal::CreateComputeProgram;
[encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2]; [encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2];
MTLSize group_size = MTLSizeMake(16, 16, 1); MTLSize group_size = MTLSizeMake(16, 16, 1);
int layers = IntegralDivideRoundUp(shape.c, 4); int layers = DivideRoundUp(shape.c, 4);
int groups_x = IntegralDivideRoundUp(shape.w, group_size.width); int groups_x = DivideRoundUp(shape.w, group_size.width);
int groups_y = IntegralDivideRoundUp(shape.h, group_size.height); int groups_y = DivideRoundUp(shape.h, group_size.height);
int groups_z = IntegralDivideRoundUp(layers, group_size.depth); int groups_z = DivideRoundUp(layers, group_size.depth);
MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z); MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z);
[encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size]; [encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size];
} }
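
The same threadgroup-count arithmetic sketched without the Metal API (buffer shape and sizes are hypothetical):

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  // Hypothetical BHWC buffer: 1x33x65x10, dispatched with 16x16x1 threadgroups.
  const int h = 33, w = 65, c = 10;
  const int layers = DivideRoundUp(c, 4);         // 3 slices of 4 channels
  const int groups_x = DivideRoundUp(w, 16);      // 5
  const int groups_y = DivideRoundUp(h, 16);      // 3
  const int groups_z = DivideRoundUp(layers, 1);  // 3
  assert(groups_x == 5 && groups_y == 3 && groups_z == 3);
  return 0;
}
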


@ -440,9 +440,9 @@ ComputeTaskDescriptorPtr NonLinkableStub(int operation_id, ValueId input_id,
desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(input_id)->second; const auto& dimension = buffers.find(input_id)->second;
uint3 groups_size{16, 16, 1}; uint3 groups_size{16, 16, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), DivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)}; DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };


@ -71,7 +71,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
// Also it is easy to write a loop in this case, to prevent long kernel // Also it is easy to write a loop in this case, to prevent long kernel
// generation. // generation.
for (int i = 0; i < channels.size(); ++i) { for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4); const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i); const std::string src_buffer = "src_buffer" + std::to_string(i);
c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n"; c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n";
c += " int src_index = i * U.src_size.w + xy_offset;\n"; c += " int src_index = i * U.src_size.w + xy_offset;\n";
@ -88,7 +88,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
int read_index = 0; int read_index = 0;
int z = 0; int z = 0;
for (int i = 0; i < channels.size(); ++i) { for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4); const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i); const std::string src_buffer = "src_buffer" + std::to_string(i);
for (int d = 0; d < depth; ++d) { for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4); const int channels_in_group = std::min(4, channels[i] - d * 4);
@ -168,11 +168,11 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
std::vector<int> uniform_params{ std::vector<int> uniform_params{
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
src_shape.w * src_shape.h, src_shape.w * src_shape.h,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
dst_shape.w * dst_shape.h, dst_shape.w * dst_shape.h,
}; };
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
@ -184,9 +184,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
uint3 grid(dst_shape.w, dst_shape.h, 1); uint3 grid(dst_shape.w, dst_shape.h, 1);
uint3 group_size{8u, 4u, 1u}; uint3 group_size{8u, 4u, 1u};
uint3 groups; uint3 groups;
groups.x = IntegralDivideRoundUp(grid.x, group_size.x); groups.x = DivideRoundUp(grid.x, group_size.x);
groups.y = IntegralDivideRoundUp(grid.y, group_size.y); groups.y = DivideRoundUp(grid.y, group_size.y);
groups.z = IntegralDivideRoundUp(grid.z, group_size.z); groups.z = DivideRoundUp(grid.z, group_size.z);
return std::make_pair(group_size, groups); return std::make_pair(group_size, groups);
}; };
@ -265,7 +265,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
[output_id](const std::map<ValueId, BHWC>& buffers) { [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h, std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
/*padding=*/0}; /*padding=*/0};
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -274,9 +274,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second; const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x); int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y); int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4); int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -356,7 +356,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
[output_id](const std::map<ValueId, BHWC>& buffers) { [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h, std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
/*padding=*/0}; /*padding=*/0};
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -365,9 +365,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second; const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x); int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y); int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4); int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };


@ -72,7 +72,7 @@ struct ConvParams {
namespace { namespace {
int GetNumOutputSlices(int dst_channels) { int GetNumOutputSlices(int dst_channels) {
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4); const int dst_depth = DivideRoundUp(dst_channels, 4);
if (dst_depth % 4 == 0 || dst_depth >= 16) { if (dst_depth % 4 == 0 || dst_depth >= 16) {
return 4; return 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) { } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
@ -571,8 +571,8 @@ kernel void ComputeFunction(
std::vector<float> ReorderWeightsForConv( std::vector<float> ReorderWeightsForConv(
const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights, const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
const ConvParams& params) { const ConvParams& params) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); const int src_depth = DivideRoundUp(weights.shape.i, 4);
std::vector<float> weights_reordered( std::vector<float> weights_reordered(
weights.shape.w * weights.shape.h * weights.shape.w * weights.shape.h *
AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4); AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4);
@ -580,8 +580,7 @@ std::vector<float> ReorderWeightsForConv(
bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4; bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4;
int counter = 0; int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, params.block_size.z); ++d) { for (int d = 0; d < DivideRoundUp(dst_depth, params.block_size.z); ++d) {
for (int y = 0; y < weights.shape.h; ++y) { for (int y = 0; y < weights.shape.h; ++y) {
for (int x = 0; x < weights.shape.w; ++x) { for (int x = 0; x < weights.shape.w; ++x) {
for (int s = 0; s < src_depth; ++s) { for (int s = 0; s < src_depth; ++s) {
@ -618,17 +617,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
const BHWC& dst_size, const BHWC& dst_size,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const ConvParams& params) { const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x); const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y); const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
attr.strides.w, attr.strides.w,
attr.strides.h, attr.strides.h,
-attr.padding.prepended.w, -attr.padding.prepended.w,
@ -652,17 +651,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size, std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
const BHWC& dst_size, const BHWC& dst_size,
const ConvParams& params) { const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x); const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y); const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
1, 1,
1, 1,
0, 0,
@ -685,38 +684,37 @@ std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x, wg_size.x) *
       IntegralDivideRoundUp(grid_y, wg_size.y) *
       IntegralDivideRoundUp(grid_z, wg_size.z);
return DivideRoundUp(grid_x, wg_size.x) * DivideRoundUp(grid_y, wg_size.y) *
       DivideRoundUp(grid_z, wg_size.z);
} }
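
A small worked example (shape, block size, and work-group size invented) of the total group count computed above:

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  // Hypothetical output shape 1x30x40x20, block size 2x2x2, work group 8x4x1.
  const int dst_slices = DivideRoundUp(20, 4);      // 5
  const int grid_x = DivideRoundUp(40, 2);          // 20
  const int grid_y = DivideRoundUp(30, 2);          // 15
  const int grid_z = DivideRoundUp(dst_slices, 2);  // 3
  const int groups = DivideRoundUp(grid_x, 8) *     // 3
                     DivideRoundUp(grid_y, 4) *     // 4
                     DivideRoundUp(grid_z, 1);      // 3
  assert(groups == 36);
  return 0;
}
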
int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y, wg_size.x) * return DivideRoundUp(grid_x * grid_y, wg_size.x) *
IntegralDivideRoundUp(grid_z, wg_size.y); DivideRoundUp(grid_z, wg_size.y);
} }
int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y * grid_z, wg_size.x); return DivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
} }
bool IsKernelXIs1(const Convolution2DAttributes& attr) { bool IsKernelXIs1(const Convolution2DAttributes& attr) {
@ -758,8 +756,8 @@ int GetRecommendedBlockSize(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info, ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params; ConvParams params;
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
@ -835,8 +833,8 @@ ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info, ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape); int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape);
int3 block_size = int3(1, 1, 1); int3 block_size = int3(1, 1, 1);
if (blk_total_size >= 2 && apple_info.IsBionic()) { if (blk_total_size >= 2 && apple_info.IsBionic()) {
@ -917,8 +915,8 @@ ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr, ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
const RuntimeOptions& options, const RuntimeOptions& options,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params; ConvParams params;
params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST; params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST;
params.x_kernel_is_1 = IsKernelXIs1(attr); params.x_kernel_is_1 = IsKernelXIs1(attr);
@ -1017,29 +1015,28 @@ ConvParams GetConvParams(const DeviceInfo& device_info,
std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params, std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params,
const BHWC& shape) { const BHWC& shape) {
const int dst_slices = IntegralDivideRoundUp(shape.c, 4); const int dst_slices = DivideRoundUp(shape.c, 4);
int grid_x = IntegralDivideRoundUp(shape.w, params.block_size.x); int grid_x = DivideRoundUp(shape.w, params.block_size.x);
int grid_y = IntegralDivideRoundUp(shape.h, params.block_size.y); int grid_y = DivideRoundUp(shape.h, params.block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, params.block_size.z); int grid_z = DivideRoundUp(dst_slices, params.block_size.z);
const uint3 group_size(params.work_group_size.x, params.work_group_size.y, const uint3 group_size(params.work_group_size.x, params.work_group_size.y,
params.work_group_size.z); params.work_group_size.z);
int3 wg; int3 wg;
uint3 groups_count; uint3 groups_count;
if (params.linear_whs) { if (params.linear_whs) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x); wg.x = DivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x);
groups_count = uint3(wg.x, 1, 1); groups_count = uint3(wg.x, 1, 1);
} else if (params.linear_wh) { } else if (params.linear_wh) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y, params.work_group_size.x); wg.x = DivideRoundUp(grid_x * grid_y, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_z, params.work_group_size.y); wg.y = DivideRoundUp(grid_z, params.work_group_size.y);
groups_count = uint3(wg[params.work_group_launch_order.x], groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y], 1); wg[params.work_group_launch_order.y], 1);
} else { } else {
wg.x = IntegralDivideRoundUp(grid_x, params.work_group_size.x); wg.x = DivideRoundUp(grid_x, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, params.work_group_size.y); wg.y = DivideRoundUp(grid_y, params.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, params.work_group_size.z); wg.z = DivideRoundUp(grid_z, params.work_group_size.z);
groups_count = uint3(wg[params.work_group_launch_order.x], groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y], wg[params.work_group_launch_order.y],
wg[params.work_group_launch_order.z]); wg[params.work_group_launch_order.z]);
@ -1076,7 +1073,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
std::string addr_space = std::string addr_space =
params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant" params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant"
: "device"; : "device";
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
desc->immutable_buffers = { desc->immutable_buffers = {
{addr_space + " FLT4* const filters", {addr_space + " FLT4* const filters",
GetByteBufferConverted(weights_reordered, options.storage_precision)}, GetByteBufferConverted(weights_reordered, options.storage_precision)},
@ -1108,7 +1105,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionWino4x4To6x6(
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape, int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
const Convolution2DAttributes& attr, const DeviceInfo& device_info, const Convolution2DAttributes& attr, const DeviceInfo& device_info,
const RuntimeOptions& options) { const RuntimeOptions& options) {
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
ConvParams params; ConvParams params;
params.work_group_launch_order = int3(2, 0, 1); params.work_group_launch_order = int3(2, 0, 1);
params.src_depth_loop_size = 1; params.src_depth_loop_size = 1;


@ -34,6 +34,7 @@ using ::tflite::gpu::Axis;
using ::tflite::gpu::BHWC; using ::tflite::gpu::BHWC;
using ::tflite::gpu::Convolution2DAttributes; using ::tflite::gpu::Convolution2DAttributes;
using ::tflite::gpu::DataType; using ::tflite::gpu::DataType;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::HW; using ::tflite::gpu::HW;
using ::tflite::gpu::Linear; using ::tflite::gpu::Linear;
using ::tflite::gpu::OHWI; using ::tflite::gpu::OHWI;
@ -44,7 +45,6 @@ using ::tflite::gpu::TensorRef;
using ::tflite::gpu::ValueId; using ::tflite::gpu::ValueId;
using ::tflite::gpu::metal::ConvolutionGeneric; using ::tflite::gpu::metal::ConvolutionGeneric;
using ::tflite::gpu::metal::ConvolutionWino4x4To6x6; using ::tflite::gpu::metal::ConvolutionWino4x4To6x6;
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::metal::CompareVectors; using ::tflite::gpu::metal::CompareVectors;
using ::tflite::gpu::metal::SingleOpModel; using ::tflite::gpu::metal::SingleOpModel;
@ -275,7 +275,7 @@ using ::tflite::gpu::metal::SingleOpModel;
BHWC conv_shape; BHWC conv_shape;
conv_shape.b = dst_shape.b; conv_shape.b = dst_shape.b;
conv_shape.h = 36; conv_shape.h = 36;
conv_shape.w = IntegralDivideRoundUp(new_width, 4) * IntegralDivideRoundUp(new_height, 4); conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
conv_shape.c = dst_shape.c; conv_shape.c = dst_shape.c;
TensorFloat32 src_tensor; TensorFloat32 src_tensor;


@ -208,7 +208,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride1x1 // DepthWiseConv3x3Stride1x1
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1( std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1(
const DepthwiseConvolution2DAttributes& attr) { const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3; const int kernel_x = 3;
const int kernel_y = 3; const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth * std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -250,11 +250,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride1x1(
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
-params.padding.prepended.w, -params.padding.prepended.w,
-params.padding.prepended.h, -params.padding.prepended.h,
0, // dummy, for alignment 0, // dummy, for alignment
@ -403,7 +403,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride2 // DepthWiseConv3x3Stride2
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2( std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2(
const DepthwiseConvolution2DAttributes& attr) { const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3; const int kernel_x = 3;
const int kernel_y = 3; const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth * std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -445,11 +445,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride2(
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
attr.strides.w, attr.strides.w,
@ -586,11 +586,11 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
std::vector<int> uniform_params{ std::vector<int> uniform_params{
dimension.w, dimension.w,
dimension.h, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
0, 0,
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
0, 0,
attr.strides.w, attr.strides.w,
attr.strides.h, attr.strides.h,
@ -612,9 +612,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
uint3 groups_size{8, 4, 1}; uint3 groups_size{8, 4, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), DivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)}; DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };
@ -661,17 +661,17 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dimension.w, 2); const int grid_x = DivideRoundUp(dimension.w, 2);
const int grid_y = IntegralDivideRoundUp(dimension.h, 2); const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4); const int grid_z = DivideRoundUp(dimension.c, 4);
uint3 group_size{8, 4, 1}; uint3 group_size{8, 4, 1};
if (grid_x <= 4) { if (grid_x <= 4) {
group_size.x = 4; group_size.x = 4;
group_size.z = grid_z % 2 == 0 ? 2 : 1; group_size.z = grid_z % 2 == 0 ? 2 : 1;
} }
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
}; };
@ -726,12 +726,12 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
const int grid_x = dimension.w; const int grid_x = dimension.w;
const int grid_y = IntegralDivideRoundUp(dimension.h, 2); const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4); const int grid_z = DivideRoundUp(dimension.c, 4);
const uint3 group_size{8, 4, 1}; const uint3 group_size{8, 4, 1};
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
}; };
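The resize_function lambdas in this file, and in the kernels that follow, all share the same dispatch arithmetic: each grid dimension is rounded up to a whole number of work groups so that a trailing partial group still gets launched (out-of-range thread ids are typically guarded inside the kernels). A sketch of the pattern; GetGroupsCount is an illustrative name, not a function from this file, and uint3/DivideRoundUp are the delegate types used above:

// Round a 3D grid up to whole work groups.
uint3 GetGroupsCount(const uint3& grid, const uint3& group_size) {
  return uint3(DivideRoundUp(grid.x, group_size.x),
               DivideRoundUp(grid.y, group_size.y),
               DivideRoundUp(grid.z, group_size.z));
}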

View File

@ -45,7 +45,7 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
const std::string barrier = device_info.IsWaveSizeEqualTo32() const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER" ? "SIMDGROUP_BARRIER"
: "threadgroup_barrier"; : "threadgroup_barrier";
const int src_depth = IntegralDivideRoundUp(src_channels, 4); const int src_depth = DivideRoundUp(src_channels, 4);
std::stringstream code; std::stringstream code;
code << R"( code << R"(
#include <metal_stdlib> #include <metal_stdlib>
@ -116,9 +116,8 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
} }
} }
)"; )";
const int src_depth_sub_groups = shared_memory const int src_depth_sub_groups = shared_memory ? DivideRoundUp(src_depth, 32)
? IntegralDivideRoundUp(src_depth, 32) : DivideRoundUp(src_depth, 4);
: IntegralDivideRoundUp(src_depth, 4);
return absl::Substitute(code.str(), src_depth_sub_groups, barrier); return absl::Substitute(code.str(), src_depth_sub_groups, barrier);
} }
} // namespace } // namespace
@ -146,7 +145,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
bool shared_memory = bool shared_memory =
device_info.IsAppleGPU() && device_info.IsAppleGPU() &&
device_info.apple_info.IsLocalMemoryPreferredOverGlobal(); device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4); const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
@ -179,8 +178,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
{"constant uniforms& params", {"constant uniforms& params",
[attr](const std::map<ValueId, BHWC>& buffers) { [attr](const std::map<ValueId, BHWC>& buffers) {
std::vector<uint32_t> uniform_params{ std::vector<uint32_t> uniform_params{
static_cast<uint32_t>( static_cast<uint32_t>(DivideRoundUp(attr.weights.shape.i, 4)),
IntegralDivideRoundUp(attr.weights.shape.i, 4)),
static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)), static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)),
static_cast<uint32_t>(attr.weights.shape.o), static_cast<uint32_t>(attr.weights.shape.o),
static_cast<uint32_t>(0), static_cast<uint32_t>(0),
@ -192,7 +190,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
int groups_x = IntegralDivideRoundUp(dst_channels_aligned, groups_size.x); int groups_x = DivideRoundUp(dst_channels_aligned, groups_size.x);
return std::make_pair(groups_size, uint3{groups_x, 1, 1}); return std::make_pair(groups_size, uint3{groups_x, 1, 1});
}; };
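The fully connected kernel pads both ends of the matrix before dispatch: the input depth is the number of 4-channel slices, then aligned to the reduction group (32 slices when threadgroup memory is used, 4 otherwise), and the output channels are aligned to 8. AlignByN can be expressed directly in terms of the renamed helper; a sketch with a hypothetical shape:

// Round value up to the next multiple of n (AlignByN behaves like this).
int AlignByN(int value, int n) { return DivideRoundUp(value, n) * n; }

// Hypothetical attr.weights.shape.i == 70 with shared memory enabled:
//   src_depth         = DivideRoundUp(70, 4) = 18 slices
//   src_depth_aligned = AlignByN(18, 32)     = 32 slices (the extra slices are padding)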

View File

@ -131,9 +131,9 @@ std::vector<ComputeTaskDescriptorPtr> MaxUnpooling(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, params); BHWC dst_shape = CalculateOutputShape(src_shape, params);
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4); int groups_z = DivideRoundUp(dst_shape.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -133,7 +133,7 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
[input_id, output_id, [input_id, output_id,
work_group_size](const std::map<ValueId, BHWC>& buffers) { work_group_size](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const int src_slices = IntegralDivideRoundUp(src_shape.c, 4); const int src_slices = DivideRoundUp(src_shape.c, 4);
struct uniforms { struct uniforms {
int4 src_size; int4 src_size;
float4 inv_multipliers; float4 inv_multipliers;
@ -153,8 +153,8 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
desc->resize_function = [output_id, work_group_size]( desc->resize_function = [output_id, work_group_size](
const std::map<ValueId, BHWC>& buffers) { const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second; BHWC dst_shape = buffers.find(output_id)->second;
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int groups_z = IntegralDivideRoundUp(dst_slices, work_group_size.z); const int groups_z = DivideRoundUp(dst_slices, work_group_size.z);
return std::make_pair(work_group_size, uint3{1, 1, groups_z}); return std::make_pair(work_group_size, uint3{1, 1, groups_z});
}; };
return {desc}; return {desc};

View File

@ -177,12 +177,12 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
dimension.w, dimension.w,
dimension.h, dimension.h,
dimension.c, dimension.c,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
// int4 dst_size // int4 dst_size
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
output_dimension.c, output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
// int4 prepended padding // int4 prepended padding
attr.prepended.w, attr.prepended.w,
attr.prepended.h, attr.prepended.h,
@ -198,10 +198,10 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr); BHWC dst_shape = CalculateOutputShape(src_shape, attr);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -224,11 +224,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
dimension.w, dimension.w,
dimension.h, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
dimension.w * dimension.h, dimension.w * dimension.h,
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
output_dimension.w * output_dimension.h, output_dimension.w * output_dimension.h,
params.strides.w, params.strides.w,
params.strides.h, params.strides.h,
@ -242,11 +242,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second; BHWC dst_shape = buffers.find(output_id)->second;
const uint3 grid = const uint3 grid =
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4)); uint3(dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -159,11 +159,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h, const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4)); DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -197,14 +197,14 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
const auto& dst_dim = buffers.find(output_id)->second; const auto& dst_dim = buffers.find(output_id)->second;
std::vector<int32_t> uniform_params{ std::vector<int32_t> uniform_params{
// int4 src_size // int4 src_size
src_dim.w, src_dim.h, IntegralDivideRoundUp(src_dim.c, 4), src_dim.w, src_dim.h, DivideRoundUp(src_dim.c, 4),
src_dim.w * src_dim.h, src_dim.w * src_dim.h,
// int4 dst_size // int4 dst_size
dst_dim.w, dst_dim.h, IntegralDivideRoundUp(dst_dim.c, 4), dst_dim.w, dst_dim.h, DivideRoundUp(dst_dim.c, 4),
dst_dim.w * dst_dim.h, dst_dim.w * dst_dim.h,
// int2 plane_xz // int2 plane_xz
src_dim.w * IntegralDivideRoundUp(src_dim.c, 4), src_dim.w * DivideRoundUp(src_dim.c, 4),
dst_dim.w * IntegralDivideRoundUp(dst_dim.c, 4), dst_dim.w * DivideRoundUp(dst_dim.c, 4),
0, // dummy, for alignment 0, // dummy, for alignment
0, // dummy, for alignment 0, // dummy, for alignment
0, // dummy, for alignment 0, // dummy, for alignment
@ -218,11 +218,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h, const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4)); DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -153,10 +153,10 @@ std::vector<ComputeTaskDescriptorPtr> Resize(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& dst_dim = buffers.find(output_id)->second; const auto& dst_dim = buffers.find(output_id)->second;
int groups_x = IntegralDivideRoundUp(dst_dim.w, groups_size.x); int groups_x = DivideRoundUp(dst_dim.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_dim.h, groups_size.y); int groups_y = DivideRoundUp(dst_dim.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_dim.c, 4); const int dst_layers = DivideRoundUp(dst_dim.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};

View File

@ -157,12 +157,12 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
dimension.w, dimension.w,
dimension.h, dimension.h,
dimension.c, dimension.c,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
// int4 dst_size // int4 dst_size
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
output_dimension.c, output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
}; };
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -173,10 +173,10 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr); BHWC dst_shape = CalculateOutputShape(src_shape, attr);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -169,8 +169,8 @@ std::vector<ComputeTaskDescriptorPtr> Softmax(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
uint3 groups_size{8, 4, 1}; uint3 groups_size{8, 4, 1};
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), 1}; DivideRoundUp(dimension.h, groups_size.y), 1};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };
@ -198,13 +198,13 @@ std::vector<ComputeTaskDescriptorPtr> Softmax1x1(int id, ValueId input_id,
desc->uniform_buffers = { desc->uniform_buffers = {
{"constant uniforms& params", {"constant uniforms& params",
[channels_count](const std::map<ValueId, BHWC>& buffers) { [channels_count](const std::map<ValueId, BHWC>& buffers) {
const int src_depth = IntegralDivideRoundUp(channels_count, 4); const int src_depth = DivideRoundUp(channels_count, 4);
struct uniforms { struct uniforms {
int4 size; int4 size;
float4 mask; float4 mask;
}; };
uniforms params; uniforms params;
params.size = {src_depth, IntegralDivideRoundUp(src_depth, 32), 1, 1}; params.size = {src_depth, DivideRoundUp(src_depth, 32), 1, 1};
params.mask = {0.0f, 0.0f, 0.0f, 0.0f}; params.mask = {0.0f, 0.0f, 0.0f, 0.0f};
const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4; const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4;
for (int i = 0; i < reminder; ++i) { for (int i = 0; i < reminder; ++i) {
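The Softmax1x1 uniforms above also carry a per-lane mask: when channels_count is not a multiple of 4, the last slice is only partially valid, and the loop marks exactly the valid lanes. A self-contained sketch of the idea, with std::array standing in for the delegate's float4 type:

#include <array>

// Lanes that hold real channels get 1.0f; padded lanes of the last slice stay 0.0f.
std::array<float, 4> LastSliceMask(int channels_count) {
  std::array<float, 4> mask = {0.0f, 0.0f, 0.0f, 0.0f};
  const int remainder = channels_count % 4 == 0 ? 4 : channels_count % 4;
  for (int i = 0; i < remainder; ++i) mask[i] = 1.0f;
  return mask;  // e.g. channels_count == 10 -> {1, 1, 0, 0}
}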

View File

@ -113,12 +113,12 @@ kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) {
input_shape.h / attr.block_size, input_shape.h / attr.block_size,
input_shape.w / attr.block_size, input_shape.w / attr.block_size,
input_shape.c * attr.block_size * attr.block_size); input_shape.c * attr.block_size * attr.block_size);
const uint3 grid = uint3(output_shape.w, output_shape.h, const uint3 grid =
IntegralDivideRoundUp(output_shape.c, 4)); uint3(output_shape.w, output_shape.h, DivideRoundUp(output_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
const int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); const int groups_x = DivideRoundUp(grid.x, groups_size.x);
const int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); const int groups_y = DivideRoundUp(grid.y, groups_size.y);
const int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); const int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z));
}; };
return {desc}; return {desc};

View File

@ -130,8 +130,8 @@ std::string GetDeconvolution(const ConvolutionTransposedAttributes& attr) {
constant_args, attr.padding.prepended.w, attr.padding.prepended.h, constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x, attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1); inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
return absl::Substitute(shader_source, src_depth * dst_channels_aligned, return absl::Substitute(shader_source, src_depth * dst_channels_aligned,
src_depth, dst_depth, attr.weights.shape.o, src_depth, dst_depth, attr.weights.shape.o,
@ -264,8 +264,8 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
constant_args, attr.padding.prepended.w, attr.padding.prepended.h, constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x, attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1); inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w; const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w;
const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h; const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h;
@ -464,7 +464,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
(kThreadGroupWidth + params.weights.shape.w) / params.stride.w; (kThreadGroupWidth + params.weights.shape.w) / params.stride.w;
const int src_local_size_y = const int src_local_size_y =
(kThreadGroupHeight + params.weights.shape.h) / params.stride.h; (kThreadGroupHeight + params.weights.shape.h) / params.stride.h;
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4); const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int shared_size = const int shared_size =
sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y; sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y;
if (shared_size < 1000 * 16 && if (shared_size < 1000 * 16 &&
@ -543,8 +543,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1}; const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1};
BHWC dst_shape = BHWC dst_shape =
CalculateOutputShape(buffers.find(input_id)->second, params); CalculateOutputShape(buffers.find(input_id)->second, params);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = 1; int groups_z = 1;
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -556,8 +556,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
int id, ValueId input_id, ValueId output_id, int id, ValueId input_id, ValueId output_id,
const ConvolutionTransposedAttributes& params, const ConvolutionTransposedAttributes& params,
const DeviceInfo& device_info, const RuntimeOptions& options) { const DeviceInfo& device_info, const RuntimeOptions& options) {
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4); const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(params.weights.shape.o, 4); const int dst_depth = DivideRoundUp(params.weights.shape.o, 4);
const int kernel_x = 4; const int kernel_x = 4;
const int kernel_y = 4; const int kernel_y = 4;
@ -645,7 +645,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
[input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) { [input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int src_depth = IntegralDivideRoundUp(src_shape.c, 4); const int src_depth = DivideRoundUp(src_shape.c, 4);
std::vector<int> uniform_params{ std::vector<int> uniform_params{
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
@ -653,7 +653,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
src_shape.w * src_shape.h, src_shape.w * src_shape.h,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
4 * 16 * src_depth, 4 * 16 * src_depth,
0, 0,
@ -667,13 +667,13 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
desc->resize_function = [output_id, block_size, desc->resize_function = [output_id, block_size,
params](const std::map<ValueId, BHWC>& buffers) { params](const std::map<ValueId, BHWC>& buffers) {
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dst_shape.w + 2, 2 * block_size.x); const int grid_x = DivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_shape.h + 2, 2 * block_size.y); const int grid_y = DivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); const int grid_z = DivideRoundUp(dst_shape.c, 4);
const uint3 group_size{8, 4, 1}; const uint3 group_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); int groups_x = DivideRoundUp(grid_x, group_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); int groups_y = DivideRoundUp(grid_y, group_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y}); return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y});
}; };
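The shared-memory variant of ConvolutionTransposed above is only selected when the per-group tile of source values fits a roughly 16 KB budget (the 1000 * 16 check). A rough worked example, assuming for illustration 16x16 thread groups, a 3x3 kernel, stride 2, and 32 input channels:

// src_local_size_x = (16 + 3) / 2 = 9
// src_local_size_y = (16 + 3) / 2 = 9
// src_depth        = DivideRoundUp(32, 4) = 8
// shared_size      = sizeof(float) * 4 * 8 * 9 * 9 = 10368 bytes, under the 16000-byte limit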

View File

@ -486,8 +486,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
BHWC dst_shape; BHWC dst_shape;
dst_shape.b = src_shape.b; dst_shape.b = src_shape.b;
dst_shape.h = 36; dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) * dst_shape.w =
IntegralDivideRoundUp(new_height, 4); DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c; dst_shape.c = src_shape.c;
return dst_shape; return dst_shape;
}}; }};
@ -501,16 +501,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
attr.padding.appended.w - 2; attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h + int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2; attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4); int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4); int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
@ -529,12 +529,12 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2; src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
int new_height = int new_height =
src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2; src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
int grid_x = IntegralDivideRoundUp(new_width, 4); int grid_x = DivideRoundUp(new_width, 4);
int grid_y = IntegralDivideRoundUp(new_height, 4); int grid_y = DivideRoundUp(new_height, 4);
int grid_z = IntegralDivideRoundUp(src_shape.c, 4); int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -563,8 +563,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
BHWC dst_shape; BHWC dst_shape;
dst_shape.b = src_shape.b; dst_shape.b = src_shape.b;
dst_shape.h = 36; dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) * dst_shape.w =
IntegralDivideRoundUp(new_height, 4); DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c; dst_shape.c = src_shape.c;
return dst_shape; return dst_shape;
}}; }};
@ -593,16 +593,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
attr.padding.appended.w - 2; attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h + int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2; attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4); int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4); int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
@ -619,10 +619,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
int grid_x = dst_shape.w; int grid_x = dst_shape.w;
int grid_y = 6; int grid_y = 6;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -665,8 +665,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.h, IntegralDivideRoundUp(src_shape.c, 4), 0, src_shape.w, src_shape.h, DivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4), 0, dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4), 0,
}; };
return GetByteBuffer(sizes); return GetByteBuffer(sizes);
}}, }},
@ -677,10 +677,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
int grid_x = src_shape.w; int grid_x = src_shape.w;
int grid_y = 1; int grid_y = 1;
int grid_z = IntegralDivideRoundUp(src_shape.c, 4); int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -734,16 +734,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
[input_id, output_id](const std::map<ValueId, BHWC>& buffers) { [input_id, output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
tiles_x, tiles_x,
tiles_y, tiles_y,
@ -757,14 +757,14 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
int grid_x = tiles_x * tiles_y; int grid_x = tiles_x * tiles_y;
int grid_y = 4; int grid_y = 4;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
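Across the Winograd kernels above, the tile arithmetic determines both the intermediate layout and the dispatch. A worked example with a hypothetical 13x9 input and 1 pixel of padding on each side feeding Winograd4x4To36:

// new_width  = 13 + 1 + 1 - 2 = 13  ->  tiles_x = DivideRoundUp(13, 4) = 4
// new_height =  9 + 1 + 1 - 2 =  9  ->  tiles_y = DivideRoundUp(9, 4)  = 3
// dst_shape.w = tiles_x * tiles_y = 12, dst_shape.h = 36 (6x6 transformed
// values per 4x4 output tile), dst_shape.c = src_shape.c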