Rename IntegralDivideRoundUp to DivideRoundUp.

PiperOrigin-RevId: 307447663
Change-Id: I1e0f6c9f058e3f0457a7522f1d10f7da8ab8610d
Authored by Juhyun Lee on 2020-04-20 12:02:49 -07:00; committed by TensorFlower Gardener
parent 55c4d9e49c
commit b26b6b5669
80 changed files with 435 additions and 452 deletions
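For reference, the renamed helper performs integer ceiling division: DivideRoundUp(n, d) returns the smallest integer q such that q * d >= n, for non-negative n and positive d. A minimal standalone sketch (editorial illustration, mirroring the util.h definition and the unit-test values that appear later in this diff):

#include <cassert>

// Standalone copy of the renamed helper, for illustration only; the body
// mirrors the tflite GPU util.h definition changed in this commit.
template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}

int main() {
  // These values match the updated UtilTest.DivideRoundUp expectations.
  assert(DivideRoundUp(0, 256) == 0);
  assert(DivideRoundUp(255, 256) == 1);
  assert(DivideRoundUp(256, 256) == 1);
  assert(DivideRoundUp(257, 256) == 2);
  // Typical call site in the GPU delegate: channels packed into 4-wide slices.
  assert(DivideRoundUp(10, 4) == 3);
  return 0;
}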


@ -54,7 +54,7 @@ namespace gpu {
// H - height
// W - width
// C - channels
// D - depth := IntegralDivideRoundUp(C, 4)
// D - depth := DivideRoundUp(C, 4)
// C4 - is the constant = 4.
enum class DataLayout {
UNKNOWN,
@ -164,7 +164,7 @@ struct Dimensions {
Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
: b(batch), h(height), w(width), c(channels) {}
int32_t d() const { return IntegralDivideRoundUp(c, 4); }
int32_t d() const { return DivideRoundUp(c, 4); }
int32_t product() const { return b * h * w * c; }


@ -70,10 +70,10 @@ std::string Add::GetElementWiseCode(
Add::Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels)
: ElementwiseOperation(definition),
dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
dst_depth_(DivideRoundUp(dst_channels, 4)) {
src_depthes_.resize(channels.size());
for (int i = 0; i < channels.size(); ++i) {
src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
src_depthes_[i] = DivideRoundUp(channels[i], 4);
}
}


@ -79,7 +79,7 @@ std::string GetConcatKernelCode(
// generation.
c += " int Z = 0;\n";
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
if (depth % 2 == 0) {
// We can read more at once inside of loop in case depth % 2 == 0
// it should be better for reading latency hiding
@ -112,7 +112,7 @@ std::string GetConcatKernelCode(
int read_index = 0;
int z = 0;
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4);
const std::string temp_name = "t" + std::to_string(read_index);


@ -128,24 +128,24 @@ absl::Status Conv3D::BindArguments() {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
return absl::OkStatus();
}
int3 Conv3D::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
int3 wg;
wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *
@ -885,8 +885,8 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
Conv3D::ConvParams Conv3D::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const Convolution3DAttributes& attr) const {
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
attr.dilations.w == 1 &&
attr.padding.prepended.w == 0 &&


@ -147,8 +147,8 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;
@ -219,8 +219,8 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;


@ -307,7 +307,7 @@ absl::Status ConvBuffer1x1::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
const int src_width_elements = IntegralDivideRoundUp(
const int src_width_elements = DivideRoundUp(
src_[0]->Width() * src_[0]->Batch(), (conv_params_.element_size / 4));
int4 src_size = int4(src_width_elements, src_[0]->Height(), src_[0]->Slices(),
src_width_elements * src_[0]->Height());
@ -317,14 +317,14 @@ absl::Status ConvBuffer1x1::BindArguments() {
}
int3 ConvBuffer1x1::GetGridSize() const {
const int dst_width_elements = IntegralDivideRoundUp(
const int dst_width_elements = DivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
const int grid_x =
IntegralDivideRoundUp(dst_width_elements, conv_params_.block_size.x);
DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
return int3(grid_x, grid_y, grid_z);
}
@ -358,8 +358,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
if (!IsConvBuffer1x1Supported(definition, attr)) {
return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
}
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
@ -376,8 +376,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
@ -396,8 +396,8 @@ absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,


@ -135,8 +135,8 @@ absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvBuffer1x1::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);


@ -40,9 +40,9 @@ std::string GenerateConvolutionConstantCode(
std::string c = GetCommonDefines(op_def.precision);
const int out_z = IntegralDivideRoundUp(dst_channels, 4);
const int out_z = DivideRoundUp(dst_channels, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
const int src_depth = DivideRoundUp(src_channels, 4);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
@ -290,7 +290,7 @@ bool IsConvConstantsSupported(const CLDevice& device,
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}


@ -88,7 +88,7 @@ class ConvConstants : public GPUOperation {
template <DataType T>
absl::Status ConvConstants::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@ -112,8 +112,8 @@ absl::Status ConvConstants::UploadWeights(
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;


@ -205,8 +205,8 @@ absl::Status ConvPowerVR::BindArguments() {
kernel_dilation_.z * src_[0]->Batch(), kernel_dilation_.w)));
}
if (conv_params_.linear_hw) {
const int grid_x = IntegralDivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
RETURN_IF_ERROR(kernel_.SetBytesAuto(grid_x));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
@ -215,27 +215,26 @@ absl::Status ConvPowerVR::BindArguments() {
}
int3 ConvPowerVR::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
int3 wg;
if (conv_params_.linear_hw) {
wg.x =
IntegralDivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.y);
wg.x = DivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_z, conv_params_.work_group_size.y);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *
conv_params_.work_group_size.y,
1);
} else {
wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *
@ -808,8 +807,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
attr.dilations.w == 1 &&
attr.padding.prepended.w == 0 &&
@ -825,8 +824,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const FullyConnectedAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, false, dst_shape);
params.work_group_size.x *= params.work_group_size.y;
@ -839,8 +838,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
const CLDevice& device, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, true, dst_shape);
params.block_size.x *= params.block_size.y;


@ -188,8 +188,8 @@ absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvPowerVR::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);


@ -433,9 +433,9 @@ absl::Status ConvTexture::BindArguments() {
int3 ConvTexture::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}


@ -148,9 +148,9 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvTexture::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@ -206,9 +206,9 @@ template <DataType S, typename T>
void ConvTexture::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;


@ -322,7 +322,7 @@ ConvolutionTransposed::ConvolutionTransposed(
block_size_ = is_f16 ? int3(2, 2, 2) : int3(2, 2, 1);
}
}
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device.IsMali()) {
block_size_.y *= block_size_.z;
@ -406,10 +406,9 @@ absl::Status ConvolutionTransposed::BindArguments() {
int3 ConvolutionTransposed::GetGridSize() const {
const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
const int grid_x =
IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}


@ -91,8 +91,8 @@ template <DataType T>
absl::Status ConvolutionTransposed::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int texture_width = dst_depth;
@ -160,8 +160,8 @@ template <DataType S, typename T>
void ConvolutionTransposed::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int texture_width = dst_depth;


@ -440,8 +440,8 @@ absl::Status ConvolutionTransposed3D::BindArguments() {
if (definition_.IsBatchSupported()) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(
IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w)));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(DivideRoundUp(dst_[0]->Slices(), block_size_.w)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHDS()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHDS()));
return absl::OkStatus();
@ -451,11 +451,10 @@ int3 ConvolutionTransposed3D::GetGridSize() const {
const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
const int grid_x =
IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w) *
IntegralDivideRoundUp(aligned_d, block_size_.z);
const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
DivideRoundUp(aligned_d, block_size_.z);
return int3(grid_x, grid_y, grid_z);
}


@ -91,8 +91,8 @@ template <DataType T>
absl::Status ConvolutionTransposed3D::UploadWeights(
const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;
@ -162,8 +162,8 @@ template <DataType S, typename T>
void ConvolutionTransposed3D::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.w);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;


@ -339,14 +339,13 @@ absl::Status ConvolutionTransposed3x3::BindArguments() {
}
int3 ConvolutionTransposed3x3::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
int3 wg;
wg.x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
wg.y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
wg.z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
wg.x = DivideRoundUp(grid_x, work_group_size_.x);
wg.y = DivideRoundUp(grid_y, work_group_size_.y);
wg.z = DivideRoundUp(grid_z, work_group_size_.z);
return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
wg[work_group_launch_order_[1]] * work_group_size_.y,
wg[work_group_launch_order_[2]] * work_group_size_.z);


@ -84,8 +84,8 @@ class ConvolutionTransposed3x3 : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed3x3::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@ -109,8 +109,8 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed3x3::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3;
const int kernel_y = 3;


@ -224,8 +224,8 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
absl::Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
definition_, biases_, DivideRoundUp(src_channels_, 4),
DivideRoundUp(dst_channels_, 4), *creation_context.device,
linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,


@ -82,8 +82,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@ -108,8 +108,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;
const int kernel_y = 3;


@ -332,9 +332,8 @@ absl::Status ConvolutionTransposed4x4::BindArguments() {
}
int3 ConvolutionTransposed4x4::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height() + 2, 2);
const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
const int grid_z = dst_[0]->Slices();
return int3(grid_x, grid_y, grid_z);
}


@ -82,8 +82,8 @@ class ConvolutionTransposed4x4 : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed4x4::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 4; // This operation support only 4x4 kernel
const int kernel_y = 4;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@ -107,8 +107,8 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed4x4::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 4;
const int kernel_y = 4;


@ -187,8 +187,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
absl::Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
kernel_size_, *creation_context.device, linked_operations_);
definition_, DivideRoundUp(src_channels_, 4), dst_channels_, kernel_size_,
*creation_context.device, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&


@ -82,7 +82,7 @@ class ConvolutionTransposedThin : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposedThin::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int elements_count =
kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;
@ -104,7 +104,7 @@ absl::Status ConvolutionTransposedThin::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;


@ -89,7 +89,7 @@ template <DataType T>
absl::Status DepthwiseConvolution::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int dst_depth = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@ -137,7 +137,7 @@ template <DataType S, typename T>
void DepthwiseConvolution::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int dst_depth = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;


@ -88,7 +88,7 @@ template <DataType T>
absl::Status DepthwiseConvolution3D::UploadWeights(
const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;
@ -130,7 +130,7 @@ template <DataType S, typename T>
void DepthwiseConvolution3D::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;


@ -323,8 +323,8 @@ absl::Status DepthwiseConv3x3::BindArguments() {
}
int3 DepthwiseConv3x3::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
return int3(grid_x, grid_y, grid_z);
}


@ -83,7 +83,7 @@ template <DataType T>
absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
int texture_width = 10; // 3x3 kernel + 1 bias
int texture_height = src_depth;
const int elements_count = texture_width * texture_height;
@ -129,7 +129,7 @@ template <DataType S, typename T>
void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
const tflite::gpu::Tensor<OHWI, S>& weights,
const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {


@ -70,8 +70,8 @@ class FullyConnected : public GPUOperation {
template <DataType T>
absl::Status FullyConnected::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int elements_count = src_depth * dst_depth * 4;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
@ -94,8 +94,8 @@ absl::Status FullyConnected::UploadWeights(
template <DataType T, typename S>
void FullyConnected::RearrangeWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<S> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {


@ -129,8 +129,7 @@ absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
const int depth = src_[0]->Slices();
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int2(depth, IntegralDivideRoundUp(depth, 32))));
RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(depth, DivideRoundUp(depth, 32))));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels())));


@ -234,12 +234,12 @@ template <DataType S, typename T>
void RearrangeWeightsToOHWIOGroupI4O4(
const tflite::gpu::Tensor<OHWI, S>& weights, int out_group_size,
absl::Span<T> dst) {
const int dst_slices = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_slices = DivideRoundUp(weights.shape.o, 4);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int dst_groups = IntegralDivideRoundUp(dst_slices, out_group_size);
const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
int counter = 0;
for (int d = 0; d < dst_groups; ++d) {


@ -436,9 +436,9 @@ absl::Status Winograd4x4To36::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
const int tiles_x = IntegralDivideRoundUp(
const int tiles_x = DivideRoundUp(
src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
const int tiles_y = IntegralDivideRoundUp(
const int tiles_y = DivideRoundUp(
src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
const int tiles_total = tiles_x * tiles_y;
RETURN_IF_ERROR(
@ -550,14 +550,14 @@ absl::Status Winograd36To4x4::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x));
return absl::OkStatus();
}
int3 Winograd36To4x4::GetGridSize() const {
const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
const int tiles_y = IntegralDivideRoundUp(dst_[0]->Height(), 4);
const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
const int grid_y = 4;
const int grid_z = dst_[0]->Slices();


@ -187,7 +187,7 @@ int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
int3 GetWorkGroup(const int3& grid, int max_size) {
int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
int wg_xy_size = max_size / wg_z;
int wg_x = std::min(IntegralDivideRoundUp(grid.x, 2), wg_xy_size);
int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
int wg_y = std::min(wg_xy_size / wg_x, grid.y);
return int3(wg_x, wg_y, wg_z);
}
@ -231,12 +231,12 @@ absl::Status GetBestWorkGroupXY128Linear(const TuningParameters& params,
}
bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
int planar_work_groups = IntegralDivideRoundUp(width * height, 128);
int planar_work_groups = DivideRoundUp(width * height, 128);
auto base_work_groups = Get2DWorkgroupsEqualTo128();
bool have_equal_work_groups = false;
for (auto& work_group : base_work_groups) {
int x_groups = IntegralDivideRoundUp(width, work_group.x);
int y_groups = IntegralDivideRoundUp(height, work_group.y);
int x_groups = DivideRoundUp(width, work_group.x);
int y_groups = DivideRoundUp(height, work_group.y);
int xy_groups = x_groups * y_groups;
if (xy_groups == planar_work_groups) {
have_equal_work_groups = true;


@ -101,7 +101,7 @@ absl::Status CreateLinearStorage(const LinearStorageCreateInfo& creation_info,
CLContext* context, LinearStorage* result) {
int size = creation_info.aligned_size != 0 ? creation_info.aligned_size
: tensor.shape.v;
const int depth = IntegralDivideRoundUp(size, 4);
const int depth = DivideRoundUp(size, 4);
if (creation_info.data_type == DataType::FLOAT32) {
std::vector<float4> gpu_data(depth);
CopyLinearFLT4(tensor, absl::MakeSpan(gpu_data));


@ -56,10 +56,10 @@ bool IsChannelsBroadcastedForSecondInput(const std::vector<Value*>& inputs) {
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const CLDevice& device,
const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@ -82,8 +82,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
return absl::UnimplementedError("No implementation for this case.");
}
const int tiles_x = IntegralDivideRoundUp(output_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(output_shape.h, 4);
const int tiles_x = DivideRoundUp(output_shape.w, 4);
const int tiles_y = DivideRoundUp(output_shape.h, 4);
const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor td_0;


@ -28,7 +28,7 @@ namespace cl {
bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
const BHWDC& shape,
const TensorDescriptor& descriptor) {
const int slices = IntegralDivideRoundUp(shape.c, 4);
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
case TensorStorageType::BUFFER: {
const int flt4_size =


@ -65,11 +65,10 @@ absl::Status CreateTensor(const CLContext& context, const CLDevice& device,
}
if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
cl_mem image_memory;
RETURN_IF_ERROR(
CreateImageBufferFromBuffer(context, memory, descriptor.data_type,
shape.b * shape.w * shape.h * shape.d *
IntegralDivideRoundUp(shape.c, 4),
&image_memory));
RETURN_IF_ERROR(CreateImageBufferFromBuffer(
context, memory, descriptor.data_type,
shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4),
&image_memory));
*result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
} else {
*result = Tensor(memory, memory_owner, shape, descriptor);
@ -386,7 +385,7 @@ absl::Status AllocateTensorMemory(const CLContext& context,
const CLDevice& device, const BHWDC& shape,
const TensorDescriptor& descriptor,
CLMemory* result) {
const int slices = IntegralDivideRoundUp(shape.c, 4);
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
case TensorStorageType::BUFFER:
case TensorStorageType::IMAGE_BUFFER: {


@ -61,7 +61,7 @@ class Tensor {
int Height() const { return shape_.h; }
int Depth() const { return shape_.d; }
int Channels() const { return shape_.c; }
int Slices() const { return IntegralDivideRoundUp(shape_.c, 4); }
int Slices() const { return DivideRoundUp(shape_.c, 4); }
int Batch() const { return shape_.b; }
// returns int4(width * batch, height, slices, batch)


@ -44,12 +44,11 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
}
float* output = out.data();
for (int p = 0; p < IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane);
++p) {
for (int p = 0; p < DivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) {
for (int h = 0; h < shape.h; ++h) {
for (int w = 0; w < shape.w; ++w) {
for (int c = 0;
c < IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) {
for (int c = 0; c < DivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane);
++c) {
for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) {
for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) {
float value = 0;
@ -106,7 +105,7 @@ std::vector<float> ConvertToPHWO4I4Transposed(
uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
return uint3(AlignByN(shape.i, 4), shape.h * shape.w,
IntegralDivideRoundUp(shape.o, 4));
DivideRoundUp(shape.o, 4));
}
// Layout is Po,H,W,OI4x4.
@ -123,8 +122,8 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
}
const int dst_depth = IntegralDivideRoundUp(shape.o, 4);
const int src_depth = IntegralDivideRoundUp(shape.i, 4);
const int dst_depth = DivideRoundUp(shape.o, 4);
const int src_depth = DivideRoundUp(shape.i, 4);
float* output = out.data();
for (int f = 0; f < dst_depth; ++f) {
@ -178,8 +177,7 @@ absl::Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
}
int32_t output_channels = shape.o * shape.i;
int32_t num_planes =
IntegralDivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
int32_t num_planes = DivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
float* output = out.data();
for (int p = 0; p < num_planes; ++p) {
for (int h = 0; h < shape.h; ++h) {
@ -232,7 +230,7 @@ absl::Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus();
}
// Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -281,7 +279,7 @@ absl::Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
// Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -407,7 +405,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus();
}
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
@ -449,7 +447,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in,
const BHWC& shape, absl::Span<float> out) {
RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images.


@ -209,7 +209,7 @@ OperationType OperationTypeFromString(const std::string& name) {
namespace {
template <typename T>
T IntegralDivideRoundUp(T n, T divisor) {
T DivideRoundUp(T n, T divisor) {
return (n - 1) / divisor + 1;
}
@ -272,7 +272,7 @@ int32_t CalculateOutput(const BHWDC& input,
}
inline int32_t StridedSize(int32_t size, int32_t stride) {
return stride == 0 ? -1 : IntegralDivideRoundUp(size, stride);
return stride == 0 ? -1 : DivideRoundUp(size, stride);
}
template <Axis AxisT, typename AttrT>


@ -24,24 +24,23 @@ namespace gpu {
// @param n must be non negative
// @param divisor must be greater than zero
template <typename T, typename N>
T IntegralDivideRoundUp(T n, N divisor) {
T DivideRoundUp(T n, N divisor) {
const T div = static_cast<T>(divisor);
const T q = n / div;
return n % div == 0 ? q : q + 1;
}
template <>
inline uint3 IntegralDivideRoundUp(uint3 n, uint3 divisor) {
return uint3(IntegralDivideRoundUp(n.x, divisor.x),
IntegralDivideRoundUp(n.y, divisor.y),
IntegralDivideRoundUp(n.z, divisor.z));
inline uint3 DivideRoundUp(uint3 n, uint3 divisor) {
return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y),
DivideRoundUp(n.z, divisor.z));
}
// @param number or its components must be greater than zero
// @param n must be greater than zero
template <typename T, typename N>
T AlignByN(T number, N n) {
return IntegralDivideRoundUp(number, n) * n;
return DivideRoundUp(number, n) * n;
}
} // namespace gpu
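As a usage note, AlignByN composes with DivideRoundUp to round a value up to the nearest multiple of n; this is how the kernels in this commit pad channel counts to groups of 4 and round grid dimensions up to work-group or block sizes. A small self-contained sketch of the two templates above (illustrative copy, not part of the diff):

#include <cassert>

// Standalone copies of the two helpers from the hunk above.
template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}

template <typename T, typename N>
T AlignByN(T number, N n) {
  return DivideRoundUp(number, n) * n;
}

int main() {
  // 10 channels occupy 3 four-channel slices and are padded to 12 channels.
  assert(DivideRoundUp(10, 4) == 3);
  assert(AlignByN(10, 4) == 12);
  return 0;
}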


@ -24,16 +24,16 @@ namespace {
using testing::Eq;
TEST(UtilTest, IntegralDivideRoundUp) {
EXPECT_THAT(IntegralDivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(IntegralDivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(IntegralDivideRoundUp(257, 256), Eq(2));
TEST(UtilTest, DivideRoundUp) {
EXPECT_THAT(DivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(DivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(DivideRoundUp(257, 256), Eq(2));
}
TEST(UtilTest, AlignByN) {


@ -34,9 +34,9 @@ void AddCornerCases(const T& grid, int max_work_group_total_size,
for (int x = 1; x <= 4; ++x) {
for (int y = 1; y <= 4; ++y) {
for (int z = 1; z <= 4; ++z) {
int wg_x = IntegralDivideRoundUp(grid.x, x);
int wg_y = IntegralDivideRoundUp(grid.y, y);
int wg_z = IntegralDivideRoundUp(grid.z, z);
int wg_x = DivideRoundUp(grid.x, x);
int wg_y = DivideRoundUp(grid.y, y);
int wg_z = DivideRoundUp(grid.z, z);
if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y ||
wg_z > max_work_group_sizes.z ||
wg_x * wg_y * wg_z > max_work_group_total_size) {


@ -201,7 +201,7 @@ class CompiledModelImpl
ShaderCode code) {
// Calculate workgroup size.
uint3 workgroup_size = workgroup_calculator.Calculate(code);
uint3 num_workgroups = IntegralDivideRoundUp(code.workload, workgroup_size);
uint3 num_workgroups = DivideRoundUp(code.workload, workgroup_size);
for (const auto& object : code.objects) {
if (IsRef(object)) {


@ -569,7 +569,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
} else {
shader_index = it->second;
}
auto num_workgroups = IntegralDivideRoundUp(code.workload, workgroup);
auto num_workgroups = DivideRoundUp(code.workload, workgroup);
return runtime_ptr->AddProgram(shaders[shader_index], code.parameters,
code.objects, num_workgroups);
}));


@ -190,8 +190,7 @@ class CompilerImpl : public Compiler {
"Workload uint3() requires all output sizes to match");
}
}
attr.code.workload =
uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4));
attr.code.workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
}
int num_textures = 0;


@ -88,8 +88,8 @@ absl::Status ConverterBhwcToPhwc4::Convert(const BHWC& shape,
return absl::UnimplementedError(
"BhwcToPhwc4: Batch size is not equal to 1.");
}
uint3 workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4));
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
RETURN_IF_ERROR(program_.SetParameter(
{"sizes_",


@ -83,7 +83,7 @@ absl::Status ConverterPhwc4ToBhwc::Convert(const BHWC& shape,
}
uint3 workload = uint3(shape.w, shape.h, shape.c);
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
// TODO(akulik): simply pass workload as soon as UniformParameter
// supports uint3


@ -102,7 +102,7 @@ class Add : public NodeShader {
// Declare workload explicitly because shader depends on gid.z.
/*workload=*/
uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1],
IntegralDivideRoundUp(ctx.input_shapes[0][3], 4)),
DivideRoundUp(ctx.input_shapes[0][3], 4)),
/*workgroup=*/uint3(),
/*source_code=*/"value_0 += $add_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO,


@ -210,7 +210,7 @@ vec4 val = vec4(0.0f);
// * - you are going to write into these cells
// @ - you will fill these cells next cycles
// ^ - first elem you start writing from
int blocks_amount = IntegralDivideRoundUp<int>(in_ch, 4);
int blocks_amount = DivideRoundUp<int>(in_ch, 4);
code += "// Aligned case\n";
code += "// I'm going to make " + std::to_string(blocks_amount) +
" write(s)\n\n";


@ -55,7 +55,7 @@ class Convolution : public NodeShader {
{"dilation_h", attr.dilations.h},
{"kernel_w", weights.w},
{"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
} else {
@ -71,7 +71,7 @@ class Convolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count},
{"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
}
@ -181,14 +181,14 @@ class Convolution1x1 : public NodeShader {
std::vector<Variable> parameters = {
{"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
};
std::vector<std::pair<std::string, Object>> objects = {
{"weights", MakeReadonlyObject(
uint3(4, IntegralDivideRoundUp(attr.weights.shape.i, 4),
IntegralDivideRoundUp(attr.weights.shape.o, 4)),
ConvertToPHWO4I4(attr.weights))}};
{"weights",
MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
DivideRoundUp(attr.weights.shape.o, 4)),
ConvertToPHWO4I4(attr.weights))}};
std::string source;
for (int i = 0; i < multiplier; i++) {
absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
@ -224,7 +224,7 @@ class Convolution1x1 : public NodeShader {
absl::StrAppend(&source, "value_0 = result0;\n");
}
auto dst_depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4);
auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
uint3 workgroup = uint3(16, 16, 1);
if (ctx.gpu_info->type == GpuType::ADRENO) {
if (dst_depth >= 2) {
@ -265,7 +265,7 @@ class Convolution1x1 : public NodeShader {
/*shared_variables=*/{},
/*workload=*/
uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
IntegralDivideRoundUp(ctx.output_shapes[0][3], 4)),
DivideRoundUp(ctx.output_shapes[0][3], 4)),
/*workgroup=*/
GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,


@ -64,7 +64,7 @@ class OpenGlConverterImpl : public TensorObjectConverter {
}
absl::Status Dispatch(const uint3& workload) {
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
if (command_queue_) {
return command_queue_->Dispatch(program_, num_workgroups);
}
@ -256,7 +256,7 @@ class ToTensorConverter : public OpenGlConverterImpl {
return absl::InvalidArgumentError(
"ToTensorConverter: output data size does not match expected size.");
}
auto d = IntegralDivideRoundUp(shape_.c, 4);
auto d = DivideRoundUp(shape_.c, 4);
RETURN_IF_ERROR(program_.SetParameter(
{"sizes",
int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h),


@ -54,7 +54,7 @@ class DepthwiseConvolution : public NodeShader {
{"dilation_h", attr.dilations.h},
{"kernel_w", weights.w},
{"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
@ -71,7 +71,7 @@ class DepthwiseConvolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count},
{"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)},
};


@ -39,8 +39,8 @@ class FullyConnectedBuffers : public NodeShader {
const auto& attr =
absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
// This shader can work with any workgroup size, the values below work well
// for OpenGL.


@ -105,10 +105,9 @@ absl::Status GenerateMultiplyScalarCode(
/*shared_variables=*/{},
// Declare workload explicitly because shader depends on gid.z.
/*workload=*/
uint3(
static_cast<int>(ctx.input_shapes[0][2]),
static_cast<int>(ctx.input_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
uint3(static_cast<int>(ctx.input_shapes[0][2]),
static_cast<int>(ctx.input_shapes[0][1]),
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
/*workgroup=*/uint3(),
/*source_code=*/"value_0 *= $mul_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO,


@ -98,8 +98,8 @@ class Pad : public NodeShader {
source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n";
} else if (attr.prepended.c % 4 == 0) {
parameters.push_back(
{"src_slices", IntegralDivideRoundUp(
static_cast<int>(ctx.input_shapes[0][3]), 4)});
{"src_slices",
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)});
source += R"(
int src_z = gid.z - $prepended.z$ / 4;
if (src_z >= 0 && src_z < $src_slices$) {


@ -69,8 +69,8 @@ class PReLULinearAlpha : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, "
@ -98,10 +98,10 @@ class PReLUFull : public NodeShader {
"Alpha shape does not match input shape.");
}
ObjectSize obj_size = uint3(
static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
ObjectSize obj_size =
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
*generated_code =
attr.clip
@ -116,8 +116,8 @@ class PReLUFull : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = clamp(value_0, 0.0, $clip$) + "
@ -136,8 +136,8 @@ class PReLUFull : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ "

View File

@ -60,13 +60,13 @@ class Softmax : public NodeShader {
private:
absl::Status GenerateCodeFor1x1(const GenerationContext& ctx,
GeneratedCode* generated_code) const {
const int depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4);
const int depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
std::vector<Variable> shared_variables = {
{"partial_sum", std::vector<float4>(8)},
};
std::vector<Variable> uniform_parameters = {
{"depth", depth},
{"depth_div_32", IntegralDivideRoundUp(depth, 32)},
{"depth_div_32", DivideRoundUp(depth, 32)},
{"mask", GetMask(ctx.output_shapes[0][3])},
};
std::string source_code = R"(
@ -138,7 +138,7 @@ class Softmax : public NodeShader {
GeneratedCode* generated_code) const {
std::vector<Variable> parameters = {
{"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
{"mask", GetMask(ctx.output_shapes[0][3])},
};
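A worked example of the two uniforms above, with an assumed channel count:

// Illustration only: a 1001-channel softmax head (channel count assumed).
constexpr int kChannels = 1001;
constexpr int kDepth = (kChannels + 3) / 4;        // DivideRoundUp -> 251 slices
constexpr int kDepthDiv32 = (kDepth + 31) / 32;    // DivideRoundUp -> 8 chunks per 32-wide reduction
static_assert(kDepth == 251 && kDepthDiv32 == 8, "worked example");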

View File

@ -44,7 +44,7 @@ class ConvolutionTransposedBuffers : public NodeShader {
std::vector<Variable> parameters = {
{"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"kernel_size", int2(weights.w, weights.h)},
{"stride", int2(attr.stride.w, attr.stride.h)},
{"padding", int2(weights.w - 1 - attr.padding.prepended.w,

View File

@ -153,17 +153,17 @@ inline Object MakeReadonlyBuffer(const ObjectSize& size,
inline Object MakeReadonlyObject(const std::vector<float>& data) {
return MakeReadonlyObject(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}
inline Object MakeReadonlyTexture(const std::vector<float>& data) {
return MakeReadonlyTexture(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}
inline Object MakeReadonlyBuffer(const std::vector<float>& data) {
return MakeReadonlyBuffer(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}
// TODO(akulik): find better place for functions below.
@ -172,7 +172,7 @@ inline uint3 GetPHWC4Size(const BHWC& shape) {
uint3 size;
size.x = shape.w;
size.y = shape.h;
size.z = shape.b * IntegralDivideRoundUp(shape.c, 4);
size.z = shape.b * DivideRoundUp(shape.c, 4);
return size;
}
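A quick worked example of GetPHWC4Size above, with made-up dimensions:

// Assumed shape: batch 1, height 8, width 10, channels 7.
constexpr int kB = 1, kH = 8, kW = 10, kC = 7;
constexpr int kSlices = (kC + 3) / 4;              // DivideRoundUp(7, 4) == 2
static_assert(kW == 10 && kH == 8 && kB * kSlices == 2, "uint3(10, 8, 2)");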

View File

@ -167,10 +167,10 @@ std::vector<ComputeTaskDescriptorPtr> SelectWinograd36To4x4(
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@ -229,8 +229,8 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) {
int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
int tiles_x = DivideRoundUp(dst_shape.w, 4);
int tiles_y = DivideRoundUp(dst_shape.h, 4);
Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding;
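To make the tile arithmetic above concrete (the output size is assumed for illustration):

// Assumed 112x112 output of a 3x3, stride-1, dilation-1 convolution.
constexpr int kDstW = 112, kDstH = 112;
constexpr int kTilesX = (kDstW + 3) / 4;           // DivideRoundUp(112, 4) == 28
constexpr int kTilesY = (kDstH + 3) / 4;           // 28
static_assert(kTilesX * kTilesY == 784, "784 4x4 tiles feed the 36-element transform");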

View File

@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h"
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::BHWC;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::metal::CreateComputeProgram;
@implementation TFLBufferConvert {
@ -102,10 +102,10 @@ using ::tflite::gpu::metal::CreateComputeProgram;
[encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2];
MTLSize group_size = MTLSizeMake(16, 16, 1);
int layers = IntegralDivideRoundUp(shape.c, 4);
int groups_x = IntegralDivideRoundUp(shape.w, group_size.width);
int groups_y = IntegralDivideRoundUp(shape.h, group_size.height);
int groups_z = IntegralDivideRoundUp(layers, group_size.depth);
int layers = DivideRoundUp(shape.c, 4);
int groups_x = DivideRoundUp(shape.w, group_size.width);
int groups_y = DivideRoundUp(shape.h, group_size.height);
int groups_z = DivideRoundUp(layers, group_size.depth);
MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z);
[encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size];
}
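The dispatch math above, restated as a plain C++ sketch (the 16x16x1 group size comes from the snippet; the image shape is assumed):

// Assumed image: 640x480 with 3 channels.
constexpr int kW = 640, kH = 480, kC = 3;
constexpr int kLayers = (kC + 3) / 4;              // DivideRoundUp(3, 4) == 1
constexpr int kGroupsX = (kW + 15) / 16;           // 40 threadgroups along x
constexpr int kGroupsY = (kH + 15) / 16;           // 30 threadgroups along y
constexpr int kGroupsZ = kLayers;                  // DivideRoundUp(1, 1) == 1
static_assert(kGroupsX == 40 && kGroupsY == 30 && kGroupsZ == 1, "40x30x1 dispatch");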

View File

@ -440,9 +440,9 @@ ComputeTaskDescriptorPtr NonLinkableStub(int operation_id, ValueId input_id,
desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(input_id)->second;
uint3 groups_size{16, 16, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)};
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
DivideRoundUp(dimension.h, groups_size.y),
DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count);
};

View File

@ -71,7 +71,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
// Also it is easy to write a loop in this case, to prevent long kernel
// generation.
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i);
c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n";
c += " int src_index = i * U.src_size.w + xy_offset;\n";
@ -88,7 +88,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
int read_index = 0;
int z = 0;
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i);
for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4);
@ -168,11 +168,11 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
std::vector<int> uniform_params{
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
src_shape.w * src_shape.h,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
dst_shape.w * dst_shape.h,
};
return GetByteBuffer(uniform_params);
@ -184,9 +184,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
uint3 grid(dst_shape.w, dst_shape.h, 1);
uint3 group_size{8u, 4u, 1u};
uint3 groups;
groups.x = IntegralDivideRoundUp(grid.x, group_size.x);
groups.y = IntegralDivideRoundUp(grid.y, group_size.y);
groups.z = IntegralDivideRoundUp(grid.z, group_size.z);
groups.x = DivideRoundUp(grid.x, group_size.x);
groups.y = DivideRoundUp(grid.y, group_size.y);
groups.z = DivideRoundUp(grid.z, group_size.z);
return std::make_pair(group_size, groups);
};
@ -265,7 +265,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
[output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
/*padding=*/0};
return GetByteBuffer(uniform_params);
}},
@ -274,9 +274,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4);
int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
@ -356,7 +356,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
[output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
/*padding=*/0};
return GetByteBuffer(uniform_params);
}},
@ -365,9 +365,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4);
int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
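For the per-input slice bookkeeping above, a small example with assumed channel counts:

// Assumed inputs concatenated along channels: 7 and 9 channels.
constexpr int kSrc0C = 7, kSrc1C = 9;
constexpr int kDepth0 = (kSrc0C + 3) / 4;          // 2 slices, last one holds 3 channels
constexpr int kDepth1 = (kSrc1C + 3) / 4;          // 3 slices, last one holds 1 channel
constexpr int kDstC = kSrc0C + kSrc1C;             // 16 output channels
static_assert(kDepth0 == 2 && kDepth1 == 3 && (kDstC + 3) / 4 == 4,
              "4 output slices, repacked across the input boundary");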

View File

@ -72,7 +72,7 @@ struct ConvParams {
namespace {
int GetNumOutputSlices(int dst_channels) {
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int dst_depth = DivideRoundUp(dst_channels, 4);
if (dst_depth % 4 == 0 || dst_depth >= 16) {
return 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
@ -571,8 +571,8 @@ kernel void ComputeFunction(
std::vector<float> ReorderWeightsForConv(
const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
const ConvParams& params) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
std::vector<float> weights_reordered(
weights.shape.w * weights.shape.h *
AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4);
@ -580,8 +580,7 @@ std::vector<float> ReorderWeightsForConv(
bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4;
int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, params.block_size.z);
++d) {
for (int d = 0; d < DivideRoundUp(dst_depth, params.block_size.z); ++d) {
for (int y = 0; y < weights.shape.h; ++y) {
for (int x = 0; x < weights.shape.w; ++x) {
for (int s = 0; s < src_depth; ++s) {
@ -618,17 +617,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
const BHWC& dst_size,
const Convolution2DAttributes& attr,
const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y);
const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = {
src_size.w,
src_size.h,
src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4),
DivideRoundUp(src_size.c, 4),
dst_size.w,
dst_size.h,
dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4),
DivideRoundUp(dst_size.c, 4),
attr.strides.w,
attr.strides.h,
-attr.padding.prepended.w,
@ -652,17 +651,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
const BHWC& dst_size,
const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y);
const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = {
src_size.w,
src_size.h,
src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4),
DivideRoundUp(src_size.c, 4),
dst_size.w,
dst_size.h,
dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4),
DivideRoundUp(dst_size.c, 4),
1,
1,
0,
@ -685,38 +684,37 @@ std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x, wg_size.x) *
IntegralDivideRoundUp(grid_y, wg_size.y) *
IntegralDivideRoundUp(grid_z, wg_size.z);
return DivideRoundUp(grid_x, wg_size.x) * DivideRoundUp(grid_y, wg_size.y) *
DivideRoundUp(grid_z, wg_size.z);
}
int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y, wg_size.x) *
IntegralDivideRoundUp(grid_z, wg_size.y);
return DivideRoundUp(grid_x * grid_y, wg_size.x) *
DivideRoundUp(grid_z, wg_size.y);
}
int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
return DivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
}
bool IsKernelXIs1(const Convolution2DAttributes& attr) {
@ -758,8 +756,8 @@ int GetRecommendedBlockSize(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr,
const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params;
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
@ -835,8 +833,8 @@ ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr,
const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape);
int3 block_size = int3(1, 1, 1);
if (blk_total_size >= 2 && apple_info.IsBionic()) {
@ -917,8 +915,8 @@ ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
const RuntimeOptions& options,
const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params;
params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST;
params.x_kernel_is_1 = IsKernelXIs1(attr);
@ -1017,29 +1015,28 @@ ConvParams GetConvParams(const DeviceInfo& device_info,
std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params,
const BHWC& shape) {
const int dst_slices = IntegralDivideRoundUp(shape.c, 4);
const int dst_slices = DivideRoundUp(shape.c, 4);
int grid_x = IntegralDivideRoundUp(shape.w, params.block_size.x);
int grid_y = IntegralDivideRoundUp(shape.h, params.block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, params.block_size.z);
int grid_x = DivideRoundUp(shape.w, params.block_size.x);
int grid_y = DivideRoundUp(shape.h, params.block_size.y);
int grid_z = DivideRoundUp(dst_slices, params.block_size.z);
const uint3 group_size(params.work_group_size.x, params.work_group_size.y,
params.work_group_size.z);
int3 wg;
uint3 groups_count;
if (params.linear_whs) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y * grid_z,
params.work_group_size.x);
wg.x = DivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x);
groups_count = uint3(wg.x, 1, 1);
} else if (params.linear_wh) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_z, params.work_group_size.y);
wg.x = DivideRoundUp(grid_x * grid_y, params.work_group_size.x);
wg.y = DivideRoundUp(grid_z, params.work_group_size.y);
groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y], 1);
} else {
wg.x = IntegralDivideRoundUp(grid_x, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, params.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, params.work_group_size.z);
wg.x = DivideRoundUp(grid_x, params.work_group_size.x);
wg.y = DivideRoundUp(grid_y, params.work_group_size.y);
wg.z = DivideRoundUp(grid_z, params.work_group_size.z);
groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y],
wg[params.work_group_launch_order.z]);
@ -1076,7 +1073,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
std::string addr_space =
params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant"
: "device";
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
desc->immutable_buffers = {
{addr_space + " FLT4* const filters",
GetByteBufferConverted(weights_reordered, options.storage_precision)},
@ -1108,7 +1105,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionWino4x4To6x6(
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
const Convolution2DAttributes& attr, const DeviceInfo& device_info,
const RuntimeOptions& options) {
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
ConvParams params;
params.work_group_launch_order = int3(2, 0, 1);
params.src_depth_loop_size = 1;
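The three launch modes in GetDispatchSizes above differ only in how the (grid_x, grid_y, grid_z) product is folded into work groups; a sketch with assumed sizes:

// Assumed grid after block-size division: 28 x 28 x 16. Work group sizes are
// also assumed: 32 x 1 x 1 for the linearized modes, 8 x 4 x 4 for the default.
constexpr int kGx = 28, kGy = 28, kGz = 16;
constexpr int kLinearWHS = (kGx * kGy * kGz + 31) / 32;   // 392 groups on one axis
constexpr int kLinearWH_X = (kGx * kGy + 31) / 32;        // 25 groups over the fused WH axis
constexpr int kLinearWH_Y = kGz;                          // DivideRoundUp(16, 1) == 16
constexpr int kDefX = (kGx + 7) / 8, kDefY = (kGy + 3) / 4, kDefZ = (kGz + 3) / 4;
static_assert(kLinearWHS == 392 && kLinearWH_X == 25 && kLinearWH_Y == 16, "linearized modes");
static_assert(kDefX == 4 && kDefY == 7 && kDefZ == 4, "default 3-D mode");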

View File

@ -34,6 +34,7 @@ using ::tflite::gpu::Axis;
using ::tflite::gpu::BHWC;
using ::tflite::gpu::Convolution2DAttributes;
using ::tflite::gpu::DataType;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::HW;
using ::tflite::gpu::Linear;
using ::tflite::gpu::OHWI;
@ -44,7 +45,6 @@ using ::tflite::gpu::TensorRef;
using ::tflite::gpu::ValueId;
using ::tflite::gpu::metal::ConvolutionGeneric;
using ::tflite::gpu::metal::ConvolutionWino4x4To6x6;
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::metal::CompareVectors;
using ::tflite::gpu::metal::SingleOpModel;
@ -275,7 +275,7 @@ using ::tflite::gpu::metal::SingleOpModel;
BHWC conv_shape;
conv_shape.b = dst_shape.b;
conv_shape.h = 36;
conv_shape.w = IntegralDivideRoundUp(new_width, 4) * IntegralDivideRoundUp(new_height, 4);
conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
conv_shape.c = dst_shape.c;
TensorFloat32 src_tensor;

View File

@ -208,7 +208,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride1x1
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1(
const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3;
const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -250,11 +250,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride1x1(
src_size.w,
src_size.h,
src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4),
DivideRoundUp(src_size.c, 4),
dst_size.w,
dst_size.h,
dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4),
DivideRoundUp(dst_size.c, 4),
-params.padding.prepended.w,
-params.padding.prepended.h,
0, // dummy, for alignment
@ -403,7 +403,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride2
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2(
const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3;
const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -445,11 +445,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride2(
src_size.w,
src_size.h,
src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4),
DivideRoundUp(src_size.c, 4),
dst_size.w,
dst_size.h,
dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4),
DivideRoundUp(dst_size.c, 4),
-attr.padding.prepended.w,
-attr.padding.prepended.h,
attr.strides.w,
@ -586,11 +586,11 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
std::vector<int> uniform_params{
dimension.w,
dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
0,
output_dimension.w,
output_dimension.h,
IntegralDivideRoundUp(output_dimension.c, 4),
DivideRoundUp(output_dimension.c, 4),
0,
attr.strides.w,
attr.strides.h,
@ -612,9 +612,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second;
uint3 groups_size{8, 4, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)};
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
DivideRoundUp(dimension.h, groups_size.y),
DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count);
};
@ -661,17 +661,17 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dimension.w, 2);
const int grid_y = IntegralDivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4);
const int grid_x = DivideRoundUp(dimension.w, 2);
const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = DivideRoundUp(dimension.c, 4);
uint3 group_size{8, 4, 1};
if (grid_x <= 4) {
group_size.x = 4;
group_size.z = grid_z % 2 == 0 ? 2 : 1;
}
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
};
@ -726,12 +726,12 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second;
const int grid_x = dimension.w;
const int grid_y = IntegralDivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4);
const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = DivideRoundUp(dimension.c, 4);
const uint3 group_size{8, 4, 1};
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
};
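For the stride-1 3x3 resize function above, each thread covers a 2x2 output patch; a worked example with an assumed output shape:

// Assumed output: 10 x 10 with 32 channels.
constexpr int kGridX = (10 + 1) / 2;               // DivideRoundUp(10, 2) == 5
constexpr int kGridY = (10 + 1) / 2;               // 5
constexpr int kGridZ = (32 + 3) / 4;               // 8 slices
// grid_x > 4, so the 8 x 4 x 1 work group is kept as is.
constexpr int kGroupsX = (kGridX + 7) / 8;         // 1
constexpr int kGroupsY = (kGridY + 3) / 4;         // 2
constexpr int kGroupsZ = kGridZ;                   // DivideRoundUp(8, 1) == 8
static_assert(kGroupsX == 1 && kGroupsY == 2 && kGroupsZ == 8, "1 x 2 x 8 threadgroups");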

View File

@ -45,7 +45,7 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER"
: "threadgroup_barrier";
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
const int src_depth = DivideRoundUp(src_channels, 4);
std::stringstream code;
code << R"(
#include <metal_stdlib>
@ -116,9 +116,8 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
}
}
)";
const int src_depth_sub_groups = shared_memory
? IntegralDivideRoundUp(src_depth, 32)
: IntegralDivideRoundUp(src_depth, 4);
const int src_depth_sub_groups = shared_memory ? DivideRoundUp(src_depth, 32)
: DivideRoundUp(src_depth, 4);
return absl::Substitute(code.str(), src_depth_sub_groups, barrier);
}
} // namespace
@ -146,7 +145,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
bool shared_memory =
device_info.IsAppleGPU() &&
device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
@ -179,8 +178,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
{"constant uniforms& params",
[attr](const std::map<ValueId, BHWC>& buffers) {
std::vector<uint32_t> uniform_params{
static_cast<uint32_t>(
IntegralDivideRoundUp(attr.weights.shape.i, 4)),
static_cast<uint32_t>(DivideRoundUp(attr.weights.shape.i, 4)),
static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)),
static_cast<uint32_t>(attr.weights.shape.o),
static_cast<uint32_t>(0),
@ -192,7 +190,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1};
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
int groups_x = IntegralDivideRoundUp(dst_channels_aligned, groups_size.x);
int groups_x = DivideRoundUp(dst_channels_aligned, groups_size.x);
return std::make_pair(groups_size, uint3{groups_x, 1, 1});
};
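A worked example of the shared-memory split above (the channel count is assumed):

// Assumed 1024 input channels.
constexpr int kSrcDepth = (1024 + 3) / 4;                 // 256 four-channel slices
constexpr int kSharedMemGroups = (kSrcDepth + 31) / 32;   // 8 iterations when staging through local memory
constexpr int kGlobalMemGroups = (kSrcDepth + 3) / 4;     // 64 iterations when reading global memory
static_assert(kSharedMemGroups == 8 && kGlobalMemGroups == 64, "worked example");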

View File

@ -131,9 +131,9 @@ std::vector<ComputeTaskDescriptorPtr> MaxUnpooling(
const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, params);
const uint3 groups_size{16, 16, 1};
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = DivideRoundUp(dst_shape.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

View File

@ -133,7 +133,7 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
[input_id, output_id,
work_group_size](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second;
const int src_slices = IntegralDivideRoundUp(src_shape.c, 4);
const int src_slices = DivideRoundUp(src_shape.c, 4);
struct uniforms {
int4 src_size;
float4 inv_multipliers;
@ -153,8 +153,8 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
desc->resize_function = [output_id, work_group_size](
const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second;
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int groups_z = IntegralDivideRoundUp(dst_slices, work_group_size.z);
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int groups_z = DivideRoundUp(dst_slices, work_group_size.z);
return std::make_pair(work_group_size, uint3{1, 1, groups_z});
};
return {desc};

View File

@ -177,12 +177,12 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
dimension.w,
dimension.h,
dimension.c,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
// int4 dst_size
output_dimension.w,
output_dimension.h,
output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4),
DivideRoundUp(output_dimension.c, 4),
// int4 prepended padding
attr.prepended.w,
attr.prepended.h,
@ -198,10 +198,10 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

View File

@ -224,11 +224,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
std::vector<int> uniform_params = {
dimension.w,
dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
dimension.w * dimension.h,
output_dimension.w,
output_dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
output_dimension.w * output_dimension.h,
params.strides.w,
params.strides.h,
@ -242,11 +242,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second;
const uint3 grid =
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4));
uint3(dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

View File

@ -159,11 +159,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
@ -197,14 +197,14 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
const auto& dst_dim = buffers.find(output_id)->second;
std::vector<int32_t> uniform_params{
// int4 src_size
src_dim.w, src_dim.h, IntegralDivideRoundUp(src_dim.c, 4),
src_dim.w, src_dim.h, DivideRoundUp(src_dim.c, 4),
src_dim.w * src_dim.h,
// int4 dst_size
dst_dim.w, dst_dim.h, IntegralDivideRoundUp(dst_dim.c, 4),
dst_dim.w, dst_dim.h, DivideRoundUp(dst_dim.c, 4),
dst_dim.w * dst_dim.h,
// int2 plane_xz
src_dim.w * IntegralDivideRoundUp(src_dim.c, 4),
dst_dim.w * IntegralDivideRoundUp(dst_dim.c, 4),
src_dim.w * DivideRoundUp(src_dim.c, 4),
dst_dim.w * DivideRoundUp(dst_dim.c, 4),
0, // dummy, for alignment
0, // dummy, for alignment
0, // dummy, for alignment
@ -218,11 +218,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

View File

@ -153,10 +153,10 @@ std::vector<ComputeTaskDescriptorPtr> Resize(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{16, 16, 1};
const auto& dst_dim = buffers.find(output_id)->second;
int groups_x = IntegralDivideRoundUp(dst_dim.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_dim.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_dim.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
int groups_x = DivideRoundUp(dst_dim.w, groups_size.x);
int groups_y = DivideRoundUp(dst_dim.h, groups_size.y);
const int dst_layers = DivideRoundUp(dst_dim.c, 4);
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};

View File

@ -157,12 +157,12 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
dimension.w,
dimension.h,
dimension.c,
IntegralDivideRoundUp(dimension.c, 4),
DivideRoundUp(dimension.c, 4),
// int4 dst_size
output_dimension.w,
output_dimension.h,
output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4),
DivideRoundUp(output_dimension.c, 4),
};
return GetByteBuffer(uniform_params);
}},
@ -173,10 +173,10 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

View File

@ -169,8 +169,8 @@ std::vector<ComputeTaskDescriptorPtr> Softmax(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
uint3 groups_size{8, 4, 1};
const auto& dimension = buffers.find(output_id)->second;
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), 1};
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
DivideRoundUp(dimension.h, groups_size.y), 1};
return std::make_pair(groups_size, groups_count);
};
@ -198,13 +198,13 @@ std::vector<ComputeTaskDescriptorPtr> Softmax1x1(int id, ValueId input_id,
desc->uniform_buffers = {
{"constant uniforms& params",
[channels_count](const std::map<ValueId, BHWC>& buffers) {
const int src_depth = IntegralDivideRoundUp(channels_count, 4);
const int src_depth = DivideRoundUp(channels_count, 4);
struct uniforms {
int4 size;
float4 mask;
};
uniforms params;
params.size = {src_depth, IntegralDivideRoundUp(src_depth, 32), 1, 1};
params.size = {src_depth, DivideRoundUp(src_depth, 32), 1, 1};
params.mask = {0.0f, 0.0f, 0.0f, 0.0f};
const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4;
for (int i = 0; i < reminder; ++i) {

View File

@ -113,12 +113,12 @@ kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) {
input_shape.h / attr.block_size,
input_shape.w / attr.block_size,
input_shape.c * attr.block_size * attr.block_size);
const uint3 grid = uint3(output_shape.w, output_shape.h,
IntegralDivideRoundUp(output_shape.c, 4));
const uint3 grid =
uint3(output_shape.w, output_shape.h, DivideRoundUp(output_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
const int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
const int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
const int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
const int groups_x = DivideRoundUp(grid.x, groups_size.x);
const int groups_y = DivideRoundUp(grid.y, groups_size.y);
const int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z));
};
return {desc};

View File

@ -130,8 +130,8 @@ std::string GetDeconvolution(const ConvolutionTransposedAttributes& attr) {
constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
return absl::Substitute(shader_source, src_depth * dst_channels_aligned,
src_depth, dst_depth, attr.weights.shape.o,
@ -264,8 +264,8 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w;
const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h;
@ -464,7 +464,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
(kThreadGroupWidth + params.weights.shape.w) / params.stride.w;
const int src_local_size_y =
(kThreadGroupHeight + params.weights.shape.h) / params.stride.h;
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4);
const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int shared_size =
sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y;
if (shared_size < 1000 * 16 &&
@ -543,8 +543,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1};
BHWC dst_shape =
CalculateOutputShape(buffers.find(input_id)->second, params);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = 1;
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
@ -556,8 +556,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
int id, ValueId input_id, ValueId output_id,
const ConvolutionTransposedAttributes& params,
const DeviceInfo& device_info, const RuntimeOptions& options) {
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(params.weights.shape.o, 4);
const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(params.weights.shape.o, 4);
const int kernel_x = 4;
const int kernel_y = 4;
@ -645,7 +645,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
[input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second;
const int src_depth = IntegralDivideRoundUp(src_shape.c, 4);
const int src_depth = DivideRoundUp(src_shape.c, 4);
std::vector<int> uniform_params{
src_shape.w,
src_shape.h,
@ -653,7 +653,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
src_shape.w * src_shape.h,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
4 * 16 * src_depth,
0,
@ -667,13 +667,13 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
desc->resize_function = [output_id, block_size,
params](const std::map<ValueId, BHWC>& buffers) {
const auto& dst_shape = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
const int grid_x = DivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = DivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = DivideRoundUp(dst_shape.c, 4);
const uint3 group_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
int groups_x = DivideRoundUp(grid_x, group_size.x);
int groups_y = DivideRoundUp(grid_y, group_size.y);
int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y});
};

View File

@ -486,8 +486,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
BHWC dst_shape;
dst_shape.b = src_shape.b;
dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) *
IntegralDivideRoundUp(new_height, 4);
dst_shape.w =
DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c;
return dst_shape;
}};
@ -501,16 +501,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4);
int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
-attr.padding.prepended.w,
-attr.padding.prepended.h,
@ -529,12 +529,12 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
int new_height =
src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
int grid_x = IntegralDivideRoundUp(new_width, 4);
int grid_y = IntegralDivideRoundUp(new_height, 4);
int grid_z = IntegralDivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_x = DivideRoundUp(new_width, 4);
int grid_y = DivideRoundUp(new_height, 4);
int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -563,8 +563,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
BHWC dst_shape;
dst_shape.b = src_shape.b;
dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) *
IntegralDivideRoundUp(new_height, 4);
dst_shape.w =
DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c;
return dst_shape;
}};
@ -593,16 +593,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4);
int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
-attr.padding.prepended.w,
-attr.padding.prepended.h,
@ -619,10 +619,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
const auto& dst_shape = buffers.find(output_id)->second;
int grid_x = dst_shape.w;
int grid_y = 6;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -665,8 +665,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second;
std::vector<int> sizes = {
src_shape.w, src_shape.h, IntegralDivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4), 0,
src_shape.w, src_shape.h, DivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4), 0,
};
return GetByteBuffer(sizes);
}},
@ -677,10 +677,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second;
int grid_x = src_shape.w;
int grid_y = 1;
int grid_z = IntegralDivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -734,16 +734,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
[input_id, output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
tiles_x,
tiles_y,
@ -757,14 +757,14 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1};
const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
int grid_x = tiles_x * tiles_y;
int grid_y = 4;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
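To tie the tile math in this file together, a worked example of the 4x4-to-36 transform shape (input size and padding assumed):

// Assumed 32x32 input with no padding: new_width = new_height = 32 - 2 = 30.
constexpr int kTilesX = (30 + 3) / 4;              // DivideRoundUp(30, 4) == 8
constexpr int kTilesY = (30 + 3) / 4;              // 8
// The transformed tensor keeps the channel count, sets h = 36 (one row per
// element of the 6x6 transform) and packs one tile per output column:
static_assert(kTilesX * kTilesY == 64, "dst shape is 36 x 64 x C");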