Rename IntegralDivideRoundUp to DivideRoundUp.
PiperOrigin-RevId: 307447663
Change-Id: I1e0f6c9f058e3f0457a7522f1d10f7da8ab8610d
commit b26b6b5669
parent 55c4d9e49c
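For reference, DivideRoundUp is the integer ceiling division ceil(n / divisor) that the GPU delegates use to turn a channel count into a count of 4-element slices (its definition appears in the util.h hunk near the end of this diff); the commit only renames the helper, its behavior is unchanged. A minimal sketch of how a typical call site changes, using an illustrative `channels` value:

// Before the rename: number of 4-channel slices needed for `channels` channels.
const int depth = IntegralDivideRoundUp(channels, 4);  // channels == 10 -> 3
// After the rename: identical computation under the shorter name.
const int depth = DivideRoundUp(channels, 4);          // channels == 10 -> 3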
@@ -54,7 +54,7 @@ namespace gpu {
// H - height
// W - width
// C - channels
// D - depth := IntegralDivideRoundUp(C, 4)
// D - depth := DivideRoundUp(C, 4)
// C4 - is the constant = 4.
enum class DataLayout {
UNKNOWN,

@@ -164,7 +164,7 @@ struct Dimensions {
Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
: b(batch), h(height), w(width), c(channels) {}

int32_t d() const { return IntegralDivideRoundUp(c, 4); }
int32_t d() const { return DivideRoundUp(c, 4); }

int32_t product() const { return b * h * w * c; }

@@ -70,10 +70,10 @@ std::string Add::GetElementWiseCode(
Add::Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels)
: ElementwiseOperation(definition),
dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
dst_depth_(DivideRoundUp(dst_channels, 4)) {
src_depthes_.resize(channels.size());
for (int i = 0; i < channels.size(); ++i) {
src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
src_depthes_[i] = DivideRoundUp(channels[i], 4);
}
}

@@ -79,7 +79,7 @@ std::string GetConcatKernelCode(
// generation.
c += " int Z = 0;\n";
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
if (depth % 2 == 0) {
// We can read more at once inside of loop in case depth % 2 == 0
// it should be better for reading latency hiding

@@ -112,7 +112,7 @@ std::string GetConcatKernelCode(
int read_index = 0;
int z = 0;
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
const int depth = DivideRoundUp(channels[i], 4);
for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4);
const std::string temp_name = "t" + std::to_string(read_index);
@@ -128,24 +128,24 @@ absl::Status Conv3D::BindArguments() {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
return absl::OkStatus();
}

int3 Conv3D::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
int3 wg;
wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *

@@ -885,8 +885,8 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
Conv3D::ConvParams Conv3D::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const Convolution3DAttributes& attr) const {
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
attr.dilations.w == 1 &&
attr.padding.prepended.w == 0 &&

@@ -147,8 +147,8 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;

@@ -219,8 +219,8 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;
@@ -307,7 +307,7 @@ absl::Status ConvBuffer1x1::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
const int src_width_elements = IntegralDivideRoundUp(
const int src_width_elements = DivideRoundUp(
src_[0]->Width() * src_[0]->Batch(), (conv_params_.element_size / 4));
int4 src_size = int4(src_width_elements, src_[0]->Height(), src_[0]->Slices(),
src_width_elements * src_[0]->Height());

@@ -317,14 +317,14 @@ absl::Status ConvBuffer1x1::BindArguments() {
}

int3 ConvBuffer1x1::GetGridSize() const {
const int dst_width_elements = IntegralDivideRoundUp(
const int dst_width_elements = DivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
const int grid_x =
IntegralDivideRoundUp(dst_width_elements, conv_params_.block_size.x);
DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
return int3(grid_x, grid_y, grid_z);
}

@@ -358,8 +358,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
if (!IsConvBuffer1x1Supported(definition, attr)) {
return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
}
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,

@@ -376,8 +376,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,

@@ -396,8 +396,8 @@ absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
@@ -135,8 +135,8 @@ absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvBuffer1x1::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);

const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

@@ -40,9 +40,9 @@ std::string GenerateConvolutionConstantCode(

std::string c = GetCommonDefines(op_def.precision);

const int out_z = IntegralDivideRoundUp(dst_channels, 4);
const int out_z = DivideRoundUp(dst_channels, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
const int src_depth = DivideRoundUp(src_channels, 4);

const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||

@@ -290,7 +290,7 @@ bool IsConvConstantsSupported(const CLDevice& device,
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}

@@ -88,7 +88,7 @@ class ConvConstants : public GPUOperation {
template <DataType T>
absl::Status ConvConstants::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;

@@ -112,8 +112,8 @@ absl::Status ConvConstants::UploadWeights(
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@@ -205,8 +205,8 @@ absl::Status ConvPowerVR::BindArguments() {
kernel_dilation_.z * src_[0]->Batch(), kernel_dilation_.w)));
}
if (conv_params_.linear_hw) {
const int grid_x = IntegralDivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
RETURN_IF_ERROR(kernel_.SetBytesAuto(grid_x));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));

@@ -215,27 +215,26 @@ absl::Status ConvPowerVR::BindArguments() {
}

int3 ConvPowerVR::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
int3 wg;

if (conv_params_.linear_hw) {
wg.x =
IntegralDivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.y);
wg.x = DivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_z, conv_params_.work_group_size.y);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *
conv_params_.work_group_size.y,
1);
} else {
wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg[conv_params_.work_group_launch_order[0]] *
conv_params_.work_group_size.x,
wg[conv_params_.work_group_launch_order[1]] *

@@ -808,8 +807,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
attr.dilations.w == 1 &&
attr.padding.prepended.w == 0 &&

@@ -825,8 +824,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const FullyConnectedAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, false, dst_shape);
params.work_group_size.x *= params.work_group_size.y;

@@ -839,8 +838,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
const CLDevice& device, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, true, dst_shape);
params.block_size.x *= params.block_size.y;
@@ -188,8 +188,8 @@ absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvPowerVR::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);

const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

@@ -433,9 +433,9 @@ absl::Status ConvTexture::BindArguments() {

int3 ConvTexture::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}

@@ -148,9 +148,9 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
template <DataType T>
absl::Status ConvTexture::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;

@@ -206,9 +206,9 @@ template <DataType S, typename T>
void ConvTexture::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@@ -322,7 +322,7 @@ ConvolutionTransposed::ConvolutionTransposed(
block_size_ = is_f16 ? int3(2, 2, 2) : int3(2, 2, 1);
}
}
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device.IsMali()) {
block_size_.y *= block_size_.z;

@@ -406,10 +406,9 @@ absl::Status ConvolutionTransposed::BindArguments() {
int3 ConvolutionTransposed::GetGridSize() const {
const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
const int grid_x =
IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}

@@ -91,8 +91,8 @@ template <DataType T>
absl::Status ConvolutionTransposed::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int texture_width = dst_depth;

@@ -160,8 +160,8 @@ template <DataType S, typename T>
void ConvolutionTransposed::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int texture_width = dst_depth;
@@ -440,8 +440,8 @@ absl::Status ConvolutionTransposed3D::BindArguments() {
if (definition_.IsBatchSupported()) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(
IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w)));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(DivideRoundUp(dst_[0]->Slices(), block_size_.w)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHDS()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHDS()));
return absl::OkStatus();

@@ -451,11 +451,10 @@ int3 ConvolutionTransposed3D::GetGridSize() const {
const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
const int grid_x =
IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w) *
IntegralDivideRoundUp(aligned_d, block_size_.z);
const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
DivideRoundUp(aligned_d, block_size_.z);
return int3(grid_x, grid_y, grid_z);
}

@@ -91,8 +91,8 @@ template <DataType T>
absl::Status ConvolutionTransposed3D::UploadWeights(
const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;

@@ -162,8 +162,8 @@ template <DataType S, typename T>
void ConvolutionTransposed3D::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
const int dst_depth =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.w);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int kernel_z = kernel_size_.z;
@@ -339,14 +339,13 @@ absl::Status ConvolutionTransposed3x3::BindArguments() {
}

int3 ConvolutionTransposed3x3::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
int3 wg;
wg.x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
wg.y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
wg.z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
wg.x = DivideRoundUp(grid_x, work_group_size_.x);
wg.y = DivideRoundUp(grid_y, work_group_size_.y);
wg.z = DivideRoundUp(grid_z, work_group_size_.z);
return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
wg[work_group_launch_order_[1]] * work_group_size_.y,
wg[work_group_launch_order_[2]] * work_group_size_.z);

@@ -84,8 +84,8 @@ class ConvolutionTransposed3x3 : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed3x3::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;

@@ -109,8 +109,8 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed3x3::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3;
const int kernel_y = 3;
@@ -224,8 +224,8 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
absl::Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
definition_, biases_, DivideRoundUp(src_channels_, 4),
DivideRoundUp(dst_channels_, 4), *creation_context.device,
linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,

@@ -82,8 +82,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;

@@ -108,8 +108,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;
const int kernel_y = 3;
@@ -332,9 +332,8 @@ absl::Status ConvolutionTransposed4x4::BindArguments() {
}

int3 ConvolutionTransposed4x4::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height() + 2, 2);
const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
const int grid_z = dst_[0]->Slices();
return int3(grid_x, grid_y, grid_z);
}

@@ -82,8 +82,8 @@ class ConvolutionTransposed4x4 : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposed4x4::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 4; // This operation support only 4x4 kernel
const int kernel_y = 4;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;

@@ -107,8 +107,8 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposed4x4::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 4;
const int kernel_y = 4;
@@ -187,8 +187,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
absl::Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
kernel_size_, *creation_context.device, linked_operations_);
definition_, DivideRoundUp(src_channels_, 4), dst_channels_, kernel_size_,
*creation_context.device, linked_operations_);

std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&

@@ -82,7 +82,7 @@ class ConvolutionTransposedThin : public GPUOperation {
template <DataType T>
absl::Status ConvolutionTransposedThin::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int elements_count =
kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;

@@ -104,7 +104,7 @@ absl::Status ConvolutionTransposedThin::UploadWeights(
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
@@ -89,7 +89,7 @@ template <DataType T>
absl::Status DepthwiseConvolution::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int dst_depth = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;

@@ -137,7 +137,7 @@ template <DataType S, typename T>
void DepthwiseConvolution::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int dst_depth = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;

@@ -88,7 +88,7 @@ template <DataType T>
absl::Status DepthwiseConvolution3D::UploadWeights(
const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;

@@ -130,7 +130,7 @@ template <DataType S, typename T>
void DepthwiseConvolution3D::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;
@@ -323,8 +323,8 @@ absl::Status DepthwiseConv3x3::BindArguments() {
}

int3 DepthwiseConv3x3::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
return int3(grid_x, grid_y, grid_z);
}

@@ -83,7 +83,7 @@ template <DataType T>
absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
int texture_width = 10; // 3x3 kernel + 1 bias
int texture_height = src_depth;
const int elements_count = texture_width * texture_height;

@@ -129,7 +129,7 @@ template <DataType S, typename T>
void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
const tflite::gpu::Tensor<OHWI, S>& weights,
const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);

int counter = 0;
for (int s = 0; s < src_depth; ++s) {
@@ -70,8 +70,8 @@ class FullyConnected : public GPUOperation {
template <DataType T>
absl::Status FullyConnected::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);

const int elements_count = src_depth * dst_depth * 4;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

@@ -94,8 +94,8 @@ absl::Status FullyConnected::UploadWeights(
template <DataType T, typename S>
void FullyConnected::RearrangeWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<S> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
int counter = 0;

for (int s = 0; s < src_depth; ++s) {

@@ -129,8 +129,7 @@ absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
const int depth = src_[0]->Slices();
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int2(depth, IntegralDivideRoundUp(depth, 32))));
RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(depth, DivideRoundUp(depth, 32))));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels())));
@@ -234,12 +234,12 @@ template <DataType S, typename T>
void RearrangeWeightsToOHWIOGroupI4O4(
const tflite::gpu::Tensor<OHWI, S>& weights, int out_group_size,
absl::Span<T> dst) {
const int dst_slices = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_slices = DivideRoundUp(weights.shape.o, 4);
const int src_slices = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;

const int dst_groups = IntegralDivideRoundUp(dst_slices, out_group_size);
const int dst_groups = DivideRoundUp(dst_slices, out_group_size);

int counter = 0;
for (int d = 0; d < dst_groups; ++d) {

@@ -436,9 +436,9 @@ absl::Status Winograd4x4To36::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
const int tiles_x = IntegralDivideRoundUp(
const int tiles_x = DivideRoundUp(
src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
const int tiles_y = IntegralDivideRoundUp(
const int tiles_y = DivideRoundUp(
src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
const int tiles_total = tiles_x * tiles_y;
RETURN_IF_ERROR(

@@ -550,14 +550,14 @@ absl::Status Winograd36To4x4::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x));
return absl::OkStatus();
}

int3 Winograd36To4x4::GetGridSize() const {
const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
const int tiles_y = IntegralDivideRoundUp(dst_[0]->Height(), 4);
const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
const int grid_y = 4;
const int grid_z = dst_[0]->Slices();
@@ -187,7 +187,7 @@ int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
int3 GetWorkGroup(const int3& grid, int max_size) {
int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
int wg_xy_size = max_size / wg_z;
int wg_x = std::min(IntegralDivideRoundUp(grid.x, 2), wg_xy_size);
int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
int wg_y = std::min(wg_xy_size / wg_x, grid.y);
return int3(wg_x, wg_y, wg_z);
}

@@ -231,12 +231,12 @@ absl::Status GetBestWorkGroupXY128Linear(const TuningParameters& params,
}

bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
int planar_work_groups = IntegralDivideRoundUp(width * height, 128);
int planar_work_groups = DivideRoundUp(width * height, 128);
auto base_work_groups = Get2DWorkgroupsEqualTo128();
bool have_equal_work_groups = false;
for (auto& work_group : base_work_groups) {
int x_groups = IntegralDivideRoundUp(width, work_group.x);
int y_groups = IntegralDivideRoundUp(height, work_group.y);
int x_groups = DivideRoundUp(width, work_group.x);
int y_groups = DivideRoundUp(height, work_group.y);
int xy_groups = x_groups * y_groups;
if (xy_groups == planar_work_groups) {
have_equal_work_groups = true;

@@ -101,7 +101,7 @@ absl::Status CreateLinearStorage(const LinearStorageCreateInfo& creation_info,
CLContext* context, LinearStorage* result) {
int size = creation_info.aligned_size != 0 ? creation_info.aligned_size
: tensor.shape.v;
const int depth = IntegralDivideRoundUp(size, 4);
const int depth = DivideRoundUp(size, 4);
if (creation_info.data_type == DataType::FLOAT32) {
std::vector<float4> gpu_data(depth);
CopyLinearFLT4(tensor, absl::MakeSpan(gpu_data));
@@ -56,10 +56,10 @@ bool IsChannelsBroadcastedForSecondInput(const std::vector<Value*>& inputs) {
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const CLDevice& device,
const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);

@@ -82,8 +82,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
return absl::UnimplementedError("No implementation for this case.");
}

const int tiles_x = IntegralDivideRoundUp(output_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(output_shape.h, 4);
const int tiles_x = DivideRoundUp(output_shape.w, 4);
const int tiles_y = DivideRoundUp(output_shape.h, 4);
const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor td_0;
@@ -28,7 +28,7 @@ namespace cl {
bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
const BHWDC& shape,
const TensorDescriptor& descriptor) {
const int slices = IntegralDivideRoundUp(shape.c, 4);
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
case TensorStorageType::BUFFER: {
const int flt4_size =

@@ -65,11 +65,10 @@ absl::Status CreateTensor(const CLContext& context, const CLDevice& device,
}
if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
cl_mem image_memory;
RETURN_IF_ERROR(
CreateImageBufferFromBuffer(context, memory, descriptor.data_type,
shape.b * shape.w * shape.h * shape.d *
IntegralDivideRoundUp(shape.c, 4),
&image_memory));
RETURN_IF_ERROR(CreateImageBufferFromBuffer(
context, memory, descriptor.data_type,
shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4),
&image_memory));
*result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
} else {
*result = Tensor(memory, memory_owner, shape, descriptor);

@@ -386,7 +385,7 @@ absl::Status AllocateTensorMemory(const CLContext& context,
const CLDevice& device, const BHWDC& shape,
const TensorDescriptor& descriptor,
CLMemory* result) {
const int slices = IntegralDivideRoundUp(shape.c, 4);
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
case TensorStorageType::BUFFER:
case TensorStorageType::IMAGE_BUFFER: {

@@ -61,7 +61,7 @@ class Tensor {
int Height() const { return shape_.h; }
int Depth() const { return shape_.d; }
int Channels() const { return shape_.c; }
int Slices() const { return IntegralDivideRoundUp(shape_.c, 4); }
int Slices() const { return DivideRoundUp(shape_.c, 4); }
int Batch() const { return shape_.b; }

// returns int4(width * batch, height, slices, batch)
@@ -44,12 +44,11 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
}

float* output = out.data();
for (int p = 0; p < IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane);
++p) {
for (int p = 0; p < DivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) {
for (int h = 0; h < shape.h; ++h) {
for (int w = 0; w < shape.w; ++w) {
for (int c = 0;
c < IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) {
for (int c = 0; c < DivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane);
++c) {
for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) {
for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) {
float value = 0;

@@ -106,7 +105,7 @@ std::vector<float> ConvertToPHWO4I4Transposed(

uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
return uint3(AlignByN(shape.i, 4), shape.h * shape.w,
IntegralDivideRoundUp(shape.o, 4));
DivideRoundUp(shape.o, 4));
}

// Layout is Po,H,W,OI4x4.

@@ -123,8 +122,8 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
}

const int dst_depth = IntegralDivideRoundUp(shape.o, 4);
const int src_depth = IntegralDivideRoundUp(shape.i, 4);
const int dst_depth = DivideRoundUp(shape.o, 4);
const int src_depth = DivideRoundUp(shape.i, 4);

float* output = out.data();
for (int f = 0; f < dst_depth; ++f) {

@@ -178,8 +177,7 @@ absl::Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
}

int32_t output_channels = shape.o * shape.i;
int32_t num_planes =
IntegralDivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
int32_t num_planes = DivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
float* output = out.data();
for (int p = 0; p < num_planes; ++p) {
for (int h = 0; h < shape.h; ++h) {

@@ -232,7 +230,7 @@ absl::Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus();
}
// Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;

@@ -281,7 +279,7 @@ absl::Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));

// Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;

@@ -407,7 +405,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus();
}

int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images.

@@ -449,7 +447,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in,
const BHWC& shape, absl::Span<float> out) {
RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images.
@@ -209,7 +209,7 @@ OperationType OperationTypeFromString(const std::string& name) {
namespace {

template <typename T>
T IntegralDivideRoundUp(T n, T divisor) {
T DivideRoundUp(T n, T divisor) {
return (n - 1) / divisor + 1;
}

@@ -272,7 +272,7 @@ int32_t CalculateOutput(const BHWDC& input,
}

inline int32_t StridedSize(int32_t size, int32_t stride) {
return stride == 0 ? -1 : IntegralDivideRoundUp(size, stride);
return stride == 0 ? -1 : DivideRoundUp(size, stride);
}

template <Axis AxisT, typename AttrT>
@@ -24,24 +24,23 @@ namespace gpu {
// @param n must be non negative
// @param divisor must be greater than zero
template <typename T, typename N>
T IntegralDivideRoundUp(T n, N divisor) {
T DivideRoundUp(T n, N divisor) {
const T div = static_cast<T>(divisor);
const T q = n / div;
return n % div == 0 ? q : q + 1;
}

template <>
inline uint3 IntegralDivideRoundUp(uint3 n, uint3 divisor) {
return uint3(IntegralDivideRoundUp(n.x, divisor.x),
IntegralDivideRoundUp(n.y, divisor.y),
IntegralDivideRoundUp(n.z, divisor.z));
inline uint3 DivideRoundUp(uint3 n, uint3 divisor) {
return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y),
DivideRoundUp(n.z, divisor.z));
}

// @param number or its components must be greater than zero
// @param n must be greater than zero
template <typename T, typename N>
T AlignByN(T number, N n) {
return IntegralDivideRoundUp(number, n) * n;
return DivideRoundUp(number, n) * n;
}

} // namespace gpu
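A small worked example of the two helpers defined in the hunk above, with made-up values: DivideRoundUp rounds an integer division up, and AlignByN reuses it to round a size up to the next multiple of N.

// Illustrative values only (not taken from the diff):
const int slices = DivideRoundUp(10, 4);   // 10 / 4 rounded up == 3
const int aligned = AlignByN(10, 4);       // DivideRoundUp(10, 4) * 4 == 12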
@@ -24,16 +24,16 @@ namespace {

using testing::Eq;

TEST(UtilTest, IntegralDivideRoundUp) {
EXPECT_THAT(IntegralDivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(IntegralDivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(IntegralDivideRoundUp(257, 256), Eq(2));
TEST(UtilTest, DivideRoundUp) {
EXPECT_THAT(DivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(DivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(DivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(DivideRoundUp(257, 256), Eq(2));
}

TEST(UtilTest, AlignByN) {
@@ -34,9 +34,9 @@ void AddCornerCases(const T& grid, int max_work_group_total_size,
for (int x = 1; x <= 4; ++x) {
for (int y = 1; y <= 4; ++y) {
for (int z = 1; z <= 4; ++z) {
int wg_x = IntegralDivideRoundUp(grid.x, x);
int wg_y = IntegralDivideRoundUp(grid.y, y);
int wg_z = IntegralDivideRoundUp(grid.z, z);
int wg_x = DivideRoundUp(grid.x, x);
int wg_y = DivideRoundUp(grid.y, y);
int wg_z = DivideRoundUp(grid.z, z);
if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y ||
wg_z > max_work_group_sizes.z ||
wg_x * wg_y * wg_z > max_work_group_total_size) {
@@ -201,7 +201,7 @@ class CompiledModelImpl
ShaderCode code) {
// Calculate workgroup size.
uint3 workgroup_size = workgroup_calculator.Calculate(code);
uint3 num_workgroups = IntegralDivideRoundUp(code.workload, workgroup_size);
uint3 num_workgroups = DivideRoundUp(code.workload, workgroup_size);

for (const auto& object : code.objects) {
if (IsRef(object)) {

@@ -569,7 +569,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
} else {
shader_index = it->second;
}
auto num_workgroups = IntegralDivideRoundUp(code.workload, workgroup);
auto num_workgroups = DivideRoundUp(code.workload, workgroup);
return runtime_ptr->AddProgram(shaders[shader_index], code.parameters,
code.objects, num_workgroups);
}));
@@ -190,8 +190,7 @@ class CompilerImpl : public Compiler {
"Workload uint3() requires all output sizes to match");
}
}
attr.code.workload =
uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4));
attr.code.workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
}

int num_textures = 0;

@@ -88,8 +88,8 @@ absl::Status ConverterBhwcToPhwc4::Convert(const BHWC& shape,
return absl::UnimplementedError(
"BhwcToPhwc4: Batch size is not equal to 1.");
}
uint3 workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4));
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);

RETURN_IF_ERROR(program_.SetParameter(
{"sizes_",

@@ -83,7 +83,7 @@ absl::Status ConverterPhwc4ToBhwc::Convert(const BHWC& shape,
}

uint3 workload = uint3(shape.w, shape.h, shape.c);
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);

// TODO(akulik): simply pass workload as soon as UniformParameter
// supports uint3
@ -102,7 +102,7 @@ class Add : public NodeShader {
// Declare workload explicitly because shader depends on gid.z.
/*workload=*/
uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1],
IntegralDivideRoundUp(ctx.input_shapes[0][3], 4)),
DivideRoundUp(ctx.input_shapes[0][3], 4)),
/*workgroup=*/uint3(),
/*source_code=*/"value_0 += $add_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO,
@ -210,7 +210,7 @@ vec4 val = vec4(0.0f);
// * - you are going to write into these cells
// @ - you will fill these cells next cycles
// ^ - first elem you start writing from
int blocks_amount = IntegralDivideRoundUp<int>(in_ch, 4);
int blocks_amount = DivideRoundUp<int>(in_ch, 4);
code += "// Aligned case\n";
code += "// I'm going to make " + std::to_string(blocks_amount) +
" write(s)\n\n";
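The hunk above is shader-source generation: the emitted GLSL makes one write per packed 4-channel block. The same emission step, condensed into a standalone function (EmitAlignedCaseHeader is a made-up name; the real generator goes on to append the per-block writes):

// Illustrative sketch, not part of this commit.
#include <string>

std::string EmitAlignedCaseHeader(int in_ch) {
  const int blocks_amount = (in_ch + 3) / 4;  // DivideRoundUp<int>(in_ch, 4)
  std::string code = "// Aligned case\n";
  code += "// I'm going to make " + std::to_string(blocks_amount) + " write(s)\n\n";
  return code;
}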
@ -55,7 +55,7 @@ class Convolution : public NodeShader {
{"dilation_h", attr.dilations.h},
{"kernel_w", weights.w},
{"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
} else {
@ -71,7 +71,7 @@ class Convolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count},
{"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
}
@ -181,14 +181,14 @@ class Convolution1x1 : public NodeShader {

std::vector<Variable> parameters = {
{"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
};

std::vector<std::pair<std::string, Object>> objects = {
{"weights", MakeReadonlyObject(
uint3(4, IntegralDivideRoundUp(attr.weights.shape.i, 4),
IntegralDivideRoundUp(attr.weights.shape.o, 4)),
ConvertToPHWO4I4(attr.weights))}};
{"weights",
MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
DivideRoundUp(attr.weights.shape.o, 4)),
ConvertToPHWO4I4(attr.weights))}};
std::string source;
for (int i = 0; i < multiplier; i++) {
absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
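The read-only weights object above is sized 4 x ceil(I/4) x ceil(O/4), which matches the PHWO4I4 packing the converter's name suggests: four output channels per column, one column per input-slice/output-slice pair. A sketch of just the size computation (Conv1x1WeightsSize is a hypothetical name; the actual repacking is done by ConvertToPHWO4I4):

// Illustrative sketch, not part of this commit.
struct ObjectSize3 { int x, y, z; };

ObjectSize3 Conv1x1WeightsSize(int input_channels, int output_channels) {
  auto div_up = [](int n, int d) { return (n + d - 1) / d; };
  return {4, div_up(input_channels, 4), div_up(output_channels, 4)};
}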
@ -224,7 +224,7 @@ class Convolution1x1 : public NodeShader {
absl::StrAppend(&source, "value_0 = result0;\n");
}

auto dst_depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4);
auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
uint3 workgroup = uint3(16, 16, 1);
if (ctx.gpu_info->type == GpuType::ADRENO) {
if (dst_depth >= 2) {
@ -265,7 +265,7 @@ class Convolution1x1 : public NodeShader {
/*shared_variables=*/{},
/*workload=*/
uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
IntegralDivideRoundUp(ctx.output_shapes[0][3], 4)),
DivideRoundUp(ctx.output_shapes[0][3], 4)),
/*workgroup=*/
GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,
@ -64,7 +64,7 @@ class OpenGlConverterImpl : public TensorObjectConverter {
}

absl::Status Dispatch(const uint3& workload) {
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_);
uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
if (command_queue_) {
return command_queue_->Dispatch(program_, num_workgroups);
}
@ -256,7 +256,7 @@ class ToTensorConverter : public OpenGlConverterImpl {
return absl::InvalidArgumentError(
"ToTensorConverter: output data size does not match expected size.");
}
auto d = IntegralDivideRoundUp(shape_.c, 4);
auto d = DivideRoundUp(shape_.c, 4);
RETURN_IF_ERROR(program_.SetParameter(
{"sizes",
int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h),
@ -54,7 +54,7 @@ class DepthwiseConvolution : public NodeShader {
{"dilation_h", attr.dilations.h},
{"kernel_w", weights.w},
{"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
@ -71,7 +71,7 @@ class DepthwiseConvolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count},
{"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)},
};
@ -39,8 +39,8 @@ class FullyConnectedBuffers : public NodeShader {
const auto& attr =
absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr);

const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);

// This shader can work with any workgroup size, the values below work well
// for OpenGL.
@ -105,10 +105,9 @@ absl::Status GenerateMultiplyScalarCode(
/*shared_variables=*/{},
// Declare workload explicitly because shader depends on gid.z.
/*workload=*/
uint3(
static_cast<int>(ctx.input_shapes[0][2]),
static_cast<int>(ctx.input_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
uint3(static_cast<int>(ctx.input_shapes[0][2]),
static_cast<int>(ctx.input_shapes[0][1]),
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
/*workgroup=*/uint3(),
/*source_code=*/"value_0 *= $mul_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO,
@ -98,8 +98,8 @@ class Pad : public NodeShader {
source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n";
} else if (attr.prepended.c % 4 == 0) {
parameters.push_back(
{"src_slices", IntegralDivideRoundUp(
static_cast<int>(ctx.input_shapes[0][3]), 4)});
{"src_slices",
DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)});
source += R"(
int src_z = gid.z - $prepended.z$ / 4;
if (src_z >= 0 && src_z < $src_slices$) {
@ -69,8 +69,8 @@ class PReLULinearAlpha : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, "
@ -98,10 +98,10 @@ class PReLUFull : public NodeShader {
"Alpha shape does not match input shape.");
}

ObjectSize obj_size = uint3(
static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
ObjectSize obj_size =
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));

*generated_code =
attr.clip
@ -116,8 +116,8 @@ class PReLUFull : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = clamp(value_0, 0.0, $clip$) + "
@ -136,8 +136,8 @@ class PReLUFull : public NodeShader {
/*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(
static_cast<int>(ctx.output_shapes[0][3]), 4)),
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
4)),
/*workgroup=*/uint3(),
/*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ "
@ -60,13 +60,13 @@ class Softmax : public NodeShader {
private:
absl::Status GenerateCodeFor1x1(const GenerationContext& ctx,
GeneratedCode* generated_code) const {
const int depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4);
const int depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
std::vector<Variable> shared_variables = {
{"partial_sum", std::vector<float4>(8)},
};
std::vector<Variable> uniform_parameters = {
{"depth", depth},
{"depth_div_32", IntegralDivideRoundUp(depth, 32)},
{"depth_div_32", DivideRoundUp(depth, 32)},
{"mask", GetMask(ctx.output_shapes[0][3])},
};
std::string source_code = R"(
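The shared partial_sum above holds 8 float4, i.e. 32 floats, so the reduction presumably walks the packed depth in chunks of 32, which is where depth_div_32 comes from. The two derived uniforms, as a standalone sketch (hypothetical helper name):

// Illustrative sketch, not part of this commit.
struct SoftmaxUniforms { int depth; int depth_div_32; };

SoftmaxUniforms DeriveSoftmaxUniforms(int channels) {
  auto div_up = [](int n, int d) { return (n + d - 1) / d; };
  const int depth = div_up(channels, 4);
  return {depth, div_up(depth, 32)};
}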
@ -138,7 +138,7 @@ class Softmax : public NodeShader {
GeneratedCode* generated_code) const {
std::vector<Variable> parameters = {
{"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
{"mask", GetMask(ctx.output_shapes[0][3])},
};

@ -44,7 +44,7 @@ class ConvolutionTransposedBuffers : public NodeShader {
std::vector<Variable> parameters = {
{"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)},
{"src_depth", DivideRoundUp(weights.i, 4)},
{"kernel_size", int2(weights.w, weights.h)},
{"stride", int2(attr.stride.w, attr.stride.h)},
{"padding", int2(weights.w - 1 - attr.padding.prepended.w,
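The padding entry above mirrors the prepended padding against the kernel extent (weights.w - 1 - attr.padding.prepended.w), a common way to express a transposed convolution as a gather over the input. The one computation, isolated (FlippedPadding is a made-up name):

// Illustrative sketch, not part of this commit.
struct Padding2 { int w, h; };

Padding2 FlippedPadding(int kernel_w, int kernel_h, int prepended_w, int prepended_h) {
  return {kernel_w - 1 - prepended_w, kernel_h - 1 - prepended_h};
}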
@ -153,17 +153,17 @@ inline Object MakeReadonlyBuffer(const ObjectSize& size,

inline Object MakeReadonlyObject(const std::vector<float>& data) {
return MakeReadonlyObject(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}

inline Object MakeReadonlyTexture(const std::vector<float>& data) {
return MakeReadonlyTexture(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}

inline Object MakeReadonlyBuffer(const std::vector<float>& data) {
return MakeReadonlyBuffer(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
}

// TODO(akulik): find better place for functions below.
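These convenience overloads round the flat float array up to a whole number of 4-float elements before creating the object, presumably because the underlying buffers and textures store vec4 elements. The count they pass is simply:

// Illustrative sketch, not part of this commit.
#include <cstdint>
#include <vector>

uint32_t Vec4ElementCount(const std::vector<float>& data) {
  return (static_cast<uint32_t>(data.size()) + 3u) / 4u;
}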
@ -172,7 +172,7 @@ inline uint3 GetPHWC4Size(const BHWC& shape) {
uint3 size;
size.x = shape.w;
size.y = shape.h;
size.z = shape.b * IntegralDivideRoundUp(shape.c, 4);
size.z = shape.b * DivideRoundUp(shape.c, 4);
return size;
}

@ -167,10 +167,10 @@ std::vector<ComputeTaskDescriptorPtr> SelectWinograd36To4x4(

bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
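The suitability check sizes the Winograd F(4x4, 3x3) path: each output tile covers a 4x4 patch, so the tile grid is ceil(W/4) x ceil(H/4), and the depths are the usual 4-channel slices. The tile-grid part in isolation (Winograd4x4TileGrid is a hypothetical name):

// Illustrative sketch, not part of this commit.
struct TileGrid { int tiles_x, tiles_y; };

TileGrid Winograd4x4TileGrid(int dst_w, int dst_h) {
  auto div_up = [](int n, int d) { return (n + d - 1) / d; };
  return {div_up(dst_w, 4), div_up(dst_h, 4)};
}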
@ -229,8 +229,8 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) {
int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
int tiles_x = DivideRoundUp(dst_shape.w, 4);
int tiles_y = DivideRoundUp(dst_shape.h, 4);

Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding;
@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h"

using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::BHWC;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::metal::CreateComputeProgram;

@implementation TFLBufferConvert {
@ -102,10 +102,10 @@ using ::tflite::gpu::metal::CreateComputeProgram;
[encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2];

MTLSize group_size = MTLSizeMake(16, 16, 1);
int layers = IntegralDivideRoundUp(shape.c, 4);
int groups_x = IntegralDivideRoundUp(shape.w, group_size.width);
int groups_y = IntegralDivideRoundUp(shape.h, group_size.height);
int groups_z = IntegralDivideRoundUp(layers, group_size.depth);
int layers = DivideRoundUp(shape.c, 4);
int groups_x = DivideRoundUp(shape.w, group_size.width);
int groups_y = DivideRoundUp(shape.h, group_size.height);
int groups_z = DivideRoundUp(layers, group_size.depth);
MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z);
[encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size];
}
@ -440,9 +440,9 @@ ComputeTaskDescriptorPtr NonLinkableStub(int operation_id, ValueId input_id,
|
||||
desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(input_id)->second;
|
||||
uint3 groups_size{16, 16, 1};
|
||||
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
|
||||
IntegralDivideRoundUp(dimension.h, groups_size.y),
|
||||
IntegralDivideRoundUp(dimension.c, 4)};
|
||||
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
|
||||
DivideRoundUp(dimension.h, groups_size.y),
|
||||
DivideRoundUp(dimension.c, 4)};
|
||||
return std::make_pair(groups_size, groups_count);
|
||||
};
|
||||
|
||||
|
@ -71,7 +71,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
|
||||
// Also it is easy to write a loop in this case, to prevent long kernel
|
||||
// generation.
|
||||
for (int i = 0; i < channels.size(); ++i) {
|
||||
const int depth = IntegralDivideRoundUp(channels[i], 4);
|
||||
const int depth = DivideRoundUp(channels[i], 4);
|
||||
const std::string src_buffer = "src_buffer" + std::to_string(i);
|
||||
c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n";
|
||||
c += " int src_index = i * U.src_size.w + xy_offset;\n";
|
||||
@ -88,7 +88,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
|
||||
int read_index = 0;
|
||||
int z = 0;
|
||||
for (int i = 0; i < channels.size(); ++i) {
|
||||
const int depth = IntegralDivideRoundUp(channels[i], 4);
|
||||
const int depth = DivideRoundUp(channels[i], 4);
|
||||
const std::string src_buffer = "src_buffer" + std::to_string(i);
|
||||
for (int d = 0; d < depth; ++d) {
|
||||
const int channels_in_group = std::min(4, channels[i] - d * 4);
|
||||
@ -168,11 +168,11 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
|
||||
std::vector<int> uniform_params{
|
||||
src_shape.w,
|
||||
src_shape.h,
|
||||
IntegralDivideRoundUp(src_shape.c, 4),
|
||||
DivideRoundUp(src_shape.c, 4),
|
||||
src_shape.w * src_shape.h,
|
||||
dst_shape.w,
|
||||
dst_shape.h,
|
||||
IntegralDivideRoundUp(dst_shape.c, 4),
|
||||
DivideRoundUp(dst_shape.c, 4),
|
||||
dst_shape.w * dst_shape.h,
|
||||
};
|
||||
return GetByteBuffer(uniform_params);
|
||||
@ -184,9 +184,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
|
||||
uint3 grid(dst_shape.w, dst_shape.h, 1);
|
||||
uint3 group_size{8u, 4u, 1u};
|
||||
uint3 groups;
|
||||
groups.x = IntegralDivideRoundUp(grid.x, group_size.x);
|
||||
groups.y = IntegralDivideRoundUp(grid.y, group_size.y);
|
||||
groups.z = IntegralDivideRoundUp(grid.z, group_size.z);
|
||||
groups.x = DivideRoundUp(grid.x, group_size.x);
|
||||
groups.y = DivideRoundUp(grid.y, group_size.y);
|
||||
groups.z = DivideRoundUp(grid.z, group_size.z);
|
||||
return std::make_pair(group_size, groups);
|
||||
};
|
||||
|
||||
@ -265,7 +265,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
|
||||
[output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
std::vector<int> uniform_params{dimension.w, dimension.h,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
/*padding=*/0};
|
||||
return GetByteBuffer(uniform_params);
|
||||
}},
|
||||
@ -274,9 +274,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& output_dims = buffers.find(output_id)->second;
|
||||
const uint3 groups_size{8, 4, 1};
|
||||
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(output_dims.c, 4);
|
||||
int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
|
||||
int groups_z = DivideRoundUp(output_dims.c, 4);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
@ -356,7 +356,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
|
||||
[output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
std::vector<int> uniform_params{dimension.w, dimension.h,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
/*padding=*/0};
|
||||
return GetByteBuffer(uniform_params);
|
||||
}},
|
||||
@ -365,9 +365,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& output_dims = buffers.find(output_id)->second;
|
||||
const uint3 groups_size{8, 4, 1};
|
||||
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(output_dims.c, 4);
|
||||
int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
|
||||
int groups_z = DivideRoundUp(output_dims.c, 4);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -72,7 +72,7 @@ struct ConvParams {
|
||||
namespace {
|
||||
|
||||
int GetNumOutputSlices(int dst_channels) {
|
||||
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
|
||||
const int dst_depth = DivideRoundUp(dst_channels, 4);
|
||||
if (dst_depth % 4 == 0 || dst_depth >= 16) {
|
||||
return 4;
|
||||
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
|
||||
@ -571,8 +571,8 @@ kernel void ComputeFunction(
|
||||
std::vector<float> ReorderWeightsForConv(
|
||||
const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
|
||||
const ConvParams& params) {
|
||||
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
|
||||
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
|
||||
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
std::vector<float> weights_reordered(
|
||||
weights.shape.w * weights.shape.h *
|
||||
AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4);
|
||||
@ -580,8 +580,7 @@ std::vector<float> ReorderWeightsForConv(
|
||||
bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4;
|
||||
|
||||
int counter = 0;
|
||||
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, params.block_size.z);
|
||||
++d) {
|
||||
for (int d = 0; d < DivideRoundUp(dst_depth, params.block_size.z); ++d) {
|
||||
for (int y = 0; y < weights.shape.h; ++y) {
|
||||
for (int x = 0; x < weights.shape.w; ++x) {
|
||||
for (int s = 0; s < src_depth; ++s) {
|
||||
@ -618,17 +617,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
|
||||
const BHWC& dst_size,
|
||||
const Convolution2DAttributes& attr,
|
||||
const ConvParams& params) {
|
||||
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x);
|
||||
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y);
|
||||
const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
|
||||
const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
|
||||
std::vector<int> uniform_params = {
|
||||
src_size.w,
|
||||
src_size.h,
|
||||
src_size.w * src_size.h,
|
||||
IntegralDivideRoundUp(src_size.c, 4),
|
||||
DivideRoundUp(src_size.c, 4),
|
||||
dst_size.w,
|
||||
dst_size.h,
|
||||
dst_size.w * dst_size.h,
|
||||
IntegralDivideRoundUp(dst_size.c, 4),
|
||||
DivideRoundUp(dst_size.c, 4),
|
||||
attr.strides.w,
|
||||
attr.strides.h,
|
||||
-attr.padding.prepended.w,
|
||||
@ -652,17 +651,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
|
||||
std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
|
||||
const BHWC& dst_size,
|
||||
const ConvParams& params) {
|
||||
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x);
|
||||
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y);
|
||||
const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
|
||||
const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
|
||||
std::vector<int> uniform_params = {
|
||||
src_size.w,
|
||||
src_size.h,
|
||||
src_size.w * src_size.h,
|
||||
IntegralDivideRoundUp(src_size.c, 4),
|
||||
DivideRoundUp(src_size.c, 4),
|
||||
dst_size.w,
|
||||
dst_size.h,
|
||||
dst_size.w * dst_size.h,
|
||||
IntegralDivideRoundUp(dst_size.c, 4),
|
||||
DivideRoundUp(dst_size.c, 4),
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
@ -685,38 +684,37 @@ std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
|
||||
|
||||
int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size,
|
||||
const int3& block_size) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
|
||||
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
|
||||
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = DivideRoundUp(dst_slices, block_size.z);
|
||||
|
||||
return IntegralDivideRoundUp(grid_x, wg_size.x) *
|
||||
IntegralDivideRoundUp(grid_y, wg_size.y) *
|
||||
IntegralDivideRoundUp(grid_z, wg_size.z);
|
||||
return DivideRoundUp(grid_x, wg_size.x) * DivideRoundUp(grid_y, wg_size.y) *
|
||||
DivideRoundUp(grid_z, wg_size.z);
|
||||
}
|
||||
|
||||
int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size,
|
||||
const int3& block_size) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
|
||||
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
|
||||
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = DivideRoundUp(dst_slices, block_size.z);
|
||||
|
||||
return IntegralDivideRoundUp(grid_x * grid_y, wg_size.x) *
|
||||
IntegralDivideRoundUp(grid_z, wg_size.y);
|
||||
return DivideRoundUp(grid_x * grid_y, wg_size.x) *
|
||||
DivideRoundUp(grid_z, wg_size.y);
|
||||
}
|
||||
|
||||
int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size,
|
||||
const int3& block_size) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
|
||||
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z);
|
||||
int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
|
||||
int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
|
||||
int grid_z = DivideRoundUp(dst_slices, block_size.z);
|
||||
|
||||
return IntegralDivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
|
||||
return DivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
|
||||
}
|
||||
|
||||
bool IsKernelXIs1(const Convolution2DAttributes& attr) {
|
||||
@ -758,8 +756,8 @@ int GetRecommendedBlockSize(const AppleGPUInfo& apple_info,
|
||||
ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
|
||||
ConvParams params;
|
||||
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
|
||||
@ -835,8 +833,8 @@ ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
|
||||
ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape);
|
||||
int3 block_size = int3(1, 1, 1);
|
||||
if (blk_total_size >= 2 && apple_info.IsBionic()) {
|
||||
@ -917,8 +915,8 @@ ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
|
||||
ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
|
||||
const RuntimeOptions& options,
|
||||
const BHWC& dst_shape) {
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvParams params;
|
||||
params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST;
|
||||
params.x_kernel_is_1 = IsKernelXIs1(attr);
|
||||
@ -1017,29 +1015,28 @@ ConvParams GetConvParams(const DeviceInfo& device_info,
|
||||
|
||||
std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params,
|
||||
const BHWC& shape) {
|
||||
const int dst_slices = IntegralDivideRoundUp(shape.c, 4);
|
||||
const int dst_slices = DivideRoundUp(shape.c, 4);
|
||||
|
||||
int grid_x = IntegralDivideRoundUp(shape.w, params.block_size.x);
|
||||
int grid_y = IntegralDivideRoundUp(shape.h, params.block_size.y);
|
||||
int grid_z = IntegralDivideRoundUp(dst_slices, params.block_size.z);
|
||||
int grid_x = DivideRoundUp(shape.w, params.block_size.x);
|
||||
int grid_y = DivideRoundUp(shape.h, params.block_size.y);
|
||||
int grid_z = DivideRoundUp(dst_slices, params.block_size.z);
|
||||
|
||||
const uint3 group_size(params.work_group_size.x, params.work_group_size.y,
|
||||
params.work_group_size.z);
|
||||
int3 wg;
|
||||
uint3 groups_count;
|
||||
if (params.linear_whs) {
|
||||
wg.x = IntegralDivideRoundUp(grid_x * grid_y * grid_z,
|
||||
params.work_group_size.x);
|
||||
wg.x = DivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x);
|
||||
groups_count = uint3(wg.x, 1, 1);
|
||||
} else if (params.linear_wh) {
|
||||
wg.x = IntegralDivideRoundUp(grid_x * grid_y, params.work_group_size.x);
|
||||
wg.y = IntegralDivideRoundUp(grid_z, params.work_group_size.y);
|
||||
wg.x = DivideRoundUp(grid_x * grid_y, params.work_group_size.x);
|
||||
wg.y = DivideRoundUp(grid_z, params.work_group_size.y);
|
||||
groups_count = uint3(wg[params.work_group_launch_order.x],
|
||||
wg[params.work_group_launch_order.y], 1);
|
||||
} else {
|
||||
wg.x = IntegralDivideRoundUp(grid_x, params.work_group_size.x);
|
||||
wg.y = IntegralDivideRoundUp(grid_y, params.work_group_size.y);
|
||||
wg.z = IntegralDivideRoundUp(grid_z, params.work_group_size.z);
|
||||
wg.x = DivideRoundUp(grid_x, params.work_group_size.x);
|
||||
wg.y = DivideRoundUp(grid_y, params.work_group_size.y);
|
||||
wg.z = DivideRoundUp(grid_z, params.work_group_size.z);
|
||||
groups_count = uint3(wg[params.work_group_launch_order.x],
|
||||
wg[params.work_group_launch_order.y],
|
||||
wg[params.work_group_launch_order.z]);
|
||||
@ -1076,7 +1073,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
|
||||
std::string addr_space =
|
||||
params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant"
|
||||
: "device";
|
||||
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
desc->immutable_buffers = {
|
||||
{addr_space + " FLT4* const filters",
|
||||
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||
@ -1108,7 +1105,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionWino4x4To6x6(
|
||||
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
|
||||
const Convolution2DAttributes& attr, const DeviceInfo& device_info,
|
||||
const RuntimeOptions& options) {
|
||||
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
ConvParams params;
|
||||
params.work_group_launch_order = int3(2, 0, 1);
|
||||
params.src_depth_loop_size = 1;
|
||||
|
@ -34,6 +34,7 @@ using ::tflite::gpu::Axis;
|
||||
using ::tflite::gpu::BHWC;
|
||||
using ::tflite::gpu::Convolution2DAttributes;
|
||||
using ::tflite::gpu::DataType;
|
||||
using ::tflite::gpu::DivideRoundUp;
|
||||
using ::tflite::gpu::HW;
|
||||
using ::tflite::gpu::Linear;
|
||||
using ::tflite::gpu::OHWI;
|
||||
@ -44,7 +45,6 @@ using ::tflite::gpu::TensorRef;
|
||||
using ::tflite::gpu::ValueId;
|
||||
using ::tflite::gpu::metal::ConvolutionGeneric;
|
||||
using ::tflite::gpu::metal::ConvolutionWino4x4To6x6;
|
||||
using ::tflite::gpu::IntegralDivideRoundUp;
|
||||
using ::tflite::gpu::metal::CompareVectors;
|
||||
using ::tflite::gpu::metal::SingleOpModel;
|
||||
|
||||
@ -275,7 +275,7 @@ using ::tflite::gpu::metal::SingleOpModel;
|
||||
BHWC conv_shape;
|
||||
conv_shape.b = dst_shape.b;
|
||||
conv_shape.h = 36;
|
||||
conv_shape.w = IntegralDivideRoundUp(new_width, 4) * IntegralDivideRoundUp(new_height, 4);
|
||||
conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
|
||||
conv_shape.c = dst_shape.c;
|
||||
|
||||
TensorFloat32 src_tensor;
|
||||
|
@ -208,7 +208,7 @@ kernel void ComputeFunction(
|
||||
// DepthWiseConv3x3Stride1x1
|
||||
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1(
|
||||
const DepthwiseConvolution2DAttributes& attr) {
|
||||
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int kernel_x = 3;
|
||||
const int kernel_y = 3;
|
||||
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
|
||||
@ -250,11 +250,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride1x1(
|
||||
src_size.w,
|
||||
src_size.h,
|
||||
src_size.w * src_size.h,
|
||||
IntegralDivideRoundUp(src_size.c, 4),
|
||||
DivideRoundUp(src_size.c, 4),
|
||||
dst_size.w,
|
||||
dst_size.h,
|
||||
dst_size.w * dst_size.h,
|
||||
IntegralDivideRoundUp(dst_size.c, 4),
|
||||
DivideRoundUp(dst_size.c, 4),
|
||||
-params.padding.prepended.w,
|
||||
-params.padding.prepended.h,
|
||||
0, // dummy, for alignment
|
||||
@ -403,7 +403,7 @@ kernel void ComputeFunction(
|
||||
// DepthWiseConv3x3Stride2
|
||||
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2(
|
||||
const DepthwiseConvolution2DAttributes& attr) {
|
||||
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int kernel_x = 3;
|
||||
const int kernel_y = 3;
|
||||
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
|
||||
@ -445,11 +445,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride2(
|
||||
src_size.w,
|
||||
src_size.h,
|
||||
src_size.w * src_size.h,
|
||||
IntegralDivideRoundUp(src_size.c, 4),
|
||||
DivideRoundUp(src_size.c, 4),
|
||||
dst_size.w,
|
||||
dst_size.h,
|
||||
dst_size.w * dst_size.h,
|
||||
IntegralDivideRoundUp(dst_size.c, 4),
|
||||
DivideRoundUp(dst_size.c, 4),
|
||||
-attr.padding.prepended.w,
|
||||
-attr.padding.prepended.h,
|
||||
attr.strides.w,
|
||||
@ -586,11 +586,11 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
|
||||
std::vector<int> uniform_params{
|
||||
dimension.w,
|
||||
dimension.h,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
0,
|
||||
output_dimension.w,
|
||||
output_dimension.h,
|
||||
IntegralDivideRoundUp(output_dimension.c, 4),
|
||||
DivideRoundUp(output_dimension.c, 4),
|
||||
0,
|
||||
attr.strides.w,
|
||||
attr.strides.h,
|
||||
@ -612,9 +612,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
uint3 groups_size{8, 4, 1};
|
||||
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
|
||||
IntegralDivideRoundUp(dimension.h, groups_size.y),
|
||||
IntegralDivideRoundUp(dimension.c, 4)};
|
||||
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
|
||||
DivideRoundUp(dimension.h, groups_size.y),
|
||||
DivideRoundUp(dimension.c, 4)};
|
||||
return std::make_pair(groups_size, groups_count);
|
||||
};
|
||||
|
||||
@ -661,17 +661,17 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
|
||||
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
const int grid_x = IntegralDivideRoundUp(dimension.w, 2);
|
||||
const int grid_y = IntegralDivideRoundUp(dimension.h, 2);
|
||||
const int grid_z = IntegralDivideRoundUp(dimension.c, 4);
|
||||
const int grid_x = DivideRoundUp(dimension.w, 2);
|
||||
const int grid_y = DivideRoundUp(dimension.h, 2);
|
||||
const int grid_z = DivideRoundUp(dimension.c, 4);
|
||||
uint3 group_size{8, 4, 1};
|
||||
if (grid_x <= 4) {
|
||||
group_size.x = 4;
|
||||
group_size.z = grid_z % 2 == 0 ? 2 : 1;
|
||||
}
|
||||
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
|
||||
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
|
||||
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
|
||||
const int groups_x = DivideRoundUp(grid_x, group_size.x);
|
||||
const int groups_y = DivideRoundUp(grid_y, group_size.y);
|
||||
const int groups_z = DivideRoundUp(grid_z, group_size.z);
|
||||
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
|
||||
};
|
||||
|
||||
@ -726,12 +726,12 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
const int grid_x = dimension.w;
|
||||
const int grid_y = IntegralDivideRoundUp(dimension.h, 2);
|
||||
const int grid_z = IntegralDivideRoundUp(dimension.c, 4);
|
||||
const int grid_y = DivideRoundUp(dimension.h, 2);
|
||||
const int grid_z = DivideRoundUp(dimension.c, 4);
|
||||
const uint3 group_size{8, 4, 1};
|
||||
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
|
||||
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
|
||||
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
|
||||
const int groups_x = DivideRoundUp(grid_x, group_size.x);
|
||||
const int groups_y = DivideRoundUp(grid_y, group_size.y);
|
||||
const int groups_z = DivideRoundUp(grid_z, group_size.z);
|
||||
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
|
||||
};
|
||||
|
||||
|
@ -45,7 +45,7 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
|
||||
const std::string barrier = device_info.IsWaveSizeEqualTo32()
|
||||
? "SIMDGROUP_BARRIER"
|
||||
: "threadgroup_barrier";
|
||||
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
|
||||
const int src_depth = DivideRoundUp(src_channels, 4);
|
||||
std::stringstream code;
|
||||
code << R"(
|
||||
#include <metal_stdlib>
|
||||
@ -116,9 +116,8 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
|
||||
}
|
||||
}
|
||||
)";
|
||||
const int src_depth_sub_groups = shared_memory
|
||||
? IntegralDivideRoundUp(src_depth, 32)
|
||||
: IntegralDivideRoundUp(src_depth, 4);
|
||||
const int src_depth_sub_groups = shared_memory ? DivideRoundUp(src_depth, 32)
|
||||
: DivideRoundUp(src_depth, 4);
|
||||
return absl::Substitute(code.str(), src_depth_sub_groups, barrier);
|
||||
}
|
||||
} // namespace
|
||||
@ -146,7 +145,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
|
||||
bool shared_memory =
|
||||
device_info.IsAppleGPU() &&
|
||||
device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
|
||||
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4);
|
||||
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
|
||||
|
||||
@ -179,8 +178,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
|
||||
{"constant uniforms& params",
|
||||
[attr](const std::map<ValueId, BHWC>& buffers) {
|
||||
std::vector<uint32_t> uniform_params{
|
||||
static_cast<uint32_t>(
|
||||
IntegralDivideRoundUp(attr.weights.shape.i, 4)),
|
||||
static_cast<uint32_t>(DivideRoundUp(attr.weights.shape.i, 4)),
|
||||
static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)),
|
||||
static_cast<uint32_t>(attr.weights.shape.o),
|
||||
static_cast<uint32_t>(0),
|
||||
@ -192,7 +190,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
|
||||
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
|
||||
const uint3 groups_size{8, 4, 1};
|
||||
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
|
||||
int groups_x = IntegralDivideRoundUp(dst_channels_aligned, groups_size.x);
|
||||
int groups_x = DivideRoundUp(dst_channels_aligned, groups_size.x);
|
||||
return std::make_pair(groups_size, uint3{groups_x, 1, 1});
|
||||
};
|
||||
|
||||
|
@ -131,9 +131,9 @@ std::vector<ComputeTaskDescriptorPtr> MaxUnpooling(
|
||||
const auto& src_shape = buffers.find(input_id)->second;
|
||||
BHWC dst_shape = CalculateOutputShape(src_shape, params);
|
||||
const uint3 groups_size{16, 16, 1};
|
||||
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_z = DivideRoundUp(dst_shape.c, 4);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -133,7 +133,7 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
|
||||
[input_id, output_id,
|
||||
work_group_size](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& src_shape = buffers.find(input_id)->second;
|
||||
const int src_slices = IntegralDivideRoundUp(src_shape.c, 4);
|
||||
const int src_slices = DivideRoundUp(src_shape.c, 4);
|
||||
struct uniforms {
|
||||
int4 src_size;
|
||||
float4 inv_multipliers;
|
||||
@ -153,8 +153,8 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
|
||||
desc->resize_function = [output_id, work_group_size](
|
||||
const std::map<ValueId, BHWC>& buffers) {
|
||||
BHWC dst_shape = buffers.find(output_id)->second;
|
||||
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
const int groups_z = IntegralDivideRoundUp(dst_slices, work_group_size.z);
|
||||
const int dst_slices = DivideRoundUp(dst_shape.c, 4);
|
||||
const int groups_z = DivideRoundUp(dst_slices, work_group_size.z);
|
||||
return std::make_pair(work_group_size, uint3{1, 1, groups_z});
|
||||
};
|
||||
return {desc};
|
||||
|
@ -177,12 +177,12 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
|
||||
dimension.w,
|
||||
dimension.h,
|
||||
dimension.c,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
// int4 dst_size
|
||||
output_dimension.w,
|
||||
output_dimension.h,
|
||||
output_dimension.c,
|
||||
IntegralDivideRoundUp(output_dimension.c, 4),
|
||||
DivideRoundUp(output_dimension.c, 4),
|
||||
// int4 prepended padding
|
||||
attr.prepended.w,
|
||||
attr.prepended.h,
|
||||
@ -198,10 +198,10 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
|
||||
const uint3 groups_size{16, 16, 1};
|
||||
const auto& src_shape = buffers.find(input_id)->second;
|
||||
BHWC dst_shape = CalculateOutputShape(src_shape, attr);
|
||||
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
|
||||
const int dst_layers = DivideRoundUp(dst_shape.c, 4);
|
||||
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -224,11 +224,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
|
||||
std::vector<int> uniform_params = {
|
||||
dimension.w,
|
||||
dimension.h,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
dimension.w * dimension.h,
|
||||
output_dimension.w,
|
||||
output_dimension.h,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
output_dimension.w * output_dimension.h,
|
||||
params.strides.w,
|
||||
params.strides.h,
|
||||
@ -242,11 +242,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
BHWC dst_shape = buffers.find(output_id)->second;
|
||||
const uint3 grid =
|
||||
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4));
|
||||
uint3(dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4));
|
||||
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
|
||||
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
|
||||
int groups_x = DivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = DivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = DivideRoundUp(grid.z, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -159,11 +159,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
|
||||
|
||||
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
|
||||
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
|
||||
IntegralDivideRoundUp(attr.new_shape.c, 4));
|
||||
DivideRoundUp(attr.new_shape.c, 4));
|
||||
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
|
||||
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
|
||||
int groups_x = DivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = DivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = DivideRoundUp(grid.z, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
@ -197,14 +197,14 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
|
||||
const auto& dst_dim = buffers.find(output_id)->second;
|
||||
std::vector<int32_t> uniform_params{
|
||||
// int4 src_size
|
||||
src_dim.w, src_dim.h, IntegralDivideRoundUp(src_dim.c, 4),
|
||||
src_dim.w, src_dim.h, DivideRoundUp(src_dim.c, 4),
|
||||
src_dim.w * src_dim.h,
|
||||
// int4 dst_size
|
||||
dst_dim.w, dst_dim.h, IntegralDivideRoundUp(dst_dim.c, 4),
|
||||
dst_dim.w, dst_dim.h, DivideRoundUp(dst_dim.c, 4),
|
||||
dst_dim.w * dst_dim.h,
|
||||
// int2 plane_xz
|
||||
src_dim.w * IntegralDivideRoundUp(src_dim.c, 4),
|
||||
dst_dim.w * IntegralDivideRoundUp(dst_dim.c, 4),
|
||||
src_dim.w * DivideRoundUp(src_dim.c, 4),
|
||||
dst_dim.w * DivideRoundUp(dst_dim.c, 4),
|
||||
0, // dummy, for alignment
|
||||
0, // dummy, for alignment
|
||||
0, // dummy, for alignment
|
||||
@ -218,11 +218,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
|
||||
|
||||
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
|
||||
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
|
||||
IntegralDivideRoundUp(attr.new_shape.c, 4));
|
||||
DivideRoundUp(attr.new_shape.c, 4));
|
||||
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
|
||||
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
|
||||
int groups_x = DivideRoundUp(grid.x, groups_size.x);
|
||||
int groups_y = DivideRoundUp(grid.y, groups_size.y);
|
||||
int groups_z = DivideRoundUp(grid.z, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -153,10 +153,10 @@ std::vector<ComputeTaskDescriptorPtr> Resize(int id, ValueId input_id,
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
const uint3 groups_size{16, 16, 1};
|
||||
const auto& dst_dim = buffers.find(output_id)->second;
|
||||
int groups_x = IntegralDivideRoundUp(dst_dim.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(dst_dim.h, groups_size.y);
|
||||
const int dst_layers = IntegralDivideRoundUp(dst_dim.c, 4);
|
||||
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
|
||||
int groups_x = DivideRoundUp(dst_dim.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(dst_dim.h, groups_size.y);
|
||||
const int dst_layers = DivideRoundUp(dst_dim.c, 4);
|
||||
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
return {desc};
|
||||
|
@ -157,12 +157,12 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
|
||||
dimension.w,
|
||||
dimension.h,
|
||||
dimension.c,
|
||||
IntegralDivideRoundUp(dimension.c, 4),
|
||||
DivideRoundUp(dimension.c, 4),
|
||||
// int4 dst_size
|
||||
output_dimension.w,
|
||||
output_dimension.h,
|
||||
output_dimension.c,
|
||||
IntegralDivideRoundUp(output_dimension.c, 4),
|
||||
DivideRoundUp(output_dimension.c, 4),
|
||||
};
|
||||
return GetByteBuffer(uniform_params);
|
||||
}},
|
||||
@ -173,10 +173,10 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
|
||||
const uint3 groups_size{16, 16, 1};
|
||||
const auto& src_shape = buffers.find(input_id)->second;
|
||||
BHWC dst_shape = CalculateOutputShape(src_shape, attr);
|
||||
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
|
||||
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4);
|
||||
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
|
||||
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
|
||||
const int dst_layers = DivideRoundUp(dst_shape.c, 4);
|
||||
int groups_z = DivideRoundUp(dst_layers, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
|
||||
|
@ -169,8 +169,8 @@ std::vector<ComputeTaskDescriptorPtr> Softmax(int id, ValueId input_id,
|
||||
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
|
||||
uint3 groups_size{8, 4, 1};
|
||||
const auto& dimension = buffers.find(output_id)->second;
|
||||
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x),
|
||||
IntegralDivideRoundUp(dimension.h, groups_size.y), 1};
|
||||
uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
|
||||
DivideRoundUp(dimension.h, groups_size.y), 1};
|
||||
return std::make_pair(groups_size, groups_count);
|
||||
};
|
||||
|
||||
@ -198,13 +198,13 @@ std::vector<ComputeTaskDescriptorPtr> Softmax1x1(int id, ValueId input_id,
|
||||
desc->uniform_buffers = {
|
||||
{"constant uniforms& params",
|
||||
[channels_count](const std::map<ValueId, BHWC>& buffers) {
|
||||
const int src_depth = IntegralDivideRoundUp(channels_count, 4);
|
||||
const int src_depth = DivideRoundUp(channels_count, 4);
|
||||
struct uniforms {
|
||||
int4 size;
|
||||
float4 mask;
|
||||
};
|
||||
uniforms params;
|
||||
params.size = {src_depth, IntegralDivideRoundUp(src_depth, 32), 1, 1};
|
||||
params.size = {src_depth, DivideRoundUp(src_depth, 32), 1, 1};
|
||||
params.mask = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4;
|
||||
for (int i = 0; i < reminder; ++i) {
|
||||
|
@ -113,12 +113,12 @@ kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) {
|
||||
input_shape.h / attr.block_size,
|
||||
input_shape.w / attr.block_size,
|
||||
input_shape.c * attr.block_size * attr.block_size);
|
||||
const uint3 grid = uint3(output_shape.w, output_shape.h,
|
||||
IntegralDivideRoundUp(output_shape.c, 4));
|
||||
const uint3 grid =
|
||||
uint3(output_shape.w, output_shape.h, DivideRoundUp(output_shape.c, 4));
|
||||
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
|
||||
const int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
|
||||
const int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
|
||||
const int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
|
||||
const int groups_x = DivideRoundUp(grid.x, groups_size.x);
|
||||
const int groups_y = DivideRoundUp(grid.y, groups_size.y);
|
||||
const int groups_z = DivideRoundUp(grid.z, groups_size.z);
|
||||
return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z));
|
||||
};
|
||||
return {desc};
|
||||
|
@ -130,8 +130,8 @@ std::string GetDeconvolution(const ConvolutionTransposedAttributes& attr) {
|
||||
constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
|
||||
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
|
||||
inner_size_y, kernel_x - 1, kernel_y - 1);
|
||||
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
|
||||
return absl::Substitute(shader_source, src_depth * dst_channels_aligned,
|
||||
src_depth, dst_depth, attr.weights.shape.o,
|
||||
@ -264,8 +264,8 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
|
||||
constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
|
||||
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
|
||||
inner_size_y, kernel_x - 1, kernel_y - 1);
|
||||
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
|
||||
const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w;
|
||||
const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h;
|
||||
@ -464,7 +464,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
|
||||
(kThreadGroupWidth + params.weights.shape.w) / params.stride.w;
|
||||
const int src_local_size_y =
|
||||
(kThreadGroupHeight + params.weights.shape.h) / params.stride.h;
|
||||
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4);
|
||||
const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
|
||||
const int shared_size =
|
||||
sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y;
|
||||
if (shared_size < 1000 * 16 &&
|
||||
@ -543,8 +543,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
|
||||
const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1};
|
||||
BHWC dst_shape =
|
||||
CalculateOutputShape(buffers.find(input_id)->second, params);
|
||||
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
|
||||
int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
|
||||
int groups_z = 1;
|
||||
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
|
||||
};
|
||||
@ -556,8 +556,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
|
||||
int id, ValueId input_id, ValueId output_id,
|
||||
const ConvolutionTransposedAttributes& params,
|
||||
const DeviceInfo& device_info, const RuntimeOptions& options) {
|
||||
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4);
|
||||
const int dst_depth = IntegralDivideRoundUp(params.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
|
||||
const int dst_depth = DivideRoundUp(params.weights.shape.o, 4);
|
||||
const int kernel_x = 4;
|
||||
const int kernel_y = 4;
|
||||
|
||||
@ -645,7 +645,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
|
||||
[input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) {
|
||||
const auto& src_shape = buffers.find(input_id)->second;
|
||||
const auto& dst_shape = buffers.find(output_id)->second;
|
||||
const int src_depth = IntegralDivideRoundUp(src_shape.c, 4);
|
||||
const int src_depth = DivideRoundUp(src_shape.c, 4);
|
||||
std::vector<int> uniform_params{
|
||||
src_shape.w,
|
||||
src_shape.h,
|
||||
@ -653,7 +653,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
|
||||
src_shape.w * src_shape.h,
|
||||
dst_shape.w,
|
||||
dst_shape.h,
|
||||
IntegralDivideRoundUp(dst_shape.c, 4),
|
||||
DivideRoundUp(dst_shape.c, 4),
|
||||
0,
|
||||
4 * 16 * src_depth,
|
||||
0,
|
||||
@ -667,13 +667,13 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
desc->resize_function = [output_id, block_size,
params](const std::map<ValueId, BHWC>& buffers) {
const auto& dst_shape = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
const int grid_x = DivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = DivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = DivideRoundUp(dst_shape.c, 4);
const uint3 group_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(grid_x, group_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, group_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, group_size.z);
int groups_x = DivideRoundUp(grid_x, group_size.x);
int groups_y = DivideRoundUp(grid_y, group_size.y);
int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y});
};
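The resize_function hunks above all follow the same dispatch pattern: a logical grid is derived from the output shape, then each grid extent is divided (rounding up) by the fixed thread-group extent to get the number of groups to launch. A hedged, self-contained sketch of that pattern; uint3 and the helper below are stand-ins for illustration, not the file's actual types:

#include <utility>

// Stand-in for the delegate's uint3 type, for illustration only.
struct uint3 { unsigned x, y, z; };

// Cover a grid with fixed-size thread groups, rounding up so nothing is cut off.
std::pair<uint3, uint3> CoverGrid(const uint3& grid, const uint3& group_size) {
  auto div_up = [](unsigned n, unsigned d) { return (n + d - 1) / d; };
  const uint3 groups{div_up(grid.x, group_size.x),
                     div_up(grid.y, group_size.y),
                     div_up(grid.z, group_size.z)};
  return std::make_pair(group_size, groups);
}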
@ -486,8 +486,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
BHWC dst_shape;
dst_shape.b = src_shape.b;
dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) *
IntegralDivideRoundUp(new_height, 4);
dst_shape.w =
DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c;
return dst_shape;
}};
@ -501,16 +501,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4);
int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
-attr.padding.prepended.w,
-attr.padding.prepended.h,
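In the hunk above, tiles_x and tiles_y count the 4x4 output tiles covering the padded spatial extent; the transformed tensor then stores one column per tile and 36 rows, matching the dst_shape.h = 36 / dst_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4) shape function earlier in this file. A small worked example with made-up extents (the numbers are hypothetical; only the arithmetic is taken from the diff):

#include <cassert>

// Hypothetical padded extents, chosen only to illustrate the tile bookkeeping.
int main() {
  auto divide_round_up = [](int n, int d) { return (n + d - 1) / d; };
  const int new_width = 11, new_height = 7;            // assumed padded sizes
  const int tiles_x = divide_round_up(new_width, 4);   // 3
  const int tiles_y = divide_round_up(new_height, 4);  // 2
  assert(tiles_x * tiles_y == 6);  // dst_shape.w: one column per 4x4 tile
  assert(6 * 6 == 36);             // dst_shape.h: one row per transformed patch cell
  return 0;
}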
@ -529,12 +529,12 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
int new_height =
src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
int grid_x = IntegralDivideRoundUp(new_width, 4);
int grid_y = IntegralDivideRoundUp(new_height, 4);
int grid_z = IntegralDivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_x = DivideRoundUp(new_width, 4);
int grid_y = DivideRoundUp(new_height, 4);
int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -563,8 +563,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
BHWC dst_shape;
dst_shape.b = src_shape.b;
dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) *
IntegralDivideRoundUp(new_height, 4);
dst_shape.w =
DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c;
return dst_shape;
}};
@ -593,16 +593,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4);
int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
-attr.padding.prepended.w,
-attr.padding.prepended.h,
@ -619,10 +619,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
const auto& dst_shape = buffers.find(output_id)->second;
int grid_x = dst_shape.w;
int grid_y = 6;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -665,8 +665,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second;
std::vector<int> sizes = {
src_shape.w, src_shape.h, IntegralDivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4), 0,
src_shape.w, src_shape.h, DivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4), 0,
};
return GetByteBuffer(sizes);
}},
@ -677,10 +677,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second;
int grid_x = src_shape.w;
int grid_y = 1;
int grid_z = IntegralDivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};
@ -734,16 +734,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
[input_id, output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
std::vector<int> sizes = {
src_shape.w,
src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4),
DivideRoundUp(src_shape.c, 4),
0,
dst_shape.w,
dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4),
DivideRoundUp(dst_shape.c, 4),
0,
tiles_x,
tiles_y,
@ -757,14 +757,14 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1};
const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
int grid_x = tiles_x * tiles_y;
int grid_y = 4;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z);
int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
return {desc};