Rename IntegralDivideRoundUp to DivideRoundUp.

PiperOrigin-RevId: 307447663
Change-Id: I1e0f6c9f058e3f0457a7522f1d10f7da8ab8610d
Juhyun Lee authored 2020-04-20 12:02:49 -07:00, committed by TensorFlower Gardener
parent 55c4d9e49c
commit b26b6b5669
80 changed files with 435 additions and 452 deletions
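The renamed helper is integer ceiling division. For reference, here is a minimal sketch of the intended semantics (illustrative only; the real utility lives in the TFLite GPU util headers touched by this change):

// Ceiling division: the smallest k such that k * divisor >= n, for n >= 0.
int DivideRoundUp(int n, int divisor) { return (n + divisor - 1) / divisor; }

// Examples matching the call sites below, where depth := DivideRoundUp(channels, 4):
//   DivideRoundUp(3, 4) == 1   (3 channels -> 1 slice of 4)
//   DivideRoundUp(8, 4) == 2
//   DivideRoundUp(9, 4) == 3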

View File

@@ -54,7 +54,7 @@ namespace gpu {
 // H - height
 // W - width
 // C - channels
-// D - depth := IntegralDivideRoundUp(C, 4)
+// D - depth := DivideRoundUp(C, 4)
 // C4 - is the constant = 4.
 enum class DataLayout {
   UNKNOWN,
@@ -164,7 +164,7 @@ struct Dimensions {
   Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
       : b(batch), h(height), w(width), c(channels) {}
-  int32_t d() const { return IntegralDivideRoundUp(c, 4); }
+  int32_t d() const { return DivideRoundUp(c, 4); }
   int32_t product() const { return b * h * w * c; }
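As a quick illustration of the accessor in the hunk above, a stripped-down stand-in for Dimensions (hypothetical, reusing the DivideRoundUp sketch from the top of this page; the real struct is the one shown in the diff):

struct Dimensions {
  int b, h, w, c;
  int d() const { return DivideRoundUp(c, 4); }   // number of 4-channel slices
  int product() const { return b * h * w * c; }
};

// Dimensions{1, 224, 224, 3}.d()  == 1   (3 channels pack into a single slice)
// Dimensions{1, 224, 224, 32}.d() == 8   (32 channels -> 8 slices)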

View File

@@ -70,10 +70,10 @@ std::string Add::GetElementWiseCode(
 Add::Add(const OperationDef& definition, const std::vector<int>& channels,
          int dst_channels)
     : ElementwiseOperation(definition),
-      dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
+      dst_depth_(DivideRoundUp(dst_channels, 4)) {
   src_depthes_.resize(channels.size());
   for (int i = 0; i < channels.size(); ++i) {
-    src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
+    src_depthes_[i] = DivideRoundUp(channels[i], 4);
   }
 }

View File

@@ -79,7 +79,7 @@ std::string GetConcatKernelCode(
   // generation.
   c += " int Z = 0;\n";
   for (int i = 0; i < channels.size(); ++i) {
-    const int depth = IntegralDivideRoundUp(channels[i], 4);
+    const int depth = DivideRoundUp(channels[i], 4);
     if (depth % 2 == 0) {
       // We can read more at once inside of loop in case depth % 2 == 0
       // it should be better for reading latency hiding
@@ -112,7 +112,7 @@ std::string GetConcatKernelCode(
   int read_index = 0;
   int z = 0;
   for (int i = 0; i < channels.size(); ++i) {
-    const int depth = IntegralDivideRoundUp(channels[i], 4);
+    const int depth = DivideRoundUp(channels[i], 4);
     for (int d = 0; d < depth; ++d) {
       const int channels_in_group = std::min(4, channels[i] - d * 4);
       const std::string temp_name = "t" + std::to_string(read_index);

View File

@@ -128,24 +128,24 @@ absl::Status Conv3D::BindArguments() {
     RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
   }
   RETURN_IF_ERROR(kernel_.SetBytesAuto(
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
   return absl::OkStatus();
 }
 
 int3 Conv3D::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
-                                           conv_params_.block_size.x);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                   conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
-      IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) *
+      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
   int3 wg;
-  wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
-  wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
-  wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
+  wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
+  wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
+  wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
   return int3(wg[conv_params_.work_group_launch_order[0]] *
                   conv_params_.work_group_size.x,
               wg[conv_params_.work_group_launch_order[1]] *
@@ -885,8 +885,8 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
 Conv3D::ConvParams Conv3D::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution3DAttributes& attr) const {
-  const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                              attr.dilations.w == 1 &&
                              attr.padding.prepended.w == 0 &&

View File

@@ -147,8 +147,8 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
                                    CLContext* context) {
   const int block_size = conv_params_.block_size.w;
   const int dst_slices =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;
@@ -219,8 +219,8 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
                                   absl::Span<T> dst) {
   const int block_size = conv_params_.block_size.w;
   const int dst_slices =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;

View File

@@ -307,7 +307,7 @@ absl::Status ConvBuffer1x1::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
   RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
-  const int src_width_elements = IntegralDivideRoundUp(
+  const int src_width_elements = DivideRoundUp(
       src_[0]->Width() * src_[0]->Batch(), (conv_params_.element_size / 4));
   int4 src_size = int4(src_width_elements, src_[0]->Height(), src_[0]->Slices(),
                        src_width_elements * src_[0]->Height());
@@ -317,14 +317,14 @@ absl::Status ConvBuffer1x1::BindArguments() {
 }
 
 int3 ConvBuffer1x1::GetGridSize() const {
-  const int dst_width_elements = IntegralDivideRoundUp(
+  const int dst_width_elements = DivideRoundUp(
       dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
   const int grid_x =
-      IntegralDivideRoundUp(dst_width_elements, conv_params_.block_size.x);
+      DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
   return int3(grid_x, grid_y, grid_z);
 }
@@ -358,8 +358,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
   if (!IsConvBuffer1x1Supported(definition, attr)) {
     return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
   }
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,
@@ -376,8 +376,8 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                  const OperationDef& definition,
                                  const FullyConnectedAttributes& attr,
                                  ConvBuffer1x1* result, const BHWC* shape) {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,
@@ -396,8 +396,8 @@ absl::Status CreateConvBuffer1x1Wino4x4To6x6(
     const CreationContext& creation_context, const OperationDef& definition,
     const Convolution2DAttributes& attr, ConvBuffer1x1* result,
     const BHWC* shape) {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvBuffer1x1::ConvParams conv_params;
   if (shape) {
     conv_params = GetBestParams(*creation_context.device, definition, *shape,

View File

@@ -135,8 +135,8 @@ absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvBuffer1x1::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
   const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

View File

@@ -40,9 +40,9 @@ std::string GenerateConvolutionConstantCode(
   std::string c = GetCommonDefines(op_def.precision);
-  const int out_z = IntegralDivideRoundUp(dst_channels, 4);
+  const int out_z = DivideRoundUp(dst_channels, 4);
   const std::string kOutZ = std::to_string(out_z);
-  const int src_depth = IntegralDivideRoundUp(src_channels, 4);
+  const int src_depth = DivideRoundUp(src_channels, 4);
   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
   const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
@@ -290,7 +290,7 @@ bool IsConvConstantsSupported(const CLDevice& device,
                                   : sizeof(half);
   const int filters_buffer_size = filters_count * float_size;
   const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
-  const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
+  const int flt4_registers = DivideRoundUp(w_shape.o, 4);
   return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
 }

View File

@@ -88,7 +88,7 @@ class ConvConstants : public GPUOperation {
 template <DataType T>
 absl::Status ConvConstants::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -112,8 +112,8 @@ absl::Status ConvConstants::UploadWeights(
 template <DataType S, typename T>
 void ConvConstants::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -205,8 +205,8 @@ absl::Status ConvPowerVR::BindArguments() {
         kernel_dilation_.z * src_[0]->Batch(), kernel_dilation_.w)));
   }
   if (conv_params_.linear_hw) {
-    const int grid_x = IntegralDivideRoundUp(
-        dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
+    const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                     conv_params_.block_size.x);
     RETURN_IF_ERROR(kernel_.SetBytesAuto(grid_x));
   }
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
@@ -215,27 +215,26 @@ absl::Status ConvPowerVR::BindArguments() {
 }
 
 int3 ConvPowerVR::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
-                                           conv_params_.block_size.x);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
+                                   conv_params_.block_size.x);
   const int grid_y =
-      IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
-      IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
   int3 wg;
   if (conv_params_.linear_hw) {
-    wg.x =
-        IntegralDivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
-    wg.y = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.y);
+    wg.x = DivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
+    wg.y = DivideRoundUp(grid_z, conv_params_.work_group_size.y);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
                     conv_params_.work_group_size.y,
                 1);
   } else {
-    wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
-    wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
-    wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
+    wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
+    wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
+    wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
@@ -808,8 +807,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                              attr.dilations.w == 1 &&
                              attr.padding.prepended.w == 0 &&
@@ -825,8 +824,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const FullyConnectedAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, false, dst_shape);
   params.work_group_size.x *= params.work_group_size.y;
@@ -839,8 +838,8 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, true, dst_shape);
   params.block_size.x *= params.block_size.y;
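A worked example of the grid and work-group arithmetic in ConvPowerVR::GetGridSize() above, with made-up numbers; the block and work-group values below are hypothetical stand-ins for the conv_params_ fields, not values from this commit (non-linear_hw path shown):

#include <cstdio>

// Purely illustrative ceiling division, matching the renamed helper.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  const int width = 56, height = 56, slices = 10, batch = 1;
  const int block_x = 2, block_y = 1, block_z = 2;  // block_size stand-in
  const int wg_x = 8, wg_y = 4, wg_z = 1;           // work_group_size stand-in

  const int grid_x = DivideRoundUp(width * batch, block_x);  // 28
  const int grid_y = DivideRoundUp(height, block_y);         // 56
  const int grid_z = DivideRoundUp(slices, block_z);         // 5

  // Work groups per axis, before the launch-order permutation:
  std::printf("%d x %d x %d\n", DivideRoundUp(grid_x, wg_x),  // 4
              DivideRoundUp(grid_y, wg_y),                    // 14
              DivideRoundUp(grid_z, wg_z));                   // 5
  return 0;
}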

View File

@@ -188,8 +188,8 @@ absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvPowerVR::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
   const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

View File

@@ -433,9 +433,9 @@ absl::Status ConvTexture::BindArguments() {
 int3 ConvTexture::GetGridSize() const {
   const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
+      DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -148,9 +148,9 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
 template <DataType T>
 absl::Status ConvTexture::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  int dst_depth = DivideRoundUp(weights.shape.o, 4);
   dst_depth = AlignByN(dst_depth, block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -206,9 +206,9 @@ template <DataType S, typename T>
 void ConvTexture::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
     absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
-  int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  int dst_depth = DivideRoundUp(weights.shape.o, 4);
   dst_depth = AlignByN(dst_depth, block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -322,7 +322,7 @@ ConvolutionTransposed::ConvolutionTransposed(
       block_size_ = is_f16 ? int3(2, 2, 2) : int3(2, 2, 1);
     }
   }
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   if (dst_depth == 1 || dst_depth == 3) {
     if (!device.IsMali()) {
       block_size_.y *= block_size_.z;
@@ -406,10 +406,9 @@ absl::Status ConvolutionTransposed::BindArguments() {
 int3 ConvolutionTransposed::GetGridSize() const {
   const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
   const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
-  const int grid_x =
-      IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.z);
+  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -91,8 +91,8 @@ template <DataType T>
 absl::Status ConvolutionTransposed::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   int texture_width = dst_depth;
@@ -160,8 +160,8 @@ template <DataType S, typename T>
 void ConvolutionTransposed::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   int texture_width = dst_depth;

View File

@@ -440,8 +440,8 @@ absl::Status ConvolutionTransposed3D::BindArguments() {
   if (definition_.IsBatchSupported()) {
     RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
   }
-  RETURN_IF_ERROR(kernel_.SetBytesAuto(
-      IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w)));
+  RETURN_IF_ERROR(
+      kernel_.SetBytesAuto(DivideRoundUp(dst_[0]->Slices(), block_size_.w)));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHDS()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHDS()));
   return absl::OkStatus();
@@ -451,11 +451,10 @@ int3 ConvolutionTransposed3D::GetGridSize() const {
   const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
   const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
   const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
-  const int grid_x =
-      IntegralDivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(aligned_h, block_size_.y);
-  const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w) *
-                     IntegralDivideRoundUp(aligned_d, block_size_.z);
+  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
+  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
+                     DivideRoundUp(aligned_d, block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -91,8 +91,8 @@ template <DataType T>
 absl::Status ConvolutionTransposed3D::UploadWeights(
     const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;
@@ -162,8 +162,8 @@ template <DataType S, typename T>
 void ConvolutionTransposed3D::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
   const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.w);
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;
   const int kernel_z = kernel_size_.z;

View File

@@ -339,14 +339,13 @@ absl::Status ConvolutionTransposed3x3::BindArguments() {
 }
 
 int3 ConvolutionTransposed3x3::GetGridSize() const {
-  const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
   int3 wg;
-  wg.x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
-  wg.y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
-  wg.z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
+  wg.x = DivideRoundUp(grid_x, work_group_size_.x);
+  wg.y = DivideRoundUp(grid_y, work_group_size_.y);
+  wg.z = DivideRoundUp(grid_z, work_group_size_.z);
   return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
               wg[work_group_launch_order_[1]] * work_group_size_.y,
               wg[work_group_launch_order_[2]] * work_group_size_.z);

View File

@@ -84,8 +84,8 @@ class ConvolutionTransposed3x3 : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed3x3::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 3;  // This operation support only 3x3 kernel
   const int kernel_y = 3;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -109,8 +109,8 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed3x3::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 3;
   const int kernel_y = 3;

View File

@@ -224,8 +224,8 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
 absl::Status ConvolutionTransposed3x3Thin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
-      IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
+      definition_, biases_, DivideRoundUp(src_channels_, 4),
+      DivideRoundUp(dst_channels_, 4), *creation_context.device,
       linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,

View File

@@ -82,8 +82,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
-  const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
+  const int dst_depth = DivideRoundUp(dst_channels_, 4);
   const int kernel_x = 3;  // This operation support only 3x3 kernel
   const int kernel_y = 3;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -108,8 +108,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
-  const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
+  const int dst_depth = DivideRoundUp(dst_channels_, 4);
   const int kernel_x = 3;
   const int kernel_y = 3;

View File

@@ -332,9 +332,8 @@ absl::Status ConvolutionTransposed4x4::BindArguments() {
 }
 
 int3 ConvolutionTransposed4x4::GetGridSize() const {
-  const int grid_x =
-      IntegralDivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height() + 2, 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
+  const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
   const int grid_z = dst_[0]->Slices();
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -82,8 +82,8 @@ class ConvolutionTransposed4x4 : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposed4x4::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 4;  // This operation support only 4x4 kernel
   const int kernel_y = 4;
   const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
@@ -107,8 +107,8 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposed4x4::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int kernel_x = 4;
   const int kernel_y = 4;

View File

@@ -187,8 +187,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
 absl::Status ConvolutionTransposedThin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
-      kernel_size_, *creation_context.device, linked_operations_);
+      definition_, DivideRoundUp(src_channels_, 4), dst_channels_, kernel_size_,
+      *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&

View File

@@ -82,7 +82,7 @@ class ConvolutionTransposedThin : public GPUOperation {
 template <DataType T>
 absl::Status ConvolutionTransposedThin::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
   const int elements_count =
       kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;
@@ -104,7 +104,7 @@ absl::Status ConvolutionTransposedThin::UploadWeights(
 template <DataType S, typename T>
 void ConvolutionTransposedThin::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
+  const int src_depth = DivideRoundUp(src_channels_, 4);
   const int kernel_x = kernel_size_.x;
   const int kernel_y = kernel_size_.y;

View File

@@ -89,7 +89,7 @@ template <DataType T>
 absl::Status DepthwiseConvolution::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_depth = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
@@ -137,7 +137,7 @@ template <DataType S, typename T>
 void DepthwiseConvolution::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_depth = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;

View File

@@ -88,7 +88,7 @@ template <DataType T>
 absl::Status DepthwiseConvolution3D::UploadWeights(
     const tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_slices = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
   const int kernel_z = weights.shape.d;
@@ -130,7 +130,7 @@ template <DataType S, typename T>
 void DepthwiseConvolution3D::RearrangeWeightsData(
     const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
   const int dst_channels = weights.shape.i * weights.shape.o;
-  const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
+  const int dst_slices = DivideRoundUp(dst_channels, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
   const int kernel_z = weights.shape.d;

View File

@@ -323,8 +323,8 @@ absl::Status DepthwiseConv3x3::BindArguments() {
 }
 
 int3 DepthwiseConv3x3::GetGridSize() const {
-  const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
-  const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
+  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
+  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
   return int3(grid_x, grid_y, grid_z);
 }

View File

@@ -83,7 +83,7 @@ template <DataType T>
 absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
     const tflite::gpu::Tensor<OHWI, T>& weights,
     const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   int texture_width = 10;  // 3x3 kernel + 1 bias
   int texture_height = src_depth;
   const int elements_count = texture_width * texture_height;
@@ -129,7 +129,7 @@ template <DataType S, typename T>
 void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
     const tflite::gpu::Tensor<OHWI, S>& weights,
     const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
   int counter = 0;
   for (int s = 0; s < src_depth; ++s) {

View File

@@ -70,8 +70,8 @@ class FullyConnected : public GPUOperation {
 template <DataType T>
 absl::Status FullyConnected::UploadWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   const int elements_count = src_depth * dst_depth * 4;
   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
@@ -94,8 +94,8 @@ absl::Status FullyConnected::UploadWeights(
 template <DataType T, typename S>
 void FullyConnected::RearrangeWeights(
     const tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<S> dst) {
-  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  const int src_depth = DivideRoundUp(weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
   int counter = 0;
   for (int s = 0; s < src_depth; ++s) {

View File

@@ -129,8 +129,7 @@ absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   const int depth = src_[0]->Slices();
-  RETURN_IF_ERROR(
-      kernel_.SetBytesAuto(int2(depth, IntegralDivideRoundUp(depth, 32))));
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(depth, DivideRoundUp(depth, 32))));
   RETURN_IF_ERROR(
       kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels())));

View File

@@ -234,12 +234,12 @@ template <DataType S, typename T>
 void RearrangeWeightsToOHWIOGroupI4O4(
     const tflite::gpu::Tensor<OHWI, S>& weights, int out_group_size,
     absl::Span<T> dst) {
-  const int dst_slices = IntegralDivideRoundUp(weights.shape.o, 4);
-  const int src_slices = IntegralDivideRoundUp(weights.shape.i, 4);
+  const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+  const int src_slices = DivideRoundUp(weights.shape.i, 4);
   const int kernel_x = weights.shape.w;
   const int kernel_y = weights.shape.h;
-  const int dst_groups = IntegralDivideRoundUp(dst_slices, out_group_size);
+  const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
   int counter = 0;
   for (int d = 0; d < dst_groups; ++d) {

View File

@@ -436,9 +436,9 @@ absl::Status Winograd4x4To36::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  const int tiles_x = IntegralDivideRoundUp(
+  const int tiles_x = DivideRoundUp(
       src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
-  const int tiles_y = IntegralDivideRoundUp(
+  const int tiles_y = DivideRoundUp(
       src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
   const int tiles_total = tiles_x * tiles_y;
   RETURN_IF_ERROR(
@@ -550,14 +550,14 @@ absl::Status Winograd36To4x4::BindArguments() {
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
+  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
   RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x));
   return absl::OkStatus();
 }
 
 int3 Winograd36To4x4::GetGridSize() const {
-  const int tiles_x = IntegralDivideRoundUp(dst_[0]->Width(), 4);
-  const int tiles_y = IntegralDivideRoundUp(dst_[0]->Height(), 4);
+  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
+  const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
   const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
   const int grid_y = 4;
   const int grid_z = dst_[0]->Slices();

View File

@@ -187,7 +187,7 @@ int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
 int3 GetWorkGroup(const int3& grid, int max_size) {
   int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
   int wg_xy_size = max_size / wg_z;
-  int wg_x = std::min(IntegralDivideRoundUp(grid.x, 2), wg_xy_size);
+  int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
   int wg_y = std::min(wg_xy_size / wg_x, grid.y);
   return int3(wg_x, wg_y, wg_z);
 }
@@ -231,12 +231,12 @@ absl::Status GetBestWorkGroupXY128Linear(const TuningParameters& params,
 }
 
 bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
-  int planar_work_groups = IntegralDivideRoundUp(width * height, 128);
+  int planar_work_groups = DivideRoundUp(width * height, 128);
   auto base_work_groups = Get2DWorkgroupsEqualTo128();
   bool have_equal_work_groups = false;
   for (auto& work_group : base_work_groups) {
-    int x_groups = IntegralDivideRoundUp(width, work_group.x);
-    int y_groups = IntegralDivideRoundUp(height, work_group.y);
+    int x_groups = DivideRoundUp(width, work_group.x);
+    int y_groups = DivideRoundUp(height, work_group.y);
     int xy_groups = x_groups * y_groups;
     if (xy_groups == planar_work_groups) {
       have_equal_work_groups = true;

View File

@@ -101,7 +101,7 @@ absl::Status CreateLinearStorage(const LinearStorageCreateInfo& creation_info,
                                  CLContext* context, LinearStorage* result) {
   int size = creation_info.aligned_size != 0 ? creation_info.aligned_size
                                              : tensor.shape.v;
-  const int depth = IntegralDivideRoundUp(size, 4);
+  const int depth = DivideRoundUp(size, 4);
   if (creation_info.data_type == DataType::FLOAT32) {
     std::vector<float4> gpu_data(depth);
     CopyLinearFLT4(tensor, absl::MakeSpan(gpu_data));

View File

@@ -56,10 +56,10 @@ bool IsChannelsBroadcastedForSecondInput(const std::vector<Value*>& inputs) {
 bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
                                    const CLDevice& device,
                                    const BHWC& dst_shape) {
-  const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
-  const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
-  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
-  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const int tiles_x = DivideRoundUp(dst_shape.w, 4);
+  const int tiles_y = DivideRoundUp(dst_shape.h, 4);
+  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   const bool suitable_attributes =
       attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
       attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@@ -82,8 +82,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
     return absl::UnimplementedError("No implementation for this case.");
   }
-  const int tiles_x = IntegralDivideRoundUp(output_shape.w, 4);
-  const int tiles_y = IntegralDivideRoundUp(output_shape.h, 4);
+  const int tiles_x = DivideRoundUp(output_shape.w, 4);
+  const int tiles_y = DivideRoundUp(output_shape.h, 4);
   const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
   const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
   TensorDescriptor td_0;

View File

@@ -28,7 +28,7 @@ namespace cl {
 bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
                               const BHWDC& shape,
                               const TensorDescriptor& descriptor) {
-  const int slices = IntegralDivideRoundUp(shape.c, 4);
+  const int slices = DivideRoundUp(shape.c, 4);
   switch (descriptor.storage_type) {
     case TensorStorageType::BUFFER: {
       const int flt4_size =


@ -65,11 +65,10 @@ absl::Status CreateTensor(const CLContext& context, const CLDevice& device,
} }
if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) { if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
cl_mem image_memory; cl_mem image_memory;
RETURN_IF_ERROR(
    CreateImageBufferFromBuffer(context, memory, descriptor.data_type,
                                shape.b * shape.w * shape.h * shape.d *
                                    IntegralDivideRoundUp(shape.c, 4),
                                &image_memory));
RETURN_IF_ERROR(CreateImageBufferFromBuffer(
    context, memory, descriptor.data_type,
    shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4),
    &image_memory));
*result = Tensor(memory, memory_owner, image_memory, shape, descriptor); *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
} else { } else {
*result = Tensor(memory, memory_owner, shape, descriptor); *result = Tensor(memory, memory_owner, shape, descriptor);
@ -386,7 +385,7 @@ absl::Status AllocateTensorMemory(const CLContext& context,
const CLDevice& device, const BHWDC& shape, const CLDevice& device, const BHWDC& shape,
const TensorDescriptor& descriptor, const TensorDescriptor& descriptor,
CLMemory* result) { CLMemory* result) {
const int slices = IntegralDivideRoundUp(shape.c, 4); const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) { switch (descriptor.storage_type) {
case TensorStorageType::BUFFER: case TensorStorageType::BUFFER:
case TensorStorageType::IMAGE_BUFFER: { case TensorStorageType::IMAGE_BUFFER: {


@ -61,7 +61,7 @@ class Tensor {
int Height() const { return shape_.h; } int Height() const { return shape_.h; }
int Depth() const { return shape_.d; } int Depth() const { return shape_.d; }
int Channels() const { return shape_.c; } int Channels() const { return shape_.c; }
int Slices() const { return IntegralDivideRoundUp(shape_.c, 4); } int Slices() const { return DivideRoundUp(shape_.c, 4); }
int Batch() const { return shape_.b; } int Batch() const { return shape_.b; }
// returns int4(width * batch, height, slices, batch) // returns int4(width * batch, height, slices, batch)


@ -44,12 +44,11 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
} }
float* output = out.data(); float* output = out.data();
for (int p = 0; p < IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) { for (int p = 0; p < DivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane); ++p) {
for (int h = 0; h < shape.h; ++h) { for (int h = 0; h < shape.h; ++h) {
for (int w = 0; w < shape.w; ++w) { for (int w = 0; w < shape.w; ++w) {
for (int c = 0; c < IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) { for (int c = 0; c < DivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane); ++c) {
for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) { for (int co = 0; co < kPhwo4i4ChannelsInPlane; ++co) {
for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) { for (int ci = 0; ci < kPhwo4i4ChannelsInPlane; ++ci) {
float value = 0; float value = 0;
@ -106,7 +105,7 @@ std::vector<float> ConvertToPHWO4I4Transposed(
uint3 Get3DSizeForPHWO4I4(const OHWI& shape) { uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
return uint3(AlignByN(shape.i, 4), shape.h * shape.w, return uint3(AlignByN(shape.i, 4), shape.h * shape.w,
IntegralDivideRoundUp(shape.o, 4)); DivideRoundUp(shape.o, 4));
} }
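
As a rough illustration (the filter shape is invented for the example), the PHWO4I4 size computed above packs padded input channels on X, the spatial kernel on Y, and output-channel groups of four on Z:

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}
int AlignByN(int number, int n) { return DivideRoundUp(number, n) * n; }

struct OHWI {
  int o, h, w, i;
};

int main() {
  // Hypothetical 3x3 convolution weights: 10 output, 7 input channels.
  const OHWI shape{10, 3, 3, 7};
  const int x = AlignByN(shape.i, 4);       // 8: input channels padded to 4
  const int y = shape.h * shape.w;          // 9
  const int z = DivideRoundUp(shape.o, 4);  // 3: output-channel groups of 4
  assert(x == 8 && y == 9 && z == 3);
  return 0;
}
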
// Layout is Po,H,W,OI4x4. // Layout is Po,H,W,OI4x4.
@ -123,8 +122,8 @@ absl::Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
out.size(), " != ", GetElementsSizeForPHWO4I4(shape))); out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
} }
const int dst_depth = IntegralDivideRoundUp(shape.o, 4); const int dst_depth = DivideRoundUp(shape.o, 4);
const int src_depth = IntegralDivideRoundUp(shape.i, 4); const int src_depth = DivideRoundUp(shape.i, 4);
float* output = out.data(); float* output = out.data();
for (int f = 0; f < dst_depth; ++f) { for (int f = 0; f < dst_depth; ++f) {
@ -178,8 +177,7 @@ absl::Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
} }
int32_t output_channels = shape.o * shape.i; int32_t output_channels = shape.o * shape.i;
int32_t num_planes = IntegralDivideRoundUp(output_channels, kPiohw4ChannelsInPlane); int32_t num_planes = DivideRoundUp(output_channels, kPiohw4ChannelsInPlane);
float* output = out.data(); float* output = out.data();
for (int p = 0; p < num_planes; ++p) { for (int p = 0; p < num_planes; ++p) {
for (int h = 0; h < shape.h; ++h) { for (int h = 0; h < shape.h; ++h) {
@ -232,7 +230,7 @@ absl::Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus(); return absl::OkStatus();
} }
// Layout is Pc,H,W,C4 where P - is a plane based on channels. // Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane; const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -281,7 +279,7 @@ absl::Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out)); RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
// Layout is Pc,H,W,C4 where P - is a plane based on channels. // Layout is Pc,H,W,C4 where P - is a plane based on channels.
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
const int num_full_planes = shape.c / kPhwc4ChannelsInPlane; const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;
@ -407,7 +405,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
return absl::OkStatus(); return absl::OkStatus();
} }
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane; const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.
@ -449,7 +447,7 @@ absl::Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in, absl::Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in,
const BHWC& shape, absl::Span<float> out) { const BHWC& shape, absl::Span<float> out) {
RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out)); RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane); int num_planes = DivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
const int num_pixels = shape.h * shape.w; const int num_pixels = shape.h * shape.w;
const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane; const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
// A layer is a set of kPhwc4ChannelsInPlane channels images. // A layer is a set of kPhwc4ChannelsInPlane channels images.


@ -209,7 +209,7 @@ OperationType OperationTypeFromString(const std::string& name) {
namespace { namespace {
template <typename T> template <typename T>
T IntegralDivideRoundUp(T n, T divisor) { T DivideRoundUp(T n, T divisor) {
return (n - 1) / divisor + 1; return (n - 1) / divisor + 1;
} }
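
For reference, the file-local helper above uses the (n - 1) / divisor + 1 form, while the shared utility elsewhere in this change checks the remainder. A standalone sketch, with illustrative names, of where the two forms agree and where they differ:

#include <cassert>

// Illustrative re-implementations; the names are not from the diff.
int CeilDivByOffset(int n, int divisor) { return (n - 1) / divisor + 1; }
int CeilDivByRemainder(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  assert(CeilDivByOffset(257, 256) == 2);
  assert(CeilDivByRemainder(257, 256) == 2);
  // The forms differ at n == 0: C++ integer division truncates toward zero,
  // so (0 - 1) / 256 + 1 evaluates to 1, while the remainder form returns 0.
  assert(CeilDivByOffset(0, 256) == 1);
  assert(CeilDivByRemainder(0, 256) == 0);
  return 0;
}
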
@ -272,7 +272,7 @@ int32_t CalculateOutput(const BHWDC& input,
} }
inline int32_t StridedSize(int32_t size, int32_t stride) { inline int32_t StridedSize(int32_t size, int32_t stride) {
return stride == 0 ? -1 : IntegralDivideRoundUp(size, stride); return stride == 0 ? -1 : DivideRoundUp(size, stride);
} }
template <Axis AxisT, typename AttrT> template <Axis AxisT, typename AttrT>


@ -24,24 +24,23 @@ namespace gpu {
// @param n must be non negative // @param n must be non negative
// @param divisor must be greater than zero // @param divisor must be greater than zero
template <typename T, typename N> template <typename T, typename N>
T IntegralDivideRoundUp(T n, N divisor) { T DivideRoundUp(T n, N divisor) {
const T div = static_cast<T>(divisor); const T div = static_cast<T>(divisor);
const T q = n / div; const T q = n / div;
return n % div == 0 ? q : q + 1; return n % div == 0 ? q : q + 1;
} }
template <> template <>
inline uint3 IntegralDivideRoundUp(uint3 n, uint3 divisor) { inline uint3 DivideRoundUp(uint3 n, uint3 divisor) {
return uint3(IntegralDivideRoundUp(n.x, divisor.x),
             IntegralDivideRoundUp(n.y, divisor.y),
             IntegralDivideRoundUp(n.z, divisor.z));
return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y),
             DivideRoundUp(n.z, divisor.z));
} }
// @param number or its components must be greater than zero // @param number or its components must be greater than zero
// @param n must be greater than zero // @param n must be greater than zero
template <typename T, typename N> template <typename T, typename N>
T AlignByN(T number, N n) { T AlignByN(T number, N n) {
return IntegralDivideRoundUp(number, n) * n; return DivideRoundUp(number, n) * n;
} }
} // namespace gpu } // namespace gpu
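
A quick usage sketch of the renamed helpers (values illustrative): DivideRoundUp gives the number of 4-channel slices and AlignByN the padded channel count.

#include <cassert>
#include <cstdint>

template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}

template <typename T, typename N>
T AlignByN(T number, N n) {
  return DivideRoundUp(number, n) * n;
}

int main() {
  const int32_t channels = 19;
  assert(DivideRoundUp(channels, 4) == 5);  // 19 channels -> 5 FLT4 slices
  assert(AlignByN(channels, 4) == 20);      // padded channel count
  return 0;
}
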


@ -24,16 +24,16 @@ namespace {
using testing::Eq; using testing::Eq;
TEST(UtilTest, IntegralDivideRoundUp) { TEST(UtilTest, DivideRoundUp) {
EXPECT_THAT(IntegralDivideRoundUp(0, 256), Eq(0)); EXPECT_THAT(DivideRoundUp(0, 256), Eq(0));
EXPECT_THAT(IntegralDivideRoundUp(2u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(2u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(2, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(2, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(255u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(255, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(255, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256u, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(256u, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(256, 256), Eq(1)); EXPECT_THAT(DivideRoundUp(256, 256), Eq(1));
EXPECT_THAT(IntegralDivideRoundUp(257u, 256), Eq(2)); EXPECT_THAT(DivideRoundUp(257u, 256), Eq(2));
EXPECT_THAT(IntegralDivideRoundUp(257, 256), Eq(2)); EXPECT_THAT(DivideRoundUp(257, 256), Eq(2));
} }
TEST(UtilTest, AlignByN) { TEST(UtilTest, AlignByN) {


@ -34,9 +34,9 @@ void AddCornerCases(const T& grid, int max_work_group_total_size,
for (int x = 1; x <= 4; ++x) { for (int x = 1; x <= 4; ++x) {
for (int y = 1; y <= 4; ++y) { for (int y = 1; y <= 4; ++y) {
for (int z = 1; z <= 4; ++z) { for (int z = 1; z <= 4; ++z) {
int wg_x = IntegralDivideRoundUp(grid.x, x); int wg_x = DivideRoundUp(grid.x, x);
int wg_y = IntegralDivideRoundUp(grid.y, y); int wg_y = DivideRoundUp(grid.y, y);
int wg_z = IntegralDivideRoundUp(grid.z, z); int wg_z = DivideRoundUp(grid.z, z);
if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y || if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y ||
wg_z > max_work_group_sizes.z || wg_z > max_work_group_sizes.z ||
wg_x * wg_y * wg_z > max_work_group_total_size) { wg_x * wg_y * wg_z > max_work_group_total_size) {


@ -201,7 +201,7 @@ class CompiledModelImpl
ShaderCode code) { ShaderCode code) {
// Calculate workgroup size. // Calculate workgroup size.
uint3 workgroup_size = workgroup_calculator.Calculate(code); uint3 workgroup_size = workgroup_calculator.Calculate(code);
uint3 num_workgroups = IntegralDivideRoundUp(code.workload, workgroup_size); uint3 num_workgroups = DivideRoundUp(code.workload, workgroup_size);
for (const auto& object : code.objects) { for (const auto& object : code.objects) {
if (IsRef(object)) { if (IsRef(object)) {


@ -569,7 +569,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
} else { } else {
shader_index = it->second; shader_index = it->second;
} }
auto num_workgroups = IntegralDivideRoundUp(code.workload, workgroup); auto num_workgroups = DivideRoundUp(code.workload, workgroup);
return runtime_ptr->AddProgram(shaders[shader_index], code.parameters, return runtime_ptr->AddProgram(shaders[shader_index], code.parameters,
code.objects, num_workgroups); code.objects, num_workgroups);
})); }));


@ -190,8 +190,7 @@ class CompilerImpl : public Compiler {
"Workload uint3() requires all output sizes to match"); "Workload uint3() requires all output sizes to match");
} }
} }
attr.code.workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)); attr.code.workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
} }
int num_textures = 0; int num_textures = 0;


@ -88,8 +88,8 @@ absl::Status ConverterBhwcToPhwc4::Convert(const BHWC& shape,
return absl::UnimplementedError( return absl::UnimplementedError(
"BhwcToPhwc4: Batch size is not equal to 1."); "BhwcToPhwc4: Batch size is not equal to 1.");
} }
uint3 workload = uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)); uint3 workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
RETURN_IF_ERROR(program_.SetParameter( RETURN_IF_ERROR(program_.SetParameter(
{"sizes_", {"sizes_",


@ -83,7 +83,7 @@ absl::Status ConverterPhwc4ToBhwc::Convert(const BHWC& shape,
} }
uint3 workload = uint3(shape.w, shape.h, shape.c); uint3 workload = uint3(shape.w, shape.h, shape.c);
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
// TODO(akulik): simply pass workload as soon as UniformParameter // TODO(akulik): simply pass workload as soon as UniformParameter
// supports uint3 // supports uint3


@ -102,7 +102,7 @@ class Add : public NodeShader {
// Declare workload explicitly because shader depends on gid.z. // Declare workload explicitly because shader depends on gid.z.
/*workload=*/ /*workload=*/
uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1], uint3(ctx.input_shapes[0][2], ctx.input_shapes[0][1],
IntegralDivideRoundUp(ctx.input_shapes[0][3], 4)), DivideRoundUp(ctx.input_shapes[0][3], 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/"value_0 += $add_buffer[gid.z]$;", /*source_code=*/"value_0 += $add_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO, /*input=*/IOStructure::AUTO,


@ -210,7 +210,7 @@ vec4 val = vec4(0.0f);
// * - you are going to write into these cells // * - you are going to write into these cells
// @ - you will fill these cells next cycles // @ - you will fill these cells next cycles
// ^ - first elem you start writing from // ^ - first elem you start writing from
int blocks_amount = IntegralDivideRoundUp<int>(in_ch, 4); int blocks_amount = DivideRoundUp<int>(in_ch, 4);
code += "// Aligned case\n"; code += "// Aligned case\n";
code += "// I'm going to make " + std::to_string(blocks_amount) + code += "// I'm going to make " + std::to_string(blocks_amount) +
" write(s)\n\n"; " write(s)\n\n";


@ -55,7 +55,7 @@ class Convolution : public NodeShader {
{"dilation_h", attr.dilations.h}, {"dilation_h", attr.dilations.h},
{"kernel_w", weights.w}, {"kernel_w", weights.w},
{"kernel_h", weights.h}, {"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
} else { } else {
@ -71,7 +71,7 @@ class Convolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count}, {"offsets_count", offsets_count},
{"offsets", offsets}, {"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
} }
@ -181,14 +181,14 @@ class Convolution1x1 : public NodeShader {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"src_depth", {"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)}, DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
}; };
std::vector<std::pair<std::string, Object>> objects = { std::vector<std::pair<std::string, Object>> objects = {
{"weights", MakeReadonlyObject(
                uint3(4, IntegralDivideRoundUp(attr.weights.shape.i, 4),
                      IntegralDivideRoundUp(attr.weights.shape.o, 4)),
                ConvertToPHWO4I4(attr.weights))}};
{"weights",
 MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
                          DivideRoundUp(attr.weights.shape.o, 4)),
                    ConvertToPHWO4I4(attr.weights))}};
std::string source; std::string source;
for (int i = 0; i < multiplier; i++) { for (int i = 0; i < multiplier; i++) {
absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n"); absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
@ -224,7 +224,7 @@ class Convolution1x1 : public NodeShader {
absl::StrAppend(&source, "value_0 = result0;\n"); absl::StrAppend(&source, "value_0 = result0;\n");
} }
auto dst_depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4); auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
uint3 workgroup = uint3(16, 16, 1); uint3 workgroup = uint3(16, 16, 1);
if (ctx.gpu_info->type == GpuType::ADRENO) { if (ctx.gpu_info->type == GpuType::ADRENO) {
if (dst_depth >= 2) { if (dst_depth >= 2) {
@ -265,7 +265,7 @@ class Convolution1x1 : public NodeShader {
/*shared_variables=*/{}, /*shared_variables=*/{},
/*workload=*/ /*workload=*/
uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1], uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
IntegralDivideRoundUp(ctx.output_shapes[0][3], 4)), DivideRoundUp(ctx.output_shapes[0][3], 4)),
/*workgroup=*/ /*workgroup=*/
GetIdealWorkgroupIfPossible( GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D, ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,


@ -64,7 +64,7 @@ class OpenGlConverterImpl : public TensorObjectConverter {
} }
absl::Status Dispatch(const uint3& workload) { absl::Status Dispatch(const uint3& workload) {
uint3 num_workgroups = IntegralDivideRoundUp(workload, workgroup_size_); uint3 num_workgroups = DivideRoundUp(workload, workgroup_size_);
if (command_queue_) { if (command_queue_) {
return command_queue_->Dispatch(program_, num_workgroups); return command_queue_->Dispatch(program_, num_workgroups);
} }
@ -256,7 +256,7 @@ class ToTensorConverter : public OpenGlConverterImpl {
return absl::InvalidArgumentError( return absl::InvalidArgumentError(
"ToTensorConverter: output data size does not match expected size."); "ToTensorConverter: output data size does not match expected size.");
} }
auto d = IntegralDivideRoundUp(shape_.c, 4); auto d = DivideRoundUp(shape_.c, 4);
RETURN_IF_ERROR(program_.SetParameter( RETURN_IF_ERROR(program_.SetParameter(
{"sizes", {"sizes",
int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h), int4(static_cast<int32_t>(shape_.w), static_cast<int32_t>(shape_.h),


@ -54,7 +54,7 @@ class DepthwiseConvolution : public NodeShader {
{"dilation_h", attr.dilations.h}, {"dilation_h", attr.dilations.h},
{"kernel_w", weights.w}, {"kernel_w", weights.w},
{"kernel_h", weights.h}, {"kernel_h", weights.h},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o}, {"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };
@ -71,7 +71,7 @@ class DepthwiseConvolution : public NodeShader {
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"offsets_count", offsets_count}, {"offsets_count", offsets_count},
{"offsets", offsets}, {"offsets", offsets},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"channel_multiplier", weights.o}, {"channel_multiplier", weights.o},
{"stride", int2(attr.strides.w, attr.strides.h)}, {"stride", int2(attr.strides.w, attr.strides.h)},
}; };


@ -39,8 +39,8 @@ class FullyConnectedBuffers : public NodeShader {
const auto& attr = const auto& attr =
absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr); absl::any_cast<const FullyConnectedAttributes&>(ctx.op_attr);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
// This shader can work with any workgroup size, the values below work well // This shader can work with any workgroup size, the values below work well
// for OpenGL. // for OpenGL.


@ -105,10 +105,9 @@ absl::Status GenerateMultiplyScalarCode(
/*shared_variables=*/{}, /*shared_variables=*/{},
// Declare workload explicitly because shader depends on gid.z. // Declare workload explicitly because shader depends on gid.z.
/*workload=*/ /*workload=*/
uint3(
    static_cast<int>(ctx.input_shapes[0][2]),
    static_cast<int>(ctx.input_shapes[0][1]),
    IntegralDivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
uint3(static_cast<int>(ctx.input_shapes[0][2]),
      static_cast<int>(ctx.input_shapes[0][1]),
      DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/"value_0 *= $mul_buffer[gid.z]$;", /*source_code=*/"value_0 *= $mul_buffer[gid.z]$;",
/*input=*/IOStructure::AUTO, /*input=*/IOStructure::AUTO,


@ -98,8 +98,8 @@ class Pad : public NodeShader {
source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n"; source += " value_0 = $input_data_0[src_x, src_y, gid.z]$;\n";
} else if (attr.prepended.c % 4 == 0) { } else if (attr.prepended.c % 4 == 0) {
parameters.push_back(
    {"src_slices", IntegralDivideRoundUp(
                       static_cast<int>(ctx.input_shapes[0][3]), 4)});
parameters.push_back(
    {"src_slices",
     DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)});
source += R"( source += R"(
int src_z = gid.z - $prepended.z$ / 4; int src_z = gid.z - $prepended.z$ / 4;
if (src_z >= 0 && src_z < $src_slices$) { if (src_z >= 0 && src_z < $src_slices$) {


@ -69,8 +69,8 @@ class PReLULinearAlpha : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)), DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, " "value_0 = max(value_0, 0.0) + $alpha[gid.z]$ * min(value_0, "
@ -98,10 +98,10 @@ class PReLUFull : public NodeShader {
"Alpha shape does not match input shape."); "Alpha shape does not match input shape.");
} }
ObjectSize obj_size = uint3(
    static_cast<int>(ctx.output_shapes[0][2]),
    static_cast<int>(ctx.output_shapes[0][1]),
    IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
ObjectSize obj_size =
    uint3(static_cast<int>(ctx.output_shapes[0][2]),
          static_cast<int>(ctx.output_shapes[0][1]),
          DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4));
*generated_code = *generated_code =
attr.clip attr.clip
@ -116,8 +116,8 @@ class PReLUFull : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)), DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = clamp(value_0, 0.0, $clip$) + " "value_0 = clamp(value_0, 0.0, $clip$) + "
@ -136,8 +136,8 @@ class PReLUFull : public NodeShader {
/*workload=*/ /*workload=*/
uint3(static_cast<int>(ctx.output_shapes[0][2]), uint3(static_cast<int>(ctx.output_shapes[0][2]),
static_cast<int>(ctx.output_shapes[0][1]), static_cast<int>(ctx.output_shapes[0][1]),
IntegralDivideRoundUp( DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]),
static_cast<int>(ctx.output_shapes[0][3]), 4)), 4)),
/*workgroup=*/uint3(), /*workgroup=*/uint3(),
/*source_code=*/ /*source_code=*/
"value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ " "value_0 = max(value_0, 0.0) + $alpha[gid.x, gid.y, gid.z]$ "


@ -60,13 +60,13 @@ class Softmax : public NodeShader {
private: private:
absl::Status GenerateCodeFor1x1(const GenerationContext& ctx, absl::Status GenerateCodeFor1x1(const GenerationContext& ctx,
GeneratedCode* generated_code) const { GeneratedCode* generated_code) const {
const int depth = IntegralDivideRoundUp(ctx.output_shapes[0][3], 4); const int depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
std::vector<Variable> shared_variables = { std::vector<Variable> shared_variables = {
{"partial_sum", std::vector<float4>(8)}, {"partial_sum", std::vector<float4>(8)},
}; };
std::vector<Variable> uniform_parameters = { std::vector<Variable> uniform_parameters = {
{"depth", depth}, {"depth", depth},
{"depth_div_32", IntegralDivideRoundUp(depth, 32)}, {"depth_div_32", DivideRoundUp(depth, 32)},
{"mask", GetMask(ctx.output_shapes[0][3])}, {"mask", GetMask(ctx.output_shapes[0][3])},
}; };
std::string source_code = R"( std::string source_code = R"(
@ -138,7 +138,7 @@ class Softmax : public NodeShader {
GeneratedCode* generated_code) const { GeneratedCode* generated_code) const {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"src_depth", {"src_depth",
IntegralDivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)}, DivideRoundUp(static_cast<int>(ctx.output_shapes[0][3]), 4)},
{"mask", GetMask(ctx.output_shapes[0][3])}, {"mask", GetMask(ctx.output_shapes[0][3])},
}; };


@ -44,7 +44,7 @@ class ConvolutionTransposedBuffers : public NodeShader {
std::vector<Variable> parameters = { std::vector<Variable> parameters = {
{"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])}, {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
{"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}, {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
{"src_depth", IntegralDivideRoundUp(weights.i, 4)}, {"src_depth", DivideRoundUp(weights.i, 4)},
{"kernel_size", int2(weights.w, weights.h)}, {"kernel_size", int2(weights.w, weights.h)},
{"stride", int2(attr.stride.w, attr.stride.h)}, {"stride", int2(attr.stride.w, attr.stride.h)},
{"padding", int2(weights.w - 1 - attr.padding.prepended.w, {"padding", int2(weights.w - 1 - attr.padding.prepended.w,


@ -153,17 +153,17 @@ inline Object MakeReadonlyBuffer(const ObjectSize& size,
inline Object MakeReadonlyObject(const std::vector<float>& data) { inline Object MakeReadonlyObject(const std::vector<float>& data) {
return MakeReadonlyObject( return MakeReadonlyObject(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
inline Object MakeReadonlyTexture(const std::vector<float>& data) { inline Object MakeReadonlyTexture(const std::vector<float>& data) {
return MakeReadonlyTexture( return MakeReadonlyTexture(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
inline Object MakeReadonlyBuffer(const std::vector<float>& data) { inline Object MakeReadonlyBuffer(const std::vector<float>& data) {
return MakeReadonlyBuffer( return MakeReadonlyBuffer(
IntegralDivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data); DivideRoundUp(static_cast<uint32_t>(data.size()), 4U), data);
} }
// TODO(akulik): find better place for functions below. // TODO(akulik): find better place for functions below.
@ -172,7 +172,7 @@ inline uint3 GetPHWC4Size(const BHWC& shape) {
uint3 size; uint3 size;
size.x = shape.w; size.x = shape.w;
size.y = shape.h; size.y = shape.h;
size.z = shape.b * IntegralDivideRoundUp(shape.c, 4); size.z = shape.b * DivideRoundUp(shape.c, 4);
return size; return size;
} }
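
A minimal sketch (shape invented) of the PHWC4 size above: Z packs batch times the number of 4-channel slices, so the padded element count is x * y * z * 4.

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

struct BHWC {
  int b, h, w, c;
};

int main() {
  // Hypothetical tensor: batch 2, 8x8 spatial, 6 channels.
  const BHWC shape{2, 8, 8, 6};
  const int size_x = shape.w;                               // 8
  const int size_y = shape.h;                               // 8
  const int size_z = shape.b * DivideRoundUp(shape.c, 4);   // 2 * 2 = 4
  // Each PHWC4 "pixel" holds 4 channel values, so the padded element count is:
  assert(size_x * size_y * size_z * 4 == 1024);
  return 0;
}
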


@ -167,10 +167,10 @@ std::vector<ComputeTaskDescriptorPtr> SelectWinograd36To4x4(
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr, bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const bool suitable_attributes = const bool suitable_attributes =
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 && attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1); attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
@ -229,8 +229,8 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
auto attr = auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes); absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) { if (IsSuitableForWinograd4x4To6x6(attr, dst_shape)) {
int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); int tiles_x = DivideRoundUp(dst_shape.w, 4);
int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); int tiles_y = DivideRoundUp(dst_shape.h, 4);
Winograd4x4To36Attributes wino_up_attr; Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding; wino_up_attr.padding = attr.padding;


@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h" #include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h" #include "tensorflow/lite/delegates/gpu/metal/common.h"
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::BHWC; using ::tflite::gpu::BHWC;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::metal::CreateComputeProgram; using ::tflite::gpu::metal::CreateComputeProgram;
@implementation TFLBufferConvert { @implementation TFLBufferConvert {
@ -102,10 +102,10 @@ using ::tflite::gpu::metal::CreateComputeProgram;
[encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2]; [encoder setBytes:uniforms.data() length:uniforms.size() * sizeof(int) atIndex:2];
MTLSize group_size = MTLSizeMake(16, 16, 1); MTLSize group_size = MTLSizeMake(16, 16, 1);
int layers = IntegralDivideRoundUp(shape.c, 4); int layers = DivideRoundUp(shape.c, 4);
int groups_x = IntegralDivideRoundUp(shape.w, group_size.width); int groups_x = DivideRoundUp(shape.w, group_size.width);
int groups_y = IntegralDivideRoundUp(shape.h, group_size.height); int groups_y = DivideRoundUp(shape.h, group_size.height);
int groups_z = IntegralDivideRoundUp(layers, group_size.depth); int groups_z = DivideRoundUp(layers, group_size.depth);
MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z); MTLSize groups_count = MTLSizeMake(groups_x, groups_y, groups_z);
[encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size]; [encoder dispatchThreadgroups:groups_count threadsPerThreadgroup:group_size];
} }
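
The same threadgroup-count arithmetic sketched without the Metal API (buffer shape and sizes are hypothetical):

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  // Hypothetical BHWC buffer: 1x33x65x10, dispatched with 16x16x1 threadgroups.
  const int h = 33, w = 65, c = 10;
  const int layers = DivideRoundUp(c, 4);         // 3 slices of 4 channels
  const int groups_x = DivideRoundUp(w, 16);      // 5
  const int groups_y = DivideRoundUp(h, 16);      // 3
  const int groups_z = DivideRoundUp(layers, 1);  // 3
  assert(groups_x == 5 && groups_y == 3 && groups_z == 3);
  return 0;
}
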


@ -440,9 +440,9 @@ ComputeTaskDescriptorPtr NonLinkableStub(int operation_id, ValueId input_id,
desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [input_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(input_id)->second; const auto& dimension = buffers.find(input_id)->second;
uint3 groups_size{16, 16, 1}; uint3 groups_size{16, 16, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), DivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)}; DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };


@ -71,7 +71,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
// Also it is easy to write a loop in this case, to prevent long kernel // Also it is easy to write a loop in this case, to prevent long kernel
// generation. // generation.
for (int i = 0; i < channels.size(); ++i) { for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4); const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i); const std::string src_buffer = "src_buffer" + std::to_string(i);
c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n"; c += " for (int i = 0; i < " + std::to_string(depth) + "; ++i) {\n";
c += " int src_index = i * U.src_size.w + xy_offset;\n"; c += " int src_index = i * U.src_size.w + xy_offset;\n";
@ -88,7 +88,7 @@ std::string GetConcatZCode(const std::vector<int> channels) {
int read_index = 0; int read_index = 0;
int z = 0; int z = 0;
for (int i = 0; i < channels.size(); ++i) { for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4); const int depth = DivideRoundUp(channels[i], 4);
const std::string src_buffer = "src_buffer" + std::to_string(i); const std::string src_buffer = "src_buffer" + std::to_string(i);
for (int d = 0; d < depth; ++d) { for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4); const int channels_in_group = std::min(4, channels[i] - d * 4);
@ -168,11 +168,11 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
std::vector<int> uniform_params{ std::vector<int> uniform_params{
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
src_shape.w * src_shape.h, src_shape.w * src_shape.h,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
dst_shape.w * dst_shape.h, dst_shape.w * dst_shape.h,
}; };
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
@ -184,9 +184,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatZ(
uint3 grid(dst_shape.w, dst_shape.h, 1); uint3 grid(dst_shape.w, dst_shape.h, 1);
uint3 group_size{8u, 4u, 1u}; uint3 group_size{8u, 4u, 1u};
uint3 groups; uint3 groups;
groups.x = IntegralDivideRoundUp(grid.x, group_size.x); groups.x = DivideRoundUp(grid.x, group_size.x);
groups.y = IntegralDivideRoundUp(grid.y, group_size.y); groups.y = DivideRoundUp(grid.y, group_size.y);
groups.z = IntegralDivideRoundUp(grid.z, group_size.z); groups.z = DivideRoundUp(grid.z, group_size.z);
return std::make_pair(group_size, groups); return std::make_pair(group_size, groups);
}; };
@ -265,7 +265,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
[output_id](const std::map<ValueId, BHWC>& buffers) { [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h, std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
/*padding=*/0}; /*padding=*/0};
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -274,9 +274,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatX(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second; const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x); int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y); int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4); int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -356,7 +356,7 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
[output_id](const std::map<ValueId, BHWC>& buffers) { [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
std::vector<int> uniform_params{dimension.w, dimension.h, std::vector<int> uniform_params{dimension.w, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
/*padding=*/0}; /*padding=*/0};
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -365,9 +365,9 @@ std::vector<ComputeTaskDescriptorPtr> ConcatY(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& output_dims = buffers.find(output_id)->second; const auto& output_dims = buffers.find(output_id)->second;
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(output_dims.w, groups_size.x); int groups_x = DivideRoundUp(output_dims.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(output_dims.h, groups_size.y); int groups_y = DivideRoundUp(output_dims.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(output_dims.c, 4); int groups_z = DivideRoundUp(output_dims.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };


@ -72,7 +72,7 @@ struct ConvParams {
namespace { namespace {
int GetNumOutputSlices(int dst_channels) { int GetNumOutputSlices(int dst_channels) {
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4); const int dst_depth = DivideRoundUp(dst_channels, 4);
if (dst_depth % 4 == 0 || dst_depth >= 16) { if (dst_depth % 4 == 0 || dst_depth >= 16) {
return 4; return 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) { } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
@ -571,8 +571,8 @@ kernel void ComputeFunction(
std::vector<float> ReorderWeightsForConv( std::vector<float> ReorderWeightsForConv(
const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights, const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
const ConvParams& params) { const ConvParams& params) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); const int src_depth = DivideRoundUp(weights.shape.i, 4);
std::vector<float> weights_reordered( std::vector<float> weights_reordered(
weights.shape.w * weights.shape.h * weights.shape.w * weights.shape.h *
AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4); AlignByN(dst_depth, params.block_size.z) * 4 * src_depth * 4);
@ -580,8 +580,7 @@ std::vector<float> ReorderWeightsForConv(
bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4; bool isO4I4 = params.weight_layout == WeightsInnerBlockLayout::O4I4;
int counter = 0; int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, params.block_size.z); ++d) { for (int d = 0; d < DivideRoundUp(dst_depth, params.block_size.z); ++d) {
for (int y = 0; y < weights.shape.h; ++y) { for (int y = 0; y < weights.shape.h; ++y) {
for (int x = 0; x < weights.shape.w; ++x) { for (int x = 0; x < weights.shape.w; ++x) {
for (int s = 0; s < src_depth; ++s) { for (int s = 0; s < src_depth; ++s) {
@ -618,17 +617,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
const BHWC& dst_size, const BHWC& dst_size,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const ConvParams& params) { const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x); const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y); const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
attr.strides.w, attr.strides.w,
attr.strides.h, attr.strides.h,
-attr.padding.prepended.w, -attr.padding.prepended.w,
@ -652,17 +651,17 @@ std::vector<uint8_t> GetUniformBuffer(const BHWC& src_size,
std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size, std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
const BHWC& dst_size, const BHWC& dst_size,
const ConvParams& params) { const ConvParams& params) {
const int grid_x = IntegralDivideRoundUp(dst_size.w, params.block_size.x); const int grid_x = DivideRoundUp(dst_size.w, params.block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_size.h, params.block_size.y); const int grid_y = DivideRoundUp(dst_size.h, params.block_size.y);
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
1, 1,
1, 1,
0, 0,
@ -685,38 +684,37 @@ std::vector<uint8_t> GetUniformBufferForWinograd(const BHWC& src_size,
int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCount(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x, wg_size.x) *
       IntegralDivideRoundUp(grid_y, wg_size.y) *
       IntegralDivideRoundUp(grid_z, wg_size.z);
return DivideRoundUp(grid_x, wg_size.x) * DivideRoundUp(grid_y, wg_size.y) *
       DivideRoundUp(grid_z, wg_size.z);
} }
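
A small worked example (shape, block size, and work-group size invented) of the total group count computed above:

#include <cassert>

int DivideRoundUp(int n, int divisor) {
  const int q = n / divisor;
  return n % divisor == 0 ? q : q + 1;
}

int main() {
  // Hypothetical output shape 1x30x40x20, block size 2x2x2, work group 8x4x1.
  const int dst_slices = DivideRoundUp(20, 4);      // 5
  const int grid_x = DivideRoundUp(40, 2);          // 20
  const int grid_y = DivideRoundUp(30, 2);          // 15
  const int grid_z = DivideRoundUp(dst_slices, 2);  // 3
  const int groups = DivideRoundUp(grid_x, 8) *     // 3
                     DivideRoundUp(grid_y, 4) *     // 4
                     DivideRoundUp(grid_z, 1);      // 3
  assert(groups == 36);
  return 0;
}
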
int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCountForLinearWH(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y, wg_size.x) * return DivideRoundUp(grid_x * grid_y, wg_size.x) *
IntegralDivideRoundUp(grid_z, wg_size.y); DivideRoundUp(grid_z, wg_size.y);
} }
int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size, int GetGroupsCountForLinearWHS(const BHWC& dst_shape, const int3& wg_size,
const int3& block_size) { const int3& block_size) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
int grid_x = IntegralDivideRoundUp(dst_shape.w, block_size.x); int grid_x = DivideRoundUp(dst_shape.w, block_size.x);
int grid_y = IntegralDivideRoundUp(dst_shape.h, block_size.y); int grid_y = DivideRoundUp(dst_shape.h, block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, block_size.z); int grid_z = DivideRoundUp(dst_slices, block_size.z);
return IntegralDivideRoundUp(grid_x * grid_y * grid_z, wg_size.x); return DivideRoundUp(grid_x * grid_y * grid_z, wg_size.x);
} }
bool IsKernelXIs1(const Convolution2DAttributes& attr) { bool IsKernelXIs1(const Convolution2DAttributes& attr) {
@ -758,8 +756,8 @@ int GetRecommendedBlockSize(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info, ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params; ConvParams params;
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
@ -835,8 +833,8 @@ ConvParams GetConvParamsForA7A8(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info, ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
const Convolution2DAttributes& attr, const Convolution2DAttributes& attr,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape); int blk_total_size = GetRecommendedBlockSize(apple_info, dst_shape);
int3 block_size = int3(1, 1, 1); int3 block_size = int3(1, 1, 1);
if (blk_total_size >= 2 && apple_info.IsBionic()) { if (blk_total_size >= 2 && apple_info.IsBionic()) {
@ -917,8 +915,8 @@ ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr, ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
const RuntimeOptions& options, const RuntimeOptions& options,
const BHWC& dst_shape) { const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
ConvParams params; ConvParams params;
params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST; params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST;
params.x_kernel_is_1 = IsKernelXIs1(attr); params.x_kernel_is_1 = IsKernelXIs1(attr);
@ -1017,29 +1015,28 @@ ConvParams GetConvParams(const DeviceInfo& device_info,
std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params, std::pair<uint3, uint3> GetDispatchSizes(const ConvParams& params,
const BHWC& shape) { const BHWC& shape) {
const int dst_slices = IntegralDivideRoundUp(shape.c, 4); const int dst_slices = DivideRoundUp(shape.c, 4);
int grid_x = IntegralDivideRoundUp(shape.w, params.block_size.x); int grid_x = DivideRoundUp(shape.w, params.block_size.x);
int grid_y = IntegralDivideRoundUp(shape.h, params.block_size.y); int grid_y = DivideRoundUp(shape.h, params.block_size.y);
int grid_z = IntegralDivideRoundUp(dst_slices, params.block_size.z); int grid_z = DivideRoundUp(dst_slices, params.block_size.z);
const uint3 group_size(params.work_group_size.x, params.work_group_size.y, const uint3 group_size(params.work_group_size.x, params.work_group_size.y,
params.work_group_size.z); params.work_group_size.z);
int3 wg; int3 wg;
uint3 groups_count; uint3 groups_count;
if (params.linear_whs) { if (params.linear_whs) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x); wg.x = DivideRoundUp(grid_x * grid_y * grid_z, params.work_group_size.x);
groups_count = uint3(wg.x, 1, 1); groups_count = uint3(wg.x, 1, 1);
} else if (params.linear_wh) { } else if (params.linear_wh) {
wg.x = IntegralDivideRoundUp(grid_x * grid_y, params.work_group_size.x); wg.x = DivideRoundUp(grid_x * grid_y, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_z, params.work_group_size.y); wg.y = DivideRoundUp(grid_z, params.work_group_size.y);
groups_count = uint3(wg[params.work_group_launch_order.x], groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y], 1); wg[params.work_group_launch_order.y], 1);
} else { } else {
wg.x = IntegralDivideRoundUp(grid_x, params.work_group_size.x); wg.x = DivideRoundUp(grid_x, params.work_group_size.x);
wg.y = IntegralDivideRoundUp(grid_y, params.work_group_size.y); wg.y = DivideRoundUp(grid_y, params.work_group_size.y);
wg.z = IntegralDivideRoundUp(grid_z, params.work_group_size.z); wg.z = DivideRoundUp(grid_z, params.work_group_size.z);
groups_count = uint3(wg[params.work_group_launch_order.x], groups_count = uint3(wg[params.work_group_launch_order.x],
wg[params.work_group_launch_order.y], wg[params.work_group_launch_order.y],
wg[params.work_group_launch_order.z]); wg[params.work_group_launch_order.z]);
@ -1076,7 +1073,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
std::string addr_space = std::string addr_space =
params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant" params.weights_upload_type == WeightsUploadType::CONSTANT_MEM ? "constant"
: "device"; : "device";
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
desc->immutable_buffers = { desc->immutable_buffers = {
{addr_space + " FLT4* const filters", {addr_space + " FLT4* const filters",
GetByteBufferConverted(weights_reordered, options.storage_precision)}, GetByteBufferConverted(weights_reordered, options.storage_precision)},
@ -1108,7 +1105,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionWino4x4To6x6(
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape, int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
const Convolution2DAttributes& attr, const DeviceInfo& device_info, const Convolution2DAttributes& attr, const DeviceInfo& device_info,
const RuntimeOptions& options) { const RuntimeOptions& options) {
const int dst_slices = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
ConvParams params; ConvParams params;
params.work_group_launch_order = int3(2, 0, 1); params.work_group_launch_order = int3(2, 0, 1);
params.src_depth_loop_size = 1; params.src_depth_loop_size = 1;


@ -34,6 +34,7 @@ using ::tflite::gpu::Axis;
using ::tflite::gpu::BHWC; using ::tflite::gpu::BHWC;
using ::tflite::gpu::Convolution2DAttributes; using ::tflite::gpu::Convolution2DAttributes;
using ::tflite::gpu::DataType; using ::tflite::gpu::DataType;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::HW; using ::tflite::gpu::HW;
using ::tflite::gpu::Linear; using ::tflite::gpu::Linear;
using ::tflite::gpu::OHWI; using ::tflite::gpu::OHWI;
@ -44,7 +45,6 @@ using ::tflite::gpu::TensorRef;
using ::tflite::gpu::ValueId; using ::tflite::gpu::ValueId;
using ::tflite::gpu::metal::ConvolutionGeneric; using ::tflite::gpu::metal::ConvolutionGeneric;
using ::tflite::gpu::metal::ConvolutionWino4x4To6x6; using ::tflite::gpu::metal::ConvolutionWino4x4To6x6;
using ::tflite::gpu::IntegralDivideRoundUp;
using ::tflite::gpu::metal::CompareVectors; using ::tflite::gpu::metal::CompareVectors;
using ::tflite::gpu::metal::SingleOpModel; using ::tflite::gpu::metal::SingleOpModel;
@ -275,7 +275,7 @@ using ::tflite::gpu::metal::SingleOpModel;
BHWC conv_shape; BHWC conv_shape;
conv_shape.b = dst_shape.b; conv_shape.b = dst_shape.b;
conv_shape.h = 36; conv_shape.h = 36;
conv_shape.w = IntegralDivideRoundUp(new_width, 4) * IntegralDivideRoundUp(new_height, 4); conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
conv_shape.c = dst_shape.c; conv_shape.c = dst_shape.c;
TensorFloat32 src_tensor; TensorFloat32 src_tensor;


@ -208,7 +208,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride1x1 // DepthWiseConv3x3Stride1x1
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1( std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1(
const DepthwiseConvolution2DAttributes& attr) { const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3; const int kernel_x = 3;
const int kernel_y = 3; const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth * std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -250,11 +250,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride1x1(
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
-params.padding.prepended.w, -params.padding.prepended.w,
-params.padding.prepended.h, -params.padding.prepended.h,
0, // dummy, for alignment 0, // dummy, for alignment
@ -403,7 +403,7 @@ kernel void ComputeFunction(
// DepthWiseConv3x3Stride2 // DepthWiseConv3x3Stride2
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2( std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2(
const DepthwiseConvolution2DAttributes& attr) { const DepthwiseConvolution2DAttributes& attr) {
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int kernel_x = 3; const int kernel_x = 3;
const int kernel_y = 3; const int kernel_y = 3;
std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth * std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
@ -445,11 +445,11 @@ static std::vector<uint8_t> GetUniformBufferDepthWiseConv3x3Stride2(
src_size.w, src_size.w,
src_size.h, src_size.h,
src_size.w * src_size.h, src_size.w * src_size.h,
IntegralDivideRoundUp(src_size.c, 4), DivideRoundUp(src_size.c, 4),
dst_size.w, dst_size.w,
dst_size.h, dst_size.h,
dst_size.w * dst_size.h, dst_size.w * dst_size.h,
IntegralDivideRoundUp(dst_size.c, 4), DivideRoundUp(dst_size.c, 4),
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
attr.strides.w, attr.strides.w,
@ -586,11 +586,11 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
std::vector<int> uniform_params{ std::vector<int> uniform_params{
dimension.w, dimension.w,
dimension.h, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
0, 0,
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
0, 0,
attr.strides.w, attr.strides.w,
attr.strides.h, attr.strides.h,
@ -612,9 +612,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
uint3 groups_size{8, 4, 1}; uint3 groups_size{8, 4, 1};
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), DivideRoundUp(dimension.h, groups_size.y),
IntegralDivideRoundUp(dimension.c, 4)}; DivideRoundUp(dimension.c, 4)};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };
@ -661,17 +661,17 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dimension.w, 2); const int grid_x = DivideRoundUp(dimension.w, 2);
const int grid_y = IntegralDivideRoundUp(dimension.h, 2); const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4); const int grid_z = DivideRoundUp(dimension.c, 4);
uint3 group_size{8, 4, 1}; uint3 group_size{8, 4, 1};
if (grid_x <= 4) { if (grid_x <= 4) {
group_size.x = 4; group_size.x = 4;
group_size.z = grid_z % 2 == 0 ? 2 : 1; group_size.z = grid_z % 2 == 0 ? 2 : 1;
} }
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
}; };
@ -726,12 +726,12 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
const int grid_x = dimension.w; const int grid_x = dimension.w;
const int grid_y = IntegralDivideRoundUp(dimension.h, 2); const int grid_y = DivideRoundUp(dimension.h, 2);
const int grid_z = IntegralDivideRoundUp(dimension.c, 4); const int grid_z = DivideRoundUp(dimension.c, 4);
const uint3 group_size{8, 4, 1}; const uint3 group_size{8, 4, 1};
const int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); const int groups_x = DivideRoundUp(grid_x, group_size.x);
const int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); const int groups_y = DivideRoundUp(grid_y, group_size.y);
const int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); const int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(group_size, uint3(groups_x, groups_y, groups_z));
}; };
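The resize_function lambdas in this file, and in the kernels that follow, all share the same dispatch arithmetic: each grid dimension is rounded up to a whole number of work groups so that a trailing partial group still gets launched (out-of-range thread ids are typically guarded inside the kernels). A sketch of the pattern; GetGroupsCount is an illustrative name, not a function from this file, and uint3/DivideRoundUp are the delegate types used above:

// Round a 3D grid up to whole work groups.
uint3 GetGroupsCount(const uint3& grid, const uint3& group_size) {
  return uint3(DivideRoundUp(grid.x, group_size.x),
               DivideRoundUp(grid.y, group_size.y),
               DivideRoundUp(grid.z, group_size.z));
}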

View File

@ -45,7 +45,7 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
const std::string barrier = device_info.IsWaveSizeEqualTo32() const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER" ? "SIMDGROUP_BARRIER"
: "threadgroup_barrier"; : "threadgroup_barrier";
const int src_depth = IntegralDivideRoundUp(src_channels, 4); const int src_depth = DivideRoundUp(src_channels, 4);
std::stringstream code; std::stringstream code;
code << R"( code << R"(
#include <metal_stdlib> #include <metal_stdlib>
@ -116,9 +116,8 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
} }
} }
)"; )";
const int src_depth_sub_groups = shared_memory const int src_depth_sub_groups = shared_memory ? DivideRoundUp(src_depth, 32)
? IntegralDivideRoundUp(src_depth, 32) : DivideRoundUp(src_depth, 4);
: IntegralDivideRoundUp(src_depth, 4);
return absl::Substitute(code.str(), src_depth_sub_groups, barrier); return absl::Substitute(code.str(), src_depth_sub_groups, barrier);
} }
} // namespace } // namespace
@ -146,7 +145,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
bool shared_memory = bool shared_memory =
device_info.IsAppleGPU() && device_info.IsAppleGPU() &&
device_info.apple_info.IsLocalMemoryPreferredOverGlobal(); device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4); const int src_depth_aligned = AlignByN(src_depth, shared_memory ? 32 : 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
@ -179,8 +178,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
{"constant uniforms& params", {"constant uniforms& params",
[attr](const std::map<ValueId, BHWC>& buffers) { [attr](const std::map<ValueId, BHWC>& buffers) {
std::vector<uint32_t> uniform_params{ std::vector<uint32_t> uniform_params{
static_cast<uint32_t>( static_cast<uint32_t>(DivideRoundUp(attr.weights.shape.i, 4)),
IntegralDivideRoundUp(attr.weights.shape.i, 4)),
static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)), static_cast<uint32_t>(AlignByN(attr.weights.shape.o, 8)),
static_cast<uint32_t>(attr.weights.shape.o), static_cast<uint32_t>(attr.weights.shape.o),
static_cast<uint32_t>(0), static_cast<uint32_t>(0),
@ -192,7 +190,7 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 8);
int groups_x = IntegralDivideRoundUp(dst_channels_aligned, groups_size.x); int groups_x = DivideRoundUp(dst_channels_aligned, groups_size.x);
return std::make_pair(groups_size, uint3{groups_x, 1, 1}); return std::make_pair(groups_size, uint3{groups_x, 1, 1});
}; };
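The fully connected kernel pads both ends of the matrix before dispatch: the input depth is the number of 4-channel slices, then aligned to the reduction group (32 slices when threadgroup memory is used, 4 otherwise), and the output channels are aligned to 8. AlignByN can be expressed directly in terms of the renamed helper; a sketch with a hypothetical shape:

// Round value up to the next multiple of n (AlignByN behaves like this).
int AlignByN(int value, int n) { return DivideRoundUp(value, n) * n; }

// Hypothetical attr.weights.shape.i == 70 with shared memory enabled:
//   src_depth         = DivideRoundUp(70, 4) = 18 slices
//   src_depth_aligned = AlignByN(18, 32)     = 32 slices (the extra slices are padding)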

View File

@ -131,9 +131,9 @@ std::vector<ComputeTaskDescriptorPtr> MaxUnpooling(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, params); BHWC dst_shape = CalculateOutputShape(src_shape, params);
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4); int groups_z = DivideRoundUp(dst_shape.c, 4);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -133,7 +133,7 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
[input_id, output_id, [input_id, output_id,
work_group_size](const std::map<ValueId, BHWC>& buffers) { work_group_size](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const int src_slices = IntegralDivideRoundUp(src_shape.c, 4); const int src_slices = DivideRoundUp(src_shape.c, 4);
struct uniforms { struct uniforms {
int4 src_size; int4 src_size;
float4 inv_multipliers; float4 inv_multipliers;
@ -153,8 +153,8 @@ std::vector<ComputeTaskDescriptorPtr> Mean(int id, ValueId input_id,
desc->resize_function = [output_id, work_group_size]( desc->resize_function = [output_id, work_group_size](
const std::map<ValueId, BHWC>& buffers) { const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second; BHWC dst_shape = buffers.find(output_id)->second;
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_slices = DivideRoundUp(dst_shape.c, 4);
const int groups_z = IntegralDivideRoundUp(dst_slices, work_group_size.z); const int groups_z = DivideRoundUp(dst_slices, work_group_size.z);
return std::make_pair(work_group_size, uint3{1, 1, groups_z}); return std::make_pair(work_group_size, uint3{1, 1, groups_z});
}; };
return {desc}; return {desc};

View File

@ -177,12 +177,12 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
dimension.w, dimension.w,
dimension.h, dimension.h,
dimension.c, dimension.c,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
// int4 dst_size // int4 dst_size
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
output_dimension.c, output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
// int4 prepended padding // int4 prepended padding
attr.prepended.w, attr.prepended.w,
attr.prepended.h, attr.prepended.h,
@ -198,10 +198,10 @@ std::vector<ComputeTaskDescriptorPtr> Padding(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr); BHWC dst_shape = CalculateOutputShape(src_shape, attr);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -224,11 +224,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
std::vector<int> uniform_params = { std::vector<int> uniform_params = {
dimension.w, dimension.w,
dimension.h, dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
dimension.w * dimension.h, dimension.w * dimension.h,
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
output_dimension.w * output_dimension.h, output_dimension.w * output_dimension.h,
params.strides.w, params.strides.w,
params.strides.h, params.strides.h,
@ -242,11 +242,11 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second; BHWC dst_shape = buffers.find(output_id)->second;
const uint3 grid = const uint3 grid =
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4)); uint3(dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -159,11 +159,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h, const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4)); DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -197,14 +197,14 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
const auto& dst_dim = buffers.find(output_id)->second; const auto& dst_dim = buffers.find(output_id)->second;
std::vector<int32_t> uniform_params{ std::vector<int32_t> uniform_params{
// int4 src_size // int4 src_size
src_dim.w, src_dim.h, IntegralDivideRoundUp(src_dim.c, 4), src_dim.w, src_dim.h, DivideRoundUp(src_dim.c, 4),
src_dim.w * src_dim.h, src_dim.w * src_dim.h,
// int4 dst_size // int4 dst_size
dst_dim.w, dst_dim.h, IntegralDivideRoundUp(dst_dim.c, 4), dst_dim.w, dst_dim.h, DivideRoundUp(dst_dim.c, 4),
dst_dim.w * dst_dim.h, dst_dim.w * dst_dim.h,
// int2 plane_xz // int2 plane_xz
src_dim.w * IntegralDivideRoundUp(src_dim.c, 4), src_dim.w * DivideRoundUp(src_dim.c, 4),
dst_dim.w * IntegralDivideRoundUp(dst_dim.c, 4), dst_dim.w * DivideRoundUp(dst_dim.c, 4),
0, // dummy, for alignment 0, // dummy, for alignment
0, // dummy, for alignment 0, // dummy, for alignment
0, // dummy, for alignment 0, // dummy, for alignment
@ -218,11 +218,11 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h, const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4)); DivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); int groups_x = DivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); int groups_y = DivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -153,10 +153,10 @@ std::vector<ComputeTaskDescriptorPtr> Resize(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& dst_dim = buffers.find(output_id)->second; const auto& dst_dim = buffers.find(output_id)->second;
int groups_x = IntegralDivideRoundUp(dst_dim.w, groups_size.x); int groups_x = DivideRoundUp(dst_dim.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_dim.h, groups_size.y); int groups_y = DivideRoundUp(dst_dim.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_dim.c, 4); const int dst_layers = DivideRoundUp(dst_dim.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};

View File

@ -157,12 +157,12 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
dimension.w, dimension.w,
dimension.h, dimension.h,
dimension.c, dimension.c,
IntegralDivideRoundUp(dimension.c, 4), DivideRoundUp(dimension.c, 4),
// int4 dst_size // int4 dst_size
output_dimension.w, output_dimension.w,
output_dimension.h, output_dimension.h,
output_dimension.c, output_dimension.c,
IntegralDivideRoundUp(output_dimension.c, 4), DivideRoundUp(output_dimension.c, 4),
}; };
return GetByteBuffer(uniform_params); return GetByteBuffer(uniform_params);
}}, }},
@ -173,10 +173,10 @@ std::vector<ComputeTaskDescriptorPtr> Slice(int id, ValueId input_id,
const uint3 groups_size{16, 16, 1}; const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, attr); BHWC dst_shape = CalculateOutputShape(src_shape, attr);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(dst_shape.c, 4); const int dst_layers = DivideRoundUp(dst_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z); int groups_z = DivideRoundUp(dst_layers, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };

View File

@ -169,8 +169,8 @@ std::vector<ComputeTaskDescriptorPtr> Softmax(int id, ValueId input_id,
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
uint3 groups_size{8, 4, 1}; uint3 groups_size{8, 4, 1};
const auto& dimension = buffers.find(output_id)->second; const auto& dimension = buffers.find(output_id)->second;
uint3 groups_count{IntegralDivideRoundUp(dimension.w, groups_size.x), uint3 groups_count{DivideRoundUp(dimension.w, groups_size.x),
IntegralDivideRoundUp(dimension.h, groups_size.y), 1}; DivideRoundUp(dimension.h, groups_size.y), 1};
return std::make_pair(groups_size, groups_count); return std::make_pair(groups_size, groups_count);
}; };
@ -198,13 +198,13 @@ std::vector<ComputeTaskDescriptorPtr> Softmax1x1(int id, ValueId input_id,
desc->uniform_buffers = { desc->uniform_buffers = {
{"constant uniforms& params", {"constant uniforms& params",
[channels_count](const std::map<ValueId, BHWC>& buffers) { [channels_count](const std::map<ValueId, BHWC>& buffers) {
const int src_depth = IntegralDivideRoundUp(channels_count, 4); const int src_depth = DivideRoundUp(channels_count, 4);
struct uniforms { struct uniforms {
int4 size; int4 size;
float4 mask; float4 mask;
}; };
uniforms params; uniforms params;
params.size = {src_depth, IntegralDivideRoundUp(src_depth, 32), 1, 1}; params.size = {src_depth, DivideRoundUp(src_depth, 32), 1, 1};
params.mask = {0.0f, 0.0f, 0.0f, 0.0f}; params.mask = {0.0f, 0.0f, 0.0f, 0.0f};
const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4; const int reminder = channels_count % 4 == 0 ? 4 : channels_count % 4;
for (int i = 0; i < reminder; ++i) { for (int i = 0; i < reminder; ++i) {
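The Softmax1x1 uniforms above also carry a per-lane mask: when channels_count is not a multiple of 4, the last slice is only partially valid, and the loop marks exactly the valid lanes. A self-contained sketch of the idea, with std::array standing in for the delegate's float4 type:

#include <array>

// Lanes that hold real channels get 1.0f; padded lanes of the last slice stay 0.0f.
std::array<float, 4> LastSliceMask(int channels_count) {
  std::array<float, 4> mask = {0.0f, 0.0f, 0.0f, 0.0f};
  const int remainder = channels_count % 4 == 0 ? 4 : channels_count % 4;
  for (int i = 0; i < remainder; ++i) mask[i] = 1.0f;
  return mask;  // e.g. channels_count == 10 -> {1, 1, 0, 0}
}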

View File

@ -113,12 +113,12 @@ kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) {
input_shape.h / attr.block_size, input_shape.h / attr.block_size,
input_shape.w / attr.block_size, input_shape.w / attr.block_size,
input_shape.c * attr.block_size * attr.block_size); input_shape.c * attr.block_size * attr.block_size);
const uint3 grid = uint3(output_shape.w, output_shape.h, const uint3 grid =
IntegralDivideRoundUp(output_shape.c, 4)); uint3(output_shape.w, output_shape.h, DivideRoundUp(output_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid); const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
const int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); const int groups_x = DivideRoundUp(grid.x, groups_size.x);
const int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); const int groups_y = DivideRoundUp(grid.y, groups_size.y);
const int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); const int groups_z = DivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z)); return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z));
}; };
return {desc}; return {desc};

View File

@ -130,8 +130,8 @@ std::string GetDeconvolution(const ConvolutionTransposedAttributes& attr) {
constant_args, attr.padding.prepended.w, attr.padding.prepended.h, constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x, attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1); inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
return absl::Substitute(shader_source, src_depth * dst_channels_aligned, return absl::Substitute(shader_source, src_depth * dst_channels_aligned,
src_depth, dst_depth, attr.weights.shape.o, src_depth, dst_depth, attr.weights.shape.o,
@ -264,8 +264,8 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
constant_args, attr.padding.prepended.w, attr.padding.prepended.h, constant_args, attr.padding.prepended.w, attr.padding.prepended.h,
attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x, attr.stride.w, attr.stride.h, kernel_x, kernel_y, inner_size_x,
inner_size_y, kernel_x - 1, kernel_y - 1); inner_size_y, kernel_x - 1, kernel_y - 1);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4); const int dst_channels_aligned = AlignByN(attr.weights.shape.o, 4);
const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w; const int src_local_size_x = (workgroup_x + kernel_x) / attr.stride.w;
const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h; const int src_local_size_y = (workgroup_y + kernel_y) / attr.stride.h;
@ -464,7 +464,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
(kThreadGroupWidth + params.weights.shape.w) / params.stride.w; (kThreadGroupWidth + params.weights.shape.w) / params.stride.w;
const int src_local_size_y = const int src_local_size_y =
(kThreadGroupHeight + params.weights.shape.h) / params.stride.h; (kThreadGroupHeight + params.weights.shape.h) / params.stride.h;
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4); const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int shared_size = const int shared_size =
sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y; sizeof(float) * 4 * src_depth * src_local_size_x * src_local_size_y;
if (shared_size < 1000 * 16 && if (shared_size < 1000 * 16 &&
@ -543,8 +543,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1}; const uint3 groups_size{kThreadGroupWidth, kThreadGroupHeight, 1};
BHWC dst_shape = BHWC dst_shape =
CalculateOutputShape(buffers.find(input_id)->second, params); CalculateOutputShape(buffers.find(input_id)->second, params);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x); int groups_x = DivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y); int groups_y = DivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = 1; int groups_z = 1;
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
@ -556,8 +556,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
int id, ValueId input_id, ValueId output_id, int id, ValueId input_id, ValueId output_id,
const ConvolutionTransposedAttributes& params, const ConvolutionTransposedAttributes& params,
const DeviceInfo& device_info, const RuntimeOptions& options) { const DeviceInfo& device_info, const RuntimeOptions& options) {
const int src_depth = IntegralDivideRoundUp(params.weights.shape.i, 4); const int src_depth = DivideRoundUp(params.weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(params.weights.shape.o, 4); const int dst_depth = DivideRoundUp(params.weights.shape.o, 4);
const int kernel_x = 4; const int kernel_x = 4;
const int kernel_y = 4; const int kernel_y = 4;
@ -645,7 +645,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
[input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) { [input_id, output_id, params](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int src_depth = IntegralDivideRoundUp(src_shape.c, 4); const int src_depth = DivideRoundUp(src_shape.c, 4);
std::vector<int> uniform_params{ std::vector<int> uniform_params{
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
@ -653,7 +653,7 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
src_shape.w * src_shape.h, src_shape.w * src_shape.h,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
4 * 16 * src_depth, 4 * 16 * src_depth,
0, 0,
@ -667,13 +667,13 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
desc->resize_function = [output_id, block_size, desc->resize_function = [output_id, block_size,
params](const std::map<ValueId, BHWC>& buffers) { params](const std::map<ValueId, BHWC>& buffers) {
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int grid_x = IntegralDivideRoundUp(dst_shape.w + 2, 2 * block_size.x); const int grid_x = DivideRoundUp(dst_shape.w + 2, 2 * block_size.x);
const int grid_y = IntegralDivideRoundUp(dst_shape.h + 2, 2 * block_size.y); const int grid_y = DivideRoundUp(dst_shape.h + 2, 2 * block_size.y);
const int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); const int grid_z = DivideRoundUp(dst_shape.c, 4);
const uint3 group_size{8, 4, 1}; const uint3 group_size{8, 4, 1};
int groups_x = IntegralDivideRoundUp(grid_x, group_size.x); int groups_x = DivideRoundUp(grid_x, group_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, group_size.y); int groups_y = DivideRoundUp(grid_y, group_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, group_size.z); int groups_z = DivideRoundUp(grid_z, group_size.z);
return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y}); return std::make_pair(group_size, uint3{groups_z, groups_x, groups_y});
}; };
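The shared-memory variant of ConvolutionTransposed above is only selected when the per-group tile of source values fits a roughly 16 KB budget (the 1000 * 16 check). A rough worked example, assuming for illustration 16x16 thread groups, a 3x3 kernel, stride 2, and 32 input channels:

// src_local_size_x = (16 + 3) / 2 = 9
// src_local_size_y = (16 + 3) / 2 = 9
// src_depth        = DivideRoundUp(32, 4) = 8
// shared_size      = sizeof(float) * 4 * 8 * 9 * 9 = 10368 bytes, under the 16000-byte limit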

View File

@ -486,8 +486,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
BHWC dst_shape; BHWC dst_shape;
dst_shape.b = src_shape.b; dst_shape.b = src_shape.b;
dst_shape.h = 36; dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) * dst_shape.w =
IntegralDivideRoundUp(new_height, 4); DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c; dst_shape.c = src_shape.c;
return dst_shape; return dst_shape;
}}; }};
@ -501,16 +501,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
attr.padding.appended.w - 2; attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h + int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2; attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4); int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4); int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
@ -529,12 +529,12 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36(
src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2; src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
int new_height = int new_height =
src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2; src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
int grid_x = IntegralDivideRoundUp(new_width, 4); int grid_x = DivideRoundUp(new_width, 4);
int grid_y = IntegralDivideRoundUp(new_height, 4); int grid_y = DivideRoundUp(new_height, 4);
int grid_z = IntegralDivideRoundUp(src_shape.c, 4); int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -563,8 +563,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
BHWC dst_shape; BHWC dst_shape;
dst_shape.b = src_shape.b; dst_shape.b = src_shape.b;
dst_shape.h = 36; dst_shape.h = 36;
dst_shape.w = IntegralDivideRoundUp(new_width, 4) * dst_shape.w =
IntegralDivideRoundUp(new_height, 4); DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
dst_shape.c = src_shape.c; dst_shape.c = src_shape.c;
return dst_shape; return dst_shape;
}}; }};
@ -593,16 +593,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
attr.padding.appended.w - 2; attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h + int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2; attr.padding.appended.h - 2;
int tiles_x = IntegralDivideRoundUp(new_width, 4); int tiles_x = DivideRoundUp(new_width, 4);
int tiles_y = IntegralDivideRoundUp(new_height, 4); int tiles_y = DivideRoundUp(new_height, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
-attr.padding.prepended.w, -attr.padding.prepended.w,
-attr.padding.prepended.h, -attr.padding.prepended.h,
@ -619,10 +619,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd4x4To36TileX6(
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
int grid_x = dst_shape.w; int grid_x = dst_shape.w;
int grid_y = 6; int grid_y = 6;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -665,8 +665,8 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.h, IntegralDivideRoundUp(src_shape.c, 4), 0, src_shape.w, src_shape.h, DivideRoundUp(src_shape.c, 4), 0,
dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4), 0, dst_shape.w, dst_shape.h, DivideRoundUp(dst_shape.c, 4), 0,
}; };
return GetByteBuffer(sizes); return GetByteBuffer(sizes);
}}, }},
@ -677,10 +677,10 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4(
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
int grid_x = src_shape.w; int grid_x = src_shape.w;
int grid_y = 1; int grid_y = 1;
int grid_z = IntegralDivideRoundUp(src_shape.c, 4); int grid_z = DivideRoundUp(src_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
@ -734,16 +734,16 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
[input_id, output_id](const std::map<ValueId, BHWC>& buffers) { [input_id, output_id](const std::map<ValueId, BHWC>& buffers) {
const auto& src_shape = buffers.find(input_id)->second; const auto& src_shape = buffers.find(input_id)->second;
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
std::vector<int> sizes = { std::vector<int> sizes = {
src_shape.w, src_shape.w,
src_shape.h, src_shape.h,
IntegralDivideRoundUp(src_shape.c, 4), DivideRoundUp(src_shape.c, 4),
0, 0,
dst_shape.w, dst_shape.w,
dst_shape.h, dst_shape.h,
IntegralDivideRoundUp(dst_shape.c, 4), DivideRoundUp(dst_shape.c, 4),
0, 0,
tiles_x, tiles_x,
tiles_y, tiles_y,
@ -757,14 +757,14 @@ std::vector<ComputeTaskDescriptorPtr> Winograd36To4x4Tile4x1(
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) { desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{8, 4, 1}; const uint3 groups_size{8, 4, 1};
const auto& dst_shape = buffers.find(output_id)->second; const auto& dst_shape = buffers.find(output_id)->second;
const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4); const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4); const int tiles_y = DivideRoundUp(dst_shape.h, 4);
int grid_x = tiles_x * tiles_y; int grid_x = tiles_x * tiles_y;
int grid_y = 4; int grid_y = 4;
int grid_z = IntegralDivideRoundUp(dst_shape.c, 4); int grid_z = DivideRoundUp(dst_shape.c, 4);
int groups_x = IntegralDivideRoundUp(grid_x, groups_size.x); int groups_x = DivideRoundUp(grid_x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid_y, groups_size.y); int groups_y = DivideRoundUp(grid_y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid_z, groups_size.z); int groups_z = DivideRoundUp(grid_z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
}; };
return {desc}; return {desc};
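Across the Winograd kernels above, the tile arithmetic determines both the intermediate layout and the dispatch. A worked example with a hypothetical 13x9 input and 1 pixel of padding on each side feeding Winograd4x4To36:

// new_width  = 13 + 1 + 1 - 2 = 13  ->  tiles_x = DivideRoundUp(13, 4) = 4
// new_height =  9 + 1 + 1 - 2 =  9  ->  tiles_y = DivideRoundUp(9, 4)  = 3
// dst_shape.w = tiles_x * tiles_y = 12, dst_shape.h = 36 (6x6 transformed
// values per 4x4 output tile), dst_shape.c = src_shape.c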