/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/experimental/ruy/block_map.h"
|
|
|
|
#include "profiling/instrumentation.h"
|
|
#include "tensorflow/lite/experimental/ruy/check_macros.h"
|
|
#include "tensorflow/lite/experimental/ruy/opt_set.h"
|
|
#include "tensorflow/lite/experimental/ruy/size_util.h"
|
|
|
|
namespace ruy {
|
|
|
|
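
// Maps a linear block index to 2D block coordinates (LHS block row, RHS block
// column), honoring the traversal order stored in the BlockMap. The low
// rectangularness bits select the extra subdivision along one dimension; the
// remaining bits are decoded either linearly or by de-interleaving a Morton
// (fractal Z) code, optionally converted to U-order.
//
// Worked example (illustration only, not taken from the original comments):
// with no rectangularness and fractal Z order, n1 = 0b1110 has even-position
// bits {0, 1} and odd-position bits {1, 1}, so it decodes to
// (br, bc) = (2, 3); U-order additionally XORs, giving (2 ^ 3, 3) = (1, 3).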
void GetBlockByIndex(const BlockMap& block_map, int index,
                     SidePair<int>* block) {
  gemmlowp::ScopedProfilingLabel label("GetBlockByIndex");
  const std::uint32_t index_u32 = index;

  const std::uint32_t rectr =
      index_u32 & ((1u << block_map.rectangularness_log2[Side::kLhs]) - 1);
  const std::uint32_t rectc =
      index_u32 & ((1u << block_map.rectangularness_log2[Side::kRhs]) - 1);

  const std::uint32_t n1 =
      index_u32 >> (block_map.rectangularness_log2[Side::kLhs] +
                    block_map.rectangularness_log2[Side::kRhs]);
  RUY_DCHECK_EQ(index_u32,
                (n1 << (block_map.rectangularness_log2[Side::kLhs] +
                        block_map.rectangularness_log2[Side::kRhs])) +
                    rectr + rectc);

  std::uint32_t br, bc;
  if (block_map.traversal_order == BlockMapTraversalOrder::kLinear) {
    br = n1 & ((1u << block_map.num_blocks_base_log2) - 1);
    bc = n1 >> block_map.num_blocks_base_log2;
  } else {
    // Decode fractal z-order
    const std::uint32_t n2 = (n1 & 0x99999999u) | ((n1 & 0x44444444u) >> 1) |
                             ((n1 & 0x22222222u) << 1);
    const std::uint32_t n4 = (n2 & 0xc3c3c3c3u) | ((n2 & 0x30303030u) >> 2) |
                             ((n2 & 0x0c0c0c0cu) << 2);
    const std::uint32_t n8 = (n4 & 0xf00ff00fu) | ((n4 & 0x0f000f00u) >> 4) |
                             ((n4 & 0x00f000f0u) << 4);
    const std::uint32_t n16 = (n8 & 0xff0000ffu) | ((n8 & 0x00ff0000u) >> 8) |
                              ((n8 & 0x0000ff00u) << 8);

    br = n16 & 0xffff;
    bc = n16 >> 16;
    if (block_map.traversal_order == BlockMapTraversalOrder::kFractalU) {
      // Change fractal z-order to u-order
      br ^= bc;
    }
  }

  br = (br << block_map.rectangularness_log2[Side::kLhs]) + rectr;
  bc = (bc << block_map.rectangularness_log2[Side::kRhs]) + rectc;

  // Store
  (*block)[Side::kLhs] = br;
  (*block)[Side::kRhs] = bc;
}

namespace {
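
// Computes floor(log2(num / denom)) for num > denom, and 0 otherwise; in
// other words, for num > denom, the largest k such that (denom << k) <= num.
// For example (illustration only), floor_log2_quotient(48, 5) == 3 since
// 5 << 3 == 40 <= 48 < 80 == 5 << 4.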
int floor_log2_quotient(int num, int denom) {
  if (num <= denom) {
    return 0;
  }
  int log2_quotient = floor_log2(num) - ceil_log2(denom);
  if ((denom << (log2_quotient + 1)) <= num) {
    log2_quotient++;
  }
  return log2_quotient;
}

}  // namespace

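// Builds the BlockMap for a (rows x cols) destination matrix: chooses the
// traversal order, the base grid of 2^num_blocks_base_log2 blocks per side,
// the extra 'rectangularness' subdivision along one dimension, and the
// per-side small-block sizes together with the number of leading blocks that
// get one extra kernel-sized slice to absorb the remainder.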
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                  int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
  gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
  RUY_DCHECK_GE(rows, kernel_rows);
  RUY_DCHECK_GE(cols, kernel_cols);
  RUY_DCHECK_EQ(rows % kernel_rows, 0);
  RUY_DCHECK_EQ(cols % kernel_cols, 0);

  block_map->traversal_order = BlockMapTraversalOrder::kLinear;
  if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
      (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
          cache_friendly_traversal_threshold) {
    block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                     ? BlockMapTraversalOrder::kFractalU
                                     : BlockMapTraversalOrder::kFractalZ;
  }

  // See the comment on BlockMap in block_map.h.
  // The destination matrix shape (rows x cols) is to be subdivided into a
  // square (N x N) grid of blocks, whose shapes must be multiples of the
  // kernel block shape (kernel_rows x kernel_cols).
  // Inside each of these N*N blocks, we may have one further level of
  // subdivision either along rows or along cols, but not both, to better
  // handle the highly rectangular cases. That is what we call
  // 'rectangularness'. This extra level of subdivision is into
  // (1 << rows_rectangularness_log2) blocks along the rows dimension, or into
  // (1 << cols_rectangularness_log2) blocks along the cols dimension.
  int rows_rectangularness_log2 = 0;
  int cols_rectangularness_log2 = 0;
  // In order to compute these rectangularness values, we need to divide
  // the destination matrix's aspect ratio,
  //   rows / cols
  // by the kernel block's aspect ratio,
  //   kernel_rows / kernel_cols.
  // The quotient of these two quotients simplifies to
  //   (rows * kernel_cols) / (cols * kernel_rows)
  // Whence the introduction of the following products:
  const int rows_times_kernel_cols = rows * kernel_cols;
  const int cols_times_kernel_rows = cols * kernel_rows;
  if (rows_times_kernel_cols > cols_times_kernel_rows) {
    rows_rectangularness_log2 =
        floor_log2_quotient(rows_times_kernel_cols, cols_times_kernel_rows);
    // Sanity check that we did not over-estimate rows_rectangularness_log2.
    RUY_DCHECK_GE(rows_times_kernel_cols >> rows_rectangularness_log2,
                  cols_times_kernel_rows);
  } else if (cols_times_kernel_rows > rows_times_kernel_cols) {
    cols_rectangularness_log2 =
        floor_log2_quotient(cols_times_kernel_rows, rows_times_kernel_cols);
    // Sanity check that we did not over-estimate cols_rectangularness_log2.
    RUY_DCHECK_GE(cols_times_kernel_rows >> cols_rectangularness_log2,
                  rows_times_kernel_cols);
  }
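
  // Worked example (illustration only, not from the original comments): with
  // rows = 1024, cols = 256 and an 8x8 kernel, rows_times_kernel_cols = 8192
  // and cols_times_kernel_rows = 2048, so rows_rectangularness_log2 =
  // floor_log2_quotient(8192, 2048) = 2 and cols_rectangularness_log2 = 0:
  // each grid block is split 4x further along the rows dimension.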

  RUY_DCHECK(!rows_rectangularness_log2 || !cols_rectangularness_log2);

  const int size = std::min(rows, cols);
  const int size_floor_log2 = floor_log2(size);
  const int depth_ceil_log2 = ceil_log2(depth);
  const int kernel_width_log2 = ceil_log2(std::max(kernel_cols, kernel_rows));

  // l1_size_log2 was originally, coarsely speaking, the number of rows of LHS,
  // or the number of columns of RHS, in a matrix multiplication that we expect
  // to fit in L1 cache.
  //
  // This initial rationale is not necessarily still relevant. The logic below
  // was determined empirically, not in a principled way.
  int l1_size_log2;
  if (size_floor_log2 <= 3) {
    l1_size_log2 = size_floor_log2;
  } else if (size_floor_log2 <= 6) {
    l1_size_log2 = 4;
  } else {
    l1_size_log2 = 5;
  }

  // The 15 here implicitly encodes a target 32k L1 cache (2^15 == 32k).
  // Once again, this only has a distant memory of being originally motivated
  // by such clear principles linking this logic to cache sizes.
  l1_size_log2 = std::min(
      l1_size_log2, 15 - depth_ceil_log2 -
                        ceil_log2(std::max(lhs_scalar_size, rhs_scalar_size)));
  l1_size_log2 = std::max(l1_size_log2, kernel_width_log2);
  l1_size_log2 = std::min(l1_size_log2, size_floor_log2);

  int num_blocks_base_log2 = size_floor_log2 - l1_size_log2;
  RUY_DCHECK_GE(num_blocks_base_log2, 0);
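
  // Worked example (illustration only): for a 1024x1024 float matmul with
  // depth 1024 and an 8x8 kernel, size_floor_log2 = 10 and l1_size_log2 is
  // clamped down to 15 - 10 - 2 = 3, so num_blocks_base_log2 = 7: the base
  // grid is 128x128 blocks of roughly 8x8 destination entries each.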

  const int num_blocks_of_rows_log2 =
      num_blocks_base_log2 + rows_rectangularness_log2;
  const int num_blocks_of_cols_log2 =
      num_blocks_base_log2 + cols_rectangularness_log2;
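
  // Each side is cut into (1 << num_blocks_of_*_log2) blocks. smallr/smallc
  // are the per-block sizes rounded down to a multiple of the kernel size;
  // missr/missc count how many kernel-sized slices are left over, i.e. how
  // many of the leading blocks must be one kernel size larger so that the
  // blocks exactly cover the destination matrix.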
  const int smallr =
      round_down_pot(rows >> num_blocks_of_rows_log2, kernel_rows);
  const int smallc =
      round_down_pot(cols >> num_blocks_of_cols_log2, kernel_cols);
  const int missr =
      round_up_pot(rows - (smallr << num_blocks_of_rows_log2), kernel_rows) >>
      floor_log2(kernel_rows);
  const int missc =
      round_up_pot(cols - (smallc << num_blocks_of_cols_log2), kernel_cols) >>
      floor_log2(kernel_cols);

  block_map->dims[Side::kLhs] = rows;
  block_map->dims[Side::kRhs] = cols;
  block_map->kernel_dims[Side::kLhs] = kernel_rows;
  block_map->kernel_dims[Side::kRhs] = kernel_cols;
  block_map->num_blocks_base_log2 = num_blocks_base_log2;
  block_map->rectangularness_log2[Side::kLhs] = rows_rectangularness_log2;
  block_map->rectangularness_log2[Side::kRhs] = cols_rectangularness_log2;
  block_map->small_block_dims[Side::kLhs] = smallr;
  block_map->small_block_dims[Side::kRhs] = smallc;
  block_map->large_blocks[Side::kLhs] = missr;
  block_map->large_blocks[Side::kRhs] = missc;
}
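
// Computes the half-open range [*start, *end) of destination rows (LHS side)
// or columns (RHS side) covered by the given block index on that side. The
// first large_blocks[side] blocks are one kernel size larger than the
// remaining ones (see MakeBlockMap above).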
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
                          int* start, int* end) {
  gemmlowp::ScopedProfilingLabel label("GetBlockMatrixCoords");
  *start = block * block_map.small_block_dims[side] +
           std::min(block, block_map.large_blocks[side]) *
               block_map.kernel_dims[side];
  *end =
      *start + block_map.small_block_dims[side] +
      (block < block_map.large_blocks[side] ? block_map.kernel_dims[side] : 0);

  RUY_DCHECK_EQ(0, *start % block_map.kernel_dims[side]);
  RUY_DCHECK_EQ(0, *end % block_map.kernel_dims[side]);
  RUY_DCHECK_LE(*end, block_map.dims[side]);
  RUY_DCHECK_LT(*start, *end);
  RUY_DCHECK_GE(*start, 0);
}
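
// Convenience overload: computes the [start, end) ranges for both sides of
// the given block at once.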
void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair<int>& block,
                          SidePair<int>* start, SidePair<int>* end) {
  for (Side side : {Side::kLhs, Side::kRhs}) {
    GetBlockMatrixCoords(side, block_map, block[side], &(*start)[side],
                         &(*end)[side]);
  }
}

}  // namespace ruy