Remove a div from an inner loop by computing `rhs.depthOffset()` once, before the loop, instead of on every iteration. The div is the single most costly operation in this function, amounting to ~10% of its runtime.
The 10% saving is consistent with the benchmark results:

```
name old time/op new time/op delta
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_VALID_s1x1_is1x1_B256x56 [patch: 5x5 D32; num_patches=3600 patch_size=800 num_inputs=32 padding=1 ] 3.06µs ±10% 3.02µs ± 9% ~ (p=0.661 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_SAME_s1x1_is1x1_B256x56 [patch: 5x5 D32; num_patches=4096 patch_size=800 num_inputs=32 padding=2 ] 3.61µs ±12% 3.25µs ± 4% -9.95% (p=0.000 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_VALID_s2x2_is1x1_B256x56 [patch: 5x5 D32; num_patches=900 patch_size=800 num_inputs=32 padding=1 ] 3.65µs ±12% 3.46µs ± 7% -5.09% (p=0.028 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_SAME_s2x2_is1x1_B256x56 [patch: 5x5 D32; num_patches=1024 patch_size=800 num_inputs=32 padding=2 ] 4.03µs ±12% 3.64µs ± 3% -9.80% (p=0.000 n=10+8)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_SAME_s1x1_is1x1_B256x56 [patch: 5x5 D30; num_patches=4096 patch_size=750 num_inputs=34 padding=2 ] 4.10µs ±14% 3.71µs ± 4% -9.68% (p=0.001 n=10+9)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_VALID_s1x1_is1x1_B256x56 [patch: 5x5 D30; num_patches=3600 patch_size=750 num_inputs=34 padding=1 ] 3.36µs ± 3% 3.33µs ± 4% ~ (p=0.356 n=9+10)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_SAME_s2x2_is1x1_B256x56 [patch: 5x5 D30; num_patches=1024 patch_size=750 num_inputs=34 padding=2 ] 4.12µs ± 7% 4.03µs ± 5% ~ (p=0.278 n=9+10)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_VALID_s2x2_is1x1_B256x56 [patch: 5x5 D30; num_patches=900 patch_size=750 num_inputs=34 padding=1 ] 3.78µs ±10% 3.63µs ± 3% -4.14% (p=0.008 n=9+8)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_SAME_s1x1_is1x1_B256x56 [patch: 8x8 D4; num_patches=65536 patch_size=256 num_inputs=16 padding=2 ] 5.80µs ±11% 5.40µs ± 3% -6.84% (p=0.001 n=10+9)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_VALID_s1x1_is1x1_B256x56 [patch: 8x8 D4; num_patches=62001 patch_size=256 num_inputs=16 padding=1 ] 4.87µs ±10% 4.73µs ± 4% ~ (p=0.190 n=10+10)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_SAME_s2x4_is1x1_B256x56 [patch: 8x8 D4; num_patches=8192 patch_size=256 num_inputs=16 padding=2 ] 6.04µs ± 1% 6.06µs ± 5% ~ (p=0.762 n=8+10)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_VALID_s2x4_is1x1_B256x56 [patch: 8x8 D4; num_patches=7875 patch_size=256 num_inputs=16 padding=1 ] 5.29µs ± 2% 5.38µs ± 6% ~ (p=0.400 n=9+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_SAME_s1x1_is1x1_B36x432 [patch: 3x3 D4; num_patches=4096 patch_size=36 num_inputs=256 padding=2 ] 13.2µs ± 4% 11.8µs ± 6% -10.43% (p=0.000 n=9+8)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_VALID_s1x1_is1x1_B36x432 [patch: 3x3 D4; num_patches=3844 patch_size=36 num_inputs=256 padding=1 ] 10.9µs ±10% 10.9µs ±13% ~ (p=0.780 n=9+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_SAME_s2x2_is1x1_B36x432 [patch: 3x3 D4; num_patches=1024 patch_size=36 num_inputs=256 padding=2 ] 11.3µs ± 3% 10.9µs ±16% ~ (p=0.052 n=10+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_VALID_s2x2_is1x1_B36x432 [patch: 3x3 D4; num_patches=961 patch_size=36 num_inputs=256 padding=1 ] 9.18µs ± 4% 9.21µs ±14% ~ (p=0.481 n=10+10)
BM_PackRhs_float_32_32x32_IC96_FC96_5x5_SAME_s1x1_is2x2_B272x240 [patch: 5x5 D96; num_patches=3969 patch_size=2400 num_inputs=42 padding=2] 16.5µs ± 0% 17.0µs ±17% ~ (p=0.931 n=9+9)
BM_PackRhs_float_32_32x32_IC96_FC96_5x5_VALID_s1x1_is2x2_B272x240 [patch: 5x5 D96; num_patches=3481 patch_size=2400 num_inputs=42 padding=1] 16.5µs ± 1% 16.7µs ± 8% ~ (p=0.423 n=8+9)
BM_PackRhs_qint8_32_64x64_IC32_FC64_5x5_SAME_s1x1_is1x1_B256x56 [patch: 5x5 D32; num_patches=4096 patch_size=800 num_inputs=128 padding=2] 3.83µs ±11% 3.61µs ± 1% -5.85% (p=0.000 n=10+8)
BM_PackLhs_float_128_FC1024_3x3_B256x56 [filter: count=1024 dims=3x3; input: depth=128; num_filers=113 ] 8.07µs ± 2% 8.07µs ± 0% ~ (p=1.000 n=9+9)
BM_PackLhs_float_128_FC1024_3x3_B56x256 [filter: count=1024 dims=3x3; input: depth=128; num_filers=113 ] 10.0µs ± 3% 10.0µs ± 4% ~ (p=0.796 n=10+10)
BM_PackLhs_float_30_FC64_3x3_B256x56 [filter: count=64 dims=3x3; input: depth=30; num_filers=7767 ] 1.25µs ± 2% 1.26µs ± 4% ~ (p=0.447 n=9+10)
BM_PackLhs_float_50_FC64_3x3_B56x256 [filter: count=64 dims=3x3; input: depth=50; num_filers=4660 ] 4.11µs ± 2% 4.09µs ± 2% ~ (p=0.780 n=10+9)
```

PiperOrigin-RevId: 321393418
Change-Id: I501c54635e2bc7e8f506f80fca70a860050905ce
This commit is contained in:
parent
3822d5f114
commit
37e9ec4b3b
@ -180,6 +180,7 @@ struct gemm_pack_colmajor_block<
|
||||
|
||||
const StorageIndex start_col = rhs.colOffset();
|
||||
const StorageIndex max_col = rhs.maxCol(peeled_k);
|
||||
const StorageIndex rhs_depth_offset = rhs.depthOffset();
|
||||
|
||||
for (StorageIndex col = 0; col < cols; ++col) {
|
||||
SubMapper lm = rhs.getLinearMapper(0, col);
|
||||
@ -199,7 +200,7 @@ struct gemm_pack_colmajor_block<
|
||||
if (!has_padding ||
|
||||
(!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
|
||||
const StorageIndex start_depth =
|
||||
(c == start_col) ? rhs.depthOffset() : 0;
|
||||
(c == start_col) ? rhs_depth_offset : 0;
|
||||
|
||||
const StorageIndex max_depth =
|
||||
std::min<StorageIndex>(start_depth + (peeled_k - k),
|
||||
@ -286,7 +287,7 @@ struct gemm_pack_colmajor_block<
|
||||
eigen_assert(k <= peeled_k);
|
||||
|
||||
const StorageIndex start_depth =
|
||||
((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
|
||||
((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
|
||||
const StorageIndex max_depth =
|
||||
rhs.maxDepth(peeled_k - k, start_depth);
|
||||
|
||||
@ -359,6 +360,7 @@ struct gemm_pack_colmajor_block<
|
||||
|
||||
const StorageIndex start_col = rhs.colOffset();
|
||||
const StorageIndex max_col = rhs.maxCol(peeled_k);
|
||||
const StorageIndex rhs_depth_offset = rhs.depthOffset();
|
||||
|
||||
// Original input column and row after applying all non-standard strides and
|
||||
// dilations. Computed by padOrSkip{Row,Col}.
|
||||
@ -380,7 +382,7 @@ struct gemm_pack_colmajor_block<
|
||||
eigen_assert(k <= peeled_k);
|
||||
|
||||
const StorageIndex start_depth =
|
||||
((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
|
||||
((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
|
||||
const StorageIndex max_depth =
|
||||
rhs.maxDepth(peeled_k - k, start_depth);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user