Remove a division from an inner loop. The division was the single most costly operation in this function, accounting for ~10% of its runtime. The fix hoists the loop-invariant `rhs.depthOffset()` call (which performs the division) out of the packing loop, caching it in `rhs_depth_offset` so it is computed once per block instead of once per iteration.

The 10% saving is consistent with the benchmark results:
```
name                                                                                                                                                     old time/op             new time/op             delta
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_VALID_s1x1_is1x1_B256x56  [patch: 5x5 D32; num_patches=3600 patch_size=800 num_inputs=32 padding=1 ]             3.06µs ±10%             3.02µs ± 9%     ~            (p=0.661 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_SAME_s1x1_is1x1_B256x56   [patch: 5x5 D32; num_patches=4096 patch_size=800 num_inputs=32 padding=2 ]             3.61µs ±12%             3.25µs ± 4%   -9.95%         (p=0.000 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_VALID_s2x2_is1x1_B256x56  [patch: 5x5 D32; num_patches=900 patch_size=800 num_inputs=32 padding=1  ]             3.65µs ±12%             3.46µs ± 7%   -5.09%         (p=0.028 n=10+9)
BM_PackRhs_float_32_64x64_IC32_FC64_5x5_SAME_s2x2_is1x1_B256x56   [patch: 5x5 D32; num_patches=1024 patch_size=800 num_inputs=32 padding=2 ]             4.03µs ±12%             3.64µs ± 3%   -9.80%         (p=0.000 n=10+8)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_SAME_s1x1_is1x1_B256x56   [patch: 5x5 D30; num_patches=4096 patch_size=750 num_inputs=34 padding=2 ]             4.10µs ±14%             3.71µs ± 4%   -9.68%         (p=0.001 n=10+9)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_VALID_s1x1_is1x1_B256x56  [patch: 5x5 D30; num_patches=3600 patch_size=750 num_inputs=34 padding=1 ]             3.36µs ± 3%             3.33µs ± 4%     ~            (p=0.356 n=9+10)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_SAME_s2x2_is1x1_B256x56   [patch: 5x5 D30; num_patches=1024 patch_size=750 num_inputs=34 padding=2 ]             4.12µs ± 7%             4.03µs ± 5%     ~            (p=0.278 n=9+10)
BM_PackRhs_float_32_64x64_IC30_FC64_5x5_VALID_s2x2_is1x1_B256x56  [patch: 5x5 D30; num_patches=900 patch_size=750 num_inputs=34 padding=1  ]             3.78µs ±10%             3.63µs ± 3%   -4.14%          (p=0.008 n=9+8)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_SAME_s1x1_is1x1_B256x56  [patch: 8x8 D4; num_patches=65536 patch_size=256 num_inputs=16 padding=2 ]             5.80µs ±11%             5.40µs ± 3%   -6.84%         (p=0.001 n=10+9)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_VALID_s1x1_is1x1_B256x56 [patch: 8x8 D4; num_patches=62001 patch_size=256 num_inputs=16 padding=1 ]             4.87µs ±10%             4.73µs ± 4%     ~           (p=0.190 n=10+10)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_SAME_s2x4_is1x1_B256x56  [patch: 8x8 D4; num_patches=8192 patch_size=256 num_inputs=16 padding=2  ]             6.04µs ± 1%             6.06µs ± 5%     ~            (p=0.762 n=8+10)
BM_PackRhs_float_32_256x256_IC4_FC16_8x8_VALID_s2x4_is1x1_B256x56 [patch: 8x8 D4; num_patches=7875 patch_size=256 num_inputs=16 padding=1  ]             5.29µs ± 2%             5.38µs ± 6%     ~            (p=0.400 n=9+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_SAME_s1x1_is1x1_B36x432    [patch: 3x3 D4; num_patches=4096 patch_size=36 num_inputs=256 padding=2  ]             13.2µs ± 4%             11.8µs ± 6%  -10.43%          (p=0.000 n=9+8)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_VALID_s1x1_is1x1_B36x432   [patch: 3x3 D4; num_patches=3844 patch_size=36 num_inputs=256 padding=1  ]             10.9µs ±10%             10.9µs ±13%     ~            (p=0.780 n=9+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_SAME_s2x2_is1x1_B36x432    [patch: 3x3 D4; num_patches=1024 patch_size=36 num_inputs=256 padding=2  ]             11.3µs ± 3%             10.9µs ±16%     ~           (p=0.052 n=10+10)
BM_PackRhs_float_32_64x64_IC4_FC16_3x3_VALID_s2x2_is1x1_B36x432   [patch: 3x3 D4; num_patches=961 patch_size=36 num_inputs=256 padding=1   ]             9.18µs ± 4%             9.21µs ±14%     ~           (p=0.481 n=10+10)
BM_PackRhs_float_32_32x32_IC96_FC96_5x5_SAME_s1x1_is2x2_B272x240  [patch: 5x5 D96; num_patches=3969 patch_size=2400 num_inputs=42 padding=2]             16.5µs ± 0%             17.0µs ±17%     ~             (p=0.931 n=9+9)
BM_PackRhs_float_32_32x32_IC96_FC96_5x5_VALID_s1x1_is2x2_B272x240 [patch: 5x5 D96; num_patches=3481 patch_size=2400 num_inputs=42 padding=1]             16.5µs ± 1%             16.7µs ± 8%     ~             (p=0.423 n=8+9)
BM_PackRhs_qint8_32_64x64_IC32_FC64_5x5_SAME_s1x1_is1x1_B256x56   [patch: 5x5 D32; num_patches=4096 patch_size=800 num_inputs=128 padding=2]             3.83µs ±11%             3.61µs ± 1%   -5.85%         (p=0.000 n=10+8)
BM_PackLhs_float_128_FC1024_3x3_B256x56                           [filter: count=1024 dims=3x3; input: depth=128; num_filers=113           ]             8.07µs ± 2%             8.07µs ± 0%     ~             (p=1.000 n=9+9)
BM_PackLhs_float_128_FC1024_3x3_B56x256                           [filter: count=1024 dims=3x3; input: depth=128; num_filers=113           ]             10.0µs ± 3%             10.0µs ± 4%     ~           (p=0.796 n=10+10)
BM_PackLhs_float_30_FC64_3x3_B256x56                              [filter: count=64 dims=3x3; input: depth=30; num_filers=7767             ]             1.25µs ± 2%             1.26µs ± 4%     ~            (p=0.447 n=9+10)
BM_PackLhs_float_50_FC64_3x3_B56x256                              [filter: count=64 dims=3x3; input: depth=50; num_filers=4660             ]             4.11µs ± 2%             4.09µs ± 2%     ~            (p=0.780 n=10+9)
```

PiperOrigin-RevId: 321393418
Change-Id: I501c54635e2bc7e8f506f80fca70a860050905ce
Commit: 37e9ec4b3b (parent: 3822d5f114)
Author: A. Unique TensorFlower
Date: 2020-07-15 10:51:07 -07:00
Committed by: TensorFlower Gardener

View File

@ -180,6 +180,7 @@ struct gemm_pack_colmajor_block<
const StorageIndex start_col = rhs.colOffset();
const StorageIndex max_col = rhs.maxCol(peeled_k);
const StorageIndex rhs_depth_offset = rhs.depthOffset();
for (StorageIndex col = 0; col < cols; ++col) {
SubMapper lm = rhs.getLinearMapper(0, col);
@ -199,7 +200,7 @@ struct gemm_pack_colmajor_block<
if (!has_padding ||
(!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
const StorageIndex start_depth =
(c == start_col) ? rhs.depthOffset() : 0;
(c == start_col) ? rhs_depth_offset : 0;
const StorageIndex max_depth =
std::min<StorageIndex>(start_depth + (peeled_k - k),
@ -286,7 +287,7 @@ struct gemm_pack_colmajor_block<
eigen_assert(k <= peeled_k);
const StorageIndex start_depth =
((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
const StorageIndex max_depth =
rhs.maxDepth(peeled_k - k, start_depth);
@ -359,6 +360,7 @@ struct gemm_pack_colmajor_block<
const StorageIndex start_col = rhs.colOffset();
const StorageIndex max_col = rhs.maxCol(peeled_k);
const StorageIndex rhs_depth_offset = rhs.depthOffset();
// Original input column and row after applying all non-standard strides and
// dilations. Computed by padOrSkip{Row,Col}.
@ -380,7 +382,7 @@ struct gemm_pack_colmajor_block<
eigen_assert(k <= peeled_k);
const StorageIndex start_depth =
((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
const StorageIndex max_depth =
rhs.maxDepth(peeled_k - k, start_depth);