Allow the compiler to vectorize the loop in Col2im by marking im_data as __restrict.
```
name  old time/op  new time/op  delta
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_SAME  18.6ms ± 5%  18.5ms ±13%  ~  (p=0.912 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_SAME  12.7ms ±12%  12.7ms ±17%  ~  (p=0.684 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_SAME  12.2ms ± 8%  11.2ms ± 4%  -8.21%  (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_VALID  18.7ms ±20%  18.6ms ±23%  ~  (p=0.278 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_VALID  12.5ms ±15%  11.4ms ± 2%  -8.98%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_VALID  11.6ms ± 8%  11.1ms ± 2%  -4.22%  (p=0.011 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x64_s1x1_SAME  4.57ms ± 3%  4.34ms ± 1%  -5.04%  (p=0.000 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x256_s1x1_SAME  12.0ms ± 4%  11.5ms ± 2%  -4.32%  (p=0.000 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x256_f1x1x64_s1x1_SAME  20.0ms ±31%  20.6ms ±17%  ~  (p=0.912 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f3x3x64_s1x1_SAME  36.5ms ±21%  32.0ms ± 1%  -12.30%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x128_s1x1_SAME  3.71ms ±17%  3.33ms ± 1%  -10.47%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x512_s1x1_SAME  11.8ms ±16%  10.5ms ± 1%  -11.37%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f1x1x128_s1x1_SAME  13.1ms ±13%  11.4ms ± 2%  -13.36%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f3x3x128_s1x1_SAME  142ms ±12%  124ms ± 1%  -13.22%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x256_s1x1_SAME  3.51ms ±14%  3.18ms ±20%  -9.43%  (p=0.009 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x1024_s1x1_SAME  14.0ms ±18%  12.0ms ± 4%  -13.80%  (p=0.012 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x1024_f1x1x256_s1x1_SAME  12.8ms ±18%  11.1ms ± 2%  -13.57%  (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f3x3x256_s1x1_SAME  23.0ms ±18%  19.9ms ± 4%  -13.38%  (p=0.004 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_SAME  45.3ms ± 9%  40.5ms ± 4%  -10.74%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_SAME  33.2ms ±13%  28.8ms ± 2%  -13.11%  (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_SAME  31.5ms ±15%  26.7ms ± 2%  -15.13%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_VALID  45.5ms ± 8%  41.3ms ± 9%  -9.31%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_VALID  33.2ms ±12%  28.8ms ± 2%  -13.38%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_VALID  31.6ms ±14%  26.7ms ± 1%  -15.53%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x64_s1x1_SAME  11.1ms ±15%  9.4ms ± 3%  -15.29%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x256_s1x1_SAME  27.3ms ±13%  23.5ms ± 2%  -13.97%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x256_f1x1x64_s1x1_SAME  46.2ms ± 8%  40.9ms ± 5%  -11.54%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f3x3x64_s1x1_SAME  81.5ms ±17%  64.2ms ± 1%  -21.17%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x128_s1x1_SAME  8.30ms ±16%  6.80ms ± 1%  -18.02%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x512_s1x1_SAME  27.7ms ±14%  23.3ms ± 2%  -15.87%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f1x1x128_s1x1_SAME  34.9ms ±14%  28.8ms ± 2%  -17.41%  (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f3x3x128_s1x1_SAME  300ms ±12%  250ms ± 1%  -16.80%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x256_s1x1_SAME  7.91ms ±15%  7.17ms ±21%  -9.36%  (p=0.029 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x1024_s1x1_SAME  35.6ms ±15%  33.4ms ±13%  -6.21%  (p=0.023 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x1024_f1x1x256_s1x1_SAME  28.0ms ± 3%  26.7ms ± 1%  -4.81%  (p=0.000 n=8+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f3x3x256_s1x1_SAME  41.4ms ± 3%  39.6ms ± 4%  -4.43%  (p=0.001 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_SAME  90.0ms ± 3%  85.5ms ±11%  -5.00%  (p=0.010 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_SAME  69.0ms ± 1%  65.8ms ± 3%  -4.68%  (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_SAME  57.8ms ± 2%  55.8ms ± 2%  -3.54%  (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_VALID  89.3ms ± 2%  86.1ms ± 5%  -3.57%  (p=0.006 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_VALID  68.7ms ± 2%  65.7ms ± 3%  -4.31%  (p=0.001 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_VALID  57.1ms ± 1%  55.6ms ± 1%  -2.58%  (p=0.000 n=8+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x64_s1x1_SAME  23.6ms ± 9%  21.9ms ± 1%  -7.27%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x256_s1x1_SAME  51.0ms ± 3%  48.4ms ± 5%  -5.13%  (p=0.001 n=8+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x256_f1x1x64_s1x1_SAME  90.2ms ± 3%  85.4ms ± 5%  -5.29%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f3x3x64_s1x1_SAME  143ms ± 7%  133ms ± 3%  -6.48%  (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x128_s1x1_SAME  14.5ms ± 4%  14.0ms ± 2%  -3.82%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x512_s1x1_SAME  53.8ms ± 3%  51.5ms ± 2%  -4.43%  (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f1x1x128_s1x1_SAME  69.8ms ± 5%  66.9ms ± 8%  -4.15%  (p=0.010 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f3x3x128_s1x1_SAME  533ms ± 3%  508ms ± 0%  -4.65%  (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x256_s1x1_SAME  14.0ms ± 6%  14.9ms ±21%  ~  (p=0.481 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x1024_s1x1_SAME  67.2ms ± 3%  71.0ms ±22%  ~  (p=0.278 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x1024_f1x1x256_s1x1_SAME  57.5ms ± 4%  56.6ms ± 5%  ~  (p=0.055 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f3x3x256_s1x1_SAME  78.1ms ± 5%  75.5ms ± 3%  -3.32%  (p=0.011 n=9+8)
```

PiperOrigin-RevId: 316949219
Change-Id: I1a1fb517a5c28d489da9762b650577b61bf4e0de
This commit is contained in:
parent
82fce3390d
commit
d5ca984c53
@ -76,7 +76,7 @@ template <typename T>
|
||||
void Col2im(const T* col_data, const int depth, const int height,
|
||||
const int width, const int filter_h, const int filter_w,
|
||||
const int pad_t, const int pad_l, const int pad_b, const int pad_r,
|
||||
const int stride_h, const int stride_w, T* im_data) {
|
||||
const int stride_h, const int stride_w, T* __restrict im_data) {
|
||||
int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
|
||||
int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
|
||||
int h_pad = -pad_t;
|
||||
@ -87,7 +87,6 @@ void Col2im(const T* col_data, const int depth, const int height,
|
||||
for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
|
||||
for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
|
||||
if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
|
||||
// TODO(andydavis) Vectorize this loop (if compiler does not).
|
||||
for (int i = 0; i < depth; ++i) {
|
||||
im_patch_data[i] += col_data[i];
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user