diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
index c84d7f0bafc..7f6d1e80046 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -129,6 +129,7 @@ class TensorContractionInputMapper<
     m_colStride = patch_rows;

     m_outputRows = tensor.impl().outputRows();
+    m_outputCols = tensor.impl().outputCols();
     m_row_strides = tensor.impl().userRowStride();
     m_col_strides = tensor.impl().userColStride();

@@ -187,6 +188,7 @@ class TensorContractionInputMapper<
     m_inputCols = base_mapper.m_inputCols;

     m_outputRows = base_mapper.m_outputRows;
+    m_outputCols = base_mapper.m_outputCols;
     m_row_strides = base_mapper.m_row_strides;
     m_col_strides = base_mapper.m_col_strides;

@@ -652,7 +654,8 @@ class TensorContractionInputMapper<
   Index m_inputRows;  // Number of rows in the input tensor
   Index m_inputCols;  // Number of cols in the input tensor
-  Index m_outputRows;  // Number of patch rows
+  Index m_outputRows;  // Number of convolution output rows
+  Index m_outputCols;  // Number of convolution output columns
   Index m_row_strides;  // User specified row stride
   Index m_col_strides;  // User specified col stride
@@ -872,6 +875,23 @@ class TensorContractionSubMapper<
                                     inputIndex, mask<Packet>(0, num_coeffs));
   }
   EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool hasPadding() const {
+    // TODO(ezhulenev): It seems that for an inflated filter it's still
+    // possible to guarantee "no padding or skipping" for non-standard packing.
+    if (nonStandardPatches()) return true;
+
+    // Check if the output rows and columns match the PADDING_VALID case. If
+    // they do, it means that there is no padding for the input tensor.
+    const bool match_rows = m_base_mapper.m_outputRows ==
+                            divup(m_base_mapper.m_inputRows - patchRows() + 1,
+                                  m_base_mapper.m_row_strides);
+    const bool match_cols = m_base_mapper.m_outputCols ==
+                            divup(m_base_mapper.m_inputCols - patchCols() + 1,
+                                  m_base_mapper.m_col_strides);
+
+    return !match_rows || !match_cols;
+  }
+  EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
     const Index r = m_rowIndex + row;
     return r < 0 || r >= m_base_mapper.m_inputRows;
@@ -1629,16 +1649,14 @@ EIGEN_DEVICE_FUNC
     case PADDING_VALID: {
       const TensorIndex InputRowsEff = InputRows + padding_top + padding_bottom;
       const TensorIndex InputColsEff = InputCols + padding_left + padding_right;
-      out_height = numext::ceil((InputRowsEff - kernelRowsEff + 1.f) /
-                                static_cast<float>(row_stride));
-      out_width = numext::ceil((InputColsEff - kernelColsEff + 1.f) /
-                               static_cast<float>(col_stride));
+      out_height = divup(InputRowsEff - kernelRowsEff + 1, row_stride);
+      out_width = divup(InputColsEff - kernelColsEff + 1, col_stride);
       break;
     }
     case PADDING_SAME: {
       eigen_assert(!padding_explicit);
-      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
-      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
+      out_height = divup(InputRows, row_stride);
+      out_width = divup(InputCols, col_stride);
       break;
     }
     default: {
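
The divup() rewrite above trades the float-based numext::ceil() for pure integer arithmetic, and the new hasPadding() relies on both forms producing identical output shapes. Below is a minimal standalone check of that equivalence; divup and valid_out are local stand-ins written for this note, not the TensorFlow definitions:

#include <cassert>
#include <cmath>

using Index = long;

// Integer ceil-division as used by the patch: requires a >= 0, b > 0.
Index divup(Index a, Index b) { return (a + b - 1) / b; }

// PADDING_VALID output size, integer form from the hunk above.
Index valid_out(Index input, Index kernel, Index stride) {
  return divup(input - kernel + 1, stride);
}

int main() {
  // Exhaustively compare against the float formula the patch removes.
  for (Index input = 1; input <= 64; ++input)
    for (Index kernel = 1; kernel <= input; ++kernel)
      for (Index stride = 1; stride <= 4; ++stride) {
        const Index by_ceil = static_cast<Index>(std::ceil(
            (input - kernel + 1.f) / static_cast<float>(stride)));
        assert(valid_out(input, kernel, stride) == by_ceil);
      }
  return 0;
}

Besides removing the int-float-int round trip, the integer form cannot lose precision for very large tensor dimensions.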
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 8715475adbb..c163eb887d7 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -115,13 +115,23 @@ struct gemm_pack_colmajor_block<
     if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
       // Single packet always belong to single patch (row, col).
-      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true>(
-          block, rhs, rows, cols);
+      if (rhs.hasPadding()) {
+        packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true,
+                            /*has_padding*/ true>(block, rhs, rows, cols);
+      } else {
+        packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true,
+                            /*has_padding*/ false>(block, rhs, rows, cols);
+      }

     } else if (standard_patches) {
       // Single packet can span across multiple patch rows or columns.
-      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false>(
-          block, rhs, rows, cols);
+      if (rhs.hasPadding()) {
+        packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false,
+                            /*has_padding*/ true>(block, rhs, rows, cols);
+      } else {
+        packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false,
+                            /*has_padding*/ false>(block, rhs, rows, cols);
+      }

     } else if (rhs.patchDepth() % packet_size == 0) {
       // Single packet always belong to single patch (row, col).
@@ -138,8 +148,8 @@ struct gemm_pack_colmajor_block<
  private:
   // (A) Standard image patches:
   //
-  //   (1) in_row_stride = 1 && in_col_stide == 1
-  //   (2) patch_row_inflate_strides == 1 && patch_col_inflate_strides == 1
+  //   (1) patch_row_inflate_strides == 1    AND
+  //   (2) patch_col_inflate_strides == 1
   //
   // Standard patches guarantee that two inner most dimensions (depth and rows)
   // are contiguous in memory and we can try to squeeze reads from them.
@@ -154,8 +164,11 @@ struct gemm_pack_colmajor_block<
   // depth dimension size to be a multiple of packet size, so we can skip all
   // non vectorized loads and checks, because it's guaranteed that block size
   // will be a multiple of a packet size (see TensorContractionBlocking).
-
-  template <bool patch_depth_is_multiple_of_packet_size>
+  //
+  // - has_padding: Input tensor has non-zero padding. In this case for each
+  //   patch col and row we need to check that it doesn't correspond to the
+  //   padded region of the original input.
+  template <bool patch_depth_is_multiple_of_packet_size, bool has_padding>
   EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
                                                const DataMapper rhs,
                                                StorageIndex rows,
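
The point of the new has_padding template parameter is that the padding checks become compile-time constants inside the hot packing loops. The following toy analogue shows the same dispatch pattern; pack and pack_dispatch are made-up stand-ins for packStandardPatches and its caller, not the real mapper API:

#include <cassert>
#include <cstddef>

// Toy packer: writes `pad_before` zeros, then the source values.
template <bool has_padding>
void pack(const float* src, float* dst, std::size_t n, std::size_t pad_before) {
  for (std::size_t i = 0; i < n; ++i) {
    // When has_padding == false this folds to `false` at compile time and
    // the whole check vanishes from the instantiated loop.
    const bool pad = has_padding && i < pad_before;
    dst[i] = pad ? 0.0f : src[has_padding ? i - pad_before : i];
  }
}

// Runtime dispatch, mirroring the rhs.hasPadding() branches above.
void pack_dispatch(const float* src, float* dst, std::size_t n,
                   std::size_t pad_before) {
  if (pad_before > 0) {
    pack</*has_padding=*/true>(src, dst, n, pad_before);
  } else {
    pack</*has_padding=*/false>(src, dst, n, /*pad_before=*/0);
  }
}

int main() {
  const float src[4] = {1, 2, 3, 4};
  float dst[6];
  pack_dispatch(src, dst, 6, 2);  // two padded zeros, then the source
  assert(dst[0] == 0 && dst[1] == 0 && dst[2] == 1 && dst[5] == 4);
  pack_dispatch(src, dst, 4, 0);  // no padding: a plain copy
  assert(dst[0] == 1 && dst[3] == 4);
  return 0;
}

Paying one branch per packing call instead of one per element is exactly the trade the hunks above make.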
@@ -177,10 +190,14 @@ struct gemm_pack_colmajor_block<
       const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
       const StorageIndex max_row = rhs.maxRow(peeled_k, c);

-      const bool pad_col = lm.padCol(c);
+      const bool pad_col = has_padding && lm.padCol(c);
+
+      eigen_assert(has_padding || !lm.padCol(c));
+      eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

       // We can squeeze reads for all rows in [start_row, max_row) range.
-      if (!pad_col && !lm.padAnyRow(start_row, max_row - 1)) {
+      if (!has_padding ||
+          (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
         const StorageIndex start_depth =
             (c == start_col) ? rhs.depthOffset() : 0;
@@ -196,6 +213,24 @@ struct gemm_pack_colmajor_block<
           eigen_assert((max_depth - start_depth) % packet_size == 0);
           StorageIndex d = start_depth;

+          const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
+          for (; d <= unrolled_depth; d += 4 * packet_size) {
+            eigen_assert(k < peeled_k);
+
+            Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
+            Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
+            Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
+            Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);
+
+            internal::pstoreu(block + 0 * packet_size, p0);
+            internal::pstoreu(block + 1 * packet_size, p1);
+            internal::pstoreu(block + 2 * packet_size, p2);
+            internal::pstoreu(block + 3 * packet_size, p3);
+
+            block += 4 * packet_size;
+            k += 4 * packet_size;
+          }
+
           for (; d < max_depth; d += packet_size) {
             eigen_assert(k < peeled_k);
             internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
@@ -205,8 +240,26 @@ struct gemm_pack_colmajor_block<
         } else {
           StorageIndex d = start_depth;

-          const StorageIndex vectorized_depth = max_depth - packet_size;
+          const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
+          for (; d <= unrolled_depth; d += 4 * packet_size) {
+            eigen_assert(k < peeled_k);
+
+            Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
+            Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
+            Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
+            Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);
+
+            internal::pstoreu(block + 0 * packet_size, p0);
+            internal::pstoreu(block + 1 * packet_size, p1);
+            internal::pstoreu(block + 2 * packet_size, p2);
+            internal::pstoreu(block + 3 * packet_size, p3);
+
+            block += 4 * packet_size;
+            k += 4 * packet_size;
+          }
+
+          const StorageIndex vectorized_depth = max_depth - packet_size;
           for (; d <= vectorized_depth; d += packet_size) {
             eigen_assert(k < peeled_k);
             internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
@@ -237,7 +290,9 @@ struct gemm_pack_colmajor_block<
           const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

-          const bool pad = pad_col || lm.padRow(r);
+          const bool pad = has_padding && (pad_col || lm.padRow(r));
+          eigen_assert(has_padding || !lm.padRow(r));
+
           const StorageIndex base_idx = lm.baseIndex(r, c);

           if (patch_depth_is_multiple_of_packet_size) {
@@ -248,7 +303,8 @@ struct gemm_pack_colmajor_block<
             for (; d < max_depth; d += packet_size) {
               eigen_assert(k < peeled_k);
-              const Packet p = pad ? pset1<Packet>(Scalar(0))
+              const Packet p = (has_padding && pad)
+                                   ? pset1<Packet>(Scalar(0))
                                    : rhs.packetNoPadding(d, base_idx);
               internal::pstoreu(block, p);
               block += packet_size;
@@ -256,11 +312,13 @@ struct gemm_pack_colmajor_block<
           } else {
-            const StorageIndex vectorized_depth = max_depth - packet_size;
             StorageIndex d = start_depth;
+
+            const StorageIndex vectorized_depth = max_depth - packet_size;
             for (; d <= vectorized_depth; d += packet_size) {
               eigen_assert(k < peeled_k);
-              const Packet p = pad ? pset1<Packet>(Scalar(0))
+              const Packet p = (has_padding && pad)
+                                   ? pset1<Packet>(Scalar(0))
                                    : rhs.packetNoPadding(d, base_idx);
               internal::pstoreu(block, p);
               block += packet_size;
@@ -269,7 +327,7 @@ struct gemm_pack_colmajor_block<
             eigen_assert(k <= peeled_k);

             const Index num_coeffs = CoeffFinalizer::finalize(
-                block, rhs, base_idx, d, max_depth, pad);
+                block, rhs, base_idx, d, max_depth, has_padding && pad);

             k += num_coeffs;
             block += num_coeffs;
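
The two depth-loop hunks above add an identical 4x unrolled main loop in front of the original one-packet loop. A scalar sketch of the same shape, with plain floats instead of Eigen packets and copy4 as a hypothetical stand-in for the packetNoPadding()/pstoreu() pairs:

#include <cassert>
#include <cstddef>

void copy4(const float* src, float* dst, std::size_t n) {
  std::size_t i = 0;
  // Main loop: four independent loads, then four stores, per iteration,
  // mirroring the p0..p3 packets in the hunks above.
  for (; i + 4 <= n; i += 4) {
    float p0 = src[i + 0];
    float p1 = src[i + 1];
    float p2 = src[i + 2];
    float p3 = src[i + 3];
    dst[i + 0] = p0;
    dst[i + 1] = p1;
    dst[i + 2] = p2;
    dst[i + 3] = p3;
  }
  // Remainder: the pre-existing one-element-at-a-time loop.
  for (; i < n; ++i) dst[i] = src[i];
}

int main() {
  float src[11], dst[11];
  for (std::size_t i = 0; i < 11; ++i) src[i] = static_cast<float>(i);
  copy4(src, dst, 11);  // two unrolled iterations plus three remainder steps
  for (std::size_t i = 0; i < 11; ++i) assert(dst[i] == src[i]);
  return 0;
}

Issuing four independent loads before the stores exposes more instruction-level parallelism than the strictly alternating load/store of the one-packet loop.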
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index e43fd7ed4b1..5c9d6946928 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -1382,6 +1382,7 @@ static void PackRhsHelper(int iters,
                           int input_depth,
                           /* Filter (kernel) dimensions: */
                           int filter_count, int filter_cols, int filter_rows,
+                          Eigen::PaddingType padding,
                           /* Input strides: */
                           int col_strides, int row_strides,
                           /* Patch inflate strides: */
@@ -1489,14 +1490,27 @@ static void PackRhsHelper(int iters,
             row_strides, col_strides,                            //
             /*in_row_strides=*/1, /*in_col_strides=*/1,          //
             patch_row_inflate_stride, patch_col_inflate_stride,  //
-            Eigen::PADDING_SAME, /*padding_value=*/0.0);
+            padding, /*padding_value=*/0.0);

     // 2. Reshape extracted patches into "virtual" 2d tensor.
-    // NOTE: This is valid for PADDING_SAME only.
     Index input_rows_eff = (input_rows - 1) * patch_row_inflate_stride + 1;
     Index input_cols_eff = (input_cols - 1) * patch_col_inflate_stride + 1;
-    Index output_rows = input_rows_eff / row_strides;
-    Index output_cols = input_cols_eff / col_strides;
+
+    Index output_rows = 0;
+    Index output_cols = 0;
+
+    if (padding == Eigen::PADDING_SAME) {
+      output_rows = input_rows_eff / row_strides;
+      output_cols = input_cols_eff / col_strides;
+    } else if (padding == Eigen::PADDING_VALID) {
+      output_rows =
+          numext::ceil((input_rows_eff - filter_rows + 1.f) / row_strides);
+      output_cols =
+          numext::ceil((input_cols_eff - filter_cols + 1.f) / col_strides);
+    } else {
+      eigen_assert(false && "not supported");
+    }
+
     NewDimension reshape_dims;
     reshape_dims[0] = input_depth * filter_rows * filter_cols;    // patch size
     reshape_dims[1] = output_rows * output_cols * input_batches;  // num_patches
@@ -1561,7 +1575,7 @@ static void PackRhsHelper(int iters,
   tensorflow::testing::SetLabel(
       absl::StrCat("patch: ", patch_rows, "x", patch_cols, " D", patch_depth,
                    "; num_patches=", num_patches, " patch_size=", patch_size,
-                   " num_inputs=", num_inputs));
+                   " num_inputs=", num_inputs, " padding=", padding));
 }

 template <typename T>
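
The helper now derives the expected output shape from the padding mode. Restated below as a self-contained function with simple types (and with the fix applied in this rewrite: output_cols must divide by col_strides, not row_strides), checked against the 64x64 / 5x5 / stride-2 configuration used by the benchmarks that follow:

#include <cassert>
#include <cmath>

enum Padding { kSame, kValid };

struct OutDims {
  long rows, cols;
};

OutDims output_dims(long rows, long cols, long filter_rows, long filter_cols,
                    long row_strides, long col_strides, Padding padding) {
  if (padding == kSame) return {rows / row_strides, cols / col_strides};
  // kValid: one output per full filter placement.
  return {static_cast<long>(
              std::ceil((rows - filter_rows + 1.f) / row_strides)),
          static_cast<long>(
              std::ceil((cols - filter_cols + 1.f) / col_strides))};
}

int main() {
  // 64x64 input, 5x5 filter, stride 2: SAME -> 32x32, VALID -> 30x30.
  const OutDims same = output_dims(64, 64, 5, 5, 2, 2, kSame);
  assert(same.rows == 32 && same.cols == 32);
  const OutDims valid = output_dims(64, 64, 5, 5, 2, 2, kValid);
  assert(valid.rows == 30 && valid.cols == 30);
  return 0;
}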
@@ -1755,24 +1769,24 @@ static void PackLhsHelper(int iters,

 #define BM_CONCAT(a, b) a##b

-#define BM_RHS_NAME(prefix, T, N, H, W, C, FC, FH, FW, SH, SW, ISH, ISW, BR, \
-                    BC)                                                      \
-  BM_CONCAT(                                                                 \
-      BM_##prefix##_##T##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW,    \
-      _s##SH##x##SW##_is##ISH##x##ISW##_B##BR##x##BC)
+#define BM_RHS_NAME(prefix, T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, ISW, \
+                    BR, BC)                                                   \
+  BM_CONCAT(                                                                  \
+      BM_##prefix##_##T##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW,     \
+      _##PAD##_s##SH##x##SW##_is##ISH##x##ISW##_B##BR##x##BC)

-#define BM_PackRhs(T, N, H, W, C, FC, FH, FW, SH, SW, ISH, ISW, BR, BC)        \
-  static void BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, SH, SW, ISH,     \
-                          ISW, BR, BC)(int iters) {                            \
-    PackRhsHelper<T>(iters, N, H, W, C, FC, FH, FW, SH, SW, ISH, ISW, BR, BC); \
-  }                                                                            \
-  BENCHMARK(BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, SH, SW, ISH,       \
-                        ISW, BR, BC))
+#define BM_PackRhs(T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, ISW, BR, BC)  \
+  static void BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, PAD, SH, SW,    \
+                          ISH, ISW, BR, BC)(int iters) {                      \
+    PackRhsHelper<T>(iters, N, H, W, C, FC, FH, FW, PADDING_##PAD, SH, SW,    \
+                     ISH, ISW, BR, BC);                                       \
+  }                                                                           \
+  BENCHMARK(BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, \
+                        ISW, BR, BC))

 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
-// NOTE: This is the most common case in Tensorflow models.
 // Fast path: input channel dimension is the multiple of the packet size.
 BM_PackRhs(/*type*/ float,                 //
            /*batch*/ 32,                   //
@@ -1780,6 +1794,7 @@ BM_PackRhs(/*type*/ float,                 //
            /*image*/ 64, 64,               //
            /*channels*/ 32,                //
            /*num_filters*/ 64,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ VALID,              //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
@@ -1790,6 +1805,29 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 32,                //
            /*num_filters*/ 64,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
+           /*stride*/ 1, 1,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 32,                //
+           /*num_filters*/ 64,             //
+           /*filter*/ 5, 5,                //
+           /*padding*/ VALID,              //
+           /*stride*/ 2, 2,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 32,                //
+           /*num_filters*/ 64,             //
+           /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
            /*stride*/ 2, 2,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
@@ -1801,6 +1839,7 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 30,                //
            /*num_filters*/ 64,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
@@ -1811,6 +1850,29 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 30,                //
            /*num_filters*/ 64,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ VALID,              //
+           /*stride*/ 1, 1,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 30,                //
+           /*num_filters*/ 64,             //
+           /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
+           /*stride*/ 2, 2,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 30,                //
+           /*num_filters*/ 64,             //
+           /*filter*/ 5, 5,                //
+           /*padding*/ VALID,              //
            /*stride*/ 2, 2,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
@@ -1822,6 +1884,7 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 4,                 //
            /*num_filters*/ 16,             //
            /*filter*/ 8, 8,                //
+           /*padding*/ SAME,               //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
@@ -1832,6 +1895,29 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 4,                 //
            /*num_filters*/ 16,             //
            /*filter*/ 8, 8,                //
+           /*padding*/ VALID,              //
+           /*stride*/ 1, 1,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 256, 256,             //
+           /*channels*/ 4,                 //
+           /*num_filters*/ 16,             //
+           /*filter*/ 8, 8,                //
+           /*padding*/ SAME,               //
+           /*stride*/ 2, 4,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 256, 256,             //
+           /*channels*/ 4,                 //
+           /*num_filters*/ 16,             //
+           /*filter*/ 8, 8,                //
+           /*padding*/ VALID,              //
            /*stride*/ 2, 4,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
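
For readers unfamiliar with the two-level concat trick in BM_CONCAT/BM_RHS_NAME above, here is a stripped-down, compilable illustration of how the new PAD argument gets pasted into the generated benchmark name; the MY_* macros are simplified stand-ins invented for this note:

#include <cstdio>

#define MY_CONCAT(a, b) a##b
#define MY_NAME(prefix, PAD, SH, SW) MY_CONCAT(prefix##_##PAD, _s##SH##x##SW)

// Declares: void BM_Pack_VALID_s2x2()
void MY_NAME(BM_Pack, VALID, 2, 2)() { std::puts("BM_Pack_VALID_s2x2"); }

int main() {
  BM_Pack_VALID_s2x2();  // the identifier the macros pasted together
  return 0;
}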
@@ -1843,6 +1929,19 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 4,                 //
            /*num_filters*/ 16,             //
            /*filter*/ 3, 3,                //
+           /*padding*/ SAME,               //
+           /*stride*/ 1, 1,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 36, 432);
+
+// Short and wide block with small input channel dimension.
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 4,                 //
+           /*num_filters*/ 16,             //
+           /*filter*/ 3, 3,                //
+           /*padding*/ VALID,              //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 36, 432);
@@ -1853,16 +1952,41 @@ BM_PackRhs(/*type*/ float,                 //
            /*channels*/ 4,                 //
            /*num_filters*/ 16,             //
            /*filter*/ 3, 3,                //
+           /*padding*/ SAME,               //
            /*stride*/ 2, 2,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 36, 432);

+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 64, 64,               //
+           /*channels*/ 4,                 //
+           /*num_filters*/ 16,             //
+           /*filter*/ 3, 3,                //
+           /*padding*/ VALID,              //
+           /*stride*/ 2, 2,                //
+           /*patch inflate stride*/ 1, 1,  //
+           /*block*/ 36, 432);
+
+// Non standard patches with inflated strides.
+BM_PackRhs(/*type*/ float,                 //
+           /*batch*/ 32,                   //
+           /*image*/ 32, 32,               //
+           /*channels*/ 96,                //
+           /*num_filters*/ 96,             //
+           /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
+           /*stride*/ 1, 1,                //
+           /*patch inflate stride*/ 2, 2,  //
+           /*block*/ 272, 240);
+
 BM_PackRhs(/*type*/ float,                 //
            /*batch*/ 32,                   //
            /*image*/ 32, 32,               //
            /*channels*/ 96,                //
            /*num_filters*/ 96,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ VALID,              //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 2, 2,  //
            /*block*/ 272, 240);
@@ -1875,6 +1999,7 @@ BM_PackRhs(/*type*/ qint8,                 //
            /*channels*/ 32,                //
            /*num_filters*/ 64,             //
            /*filter*/ 5, 5,                //
+           /*padding*/ SAME,               //
            /*stride*/ 1, 1,                //
            /*patch inflate stride*/ 1, 1,  //
            /*block*/ 256, 56);
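
The inflated-stride benchmarks above exercise the (n - 1) * stride + 1 effective-input-size formula from PackRhsHelper; a quick standalone check of that arithmetic:

#include <cassert>

// Dilating a length-n input with inflate stride s yields (n - 1) * s + 1
// samples, the `input_rows_eff` / `input_cols_eff` expressions above.
long inflated_size(long n, long s) { return (n - 1) * s + 1; }

int main() {
  assert(inflated_size(32, 2) == 63);  // 32x32 image, inflate stride 2x2
  assert(inflated_size(64, 1) == 64);  // inflate stride 1 is a no-op
  return 0;
}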