diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index dc5ac9dc84e..945ce69ab30 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -820,6 +820,7 @@ cc_library(
     ],
     deps = [
         ":eigen_contraction_kernel",
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -839,6 +840,7 @@ cc_library(
         "eigen_volume_patch.h",
     ],
     deps = [
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -849,6 +851,16 @@ cc_library(
     hdrs = [
         "eigen_spatial_convolutions-inl.h",
     ],
+    deps = [
+        ":eigen_convolution_helpers",
+    ],
+)
+
+cc_library(
+    name = "eigen_convolution_helpers",
+    hdrs = [
+        "eigen_convolution_helpers.h",
+    ],
 )
 
 cc_library(
@@ -5928,6 +5940,7 @@ filegroup(
         "eigen_attention.h",
         "eigen_backward_cuboid_convolutions.h",
         "eigen_backward_spatial_convolutions.h",
+        "eigen_convolution_helpers.h",
         "eigen_cuboid_convolution.h",
         "eigen_pooling.h",
         "eigen_softmax.h",
@@ -6396,6 +6409,7 @@ filegroup(
 )
 
 ANDROID_TEXTUAL_HDRS = [
+    "eigen_convolution_helpers.h",
     "eigen_spatial_convolutions-inl.h",
     "gather_nd_op_cpu_impl.h",
     "gemm_functors.h",
diff --git a/tensorflow/core/kernels/eigen_convolution_helpers.h b/tensorflow/core/kernels/eigen_convolution_helpers.h
new file mode 100644
index 00000000000..bfb6092199a
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_convolution_helpers.h
@@ -0,0 +1,86 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+
+namespace Eigen {
+namespace internal {
+
+// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
+// provides `value` that is true if TensorEvaluatorType has `PacketType
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
+// const` and if the PacketType supports masked load.
+//
+// Partial packets are used to:
+//
+// 1) Split the packet over two columns in eigen based spatial convolution and
+// use partial loads for each individual part before combining them to get the
+// required packet. This class is used to pick the correct implementation of
+// the loadPacketStandard function.
+//
+// 2) Split the packet over two rows (within the same column) in eigen based
+// cuboid convolution and use partial loads for each individual part before
+// combining them to get the required packet. This class is used to pick the
+// correct implementation of the loadPacketStandard function. This usage is
+// similar to the usage in eigen based spatial convolution described above.
+//
+// 3) Finalize packing of columns in gemm_pack_colmajor after processing
+// vectorized part with full packets (see eigen_spatial_convolutions.h).
+template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
+class TensorEvaluatorHasPartialPacket {
+ public:
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(
+      typename std::enable_if<
+          unpacket_traits<PacketT>::masked_load_available &&
+          std::is_same<PacketT,
+                       decltype(std::declval<const TensorEvaluatorT>()
+                                    .template partialPacket<PacketT>(
+                                        std::declval<IndexT>(),
+                                        std::declval<typename unpacket_traits<
+                                            PacketT>::mask_t>()))>::value>::
+          type*) -> std::true_type;
+
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(...) -> std::false_type;
+
+  typedef decltype(
+      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
+          nullptr)) status;
+
+  static const bool value = status::value;
+};
+
+// Compute a mask for loading/storing coefficients in/from a packet in a
+// [from, to) range. If the mask bit is 1, element will be loaded/stored.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+                            typename unpacket_traits<Packet>::mask_t>::type
+    mask(int from, int to) {
+  const Index packet_size = internal::unpacket_traits<Packet>::size;
+  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
+
+  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
+  const Mask mask_max = std::numeric_limits<Mask>::max();
+
+  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
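To make the bit trick in mask() concrete: right-shifting an all-ones mask by (packet_size - to) leaves the low `to` bits set, and XOR-ing that with the low `from` bits clears the prefix, so exactly bits [from, to) survive. Below is a small standalone check of that arithmetic, assuming an 8-lane packet whose mask_t is uint8_t (as with AVX float packets); mask8 is a hypothetical stand-in for the templated mask<Packet>(from, to), not part of the patch.

#include <cassert>
#include <cstdint>

// Re-derivation of the mask() formula above for packet_size == 8.
static uint8_t mask8(int from, int to) {
  const int packet_size = 8;
  assert(0 <= from && to <= packet_size + 1 && from < to);
  const uint8_t mask_max = 0xFF;  // std::numeric_limits<uint8_t>::max()
  // Low `to` bits set, XORed with low `from` bits set: bits [from, to) remain.
  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}

int main() {
  assert(mask8(2, 5) == 0x1C);  // 0b00011100: lanes 2..4 loaded, rest zeroed
  assert(mask8(0, 8) == 0xFF);  // full packet
  assert(mask8(7, 8) == 0x80);  // last lane only
  return 0;
}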
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index a0b3c101eba..fc65f9ef4c8 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
 
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
+
 namespace Eigen {
 namespace internal {
 
@@ -445,14 +447,151 @@ class TensorContractionInputMapper<
       return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                     otherIndex);
     }
-    return loadPacketStandard(patchId, planeIndex, rowIndex, colIndex,
-                              otherIndex);
+    typedef decltype(m_impl) TensorEvaluatorT;
+    return loadPacketStandard<Packet, TensorEvaluatorT>(
+        patchId, planeIndex, rowIndex, colIndex, otherIndex);
   }
 
+  // Helper function to load a 'partial' packet - this is the single row part
+  // of a packet that is split across two rows (but single column). In the
+  // 'partial' packet, the elements corresponding to the row (specified through
+  // rowOffset) are loaded and the rest of the elements are zero-filled into
+  // the 'partial' packet. This function is called from
+  // loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised
+  // only when the packet type supports masked load and when the partial packet
+  // load is available in the TensorEvaluator.
   EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index planeIndex,
-                                                Index rowIndex, Index colIndex,
-                                                Index otherIndex) const {
+  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
+      Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex,
+      Index patchId, const Index span[], const Index patchOffsets[],
+      Index colOffset, Index rowOffset) const {
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride,
+        patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride};
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols ||
+        inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      // Partial packet is all zeros.
+      return internal::pset1<Packet>(Scalar(0));
+    } else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRow * m_rowInputStride +
+                               inputCol * m_colInputStride + otherIndex;
+      return m_impl.template partialPacket<Packet>(
+          inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
+    } else {
+      // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      //   0 : span[0]-1            - Zeros will be loaded for these indices
+      //   span[0] : span[1]        - Elements will be loaded here for these
+      //                              indices
+      //   span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+      const Index packetSize = internal::unpacket_traits<Packet>::size;
+      EIGEN_ALIGN_MAX
+      typename internal::remove_const<Scalar>::type values[packetSize];
+      for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
+      for (int i = span[0]; i < span[1] + 1; ++i)
+        values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex,
+                              colIndex, otherIndex);
+      for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
+      return internal::pload<Packet>(values);
+    }
+  }
+
+  // Helper function to load a packet that is split across two rows (but single
+  // column). If required, this function is called from loadPacketStandard()
+  // when the packet type supports masked load and when the partial packet load
+  // is available in the TensorEvaluator.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0] + 1);
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+    // Packet to load will be split into 2 parts where each part spans a single
+    // row and both the parts span the same column.
+    // First determine where to split.
+    const Index patchIdSplit =
+        (((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) *
+         m_patch_depth) -
+        1;
+    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+    // patchIds[i]:          patchId corresponding to partial packet i
+    // spans[i]:             Start and end indices corresponding to the
+    //                       elements to be loaded for partial packet i
+    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+    const Index patchIds[2] = {patchId, patchIdSplit + 1};
+    const Index spans[2][2] = {{0, patchIdSplit - patchId},
+                               {patchIdSplit - patchId + 1, packetSize - 1}};
+    const Index patchOffsets2Cols[2][2] = {
+        {patchOffsets[0], patchOffsetSplit},
+        {patchOffsetSplit + 1, patchOffsets[1]}};
+
+    // Load partial packets and do bit-wise OR to generate the required packet.
+    return internal::por(
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[0], spans[0], patchOffsets2Cols[0],
+                                  colOffsets[0], rowOffsets[0]),
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[1], spans[1], patchOffsets2Cols[1],
+                                  colOffsets[1], rowOffsets[1]));
+  }
+
+  // Helper function to load a packet that is present in a single column and
+  // row. If required, this function is called from loadPacketStandard().
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[], const Index inputCols[],
+      const Index inputRows[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0]);
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffsets[0] * m_colStride -
+            rowOffsets[0] * m_rowStride,
+        patchOffsets[1] - colOffsets[1] * m_colStride -
+            rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRows[0] * m_rowInputStride +
+                               inputCols[0] * m_colInputStride + otherIndex;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
+  }
+
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is not available
+  // for the TensorEvaluator or if the packet type does not support masked
+  // load.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
     const Index packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(patchId <
@@ -492,27 +631,81 @@ class TensorContractionInputMapper<
       }
 
       if (inputRows[0] == inputRows[1]) {
-        const Index planeOffsets[2] = {
-            patchOffsets[0] - colOffsets[0] * m_colStride -
-                rowOffsets[0] * m_rowStride,
-            patchOffsets[1] - colOffsets[1] * m_colStride -
-                rowOffsets[1] * m_rowStride};
-        eigen_assert(planeOffsets[0] <= planeOffsets[1]);
-        const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
-                                      planeIndex + planeOffsets[1]};
+        return loadPacketStandardFromSingleColumnSingleRow(
+            patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+            colOffsets, rowOffsets, inputCols, inputRows);
+      }
+    }
+  }
 
-        if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
-          return internal::pset1<Packet>(Scalar(0));
-        }
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
+  }
 
-        if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex =
-              depth + inputPlanes[0] * m_planeInputStride +
-              inputRows[0] * m_rowInputStride +
-              inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is available for
+  // the TensorEvaluator and if the packet type supports masked load.
+  // The only difference between this and the other case is that if the packet
+  // to load is split across two rows (but in same column), then in this case
+  // instead of going to the slow (element-by-element) load, we load two
+  // packets - each containing elements from one of the rows (rest of the
+  // elements of the packets are zeroes), and then combine these two packets to
+  // generate the required packet. The idea is to enable fast load (if
+  // possible) of these 'partial' packets.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<PacketT>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId <
+                 patchDepth() * patchPlanes() * patchRows() * patchCols());
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, planeIndex, rowIndex, colIndex,
+                            otherIndex);
+    } else {
+      // Offsets and input calculation here are identical to
+      // loadCoeffStandard(...), but repeated twice.
+      const Index patchOffsets[2] = {
+          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+
+      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                   patchOffsets[1] / m_fastColStride};
+      eigen_assert(colOffsets[0] <= colOffsets[1]);
+
+      const Index inputCols[2] = {colIndex + colOffsets[0],
+                                  colIndex + colOffsets[1]};
+      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+        return internal::pset1<PacketT>(Scalar(0));
+      }
+
+      if (inputCols[0] == inputCols[1]) {
+        const Index rowOffsets[2] = {
+            (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+            (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+        const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                    rowIndex + rowOffsets[1]};
+
+        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+          return internal::pset1<PacketT>(Scalar(0));
+        }
+
+        if (inputRows[0] == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnSingleRow(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets, inputCols, inputRows);
+        }
+        if (inputRows[0] + 1 == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnTwoRows(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets);
+        }
       }
     }
@@ -760,7 +953,8 @@ class TensorContractionSubMapper<
   }
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(
+    typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
+    return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
         i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex);
   }
   template <typename Packet>
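The two loadPacketStandard overloads in the hunks above are selected purely at compile time: their enable_if conditions are mutually exclusive, so for any (PacketT, TensorEvaluatorT) pair exactly one overload participates in overload resolution, and call sites such as TensorContractionSubMapper::loadPacketStandard need no runtime branch. A minimal standalone analogue of this dispatch pattern (all names below are hypothetical, not part of the patch):

#include <type_traits>

// Hypothetical trait standing in for TensorEvaluatorHasPartialPacket.
template <typename Evaluator>
struct has_partial_packet : std::false_type {};

struct MaskedEvaluator {};  // pretend this evaluator provides partialPacket()
template <>
struct has_partial_packet<MaskedEvaluator> : std::true_type {};

// Fallback overload: viable only when the trait is false.
template <typename Evaluator>
typename std::enable_if<!has_partial_packet<Evaluator>::value, int>::type
load_packet_standard() {
  return 0;  // element-by-element load for packets split across rows
}

// Fast-path overload: viable only when the trait is true.
template <typename Evaluator>
typename std::enable_if<has_partial_packet<Evaluator>::value, int>::type
load_packet_standard() {
  return 1;  // two masked partial loads combined with por()
}

struct PlainEvaluator {};

int main() {
  // Overload resolution, not a runtime branch, picks the implementation.
  return load_packet_standard<MaskedEvaluator>() +
         load_packet_standard<PlainEvaluator>() - 1;  // returns 0
}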
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
index bb47fa1be78..c84d7f0bafc 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -16,66 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
 #define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
 
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
+
 // Note this header is used in both TF and TFLite.
 namespace Eigen {
 namespace internal {
 
-// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
-// provides `value` that is true if TensorEvaluatorType has `PacketType
-// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
-// const` and if the PacketType supports masked load.
-//
-// Partial packets are used to:
-//
-// 1) Split the packet over two columns and use partial loads for each
-// individual part before combining them to get the required packet. This
-// class is used to pick the correct implementation of loadPacketStandard
-// function below.
-//
-// 2) Finalize packing of columns in gemm_pack_colmajor after processing
-// vectorized part with full packets (see eigen_spatiual_convolutions.h).
-template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
-class TensorEvaluatorHasPartialPacket {
- public:
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(
-      typename std::enable_if<
-          unpacket_traits<PacketT>::masked_load_available &&
-          std::is_same<PacketT,
-                       decltype(std::declval<const TensorEvaluatorT>()
-                                    .template partialPacket<PacketT>(
-                                        std::declval<IndexT>(),
-                                        std::declval<typename unpacket_traits<
-                                            PacketT>::mask_t>()))>::value>::
-          type*) -> std::true_type;
-
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(...)
-      -> std::false_type;
-
-  typedef decltype(
-      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
-          nullptr)) status;
-
-  static const bool value = status::value;
-};
-
-// Compute a mask for loading/storing coefficients in/from a packet in a
-// [from, to) range. If the mask bit is 1, element will be loaded/stored.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
-                            typename unpacket_traits<Packet>::mask_t>::type
-    mask(int from, int to) {
-  const Index packet_size = internal::unpacket_traits<Packet>::size;
-  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
-
-  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
-  const Mask mask_max = std::numeric_limits<Mask>::max();
-
-  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
-}
-
 // WARNING: Most of the code here implicitly assumes that the matrix is in
 // ColMajor layout. This is guaranteed by the tensor contraction (see
 // TensorContraction.h).
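For intuition about the row-split path added above: each loadPartialPacketStandard call fills only its span and zero-fills the remaining lanes, so internal::por (a lane-wise bitwise OR) reassembles the packet that a single row-crossing load would have produced. A scalar sketch of that recombination, with made-up coefficient values (only the span bookkeeping mirrors the patch):

#include <array>
#include <cassert>

int main() {
  constexpr int kPacketSize = 8;
  // Say patchIdSplit lands 4 elements into the packet, so the spans (as in
  // the real code) are {0, 4} for the first row's partial packet and
  // {5, kPacketSize - 1} for the second row's.
  const std::array<int, kPacketSize> first_row = {1, 2, 3, 4, 5, 0, 0, 0};
  const std::array<int, kPacketSize> second_row = {0, 0, 0, 0, 0, 6, 7, 8};

  // Because the non-zero lanes of the two partial packets are disjoint,
  // OR-ing them simply concatenates the two loads.
  std::array<int, kPacketSize> packet{};
  for (int i = 0; i < kPacketSize; ++i) {
    packet[i] = first_row[i] | second_row[i];
  }
  for (int i = 0; i < kPacketSize; ++i) assert(packet[i] == i + 1);
  return 0;
}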