Merge pull request #30138 from anuj-rawat:eigen_cuboid_convolution_gemm_pack_rhs
PiperOrigin-RevId: 256426023
Commit: ddf01f1df0
tensorflow/core/kernels/BUILD
@@ -820,6 +820,7 @@ cc_library(
     ],
     deps = [
         ":eigen_contraction_kernel",
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -839,6 +840,7 @@ cc_library(
         "eigen_volume_patch.h",
     ],
     deps = [
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -849,6 +851,16 @@ cc_library(
     hdrs = [
         "eigen_spatial_convolutions-inl.h",
     ],
+    deps = [
+        ":eigen_convolution_helpers",
+    ],
+)
+
+cc_library(
+    name = "eigen_convolution_helpers",
+    hdrs = [
+        "eigen_convolution_helpers.h",
+    ],
 )
 
 cc_library(
@@ -5928,6 +5940,7 @@ filegroup(
         "eigen_attention.h",
         "eigen_backward_cuboid_convolutions.h",
         "eigen_backward_spatial_convolutions.h",
+        "eigen_convolution_helpers.h",
         "eigen_cuboid_convolution.h",
         "eigen_pooling.h",
         "eigen_softmax.h",
@@ -6396,6 +6409,7 @@ filegroup(
 )
 
 ANDROID_TEXTUAL_HDRS = [
+    "eigen_convolution_helpers.h",
     "eigen_spatial_convolutions-inl.h",
     "gather_nd_op_cpu_impl.h",
    "gemm_functors.h",
tensorflow/core/kernels/eigen_convolution_helpers.h (new file, 86 lines)
@@ -0,0 +1,86 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+
+namespace Eigen {
+namespace internal {
+
+// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
+// provides `value` that is true if TensorEvaluatorType has `PacketType
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
+// const` and if the PacketType supports masked load.
+//
+// Partial packets are used to:
+//
+// 1) Split the packet over two columns in eigen based spatial convolution and
+// use partial loads for each individual part before combining them to get the
+// required packet. This class is used to pick the correct implementation of
+// loadPacketStandard function.
+//
+// 2) Split the packet over two rows (within the same column) in eigen based
+// cuboid convolution and use partial loads for each individual part before
+// combining them to get the required packet. This class is used to pick the
+// correct implementation of loadPacketStandard function. This usage is similar
+// to the usage in eigen based spatial convolution described above.
+//
+// 3) Finalize packing of columns in gemm_pack_colmajor after processing
+// vectorized part with full packets (see eigen_spatial_convolutions.h).
+template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
+class TensorEvaluatorHasPartialPacket {
+ public:
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(
+      typename std::enable_if<
+          unpacket_traits<PacketT>::masked_load_available &&
+          std::is_same<PacketT,
+                       decltype(std::declval<const TensorEvaluatorT>()
+                                    .template partialPacket<PacketT>(
+                                        std::declval<IndexT>(),
+                                        std::declval<typename unpacket_traits<
+                                            PacketT>::mask_t>()))>::value>::
+          type*) -> std::true_type;
+
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(...) -> std::false_type;
+
+  typedef decltype(
+      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
+          nullptr)) status;
+
+  static const bool value = status::value;
+};
+
+// Compute a mask for loading/storing coefficients in/from a packet in a
+// [from, to) range. If the mask bit is 1, element will be loaded/stored.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+                            typename unpacket_traits<Packet>::mask_t>::type
+    mask(int from, int to) {
+  const Index packet_size = internal::unpacket_traits<Packet>::size;
+  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
+
+  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
+  const Mask mask_max = std::numeric_limits<Mask>::max();
+
+  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
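The XOR of two shifted masks in `mask()` above produces a run of ones covering exactly the lanes [from, to). A minimal standalone sketch of the same bit arithmetic (not Eigen code; the 8-lane packet with a `uint8_t` mask type and the `mask8` name are assumptions for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Same arithmetic as mask<Packet>(from, to), specialized to a hypothetical
// 8-lane packet whose mask type is uint8_t. Bit i set means lane i
// participates in the masked load/store.
uint8_t mask8(int from, int to) {
  const int packet_size = 8;
  assert(0 <= from && from < to && to <= packet_size);
  const uint8_t mask_max = 0xFF;
  // 0xFF >> (8 - to) sets bits [0, to); XOR-ing clears bits [0, from).
  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}

int main() {
  printf("%#04x\n", mask8(2, 5));  // lanes 2..4 -> 0x1c
  printf("%#04x\n", mask8(0, 8));  // full packet -> 0xff
}
```

This is the shape in which the cuboid code below requests masks: `mask<Packet>(span[0], span[1] + 1)` asks for one contiguous sub-range of lanes.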
tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
+
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
 
 namespace Eigen {
 
 namespace internal {
@@ -445,14 +447,151 @@ class TensorContractionInputMapper<
       return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                     otherIndex);
     }
-    return loadPacketStandard(patchId, planeIndex, rowIndex, colIndex,
-                              otherIndex);
+    typedef decltype(m_impl) TensorEvaluatorT;
+    return loadPacketStandard<Packet, TensorEvaluatorT>(
+        patchId, planeIndex, rowIndex, colIndex, otherIndex);
   }
 
+  // Helper function to load a 'partial' packet - this is the single row part of
+  // a packet that is split across two rows (but single column). In the
+  // 'partial' packet, the elements corresponding to the row (specified through
+  // rowOffset) are loaded and the rest of the elements are zero-filled into the
+  // 'partial' packet. This function is called from
+  // loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised
+  // only when the packet type supports masked load and when the partial packet
+  // load is available in the TensorEvaluator.
   EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index planeIndex,
-                                                Index rowIndex, Index colIndex,
-                                                Index otherIndex) const {
+  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
+      Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex,
+      Index patchId, const Index span[], const Index patchOffsets[],
+      Index colOffset, Index rowOffset) const {
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride,
+        patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride};
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols ||
+        inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      // Partial packet is all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    } else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRow * m_rowInputStride +
+                               inputCol * m_colInputStride + otherIndex;
+      return m_impl.template partialPacket<Packet>(
+          inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
+    } else {
+      // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      // 0 : span[0]-1 - Zeros will be loaded for these indices
+      // span[0] : span[1] - Elements will be loaded here for these indices
+      // span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+      const Index packetSize = internal::unpacket_traits<Packet>::size;
+      EIGEN_ALIGN_MAX
+      typename internal::remove_const<Scalar>::type values[packetSize];
+      for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
+      for (int i = span[0]; i < span[1] + 1; ++i)
+        values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex,
+                              colIndex, otherIndex);
+      for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
+      return internal::pload<Packet>(values);
+    }
+  }
+
+  // Helper function to load a packet that is split across two rows (but single
+  // column). If required, this function is called from loadPacketStandard()
+  // when the packet type supports masked load and when the partial packet load
+  // is available in the TensorEvaluator.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0] + 1);
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+    // Packet to load will be split into 2 parts where each part spans a single
+    // row and both the parts span the same column.
+    // First determine where to split.
+    const Index patchIdSplit =
+        (((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) *
+         m_patch_depth) -
+        1;
+    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+    // patchIds[i]: patchId corresponding to partial packet i
+    // spans[i]: Start and end indices corresponding to the elements
+    //           to be loaded for partial packet i
+    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+    const Index patchIds[2] = {patchId, patchIdSplit + 1};
+    const Index spans[2][2] = {{0, patchIdSplit - patchId},
+                               {patchIdSplit - patchId + 1, packetSize - 1}};
+    const Index patchOffsets2Cols[2][2] = {
+        {patchOffsets[0], patchOffsetSplit},
+        {patchOffsetSplit + 1, patchOffsets[1]}};
+
+    // Load partial packets and do bit-wise OR to generate required packet
+    return internal::por<Packet>(
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[0], spans[0], patchOffsets2Cols[0],
+                                  colOffsets[0], rowOffsets[0]),
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[1], spans[1], patchOffsets2Cols[1],
+                                  colOffsets[1], rowOffsets[1]));
+  }
+
+  // Helper function to load a packet that is present in a single column and
+  // row. If required, this function is called from loadPacketStandard().
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[], const Index inputCols[],
+      const Index inputRows[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0]);
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffsets[0] * m_colStride -
+            rowOffsets[0] * m_rowStride,
+        patchOffsets[1] - colOffsets[1] * m_colStride -
+            rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRows[0] * m_rowInputStride +
+                               inputCols[0] * m_colInputStride + otherIndex;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
+  }
+
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is not available
+  // for the TensorEvaluator or if the packet type does not support masked
+  // load.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
     const Index packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(patchId <
@@ -492,27 +631,81 @@ class TensorContractionInputMapper<
         }
 
         if (inputRows[0] == inputRows[1]) {
-          const Index planeOffsets[2] = {
-              patchOffsets[0] - colOffsets[0] * m_colStride -
-                  rowOffsets[0] * m_rowStride,
-              patchOffsets[1] - colOffsets[1] * m_colStride -
-                  rowOffsets[1] * m_rowStride};
-          eigen_assert(planeOffsets[0] <= planeOffsets[1]);
-          const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
-                                        planeIndex + planeOffsets[1]};
-
-          if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
-            return internal::pset1<Packet>(Scalar(0));
+          return loadPacketStandardFromSingleColumnSingleRow(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets, inputCols, inputRows);
+        }
+      }
+    }
+
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
   }
 
-        if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex =
-              depth + inputPlanes[0] * m_planeInputStride +
-              inputRows[0] * m_rowInputStride +
-              inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is available for
+  // the TensorEvaluator and if the packet type supports masked load.
+  // The only difference between this and the other case is that if the packet
+  // to load is split across two rows (but in same column), then in this case
+  // instead of going to the slow (element-by-element) load, we load two packets
+  // - each containing elements from one of the rows (rest of the elements of
+  // the packets are zeroes), and then combine these two packets to generate the
+  // required packet. The idea is to enable fast load (if possible) of these
+  // 'partial' packets.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId <
+                 patchDepth() * patchPlanes() * patchRows() * patchCols());
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, planeIndex, rowIndex, colIndex,
+                            otherIndex);
+    } else {
+      // Offsets and input calculation here are identical to
+      // loadCoeffStandard(...), but repeated twice.
+
+      const Index patchOffsets[2] = {
+          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+
+      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                   patchOffsets[1] / m_fastColStride};
+      eigen_assert(colOffsets[0] <= colOffsets[1]);
+
+      const Index inputCols[2] = {colIndex + colOffsets[0],
+                                  colIndex + colOffsets[1]};
+      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+        return internal::pset1<Packet>(Scalar(0));
+      }
+
+      if (inputCols[0] == inputCols[1]) {
+        const Index rowOffsets[2] = {
+            (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+            (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+        const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                    rowIndex + rowOffsets[1]};
+
+        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+          return internal::pset1<Packet>(Scalar(0));
+        }
+
+        if (inputRows[0] == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnSingleRow(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets, inputCols, inputRows);
+        }
+        if (inputRows[0] + 1 == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnTwoRows(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets);
         }
       }
     }
@@ -760,7 +953,8 @@ class TensorContractionSubMapper<
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
   loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(
+    typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
+    return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
         i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex);
   }
   template <typename Packet>
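The reason the two-rows path above can combine its halves with `internal::por` is that the two partial packets populate disjoint lanes, with zeros everywhere else. A scalar sketch of that combine step (plain C++, not Eigen; `Packet4i`, `PartialLoad`, and `Por` are illustrative stand-ins for the packet type, the masked partial load, and `internal::por`):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Hypothetical 4-lane integer packet.
using Packet4i = std::array<int32_t, 4>;

// Emulates a masked partial load: lanes [from, to) come from src, the rest
// stay zero, as in loadPartialPacketStandard's fast path.
Packet4i PartialLoad(const int32_t* src, int from, int to) {
  Packet4i p = {0, 0, 0, 0};
  for (int i = from; i < to; ++i) p[i] = src[i];
  return p;
}

// Emulates internal::por<Packet>: because the two partial packets fill
// disjoint lanes (zeros elsewhere), bitwise OR merges them losslessly.
Packet4i Por(const Packet4i& a, const Packet4i& b) {
  Packet4i r;
  for (int i = 0; i < 4; ++i) r[i] = a[i] | b[i];
  return r;
}

int main() {
  // A packet that straddles two rows of the same column: lanes 0-1 hold the
  // tail of row r, lanes 2-3 hold the head of row r+1.
  const int32_t row_r[4] = {10, 11, 0, 0};
  const int32_t row_r_plus_1[4] = {0, 0, 20, 21};
  Packet4i combined = Por(PartialLoad(row_r, 0, 2),          // spans[0]
                          PartialLoad(row_r_plus_1, 2, 4));  // spans[1]
  for (int32_t v : combined) printf("%d ", v);  // prints: 10 11 20 21
}
```

With a real SIMD packet both partial loads are single masked instructions, which is the whole point of preferring this path over the element-by-element fallback.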
tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -16,66 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
 #define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
 
 // Note this header is used in both TF and TFLite.
 namespace Eigen {
 
 namespace internal {
 
-// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
-// provides `value` that is true if TensorEvaluatorType has `PacketType
-// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
-// const` and if the PacketType supports masked load.
-//
-// Partial packets are used to:
-//
-// 1) Split the packet over two columns and use partial loads for each
-// individual part before combining them to get the required packet. This
-// class is used to pick the correct implementation of loadPacketStandard
-// function below.
-//
-// 2) Finalize packing of columns in gemm_pack_colmajor after processing
-// vectorized part with full packets (see eigen_spatiual_convolutions.h).
-template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
-class TensorEvaluatorHasPartialPacket {
- public:
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(
-      typename std::enable_if<
-          unpacket_traits<PacketT>::masked_load_available &&
-          std::is_same<PacketT,
-                       decltype(std::declval<const TensorEvaluatorT>()
-                                    .template partialPacket<PacketT>(
-                                        std::declval<IndexT>(),
-                                        std::declval<typename unpacket_traits<
-                                            PacketT>::mask_t>()))>::value>::
-          type*) -> std::true_type;
-
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(...) -> std::false_type;
-
-  typedef decltype(
-      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
-          nullptr)) status;
-
-  static const bool value = status::value;
-};
-
-// Compute a mask for loading/storing coefficients in/from a packet in a
-// [from, to) range. If the mask bit is 1, element will be loaded/stored.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
-                            typename unpacket_traits<Packet>::mask_t>::type
-    mask(int from, int to) {
-  const Index packet_size = internal::unpacket_traits<Packet>::size;
-  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
-
-  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
-  const Mask mask_max = std::numeric_limits<Mask>::max();
-
-  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
-}
-
 // WARNING: Most of the code here implicitly assumes that the matrix is in
 // ColMajor layout. This is guaranteed by the tensor contraction (see
 // TensorContraction.h).
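The block deleted here is the trait that moved into eigen_convolution_helpers.h; its only job is to steer overload resolution between the masked and generic packet loaders. A reduced sketch of that enable_if dispatch pattern (not Eigen code; `FastEvaluator`, `PlainEvaluator`, and the `has_partial_packet` member stand in for the detection that `TensorEvaluatorHasPartialPacket` performs via SFINAE):

```cpp
#include <cstdio>
#include <type_traits>

// Two evaluator types: one advertises partial-packet (masked load) support,
// the other does not. The real trait detects this from the presence of a
// partialPacket() member; here a boolean member keeps the sketch small.
struct FastEvaluator { static constexpr bool has_partial_packet = true; };
struct PlainEvaluator { static constexpr bool has_partial_packet = false; };

// Overload enabled only for evaluators with partial-packet support,
// mirroring the enable_if<Trait::value, PacketT> loadPacketStandard overload.
template <typename Evaluator>
typename std::enable_if<Evaluator::has_partial_packet, int>::type
loadPacket() {
  std::printf("masked partial-packet path\n");
  return 0;
}

// Overload enabled for everyone else, mirroring enable_if<!Trait::value, ...>.
template <typename Evaluator>
typename std::enable_if<!Evaluator::has_partial_packet, int>::type
loadPacket() {
  std::printf("generic path\n");
  return 1;
}

int main() {
  loadPacket<FastEvaluator>();   // masked partial-packet path
  loadPacket<PlainEvaluator>();  // generic path
}
```

Because the two enable_if conditions are mutually exclusive, exactly one overload survives substitution for any evaluator, so the choice is made entirely at compile time with no runtime branch.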