Merge pull request #30138 from anuj-rawat:eigen_cuboid_convolution_gemm_pack_rhs

PiperOrigin-RevId: 256426023
TensorFlower Gardener 2019-07-03 17:10:09 -07:00
commit ddf01f1df0
4 changed files with 321 additions and 80 deletions

tensorflow/core/kernels/BUILD

@@ -820,6 +820,7 @@ cc_library(
],
deps = [
":eigen_contraction_kernel",
":eigen_convolution_helpers",
":eigen_spatial_convolutions-inl",
"//third_party/eigen3",
],
@@ -839,6 +840,7 @@ cc_library(
"eigen_volume_patch.h",
],
deps = [
":eigen_convolution_helpers",
":eigen_spatial_convolutions-inl",
"//third_party/eigen3",
],
@@ -849,6 +851,16 @@ cc_library(
hdrs = [
"eigen_spatial_convolutions-inl.h",
],
deps = [
":eigen_convolution_helpers",
],
)
cc_library(
name = "eigen_convolution_helpers",
hdrs = [
"eigen_convolution_helpers.h",
],
)
cc_library(
@@ -5928,6 +5940,7 @@ filegroup(
"eigen_attention.h",
"eigen_backward_cuboid_convolutions.h",
"eigen_backward_spatial_convolutions.h",
"eigen_convolution_helpers.h",
"eigen_cuboid_convolution.h",
"eigen_pooling.h",
"eigen_softmax.h",
@@ -6396,6 +6409,7 @@ filegroup(
)
ANDROID_TEXTUAL_HDRS = [
"eigen_convolution_helpers.h",
"eigen_spatial_convolutions-inl.h",
"gather_nd_op_cpu_impl.h",
"gemm_functors.h",

tensorflow/core/kernels/eigen_convolution_helpers.h

@@ -0,0 +1,86 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
namespace Eigen {
namespace internal {
// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
// provides `value` that is true if TensorEvaluatorType has `PacketType
// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
// const` and if the PacketType supports masked load.
//
// Partial packets are used to:
//
// 1) Split the packet over two columns in Eigen-based spatial convolution and
// use partial loads for each individual part before combining them to get the
// required packet. This class is used to pick the correct implementation of
// the loadPacketStandard function.
//
// 2) Split the packet over two rows (within the same column) in Eigen-based
// cuboid convolution and use partial loads for each individual part before
// combining them to get the required packet. This class is used to pick the
// correct implementation of the loadPacketStandard function. This usage is
// similar to the usage in Eigen-based spatial convolution described above.
//
// 3) Finalize packing of columns in gemm_pack_colmajor after processing the
// vectorized part with full packets (see eigen_spatial_convolutions.h).
template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
class TensorEvaluatorHasPartialPacket {
public:
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(
typename std::enable_if<
unpacket_traits<PacketT>::masked_load_available &&
std::is_same<PacketT,
decltype(std::declval<const TensorEvaluatorT>()
.template partialPacket<PacketT>(
std::declval<IndexT>(),
std::declval<typename unpacket_traits<
PacketT>::mask_t>()))>::value>::
type*) -> std::true_type;
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(...) -> std::false_type;
typedef decltype(
functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
nullptr)) status;
static const bool value = status::value;
};
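
For intuition, the trait is consumed through std::enable_if overload selection, which is exactly how the two loadPacketStandard overloads later in this commit are picked. A minimal sketch with hypothetical names (loadClamped is not part of the commit); it relies on the mask() helper defined just below:

// Illustration only: hypothetical dispatch gated on the trait above.
template <typename PacketT, typename EvalT>
typename std::enable_if<
    TensorEvaluatorHasPartialPacket<EvalT, PacketT, Index>::value,
    PacketT>::type
loadClamped(const EvalT& eval, Index index, int from, int to) {
  // Masked fast path: only lanes in [from, to) are read from memory.
  return eval.template partialPacket<PacketT>(index, mask<PacketT>(from, to));
}

template <typename PacketT, typename EvalT>
typename std::enable_if<
    !TensorEvaluatorHasPartialPacket<EvalT, PacketT, Index>::value,
    PacketT>::type
loadClamped(const EvalT& eval, Index index, int /*from*/, int /*to*/) {
  // No partial-packet support: plain full-width unaligned load.
  return eval.template packet<Unaligned>(index);
}
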
// Compute a mask for loading/storing coefficients in/from a packet in a
// [from, to) range. If the mask bit is 1, the element will be loaded/stored.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
typename unpacket_traits<Packet>::mask_t>::type
mask(int from, int to) {
const Index packet_size = internal::unpacket_traits<Packet>::size;
eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
using Mask = typename internal::unpacket_traits<Packet>::mask_t;
const Mask mask_max = std::numeric_limits<Mask>::max();
return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}
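
As a worked example of the bit arithmetic, assume a hypothetical 8-lane packet whose mask_t is uint8_t; then mask(2, 5) enables exactly lanes 2 through 4:

#include <cassert>
#include <cstdint>

// Standalone illustration of the arithmetic in mask() above, assuming
// packet_size == 8 and a uint8_t mask type (hypothetical values).
std::uint8_t mask8(int from, int to) {  // enables lanes in [from, to)
  const int packet_size = 8;
  const std::uint8_t mask_max = 0xFF;
  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}

int main() {
  // (0xFF >> 3) ^ (0xFF >> 6) = 0b00011111 ^ 0b00000011 = 0b00011100,
  // i.e. exactly lanes 2, 3 and 4: the half-open range [2, 5).
  assert(mask8(2, 5) == 0b00011100);
  assert(mask8(0, 8) == 0xFF);  // full packet: every lane enabled
  return 0;
}
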
} // namespace internal
} // namespace Eigen
#endif // TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_

tensorflow/core/kernels/eigen_cuboid_convolution.h

@@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
#endif
#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
namespace Eigen {
namespace internal {
@@ -445,14 +447,151 @@ class TensorContractionInputMapper<
return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
otherIndex);
}
typedef decltype(m_impl) TensorEvaluatorT;
return loadPacketStandard<Packet, TensorEvaluatorT>(
patchId, planeIndex, rowIndex, colIndex, otherIndex);
}
// Helper function to load a 'partial' packet - this is the single row part of
// a packet that is split across two rows (but a single column). In the
// 'partial' packet, the elements corresponding to the row (specified through
// rowOffset) are loaded and the rest of the elements are zero-filled into the
// 'partial' packet. This function is called from
// loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised
// only when the packet type supports masked load and when the partial packet
// load is available in the TensorEvaluator.
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex,
Index patchId, const Index span[], const Index patchOffsets[],
Index colOffset, Index rowOffset) const {
const Index inputCol = colIndex + colOffset;
const Index inputRow = rowIndex + rowOffset;
const Index planeOffsets[2] = {
patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride,
patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride};
const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
planeIndex + planeOffsets[1]};
if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols ||
inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
// Partial packet is all zeros
return internal::pset1<Packet>(Scalar(0));
} else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
// From inputIndex-span[0], we need to load elements starting from index
// span[0] all the way up to (and including) span[1].
const Index depth = patchId - patchOffsets[0] * patchDepth();
const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
inputRow * m_rowInputStride +
inputCol * m_colInputStride + otherIndex;
return m_impl.template partialPacket<Packet>(
inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
} else {
// Using slow path for this partial packet.
// We need to load elements starting from index span[0] all the way up to
// (and including) span[1]. We split this load into 3 parts:
// 0 : span[0]-1 - Zeros will be loaded for these indices
// span[0] : span[1] - Elements will be loaded here for these indices
// span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX
typename internal::remove_const<Scalar>::type values[packetSize];
for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
for (int i = span[0]; i < span[1] + 1; ++i)
values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex,
colIndex, otherIndex);
for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
return internal::pload<Packet>(values);
}
}
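
The base address in the masked branch deserves a note: lane span[0] must receive the element at inputIndex, so the packet load starts span[0] elements earlier and the mask keeps lanes [span[0], span[1]]. A scalar emulation under an assumed 8-lane float packet (illustration, not commit code):

#include <array>
#include <cstdint>

// Scalar emulation of
//   partialPacket<Packet>(inputIndex - span[0],
//                         mask<Packet>(span[0], span[1] + 1))
// for a hypothetical 8-lane float packet: masked lanes read memory, the
// remaining lanes are zero-filled.
std::array<float, 8> emulatePartialLoad(const float* data,
                                        std::int64_t inputIndex, int span0,
                                        int span1) {
  std::array<float, 8> lanes{};  // every lane starts as zero
  for (int i = span0; i <= span1; ++i) {
    // Lane span0 reads data[inputIndex], lane span0 + 1 the next element,
    // and so on; hence the (inputIndex - span0) base address.
    lanes[i] = data[inputIndex - span0 + i];
  }
  return lanes;
}
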
// Helper function to load a packet that is split across two rows (but single
// column). If required, this function is called from loadPacketStandard()
// when the packet type supports masked load and when the partial packet load
// is available in the TensorEvaluator.
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows(
Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
const Index rowOffsets[]) const {
eigen_assert(colOffsets[1] == colOffsets[0] &&
rowOffsets[1] == rowOffsets[0] + 1);
const Index packetSize = internal::unpacket_traits<Packet>::size;
// Packet to load will be split into 2 parts where each part spans a single
// row and both the parts span the same column.
// First determine where to split.
const Index patchIdSplit =
(((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) *
m_patch_depth) -
1;
const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
// patchIds[i]: patchId corresponding to partial packet i
// spans[i]: Start and end indices corresponding to the elements
// to be loaded for partial packet i
// patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
const Index patchIds[2] = {patchId, patchIdSplit + 1};
const Index spans[2][2] = {{0, patchIdSplit - patchId},
{patchIdSplit - patchId + 1, packetSize - 1}};
const Index patchOffsets2Cols[2][2] = {
{patchOffsets[0], patchOffsetSplit},
{patchOffsetSplit + 1, patchOffsets[1]}};
// Load partial packets and do bit-wise OR to generate required packet
return internal::por<Packet>(
loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
patchIds[0], spans[0], patchOffsets2Cols[0],
colOffsets[0], rowOffsets[0]),
loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
patchIds[1], spans[1], patchOffsets2Cols[1],
colOffsets[1], rowOffsets[1]));
}
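
To make the split concrete, consider hypothetical patch geometry (values are illustrative, not taken from the commit): patchDepth = 3, m_rowStride = 2, m_colStride = 10, an 8-lane packet starting at patchId = 2, and rowOffsets = {0, 1} within the same column:

#include <cassert>

int main() {
  // Hypothetical geometry; see the note above.
  const int packetSize = 8, patchDepth = 3, rowStride = 2, colStride = 10;
  const int colOffset = 0, rowOffsetNext = 1;  // rowOffsets = {0, 1}
  const int patchId = 2;
  // Last linear patch index still belonging to the first row: one before
  // the first element of the next row.
  const int patchIdSplit =
      ((rowOffsetNext * rowStride + colOffset * colStride) * patchDepth) - 1;
  assert(patchIdSplit == 5);
  // Lanes 0..3 (patch indices 2..5) come from the first row and lanes
  // 4..7 (indices 6..9) from the next row. The spans partition the packet,
  // so por() of the two zero-padded partial packets is the full packet.
  const int spans[2][2] = {{0, patchIdSplit - patchId},
                           {patchIdSplit - patchId + 1, packetSize - 1}};
  assert(spans[0][1] == 3 && spans[1][0] == 4 && spans[1][1] == 7);
  return 0;
}
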
// Helper function to load a packet that is present in a single column and
// row. If required, this function is called from loadPacketStandard().
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow(
Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
const Index rowOffsets[], const Index inputCols[],
const Index inputRows[]) const {
eigen_assert(colOffsets[1] == colOffsets[0] &&
rowOffsets[1] == rowOffsets[0]);
const Index planeOffsets[2] = {
patchOffsets[0] - colOffsets[0] * m_colStride -
rowOffsets[0] * m_rowStride,
patchOffsets[1] - colOffsets[1] * m_colStride -
rowOffsets[1] * m_rowStride};
eigen_assert(planeOffsets[0] <= planeOffsets[1]);
const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
planeIndex + planeOffsets[1]};
if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
return internal::pset1<Packet>(Scalar(0));
}
if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
const Index depth = patchId - patchOffsets[0] * patchDepth();
const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
inputRows[0] * m_rowInputStride +
inputCols[0] * m_colInputStride + otherIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
otherIndex);
}
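
The branch structure above reduces to a three-way bounds decision on the plane range touched by the packet; sketched standalone below (classify is a hypothetical helper, not commit code):

#include <cassert>

// Three-way decision mirrored from the function above, for a packet whose
// first and last elements fall in planes p0 and p1.
enum class LoadKind { kAllZero, kContiguous, kElementwise };

LoadKind classify(int p0, int p1, int numPlanes) {
  if (p0 >= numPlanes || p1 < 0) return LoadKind::kAllZero;     // fully out
  if (p0 >= 0 && p1 < numPlanes) return LoadKind::kContiguous;  // fully in
  return LoadKind::kElementwise;  // straddles a border: slow fallback
}

int main() {
  assert(classify(3, 6, 16) == LoadKind::kContiguous);    // one packet load
  assert(classify(16, 19, 16) == LoadKind::kAllZero);     // pset1(0)
  assert(classify(-2, 1, 16) == LoadKind::kElementwise);  // packetWithPossibleZero
  return 0;
}
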
// Load standard packet from a patch specified by the "within patch offset"
// (patchId) and the precomputed indices of the first element of the patch.
// This function will be called if partial packet loading is not available
// for the TensorEvaluator or if the packet type does not support masked
// load.
template <typename PacketT, typename TensorEvaluatorT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
!TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
PacketT>::type
loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
Index colIndex, Index otherIndex) const {
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(patchId <
@@ -492,27 +631,81 @@ class TensorContractionInputMapper<
}
if (inputRows[0] == inputRows[1]) {
return loadPacketStandardFromSingleColumnSingleRow(
patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
colOffsets, rowOffsets, inputCols, inputRows);
}
}
}
return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
otherIndex);
}
// Load standard packet from a patch specified by the "within patch offset"
// (patchId) and the precomputed indices of the first element of the patch.
// This function will be called if partial packet loading is available for
// the TensorEvaluator and if the packet type supports masked load.
// The only difference between this and the other case is that if the packet
// to load is split across two rows (but in the same column), then instead of
// going to the slow (element-by-element) load, we load two packets - each
// containing elements from one of the rows (the rest of the elements of the
// packets are zeros) - and then combine these two packets to generate the
// required packet. The idea is to enable fast loads (if possible) of these
// 'partial' packets.
template <typename PacketT, typename TensorEvaluatorT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
PacketT>::type
loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
Index colIndex, Index otherIndex) const {
const Index packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(patchId <
patchDepth() * patchPlanes() * patchRows() * patchCols());
eigen_assert(!nonStandardPatches());
if ((patchDepth() % packetSize) == 0) {
return loadPacketFast(patchId, planeIndex, rowIndex, colIndex,
otherIndex);
} else {
// Offsets and input calculation here are identical to
// loadCoeffStandard(...), but repeated twice.
const Index patchOffsets[2] = {
patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
patchOffsets[1] / m_fastColStride};
eigen_assert(colOffsets[0] <= colOffsets[1]);
const Index inputCols[2] = {colIndex + colOffsets[0],
colIndex + colOffsets[1]};
if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
return internal::pset1<Packet>(Scalar(0));
}
if (inputCols[0] == inputCols[1]) {
const Index rowOffsets[2] = {
(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
(patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
const Index inputRows[2] = {rowIndex + rowOffsets[0],
rowIndex + rowOffsets[1]};
if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
return internal::pset1<Packet>(Scalar(0));
}
if (inputRows[0] == inputRows[1]) {
return loadPacketStandardFromSingleColumnSingleRow(
patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
colOffsets, rowOffsets, inputCols, inputRows);
}
if (inputRows[0] + 1 == inputRows[1]) {
return loadPacketStandardFromSingleColumnTwoRows(
patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
colOffsets, rowOffsets);
}
}
}
@@ -760,7 +953,8 @@ class TensorContractionSubMapper<
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
loadPacketStandard(Index i) const {
typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex);
}
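
The typedef plus the `template` keyword are required C++ ceremony here: loadPacketStandard is now a member template invoked on an object whose type depends on a template parameter. A minimal standalone sketch with hypothetical types (only the call-site syntax mirrors the commit):

// Hypothetical types for illustration.
struct Mapper {
  template <typename PacketT, typename EvalT>
  PacketT loadPacketStandard(long i) const {
    return PacketT();
  }
};

template <typename PacketT, typename MapperT>
PacketT callLoad(const MapperT& m, long i) {
  // `m` has the dependent type MapperT, so without `template` the `<`
  // after loadPacketStandard would parse as a less-than comparison.
  return m.template loadPacketStandard<PacketT, int>(i);
}

int main() {
  Mapper m;
  return static_cast<int>(callLoad<double>(m, 7));
}
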
template <typename Packet>

tensorflow/core/kernels/eigen_spatial_convolutions-inl.h

@@ -16,66 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
// Note this header is used in both TF and TFLite.
namespace Eigen {
namespace internal {
// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
// provides `value` that is true if TensorEvaluatorType has `PacketType
// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
// const` and if the PacketType supports masked load.
//
// Partial packets are used to:
//
// 1) Split the packet over two columns and use partial loads for each
// individual part before combining them to get the required packet. This
// class is used to pick the correct implementation of the
// loadPacketStandard function below.
//
// 2) Finalize packing of columns in gemm_pack_colmajor after processing the
// vectorized part with full packets (see eigen_spatial_convolutions.h).
template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
class TensorEvaluatorHasPartialPacket {
public:
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(
typename std::enable_if<
unpacket_traits<PacketT>::masked_load_available &&
std::is_same<PacketT,
decltype(std::declval<const TensorEvaluatorT>()
.template partialPacket<PacketT>(
std::declval<IndexT>(),
std::declval<typename unpacket_traits<
PacketT>::mask_t>()))>::value>::
type*) -> std::true_type;
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(...) -> std::false_type;
typedef decltype(
functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
nullptr)) status;
static const bool value = status::value;
};
// Compute a mask for loading/storing coefficients in/from a packet in a
// [from, to) range. If the mask bit is 1, the element will be loaded/stored.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
typename unpacket_traits<Packet>::mask_t>::type
mask(int from, int to) {
const Index packet_size = internal::unpacket_traits<Packet>::size;
eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
using Mask = typename internal::unpacket_traits<Packet>::mask_t;
const Mask mask_max = std::numeric_limits<Mask>::max();
return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}
// WARNING: Most of the code here implicitly assumes that the matrix is in
// ColMajor layout. This is guaranteed by the tensor contraction (see
// TensorContraction.h).