Merge pull request #30138 from anuj-rawat:eigen_cuboid_convolution_gemm_pack_rhs
PiperOrigin-RevId: 256426023
This commit is contained in:
commit ddf01f1df0
@@ -820,6 +820,7 @@ cc_library(
    ],
    deps = [
        ":eigen_contraction_kernel",
        ":eigen_convolution_helpers",
        ":eigen_spatial_convolutions-inl",
        "//third_party/eigen3",
    ],
@@ -839,6 +840,7 @@ cc_library(
        "eigen_volume_patch.h",
    ],
    deps = [
        ":eigen_convolution_helpers",
        ":eigen_spatial_convolutions-inl",
        "//third_party/eigen3",
    ],
@@ -849,6 +851,16 @@ cc_library(
    hdrs = [
        "eigen_spatial_convolutions-inl.h",
    ],
    deps = [
        ":eigen_convolution_helpers",
    ],
)

cc_library(
    name = "eigen_convolution_helpers",
    hdrs = [
        "eigen_convolution_helpers.h",
    ],
)

cc_library(
@@ -5928,6 +5940,7 @@ filegroup(
        "eigen_attention.h",
        "eigen_backward_cuboid_convolutions.h",
        "eigen_backward_spatial_convolutions.h",
        "eigen_convolution_helpers.h",
        "eigen_cuboid_convolution.h",
        "eigen_pooling.h",
        "eigen_softmax.h",
@@ -6396,6 +6409,7 @@ filegroup(
)

ANDROID_TEXTUAL_HDRS = [
    "eigen_convolution_helpers.h",
    "eigen_spatial_convolutions-inl.h",
    "gather_nd_op_cpu_impl.h",
    "gemm_functors.h",
tensorflow/core/kernels/eigen_convolution_helpers.h (new file, 86 lines)
@@ -0,0 +1,86 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_

namespace Eigen {
namespace internal {

// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
// provides `value` that is true if TensorEvaluatorType has `PacketType
// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
// const` and if the PacketType supports masked load.
//
// Partial packets are used to:
//
// 1) Split the packet over two columns in Eigen-based spatial convolution and
// use partial loads for each individual part before combining them to get the
// required packet. This class is used to pick the correct implementation of
// the loadPacketStandard function.
//
// 2) Split the packet over two rows (within the same column) in Eigen-based
// cuboid convolution and use partial loads for each individual part before
// combining them to get the required packet. This class is used to pick the
// correct implementation of the loadPacketStandard function. This usage is
// similar to the usage in Eigen-based spatial convolution described above.
//
// 3) Finalize packing of columns in gemm_pack_colmajor after processing the
// vectorized part with full packets (see eigen_spatial_convolutions.h).
template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
class TensorEvaluatorHasPartialPacket {
 public:
  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
  static auto functionExistsSfinae(
      typename std::enable_if<
          unpacket_traits<PacketT>::masked_load_available &&
          std::is_same<PacketT,
                       decltype(std::declval<const TensorEvaluatorT>()
                                    .template partialPacket<PacketT>(
                                        std::declval<IndexT>(),
                                        std::declval<typename unpacket_traits<
                                            PacketT>::mask_t>()))>::value>::
          type*) -> std::true_type;

  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
  static auto functionExistsSfinae(...) -> std::false_type;

  typedef decltype(
      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
          nullptr)) status;

  static const bool value = status::value;
};
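
[Editor's note: for illustration, a minimal sketch of how this trait is typically consumed: two overloads guarded by std::enable_if, the same pattern the loadPacketStandard overloads in this commit use. The load() helper and EvalT parameter are hypothetical placeholders, not part of this commit, and the sketch assumes the mask() helper defined just below is visible.]

// Hedged sketch, hypothetical names: pick the masked-load path at compile
// time when the evaluator/packet pair supports partial packets.
template <typename PacketT, typename EvalT>
typename std::enable_if<
    TensorEvaluatorHasPartialPacket<EvalT, PacketT, Index>::value,
    PacketT>::type
load(const EvalT& eval, Index i) {
  // Fast path: one masked load touching only the first two lanes.
  return eval.template partialPacket<PacketT>(i, mask<PacketT>(0, 2));
}

template <typename PacketT, typename EvalT>
typename std::enable_if<
    !TensorEvaluatorHasPartialPacket<EvalT, PacketT, Index>::value,
    PacketT>::type
load(const EvalT& eval, Index i) {
  // Fallback path: no masked load available; use a plain unaligned load.
  return eval.template packet<Unaligned>(i);
}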

// Compute a mask for loading/storing coefficients in/from a packet in a
// [from, to) range. If the mask bit is 1, the element will be loaded/stored.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
                            typename unpacket_traits<Packet>::mask_t>::type
    mask(int from, int to) {
  const Index packet_size = internal::unpacket_traits<Packet>::size;
  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);

  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
  const Mask mask_max = std::numeric_limits<Mask>::max();

  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}
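
[Editor's note: as a worked example of the bit arithmetic above, a standalone restatement (not Eigen code) for a hypothetical 16-lane packet whose mask type is uint16_t.]

#include <cassert>
#include <cstdint>

// Standalone restatement of mask() for packet_size == 16, mask_t == uint16_t.
uint16_t mask16(int from, int to) {
  const int packet_size = 16;
  assert(0 <= from && from < to && to <= packet_size);
  const uint16_t mask_max = 0xFFFF;
  // Bits [0, to) XOR bits [0, from) leaves exactly bits [from, to) set.
  return uint16_t((mask_max >> (packet_size - to)) ^
                  (mask_max >> (packet_size - from)));
}
// Example: mask16(2, 5) == 0x001C (bits 2, 3, 4 set), so a masked load with
// this mask touches lanes 2..4 and zero-fills the remaining lanes.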

}  // namespace internal
}  // namespace Eigen

#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
@@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
#endif

#include "tensorflow/core/kernels/eigen_convolution_helpers.h"

namespace Eigen {

namespace internal {
@@ -445,14 +447,151 @@ class TensorContractionInputMapper<
      return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                    otherIndex);
    }
    return loadPacketStandard(patchId, planeIndex, rowIndex, colIndex,
                              otherIndex);
    typedef decltype(m_impl) TensorEvaluatorT;
    return loadPacketStandard<Packet, TensorEvaluatorT>(
        patchId, planeIndex, rowIndex, colIndex, otherIndex);
  }

  // Helper function to load a 'partial' packet - this is the single row part
  // of a packet that is split across two rows (but single column). In the
  // 'partial' packet, the elements corresponding to the row (specified through
  // rowOffset) are loaded and the rest of the elements are zero-filled into
  // the 'partial' packet. This function is called from
  // loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised
  // only when the packet type supports masked load and when the partial packet
  // load is available in the TensorEvaluator.
  EIGEN_DEVICE_FUNC
  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index planeIndex,
                                                Index rowIndex, Index colIndex,
                                                Index otherIndex) const {
  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
      Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex,
      Index patchId, const Index span[], const Index patchOffsets[],
      Index colOffset, Index rowOffset) const {
    const Index inputCol = colIndex + colOffset;
    const Index inputRow = rowIndex + rowOffset;
    const Index planeOffsets[2] = {
        patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride,
        patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride};
    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
                                  planeIndex + planeOffsets[1]};

    if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols ||
        inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
      // Partial packet is all zeros.
      return internal::pset1<Packet>(Scalar(0));
    } else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
      // From inputIndex-span[0], we need to load elements starting from index
      // span[0] all the way up to (and including) span[1].
      const Index depth = patchId - patchOffsets[0] * patchDepth();
      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
                               inputRow * m_rowInputStride +
                               inputCol * m_colInputStride + otherIndex;
      return m_impl.template partialPacket<Packet>(
          inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
    } else {
      // Using slow path for this partial packet.
      // We need to load elements starting from index span[0] all the way up
      // to (and including) span[1]. We split this load into 3 parts:
      //   0 : span[0]-1            - Zeros will be loaded for these indices
      //   span[0] : span[1]        - Elements will be loaded for these indices
      //   span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
      const Index packetSize = internal::unpacket_traits<Packet>::size;
      EIGEN_ALIGN_MAX
      typename internal::remove_const<Scalar>::type values[packetSize];
      for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
      for (int i = span[0]; i < span[1] + 1; ++i)
        values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex,
                              colIndex, otherIndex);
      for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
      return internal::pload<Packet>(values);
    }
  }
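
[Editor's note: for concreteness, a minimal standalone sketch of the slow path's three-part fill. The slow_fill helper is hypothetical, not Eigen code; it assumes an 8-lane packet and span == {2, 5}.]

#include <array>

// Lanes [0, span0) and (span1, N) are zero-filled; lanes [span0, span1]
// come from consecutive input coefficients (the loadCoeff analogue).
template <int N>
std::array<float, N> slow_fill(const float* input, int span0, int span1) {
  std::array<float, N> values{};  // zero-initialized, like the Scalar(0) loops
  for (int i = span0; i <= span1; ++i) values[i] = input[i - span0];
  return values;  // in the real code, pload<Packet>(values) follows
}
// slow_fill<8>(data, 2, 5) yields {0, 0, d0, d1, d2, d3, 0, 0}.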

  // Helper function to load a packet that is split across two rows (but
  // single column). If required, this function is called from
  // loadPacketStandard() when the packet type supports masked load and when
  // the partial packet load is available in the TensorEvaluator.
  EIGEN_DEVICE_FUNC
  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows(
      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
      const Index rowOffsets[]) const {
    eigen_assert(colOffsets[1] == colOffsets[0] &&
                 rowOffsets[1] == rowOffsets[0] + 1);
    const Index packetSize = internal::unpacket_traits<Packet>::size;

    // The packet to load will be split into 2 parts where each part spans a
    // single row and both parts span the same column.
    // First determine where to split.
    const Index patchIdSplit =
        (((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) *
         m_patch_depth) -
        1;
    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;

    // patchIds[i]:          patchId corresponding to partial packet i
    // spans[i]:             start and end indices corresponding to the
    //                       elements to be loaded for partial packet i
    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
    const Index patchIds[2] = {patchId, patchIdSplit + 1};
    const Index spans[2][2] = {{0, patchIdSplit - patchId},
                               {patchIdSplit - patchId + 1, packetSize - 1}};
    const Index patchOffsets2Cols[2][2] = {
        {patchOffsets[0], patchOffsetSplit},
        {patchOffsetSplit + 1, patchOffsets[1]}};

    // Load the partial packets and do a bit-wise OR to generate the required
    // packet.
    return internal::por<Packet>(
        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
                                  patchIds[0], spans[0], patchOffsets2Cols[0],
                                  colOffsets[0], rowOffsets[0]),
        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
                                  patchIds[1], spans[1], patchOffsets2Cols[1],
                                  colOffsets[1], rowOffsets[1]));
  }
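
[Editor's note: the bit-wise OR combine is correct because each partial packet is all-zero outside its span and the two spans are disjoint. A minimal scalar sketch of what internal::por does per lane, assuming a hypothetical 4-lane packet split after lane 1.]

#include <cstdint>
#include <cstring>

void por_demo() {
  float partial_a[4] = {1.f, 2.f, 0.f, 0.f};  // partial packet from row r
  float partial_b[4] = {0.f, 0.f, 3.f, 4.f};  // partial packet from row r + 1
  float combined[4];
  for (int i = 0; i < 4; ++i) {
    uint32_t a, b;
    std::memcpy(&a, &partial_a[i], sizeof(a));
    std::memcpy(&b, &partial_b[i], sizeof(b));
    const uint32_t c = a | b;  // OR with an all-zero bit pattern is identity
    std::memcpy(&combined[i], &c, sizeof(c));
  }
  // combined == {1.f, 2.f, 3.f, 4.f}
}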

  // Helper function to load a packet that is present in a single column and
  // row. If required, this function is called from loadPacketStandard().
  EIGEN_DEVICE_FUNC
  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow(
      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
      const Index rowOffsets[], const Index inputCols[],
      const Index inputRows[]) const {
    eigen_assert(colOffsets[1] == colOffsets[0] &&
                 rowOffsets[1] == rowOffsets[0]);
    const Index planeOffsets[2] = {
        patchOffsets[0] - colOffsets[0] * m_colStride -
            rowOffsets[0] * m_rowStride,
        patchOffsets[1] - colOffsets[1] * m_colStride -
            rowOffsets[1] * m_rowStride};
    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
                                  planeIndex + planeOffsets[1]};

    if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
      return internal::pset1<Packet>(Scalar(0));
    }
    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
      const Index depth = patchId - patchOffsets[0] * patchDepth();
      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
                               inputRows[0] * m_rowInputStride +
                               inputCols[0] * m_colInputStride + otherIndex;
      return m_impl.template packet<Unaligned>(inputIndex);
    }
    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                  otherIndex);
  }

  // Load standard packet from a patch specified by the "within patch offset"
  // (patchId) and the precomputed indices of the first element of the patch.
  // This function will be called if partial packet loading is not available
  // for the TensorEvaluator or if the packet type does not support masked
  // load.
  template <typename PacketT, typename TensorEvaluatorT>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT,
                                       Index>::value,
      PacketT>::type
  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
                     Index colIndex, Index otherIndex) const {
    const Index packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(patchId <
@@ -492,27 +631,81 @@ class TensorContractionInputMapper<
    }

      if (inputRows[0] == inputRows[1]) {
        const Index planeOffsets[2] = {
            patchOffsets[0] - colOffsets[0] * m_colStride -
                rowOffsets[0] * m_rowStride,
            patchOffsets[1] - colOffsets[1] * m_colStride -
                rowOffsets[1] * m_rowStride};
        eigen_assert(planeOffsets[0] <= planeOffsets[1]);
        const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
                                      planeIndex + planeOffsets[1]};
        return loadPacketStandardFromSingleColumnSingleRow(
            patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
            colOffsets, rowOffsets, inputCols, inputRows);
      }
    }
  }

        if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
          return internal::pset1<Packet>(Scalar(0));
        }
    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                  otherIndex);
  }

        if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
          const Index depth = patchId - patchOffsets[0] * patchDepth();
          const Index inputIndex =
              depth + inputPlanes[0] * m_planeInputStride +
              inputRows[0] * m_rowInputStride +
              inputCols[0] * m_colInputStride + otherIndex;
          return m_impl.template packet<Unaligned>(inputIndex);
        }
  // Load standard packet from a patch specified by the "within patch offset"
  // (patchId) and the precomputed indices of the first element of the patch.
  // This function will be called if partial packet loading is available for
  // the TensorEvaluator and if the packet type supports masked load.
  // The only difference between this and the other case is that if the packet
  // to load is split across two rows (but in the same column), then, instead
  // of going to the slow (element-by-element) load, we load two packets, each
  // containing elements from one of the rows (the rest of the elements of the
  // packets are zeroes), and then combine these two packets to generate the
  // required packet. The idea is to enable a fast load (if possible) of these
  // 'partial' packets.
  template <typename PacketT, typename TensorEvaluatorT>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT,
                                      Index>::value,
      PacketT>::type
  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
                     Index colIndex, Index otherIndex) const {
    const Index packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(patchId <
                 patchDepth() * patchPlanes() * patchRows() * patchCols());
    eigen_assert(!nonStandardPatches());

    if ((patchDepth() % packetSize) == 0) {
      return loadPacketFast(patchId, planeIndex, rowIndex, colIndex,
                            otherIndex);
    } else {
      // Offsets and input calculation here are identical to
      // loadCoeffStandard(...), but repeated twice.

      const Index patchOffsets[2] = {
          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};

      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
                                   patchOffsets[1] / m_fastColStride};
      eigen_assert(colOffsets[0] <= colOffsets[1]);

      const Index inputCols[2] = {colIndex + colOffsets[0],
                                  colIndex + colOffsets[1]};
      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
        return internal::pset1<Packet>(Scalar(0));
      }

      if (inputCols[0] == inputCols[1]) {
        const Index rowOffsets[2] = {
            (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
            (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
        const Index inputRows[2] = {rowIndex + rowOffsets[0],
                                    rowIndex + rowOffsets[1]};

        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
          return internal::pset1<Packet>(Scalar(0));
        }

        if (inputRows[0] == inputRows[1]) {
          return loadPacketStandardFromSingleColumnSingleRow(
              patchId, planeIndex, rowIndex, colIndex, otherIndex,
              patchOffsets, colOffsets, rowOffsets, inputCols, inputRows);
        }
        if (inputRows[0] + 1 == inputRows[1]) {
          return loadPacketStandardFromSingleColumnTwoRows(
              patchId, planeIndex, rowIndex, colIndex, otherIndex,
              patchOffsets, colOffsets, rowOffsets);
        }
      }
    }
  }
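
[Editor's note: read together, the masked-load overload above dispatches roughly as follows; a hedged pseudocode summary, with fall-through cases living in surrounding code not shown in this hunk.]

// if patchDepth() % packetSize == 0         -> loadPacketFast
// else if packet fits one column, one row   -> ...FromSingleColumnSingleRow
// else if packet fits one column, two rows  -> ...FromSingleColumnTwoRows
// else                                      -> slower generic fallback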
@@ -760,7 +953,8 @@ class TensorContractionSubMapper<
  }
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
  loadPacketStandard(Index i) const {
    return m_base_mapper.loadPacketStandard(
    typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
    return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
        i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex);
  }
  template <typename Packet>
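
[Editor's note: an aside on the `.template` spelling in the new call above: when a member template is invoked on an object whose type depends on a template parameter, C++ requires the `template` keyword so that `<` is parsed as a template argument list rather than less-than. A minimal standalone illustration with hypothetical types.]

#include <cstdio>

struct Mapper {
  template <typename PacketT>
  PacketT load(int i) const { return static_cast<PacketT>(i); }
};

template <typename MapperT, typename PacketT>
PacketT call(const MapperT& m, int i) {
  // MapperT is a dependent type, so 'template' is required to disambiguate.
  return m.template load<PacketT>(i);
}

int main() {
  Mapper m;
  std::printf("%f\n", call<Mapper, float>(m, 3));  // prints 3.000000
  return 0;
}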
@@ -16,66 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_

#include "tensorflow/core/kernels/eigen_convolution_helpers.h"

// Note this header is used in both TF and TFLite.
namespace Eigen {

namespace internal {

// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
// provides `value` that is true if TensorEvaluatorType has `PacketType
// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
// const` and if the PacketType supports masked load.
//
// Partial packets are used to:
//
// 1) Split the packet over two columns and use partial loads for each
// individual part before combining them to get the required packet. This
// class is used to pick the correct implementation of the loadPacketStandard
// function below.
//
// 2) Finalize packing of columns in gemm_pack_colmajor after processing the
// vectorized part with full packets (see eigen_spatial_convolutions.h).
template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
class TensorEvaluatorHasPartialPacket {
 public:
  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
  static auto functionExistsSfinae(
      typename std::enable_if<
          unpacket_traits<PacketT>::masked_load_available &&
          std::is_same<PacketT,
                       decltype(std::declval<const TensorEvaluatorT>()
                                    .template partialPacket<PacketT>(
                                        std::declval<IndexT>(),
                                        std::declval<typename unpacket_traits<
                                            PacketT>::mask_t>()))>::value>::
          type*) -> std::true_type;

  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
  static auto functionExistsSfinae(...) -> std::false_type;

  typedef decltype(
      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
          nullptr)) status;

  static const bool value = status::value;
};

// Compute a mask for loading/storing coefficients in/from a packet in a
// [from, to) range. If the mask bit is 1, the element will be loaded/stored.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
                            typename unpacket_traits<Packet>::mask_t>::type
    mask(int from, int to) {
  const Index packet_size = internal::unpacket_traits<Packet>::size;
  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);

  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
  const Mask mask_max = std::numeric_limits<Mask>::max();

  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}

// WARNING: Most of the code here implicitly assumes that the matrix is in
// ColMajor layout. This is guaranteed by the tensor contraction (see
// TensorContraction.h).