Merge pull request #30138 from anuj-rawat:eigen_cuboid_convolution_gemm_pack_rhs
PiperOrigin-RevId: 256426023
Commit: ddf01f1df0
tensorflow/core/kernels/BUILD
@@ -820,6 +820,7 @@ cc_library(
     ],
     deps = [
         ":eigen_contraction_kernel",
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -839,6 +840,7 @@ cc_library(
         "eigen_volume_patch.h",
     ],
     deps = [
+        ":eigen_convolution_helpers",
         ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
@@ -849,6 +851,16 @@ cc_library(
     hdrs = [
         "eigen_spatial_convolutions-inl.h",
     ],
+    deps = [
+        ":eigen_convolution_helpers",
+    ],
+)
+
+cc_library(
+    name = "eigen_convolution_helpers",
+    hdrs = [
+        "eigen_convolution_helpers.h",
+    ],
 )
 
 cc_library(
@@ -5928,6 +5940,7 @@ filegroup(
         "eigen_attention.h",
         "eigen_backward_cuboid_convolutions.h",
         "eigen_backward_spatial_convolutions.h",
+        "eigen_convolution_helpers.h",
         "eigen_cuboid_convolution.h",
         "eigen_pooling.h",
         "eigen_softmax.h",
@@ -6396,6 +6409,7 @@ filegroup(
 )
 
 ANDROID_TEXTUAL_HDRS = [
+    "eigen_convolution_helpers.h",
     "eigen_spatial_convolutions-inl.h",
     "gather_nd_op_cpu_impl.h",
    "gemm_functors.h",
tensorflow/core/kernels/eigen_convolution_helpers.h (new file, 86 lines)
@@ -0,0 +1,86 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
+
+namespace Eigen {
+namespace internal {
+
+// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
+// provides `value` that is true if TensorEvaluatorType has `PacketType
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
+// const` and if the PacketType supports masked load.
+//
+// Partial packets are used to:
+//
+// 1) Split the packet over two columns in eigen based spatial convolution and
+// use partial loads for each individual part before combining them to get the
+// required packet. This class is used to pick the correct implementation of
+// loadPacketStandard function.
+//
+// 2) Split the packet over two rows (within the same column) in eigen based
+// cuboid convolution and use partial loads for each individual part before
+// combining them to get the required packet. This class is used to pick the
+// correct implementation of loadPacketStandard function. This usage is similar
+// to the usage in eigen based spatial convolution described above.
+//
+// 3) Finalize packing of columns in gemm_pack_colmajor after processing
+// vectorized part with full packets (see eigen_spatial_convolutions.h).
+template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
+class TensorEvaluatorHasPartialPacket {
+ public:
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(
+      typename std::enable_if<
+          unpacket_traits<PacketT>::masked_load_available &&
+          std::is_same<PacketT,
+                       decltype(std::declval<const TensorEvaluatorT>()
+                                    .template partialPacket<PacketT>(
+                                        std::declval<IndexT>(),
+                                        std::declval<typename unpacket_traits<
+                                            PacketT>::mask_t>()))>::value>::
+          type*) -> std::true_type;
+
+  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+  static auto functionExistsSfinae(...) -> std::false_type;
+
+  typedef decltype(
+      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
+          nullptr)) status;
+
+  static const bool value = status::value;
+};
+
+// Compute a mask for loading/storing coefficients in/from a packet in a
+// [from, to) range. If the mask bit is 1, element will be loaded/stored.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+                            typename unpacket_traits<Packet>::mask_t>::type
+    mask(int from, int to) {
+  const Index packet_size = internal::unpacket_traits<Packet>::size;
+  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
+
+  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
+  const Mask mask_max = std::numeric_limits<Mask>::max();
+
+  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
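The XOR of two shifted masks in `mask()` above produces a run of ones covering exactly the lanes [from, to). A minimal standalone sketch of the same bit arithmetic (not Eigen code; the 8-lane packet with a `uint8_t` mask type and the `mask8` name are assumptions for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Same arithmetic as mask<Packet>(from, to), specialized to a hypothetical
// 8-lane packet whose mask type is uint8_t. Bit i set means lane i
// participates in the masked load/store.
uint8_t mask8(int from, int to) {
  const int packet_size = 8;
  assert(0 <= from && from < to && to <= packet_size);
  const uint8_t mask_max = 0xFF;
  // 0xFF >> (8 - to) sets bits [0, to); XOR-ing clears bits [0, from).
  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
}

int main() {
  printf("%#04x\n", mask8(2, 5));  // lanes 2..4 -> 0x1c
  printf("%#04x\n", mask8(0, 8));  // full packet -> 0xff
}
```

This is the shape in which the cuboid code below requests masks: `mask<Packet>(span[0], span[1] + 1)` asks for one contiguous sub-range of lanes.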
tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
+
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
 
 namespace Eigen {
 
 namespace internal {
@@ -445,14 +447,151 @@ class TensorContractionInputMapper<
       return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
                                     otherIndex);
     }
-    return loadPacketStandard(patchId, planeIndex, rowIndex, colIndex,
-                              otherIndex);
+    typedef decltype(m_impl) TensorEvaluatorT;
+    return loadPacketStandard<Packet, TensorEvaluatorT>(
+        patchId, planeIndex, rowIndex, colIndex, otherIndex);
   }
 
+  // Helper function to load a 'partial' packet - this is the single row part of
+  // a packet that is split across two rows (but single column). In the
+  // 'partial' packet, the elements corresponding to the row (specified through
+  // rowOffset) are loaded and the rest of the elements are zero-filled into the
+  // 'partial' packet. This function is called from
+  // loadPacketStandardFromSingleColumnTwoRows(). This code path is exercised
+  // only when the packet type supports masked load and when the partial packet
+  // load is available in the TensorEvaluator.
   EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index planeIndex,
-                                                Index rowIndex, Index colIndex,
-                                                Index otherIndex) const {
+  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(
+      Index planeIndex, Index rowIndex, Index colIndex, Index otherIndex,
+      Index patchId, const Index span[], const Index patchOffsets[],
+      Index colOffset, Index rowOffset) const {
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffset * m_colStride - rowOffset * m_rowStride,
+        patchOffsets[1] - colOffset * m_colStride - rowOffset * m_rowStride};
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputRow >= m_inputRows || inputRow < 0 || inputCol >= m_inputCols ||
+        inputCol < 0 || inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      // Partial packet is all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    } else if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRow * m_rowInputStride +
+                               inputCol * m_colInputStride + otherIndex;
+      return m_impl.template partialPacket<Packet>(
+          inputIndex - span[0], mask<Packet>(span[0], span[1] + 1));
+    } else {
+      // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      // 0 : span[0]-1 - Zeros will be loaded for these indices
+      // span[0] : span[1] - Elements will be loaded here for these indices
+      // span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+      const Index packetSize = internal::unpacket_traits<Packet>::size;
+      EIGEN_ALIGN_MAX
+      typename internal::remove_const<Scalar>::type values[packetSize];
+      for (int i = 0; i < span[0]; ++i) values[i] = Scalar(0);
+      for (int i = span[0]; i < span[1] + 1; ++i)
+        values[i] = loadCoeff(patchId - span[0] + i, planeIndex, rowIndex,
+                              colIndex, otherIndex);
+      for (int i = span[1] + 1; i < packetSize; ++i) values[i] = Scalar(0);
+      return internal::pload<Packet>(values);
+    }
+  }
+
+  // Helper function to load a packet that is split across two rows (but single
+  // column). If required, this function is called from loadPacketStandard()
+  // when the packet type supports masked load and when the partial packet load
+  // is available in the TensorEvaluator.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnTwoRows(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0] + 1);
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+    // Packet to load will be split into 2 parts where each part spans a single
+    // row and both the parts span the same column.
+    // First determine where to split.
+    const Index patchIdSplit =
+        (((rowOffsets[1] * m_rowStride) + (colOffsets[0] * m_colStride)) *
+         m_patch_depth) -
+        1;
+    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+    // patchIds[i]: patchId corresponding to partial packet i
+    // spans[i]: Start and end indices corresponding to the elements
+    //           to be loaded for partial packet i
+    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+    const Index patchIds[2] = {patchId, patchIdSplit + 1};
+    const Index spans[2][2] = {{0, patchIdSplit - patchId},
+                               {patchIdSplit - patchId + 1, packetSize - 1}};
+    const Index patchOffsets2Cols[2][2] = {
+        {patchOffsets[0], patchOffsetSplit},
+        {patchOffsetSplit + 1, patchOffsets[1]}};
+
+    // Load partial packets and do bit-wise OR to generate required packet
+    return internal::por<Packet>(
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[0], spans[0], patchOffsets2Cols[0],
+                                  colOffsets[0], rowOffsets[0]),
+        loadPartialPacketStandard(planeIndex, rowIndex, colIndex, otherIndex,
+                                  patchIds[1], spans[1], patchOffsets2Cols[1],
+                                  colOffsets[1], rowOffsets[1]));
+  }
+
+  // Helper function to load a packet that is present in a single column and
+  // row. If required, this function is called from loadPacketStandard().
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumnSingleRow(
+      Index patchId, Index planeIndex, Index rowIndex, Index colIndex,
+      Index otherIndex, const Index patchOffsets[], const Index colOffsets[],
+      const Index rowOffsets[], const Index inputCols[],
+      const Index inputRows[]) const {
+    eigen_assert(colOffsets[1] == colOffsets[0] &&
+                 rowOffsets[1] == rowOffsets[0]);
+    const Index planeOffsets[2] = {
+        patchOffsets[0] - colOffsets[0] * m_colStride -
+            rowOffsets[0] * m_rowStride,
+        patchOffsets[1] - colOffsets[1] * m_colStride -
+            rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
+                                  planeIndex + planeOffsets[1]};
+
+    if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex = depth + inputPlanes[0] * m_planeInputStride +
+                               inputRows[0] * m_rowInputStride +
+                               inputCols[0] * m_colInputStride + otherIndex;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
+  }
+
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is not available
+  // for the TensorEvaluator or if the packet type does not support masked
+  // load.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
     const Index packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(patchId <
@@ -492,27 +631,81 @@ class TensorContractionInputMapper<
         }
 
         if (inputRows[0] == inputRows[1]) {
-          const Index planeOffsets[2] = {
-              patchOffsets[0] - colOffsets[0] * m_colStride -
-                  rowOffsets[0] * m_rowStride,
-              patchOffsets[1] - colOffsets[1] * m_colStride -
-                  rowOffsets[1] * m_rowStride};
-          eigen_assert(planeOffsets[0] <= planeOffsets[1]);
-          const Index inputPlanes[2] = {planeIndex + planeOffsets[0],
-                                        planeIndex + planeOffsets[1]};
-
-          if (inputPlanes[0] >= m_inputPlanes || inputPlanes[1] < 0) {
-            return internal::pset1<Packet>(Scalar(0));
+          return loadPacketStandardFromSingleColumnSingleRow(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets, inputCols, inputRows);
+        }
+      }
+    }
+
+    return packetWithPossibleZero(patchId, planeIndex, rowIndex, colIndex,
+                                  otherIndex);
   }
 
-        if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex =
-              depth + inputPlanes[0] * m_planeInputStride +
-              inputRows[0] * m_rowInputStride +
-              inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is available for
+  // the TensorEvaluator and if the packet type supports masked load.
+  // The only difference between this and the other case is that if the packet
+  // to load is split across two rows (but in same column), then in this case
+  // instead of going to the slow (element-by-element) load, we load two packets
+  // - each containing elements from one of the rows (rest of the elements of
+  // the packets are zeroes), and then combine these two packets to generate the
+  // required packet. The idea is to enable fast load (if possible) of these
+  // 'partial' packets.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value,
+      PacketT>::type
+  loadPacketStandard(Index patchId, Index planeIndex, Index rowIndex,
+                     Index colIndex, Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId <
+                 patchDepth() * patchPlanes() * patchRows() * patchCols());
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, planeIndex, rowIndex, colIndex,
+                            otherIndex);
+    } else {
+      // Offsets and input calculation here are identical to
+      // loadCoeffStandard(...), but repeated twice.
+
+      const Index patchOffsets[2] = {
+          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+
+      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                   patchOffsets[1] / m_fastColStride};
+      eigen_assert(colOffsets[0] <= colOffsets[1]);
+
+      const Index inputCols[2] = {colIndex + colOffsets[0],
+                                  colIndex + colOffsets[1]};
+      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+        return internal::pset1<Packet>(Scalar(0));
+      }
+
+      if (inputCols[0] == inputCols[1]) {
+        const Index rowOffsets[2] = {
+            (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+            (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+        const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                    rowIndex + rowOffsets[1]};
+
+        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+          return internal::pset1<Packet>(Scalar(0));
+        }
+
+        if (inputRows[0] == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnSingleRow(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets, inputCols, inputRows);
+        }
+        if (inputRows[0] + 1 == inputRows[1]) {
+          return loadPacketStandardFromSingleColumnTwoRows(
+              patchId, planeIndex, rowIndex, colIndex, otherIndex, patchOffsets,
+              colOffsets, rowOffsets);
         }
       }
     }
@@ -760,7 +953,8 @@ class TensorContractionSubMapper<
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
   loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(
+    typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
+    return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
         i + m_depth_offset, m_planeIndex, m_rowIndex, m_colIndex, m_otherIndex);
   }
   template <typename Packet>
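The reason the two-rows path above can combine its halves with `internal::por` is that the two partial packets populate disjoint lanes, with zeros everywhere else. A scalar sketch of that combine step (plain C++, not Eigen; `Packet4i`, `PartialLoad`, and `Por` are illustrative stand-ins for the packet type, the masked partial load, and `internal::por`):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Hypothetical 4-lane integer packet.
using Packet4i = std::array<int32_t, 4>;

// Emulates a masked partial load: lanes [from, to) come from src, the rest
// stay zero, as in loadPartialPacketStandard's fast path.
Packet4i PartialLoad(const int32_t* src, int from, int to) {
  Packet4i p = {0, 0, 0, 0};
  for (int i = from; i < to; ++i) p[i] = src[i];
  return p;
}

// Emulates internal::por<Packet>: because the two partial packets fill
// disjoint lanes (zeros elsewhere), bitwise OR merges them losslessly.
Packet4i Por(const Packet4i& a, const Packet4i& b) {
  Packet4i r;
  for (int i = 0; i < 4; ++i) r[i] = a[i] | b[i];
  return r;
}

int main() {
  // A packet that straddles two rows of the same column: lanes 0-1 hold the
  // tail of row r, lanes 2-3 hold the head of row r+1.
  const int32_t row_r[4] = {10, 11, 0, 0};
  const int32_t row_r_plus_1[4] = {0, 0, 20, 21};
  Packet4i combined = Por(PartialLoad(row_r, 0, 2),          // spans[0]
                          PartialLoad(row_r_plus_1, 2, 4));  // spans[1]
  for (int32_t v : combined) printf("%d ", v);  // prints: 10 11 20 21
}
```

With a real SIMD packet both partial loads are single masked instructions, which is the whole point of preferring this path over the element-by-element fallback.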
tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -16,66 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
 #define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+
+#include "tensorflow/core/kernels/eigen_convolution_helpers.h"
 
 // Note this header is used in both TF and TFLite.
 namespace Eigen {
 
 namespace internal {
 
-// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
-// provides `value` that is true if TensorEvaluatorType has `PacketType
-// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
-// const` and if the PacketType supports masked load.
-//
-// Partial packets are used to:
-//
-// 1) Split the packet over two columns and use partial loads for each
-// individual part before combining them to get the required packet. This
-// class is used to pick the correct implementation of loadPacketStandard
-// function below.
-//
-// 2) Finalize packing of columns in gemm_pack_colmajor after processing
-// vectorized part with full packets (see eigen_spatiual_convolutions.h).
-template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
-class TensorEvaluatorHasPartialPacket {
- public:
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(
-      typename std::enable_if<
-          unpacket_traits<PacketT>::masked_load_available &&
-          std::is_same<PacketT,
-                       decltype(std::declval<const TensorEvaluatorT>()
-                                    .template partialPacket<PacketT>(
-                                        std::declval<IndexT>(),
-                                        std::declval<typename unpacket_traits<
-                                            PacketT>::mask_t>()))>::value>::
-          type*) -> std::true_type;
-
-  template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
-  static auto functionExistsSfinae(...) -> std::false_type;
-
-  typedef decltype(
-      functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(
-          nullptr)) status;
-
-  static const bool value = status::value;
-};
-
-// Compute a mask for loading/storing coefficients in/from a packet in a
-// [from, to) range. If the mask bit is 1, element will be loaded/stored.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
-                            typename unpacket_traits<Packet>::mask_t>::type
-    mask(int from, int to) {
-  const Index packet_size = internal::unpacket_traits<Packet>::size;
-  eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
-
-  using Mask = typename internal::unpacket_traits<Packet>::mask_t;
-  const Mask mask_max = std::numeric_limits<Mask>::max();
-
-  return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
-}
-
 // WARNING: Most of the code here implicitly assumes that the matrix is in
 // ColMajor layout. This is guaranteed by the tensor contraction (see
 // TensorContraction.h).
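The block deleted here is the trait that moved into eigen_convolution_helpers.h; its only job is to steer overload resolution between the masked and generic packet loaders. A reduced sketch of that enable_if dispatch pattern (not Eigen code; `FastEvaluator`, `PlainEvaluator`, and the `has_partial_packet` member stand in for the detection that `TensorEvaluatorHasPartialPacket` performs via SFINAE):

```cpp
#include <cstdio>
#include <type_traits>

// Two evaluator types: one advertises partial-packet (masked load) support,
// the other does not. The real trait detects this from the presence of a
// partialPacket() member; here a boolean member keeps the sketch small.
struct FastEvaluator { static constexpr bool has_partial_packet = true; };
struct PlainEvaluator { static constexpr bool has_partial_packet = false; };

// Overload enabled only for evaluators with partial-packet support,
// mirroring the enable_if<Trait::value, PacketT> loadPacketStandard overload.
template <typename Evaluator>
typename std::enable_if<Evaluator::has_partial_packet, int>::type
loadPacket() {
  std::printf("masked partial-packet path\n");
  return 0;
}

// Overload enabled for everyone else, mirroring enable_if<!Trait::value, ...>.
template <typename Evaluator>
typename std::enable_if<!Evaluator::has_partial_packet, int>::type
loadPacket() {
  std::printf("generic path\n");
  return 1;
}

int main() {
  loadPacket<FastEvaluator>();   // masked partial-packet path
  loadPacket<PlainEvaluator>();  // generic path
}
```

Because the two enable_if conditions are mutually exclusive, exactly one overload survives substitution for any evaluator, so the choice is made entirely at compile time with no runtime branch.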