Clean up code in the TF FixedPoint extension to Eigen to make the AVX2/AVX512 compile with recent versions of clang.
PiperOrigin-RevId: 310044623 Change-Id: I26cfd536da43ce20975b68095dc6bfea68e44f58
This commit is contained in:
parent
d71b3dfeca
commit
b99e88eee6
@ -11,19 +11,8 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
typedef struct Packet32q8i {
|
||||
__m256i val;
|
||||
operator __m256i() const { return val; }
|
||||
Packet32q8i() : val(_mm256_setzero_si256()){};
|
||||
Packet32q8i(__m256i val) : val(val) {}
|
||||
} Packet32q8i;
|
||||
|
||||
typedef struct Packet16q8i {
|
||||
__m128i val;
|
||||
operator __m128i() const { return val; }
|
||||
Packet16q8i() : val(_mm_setzero_si128()) {}
|
||||
Packet16q8i(__m128i val) : val(val) {}
|
||||
} Packet16q8i;
|
||||
typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i;
|
||||
typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i;
|
||||
|
||||
template <>
|
||||
struct packet_traits<QInt8> : default_packet_traits {
|
||||
@ -102,23 +91,23 @@ EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
|
||||
reinterpret_cast<__m256i*>(to), from.val);
|
||||
reinterpret_cast<__m256i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
|
||||
typedef __m256 Packet8f;
|
||||
|
@ -27,61 +27,14 @@ inline int _mm256_extract_epi8_N1(const __m256i X) {
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
typedef struct Packet32q8i {
|
||||
__m256i val;
|
||||
operator __m256i() const { return val; }
|
||||
Packet32q8i() : val(_mm256_setzero_si256()){};
|
||||
Packet32q8i(__m256i val) : val(val) {}
|
||||
} Packet32q8i;
|
||||
|
||||
typedef struct Packet16q16i {
|
||||
__m256i val;
|
||||
operator __m256i() const { return val; }
|
||||
Packet16q16i() : val(_mm256_setzero_si256()){};
|
||||
Packet16q16i(__m256i val) : val(val) {}
|
||||
} Packet16q16i;
|
||||
|
||||
typedef struct Packet32q8u {
|
||||
__m256i val;
|
||||
operator __m256i() const { return val; }
|
||||
Packet32q8u() : val(_mm256_setzero_si256()){};
|
||||
Packet32q8u(__m256i val) : val(val) {}
|
||||
} Packet32q8u;
|
||||
|
||||
typedef struct Packet16q8i {
|
||||
__m128i val;
|
||||
operator __m128i() const { return val; }
|
||||
Packet16q8i() : val(_mm_setzero_si128()) {}
|
||||
Packet16q8i(__m128i val) : val(val) {}
|
||||
} Packet16q8i;
|
||||
|
||||
typedef struct Packet16q8u {
|
||||
__m128i val;
|
||||
operator __m128i() const { return val; }
|
||||
Packet16q8u() : val(_mm_setzero_si128()) {}
|
||||
Packet16q8u(__m128i val) : val(val) {}
|
||||
} Packet16q8u;
|
||||
|
||||
typedef struct Packet8q16i {
|
||||
__m128i val;
|
||||
operator __m128i() const { return val; }
|
||||
Packet8q16i() : val(_mm_setzero_si128()) {}
|
||||
Packet8q16i(__m128i val) : val(val) {}
|
||||
} Packet8q16i;
|
||||
|
||||
typedef struct Packet8q32i {
|
||||
__m256i val;
|
||||
operator __m256i() const { return val; }
|
||||
Packet8q32i() : val(_mm256_setzero_si256()){};
|
||||
Packet8q32i(__m256i val) : val(val) {}
|
||||
} Packet8q32i;
|
||||
|
||||
typedef struct Packet4q32i {
|
||||
__m128i val;
|
||||
operator __m128i() const { return val; }
|
||||
Packet4q32i() : val(_mm_setzero_si128()) {}
|
||||
Packet4q32i(__m128i val) : val(val) {}
|
||||
} Packet4q32i;
|
||||
typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i;
|
||||
typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i;
|
||||
typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u;
|
||||
typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i;
|
||||
typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u;
|
||||
typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i;
|
||||
typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i;
|
||||
typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i;
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX512
|
||||
template <>
|
||||
@ -315,64 +268,64 @@ EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
|
||||
reinterpret_cast<__m256i*>(to), from.val);
|
||||
reinterpret_cast<__m256i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
|
||||
reinterpret_cast<__m256i*>(to), from.val);
|
||||
reinterpret_cast<__m256i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
|
||||
reinterpret_cast<__m256i*>(to), from.val);
|
||||
reinterpret_cast<__m256i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
|
||||
reinterpret_cast<__m256i*>(to), from.val);
|
||||
reinterpret_cast<__m256i*>(to), from.m_val);
|
||||
}
|
||||
|
||||
// Aligned store
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
|
||||
// Extract first element.
|
||||
@ -382,15 +335,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
|
||||
return _mm256_extract_epi16_N0(a.val);
|
||||
return _mm256_extract_epi16_N0(a.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
|
||||
return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.val));
|
||||
return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.m_val));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
|
||||
return _mm256_extract_epi8_N0(a.val);
|
||||
return _mm256_extract_epi8_N0(a.m_val);
|
||||
}
|
||||
|
||||
// Initialize to constant value.
|
||||
@ -411,7 +364,7 @@ EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
|
||||
const Packet8q32i& b) {
|
||||
return _mm256_add_epi32(a.val, b.val);
|
||||
return _mm256_add_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
|
||||
@ -420,62 +373,62 @@ EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
|
||||
const Packet8q32i& b) {
|
||||
return _mm256_sub_epi32(a.val, b.val);
|
||||
return _mm256_sub_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
// Note: mullo truncates the result to 32 bits.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
|
||||
const Packet8q32i& b) {
|
||||
return _mm256_mullo_epi32(a.val, b.val);
|
||||
return _mm256_mullo_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
|
||||
return _mm256_sub_epi32(_mm256_setzero_si256(), a.val);
|
||||
return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val);
|
||||
}
|
||||
|
||||
// Min and max.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
|
||||
const Packet8q32i& b) {
|
||||
return _mm256_min_epi32(a.val, b.val);
|
||||
return _mm256_min_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
|
||||
const Packet8q32i& b) {
|
||||
return _mm256_max_epi32(a.val, b.val);
|
||||
return _mm256_max_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
|
||||
const Packet16q16i& b) {
|
||||
return _mm256_min_epi16(a.val, b.val);
|
||||
return _mm256_min_epi16(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
|
||||
const Packet16q16i& b) {
|
||||
return _mm256_max_epi16(a.val, b.val);
|
||||
return _mm256_max_epi16(a.m_val, b.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
|
||||
const Packet32q8u& b) {
|
||||
return _mm256_min_epu8(a.val, b.val);
|
||||
return _mm256_min_epu8(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
|
||||
const Packet32q8u& b) {
|
||||
return _mm256_max_epu8(a.val, b.val);
|
||||
return _mm256_max_epu8(a.m_val, b.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
|
||||
const Packet32q8i& b) {
|
||||
return _mm256_min_epi8(a.val, b.val);
|
||||
return _mm256_min_epi8(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
|
||||
const Packet32q8i& b) {
|
||||
return _mm256_max_epi8(a.val, b.val);
|
||||
return _mm256_max_epi8(a.m_val, b.m_val);
|
||||
}
|
||||
|
||||
// Reductions.
|
||||
|
@ -6,33 +6,10 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
typedef struct Packet64q8i {
|
||||
__m512i val;
|
||||
operator __m512i() const { return val; }
|
||||
Packet64q8i();
|
||||
Packet64q8i(__m512i val) : val(val) {}
|
||||
} Packet64q8i;
|
||||
|
||||
typedef struct Packet32q16i {
|
||||
__m512i val;
|
||||
operator __m512i() const { return val; }
|
||||
Packet32q16i();
|
||||
Packet32q16i(__m512i val) : val(val) {}
|
||||
} Packet32q16i;
|
||||
|
||||
typedef struct Packet64q8u {
|
||||
__m512i val;
|
||||
operator __m512i() const { return val; }
|
||||
Packet64q8u();
|
||||
Packet64q8u(__m512i val) : val(val) {}
|
||||
} Packet64q8u;
|
||||
|
||||
typedef struct Packet16q32i {
|
||||
__m512i val;
|
||||
operator __m512i() const { return val; }
|
||||
Packet16q32i();
|
||||
Packet16q32i(__m512i val) : val(val) {}
|
||||
} Packet16q32i;
|
||||
typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i;
|
||||
typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i;
|
||||
typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u;
|
||||
typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i;
|
||||
|
||||
template <>
|
||||
struct packet_traits<QInt8> : default_packet_traits {
|
||||
@ -216,44 +193,44 @@ EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
|
||||
reinterpret_cast<__m512i*>(to), from.val);
|
||||
reinterpret_cast<__m512i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
|
||||
reinterpret_cast<__m512i*>(to), from.val);
|
||||
reinterpret_cast<__m512i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
|
||||
reinterpret_cast<__m512i*>(to), from.val);
|
||||
reinterpret_cast<__m512i*>(to), from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
|
||||
reinterpret_cast<__m512i*>(to), from.val);
|
||||
reinterpret_cast<__m512i*>(to), from.m_val);
|
||||
}
|
||||
|
||||
// Aligned store
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
|
||||
from.val);
|
||||
from.m_val);
|
||||
}
|
||||
|
||||
// Extract first element.
|
||||
@ -264,15 +241,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
|
||||
return static_cast<uint8_t>(
|
||||
_mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
|
||||
_mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
|
||||
return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0);
|
||||
return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
|
||||
return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0);
|
||||
return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
|
||||
}
|
||||
|
||||
// Initialize to constant value.
|
||||
@ -297,46 +274,46 @@ EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
|
||||
const Packet16q32i& b) {
|
||||
return _mm512_add_epi32(a.val, b.val);
|
||||
return _mm512_add_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
|
||||
const Packet16q32i& b) {
|
||||
return _mm512_sub_epi32(a.val, b.val);
|
||||
return _mm512_sub_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
// Note: mullo truncates the result to 32 bits.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
|
||||
const Packet16q32i& b) {
|
||||
return _mm512_mullo_epi32(a.val, b.val);
|
||||
return _mm512_mullo_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
|
||||
return _mm512_sub_epi32(_mm512_setzero_si512(), a.val);
|
||||
return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val);
|
||||
}
|
||||
|
||||
// Min and max.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
|
||||
const Packet16q32i& b) {
|
||||
return _mm512_min_epi32(a.val, b.val);
|
||||
return _mm512_min_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
|
||||
const Packet16q32i& b) {
|
||||
return _mm512_max_epi32(a.val, b.val);
|
||||
return _mm512_max_epi32(a.m_val, b.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
|
||||
const Packet64q8u& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_min_epu8(a.val, b.val);
|
||||
return _mm512_min_epu8(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_min_epu8(ap0, bp0);
|
||||
__m256i r1 = _mm256_min_epu8(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -346,12 +323,12 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
|
||||
const Packet64q8u& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_max_epu8(a.val, b.val);
|
||||
return _mm512_max_epu8(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_max_epu8(ap0, bp0);
|
||||
__m256i r1 = _mm256_max_epu8(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -362,12 +339,12 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
|
||||
const Packet64q8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_min_epi8(a.val, b.val);
|
||||
return _mm512_min_epi8(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_min_epi8(ap0, bp0);
|
||||
__m256i r1 = _mm256_min_epi8(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -377,12 +354,12 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
|
||||
const Packet32q16i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_min_epi16(a.val, b.val);
|
||||
return _mm512_min_epi16(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_min_epi16(ap0, bp0);
|
||||
__m256i r1 = _mm256_min_epi16(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -392,12 +369,12 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
|
||||
const Packet64q8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_max_epi8(a.val, b.val);
|
||||
return _mm512_max_epi8(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_max_epi8(ap0, bp0);
|
||||
__m256i r1 = _mm256_max_epi8(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -407,12 +384,12 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
|
||||
const Packet32q16i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512BW
|
||||
return _mm512_max_epi16(a.val, b.val);
|
||||
return _mm512_max_epi16(a.m_val, b.m_val);
|
||||
#else
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
|
||||
__m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
|
||||
__m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
|
||||
__m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
|
||||
__m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
|
||||
__m256i r0 = _mm256_max_epi16(ap0, bp0);
|
||||
__m256i r1 = _mm256_max_epi16(ap1, bp1);
|
||||
return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
|
||||
@ -422,112 +399,112 @@ EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
|
||||
// Reductions.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
|
||||
res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(
|
||||
_mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
return pfirst(res);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
|
||||
res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(
|
||||
_mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
return pfirst(res);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
|
||||
res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::min(
|
||||
{static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
|
||||
res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::max(
|
||||
{static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
|
||||
res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::min(
|
||||
{static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
|
||||
static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
|
||||
res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::max(
|
||||
{static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
|
||||
static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
|
||||
res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::min(
|
||||
{static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
|
||||
static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
|
||||
Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
|
||||
Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
|
||||
Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
|
||||
Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
|
||||
Packet4i res =
|
||||
_mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
|
||||
res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
std::uint32_t w = pfirst(
|
||||
_mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
std::uint32_t w = pfirst(res);
|
||||
return std::min(
|
||||
{static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
|
||||
static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
|
||||
|
@ -13,7 +13,7 @@ struct type_casting_traits<QInt32, float> {
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) {
|
||||
return _mm256_cvtepi32_ps(a.val);
|
||||
return _mm256_cvtepi32_ps(a.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -35,8 +35,8 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet32q8i
|
||||
pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b,
|
||||
const Packet8q32i& c, const Packet8q32i& d) {
|
||||
__m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val),
|
||||
_mm256_packs_epi32(c.val, d.val));
|
||||
__m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val),
|
||||
_mm256_packs_epi32(c.m_val, d.m_val));
|
||||
// Since packs does not cross 128 bit lane boundaries,
|
||||
// we have to permute to properly order the final result.
|
||||
const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
|
||||
|
@ -14,7 +14,7 @@ struct type_casting_traits<QInt32, float> {
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) {
|
||||
return _mm512_cvtepi32_ps(a.val);
|
||||
return _mm512_cvtepi32_ps(a.m_val);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
Loading…
Reference in New Issue
Block a user