Added AVX-512 support to Eigen and TensorFlow.
Change: 140553382
This commit is contained in:
parent
07569510a0
commit
35b6050b57
@ -66,8 +66,13 @@ struct AllocatorStats {
|
||||
// device memory.
|
||||
class Allocator {
|
||||
public:
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
// Align to 64 byte boundary.
|
||||
static constexpr size_t kAllocatorAlignment = 64;
|
||||
#else
|
||||
// Align to 32 byte boundary.
|
||||
static constexpr size_t kAllocatorAlignment = 32;
|
||||
#endif
|
||||
|
||||
virtual ~Allocator();
|
||||
|
||||
|
@ -329,7 +329,11 @@ struct AvgPoolMeanReducer {
|
||||
}
|
||||
|
||||
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
#define pequal(a, b) \
|
||||
_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)
|
||||
#define psel(a, b, false_mask) _mm512_ternarylogic_epi64(false_mask, a, b, 0xca)
|
||||
#elif defined EIGEN_VECTORIZE_AVX
|
||||
#define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ)
|
||||
#define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask)
|
||||
#else
|
||||
|
@ -209,6 +209,77 @@ EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f
|
||||
pbroadcast_first<Packet16f>(const Packet16f& a_in) {
|
||||
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||
return _mm512_broadcastss_ps(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f
|
||||
pbroadcast_second<Packet16f>(const Packet16f& a_in) {
|
||||
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f
|
||||
pbroadcast_third<Packet16f>(const Packet16f& a_in) {
|
||||
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f
|
||||
pbroadcast_fourth<Packet16f>(const Packet16f& a_in) {
|
||||
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) {
|
||||
Packet2d a = _mm512_castpd512_pd128(a_in);
|
||||
return _mm512_broadcastsd_pd(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
|
||||
Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3);
|
||||
return _mm512_broadcastsd_pd(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
|
||||
Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
|
||||
return _mm512_broadcastsd_pd(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
|
||||
Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
|
||||
return _mm512_broadcastsd_pd(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i
|
||||
pbroadcast_first<Packet16i>(const Packet16i& a_in) {
|
||||
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||
return _mm512_broadcastd_epi32(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i
|
||||
pbroadcast_second<Packet16i>(const Packet16i& a_in) {
|
||||
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i
|
||||
pbroadcast_third<Packet16i>(const Packet16i& a_in) {
|
||||
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i
|
||||
pbroadcast_fourth<Packet16i>(const Packet16i& a_in) {
|
||||
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
|
||||
template <>
|
||||
@ -245,6 +316,25 @@ EIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) {
|
||||
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
// Return a Packet with 4 floats loaded from 4 bfloat16 values
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) {
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
|
||||
return _mm512_castps128_ps512(
|
||||
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||
}
|
||||
// Return a Packet with 2 floats loaded from 2 bfloat16 values
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) {
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
|
||||
return _mm512_castps128_ps512(
|
||||
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||
}
|
||||
#endif
|
||||
|
||||
// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
|
||||
// of the 128-bit lane
|
||||
template <typename Packet>
|
||||
@ -312,6 +402,22 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
|
||||
_mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
|
||||
return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
|
||||
16);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
|
||||
return _mm512_slli_epi32(
|
||||
_mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
|
||||
}
|
||||
|
||||
#endif
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
@ -200,7 +200,7 @@ class SparseMatmulOpTest : public ::testing::Test {
|
||||
|
||||
// zero out lower 16-bits of mantissa of data3 values
|
||||
// copy bfloat representation to data3_bfloat16
|
||||
for (int i = 0; i < kMaxPacketSize; ++i) {
|
||||
for (int i = 0; i < kMaxPacketSize * 2; ++i) {
|
||||
uint16_t* data3_p = reinterpret_cast<uint16_t*>(&data3[i]);
|
||||
uint16_t* data3_bfloat16_p =
|
||||
reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
|
||||
@ -222,7 +222,13 @@ class SparseMatmulOpTest : public ::testing::Test {
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
static const int kMaxPacketSize = 16;
|
||||
#elif defined EIGEN_VECTORIZE_AVX || defined EIGEN_VECTORIZE_AVX2
|
||||
static const int kMaxPacketSize = 8;
|
||||
#else
|
||||
static const int kMaxPacketSize = 4;
|
||||
#endif
|
||||
typedef typename Eigen::internal::packet_traits<float>::type Packet;
|
||||
const int PacketSize;
|
||||
// float values
|
||||
@ -230,9 +236,9 @@ class SparseMatmulOpTest : public ::testing::Test {
|
||||
// output of intrinsics
|
||||
EIGEN_ALIGN_MAX float data2[kMaxPacketSize];
|
||||
// float values with only 7 mantissa bits (bfloat representable)
|
||||
EIGEN_ALIGN_MAX float data3[kMaxPacketSize];
|
||||
EIGEN_ALIGN_MAX float data3[kMaxPacketSize * 2];
|
||||
// bfloat16 representation of data3
|
||||
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize / 2];
|
||||
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize];
|
||||
EIGEN_ALIGN_MAX float ref[kMaxPacketSize];
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user