Added AVX-512 support to Eigen and TensorFlow.
Change: 140553382
This commit is contained in:
parent
07569510a0
commit
35b6050b57
@ -66,8 +66,13 @@ struct AllocatorStats {
|
|||||||
// device memory.
|
// device memory.
|
||||||
class Allocator {
|
class Allocator {
|
||||||
public:
|
public:
|
||||||
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
|
// Align to 64 byte boundary.
|
||||||
|
static constexpr size_t kAllocatorAlignment = 64;
|
||||||
|
#else
|
||||||
// Align to 32 byte boundary.
|
// Align to 32 byte boundary.
|
||||||
static constexpr size_t kAllocatorAlignment = 32;
|
static constexpr size_t kAllocatorAlignment = 32;
|
||||||
|
#endif
|
||||||
|
|
||||||
virtual ~Allocator();
|
virtual ~Allocator();
|
||||||
|
|
||||||
|
@ -329,7 +329,11 @@ struct AvgPoolMeanReducer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
|
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
|
||||||
#ifdef EIGEN_VECTORIZE_AVX
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
|
#define pequal(a, b) \
|
||||||
|
_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)
|
||||||
|
#define psel(a, b, false_mask) _mm512_ternarylogic_epi64(false_mask, a, b, 0xca)
|
||||||
|
#elif defined EIGEN_VECTORIZE_AVX
|
||||||
#define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ)
|
#define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ)
|
||||||
#define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask)
|
#define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask)
|
||||||
#else
|
#else
|
||||||
|
@ -209,6 +209,77 @@ EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f
|
||||||
|
pbroadcast_first<Packet16f>(const Packet16f& a_in) {
|
||||||
|
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||||
|
return _mm512_broadcastss_ps(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f
|
||||||
|
pbroadcast_second<Packet16f>(const Packet16f& a_in) {
|
||||||
|
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||||
|
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f
|
||||||
|
pbroadcast_third<Packet16f>(const Packet16f& a_in) {
|
||||||
|
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||||
|
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f
|
||||||
|
pbroadcast_fourth<Packet16f>(const Packet16f& a_in) {
|
||||||
|
Packet4f a = _mm512_castps512_ps128(a_in);
|
||||||
|
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) {
|
||||||
|
Packet2d a = _mm512_castpd512_pd128(a_in);
|
||||||
|
return _mm512_broadcastsd_pd(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
|
||||||
|
Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3);
|
||||||
|
return _mm512_broadcastsd_pd(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
|
||||||
|
Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
|
||||||
|
return _mm512_broadcastsd_pd(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
|
||||||
|
Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
|
||||||
|
return _mm512_broadcastsd_pd(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16i
|
||||||
|
pbroadcast_first<Packet16i>(const Packet16i& a_in) {
|
||||||
|
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||||
|
return _mm512_broadcastd_epi32(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16i
|
||||||
|
pbroadcast_second<Packet16i>(const Packet16i& a_in) {
|
||||||
|
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||||
|
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16i
|
||||||
|
pbroadcast_third<Packet16i>(const Packet16i& a_in) {
|
||||||
|
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||||
|
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16i
|
||||||
|
pbroadcast_fourth<Packet16i>(const Packet16i& a_in) {
|
||||||
|
Packet4i a = _mm512_castsi512_si128(a_in);
|
||||||
|
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_AVX
|
#ifdef EIGEN_VECTORIZE_AVX
|
||||||
// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
|
// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
|
||||||
template <>
|
template <>
|
||||||
@ -245,6 +316,25 @@ EIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) {
|
|||||||
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
|
// Return a Packet with 4 floats loaded from 4 bfloat16 values
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) {
|
||||||
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
__m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
|
||||||
|
return _mm512_castps128_ps512(
|
||||||
|
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||||
|
}
|
||||||
|
// Return a Packet with 2 floats loaded from 2 bfloat16 values
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) {
|
||||||
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
__m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
|
||||||
|
return _mm512_castps128_ps512(
|
||||||
|
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
|
// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
|
||||||
// of the 128-bit lane
|
// of the 128-bit lane
|
||||||
template <typename Packet>
|
template <typename Packet>
|
||||||
@ -312,6 +402,22 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
|
|||||||
_mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
|
_mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
|
|
||||||
|
template <typename Packet>
|
||||||
|
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
|
||||||
|
return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
|
||||||
|
16);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Packet>
|
||||||
|
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
|
||||||
|
return _mm512_slli_epi32(
|
||||||
|
_mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
@ -200,7 +200,7 @@ class SparseMatmulOpTest : public ::testing::Test {
|
|||||||
|
|
||||||
// zero out lower 16-bits of mantissa of data3 values
|
// zero out lower 16-bits of mantissa of data3 values
|
||||||
// copy bfloat representation to data3_bfloat16
|
// copy bfloat representation to data3_bfloat16
|
||||||
for (int i = 0; i < kMaxPacketSize; ++i) {
|
for (int i = 0; i < kMaxPacketSize * 2; ++i) {
|
||||||
uint16_t* data3_p = reinterpret_cast<uint16_t*>(&data3[i]);
|
uint16_t* data3_p = reinterpret_cast<uint16_t*>(&data3[i]);
|
||||||
uint16_t* data3_bfloat16_p =
|
uint16_t* data3_bfloat16_p =
|
||||||
reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
|
reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
|
||||||
@ -222,7 +222,13 @@ class SparseMatmulOpTest : public ::testing::Test {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_AVX512
|
||||||
static const int kMaxPacketSize = 16;
|
static const int kMaxPacketSize = 16;
|
||||||
|
#elif defined EIGEN_VECTORIZE_AVX || defined EIGEN_VECTORIZE_AVX2
|
||||||
|
static const int kMaxPacketSize = 8;
|
||||||
|
#else
|
||||||
|
static const int kMaxPacketSize = 4;
|
||||||
|
#endif
|
||||||
typedef typename Eigen::internal::packet_traits<float>::type Packet;
|
typedef typename Eigen::internal::packet_traits<float>::type Packet;
|
||||||
const int PacketSize;
|
const int PacketSize;
|
||||||
// float values
|
// float values
|
||||||
@ -230,9 +236,9 @@ class SparseMatmulOpTest : public ::testing::Test {
|
|||||||
// output of intrinsics
|
// output of intrinsics
|
||||||
EIGEN_ALIGN_MAX float data2[kMaxPacketSize];
|
EIGEN_ALIGN_MAX float data2[kMaxPacketSize];
|
||||||
// float values with only 7 mantissa bits (bfloat representable)
|
// float values with only 7 mantissa bits (bfloat representable)
|
||||||
EIGEN_ALIGN_MAX float data3[kMaxPacketSize];
|
EIGEN_ALIGN_MAX float data3[kMaxPacketSize * 2];
|
||||||
// bfloat16 representation of data3
|
// bfloat16 representation of data3
|
||||||
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize / 2];
|
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize];
|
||||||
EIGEN_ALIGN_MAX float ref[kMaxPacketSize];
|
EIGEN_ALIGN_MAX float ref[kMaxPacketSize];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user