Added AVX-512 support to Eigen and TensorFlow.

Change: 140553382
This commit is contained in:
A. Unique TensorFlower 2016-11-29 18:08:34 -08:00 committed by TensorFlower Gardener
parent 07569510a0
commit 35b6050b57
4 changed files with 125 additions and 4 deletions

View File

@ -66,8 +66,13 @@ struct AllocatorStats {
// device memory.
class Allocator {
public:
#ifdef EIGEN_VECTORIZE_AVX512
// Align to 64 byte boundary.
static constexpr size_t kAllocatorAlignment = 64;
#else
// Align to 32 byte boundary.
static constexpr size_t kAllocatorAlignment = 32;
#endif
virtual ~Allocator();

View File

@ -329,7 +329,11 @@ struct AvgPoolMeanReducer {
}
#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
#ifdef EIGEN_VECTORIZE_AVX
#ifdef EIGEN_VECTORIZE_AVX512
#define pequal(a, b) \
_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)
#define psel(a, b, false_mask) _mm512_ternarylogic_epi64(false_mask, a, b, 0xca)
#elif defined EIGEN_VECTORIZE_AVX
#define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ)
#define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask)
#else

View File

@ -209,6 +209,77 @@ EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
#endif
#ifdef EIGEN_VECTORIZE_AVX512
template <>
EIGEN_STRONG_INLINE Packet16f
pbroadcast_first<Packet16f>(const Packet16f& a_in) {
Packet4f a = _mm512_castps512_ps128(a_in);
return _mm512_broadcastss_ps(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f
pbroadcast_second<Packet16f>(const Packet16f& a_in) {
Packet4f a = _mm512_castps512_ps128(a_in);
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
}
template <>
EIGEN_STRONG_INLINE Packet16f
pbroadcast_third<Packet16f>(const Packet16f& a_in) {
Packet4f a = _mm512_castps512_ps128(a_in);
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
}
template <>
EIGEN_STRONG_INLINE Packet16f
pbroadcast_fourth<Packet16f>(const Packet16f& a_in) {
Packet4f a = _mm512_castps512_ps128(a_in);
return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) {
Packet2d a = _mm512_castpd512_pd128(a_in);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet16i
pbroadcast_first<Packet16i>(const Packet16i& a_in) {
Packet4i a = _mm512_castsi512_si128(a_in);
return _mm512_broadcastd_epi32(a);
}
template <>
EIGEN_STRONG_INLINE Packet16i
pbroadcast_second<Packet16i>(const Packet16i& a_in) {
Packet4i a = _mm512_castsi512_si128(a_in);
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)));
}
template <>
EIGEN_STRONG_INLINE Packet16i
pbroadcast_third<Packet16i>(const Packet16i& a_in) {
Packet4i a = _mm512_castsi512_si128(a_in);
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)));
}
template <>
EIGEN_STRONG_INLINE Packet16i
pbroadcast_fourth<Packet16i>(const Packet16i& a_in) {
Packet4i a = _mm512_castsi512_si128(a_in);
return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)));
}
#endif
#ifdef EIGEN_VECTORIZE_AVX
// For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
template <>
@ -245,6 +316,25 @@ EIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) {
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
}
#ifdef EIGEN_VECTORIZE_AVX512
// Return a Packet with 4 floats loaded from 4 bfloat16 values
template <>
EIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) {
__m128i zero = _mm_setzero_si128();
__m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
return _mm512_castps128_ps512(
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
}
// Return a Packet with 2 floats loaded from 2 bfloat16 values
template <>
EIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) {
__m128i zero = _mm_setzero_si128();
__m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
return _mm512_castps128_ps512(
_mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
}
#endif
// For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
// of the 128-bit lane
template <typename Packet>
@ -312,6 +402,22 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
_mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
}
#endif
#ifdef EIGEN_VECTORIZE_AVX512
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
16);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
return _mm512_slli_epi32(
_mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
}
#endif
} // namespace internal
} // namespace Eigen

View File

@ -200,7 +200,7 @@ class SparseMatmulOpTest : public ::testing::Test {
// zero out lower 16-bits of mantissa of data3 values
// copy bfloat representation to data3_bfloat16
for (int i = 0; i < kMaxPacketSize; ++i) {
for (int i = 0; i < kMaxPacketSize * 2; ++i) {
uint16_t* data3_p = reinterpret_cast<uint16_t*>(&data3[i]);
uint16_t* data3_bfloat16_p =
reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
@ -222,7 +222,13 @@ class SparseMatmulOpTest : public ::testing::Test {
return true;
}
#ifdef EIGEN_VECTORIZE_AVX512
static const int kMaxPacketSize = 16;
#elif defined EIGEN_VECTORIZE_AVX || defined EIGEN_VECTORIZE_AVX2
static const int kMaxPacketSize = 8;
#else
static const int kMaxPacketSize = 4;
#endif
typedef typename Eigen::internal::packet_traits<float>::type Packet;
const int PacketSize;
// float values
@ -230,9 +236,9 @@ class SparseMatmulOpTest : public ::testing::Test {
// output of intrinsics
EIGEN_ALIGN_MAX float data2[kMaxPacketSize];
// float values with only 7 mantissa bits (bfloat representable)
EIGEN_ALIGN_MAX float data3[kMaxPacketSize];
EIGEN_ALIGN_MAX float data3[kMaxPacketSize * 2];
// bfloat16 representation of data3
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize / 2];
EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize];
EIGEN_ALIGN_MAX float ref[kMaxPacketSize];
};