From 35b6050b574e6b4c4cecf8db2a0c37e48d43b9ea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 Nov 2016 18:08:34 -0800
Subject: [PATCH] Added AVX-512 support to Eigen and TensorFlow.

Change: 140553382
---
 tensorflow/core/framework/allocator.h        |   5 +
 tensorflow/core/kernels/eigen_pooling.h      |  10 +-
 tensorflow/core/kernels/sparse_matmul_op.h   | 109 ++++++++++++++++++++
 .../core/kernels/sparse_matmul_op_test.cc    |  12 +-
 4 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 4f8eb04c957..06859c52908 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -66,8 +66,13 @@ struct AllocatorStats {
 // device memory.
 class Allocator {
  public:
+#ifdef EIGEN_VECTORIZE_AVX512
+  // Align to 64 byte boundary.
+  static constexpr size_t kAllocatorAlignment = 64;
+#else
   // Align to 32 byte boundary.
   static constexpr size_t kAllocatorAlignment = 32;
+#endif
 
   virtual ~Allocator();
 
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 8eea1b0f9de..e13c8b98357 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -329,7 +329,15 @@ struct AvgPoolMeanReducer {
   }
 
 #if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
-#ifdef EIGEN_VECTORIZE_AVX
+#ifdef EIGEN_VECTORIZE_AVX512
+#define pequal(a, b)   \
+  _mm512_castsi512_ps( \
+      _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1))
+#define psel(a, b, false_mask)                          \
+  _mm512_castsi512_ps(_mm512_ternarylogic_epi32(        \
+      _mm512_castps_si512(a), _mm512_castps_si512(b),   \
+      _mm512_castps_si512(false_mask), 0xd8))
+#elif defined EIGEN_VECTORIZE_AVX
 #define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ)
 #define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask)
 #else
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 4e14f0099ab..170d4ec18b9 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -209,6 +209,78 @@ EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
 
 #endif
 
+#ifdef EIGEN_VECTORIZE_AVX512
+template <>
+EIGEN_STRONG_INLINE Packet16f
+pbroadcast_first<Packet16f>(const Packet16f& a_in) {
+  Packet4f a = _mm512_castps512_ps128(a_in);
+  return _mm512_broadcastss_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f
+pbroadcast_second<Packet16f>(const Packet16f& a_in) {
+  Packet4f a = _mm512_castps512_ps128(a_in);
+  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f
+pbroadcast_third<Packet16f>(const Packet16f& a_in) {
+  Packet4f a = _mm512_castps512_ps128(a_in);
+  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f
+pbroadcast_fourth<Packet16f>(const Packet16f& a_in) {
+  Packet4f a = _mm512_castps512_ps128(a_in);
+  return _mm512_broadcastss_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pbroadcast_first<Packet8d>(const Packet8d& a_in) {
+  Packet2d a = _mm512_castpd512_pd128(a_in);
+  return _mm512_broadcastsd_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
+  Packet2d a = _mm_permute_pd(_mm512_castpd512_pd128(a_in), 3);
+  return _mm512_broadcastsd_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
+  Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1);
+  return _mm512_broadcastsd_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
+  Packet2d a =
+      _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3);
+  return _mm512_broadcastsd_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i
+pbroadcast_first<Packet16i>(const Packet16i& a_in) {
+  Packet4i a = _mm512_castsi512_si128(a_in);
+  return _mm512_broadcastd_epi32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i
+pbroadcast_second<Packet16i>(const Packet16i& a_in) {
+  Packet4i a = _mm512_castsi512_si128(a_in);
+  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i
+pbroadcast_third<Packet16i>(const Packet16i& a_in) {
+  Packet4i a = _mm512_castsi512_si128(a_in);
+  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i
+pbroadcast_fourth<Packet16i>(const Packet16i& a_in) {
+  Packet4i a = _mm512_castsi512_si128(a_in);
+  return _mm512_broadcastd_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)));
+}
+#endif
+
 #ifdef EIGEN_VECTORIZE_AVX
 // For a Packet of Size 8 floats(256-bits), swap the 2nd and 3rd quadwords
 template <>
@@ -245,6 +317,25 @@ EIGEN_STRONG_INLINE Packet8f pload2bf16<Packet8f>(const float* from) {
       _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
 }
 
+#ifdef EIGEN_VECTORIZE_AVX512
+// Return a Packet with 4 floats loaded from 4 bfloat16 values
+template <>
+EIGEN_STRONG_INLINE Packet16f pload4bf16<Packet16f>(const float* from) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i tmp = _mm_castpd_si128(_mm_load_pd1((const double*)from));
+  return _mm512_castps128_ps512(
+      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
+}
+// Return a Packet with 2 floats loaded from 2 bfloat16 values
+template <>
+EIGEN_STRONG_INLINE Packet16f pload2bf16<Packet16f>(const float* from) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i tmp = _mm_castps_si128(_mm_load_ps1(from));
+  return _mm512_castps128_ps512(
+      _mm_castsi128_ps(_mm_unpacklo_epi16(zero, tmp)));
+}
+#endif
+
 // For each 128-bit lane convert 4 bfloat to 4 float values from the lower half
 // of the 128-bit lane
 template <typename Packet>
@@ -312,6 +403,24 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
       _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_permute_ps(a, 3))));
 }
 
+#endif
+
+#ifdef EIGEN_VECTORIZE_AVX512
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(
+      _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))),
+      16));
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(
+      _mm512_cvtepu16_epi32(
+          _mm512_extracti64x4_epi64(_mm512_castps_si512(from), 1)), 16));
+}
+
 #endif
 
 }  // namespace internal
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index 45cad2e23b1..b155e45187c 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -200,7 +200,7 @@ class SparseMatmulOpTest : public ::testing::Test {
 
     // zero out lower 16-bits of mantissa of data3 values
     // copy bfloat representation to data3_bfloat16
-    for (int i = 0; i < kMaxPacketSize; ++i) {
+    for (int i = 0; i < kMaxPacketSize * 2; ++i) {
       uint16_t* data3_p = reinterpret_cast<uint16_t*>(&data3[i]);
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
@@ -222,7 +222,13 @@ class SparseMatmulOpTest : public ::testing::Test {
     return true;
   }
 
+#ifdef EIGEN_VECTORIZE_AVX512
   static const int kMaxPacketSize = 16;
+#elif defined EIGEN_VECTORIZE_AVX || defined EIGEN_VECTORIZE_AVX2
+  static const int kMaxPacketSize = 8;
+#else
+  static const int kMaxPacketSize = 4;
+#endif
   typedef typename Eigen::internal::packet_traits<float>::type Packet;
   const int PacketSize;
   // float values
@@ -230,9 +236,9 @@ class SparseMatmulOpTest : public ::testing::Test {
   // output of intrinsics
   EIGEN_ALIGN_MAX float data2[kMaxPacketSize];
   // float values with only 7 mantissa bits (bfloat representable)
-  EIGEN_ALIGN_MAX float data3[kMaxPacketSize];
+  EIGEN_ALIGN_MAX float data3[kMaxPacketSize * 2];
   // bfloat16 representation of data3
-  EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize / 2];
+  EIGEN_ALIGN_MAX float data3_bfloat16[kMaxPacketSize];
   EIGEN_ALIGN_MAX float ref[kMaxPacketSize];
 };
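
The allocator hunk doubles the alignment because one ZMM register spans 64
bytes, and aligned AVX-512 loads and stores such as _mm512_load_ps fault on
anything less. A minimal standalone sketch of that size/alignment relationship
in plain C++ (Avx512Block is an illustrative name, not a TensorFlow type):

// One 512-bit register covers 16 floats = 64 bytes, hence the 64-byte
// kAllocatorAlignment in the patch.
struct alignas(64) Avx512Block {
  float lanes[16];
};
static_assert(sizeof(Avx512Block) == 64, "one full ZMM register");
static_assert(alignof(Avx512Block) == 64, "matches kAllocatorAlignment");

int main() { return 0; }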
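For readers unfamiliar with _mm512_ternarylogic_epi32 as used in the psel
macro: its imm8 argument is a truth table indexed by (a<<2)|(b<<1)|c, applied
to each bit triple of the three inputs, and 0xd8 encodes "c ? b : a", which
reproduces _mm256_blendv_ps(a, b, mask) at 512-bit width. A scalar sketch of
that lookup (ternarylogic below is an illustrative helper, not an Intel API):

#include <cassert>
#include <cstdint>

// Evaluate the AVX-512 ternary-logic truth table bit by bit: for each bit
// position, look up imm at index (a<<2)|(b<<1)|c built from the three inputs.
uint32_t ternarylogic(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
  uint32_t r = 0;
  for (int bit = 0; bit < 32; ++bit) {
    int idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) |
              ((c >> bit) & 1);
    r |= static_cast<uint32_t>((imm >> idx) & 1) << bit;
  }
  return r;
}

int main() {
  uint32_t a = 0x11111111, b = 0x22222222, mask = 0xFFFF0000;
  // 0xd8: pick b where the mask bit is set, a elsewhere -- blendv semantics.
  assert(ternarylogic(a, b, mask, 0xd8) == 0x22221111);
  return 0;
}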
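The pbroadcast_{first,second,third,fourth} specializations in the patch all
implement the same contract: replicate one of the first four scalars of the
input packet into every lane (the float/int versions shuffle within the low
128 bits first; the double versions pull from the low 256 bits). A scalar
reference model (pbroadcast_ref is an illustrative name, not an Eigen API):

#include <cassert>
#include <cstddef>

// Reference semantics: out[i] = in[which] for all lanes, with which in 0..3
// standing for first/second/third/fourth.
template <typename T, size_t N>
void pbroadcast_ref(const T (&in)[N], T (&out)[N], int which) {
  for (size_t i = 0; i < N; ++i) out[i] = in[which];
}

int main() {
  float in[16] = {1.f, 2.f, 3.f, 4.f};  // lanes 4..15 zero-initialized
  float out[16];
  pbroadcast_ref(in, out, 2);  // pbroadcast_third
  for (float v : out) assert(v == 3.f);
  return 0;
}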
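The bfloat16 helpers (pload4bf16/pload2bf16 and pexpand_bf16_l/pexpand_bf16_u)
all rely on one fact: a bfloat16 value is exactly the upper 16 bits of an
IEEE-754 float, so widening it back is a 16-bit left shift into a zeroed low
half. That is what _mm_unpacklo_epi16(zero, tmp) and
_mm512_slli_epi32(..., 16) compute, sixteen lanes at a time in the AVX-512
variants. A scalar sketch of the round trip (float_to_bfloat16 and
bfloat16_to_float are illustrative helpers, not TensorFlow functions):

#include <cassert>
#include <cstdint>
#include <cstring>

// Truncate a float to bfloat16: keep the top 16 bits (sign, exponent, and
// 7 mantissa bits). This is the representation the test's data3 values use.
uint16_t float_to_bfloat16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

// Widen bfloat16 back to float: shift into the high half, low bits zero --
// the scalar analogue of _mm512_slli_epi32(_mm512_cvtepu16_epi32(x), 16).
float bfloat16_to_float(uint16_t bf) {
  uint32_t bits = static_cast<uint32_t>(bf) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  assert(bfloat16_to_float(0x3f80) == 1.0f);  // 0x3f800000 is 1.0f
  assert(float_to_bfloat16(1.0f) == 0x3f80);
  assert(bfloat16_to_float(float_to_bfloat16(3.0f)) == 3.0f);  // exact
  return 0;
}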