Fix alignment crashes in AVX512 builds (#19121)

* Fix issue #15588 by simplifying the code

The allocator.h code tried to be clever and use 32 byte alignment for SSE/AVX2/etc use,
and 64 byte alignment for AVX512.

Unfortunately, the #ifdef in use (from EIGEN) is not useful; the bazel BUILD files do
not propagate the tf_copts() compiler flags when the allocator.cc/allocator.h files get
compiled, so EIGEN does not see the actual AVX512-enabling compiler flags...

Rather than changing compiler flag propagation throughout a whole bunch of code,
there's an opportunity to just simplify the code and always use 64 byte alignment.
Yes, it wastes a bit of space, but on the other hand these allocations are now
cache-line aligned, which isn't a bad thing... and an #ifdef can be dropped.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>

* Set EIGEN_MAX_ALIGN_BYTES=64

This patch sets a 64 byte upper bound on the alignment of memory allocated by
Eigen.  This is necessary to prevent crashes during the execution of the unit
tests when they are compiled with AVX512 support.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update the tensorflow/compiler/aot tests for 64 byte alignment

Modifications to the tensorflow/core/framework/allocator.h to always
use 64 byte alignment causes failures in the tensorflow/compiler/aot
unit tests.  This patch updates these tests so that they pass with
64 byte aligned allocated memory.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update Tensor.Slice_Basic for 64 byte alignment

The test case

//tensorflow/core:framework_tensor_test:Tensor.Slice_Basic

fails with EIGEN_MAX_ALIGN_BYTES set to 64.  The reason is that the
slices it takes of the sample tensor are 32 byte and not 64 byte
aligned.  This commit increases one of the dimensions of the original
tensor to ensure that the slices taken by the test cases are indeed 64
byte aligned.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update ScopedAllocatorConcatOpTest.Reshape for 64 byte alignment

The ScopedAllocatorConcatOpTest.Reshape test requires that the elements
of the field_shapes parameter of ExecOp are multiples of
Allocator::kAllocatorAlignment in size.  If they are not, the backing
tensor allocated by PrepOp will have too many elements and reshaping
will fail.  This commit modifies the test case, making the elements
64 bytes in size, the new value for Allocator::kAllocatorAlignment.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>
This commit is contained in:
Mark Ryan 2018-05-17 18:17:39 +01:00 committed by Rasmus Munk Larsen
parent 9b41e5158e
commit ba30ba07b2
7 changed files with 32 additions and 31 deletions

View File

@ -56,9 +56,9 @@ namespace bar {
//
// Memory stats:
// arg bytes total: 104
// arg bytes aligned: 128
// arg bytes aligned: 192
// temp bytes total: 126
// temp bytes aligned: 224
// temp bytes aligned: 320
class MyClass : public tensorflow::XlaCompiledCpuFunction {
public:
// Number of input arguments for the compiled computation.

View File

@ -25,8 +25,8 @@ namespace tensorflow {
namespace tfcompile {
namespace runtime {
// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
static constexpr size_t kAlign = 32;
// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
static constexpr size_t kAlign = 64;
// aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1
// values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign

View File

@ -24,7 +24,7 @@ namespace runtime {
namespace {
TEST(Runtime, AlignmentValue) {
// We've chosen 32 byte alignment for the tfcompile runtime to mimic the
// We've chosen 64 byte alignment for the tfcompile runtime to mimic the
// regular tensorflow allocator, which was chosen to play nicely with Eigen.
// The tfcompile runtime also has a requirement that comes from the xla
// generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) {
EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0);
static constexpr intptr_t sizesB[1] = {3};
EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32);
EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64);
static constexpr intptr_t sizesC[1] = {32};
EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32);
EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64);
static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192);
EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320);
}
void* add_ptr(void* base, uintptr_t delta) {
@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) {
EXPECT_NE(base, nullptr);
EXPECT_EQ(bufD[0], add_ptr(base, 0));
EXPECT_EQ(bufD[1], nullptr);
EXPECT_EQ(bufD[2], add_ptr(base, 32));
EXPECT_EQ(bufD[2], add_ptr(base, 64));
EXPECT_EQ(bufD[3], nullptr);
EXPECT_EQ(bufD[4], add_ptr(base, 64));
EXPECT_EQ(bufD[5], add_ptr(base, 128));
EXPECT_EQ(bufD[6], add_ptr(base, 160));
EXPECT_EQ(bufD[4], add_ptr(base, 128));
EXPECT_EQ(bufD[5], add_ptr(base, 192));
EXPECT_EQ(bufD[6], add_ptr(base, 256));
for (int i = 0; i < 7; ++i) {
const intptr_t size = sizesD[i];
if (size != -1) {

View File

@ -67,13 +67,8 @@ struct AllocatorStats {
// device memory.
class Allocator {
public:
#ifdef EIGEN_VECTORIZE_AVX512
// Align to 64 byte boundary.
static constexpr size_t kAllocatorAlignment = 64;
#else
// Align to 32 byte boundary.
static constexpr size_t kAllocatorAlignment = 32;
#endif
virtual ~Allocator();

View File

@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
// On the alignment.
//
// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
// alignment. Tensor::tensor/flat/vec/matrix methods requires the
// buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
// its result is aligned if the caller intends to use those methods.
// In this test case, we simply make sure each slice is 32-byte
// aligned: sizeof(float) * 4 * 2 = 32.
// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
// the caller to ensure its result is aligned if the caller intends
// to use those methods. In this test case, we simply make sure each
// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0.
TEST(Tensor, Slice_Basic) {
Tensor saved;
{ // General
Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
// Fills in known values.
for (int i = 0; i < 10; ++i) {
x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
}
// A simple slice along dim0.
Tensor y = x.Slice(4, 8);
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34})));
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36})));
auto tx = x.tensor<float, 3>();
auto ty = y.tensor<float, 3>();
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(ty(i, j, k), 4.0 + i);
EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k));
}
@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) {
auto tz = z.tensor<float, 3>();
EXPECT_EQ(1, z.dim_size(0));
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(tz(0, j, k), 6.0);
}
}
@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) {
EXPECT_EQ(1, saved.dim_size(0));
auto tsaved = saved.tensor<float, 3>();
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(tsaved(0, j, k), 6.0);
}
}
}
{ // Empty
Tensor x(DT_FLOAT, TensorShape({10, 0, 34}));
Tensor x(DT_FLOAT, TensorShape({10, 0, 36}));
x.flat<float>().setRandom();
Tensor y = x.Slice(4, 8);
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34})));
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36})));
}
{

View File

@ -212,8 +212,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) {
}
TEST_F(ScopedAllocatorConcatOpTest, Reshape) {
MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2);
ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2);
// The elements of the third parameter to ExecOp must be multiples of
// Allocator::kAllocatorAlignment in size. If they are not, the backing
// tensor allocated by PrepOp will have too many elements and reshaping
// will fail.
ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}});
}
TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) {

View File

@ -64,6 +64,7 @@ cc_library(
# This define (mostly) guarantees we don't link any problematic
# code. We use it, but we do not rely on it, as evidenced above.
"EIGEN_MPL2_ONLY",
"EIGEN_MAX_ALIGN_BYTES=64",
],
includes = ["."],
visibility = ["//visibility:public"],