Fix alignment crashes in AVX512 builds (#19121)

* Fix issue #15588 by simplifying the code

The allocator.h code tried to be clever and use 32 byte alignment for SSE/AVX2/etc use,
and 64 byte alignment for AVX512.

Unfortunately, the #ifdef in use (from EIGEN) is not useful; the bazel BUILD files do
not propagate the tf_copts() compiler flags when the allocator.cc/allocator.h files get
compiled, so EIGEN does not see the actual AVX512-enabling compiler flags...

Rather than changing compiler flag propagation throughout a whole bunch of code,
there's an opportunity to just simplify the code and always use 64 byte alignment.
Yes, it wastes a bit of space, but on the other hand these allocations are now
cache-line aligned, which isn't a bad thing... and an #ifdef can be dropped.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>

* Set EIGEN_MAX_ALIGN_BYTES=64

This patch sets a 64 byte upper bound on the alignment of memory allocated by
Eigen.  This is necessary to prevent crashes during the execution of the unit
tests when they are compiled with AVX512 support.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update the tensorflow/compiler/aot tests for 64 byte alignment

Modifications to the tensorflow/core/framework/allocator.h to always
use 64 byte alignment causes failures in the tensorflow/compiler/aot
unit tests.  This patch updates these tests so that they pass with
64 byte aligned allocated memory.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update Tensor.Slice_Basic for 64 byte alignment

The test case

//tensorflow/core:framework_tensor_test:Tensor.Slice_Basic

fails with EIGEN_MAX_ALIGN_BYTES set to 64.  The reason is that the
slices it takes of the sample tensor are 32 byte and not 64 byte
aligned.  This commit increases one of the dimensions of the original
tensor to ensure that the slices taken by the test cases are indeed 64
byte aligned.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>

* Update ScopedAllocatorConcatOpTest.Reshape for 64 byte alignment

The ScopedAllocatorConcatOpTest.Reshape test requires that the elements
of the field_shapes parameter of ExecOp are multiples of
Allocator::kAllocatorAlignment in size.  If they are not, the backing
tensor allocated by PrepOp will have too many elements and reshaping
will fail.  This commit modifies the test case, making the elements
64 bytes in size, the new value for Allocator::kAllocatorAlignment.

Signed-off-by: Mark Ryan <mark.d.ryan@intel.com>
This commit is contained in:
Mark Ryan 2018-05-17 18:17:39 +01:00 committed by Rasmus Munk Larsen
parent 9b41e5158e
commit ba30ba07b2
7 changed files with 32 additions and 31 deletions

View File

@ -56,9 +56,9 @@ namespace bar {
//
// Memory stats:
// arg bytes total: 104
// arg bytes aligned: 128
// arg bytes aligned: 192
// temp bytes total: 126
// temp bytes aligned: 224
// temp bytes aligned: 320
class MyClass : public tensorflow::XlaCompiledCpuFunction {
public:
// Number of input arguments for the compiled computation.

View File

@ -25,8 +25,8 @@ namespace tensorflow {
namespace tfcompile {
namespace runtime {
// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
static constexpr size_t kAlign = 32;
// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
static constexpr size_t kAlign = 64;
// aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1
// values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign

View File

@ -24,7 +24,7 @@ namespace runtime {
namespace {
TEST(Runtime, AlignmentValue) {
// We've chosen 32 byte alignment for the tfcompile runtime to mimic the
// We've chosen 64 byte alignment for the tfcompile runtime to mimic the
// regular tensorflow allocator, which was chosen to play nicely with Eigen.
// The tfcompile runtime also has a requirement that comes from the xla
// generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) {
EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0);
static constexpr intptr_t sizesB[1] = {3};
EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32);
EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64);
static constexpr intptr_t sizesC[1] = {32};
EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32);
EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64);
static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192);
EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320);
}
void* add_ptr(void* base, uintptr_t delta) {
@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) {
EXPECT_NE(base, nullptr);
EXPECT_EQ(bufD[0], add_ptr(base, 0));
EXPECT_EQ(bufD[1], nullptr);
EXPECT_EQ(bufD[2], add_ptr(base, 32));
EXPECT_EQ(bufD[2], add_ptr(base, 64));
EXPECT_EQ(bufD[3], nullptr);
EXPECT_EQ(bufD[4], add_ptr(base, 64));
EXPECT_EQ(bufD[5], add_ptr(base, 128));
EXPECT_EQ(bufD[6], add_ptr(base, 160));
EXPECT_EQ(bufD[4], add_ptr(base, 128));
EXPECT_EQ(bufD[5], add_ptr(base, 192));
EXPECT_EQ(bufD[6], add_ptr(base, 256));
for (int i = 0; i < 7; ++i) {
const intptr_t size = sizesD[i];
if (size != -1) {

View File

@ -67,13 +67,8 @@ struct AllocatorStats {
// device memory.
class Allocator {
public:
#ifdef EIGEN_VECTORIZE_AVX512
// Align to 64 byte boundary.
static constexpr size_t kAllocatorAlignment = 64;
#else
// Align to 32 byte boundary.
static constexpr size_t kAllocatorAlignment = 32;
#endif
virtual ~Allocator();

View File

@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
// On the alignment.
//
// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
// alignment. Tensor::tensor/flat/vec/matrix methods requires the
// buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
// its result is aligned if the caller intends to use those methods.
// In this test case, we simply make sure each slice is 32-byte
// aligned: sizeof(float) * 4 * 2 = 32.
// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
// the caller to ensure its result is aligned if the caller intends
// to use those methods. In this test case, we simply make sure each
// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0.
TEST(Tensor, Slice_Basic) {
Tensor saved;
{ // General
Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
// Fills in known values.
for (int i = 0; i < 10; ++i) {
x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
}
// A simple slice along dim0.
Tensor y = x.Slice(4, 8);
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34})));
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36})));
auto tx = x.tensor<float, 3>();
auto ty = y.tensor<float, 3>();
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(ty(i, j, k), 4.0 + i);
EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k));
}
@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) {
auto tz = z.tensor<float, 3>();
EXPECT_EQ(1, z.dim_size(0));
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(tz(0, j, k), 6.0);
}
}
@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) {
EXPECT_EQ(1, saved.dim_size(0));
auto tsaved = saved.tensor<float, 3>();
for (int j = 0; j < 4; ++j) {
for (int k = 0; k < 34; ++k) {
for (int k = 0; k < 36; ++k) {
EXPECT_EQ(tsaved(0, j, k), 6.0);
}
}
}
{ // Empty
Tensor x(DT_FLOAT, TensorShape({10, 0, 34}));
Tensor x(DT_FLOAT, TensorShape({10, 0, 36}));
x.flat<float>().setRandom();
Tensor y = x.Slice(4, 8);
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34})));
EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36})));
}
{

View File

@ -212,8 +212,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) {
}
TEST_F(ScopedAllocatorConcatOpTest, Reshape) {
MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2);
ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2);
// The elements of the third parameter to ExecOp must be multiples of
// Allocator::kAllocatorAlignment in size. If they are not, the backing
// tensor allocated by PrepOp will have too many elements and reshaping
// will fail.
ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}});
}
TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) {

View File

@ -64,6 +64,7 @@ cc_library(
# This define (mostly) guarantees we don't link any problematic
# code. We use it, but we do not rely on it, as evidenced above.
"EIGEN_MPL2_ONLY",
"EIGEN_MAX_ALIGN_BYTES=64",
],
includes = ["."],
visibility = ["//visibility:public"],