From 6d5c4a75cc0e0626b890eb0b41b303f0c6e13494 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Fri, 24 Apr 2020 02:48:57 -0700
Subject: [PATCH] [XLA:CPU] Reconsider alignment heuristics

We were trying to model tcmalloc and glibc internals based on completely
outdated heuristics. operator new will return any alignment based on the type,
so using e.g. memory backed by a std::vector was never an option here. The
heuristic was also known to produce annoying performance cliffs (e.g. with a
convolution with a tiny filter).

Just unify all the places we hardcoded alignment to use
cpu_function_runtime::kMinAlign and make it use 16, which is the sweet spot for
vectorization on most platforms. Assert that alignment is sufficient when
giving buffers to tfcompile'd code. This also matches the default alignment
in Eigen, so we stay compatible. Raising it would break things like
Eigen::aligned_allocator.

We still align buffers in tfcompile to 64 bytes for optimal performance. I
tried lowering this to 16, but that triggered some benchmarks to slow down.

PiperOrigin-RevId: 308222775
Change-Id: Ib308f4d687a7b27ae5016365080ad957fe8584bc
---
 .../tf2xla/cpu_function_runtime_test.cc       |  2 ++
 .../tf2xla/xla_compiled_cpu_function.h        |  5 +++-
 .../compiler/xla/cpu_function_runtime.h       |  3 +++
 tensorflow/compiler/xla/service/cpu/BUILD     |  1 +
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  5 ++--
 .../service/cpu/target_machine_features.cc    | 24 +++++++------------
 .../cpu/tests/cpu_external_constants_test.cc  |  4 ++--
 7 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
index f06665dad56..8dede16c332 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
@@ -29,6 +29,8 @@ TEST(XlaCompiledCpuFunctionTest, AlignmentValue) {
   // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
   // So any value that we choose must abide by that constraint as well.
   EXPECT_EQ(xla::cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment);
+  EXPECT_LE(xla::cpu_function_runtime::kMinAlign,
+            Allocator::kAllocatorAlignment);
 }
 
 std::vector<BufferInfo> SizesToBufferInfos(const intptr_t* sizes, size_t n) {
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 04d9086ce4c..550f562a0e1 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -172,13 +172,16 @@ class XlaCompiledCpuFunction {
   // called for each positional argument, in order to set the argument buffers.
   //
   // Allocated memory must be aligned to the size specified by
-  // tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in
+  // xla::cpu_function_runtime::kMinAlign. If possible, use the functions in
   // tensorflow/compiler/tf2xla/cpu_function_runtime.h to ensure correct
   // alignment.
   //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
   void set_arg_data(size_t index, const void* data) {
+    assert((arg_size(index) < xla::cpu_function_runtime::kMinAlign ||
+            (uintptr_t)data % xla::cpu_function_runtime::kMinAlign == 0) &&
+           "Underaligned pointer!");
     // The const_cast is safe because the generated code does not write to arg
     // buffers.
     //
diff --git a/tensorflow/compiler/xla/cpu_function_runtime.h b/tensorflow/compiler/xla/cpu_function_runtime.h
index 0c3355cbbfb..ea981d526e4 100644
--- a/tensorflow/compiler/xla/cpu_function_runtime.h
+++ b/tensorflow/compiler/xla/cpu_function_runtime.h
@@ -138,6 +138,9 @@ class BufferInfo {
 // Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
 constexpr size_t kAlign = 64;
 
+// The minimum alignment of buffers passed to XLA:CPU.
+constexpr size_t kMinAlign = 16;
+
 // When declaring variables that will be passed to an XLA instance as input via
 // set_arg_data(), be it a regular input or a resource variable in the graph,
 // the C++ variables must be aligned.
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 7e1b8a1e7ee..e8e1f044704 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -357,6 +357,7 @@ cc_library(
     ],
     hdrs = ["target_machine_features.h"],
     deps = [
+        "//tensorflow/compiler/xla:cpu_function_runtime",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 8f56aa803e2..b04237138e8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -402,8 +402,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
 namespace {
 
 // Align buffers to 16-byte boundaries.
-constexpr int64 kMemoryAlignment = 16;
-auto memory_alignment = [](LogicalBuffer::Color) { return kMemoryAlignment; };
+int64 memory_alignment(LogicalBuffer::Color) {
+  return cpu_function_runtime::kMinAlign;
+}
 
 llvm::TargetOptions CompilerTargetOptions(
     const HloModuleConfig& module_config) {
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
index 5cdac203af2..518684e38c5 100644
--- a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+
+#include "tensorflow/compiler/xla/cpu_function_runtime.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -34,27 +36,17 @@ llvm::TargetTransformInfo* LLVMTargetMachineFeatures::GetTargetTransformInfoFor(
 
 int64 LLVMTargetMachineFeatures::minimum_alignment_for_allocation(
     int64 size_bytes) const {
-  // GLibc malloc returns a pointer with alignment 8 on 32-bit platforms and 16
-  // on 64-bit platforms.  TCMalloc returns a pointer with alignment 8 for
-  // allocations smaller than kMallocAlignmentThreshold bytes and at least
-  // alignment 16 for allocations greater than or equal to
-  // kMallocAlignmentThreshold bytes.  N.B. We could improve on this lower bound
-  // by explicitly allocating the memory with posix_memalign.  This is
-  // complicated by our desire to allow parameter buffers created by clients to
-  // be consumed directly by the JIT.
+  // Assume that all pointers are aligned to at least
+  // xla::cpu_function_runtime::kMinAlign.
   if (size_bytes == 0) {
     // No need to align empty buffers.
     return 1;
   }
 
-  const int64 kMallocAlignmentThreshold = 512;
-
-  int pointer_size = target_machine_->getPointerSize(0);
-  int buffer_alignment =
-      size_bytes >= kMallocAlignmentThreshold ? 2 * pointer_size : pointer_size;
-  DCHECK_GT(buffer_alignment, 0);
-
-  return buffer_alignment;
+  // Allow small buffers to be underaligned, there is no vectorization benefit
+  // anyways.
+  return std::min<int64>(llvm::PowerOf2Ceil(size_bytes),
+                         cpu_function_runtime::kMinAlign);
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index f4da6856940..c698afbdc6a 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -65,8 +65,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) {
   // The constant array in this test case is small enough that there is no need
   // to externalize it.
   TestWithArray(/*rows=*/4, /*cols=*/4, R"(
-CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float], align 8
-CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 8
+CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float]
+CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 16
 )");
 }
 }  // namespace