[XLA:CPU] Reconsider alignment heuristics

We were trying to model tcmalloc and glibc internals based on completely
outdated heuristics. operator new will return any alignment based on the type,
so using e.g. memory backed by a std::vector was never an option here. The
heuristic was also known to produce annoying performance cliffs (e.g. with a
convolution with a tiny filter).

Just unify all the places we hardcoded alignment to use
cpu_function_runtime::kMinAlign and make it use 16, which is the sweet spot for
vectorization on most platforms. Assert that alignment is sufficient when
giving buffers to tfcompile'd code. This also matches the default alignment
in Eigen, so we stay compatible. Raising it would break things like
Eigen::aligned_allocator.

We still align buffers in tfcompile to 64 bytes for optimal performance. I
tried lowering this to 16, but that triggered some benchmarks to slow down.

PiperOrigin-RevId: 308222775
Change-Id: Ib308f4d687a7b27ae5016365080ad957fe8584bc
This commit is contained in:
Benjamin Kramer 2020-04-24 02:48:57 -07:00 committed by TensorFlower Gardener
parent e26743fa35
commit 6d5c4a75cc
7 changed files with 23 additions and 21 deletions

View File

@@ -29,6 +29,8 @@ TEST(XlaCompiledCpuFunctionTest, AlignmentValue) {
// generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
// So any value that we choose must abide by that constraint as well.
EXPECT_EQ(xla::cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment);
EXPECT_LE(xla::cpu_function_runtime::kMinAlign,
Allocator::kAllocatorAlignment);
}
std::vector<BufferInfo> SizesToBufferInfos(const intptr_t* sizes, size_t n) {

View File

@@ -172,13 +172,16 @@ class XlaCompiledCpuFunction {
// called for each positional argument, in order to set the argument buffers.
//
// Allocated memory must be aligned to the size specified by
// tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in
// xla::cpu_function_runtime::kMinAlign. If possible, use the functions in
// tensorflow/compiler/tf2xla/cpu_function_runtime.h to ensure correct
// alignment.
//
// Aliasing of argument and result buffers is not allowed, and results in
// undefined behavior.
void set_arg_data(size_t index, const void* data) {
assert((arg_size(index) < xla::cpu_function_runtime::kMinAlign ||
(uintptr_t)data % xla::cpu_function_runtime::kMinAlign == 0) &&
"Underaligned pointer!");
// The const_cast is safe because the generated code does not write to arg
// buffers.
//

View File

@@ -138,6 +138,9 @@ class BufferInfo {
// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
constexpr size_t kAlign = 64;
// The minimum alignment of buffers passed to XLA:CPU.
// 16 is Eigen's default alignment and the vectorization sweet spot on most
// platforms; raising it would break e.g. Eigen::aligned_allocator users.
constexpr size_t kMinAlign = 16;
// When declaring variables that will be passed to an XLA instance as input via
// set_arg_data(), be it a regular input or a resource variable in the graph,
// the C++ variables must be aligned.

View File

@@ -357,6 +357,7 @@ cc_library(
],
hdrs = ["target_machine_features.h"],
deps = [
"//tensorflow/compiler/xla:cpu_function_runtime",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/core:lib",
"@com_google_absl//absl/container:flat_hash_map",

View File

@@ -402,8 +402,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
namespace {
// Returns the alignment (in bytes) for every buffer XLA:CPU allocates.
// All buffers handed to generated code must be aligned to at least
// cpu_function_runtime::kMinAlign, so use the same constant here instead of
// a locally hardcoded value.
int64 memory_alignment(LogicalBuffer::Color) {
  return cpu_function_runtime::kMinAlign;
}
llvm::TargetOptions CompilerTargetOptions(
const HloModuleConfig& module_config) {

View File

@@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
#include "tensorflow/compiler/xla/cpu_function_runtime.h"
#include "tensorflow/core/platform/logging.h"
namespace xla {
@@ -34,27 +36,17 @@ llvm::TargetTransformInfo* LLVMTargetMachineFeatures::GetTargetTransformInfoFor(
// Returns the minimum alignment (in bytes) that may be assumed for a buffer
// of `size_bytes` bytes handed to XLA:CPU.
int64 LLVMTargetMachineFeatures::minimum_alignment_for_allocation(
    int64 size_bytes) const {
  // Assume that all pointers are aligned to at least
  // xla::cpu_function_runtime::kMinAlign.
  if (size_bytes == 0) {
    // No need to align empty buffers.
    return 1;
  }
  // Allow small buffers to be underaligned, there is no vectorization benefit
  // anyways.
  return std::min<int64>(llvm::PowerOf2Ceil(size_bytes),
                         cpu_function_runtime::kMinAlign);
}
} // namespace cpu

View File

@@ -65,8 +65,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) {
// The constant array in this test case is small enough that there is no need
// to externalize it.
TestWithArray(/*rows=*/4, /*cols=*/4, R"(
CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float], align 8
CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 8
CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float]
CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 16
)");
}
} // namespace