Switch open-source to use jemalloc for CPU Tensor memory allocation, gRPC, and other places where we call malloc/free.

- Only enabled on Linux for now.
- Added as a ./configure option defaulting to enabled.

Change: 144266237
2017-01-11 16:39:35 -08:00 · 2017-01-11 16:39:35 -08:00 · 83c6e0c63a
commit 83c6e0c63a
parent 99e1b19ceb
25 changed files with 505 additions and 59 deletions
--- a/configure
+++ b/configure
@ -57,9 +57,27 @@ done
 if is_windows; then
  TF_NEED_GCP=0
  TF_NEED_HDFS=0
+  TF_NEED_JEMALLOC=0
  TF_NEED_OPENCL=0
 fi

+while [ "$TF_NEED_JEMALLOC" == "" ]; do
+  read -p "Do you wish to use jemalloc as the malloc implementation? "\
+"(Linux only) [Y/n] " INPUT
+  case $INPUT in
+    [Yy]* ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
+    [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;;
+    "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
+done
+
+if [ "$TF_NEED_JEMALLOC" == "1" ]; then
+  sed -i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
+else
+  sed -i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
+fi
+
 while [ "$TF_NEED_GCP" == "" ]; do
  read -p "Do you wish to build TensorFlow with "\
 "Google Cloud Platform support? [y/N] " INPUT
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@ -159,11 +160,13 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
    return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
  }
  const auto proto_size = in.ByteSize();
-  void* buf = malloc(proto_size);
+  void* buf = tensorflow::port::Malloc(proto_size);
  in.SerializeToArray(buf, proto_size);
  out->data = buf;
  out->length = proto_size;
-  out->data_deallocator = [](void* data, size_t length) { free(data); };
+  out->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
  return Status::OK();
 }

@ -287,13 +290,15 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto,
 TF_Buffer* TF_NewBuffer() { return new TF_Buffer{nullptr, 0, nullptr}; }

 TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) {
-  void* copy = malloc(proto_len);
+  void* copy = tensorflow::port::Malloc(proto_len);
  memcpy(copy, proto, proto_len);

  TF_Buffer* buf = new TF_Buffer;
  buf->data = copy;
  buf->length = proto_len;
-  buf->data_deallocator = [](void* data, size_t length) { free(data); };
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
  return buf;
 }

@ -694,7 +699,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
 TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; }

 void TF_DeleteLibraryHandle(TF_Library* lib_handle) {
-  free(const_cast<void*>(lib_handle->op_list.data));
+  tensorflow::port::Free(const_cast<void*>(lib_handle->op_list.data));
  delete lib_handle;
 }

--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/platform/mem.h"

 namespace tensorflow {

@ -41,7 +42,7 @@ void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  // Regardless of the size requested, always allocate a XlaGlobalData. Respect
  // the aligment request because there is alignment checking even for Tensors
  // whose data is never accessed.
-  void* p = port::aligned_malloc(sizeof(XlaGlobalData), alignment);
+  void* p = port::AlignedMalloc(sizeof(XlaGlobalData), alignment);
  VLOG(2) << "Allocated XLA device tensor " << p;
  return new (p) XlaGlobalData();
 }
@ -50,7 +51,7 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
  XlaGlobalData* global_data = reinterpret_cast<XlaGlobalData*>(ptr);
  VLOG(2) << "Deallocated XLA device tensor " << ptr;
  global_data->~XlaGlobalData();
-  port::aligned_free(ptr);
+  port::AlignedFree(ptr);
 }

 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"

 namespace tensorflow {
@ -47,7 +48,7 @@ class XlaCompilationAllocator : public Allocator {
    // XlaExpression. Respect the aligment request because there is
    // alignment checking even for Tensors whose data is never
    // accessed.
-    void* p = port::aligned_malloc(sizeof(XlaExpression), alignment);
+    void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
    XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
    new (expression) XlaExpression();
    return expression;
@ -56,7 +57,7 @@ class XlaCompilationAllocator : public Allocator {
  void DeallocateRaw(void* ptr) override {
    XlaExpression* expression = reinterpret_cast<XlaExpression*>(ptr);
    expression->~XlaExpression();
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
  }

  // Make sure that even tensors with 0 elements have allocated
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -84,12 +84,14 @@ load(
    "//tensorflow/core:platform/default/build_config.bzl",
    "tf_proto_library",
    "tf_proto_library_cc",
+    "tf_additional_core_deps",
+    "tf_additional_lib_defines",
+    "tf_additional_lib_deps",
    "tf_additional_lib_hdrs",
    "tf_additional_lib_srcs",
    "tf_additional_minimal_lib_srcs",
    "tf_additional_proto_hdrs",
    "tf_additional_proto_srcs",
-    "tf_additional_lib_deps",
    "tf_additional_stream_executor_srcs",
    "tf_additional_cupti_wrapper_deps",
    "tf_additional_libdevice_data",
@ -1126,12 +1128,13 @@ cc_library(
        "platform/tracing.h",
    ],
    copts = tf_copts(),
+    defines = tf_additional_lib_defines(),
    linkopts = ["-ldl"],
-    deps = [
+    deps = tf_additional_lib_deps() + [
        ":lib_proto_parsing",
        ":protos_all_cc",
-        "//tensorflow/core/platform/default/build_config:platformlib",
        "//third_party/eigen3",
+        "//tensorflow/core/platform/default/build_config:platformlib",
        "@zlib_archive//:zlib",
    ],
 )
@ -1351,7 +1354,7 @@ tf_cuda_library(
        ":protos_all_cc",
        "//third_party/eigen3",
        "//tensorflow/core/kernels:required",
-    ] + tf_additional_lib_deps(),
+    ] + tf_additional_core_deps(),
    alwayslink = 1,
 )

--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
@ -215,7 +215,7 @@ Status CUPTIManager::DisableTrace() {
 void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size,
                                           size_t *maxNumRecords) {
  VLOG(2) << "BufferRequested";
-  void *p = port::aligned_malloc(kBufferSize, kBufferAlignment);
+  void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment);
  *size = kBufferSize;
  *buffer = reinterpret_cast<uint8_t *>(p);
  *maxNumRecords = 0;
@ -246,7 +246,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
      LOG(WARNING) << "Dropped " << dropped << " activity records";
    }
  }
-  port::aligned_free(buffer);
+  port::AlignedFree(buffer);
 }

 CUPTIManager *GetCUPTIManager() {
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@ -171,9 +171,9 @@ class BasicCPUAllocator : public SubAllocator {
  ~BasicCPUAllocator() override {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::aligned_malloc(num_bytes, alignment);
+    return port::AlignedMalloc(num_bytes, alignment);
  }
-  void Free(void* ptr, size_t num_bytes) override { port::aligned_free(ptr); }
+  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
 };

 // Allocator for pinned CPU RAM that is made known to CUDA for the
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@ -275,6 +275,7 @@ cc_library(
        "//tensorflow/core/distributed_runtime:server_lib",
        "//tensorflow/core/distributed_runtime:worker_env",
        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
    ],
    alwayslink = 1,
 )
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@ -21,6 +21,7 @@ limitations under the License.
 #include "grpc++/grpc++.h"
 #include "grpc++/security/credentials.h"
 #include "grpc++/server_builder.h"
+#include "grpc/support/alloc.h"

 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@ -41,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/public/session_options.h"

 namespace tensorflow {
@ -304,6 +306,11 @@ class GrpcServerFactory : public ServerFactory {
 class GrpcServerRegistrar {
 public:
  GrpcServerRegistrar() {
+    gpr_allocation_functions alloc_fns;
+    alloc_fns.malloc_fn = port::Malloc;
+    alloc_fns.realloc_fn = port::Realloc;
+    alloc_fns.free_fn = port::Free;
+    gpr_set_allocation_functions(alloc_fns);
    ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory());
  }
 };
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@ -68,7 +68,7 @@ class CPUAllocator : public Allocator {
  string Name() override { return "cpu"; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    void* p = port::aligned_malloc(num_bytes, alignment);
+    void* p = port::AlignedMalloc(num_bytes, alignment);
    if (cpu_allocator_collect_stats) {
      const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
      mutex_lock l(mu_);
@ -89,7 +89,7 @@ class CPUAllocator : public Allocator {
      mutex_lock l(mu_);
      stats_.bytes_in_use -= alloc_size;
    }
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
  }

  void GetStats(AllocatorStats* stats) override {
--- a/tensorflow/core/framework/load_library.cc
+++ b/tensorflow/core/framework/load_library.cc
@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mem.h"

 namespace tensorflow {

@ -91,7 +92,7 @@ Status LoadLibrary(const char* library_filename, void** result,
  }
  string str;
  library.op_list.SerializeToString(&str);
-  char* str_buf = reinterpret_cast<char*>(malloc(str.length()));
+  char* str_buf = reinterpret_cast<char*>(port::Malloc(str.length()));
  memcpy(str_buf, str.data(), str.length());
  *buf = str_buf;
  *len = str.length();
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@ -19,6 +19,7 @@ limitations under the License.

 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"

 namespace tensorflow {
@ -27,7 +28,7 @@ class TestableSizeTrackingAllocator : public Allocator {
 public:
  string Name() override { return "test"; }
  void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
-    void* ptr = malloc(num_bytes);
+    void* ptr = port::Malloc(num_bytes);
    size_map_[ptr] = num_bytes;
    return ptr;
  }
@ -35,7 +36,7 @@ class TestableSizeTrackingAllocator : public Allocator {
    const auto& iter = size_map_.find(ptr);
    EXPECT_NE(size_map_.end(), iter);
    size_map_.erase(iter);
-    free(ptr);
+    port::Free(ptr);
  }
  bool TracksAllocationSizes() override { return true; }
  size_t RequestedSize(void* ptr) override {
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@ -18,6 +18,7 @@ limitations under the License.

 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/util/tensor_format.h"

 #if GOOGLE_CUDA
@ -44,9 +45,9 @@ class LaunchConv2DOp {
 template <class T, size_t size>
 struct Im2ColBufferResource : public ResourceBase {
  Im2ColBufferResource<T, size>() {
-    data = static_cast<T*>(malloc(size * sizeof(T)));
+    data = static_cast<T*>(port::Malloc(size * sizeof(T)));
  }
-  ~Im2ColBufferResource<T, size>() { free(data); }
+  ~Im2ColBufferResource<T, size>() { port::Free(data); }
  // This mutex ensures that only a single operation at a time is able to use
  // the buffer memory held by this resource.
  mutex mu;
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@ -48,7 +48,8 @@ Arena::Arena(const size_t block_size)
      overflow_blocks_(NULL) {
  assert(block_size > kDefaultAlignment);

-  first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
+  first_blocks_[0].mem =
+      reinterpret_cast<char*>(port::AlignedMalloc(block_size_, sizeof(void*)));

  first_blocks_[0].size = block_size_;

@ -59,7 +60,9 @@ Arena::~Arena() {
  FreeBlocks();
  assert(overflow_blocks_ == NULL);  // FreeBlocks() should do that
  // The first X blocks stay allocated always by default.  Delete them now.
-  for (size_t i = 0; i < blocks_alloced_; ++i) free(first_blocks_[i].mem);
+  for (size_t i = 0; i < blocks_alloced_; ++i) {
+    port::AlignedFree(first_blocks_[i].mem);
+  }
 }

 // Returns true iff it advances freestart_ to the first position
@ -162,8 +165,11 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,

  // Must be a multiple of kDefaultAlignment, unless requested
  // alignment is 1, in which case we don't care at all.
-  const uint32 adjusted_alignment =
+  uint32 adjusted_alignment =
      (alignment > 1 ? LeastCommonMultiple(alignment, kDefaultAlignment) : 1);
+  // Required minimum alignment for port::AlignedMalloc().
+  adjusted_alignment =
+      std::max(adjusted_alignment, static_cast<uint32>(sizeof(void*)));

  CHECK_LE(adjusted_alignment, static_cast<uint32>(1 << 20))
      << "Alignment on boundaries greater than 1MB not supported.";
@ -171,16 +177,12 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,
  // If block_size > alignment we force block_size to be a multiple
  // of alignment; if block_size < alignment we make no adjustment.
  size_t adjusted_block_size = block_size;
-  if (adjusted_alignment > 1) {
-    if (adjusted_block_size > adjusted_alignment) {
-      const uint32 excess = adjusted_block_size % adjusted_alignment;
-      adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
-    }
-    block->mem = reinterpret_cast<char*>(
-        port::aligned_malloc(adjusted_block_size, adjusted_alignment));
-  } else {
-    block->mem = reinterpret_cast<char*>(malloc(adjusted_block_size));
+  if (adjusted_block_size > adjusted_alignment) {
+    const uint32 excess = adjusted_block_size % adjusted_alignment;
+    adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
  }
+  block->mem = reinterpret_cast<char*>(
+      port::AlignedMalloc(adjusted_block_size, adjusted_alignment));
  block->size = adjusted_block_size;
  CHECK(NULL != block->mem) << "block_size=" << block_size
                            << " adjusted_block_size=" << adjusted_block_size
@ -242,7 +244,7 @@ void* Arena::GetMemoryFallback(const size_t size, const int alignment) {

 void Arena::FreeBlocks() {
  for (size_t i = 1; i < blocks_alloced_; ++i) {  // keep first block alloced
-    free(first_blocks_[i].mem);
+    port::AlignedFree(first_blocks_[i].mem);
    first_blocks_[i].mem = NULL;
    first_blocks_[i].size = 0;
  }
@ -250,7 +252,7 @@ void Arena::FreeBlocks() {
  if (overflow_blocks_ != NULL) {
    std::vector<AllocatedBlock>::iterator it;
    for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
-      free(it->mem);
+      port::AlignedFree(it->mem);
    }
    delete overflow_blocks_;  // These should be used very rarely
    overflow_blocks_ = NULL;
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@ -45,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"

 #include <initializer_list>  // NOLINT(build/include_order)
@ -353,7 +354,7 @@ class InlinedVector {
    size_t n = size();
    Destroy(base, n);
    if (!is_inline()) {
-      free(base);
+      port::Free(base);
    }
  }

@ -434,7 +435,7 @@ class InlinedVector {
    }

    T* src = data();
-    T* dst = static_cast<T*>(malloc(target * sizeof(T)));
+    T* dst = static_cast<T*>(port::Malloc(target * sizeof(T)));

    // Need to copy elem before discarding src since it might alias src.
    InitType{}(dst + s, std::forward<Args>(args)...);
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@ -30,7 +30,7 @@ limitations under the License.
 #include <utility>

 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mem.h"  // For aligned_malloc/aligned_free
+#include "tensorflow/core/platform/mem.h"

 namespace tensorflow {
 namespace gtl {
@ -127,9 +127,9 @@ class ManualConstructor {
  // Support users creating arrays of ManualConstructor<>s.  This ensures that
  // the array itself has the correct alignment.
  static void* operator new[](size_t size) {
-    return port::aligned_malloc(size, TF_LIB_GTL_ALIGN_OF(Type));
+    return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type));
  }
-  static void operator delete[](void* mem) { port::aligned_free(mem); }
+  static void operator delete[](void* mem) { port::AlignedFree(mem); }

  inline Type* get() { return reinterpret_cast<Type*>(space_); }
  inline const Type* get() const {
--- a/tensorflow/core/platform/cloud/http_request_test.cc
+++ b/tensorflow/core/platform/cloud/http_request_test.cc
@ -17,6 +17,7 @@ limitations under the License.
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"

 namespace tensorflow {
@ -172,7 +173,8 @@ class FakeLibCurl : public LibCurl {
      temp_str.replace(n, victim.size(), encoded);
      n += encoded.size();
    }
-    char* out_char_str = (char*)malloc(sizeof(char) * temp_str.size() + 1);
+    char* out_char_str =
+        (char*)port::Malloc(sizeof(char) * temp_str.size() + 1);
    std::copy(temp_str.begin(), temp_str.end(), out_char_str);
    out_char_str[temp_str.size()] = '\0';
    return out_char_str;
@ -180,7 +182,7 @@ class FakeLibCurl : public LibCurl {
  void curl_slist_free_all(curl_slist* list) override {
    delete reinterpret_cast<std::vector<string>*>(list);
  }
-  void curl_free(void* p) override { free(p); }
+  void curl_free(void* p) override { port::Free(p); }

  // Variables defining the behavior of this fake.
  string response_content;
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@ -3,10 +3,11 @@
 load("@protobuf//:protobuf.bzl", "cc_proto_library")
 load("@protobuf//:protobuf.bzl", "py_proto_library")

-# configure may change the following lines to True
+# configure may change the following lines
 WITH_GCP_SUPPORT = False
 WITH_HDFS_SUPPORT = False
 WITH_XLA_SUPPORT = False
+WITH_JEMALLOC = True

 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
@ -176,7 +177,29 @@ def tf_additional_test_srcs():
 def tf_kernel_tests_linkstatic():
  return 0

+# jemalloc only enabled on Linux for now.
+# TODO(jhseu): Enable on other platforms.
+def tf_additional_lib_defines():
+  defines = []
+  if WITH_JEMALLOC:
+    defines += select({
+        "//tensorflow:linux_x86_64": [
+            "TENSORFLOW_USE_JEMALLOC"
+        ],
+        "//conditions:default": [],
+    })
+  return defines
+
 def tf_additional_lib_deps():
+  deps = []
+  if WITH_JEMALLOC:
+    deps += select({
+        "//tensorflow:linux_x86_64": ["@jemalloc"],
+        "//conditions:default": [],
+    })
+  return deps
+
+def tf_additional_core_deps():
  deps = []
  if WITH_GCP_SUPPORT:
    deps.append("//tensorflow/core/platform/cloud:gcs_file_system")
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@ -24,9 +24,14 @@ limitations under the License.
 namespace tensorflow {
 namespace port {

-// Aligned allocation/deallocation
-void* aligned_malloc(size_t size, int minimum_alignment);
-void aligned_free(void* aligned_memory);
+// Aligned allocation/deallocation. `minimum_alignment` must be a power of 2
+// and a multiple of sizeof(void*).
+void* AlignedMalloc(size_t size, int minimum_alignment);
+void AlignedFree(void* aligned_memory);
+
+void* Malloc(size_t size);
+void* Realloc(void* ptr, size_t size);
+void Free(void* ptr);

 // Tries to release num_bytes of free memory back to the operating
 // system for reuse.  Use this routine with caution -- to get this
--- a/tensorflow/core/platform/port_test.cc
+++ b/tensorflow/core/platform/port_test.cc
@ -25,11 +25,11 @@ namespace port {

 TEST(Port, AlignedMalloc) {
  for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
-    void* p = aligned_malloc(1, alignment);
-    ASSERT_TRUE(p != NULL) << "aligned_malloc(1, " << alignment << ")";
+    void* p = AlignedMalloc(1, alignment);
+    ASSERT_TRUE(p != NULL) << "AlignedMalloc(1, " << alignment << ")";
    uintptr_t pval = reinterpret_cast<uintptr_t>(p);
    EXPECT_EQ(pval % alignment, 0);
-    aligned_free(p);
+    AlignedFree(p);
  }
 }

--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@ -13,8 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

+#ifdef TENSORFLOW_USE_JEMALLOC
+#include "jemalloc/jemalloc.h"
+#endif
+
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
@ -60,7 +65,7 @@ int NumSchedulableCPUs() {
  return kDefaultCores;
 }

-void *aligned_malloc(size_t size, int minimum_alignment) {
+void *AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
  return memalign(minimum_alignment, size);
 #else  // !defined(__ANDROID__)
@ -69,15 +74,45 @@ void *aligned_malloc(size_t size, int minimum_alignment) {
  // sizeof(void*). In this case, fall back on malloc which should return
  // memory aligned to at least the size of a pointer.
  const int required_alignment = sizeof(void *);
-  if (minimum_alignment < required_alignment) return malloc(size);
-  if (posix_memalign(&ptr, minimum_alignment, size) != 0)
+  if (minimum_alignment < required_alignment) return Malloc(size);
+#ifdef TENSORFLOW_USE_JEMALLOC
+  int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
+#else
+  int err = posix_memalign(&ptr, minimum_alignment, size);
+#endif
+  if (err != 0) {
    return NULL;
-  else
+  } else {
    return ptr;
+  }
 #endif
 }

-void aligned_free(void *aligned_memory) { free(aligned_memory); }
+void AlignedFree(void *aligned_memory) { Free(aligned_memory); }
+
+void *Malloc(size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  return jemalloc_malloc(size);
+#else
+  return malloc(size);
+#endif
+}
+
+void *Realloc(void *ptr, size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  return jemalloc_realloc(ptr, size);
+#else
+  return realloc(ptr, size);
+#endif
+}
+
+void Free(void *ptr) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  jemalloc_free(ptr);
+#else
+  free(ptr);
+#endif
+}

 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
  // No-op.
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@ -52,11 +52,17 @@ int NumSchedulableCPUs() {
  return system_info.dwNumberOfProcessors;
 }

-void* aligned_malloc(size_t size, int minimum_alignment) {
+void* AlignedMalloc(size_t size, int minimum_alignment) {
  return _aligned_malloc(size, minimum_alignment);
 }

-void aligned_free(void* aligned_memory) { _aligned_free(aligned_memory); }
+void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+void* Malloc(size_t size) { return ::malloc(size); }
+
+void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); }
+
+void Free(void* ptr) { ::free(ptr); }

 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
  // No-op.
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -87,6 +87,7 @@ filegroup(
        "@gif_archive//:COPYING",
        "@grpc//:LICENSE",
        "@highwayhash//:LICENSE",
+        "@jemalloc//:COPYING",
        "@jpeg//:LICENSE.md",
        "@libxsmm_archive//:LICENSE",
        "@local_config_sycl//sycl:LICENSE.text",
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@ -376,3 +376,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "junit",
      actual = "@junit_jar//jar",
  )
+
+  native.new_http_archive(
+      name = "jemalloc",
+      urls = [
+          "http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+          "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+      ],
+      sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
+      strip_prefix = "jemalloc-4.4.0",
+      build_file = str(Label("//third_party:jemalloc.BUILD")),
+  )
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@ -0,0 +1,321 @@
+# Description:
+# jemalloc - a general-purpose scalable concurrent malloc implementation
+
+licenses(["notice"])  # BSD
+
+exports_files(["COPYING"])  # Lets other packages reference the license file by label.
+
+load("@//third_party:common.bzl", "template_rule")  # Rule used below to instantiate the *.h.in templates.
+
+cc_library(  # The jemalloc allocator library, compiled from the archive's C sources.
+    name = "jemalloc",
+    srcs = [
+        "src/arena.c",
+        "src/atomic.c",
+        "src/base.c",
+        "src/bitmap.c",
+        "src/chunk.c",
+        "src/chunk_dss.c",
+        "src/chunk_mmap.c",
+        "src/ckh.c",
+        "src/ctl.c",
+        "src/extent.c",
+        "src/hash.c",
+        "src/huge.c",
+        "src/jemalloc.c",
+        "src/mb.c",
+        "src/mutex.c",
+        "src/nstime.c",
+        "src/pages.c",
+        "src/prng.c",
+        "src/prof.c",
+        "src/quarantine.c",
+        "src/rtree.c",
+        "src/spin.c",
+        "src/stats.c",
+        "src/tcache.c",
+        "src/tsd.c",
+        "src/util.c",
+        "src/witness.c",
+    ],
+    hdrs = [  # Several entries are outputs of rules in this file (e.g. jemalloc.h, size_classes.h, private_namespace.h).
+        "include/jemalloc/internal/arena.h",
+        "include/jemalloc/internal/assert.h",
+        "include/jemalloc/internal/atomic.h",
+        "include/jemalloc/internal/base.h",
+        "include/jemalloc/internal/bitmap.h",
+        "include/jemalloc/internal/chunk.h",
+        "include/jemalloc/internal/chunk_dss.h",
+        "include/jemalloc/internal/chunk_mmap.h",
+        "include/jemalloc/internal/ckh.h",
+        "include/jemalloc/internal/ctl.h",
+        "include/jemalloc/internal/extent.h",
+        "include/jemalloc/internal/hash.h",
+        "include/jemalloc/internal/huge.h",
+        "include/jemalloc/internal/jemalloc_internal.h",
+        "include/jemalloc/internal/jemalloc_internal_decls.h",
+        "include/jemalloc/internal/jemalloc_internal_defs.h",
+        "include/jemalloc/internal/jemalloc_internal_macros.h",
+        "include/jemalloc/internal/mb.h",
+        "include/jemalloc/internal/mutex.h",
+        "include/jemalloc/internal/nstime.h",
+        "include/jemalloc/internal/pages.h",
+        "include/jemalloc/internal/ph.h",
+        "include/jemalloc/internal/private_namespace.h",
+        "include/jemalloc/internal/prng.h",
+        "include/jemalloc/internal/prof.h",
+        "include/jemalloc/internal/ql.h",
+        "include/jemalloc/internal/qr.h",
+        "include/jemalloc/internal/quarantine.h",
+        "include/jemalloc/internal/rb.h",
+        "include/jemalloc/internal/rtree.h",
+        "include/jemalloc/internal/size_classes.h",
+        "include/jemalloc/internal/smoothstep.h",
+        "include/jemalloc/internal/spin.h",
+        "include/jemalloc/internal/stats.h",
+        "include/jemalloc/internal/tcache.h",
+        "include/jemalloc/internal/ticker.h",
+        "include/jemalloc/internal/tsd.h",
+        "include/jemalloc/internal/util.h",
+        "include/jemalloc/internal/valgrind.h",
+        "include/jemalloc/internal/witness.h",
+        "include/jemalloc/jemalloc.h",
+    ],
+    # Same flags that jemalloc uses to build.
+    copts = [
+        "-O3",
+        "-funroll-loops",
+        "-D_GNU_SOURCE",
+        "-D_REENTRANT",
+    ],
+    includes = ["include"],  # Puts the include/ directory on the include path.
+    visibility = ["//visibility:public"],
+)
+
+sh_binary(  # Wraps jemalloc.sh so the jemalloc_h genrule can run it as a tool.
+    name = "jemalloc_sh",
+    srcs = ["include/jemalloc/jemalloc.sh"],
+)
+
+genrule(  # Produces the public jemalloc.h header from the stdout of jemalloc.sh.
+    name = "jemalloc_h",
+    srcs = [  # Header fragments generated by other rules in this file.
+        ":jemalloc_defs_h",
+        ":jemalloc_macros_h",
+        ":jemalloc_mangle_h",
+        ":jemalloc_protos_h",
+        ":jemalloc_rename_h",
+        ":jemalloc_typedefs_h",
+    ],
+    outs = ["include/jemalloc/jemalloc.h"],
+    cmd = "$(location :jemalloc_sh) $$(dirname $(location :jemalloc_defs_h))/../../ >$@",  # Script argument: the directory two levels above jemalloc_defs.h.
+    tools = [":jemalloc_sh"],
+)
+
+# Add to this list if you want to export more symbols from jemalloc.
+genrule(  # Writes public_symbols.txt via a heredoc, one "<name>:<jemalloc_-prefixed name>" entry per line.
+    name = "public_symbols_txt",
+    outs = ["include/jemalloc/internal/public_symbols.txt"],
+    cmd = "\n".join([
+        "cat <<'EOF' > $@",
+        "free:jemalloc_free",
+        "malloc:jemalloc_malloc",
+        "posix_memalign:jemalloc_posix_memalign",
+        "realloc:jemalloc_realloc",
+        "EOF",
+    ]),
+)
+
+sh_binary(  # Wraps jemalloc_mangle.sh so the jemalloc_mangle_h genrule can run it as a tool.
+    name = "jemalloc_mangle_sh",
+    srcs = ["include/jemalloc/jemalloc_mangle.sh"],
+)
+
+genrule(  # Generates jemalloc_mangle.h from the public symbol list.
+    name = "jemalloc_mangle_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/jemalloc_mangle.h"],
+    cmd = "$(location :jemalloc_mangle_sh) $(location :public_symbols_txt) je_ >$@",  # Script arguments: the symbol list and the "je_" prefix.
+    tools = [":jemalloc_mangle_sh"],
+)
+
+sh_binary(  # Wraps jemalloc_rename.sh so the jemalloc_rename_h genrule can run it as a tool.
+    name = "jemalloc_rename_sh",
+    srcs = ["include/jemalloc/jemalloc_rename.sh"],
+)
+
+genrule(  # Generates jemalloc_rename.h from the public symbol list.
+    name = "jemalloc_rename_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/jemalloc_rename.h"],
+    cmd = "$(location :jemalloc_rename_sh) $(location :public_symbols_txt) >$@",  # Script stdout becomes the header.
+    tools = [":jemalloc_rename_sh"],
+)
+
+sh_binary(  # Wraps private_namespace.sh so the private_namespace_h genrule can run it as a tool.
+    name = "private_namespace_sh",
+    srcs = ["include/jemalloc/internal/private_namespace.sh"],
+)
+
+genrule(  # Generates private_namespace.h from private_symbols.txt.
+    name = "private_namespace_h",
+    srcs = ["include/jemalloc/internal/private_symbols.txt"],
+    outs = ["include/jemalloc/internal/private_namespace.h"],
+    cmd = "$(location :private_namespace_sh) $(location include/jemalloc/internal/private_symbols.txt) >$@",  # Script stdout becomes the header.
+    tools = [":private_namespace_sh"],
+)
+
+sh_binary(  # Wraps public_namespace.sh so the public_namespace_h genrule can run it as a tool.
+    name = "public_namespace_sh",
+    srcs = ["include/jemalloc/internal/public_namespace.sh"],
+)
+
+genrule(  # Generates public_namespace.h from the public symbol list.
+    name = "public_namespace_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/internal/public_namespace.h"],
+    cmd = "$(location :public_namespace_sh) $(location :public_symbols_txt) >$@",  # Script stdout becomes the header.
+    tools = [":public_namespace_sh"],
+)
+
+sh_binary(  # Wraps size_classes.sh so the size_classes_h genrule can run it as a tool.
+    name = "size_classes_sh",
+    srcs = ["include/jemalloc/internal/size_classes.sh"],
+)
+
+# Size classes for Linux x86_64. Update if adding builds for other
+# architectures. See size_classes.sh for details on the arguments.
+genrule(  # Generates size_classes.h by running size_classes.sh with fixed arguments.
+    name = "size_classes_h",
+    outs = ["include/jemalloc/internal/size_classes.h"],
+    cmd = "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",  # The values 3 and 12 match LG_TINY_MIN and LG_PAGE set in jemalloc_internal_defs_h.
+    tools = [":size_classes_sh"],
+)
+
+template_rule(  # Instantiates jemalloc_internal.h from its .h.in template.
+    name = "jemalloc_internal_h",
+    src = "include/jemalloc/internal/jemalloc_internal.h.in",
+    out = "include/jemalloc/internal/jemalloc_internal.h",
+    substitutions = {  # Fills the two @...@ placeholders named in the keys.
+        "@private_namespace@": "je_",
+        "@install_suffix@": "",
+    },
+)
+
+template_rule(  # Instantiates jemalloc_internal_defs.h, replacing selected "#undef X" lines with "#define X ...".
+    name = "jemalloc_internal_defs_h",
+    src = "include/jemalloc/internal/jemalloc_internal_defs.h.in",
+    out = "include/jemalloc/internal/jemalloc_internal_defs.h",
+    substitutions = {  # Hand-written configuration values; several keys are prefixes of others, so edit with care.
+        "#undef JEMALLOC_PREFIX": "#define JEMALLOC_PREFIX \"jemalloc_\"",
+        "#undef JEMALLOC_CPREFIX": "#define JEMALLOC_CPREFIX \"JEMALLOC_\"",
+        "#undef JEMALLOC_PRIVATE_NAMESPACE": "#define JEMALLOC_PRIVATE_NAMESPACE je_",
+        "#undef CPU_SPINWAIT": "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
+        "#undef JEMALLOC_HAVE_BUILTIN_CLZ": "#define JEMALLOC_HAVE_BUILTIN_CLZ",
+        "#undef JEMALLOC_USE_SYSCALL": "#define JEMALLOC_USE_SYSCALL",
+        "#undef JEMALLOC_HAVE_SECURE_GETENV": "#define JEMALLOC_HAVE_SECURE_GETENV",
+        "#undef JEMALLOC_HAVE_PTHREAD_ATFORK": "#define JEMALLOC_HAVE_PTHREAD_ATFORK",
+        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1",
+        # Newline required because of substitution conflicts: without it this key is a prefix of the _COARSE key above.
+        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC\n": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1\n",
+        "#undef JEMALLOC_THREADED_INIT": "#define JEMALLOC_THREADED_INIT",
+        "#undef JEMALLOC_TLS_MODEL": "#define JEMALLOC_TLS_MODEL __attribute__((tls_model(\"initial-exec\")))",
+        "#undef JEMALLOC_CC_SILENCE": "#define JEMALLOC_CC_SILENCE",
+        "#undef JEMALLOC_STATS": "#define JEMALLOC_STATS",
+        "#undef JEMALLOC_TCACHE": "#define JEMALLOC_TCACHE",
+        "#undef JEMALLOC_DSS": "#define JEMALLOC_DSS",
+        "#undef JEMALLOC_FILL": "#define JEMALLOC_FILL",
+        "#undef LG_TINY_MIN": "#define LG_TINY_MIN 3",
+        "#undef LG_PAGE": "#define LG_PAGE 12",
+        "#undef JEMALLOC_MAPS_COALESCE": "#define JEMALLOC_MAPS_COALESCE",
+        "#undef JEMALLOC_TLS": "#define JEMALLOC_TLS",
+        "#undef JEMALLOC_INTERNAL_UNREACHABLE": "#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable",
+        "#undef JEMALLOC_INTERNAL_FFSLL": "#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll",
+        # Newline required because of substitution conflicts: FFS is a prefix of FFSL, which is a prefix of FFSLL.
+        "#undef JEMALLOC_INTERNAL_FFSL\n": "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl\n",
+        "#undef JEMALLOC_INTERNAL_FFS\n": "#define JEMALLOC_INTERNAL_FFS __builtin_ffs\n",
+        "#undef JEMALLOC_CACHE_OBLIVIOUS": "#define JEMALLOC_CACHE_OBLIVIOUS",
+        "#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY": "#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY",
+        "#undef JEMALLOC_HAVE_MADVISE": "#define JEMALLOC_HAVE_MADVISE",
+        "#undef JEMALLOC_PURGE_MADVISE_DONTNEED": "#define JEMALLOC_PURGE_MADVISE_DONTNEED",
+        "#undef JEMALLOC_THP": "#define JEMALLOC_THP",
+        "#undef JEMALLOC_HAS_ALLOCA_H": "#define JEMALLOC_HAS_ALLOCA_H 1",
+        # Newline required because of substitution conflicts: these keys are prefixes of LG_SIZEOF_INTMAX_T / LG_SIZEOF_LONG_LONG.
+        "#undef LG_SIZEOF_INT\n": "#define LG_SIZEOF_INT 2\n",
+        "#undef LG_SIZEOF_LONG\n": "#define LG_SIZEOF_LONG 3\n",
+        "#undef LG_SIZEOF_LONG_LONG": "#define LG_SIZEOF_LONG_LONG 3",
+        "#undef LG_SIZEOF_INTMAX_T": "#define LG_SIZEOF_INTMAX_T 3",
+        "#undef JEMALLOC_GLIBC_MALLOC_HOOK": "#define JEMALLOC_GLIBC_MALLOC_HOOK",
+        "#undef JEMALLOC_GLIBC_MEMALIGN_HOOK": "#define JEMALLOC_GLIBC_MEMALIGN_HOOK",
+        "#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP": "#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP",
+        "#undef JEMALLOC_CONFIG_MALLOC_CONF": "#define JEMALLOC_CONFIG_MALLOC_CONF \"\"",
+    },
+)
+
+template_rule(  # Instantiates jemalloc_defs.h, replacing selected "#undef X" lines with "#define X ...".
+    name = "jemalloc_defs_h",
+    src = "include/jemalloc/jemalloc_defs.h.in",
+    out = "include/jemalloc/jemalloc_defs.h",
+    substitutions = {  # LG_SIZEOF_PTR 3 corresponds to 8-byte pointers (2^3).
+        "#undef JEMALLOC_HAVE_ATTR": "#define JEMALLOC_HAVE_ATTR",
+        "#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE": "#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE",
+        "#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF",
+        "#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF",
+        "#undef JEMALLOC_OVERRIDE_MEMALIGN": "#define JEMALLOC_OVERRIDE_MEMALIGN",
+        "#undef JEMALLOC_OVERRIDE_VALLOC": "#define JEMALLOC_OVERRIDE_VALLOC",
+        "#undef JEMALLOC_USABLE_SIZE_CONST": "#define JEMALLOC_USABLE_SIZE_CONST",
+        "#undef JEMALLOC_USE_CXX_THROW": "#define JEMALLOC_USE_CXX_THROW",
+        "#undef LG_SIZEOF_PTR": "#define LG_SIZEOF_PTR 3",
+    },
+)
+
+template_rule(  # Instantiates jemalloc_macros.h from its .h.in template.
+    name = "jemalloc_macros_h",
+    src = "include/jemalloc/jemalloc_macros.h.in",
+    out = "include/jemalloc/jemalloc_macros.h",
+    substitutions = {  # Every version placeholder is filled with a zero value, not the archive's 4.4.0.
+        "@jemalloc_version@": "0.0.0",
+        "@jemalloc_version_major@": "0",
+        "@jemalloc_version_minor@": "0",
+        "@jemalloc_version_bugfix@": "0",
+        "@jemalloc_version_nrev@": "0",
+        "@jemalloc_version_gid@": "0000000000000000000000000000000000000000",
+    },
+)
+
+template_rule(  # Instantiates jemalloc_protos.h from its .h.in template.
+    name = "jemalloc_protos_h",
+    src = "include/jemalloc/jemalloc_protos.h.in",
+    out = "include/jemalloc/jemalloc_protos.h",
+    substitutions = {  # Each entry maps "@name" to "name", i.e. it only drops the leading "@".
+        "@aligned_alloc": "aligned_alloc",
+        "@calloc": "calloc",
+        "@cbopaque": "cbopaque",
+        "@dallocx": "dallocx",
+        "@free": "free",
+        "@je": "je",
+        "@mallctl": "mallctl",
+        "@mallctlnametomib": "mallctlnametomib",
+        "@mallctlbymib": "mallctlbymib",
+        "@malloc_stats_print": "malloc_stats_print",
+        "@malloc_usable_size": "malloc_usable_size",
+        "@malloc": "malloc",
+        "@mallocx": "mallocx",
+        "@memalign": "memalign",
+        "@nallocx": "nallocx",
+        "@posix_memalign": "posix_memalign",
+        "@rallocx": "rallocx",
+        "@realloc": "realloc",
+        "@sallocx": "sallocx",
+        "@sdallocx": "sdallocx",
+        "@valloc": "valloc",
+        "@xallocx": "xallocx",
+    },
+)
+
+template_rule(  # Emits jemalloc_typedefs.h from its .h.in template.
+    name = "jemalloc_typedefs_h",
+    src = "include/jemalloc/jemalloc_typedefs.h.in",
+    out = "include/jemalloc/jemalloc_typedefs.h",
+    substitutions = {},  # No substitutions are configured for this template.
+)