From 83c6e0c63acdcab2c58c4ed7220bfa58879b1d57 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Wed, 11 Jan 2017 16:39:35 -0800
Subject: [PATCH] Switch open-source to use jemalloc for CPU Tensor memory
 allocation, gRPC, and other places where we call malloc/free.

- Only enabled on Linux for now.
- Added as a ./configure option defaulting to enabled.
Change: 144266237
---
 configure                                     |  18 +
 tensorflow/c/c_api.cc                         |  15 +-
 tensorflow/compiler/jit/xla_device_context.cc |   5 +-
 .../compiler/tf2xla/xla_compilation_device.cc |   5 +-
 tensorflow/core/BUILD                         |  11 +-
 .../core/common_runtime/gpu/gpu_tracer.cc     |   4 +-
 .../core/common_runtime/gpu/pool_allocator.h  |   4 +-
 tensorflow/core/distributed_runtime/rpc/BUILD |   1 +
 .../rpc/grpc_server_lib.cc                    |   7 +
 tensorflow/core/framework/allocator.cc        |   4 +-
 tensorflow/core/framework/load_library.cc     |   3 +-
 .../core/framework/tracking_allocator_test.cc |   5 +-
 tensorflow/core/kernels/conv_ops.h            |   5 +-
 tensorflow/core/lib/core/arena.cc             |  30 +-
 tensorflow/core/lib/gtl/inlined_vector.h      |   5 +-
 tensorflow/core/lib/gtl/manual_constructor.h  |   6 +-
 .../core/platform/cloud/http_request_test.cc  |   6 +-
 .../core/platform/default/build_config.bzl    |  25 +-
 tensorflow/core/platform/mem.h                |  11 +-
 tensorflow/core/platform/port_test.cc         |   6 +-
 tensorflow/core/platform/posix/port.cc        |  45 ++-
 tensorflow/core/platform/windows/port.cc      |  10 +-
 tensorflow/tools/pip_package/BUILD            |   1 +
 tensorflow/workspace.bzl                      |  11 +
 third_party/jemalloc.BUILD                    | 321 ++++++++++++++++++
 25 files changed, 505 insertions(+), 59 deletions(-)
 create mode 100644 third_party/jemalloc.BUILD

diff --git a/configure b/configure
index 64add33bd5d..1e4d786974d 100755
--- a/configure
+++ b/configure
@@ -57,9 +57,27 @@ done
 if is_windows; then
   TF_NEED_GCP=0
   TF_NEED_HDFS=0
+  TF_NEED_JEMALLOC=0
   TF_NEED_OPENCL=0
 fi
 
+while [ "$TF_NEED_JEMALLOC" == "" ]; do
+  read -p "Do you wish to use jemalloc as the malloc implementation? (Linux only) [Y/n] " INPUT
+  case $INPUT in
+    [Yy]* | "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
+    [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
+done
+
+# "-i.bak" (then rm) is portable; BSD sed reads "-i -e" as backup suffix "-e".
+if [ "$TF_NEED_JEMALLOC" == "1" ]; then
+  sed -i.bak -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
+else
+  sed -i.bak -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
+fi
+rm -f tensorflow/core/platform/default/build_config.bzl.bak
+
 while [ "$TF_NEED_GCP" == "" ]; do
   read -p "Do you wish to build TensorFlow with "\
 "Google Cloud Platform support? [y/N] " INPUT
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 14988fbc4d7..5e236a81fb3 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -159,11 +160,13 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
     return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
   }
   const auto proto_size = in.ByteSize();
-  void* buf = malloc(proto_size);
+  void* buf = tensorflow::port::Malloc(proto_size);
   in.SerializeToArray(buf, proto_size);
   out->data = buf;
   out->length = proto_size;
-  out->data_deallocator = [](void* data, size_t length) { free(data); };
+  out->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
   return Status::OK();
 }
 
@@ -287,13 +290,15 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto,
 TF_Buffer* TF_NewBuffer() { return new TF_Buffer{nullptr, 0, nullptr}; }
 
 TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) {
-  void* copy = malloc(proto_len);
+  void* copy = tensorflow::port::Malloc(proto_len);
   memcpy(copy, proto, proto_len);
 
   TF_Buffer* buf = new TF_Buffer;
   buf->data = copy;
   buf->length = proto_len;
-  buf->data_deallocator = [](void* data, size_t length) { free(data); };
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
   return buf;
 }
 
@@ -694,7 +699,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
 TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; }
 
 void TF_DeleteLibraryHandle(TF_Library* lib_handle) {
-  free(const_cast<void*>(lib_handle->op_list.data));
+  tensorflow::port::Free(const_cast<void*>(lib_handle->op_list.data));
   delete lib_handle;
 }
 
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 250960d3958..f329e83e14d 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 
@@ -41,7 +42,7 @@ void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // Regardless of the size requested, always allocate a XlaGlobalData. Respect
   // the aligment request because there is alignment checking even for Tensors
   // whose data is never accessed.
-  void* p = port::aligned_malloc(sizeof(XlaGlobalData), alignment);
+  void* p = port::AlignedMalloc(sizeof(XlaGlobalData), alignment);
   VLOG(2) << "Allocated XLA device tensor " << p;
   return new (p) XlaGlobalData();
 }
@@ -50,7 +51,7 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   XlaGlobalData* global_data = reinterpret_cast<XlaGlobalData*>(ptr);
   VLOG(2) << "Deallocated XLA device tensor " << ptr;
   global_data->~XlaGlobalData();
-  port::aligned_free(ptr);
+  port::AlignedFree(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 86a53c929ef..ad3c9217440 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace tensorflow {
@@ -47,7 +48,7 @@ class XlaCompilationAllocator : public Allocator {
     // XlaExpression. Respect the aligment request because there is
     // alignment checking even for Tensors whose data is never
     // accessed.
-    void* p = port::aligned_malloc(sizeof(XlaExpression), alignment);
+    void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
     new (expression) XlaExpression();
     return expression;
@@ -56,7 +57,7 @@ class XlaCompilationAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override {
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(ptr);
     expression->~XlaExpression();
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
   }
 
   // Make sure that even tensors with 0 elements have allocated
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 0fc610a76e4..324183c0536 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -84,12 +84,14 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_proto_library",
     "tf_proto_library_cc",
+    "tf_additional_core_deps",
+    "tf_additional_lib_defines",
+    "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
     "tf_additional_lib_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
-    "tf_additional_lib_deps",
     "tf_additional_stream_executor_srcs",
     "tf_additional_cupti_wrapper_deps",
     "tf_additional_libdevice_data",
@@ -1126,12 +1128,13 @@ cc_library(
         "platform/tracing.h",
     ],
     copts = tf_copts(),
+    defines = tf_additional_lib_defines(),
     linkopts = ["-ldl"],
-    deps = [
+    deps = tf_additional_lib_deps() + [
         ":lib_proto_parsing",
         ":protos_all_cc",
-        "//tensorflow/core/platform/default/build_config:platformlib",
         "//third_party/eigen3",
+        "//tensorflow/core/platform/default/build_config:platformlib",
         "@zlib_archive//:zlib",
     ],
 )
@@ -1351,7 +1354,7 @@ tf_cuda_library(
         ":protos_all_cc",
         "//third_party/eigen3",
         "//tensorflow/core/kernels:required",
-    ] + tf_additional_lib_deps(),
+    ] + tf_additional_core_deps(),
     alwayslink = 1,
 )
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
index ee93b19d291..981a6549889 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
@@ -215,7 +215,7 @@ Status CUPTIManager::DisableTrace() {
 void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size,
                                            size_t *maxNumRecords) {
   VLOG(2) << "BufferRequested";
-  void *p = port::aligned_malloc(kBufferSize, kBufferAlignment);
+  void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment);
   *size = kBufferSize;
   *buffer = reinterpret_cast<uint8_t *>(p);
   *maxNumRecords = 0;
@@ -246,7 +246,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
       LOG(WARNING) << "Dropped " << dropped << " activity records";
     }
   }
-  port::aligned_free(buffer);
+  port::AlignedFree(buffer);
 }
 
 CUPTIManager *GetCUPTIManager() {
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 5842758f0e3..91ce830df85 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -171,9 +171,9 @@ class BasicCPUAllocator : public SubAllocator {
   ~BasicCPUAllocator() override {}
 
   void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::aligned_malloc(num_bytes, alignment);
+    return port::AlignedMalloc(num_bytes, alignment);
   }
-  void Free(void* ptr, size_t num_bytes) override { port::aligned_free(ptr); }
+  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
 };
 
 // Allocator for pinned CPU RAM that is made known to CUDA for the
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 89710a4654c..8ab8712c8cc 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -275,6 +275,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
         "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index addf09672ab..99309a98cab 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "grpc++/grpc++.h"
 #include "grpc++/security/credentials.h"
 #include "grpc++/server_builder.h"
+#include "grpc/support/alloc.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -41,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -304,6 +306,11 @@ class GrpcServerFactory : public ServerFactory {
 class GrpcServerRegistrar {
  public:
   GrpcServerRegistrar() {
+    gpr_allocation_functions alloc_fns = {};  // Hooks not set below stay null.
+    alloc_fns.malloc_fn = port::Malloc;
+    alloc_fns.realloc_fn = port::Realloc;
+    alloc_fns.free_fn = port::Free;
+    gpr_set_allocation_functions(alloc_fns);  // gRPC now allocates via port::*.
     ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory());
   }
 };
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 601d87fa554..812ce4bfe7e 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -68,7 +68,7 @@ class CPUAllocator : public Allocator {
   string Name() override { return "cpu"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    void* p = port::aligned_malloc(num_bytes, alignment);
+    void* p = port::AlignedMalloc(num_bytes, alignment);
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
       mutex_lock l(mu_);
@@ -89,7 +89,7 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
   }
 
   void GetStats(AllocatorStats* stats) override {
diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc
index f56e5fae1bc..f8253353008 100644
--- a/tensorflow/core/framework/load_library.cc
+++ b/tensorflow/core/framework/load_library.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 
@@ -91,7 +92,7 @@ Status LoadLibrary(const char* library_filename, void** result,
   }
   string str;
   library.op_list.SerializeToString(&str);
-  char* str_buf = reinterpret_cast<char*>(malloc(str.length()));
+  char* str_buf = reinterpret_cast<char*>(port::Malloc(str.length()));
   memcpy(str_buf, str.data(), str.length());
   *buf = str_buf;
   *len = str.length();
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 98134392ef7..850cdc39099 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -27,7 +28,7 @@ class TestableSizeTrackingAllocator : public Allocator {
  public:
   string Name() override { return "test"; }
   void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
-    void* ptr = malloc(num_bytes);
+    void* ptr = port::Malloc(num_bytes);
     size_map_[ptr] = num_bytes;
     return ptr;
   }
@@ -35,7 +36,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     const auto& iter = size_map_.find(ptr);
     EXPECT_NE(size_map_.end(), iter);
     size_map_.erase(iter);
-    free(ptr);
+    port::Free(ptr);
   }
   bool TracksAllocationSizes() override { return true; }
   size_t RequestedSize(void* ptr) override {
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 897afe77966..60091fc27fd 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -44,9 +45,9 @@ class LaunchConv2DOp {
 template <class T, size_t size>
 struct Im2ColBufferResource : public ResourceBase {
   Im2ColBufferResource<T, size>() {
-    data = static_cast<T*>(malloc(size * sizeof(T)));
+    data = static_cast<T*>(port::Malloc(size * sizeof(T)));
   }
-  ~Im2ColBufferResource<T, size>() { free(data); }
+  ~Im2ColBufferResource<T, size>() { port::Free(data); }
   // This mutex ensures that only a single operation at a time is able to use
   // the buffer memory held by this resource.
   mutex mu;
diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc
index a7148ed1c75..53998a1821f 100644
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@@ -48,7 +48,8 @@ Arena::Arena(const size_t block_size)
       overflow_blocks_(NULL) {
   assert(block_size > kDefaultAlignment);
 
-  first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
+  first_blocks_[0].mem =
+      reinterpret_cast<char*>(port::AlignedMalloc(block_size_, sizeof(void*)));
 
   first_blocks_[0].size = block_size_;
 
@@ -59,7 +60,9 @@ Arena::~Arena() {
   FreeBlocks();
   assert(overflow_blocks_ == NULL);  // FreeBlocks() should do that
   // The first X blocks stay allocated always by default.  Delete them now.
-  for (size_t i = 0; i < blocks_alloced_; ++i) free(first_blocks_[i].mem);
+  for (size_t i = 0; i < blocks_alloced_; ++i) {
+    port::AlignedFree(first_blocks_[i].mem);
+  }
 }
 
 // Returns true iff it advances freestart_ to the first position
@@ -162,8 +165,11 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,
 
   // Must be a multiple of kDefaultAlignment, unless requested
   // alignment is 1, in which case we don't care at all.
-  const uint32 adjusted_alignment =
+  uint32 adjusted_alignment =
       (alignment > 1 ? LeastCommonMultiple(alignment, kDefaultAlignment) : 1);
+  // Required minimum alignment for port::AlignedMalloc().
+  adjusted_alignment =
+      std::max(adjusted_alignment, static_cast<uint32>(sizeof(void*)));
 
   CHECK_LE(adjusted_alignment, static_cast<uint32>(1 << 20))
       << "Alignment on boundaries greater than 1MB not supported.";
@@ -171,16 +177,12 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,
   // If block_size > alignment we force block_size to be a multiple
   // of alignment; if block_size < alignment we make no adjustment.
   size_t adjusted_block_size = block_size;
-  if (adjusted_alignment > 1) {
-    if (adjusted_block_size > adjusted_alignment) {
-      const uint32 excess = adjusted_block_size % adjusted_alignment;
-      adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
-    }
-    block->mem = reinterpret_cast<char*>(
-        port::aligned_malloc(adjusted_block_size, adjusted_alignment));
-  } else {
-    block->mem = reinterpret_cast<char*>(malloc(adjusted_block_size));
+  if (adjusted_block_size > adjusted_alignment) {
+    const uint32 excess = adjusted_block_size % adjusted_alignment;
+    adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
   }
+  block->mem = reinterpret_cast<char*>(
+      port::AlignedMalloc(adjusted_block_size, adjusted_alignment));
   block->size = adjusted_block_size;
   CHECK(NULL != block->mem) << "block_size=" << block_size
                             << " adjusted_block_size=" << adjusted_block_size
@@ -242,7 +244,7 @@ void* Arena::GetMemoryFallback(const size_t size, const int alignment) {
 
 void Arena::FreeBlocks() {
   for (size_t i = 1; i < blocks_alloced_; ++i) {  // keep first block alloced
-    free(first_blocks_[i].mem);
+    port::AlignedFree(first_blocks_[i].mem);
     first_blocks_[i].mem = NULL;
     first_blocks_[i].size = 0;
   }
@@ -250,7 +252,7 @@ void Arena::FreeBlocks() {
   if (overflow_blocks_ != NULL) {
     std::vector<AllocatedBlock>::iterator it;
     for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
-      free(it->mem);
+      port::AlignedFree(it->mem);
     }
     delete overflow_blocks_;  // These should be used very rarely
     overflow_blocks_ = NULL;
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index fc439f9eb66..d6e5d9effa7 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -45,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
 #include <initializer_list>  // NOLINT(build/include_order)
@@ -353,7 +354,7 @@ class InlinedVector {
     size_t n = size();
     Destroy(base, n);
     if (!is_inline()) {
-      free(base);
+      port::Free(base);
     }
   }
 
@@ -434,7 +435,7 @@ class InlinedVector {
     }
 
     T* src = data();
-    T* dst = static_cast<T*>(malloc(target * sizeof(T)));
+    T* dst = static_cast<T*>(port::Malloc(target * sizeof(T)));
 
     // Need to copy elem before discarding src since it might alias src.
     InitType{}(dst + s, std::forward<Args>(args)...);
diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h
index 8f041a13538..0a76e0962e6 100644
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mem.h"  // For aligned_malloc/aligned_free
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 namespace gtl {
@@ -127,9 +127,9 @@ class ManualConstructor {
   // Support users creating arrays of ManualConstructor<>s.  This ensures that
   // the array itself has the correct alignment.
   static void* operator new[](size_t size) {
-    return port::aligned_malloc(size, TF_LIB_GTL_ALIGN_OF(Type));
+    return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type));
   }
-  static void operator delete[](void* mem) { port::aligned_free(mem); }
+  static void operator delete[](void* mem) { port::AlignedFree(mem); }
 
   inline Type* get() { return reinterpret_cast<Type*>(space_); }
   inline const Type* get() const {
diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc
index 93c4ec51d95..31ba3e337f9 100644
--- a/tensorflow/core/platform/cloud/http_request_test.cc
+++ b/tensorflow/core/platform/cloud/http_request_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -172,7 +173,8 @@ class FakeLibCurl : public LibCurl {
       temp_str.replace(n, victim.size(), encoded);
       n += encoded.size();
     }
-    char* out_char_str = (char*)malloc(sizeof(char) * temp_str.size() + 1);
+    char* out_char_str =
+        (char*)port::Malloc(sizeof(char) * temp_str.size() + 1);
     std::copy(temp_str.begin(), temp_str.end(), out_char_str);
     out_char_str[temp_str.size()] = '\0';
     return out_char_str;
@@ -180,7 +182,7 @@ class FakeLibCurl : public LibCurl {
   void curl_slist_free_all(curl_slist* list) override {
     delete reinterpret_cast<std::vector<string>*>(list);
   }
-  void curl_free(void* p) override { free(p); }
+  void curl_free(void* p) override { port::Free(p); }
 
   // Variables defining the behavior of this fake.
   string response_content;
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 80c23b1df15..168f9df2e84 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -3,10 +3,11 @@
 load("@protobuf//:protobuf.bzl", "cc_proto_library")
 load("@protobuf//:protobuf.bzl", "py_proto_library")
 
-# configure may change the following lines to True
+# configure may change the following lines
 WITH_GCP_SUPPORT = False
 WITH_HDFS_SUPPORT = False
 WITH_XLA_SUPPORT = False
+WITH_JEMALLOC = True
 
 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
@@ -176,7 +177,29 @@ def tf_additional_test_srcs():
 def tf_kernel_tests_linkstatic():
   return 0
 
+# Returns extra defines: TENSORFLOW_USE_JEMALLOC on linux_x86_64 builds when
+# WITH_JEMALLOC is True. TODO(jhseu): Enable jemalloc on other platforms.
+def tf_additional_lib_defines():
+  defines = []
+  if WITH_JEMALLOC:
+    defines += select({
+        "//tensorflow:linux_x86_64": [
+            "TENSORFLOW_USE_JEMALLOC"
+        ],
+        "//conditions:default": [],
+    })
+  return defines
+
 def tf_additional_lib_deps():
+  deps = []
+  if WITH_JEMALLOC:
+    deps += select({
+        "//tensorflow:linux_x86_64": ["@jemalloc"],
+        "//conditions:default": [],
+    })
+  return deps
+
+def tf_additional_core_deps():
   deps = []
   if WITH_GCP_SUPPORT:
     deps.append("//tensorflow/core/platform/cloud:gcs_file_system")
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index 6618145c3d1..dc389a87415 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -24,9 +24,14 @@ limitations under the License.
 namespace tensorflow {
 namespace port {
 
-// Aligned allocation/deallocation
-void* aligned_malloc(size_t size, int minimum_alignment);
-void aligned_free(void* aligned_memory);
+// Aligned allocation/deallocation. `minimum_alignment` must be a power of 2
+// and a multiple of sizeof(void*).
+void* AlignedMalloc(size_t size, int minimum_alignment);
+void AlignedFree(void* aligned_memory);
+
+void* Malloc(size_t size);  // Pair with Free(); may be backed by jemalloc.
+void* Realloc(void* ptr, size_t size);  // Same allocator as Malloc().
+void Free(void* ptr);  // Frees Malloc()/Realloc() memory only.
 
 // Tries to release num_bytes of free memory back to the operating
 // system for reuse.  Use this routine with caution -- to get this
diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc
index 8d98eb25a20..8930e49ff84 100644
--- a/tensorflow/core/platform/port_test.cc
+++ b/tensorflow/core/platform/port_test.cc
@@ -25,11 +25,11 @@ namespace port {
 
 TEST(Port, AlignedMalloc) {
   for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
-    void* p = aligned_malloc(1, alignment);
-    ASSERT_TRUE(p != NULL) << "aligned_malloc(1, " << alignment << ")";
+    void* p = AlignedMalloc(1, alignment);
+    ASSERT_TRUE(p != NULL) << "AlignedMalloc(1, " << alignment << ")";
     uintptr_t pval = reinterpret_cast<uintptr_t>(p);
     EXPECT_EQ(pval % alignment, 0);
-    aligned_free(p);
+    AlignedFree(p);
   }
 }
 
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 84bc9492b57..7dce43f0cc4 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -13,8 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifdef TENSORFLOW_USE_JEMALLOC
+#include "jemalloc/jemalloc.h"
+#endif
+
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
@@ -60,7 +65,7 @@ int NumSchedulableCPUs() {
   return kDefaultCores;
 }
 
-void *aligned_malloc(size_t size, int minimum_alignment) {
+void *AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
   return memalign(minimum_alignment, size);
 #else  // !defined(__ANDROID__)
@@ -69,15 +74,45 @@ void *aligned_malloc(size_t size, int minimum_alignment) {
   // sizeof(void*). In this case, fall back on malloc which should return
   // memory aligned to at least the size of a pointer.
   const int required_alignment = sizeof(void *);
-  if (minimum_alignment < required_alignment) return malloc(size);
-  if (posix_memalign(&ptr, minimum_alignment, size) != 0)
+  if (minimum_alignment < required_alignment) return Malloc(size);
+#ifdef TENSORFLOW_USE_JEMALLOC
+  int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
+#else
+  int err = posix_memalign(&ptr, minimum_alignment, size);
+#endif
+  if (err != 0) {
     return NULL;
-  else
+  } else {
     return ptr;
+  }
 #endif
 }
 
-void aligned_free(void *aligned_memory) { free(aligned_memory); }
+void AlignedFree(void *aligned_memory) { Free(aligned_memory); }
+
+void *Malloc(size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  return jemalloc_malloc(size);
+#else  // !TENSORFLOW_USE_JEMALLOC
+  return malloc(size);
+#endif  // TENSORFLOW_USE_JEMALLOC
+}
+
+void *Realloc(void *ptr, size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  return jemalloc_realloc(ptr, size);
+#else  // !TENSORFLOW_USE_JEMALLOC
+  return realloc(ptr, size);
+#endif  // TENSORFLOW_USE_JEMALLOC
+}
+
+void Free(void *ptr) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+  jemalloc_free(ptr);
+#else  // !TENSORFLOW_USE_JEMALLOC
+  free(ptr);
+#endif  // TENSORFLOW_USE_JEMALLOC
+}
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index ee5be221cd6..b2167081a69 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -52,11 +52,17 @@ int NumSchedulableCPUs() {
   return system_info.dwNumberOfProcessors;
 }
 
-void* aligned_malloc(size_t size, int minimum_alignment) {
+void* AlignedMalloc(size_t size, int minimum_alignment) {
   return _aligned_malloc(size, minimum_alignment);
 }
 
-void aligned_free(void* aligned_memory) { _aligned_free(aligned_memory); }
+void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+void* Malloc(size_t size) { return ::malloc(size); }
+
+void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); }
+
+void Free(void* ptr) { ::free(ptr); }
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 5570cea32fc..62fb9b9176e 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -87,6 +87,7 @@ filegroup(
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
         "@highwayhash//:LICENSE",
+        "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5cea08e2f3a..06f9ca88a72 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -376,3 +376,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "junit",
       actual = "@junit_jar//jar",
   )
+
+  native.new_http_archive(
+      name = "jemalloc",
+      urls = [
+          "http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+          "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+      ],
+      sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
+      strip_prefix = "jemalloc-4.4.0",
+      build_file = str(Label("//third_party:jemalloc.BUILD")),
+  )
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
new file mode 100644
index 00000000000..2496d126277
--- /dev/null
+++ b/third_party/jemalloc.BUILD
@@ -0,0 +1,321 @@
+# Description:
+# jemalloc - a general-purpose scalable concurrent malloc implementation
+
+licenses(["notice"])  # BSD
+
+exports_files(["COPYING"])
+
+load("@//third_party:common.bzl", "template_rule")
+
+cc_library(
+    name = "jemalloc",
+    srcs = [
+        "src/arena.c",
+        "src/atomic.c",
+        "src/base.c",
+        "src/bitmap.c",
+        "src/chunk.c",
+        "src/chunk_dss.c",
+        "src/chunk_mmap.c",
+        "src/ckh.c",
+        "src/ctl.c",
+        "src/extent.c",
+        "src/hash.c",
+        "src/huge.c",
+        "src/jemalloc.c",
+        "src/mb.c",
+        "src/mutex.c",
+        "src/nstime.c",
+        "src/pages.c",
+        "src/prng.c",
+        "src/prof.c",
+        "src/quarantine.c",
+        "src/rtree.c",
+        "src/spin.c",
+        "src/stats.c",
+        "src/tcache.c",
+        "src/tsd.c",
+        "src/util.c",
+        "src/witness.c",
+    ],
+    hdrs = [
+        "include/jemalloc/internal/arena.h",
+        "include/jemalloc/internal/assert.h",
+        "include/jemalloc/internal/atomic.h",
+        "include/jemalloc/internal/base.h",
+        "include/jemalloc/internal/bitmap.h",
+        "include/jemalloc/internal/chunk.h",
+        "include/jemalloc/internal/chunk_dss.h",
+        "include/jemalloc/internal/chunk_mmap.h",
+        "include/jemalloc/internal/ckh.h",
+        "include/jemalloc/internal/ctl.h",
+        "include/jemalloc/internal/extent.h",
+        "include/jemalloc/internal/hash.h",
+        "include/jemalloc/internal/huge.h",
+        "include/jemalloc/internal/jemalloc_internal.h",
+        "include/jemalloc/internal/jemalloc_internal_decls.h",
+        "include/jemalloc/internal/jemalloc_internal_defs.h",
+        "include/jemalloc/internal/jemalloc_internal_macros.h",
+        "include/jemalloc/internal/mb.h",
+        "include/jemalloc/internal/mutex.h",
+        "include/jemalloc/internal/nstime.h",
+        "include/jemalloc/internal/pages.h",
+        "include/jemalloc/internal/ph.h",
+        "include/jemalloc/internal/private_namespace.h",
+        "include/jemalloc/internal/prng.h",
+        "include/jemalloc/internal/prof.h",
+        "include/jemalloc/internal/ql.h",
+        "include/jemalloc/internal/qr.h",
+        "include/jemalloc/internal/quarantine.h",
+        "include/jemalloc/internal/rb.h",
+        "include/jemalloc/internal/rtree.h",
+        "include/jemalloc/internal/size_classes.h",
+        "include/jemalloc/internal/smoothstep.h",
+        "include/jemalloc/internal/spin.h",
+        "include/jemalloc/internal/stats.h",
+        "include/jemalloc/internal/tcache.h",
+        "include/jemalloc/internal/ticker.h",
+        "include/jemalloc/internal/tsd.h",
+        "include/jemalloc/internal/util.h",
+        "include/jemalloc/internal/valgrind.h",
+        "include/jemalloc/internal/witness.h",
+        "include/jemalloc/jemalloc.h",
+    ],
+    # Same flags that jemalloc uses to build.
+    copts = [
+        "-O3",
+        "-funroll-loops",
+        "-D_GNU_SOURCE",
+        "-D_REENTRANT",
+    ],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
+
+sh_binary(
+    name = "jemalloc_sh",
+    srcs = ["include/jemalloc/jemalloc.sh"],
+)
+
+genrule(
+    name = "jemalloc_h",
+    srcs = [
+        ":jemalloc_defs_h",
+        ":jemalloc_macros_h",
+        ":jemalloc_mangle_h",
+        ":jemalloc_protos_h",
+        ":jemalloc_rename_h",
+        ":jemalloc_typedefs_h",
+    ],
+    outs = ["include/jemalloc/jemalloc.h"],
+    cmd = "$(location :jemalloc_sh) $$(dirname $(location :jemalloc_defs_h))/../../ >$@",
+    tools = [":jemalloc_sh"],
+)
+
+# Symbols exported from jemalloc, one `name:jemalloc_-prefixed name` pair per line. Add to this list to export more.
+genrule(
+    name = "public_symbols_txt",
+    outs = ["include/jemalloc/internal/public_symbols.txt"],
+    cmd = "\n".join([
+        "cat <<'EOF' > $@",
+        "free:jemalloc_free",
+        "malloc:jemalloc_malloc",
+        "posix_memalign:jemalloc_posix_memalign",
+        "realloc:jemalloc_realloc",
+        "EOF",
+    ]),
+)
+
+sh_binary(
+    name = "jemalloc_mangle_sh",
+    srcs = ["include/jemalloc/jemalloc_mangle.sh"],
+)
+
+genrule(
+    name = "jemalloc_mangle_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/jemalloc_mangle.h"],
+    cmd = "$(location :jemalloc_mangle_sh) $(location :public_symbols_txt) je_ >$@",
+    tools = [":jemalloc_mangle_sh"],
+)
+
+sh_binary(
+    name = "jemalloc_rename_sh",
+    srcs = ["include/jemalloc/jemalloc_rename.sh"],
+)
+
+genrule(
+    name = "jemalloc_rename_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/jemalloc_rename.h"],
+    cmd = "$(location :jemalloc_rename_sh) $(location :public_symbols_txt) >$@",
+    tools = [":jemalloc_rename_sh"],
+)
+
+sh_binary(
+    name = "private_namespace_sh",
+    srcs = ["include/jemalloc/internal/private_namespace.sh"],
+)
+
+genrule(
+    name = "private_namespace_h",
+    srcs = ["include/jemalloc/internal/private_symbols.txt"],
+    outs = ["include/jemalloc/internal/private_namespace.h"],
+    cmd = "$(location :private_namespace_sh) $(location include/jemalloc/internal/private_symbols.txt) >$@",
+    tools = [":private_namespace_sh"],
+)
+
+sh_binary(
+    name = "public_namespace_sh",
+    srcs = ["include/jemalloc/internal/public_namespace.sh"],
+)
+
+genrule(
+    name = "public_namespace_h",
+    srcs = [":public_symbols_txt"],
+    outs = ["include/jemalloc/internal/public_namespace.h"],
+    cmd = "$(location :public_namespace_sh) $(location :public_symbols_txt) >$@",
+    tools = [":public_namespace_sh"],
+)
+
+sh_binary(
+    name = "size_classes_sh",
+    srcs = ["include/jemalloc/internal/size_classes.sh"],
+)
+
+# Size classes for Linux x86_64 (args: lg quanta, LG_TINY_MIN, LG_PAGE, lg group size). Update if
+# adding builds for other architectures. See size_classes.sh for details on the arguments.
+genrule(
+    name = "size_classes_h",
+    outs = ["include/jemalloc/internal/size_classes.h"],
+    cmd = "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+    tools = [":size_classes_sh"],
+)
+
+template_rule(
+    name = "jemalloc_internal_h",
+    src = "include/jemalloc/internal/jemalloc_internal.h.in",
+    out = "include/jemalloc/internal/jemalloc_internal.h",
+    substitutions = {
+        "@private_namespace@": "je_",
+        "@install_suffix@": "",
+    },
+)
+
+template_rule(
+    name = "jemalloc_internal_defs_h",
+    src = "include/jemalloc/internal/jemalloc_internal_defs.h.in",
+    out = "include/jemalloc/internal/jemalloc_internal_defs.h",
+    substitutions = {
+        "#undef JEMALLOC_PREFIX": "#define JEMALLOC_PREFIX \"jemalloc_\"",
+        "#undef JEMALLOC_CPREFIX": "#define JEMALLOC_CPREFIX \"JEMALLOC_\"",
+        "#undef JEMALLOC_PRIVATE_NAMESPACE": "#define JEMALLOC_PRIVATE_NAMESPACE je_",
+        "#undef CPU_SPINWAIT": "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
+        "#undef JEMALLOC_HAVE_BUILTIN_CLZ": "#define JEMALLOC_HAVE_BUILTIN_CLZ",
+        "#undef JEMALLOC_USE_SYSCALL": "#define JEMALLOC_USE_SYSCALL",
+        "#undef JEMALLOC_HAVE_SECURE_GETENV": "#define JEMALLOC_HAVE_SECURE_GETENV",
+        "#undef JEMALLOC_HAVE_PTHREAD_ATFORK": "#define JEMALLOC_HAVE_PTHREAD_ATFORK",
+        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1",
+        # Newline required: without it this key is a prefix of the JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE line.
+        "#undef JEMALLOC_HAVE_CLOCK_MONOTONIC\n": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1\n",
+        "#undef JEMALLOC_THREADED_INIT": "#define JEMALLOC_THREADED_INIT",
+        "#undef JEMALLOC_TLS_MODEL": "#define JEMALLOC_TLS_MODEL __attribute__((tls_model(\"initial-exec\")))",
+        "#undef JEMALLOC_CC_SILENCE": "#define JEMALLOC_CC_SILENCE",
+        "#undef JEMALLOC_STATS": "#define JEMALLOC_STATS",
+        "#undef JEMALLOC_TCACHE": "#define JEMALLOC_TCACHE",
+        "#undef JEMALLOC_DSS": "#define JEMALLOC_DSS",
+        "#undef JEMALLOC_FILL": "#define JEMALLOC_FILL",
+        "#undef LG_TINY_MIN": "#define LG_TINY_MIN 3",
+        "#undef LG_PAGE": "#define LG_PAGE 12",
+        "#undef JEMALLOC_MAPS_COALESCE": "#define JEMALLOC_MAPS_COALESCE",
+        "#undef JEMALLOC_TLS": "#define JEMALLOC_TLS",
+        "#undef JEMALLOC_INTERNAL_UNREACHABLE": "#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable",
+        "#undef JEMALLOC_INTERNAL_FFSLL": "#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll",
+        # Newline required: JEMALLOC_INTERNAL_FFS is a prefix of ..._FFSL, which is a prefix of ..._FFSLL.
+        "#undef JEMALLOC_INTERNAL_FFSL\n": "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl\n",
+        "#undef JEMALLOC_INTERNAL_FFS\n": "#define JEMALLOC_INTERNAL_FFS __builtin_ffs\n",
+        "#undef JEMALLOC_CACHE_OBLIVIOUS": "#define JEMALLOC_CACHE_OBLIVIOUS",
+        "#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY": "#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY",
+        "#undef JEMALLOC_HAVE_MADVISE": "#define JEMALLOC_HAVE_MADVISE",
+        "#undef JEMALLOC_PURGE_MADVISE_DONTNEED": "#define JEMALLOC_PURGE_MADVISE_DONTNEED",
+        "#undef JEMALLOC_THP": "#define JEMALLOC_THP",
+        "#undef JEMALLOC_HAS_ALLOCA_H": "#define JEMALLOC_HAS_ALLOCA_H 1",
+        # Newline required: LG_SIZEOF_INT and LG_SIZEOF_LONG are prefixes of LG_SIZEOF_INTMAX_T and LG_SIZEOF_LONG_LONG.
+        "#undef LG_SIZEOF_INT\n": "#define LG_SIZEOF_INT 2\n",
+        "#undef LG_SIZEOF_LONG\n": "#define LG_SIZEOF_LONG 3\n",
+        "#undef LG_SIZEOF_LONG_LONG": "#define LG_SIZEOF_LONG_LONG 3",
+        "#undef LG_SIZEOF_INTMAX_T": "#define LG_SIZEOF_INTMAX_T 3",
+        "#undef JEMALLOC_GLIBC_MALLOC_HOOK": "#define JEMALLOC_GLIBC_MALLOC_HOOK",
+        "#undef JEMALLOC_GLIBC_MEMALIGN_HOOK": "#define JEMALLOC_GLIBC_MEMALIGN_HOOK",
+        "#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP": "#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP",
+        "#undef JEMALLOC_CONFIG_MALLOC_CONF": "#define JEMALLOC_CONFIG_MALLOC_CONF \"\"",
+    },
+)
+
+template_rule(
+    name = "jemalloc_defs_h",
+    src = "include/jemalloc/jemalloc_defs.h.in",
+    out = "include/jemalloc/jemalloc_defs.h",
+    substitutions = {
+        "#undef JEMALLOC_HAVE_ATTR": "#define JEMALLOC_HAVE_ATTR",
+        "#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE": "#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE",
+        "#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF",
+        "#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF",
+        "#undef JEMALLOC_OVERRIDE_MEMALIGN": "#define JEMALLOC_OVERRIDE_MEMALIGN",
+        "#undef JEMALLOC_OVERRIDE_VALLOC": "#define JEMALLOC_OVERRIDE_VALLOC",
+        "#undef JEMALLOC_USABLE_SIZE_CONST": "#define JEMALLOC_USABLE_SIZE_CONST",
+        "#undef JEMALLOC_USE_CXX_THROW": "#define JEMALLOC_USE_CXX_THROW",
+        "#undef LG_SIZEOF_PTR": "#define LG_SIZEOF_PTR 3",
+    },
+)
+
+template_rule(
+    name = "jemalloc_macros_h",
+    src = "include/jemalloc/jemalloc_macros.h.in",
+    out = "include/jemalloc/jemalloc_macros.h",
+    substitutions = {
+        "@jemalloc_version@": "0.0.0",
+        "@jemalloc_version_major@": "0",
+        "@jemalloc_version_minor@": "0",
+        "@jemalloc_version_bugfix@": "0",
+        "@jemalloc_version_nrev@": "0",
+        "@jemalloc_version_gid@": "0000000000000000000000000000000000000000",
+    },
+)
+
+template_rule(
+    name = "jemalloc_protos_h",
+    src = "include/jemalloc/jemalloc_protos.h.in",
+    out = "include/jemalloc/jemalloc_protos.h",
+    substitutions = {
+        "@aligned_alloc": "aligned_alloc",
+        "@calloc": "calloc",
+        "@cbopaque": "cbopaque",
+        "@dallocx": "dallocx",
+        "@free": "free",
+        "@je": "je",
+        "@mallctl": "mallctl",
+        "@mallctlnametomib": "mallctlnametomib",
+        "@mallctlbymib": "mallctlbymib",
+        "@malloc_stats_print": "malloc_stats_print",
+        "@malloc_usable_size": "malloc_usable_size",
+        "@malloc": "malloc",
+        "@mallocx": "mallocx",
+        "@memalign": "memalign",
+        "@nallocx": "nallocx",
+        "@posix_memalign": "posix_memalign",
+        "@rallocx": "rallocx",
+        "@realloc": "realloc",
+        "@sallocx": "sallocx",
+        "@sdallocx": "sdallocx",
+        "@valloc": "valloc",
+        "@xallocx": "xallocx",
+    },
+)
+
+template_rule(
+    name = "jemalloc_typedefs_h",
+    src = "include/jemalloc/jemalloc_typedefs.h.in",
+    out = "include/jemalloc/jemalloc_typedefs.h",
+    substitutions = {},
+)