Switch the open-source build to use jemalloc for CPU Tensor memory allocation, gRPC, and other places where we call malloc/free.

- Only enabled on Linux for now.
- Added as a ./configure option defaulting to enabled.
Change: 144266237
This commit is contained in:
Jonathan Hseu 2017-01-11 16:39:35 -08:00 committed by TensorFlower Gardener
parent 99e1b19ceb
commit 83c6e0c63a
25 changed files with 505 additions and 59 deletions

18
configure vendored
View File

@ -57,9 +57,27 @@ done
if is_windows; then
TF_NEED_GCP=0
TF_NEED_HDFS=0
TF_NEED_JEMALLOC=0
TF_NEED_OPENCL=0
fi
# Ask whether jemalloc should be used as the malloc implementation. An empty
# reply selects the default (enabled); any other unrecognised reply is
# reported and the question is asked again. The loop is skipped entirely when
# TF_NEED_JEMALLOC already has a value (e.g. 0 from the Windows branch above).
while [ "$TF_NEED_JEMALLOC" == "" ]; do
  # -r keeps backslashes in the reply literal; the prompt is a single string
  # so it does not depend on how a continuation line is indented.
  read -r -p "Do you wish to use jemalloc as the malloc implementation? (Linux only) [Y/n] " INPUT
  case "$INPUT" in
    [Yy]* | "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
    [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;;
    # Quote the reply so it is echoed verbatim instead of being word-split
    # and glob-expanded by the shell.
    * ) echo "Invalid selection: " "$INPUT";;
  esac
done
# Persist the choice by rewriting the WITH_JEMALLOC flag in build_config.bzl
# in place: "1" turns it on, every other value turns it off.
case "$TF_NEED_JEMALLOC" in
  1 )
    sed -i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
    ;;
  * )
    sed -i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
    ;;
esac
while [ "$TF_NEED_GCP" == "" ]; do
read -p "Do you wish to build TensorFlow with "\
"Google Cloud Platform support? [y/N] " INPUT

View File

@ -37,6 +37,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/thread_annotations.h"
@ -159,11 +160,13 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
}
const auto proto_size = in.ByteSize();
void* buf = malloc(proto_size);
void* buf = tensorflow::port::Malloc(proto_size);
in.SerializeToArray(buf, proto_size);
out->data = buf;
out->length = proto_size;
out->data_deallocator = [](void* data, size_t length) { free(data); };
out->data_deallocator = [](void* data, size_t length) {
tensorflow::port::Free(data);
};
return Status::OK();
}
@ -287,13 +290,15 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto,
TF_Buffer* TF_NewBuffer() { return new TF_Buffer{nullptr, 0, nullptr}; }
TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) {
void* copy = malloc(proto_len);
void* copy = tensorflow::port::Malloc(proto_len);
memcpy(copy, proto, proto_len);
TF_Buffer* buf = new TF_Buffer;
buf->data = copy;
buf->length = proto_len;
buf->data_deallocator = [](void* data, size_t length) { free(data); };
buf->data_deallocator = [](void* data, size_t length) {
tensorflow::port::Free(data);
};
return buf;
}
@ -694,7 +699,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; }
void TF_DeleteLibraryHandle(TF_Library* lib_handle) {
free(const_cast<void*>(lib_handle->op_list.data));
tensorflow::port::Free(const_cast<void*>(lib_handle->op_list.data));
delete lib_handle;
}

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/platform/mem.h"
namespace tensorflow {
@ -41,7 +42,7 @@ void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
// Regardless of the size requested, always allocate a XlaGlobalData. Respect
// the alignment request because there is alignment checking even for Tensors
// whose data is never accessed.
void* p = port::aligned_malloc(sizeof(XlaGlobalData), alignment);
void* p = port::AlignedMalloc(sizeof(XlaGlobalData), alignment);
VLOG(2) << "Allocated XLA device tensor " << p;
return new (p) XlaGlobalData();
}
@ -50,7 +51,7 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
XlaGlobalData* global_data = reinterpret_cast<XlaGlobalData*>(ptr);
VLOG(2) << "Deallocated XLA device tensor " << ptr;
global_data->~XlaGlobalData();
port::aligned_free(ptr);
port::AlignedFree(ptr);
}
void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }

View File

@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace tensorflow {
@ -47,7 +48,7 @@ class XlaCompilationAllocator : public Allocator {
// XlaExpression. Respect the alignment request because there is
// alignment checking even for Tensors whose data is never
// accessed.
void* p = port::aligned_malloc(sizeof(XlaExpression), alignment);
void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
new (expression) XlaExpression();
return expression;
@ -56,7 +57,7 @@ class XlaCompilationAllocator : public Allocator {
void DeallocateRaw(void* ptr) override {
XlaExpression* expression = reinterpret_cast<XlaExpression*>(ptr);
expression->~XlaExpression();
port::aligned_free(ptr);
port::AlignedFree(ptr);
}
// Make sure that even tensors with 0 elements have allocated

View File

@ -84,12 +84,14 @@ load(
"//tensorflow/core:platform/default/build_config.bzl",
"tf_proto_library",
"tf_proto_library_cc",
"tf_additional_core_deps",
"tf_additional_lib_defines",
"tf_additional_lib_deps",
"tf_additional_lib_hdrs",
"tf_additional_lib_srcs",
"tf_additional_minimal_lib_srcs",
"tf_additional_proto_hdrs",
"tf_additional_proto_srcs",
"tf_additional_lib_deps",
"tf_additional_stream_executor_srcs",
"tf_additional_cupti_wrapper_deps",
"tf_additional_libdevice_data",
@ -1126,12 +1128,13 @@ cc_library(
"platform/tracing.h",
],
copts = tf_copts(),
defines = tf_additional_lib_defines(),
linkopts = ["-ldl"],
deps = [
deps = tf_additional_lib_deps() + [
":lib_proto_parsing",
":protos_all_cc",
"//tensorflow/core/platform/default/build_config:platformlib",
"//third_party/eigen3",
"//tensorflow/core/platform/default/build_config:platformlib",
"@zlib_archive//:zlib",
],
)
@ -1351,7 +1354,7 @@ tf_cuda_library(
":protos_all_cc",
"//third_party/eigen3",
"//tensorflow/core/kernels:required",
] + tf_additional_lib_deps(),
] + tf_additional_core_deps(),
alwayslink = 1,
)

View File

@ -215,7 +215,7 @@ Status CUPTIManager::DisableTrace() {
void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) {
VLOG(2) << "BufferRequested";
void *p = port::aligned_malloc(kBufferSize, kBufferAlignment);
void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment);
*size = kBufferSize;
*buffer = reinterpret_cast<uint8_t *>(p);
*maxNumRecords = 0;
@ -246,7 +246,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
LOG(WARNING) << "Dropped " << dropped << " activity records";
}
}
port::aligned_free(buffer);
port::AlignedFree(buffer);
}
CUPTIManager *GetCUPTIManager() {

View File

@ -171,9 +171,9 @@ class BasicCPUAllocator : public SubAllocator {
~BasicCPUAllocator() override {}
void* Alloc(size_t alignment, size_t num_bytes) override {
return port::aligned_malloc(num_bytes, alignment);
return port::AlignedMalloc(num_bytes, alignment);
}
void Free(void* ptr, size_t num_bytes) override { port::aligned_free(ptr); }
void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
// Allocator for pinned CPU RAM that is made known to CUDA for the

View File

@ -275,6 +275,7 @@ cc_library(
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/distributed_runtime:worker_env",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc_unsecure",
],
alwayslink = 1,
)

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "grpc++/grpc++.h"
#include "grpc++/security/credentials.h"
#include "grpc++/server_builder.h"
#include "grpc/support/alloc.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
@ -41,6 +42,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
@ -304,6 +306,11 @@ class GrpcServerFactory : public ServerFactory {
// Performs the gRPC server setup in its constructor: installs the
// tensorflow::port allocation functions as gRPC's allocation functions and
// registers a GrpcServerFactory under the name "GRPC_SERVER".
class GrpcServerRegistrar {
 public:
  GrpcServerRegistrar() {
    // Value-initialize the struct so that any member not explicitly assigned
    // below is null instead of holding an indeterminate value when the struct
    // is handed to gRPC by value.
    gpr_allocation_functions alloc_fns = gpr_allocation_functions();
    alloc_fns.malloc_fn = port::Malloc;
    alloc_fns.realloc_fn = port::Realloc;
    alloc_fns.free_fn = port::Free;
    gpr_set_allocation_functions(alloc_fns);
    ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory());
  }
};

View File

@ -68,7 +68,7 @@ class CPUAllocator : public Allocator {
string Name() override { return "cpu"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override {
void* p = port::aligned_malloc(num_bytes, alignment);
void* p = port::AlignedMalloc(num_bytes, alignment);
if (cpu_allocator_collect_stats) {
const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
mutex_lock l(mu_);
@ -89,7 +89,7 @@ class CPUAllocator : public Allocator {
mutex_lock l(mu_);
stats_.bytes_in_use -= alloc_size;
}
port::aligned_free(ptr);
port::AlignedFree(ptr);
}
void GetStats(AllocatorStats* stats) override {

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mem.h"
namespace tensorflow {
@ -91,7 +92,7 @@ Status LoadLibrary(const char* library_filename, void** result,
}
string str;
library.op_list.SerializeToString(&str);
char* str_buf = reinterpret_cast<char*>(malloc(str.length()));
char* str_buf = reinterpret_cast<char*>(port::Malloc(str.length()));
memcpy(str_buf, str.data(), str.length());
*buf = str_buf;
*len = str.length();

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/test.h"
namespace tensorflow {
@ -27,7 +28,7 @@ class TestableSizeTrackingAllocator : public Allocator {
public:
string Name() override { return "test"; }
void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
void* ptr = malloc(num_bytes);
void* ptr = port::Malloc(num_bytes);
size_map_[ptr] = num_bytes;
return ptr;
}
@ -35,7 +36,7 @@ class TestableSizeTrackingAllocator : public Allocator {
const auto& iter = size_map_.find(ptr);
EXPECT_NE(size_map_.end(), iter);
size_map_.erase(iter);
free(ptr);
port::Free(ptr);
}
bool TracksAllocationSizes() override { return true; }
size_t RequestedSize(void* ptr) override {

View File

@ -18,6 +18,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/util/tensor_format.h"
#if GOOGLE_CUDA
@ -44,9 +45,9 @@ class LaunchConv2DOp {
template <class T, size_t size>
struct Im2ColBufferResource : public ResourceBase {
Im2ColBufferResource<T, size>() {
data = static_cast<T*>(malloc(size * sizeof(T)));
data = static_cast<T*>(port::Malloc(size * sizeof(T)));
}
~Im2ColBufferResource<T, size>() { free(data); }
~Im2ColBufferResource<T, size>() { port::Free(data); }
// This mutex ensures that only a single operation at a time is able to use
// the buffer memory held by this resource.
mutex mu;

View File

@ -48,7 +48,8 @@ Arena::Arena(const size_t block_size)
overflow_blocks_(NULL) {
assert(block_size > kDefaultAlignment);
first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
first_blocks_[0].mem =
reinterpret_cast<char*>(port::AlignedMalloc(block_size_, sizeof(void*)));
first_blocks_[0].size = block_size_;
@ -59,7 +60,9 @@ Arena::~Arena() {
FreeBlocks();
assert(overflow_blocks_ == NULL); // FreeBlocks() should do that
// The first X blocks stay allocated always by default. Delete them now.
for (size_t i = 0; i < blocks_alloced_; ++i) free(first_blocks_[i].mem);
for (size_t i = 0; i < blocks_alloced_; ++i) {
port::AlignedFree(first_blocks_[i].mem);
}
}
// Returns true iff it advances freestart_ to the first position
@ -162,8 +165,11 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,
// Must be a multiple of kDefaultAlignment, unless requested
// alignment is 1, in which case we don't care at all.
const uint32 adjusted_alignment =
uint32 adjusted_alignment =
(alignment > 1 ? LeastCommonMultiple(alignment, kDefaultAlignment) : 1);
// Required minimum alignment for port::AlignedMalloc().
adjusted_alignment =
std::max(adjusted_alignment, static_cast<uint32>(sizeof(void*)));
CHECK_LE(adjusted_alignment, static_cast<uint32>(1 << 20))
<< "Alignment on boundaries greater than 1MB not supported.";
@ -171,16 +177,12 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size,
// If block_size > alignment we force block_size to be a multiple
// of alignment; if block_size < alignment we make no adjustment.
size_t adjusted_block_size = block_size;
if (adjusted_alignment > 1) {
if (adjusted_block_size > adjusted_alignment) {
const uint32 excess = adjusted_block_size % adjusted_alignment;
adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
}
block->mem = reinterpret_cast<char*>(
port::aligned_malloc(adjusted_block_size, adjusted_alignment));
} else {
block->mem = reinterpret_cast<char*>(malloc(adjusted_block_size));
if (adjusted_block_size > adjusted_alignment) {
const uint32 excess = adjusted_block_size % adjusted_alignment;
adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0);
}
block->mem = reinterpret_cast<char*>(
port::AlignedMalloc(adjusted_block_size, adjusted_alignment));
block->size = adjusted_block_size;
CHECK(NULL != block->mem) << "block_size=" << block_size
<< " adjusted_block_size=" << adjusted_block_size
@ -242,7 +244,7 @@ void* Arena::GetMemoryFallback(const size_t size, const int alignment) {
void Arena::FreeBlocks() {
for (size_t i = 1; i < blocks_alloced_; ++i) { // keep first block alloced
free(first_blocks_[i].mem);
port::AlignedFree(first_blocks_[i].mem);
first_blocks_[i].mem = NULL;
first_blocks_[i].size = 0;
}
@ -250,7 +252,7 @@ void Arena::FreeBlocks() {
if (overflow_blocks_ != NULL) {
std::vector<AllocatedBlock>::iterator it;
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
free(it->mem);
port::AlignedFree(it->mem);
}
delete overflow_blocks_; // These should be used very rarely
overflow_blocks_ = NULL;

View File

@ -45,6 +45,7 @@ limitations under the License.
#include "tensorflow/core/lib/gtl/manual_constructor.h"
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/types.h"
#include <initializer_list> // NOLINT(build/include_order)
@ -353,7 +354,7 @@ class InlinedVector {
size_t n = size();
Destroy(base, n);
if (!is_inline()) {
free(base);
port::Free(base);
}
}
@ -434,7 +435,7 @@ class InlinedVector {
}
T* src = data();
T* dst = static_cast<T*>(malloc(target * sizeof(T)));
T* dst = static_cast<T*>(port::Malloc(target * sizeof(T)));
// Need to copy elem before discarding src since it might alias src.
InitType{}(dst + s, std::forward<Args>(args)...);

View File

@ -30,7 +30,7 @@ limitations under the License.
#include <utility>
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h" // For aligned_malloc/aligned_free
#include "tensorflow/core/platform/mem.h"
namespace tensorflow {
namespace gtl {
@ -127,9 +127,9 @@ class ManualConstructor {
// Support users creating arrays of ManualConstructor<>s. This ensures that
// the array itself has the correct alignment.
static void* operator new[](size_t size) {
return port::aligned_malloc(size, TF_LIB_GTL_ALIGN_OF(Type));
return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type));
}
static void operator delete[](void* mem) { port::aligned_free(mem); }
static void operator delete[](void* mem) { port::AlignedFree(mem); }
inline Type* get() { return reinterpret_cast<Type*>(space_); }
inline const Type* get() const {

View File

@ -17,6 +17,7 @@ limitations under the License.
#include <fstream>
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/test.h"
namespace tensorflow {
@ -172,7 +173,8 @@ class FakeLibCurl : public LibCurl {
temp_str.replace(n, victim.size(), encoded);
n += encoded.size();
}
char* out_char_str = (char*)malloc(sizeof(char) * temp_str.size() + 1);
char* out_char_str =
(char*)port::Malloc(sizeof(char) * temp_str.size() + 1);
std::copy(temp_str.begin(), temp_str.end(), out_char_str);
out_char_str[temp_str.size()] = '\0';
return out_char_str;
@ -180,7 +182,7 @@ class FakeLibCurl : public LibCurl {
void curl_slist_free_all(curl_slist* list) override {
delete reinterpret_cast<std::vector<string>*>(list);
}
void curl_free(void* p) override { free(p); }
void curl_free(void* p) override { port::Free(p); }
// Variables defining the behavior of this fake.
string response_content;

View File

@ -3,10 +3,11 @@
load("@protobuf//:protobuf.bzl", "cc_proto_library")
load("@protobuf//:protobuf.bzl", "py_proto_library")
# configure may change the following lines to True
# configure may change the following lines
WITH_GCP_SUPPORT = False
WITH_HDFS_SUPPORT = False
WITH_XLA_SUPPORT = False
WITH_JEMALLOC = True
# Appends a suffix to a list of deps.
def tf_deps(deps, suffix):
@ -176,7 +177,29 @@ def tf_additional_test_srcs():
def tf_kernel_tests_linkstatic():
return 0
# jemalloc only enabled on Linux for now.
# TODO(jhseu): Enable on other platforms.
# Returns the preprocessor defines that select jemalloc in the allocation
# wrappers: TENSORFLOW_USE_JEMALLOC on linux_x86_64 when WITH_JEMALLOC is set,
# and nothing otherwise.
def tf_additional_lib_defines():
    if not WITH_JEMALLOC:
        return []
    return select({
        "//tensorflow:linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
        "//conditions:default": [],
    })
# Returns the extra library dependencies needed for jemalloc: "@jemalloc" on
# linux_x86_64 when WITH_JEMALLOC is set, and nothing otherwise.
def tf_additional_lib_deps():
    if not WITH_JEMALLOC:
        return []
    return select({
        "//tensorflow:linux_x86_64": ["@jemalloc"],
        "//conditions:default": [],
    })
def tf_additional_core_deps():
deps = []
if WITH_GCP_SUPPORT:
deps.append("//tensorflow/core/platform/cloud:gcs_file_system")

View File

@ -24,9 +24,14 @@ limitations under the License.
namespace tensorflow {
namespace port {
// Aligned allocation/deallocation
void* aligned_malloc(size_t size, int minimum_alignment);
void aligned_free(void* aligned_memory);
// Aligned allocation/deallocation. `minimum_alignment` must be a power of 2
// and a multiple of sizeof(void*).
void* AlignedMalloc(size_t size, int minimum_alignment);
void AlignedFree(void* aligned_memory);
void* Malloc(size_t size);
void* Realloc(void* ptr, size_t size);
void Free(void* ptr);
// Tries to release num_bytes of free memory back to the operating
// system for reuse. Use this routine with caution -- to get this

View File

@ -25,11 +25,11 @@ namespace port {
TEST(Port, AlignedMalloc) {
for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
void* p = aligned_malloc(1, alignment);
ASSERT_TRUE(p != NULL) << "aligned_malloc(1, " << alignment << ")";
void* p = AlignedMalloc(1, alignment);
ASSERT_TRUE(p != NULL) << "AlignedMalloc(1, " << alignment << ")";
uintptr_t pval = reinterpret_cast<uintptr_t>(p);
EXPECT_EQ(pval % alignment, 0);
aligned_free(p);
AlignedFree(p);
}
}

View File

@ -13,8 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef TENSORFLOW_USE_JEMALLOC
#include "jemalloc/jemalloc.h"
#endif
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/types.h"
#if defined(__linux__) && !defined(__ANDROID__)
#include <sched.h>
@ -60,7 +65,7 @@ int NumSchedulableCPUs() {
return kDefaultCores;
}
void *aligned_malloc(size_t size, int minimum_alignment) {
void *AlignedMalloc(size_t size, int minimum_alignment) {
#if defined(__ANDROID__)
return memalign(minimum_alignment, size);
#else // !defined(__ANDROID__)
@ -69,15 +74,45 @@ void *aligned_malloc(size_t size, int minimum_alignment) {
// sizeof(void*). In this case, fall back on malloc which should return
// memory aligned to at least the size of a pointer.
const int required_alignment = sizeof(void *);
if (minimum_alignment < required_alignment) return malloc(size);
if (posix_memalign(&ptr, minimum_alignment, size) != 0)
if (minimum_alignment < required_alignment) return Malloc(size);
#ifdef TENSORFLOW_USE_JEMALLOC
int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
#else
int err = posix_memalign(&ptr, minimum_alignment, size);
#endif
if (err != 0) {
return NULL;
else
} else {
return ptr;
}
#endif
}
void aligned_free(void *aligned_memory) { free(aligned_memory); }
void AlignedFree(void *aligned_memory) { Free(aligned_memory); }
// Allocates `size` bytes. Uses jemalloc when TENSORFLOW_USE_JEMALLOC is
// defined and the C library's malloc otherwise.
void *Malloc(size_t size) {
#ifndef TENSORFLOW_USE_JEMALLOC
  void *block = malloc(size);
#else
  void *block = jemalloc_malloc(size);
#endif
  return block;
}
// Resizes the allocation at `ptr` to `size` bytes. Uses jemalloc when
// TENSORFLOW_USE_JEMALLOC is defined and the C library's realloc otherwise.
void *Realloc(void *ptr, size_t size) {
#ifndef TENSORFLOW_USE_JEMALLOC
  void *resized = realloc(ptr, size);
#else
  void *resized = jemalloc_realloc(ptr, size);
#endif
  return resized;
}
// Releases memory at `ptr`. Uses jemalloc when TENSORFLOW_USE_JEMALLOC is
// defined and the C library's free otherwise, mirroring Malloc/Realloc.
void Free(void *ptr) {
#ifndef TENSORFLOW_USE_JEMALLOC
  free(ptr);
#else
  jemalloc_free(ptr);
#endif
}
void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
// No-op.

View File

@ -52,11 +52,17 @@ int NumSchedulableCPUs() {
return system_info.dwNumberOfProcessors;
}
void* aligned_malloc(size_t size, int minimum_alignment) {
void* AlignedMalloc(size_t size, int minimum_alignment) {
return _aligned_malloc(size, minimum_alignment);
}
void aligned_free(void* aligned_memory) { _aligned_free(aligned_memory); }
void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
// Allocates `size` bytes with the C runtime's global malloc.
void* Malloc(size_t size) {
  void* block = ::malloc(size);
  return block;
}
// Resizes the allocation at `ptr` to `size` bytes with the C runtime's global
// realloc.
void* Realloc(void* ptr, size_t size) {
  void* resized = ::realloc(ptr, size);
  return resized;
}
// Releases memory at `ptr` with the C runtime's global free.
void Free(void* ptr) {
  ::free(ptr);
}
void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
// No-op.

View File

@ -87,6 +87,7 @@ filegroup(
"@gif_archive//:COPYING",
"@grpc//:LICENSE",
"@highwayhash//:LICENSE",
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",

View File

@ -376,3 +376,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
name = "junit",
actual = "@junit_jar//jar",
)
native.new_http_archive(
name = "jemalloc",
urls = [
"http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
"https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
],
sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
strip_prefix = "jemalloc-4.4.0",
build_file = str(Label("//third_party:jemalloc.BUILD")),
)

321
third_party/jemalloc.BUILD vendored Normal file
View File

@ -0,0 +1,321 @@
# Description:
# jemalloc - a general-purpose scalable concurrent malloc implementation
licenses(["notice"]) # BSD
exports_files(["COPYING"])
load("@//third_party:common.bzl", "template_rule")
cc_library(
name = "jemalloc",
srcs = [
"src/arena.c",
"src/atomic.c",
"src/base.c",
"src/bitmap.c",
"src/chunk.c",
"src/chunk_dss.c",
"src/chunk_mmap.c",
"src/ckh.c",
"src/ctl.c",
"src/extent.c",
"src/hash.c",
"src/huge.c",
"src/jemalloc.c",
"src/mb.c",
"src/mutex.c",
"src/nstime.c",
"src/pages.c",
"src/prng.c",
"src/prof.c",
"src/quarantine.c",
"src/rtree.c",
"src/spin.c",
"src/stats.c",
"src/tcache.c",
"src/tsd.c",
"src/util.c",
"src/witness.c",
],
hdrs = [
"include/jemalloc/internal/arena.h",
"include/jemalloc/internal/assert.h",
"include/jemalloc/internal/atomic.h",
"include/jemalloc/internal/base.h",
"include/jemalloc/internal/bitmap.h",
"include/jemalloc/internal/chunk.h",
"include/jemalloc/internal/chunk_dss.h",
"include/jemalloc/internal/chunk_mmap.h",
"include/jemalloc/internal/ckh.h",
"include/jemalloc/internal/ctl.h",
"include/jemalloc/internal/extent.h",
"include/jemalloc/internal/hash.h",
"include/jemalloc/internal/huge.h",
"include/jemalloc/internal/jemalloc_internal.h",
"include/jemalloc/internal/jemalloc_internal_decls.h",
"include/jemalloc/internal/jemalloc_internal_defs.h",
"include/jemalloc/internal/jemalloc_internal_macros.h",
"include/jemalloc/internal/mb.h",
"include/jemalloc/internal/mutex.h",
"include/jemalloc/internal/nstime.h",
"include/jemalloc/internal/pages.h",
"include/jemalloc/internal/ph.h",
"include/jemalloc/internal/private_namespace.h",
"include/jemalloc/internal/prng.h",
"include/jemalloc/internal/prof.h",
"include/jemalloc/internal/ql.h",
"include/jemalloc/internal/qr.h",
"include/jemalloc/internal/quarantine.h",
"include/jemalloc/internal/rb.h",
"include/jemalloc/internal/rtree.h",
"include/jemalloc/internal/size_classes.h",
"include/jemalloc/internal/smoothstep.h",
"include/jemalloc/internal/spin.h",
"include/jemalloc/internal/stats.h",
"include/jemalloc/internal/tcache.h",
"include/jemalloc/internal/ticker.h",
"include/jemalloc/internal/tsd.h",
"include/jemalloc/internal/util.h",
"include/jemalloc/internal/valgrind.h",
"include/jemalloc/internal/witness.h",
"include/jemalloc/jemalloc.h",
],
# Same flags that jemalloc uses to build.
copts = [
"-O3",
"-funroll-loops",
"-D_GNU_SOURCE",
"-D_REENTRANT",
],
includes = ["include"],
visibility = ["//visibility:public"],
)
sh_binary(
name = "jemalloc_sh",
srcs = ["include/jemalloc/jemalloc.sh"],
)
genrule(
name = "jemalloc_h",
srcs = [
":jemalloc_defs_h",
":jemalloc_macros_h",
":jemalloc_mangle_h",
":jemalloc_protos_h",
":jemalloc_rename_h",
":jemalloc_typedefs_h",
],
outs = ["include/jemalloc/jemalloc.h"],
cmd = "$(location :jemalloc_sh) $$(dirname $(location :jemalloc_defs_h))/../../ >$@",
tools = [":jemalloc_sh"],
)
# Add to this list if you want to export more symbols from jemalloc.
# Generates public_symbols.txt, which the jemalloc_mangle_h, jemalloc_rename_h
# and public_namespace_h rules in this file take as input. Each entry is
# written as <name>:<jemalloc_-prefixed name>.
genrule(
name = "public_symbols_txt",
outs = ["include/jemalloc/internal/public_symbols.txt"],
# The quoted heredoc ('EOF') writes the entries verbatim to the output file.
cmd = "\n".join([
"cat <<'EOF' > $@",
"free:jemalloc_free",
"malloc:jemalloc_malloc",
"posix_memalign:jemalloc_posix_memalign",
"realloc:jemalloc_realloc",
"EOF",
]),
)
sh_binary(
name = "jemalloc_mangle_sh",
srcs = ["include/jemalloc/jemalloc_mangle.sh"],
)
genrule(
name = "jemalloc_mangle_h",
srcs = [":public_symbols_txt"],
outs = ["include/jemalloc/jemalloc_mangle.h"],
cmd = "$(location :jemalloc_mangle_sh) $(location :public_symbols_txt) je_ >$@",
tools = [":jemalloc_mangle_sh"],
)
sh_binary(
name = "jemalloc_rename_sh",
srcs = ["include/jemalloc/jemalloc_rename.sh"],
)
genrule(
name = "jemalloc_rename_h",
srcs = [":public_symbols_txt"],
outs = ["include/jemalloc/jemalloc_rename.h"],
cmd = "$(location :jemalloc_rename_sh) $(location :public_symbols_txt) >$@",
tools = [":jemalloc_rename_sh"],
)
sh_binary(
name = "private_namespace_sh",
srcs = ["include/jemalloc/internal/private_namespace.sh"],
)
genrule(
name = "private_namespace_h",
srcs = ["include/jemalloc/internal/private_symbols.txt"],
outs = ["include/jemalloc/internal/private_namespace.h"],
cmd = "$(location :private_namespace_sh) $(location include/jemalloc/internal/private_symbols.txt) >$@",
tools = [":private_namespace_sh"],
)
sh_binary(
name = "public_namespace_sh",
srcs = ["include/jemalloc/internal/public_namespace.sh"],
)
genrule(
name = "public_namespace_h",
srcs = [":public_symbols_txt"],
outs = ["include/jemalloc/internal/public_namespace.h"],
cmd = "$(location :public_namespace_sh) $(location :public_symbols_txt) >$@",
tools = [":public_namespace_sh"],
)
sh_binary(
name = "size_classes_sh",
srcs = ["include/jemalloc/internal/size_classes.sh"],
)
# Size classes for Linux x86_64. Update if adding builds for other
# architectures. See size_classes.sh for details on the arguments.
# Runs size_classes.sh with fixed arguments and captures its stdout as
# size_classes.h.
# NOTE(review): the arguments 3 and 12 match the LG_TINY_MIN and LG_PAGE values
# set in the jemalloc_internal_defs_h rule in this file -- presumably they must
# be kept in sync; confirm against size_classes.sh.
genrule(
name = "size_classes_h",
outs = ["include/jemalloc/internal/size_classes.h"],
cmd = "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
tools = [":size_classes_sh"],
)
template_rule(
name = "jemalloc_internal_h",
src = "include/jemalloc/internal/jemalloc_internal.h.in",
out = "include/jemalloc/internal/jemalloc_internal.h",
substitutions = {
"@private_namespace@": "je_",
"@install_suffix@": "",
},
)
template_rule(
name = "jemalloc_internal_defs_h",
src = "include/jemalloc/internal/jemalloc_internal_defs.h.in",
out = "include/jemalloc/internal/jemalloc_internal_defs.h",
substitutions = {
"#undef JEMALLOC_PREFIX": "#define JEMALLOC_PREFIX \"jemalloc_\"",
"#undef JEMALLOC_CPREFIX": "#define JEMALLOC_CPREFIX \"JEMALLOC_\"",
"#undef JEMALLOC_PRIVATE_NAMESPACE": "#define JEMALLOC_PRIVATE_NAMESPACE je_",
"#undef CPU_SPINWAIT": "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
"#undef JEMALLOC_HAVE_BUILTIN_CLZ": "#define JEMALLOC_HAVE_BUILTIN_CLZ",
"#undef JEMALLOC_USE_SYSCALL": "#define JEMALLOC_USE_SYSCALL",
"#undef JEMALLOC_HAVE_SECURE_GETENV": "#define JEMALLOC_HAVE_SECURE_GETENV",
"#undef JEMALLOC_HAVE_PTHREAD_ATFORK": "#define JEMALLOC_HAVE_PTHREAD_ATFORK",
"#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1",
# Newline required because of substitution conflicts.
"#undef JEMALLOC_HAVE_CLOCK_MONOTONIC\n": "#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1\n",
"#undef JEMALLOC_THREADED_INIT": "#define JEMALLOC_THREADED_INIT",
"#undef JEMALLOC_TLS_MODEL": "#define JEMALLOC_TLS_MODEL __attribute__((tls_model(\"initial-exec\")))",
"#undef JEMALLOC_CC_SILENCE": "#define JEMALLOC_CC_SILENCE",
"#undef JEMALLOC_STATS": "#define JEMALLOC_STATS",
"#undef JEMALLOC_TCACHE": "#define JEMALLOC_TCACHE",
"#undef JEMALLOC_DSS": "#define JEMALLOC_DSS",
"#undef JEMALLOC_FILL": "#define JEMALLOC_FILL",
"#undef LG_TINY_MIN": "#define LG_TINY_MIN 3",
"#undef LG_PAGE": "#define LG_PAGE 12",
"#undef JEMALLOC_MAPS_COALESCE": "#define JEMALLOC_MAPS_COALESCE",
"#undef JEMALLOC_TLS": "#define JEMALLOC_TLS",
"#undef JEMALLOC_INTERNAL_UNREACHABLE": "#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable",
"#undef JEMALLOC_INTERNAL_FFSLL": "#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll",
# Newline required because of substitution conflicts.
"#undef JEMALLOC_INTERNAL_FFSL\n": "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl\n",
"#undef JEMALLOC_INTERNAL_FFS\n": "#define JEMALLOC_INTERNAL_FFS __builtin_ffs\n",
"#undef JEMALLOC_CACHE_OBLIVIOUS": "#define JEMALLOC_CACHE_OBLIVIOUS",
"#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY": "#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY",
"#undef JEMALLOC_HAVE_MADVISE": "#define JEMALLOC_HAVE_MADVISE",
"#undef JEMALLOC_PURGE_MADVISE_DONTNEED": "#define JEMALLOC_PURGE_MADVISE_DONTNEED",
"#undef JEMALLOC_THP": "#define JEMALLOC_THP",
"#undef JEMALLOC_HAS_ALLOCA_H": "#define JEMALLOC_HAS_ALLOCA_H 1",
# Newline required because of substitution conflicts.
"#undef LG_SIZEOF_INT\n": "#define LG_SIZEOF_INT 2\n",
"#undef LG_SIZEOF_LONG\n": "#define LG_SIZEOF_LONG 3\n",
"#undef LG_SIZEOF_LONG_LONG": "#define LG_SIZEOF_LONG_LONG 3",
"#undef LG_SIZEOF_INTMAX_T": "#define LG_SIZEOF_INTMAX_T 3",
"#undef JEMALLOC_GLIBC_MALLOC_HOOK": "#define JEMALLOC_GLIBC_MALLOC_HOOK",
"#undef JEMALLOC_GLIBC_MEMALIGN_HOOK": "#define JEMALLOC_GLIBC_MEMALIGN_HOOK",
"#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP": "#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP",
"#undef JEMALLOC_CONFIG_MALLOC_CONF": "#define JEMALLOC_CONFIG_MALLOC_CONF \"\"",
},
)
template_rule(
name = "jemalloc_defs_h",
src = "include/jemalloc/jemalloc_defs.h.in",
out = "include/jemalloc/jemalloc_defs.h",
substitutions = {
"#undef JEMALLOC_HAVE_ATTR": "#define JEMALLOC_HAVE_ATTR",
"#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE": "#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE",
"#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF",
"#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF": "#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF",
"#undef JEMALLOC_OVERRIDE_MEMALIGN": "#define JEMALLOC_OVERRIDE_MEMALIGN",
"#undef JEMALLOC_OVERRIDE_VALLOC": "#define JEMALLOC_OVERRIDE_VALLOC",
"#undef JEMALLOC_USABLE_SIZE_CONST": "#define JEMALLOC_USABLE_SIZE_CONST",
"#undef JEMALLOC_USE_CXX_THROW": "#define JEMALLOC_USE_CXX_THROW",
"#undef LG_SIZEOF_PTR": "#define LG_SIZEOF_PTR 3",
},
)
# Produces jemalloc_macros.h from its .in template by filling in the version
# placeholders. Every version field is substituted with a zero value.
# NOTE(review): these look like placeholders rather than the version of the
# pinned archive -- confirm nothing reads the resulting version values.
template_rule(
name = "jemalloc_macros_h",
src = "include/jemalloc/jemalloc_macros.h.in",
out = "include/jemalloc/jemalloc_macros.h",
substitutions = {
"@jemalloc_version@": "0.0.0",
"@jemalloc_version_major@": "0",
"@jemalloc_version_minor@": "0",
"@jemalloc_version_bugfix@": "0",
"@jemalloc_version_nrev@": "0",
"@jemalloc_version_gid@": "0000000000000000000000000000000000000000",
},
)
template_rule(
name = "jemalloc_protos_h",
src = "include/jemalloc/jemalloc_protos.h.in",
out = "include/jemalloc/jemalloc_protos.h",
substitutions = {
"@aligned_alloc": "aligned_alloc",
"@calloc": "calloc",
"@cbopaque": "cbopaque",
"@dallocx": "dallocx",
"@free": "free",
"@je": "je",
"@mallctl": "mallctl",
"@mallctlnametomib": "mallctlnametomib",
"@mallctlbymib": "mallctlbymib",
"@malloc_stats_print": "malloc_stats_print",
"@malloc_usable_size": "malloc_usable_size",
"@malloc": "malloc",
"@mallocx": "mallocx",
"@memalign": "memalign",
"@nallocx": "nallocx",
"@posix_memalign": "posix_memalign",
"@rallocx": "rallocx",
"@realloc": "realloc",
"@sallocx": "sallocx",
"@sdallocx": "sdallocx",
"@valloc": "valloc",
"@xallocx": "xallocx",
},
)
template_rule(
name = "jemalloc_typedefs_h",
src = "include/jemalloc/jemalloc_typedefs.h.in",
out = "include/jemalloc/jemalloc_typedefs.h",
substitutions = {},
)