diff --git a/tensorflow/core/common_runtime/scoped_allocator.cc b/tensorflow/core/common_runtime/scoped_allocator.cc
index 3c3e4ffd6d9..e61df192c39 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/scoped_allocator.h"
+
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace tensorflow {
 
@@ -34,7 +36,7 @@ ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
   tbuf_->Ref();
   // Hold this until all expected_calls have been made.
   container->Ref();
-  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes);
+  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes_requested);
 }
 
 ScopedAllocator::~ScopedAllocator() {
@@ -56,43 +58,66 @@ ScopedAllocator::~ScopedAllocator() {
 void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
   VLOG(1) << "ScopedAllocator index " << id_ << " AllocateRaw "
           << "field " << field_index << " num_bytes " << num_bytes;
-  mutex_lock l(mu_);
-  if (expected_call_count_ <= 0) {
-    LOG(ERROR) << "Scoped allocator " << name_
-               << " could not satisfy request for " << num_bytes
-               << " bytes, expected uses exhausted. ";
-    return nullptr;
-  }
-
-  int32_t num_fields = static_cast<int32>(fields_.size());
-  if (field_index >= num_fields) {
-    LOG(ERROR) << "ScopedAllocator " << name_
-               << " received unexpected field number " << field_index;
-    return nullptr;
-  }
-
-  const Field& f = fields_[field_index];
-  if (num_bytes != f.bytes) {
-    LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
-               << num_bytes << " bytes from field " << field_index
-               << " which has precalculated size " << f.bytes << " and offset "
-               << f.offset;
-    return nullptr;
-  }
-
-  void* ptr = static_cast<void*>((tbuf_->template base<char>() + f.offset));
-
-  ++live_alloc_count_;
-  --expected_call_count_;
-  if (0 == expected_call_count_) {
-    for (auto& f : fields_) {
-      container_->Drop(f.scope_id, this);
+  void* ptr = nullptr;
+  const Field* field = nullptr;
+  {
+    mutex_lock l(mu_);
+    if (expected_call_count_ <= 0) {
+      LOG(ERROR) << "Scoped allocator " << name_
+                 << " could not satisfy request for " << num_bytes
+                 << " bytes, expected uses exhausted. ";
+      return nullptr;
+    }
+
+    int32_t num_fields = static_cast<int32>(fields_.size());
+    if (field_index >= num_fields) {
+      LOG(ERROR) << "ScopedAllocator " << name_
+                 << " received unexpected field number " << field_index;
+      return nullptr;
+    }
+
+    field = &fields_[field_index];
+    if (num_bytes != field->bytes_requested) {
+      LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
+                 << num_bytes << " bytes from field " << field_index
+                 << " which has precalculated size " << field->bytes_requested
+                 << " and offset " << field->offset;
+      return nullptr;
+    }
+
+    ptr = static_cast<void*>((tbuf_->template base<char>() + field->offset));
+
+    ++live_alloc_count_;
+    --expected_call_count_;
+    if (0 == expected_call_count_) {
+      for (auto& f : fields_) {
+        container_->Drop(f.scope_id, this);
+      }
+      container_->Drop(id_, this);
+      container_->Unref();
+      container_ = nullptr;
     }
-    container_->Drop(id_, this);
-    container_->Unref();
-    container_ = nullptr;
   }
-  VLOG(1) << "AllocateRaw returning " << ptr;
+  VLOG(2) << "AllocateRaw returning " << ptr << " bytes_requested "
+          << field->bytes_requested << " bytes_allocated "
+          << field->bytes_allocated;
+
+  // If there is overshoot due to alignment, let MSAN believe that the padding
+  // is initialized. This is okay because we do not use this memory region for
+  // anything meaningful.
+  if (field->bytes_allocated > field->bytes_requested) {
+    size_t extra_bytes = field->bytes_allocated - field->bytes_requested;
+    void* extra_buf = static_cast<void*>(static_cast<char*>(ptr) +
+                                         field->bytes_allocated - extra_bytes);
+    VLOG(2) << "AllocateRaw requested " << num_bytes
+            << " bytes which is not divisible by kAllocatorAlignment="
+            << Allocator::kAllocatorAlignment << " and hence we allocated "
+            << field->bytes_allocated << ". Annotating " << extra_bytes
+            << " bytes starting at " << extra_buf
+            << " with TF_ANNOTATE_MEMORY_IS_INITIALIZED";
+    TF_ANNOTATE_MEMORY_IS_INITIALIZED(extra_buf, extra_bytes);
+  }
+
   return ptr;
 }
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h
index 683bbc7e9ed..1b458aacb23 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.h
+++ b/tensorflow/core/common_runtime/scoped_allocator.h
@@ -35,7 +35,8 @@ class ScopedAllocator {
   struct Field {
     int32 scope_id;
    size_t offset;
-    size_t bytes;
+    size_t bytes_requested;
+    size_t bytes_allocated;
   };
   // Field index that refers to backing tensor, not any aliased field.
   static const int32 kBackingIndex = -1;
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index 8ac6adc2e48..96f55aa4f4d 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -166,21 +166,35 @@ size_t ScopedAllocatorMgr::PopulateFields(
     const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
   const int32 num_fields = static_cast<int32>(shapes.size());
   fields->resize(num_fields);
+  // At the end of iteration `i`, `offset` points to the offset from the start
+  // of the backing buffer until the end of `field[i].bytes_allocated`. This
+  // is aligned to `kAllocatorAlignment`.
   size_t offset = 0;
   for (int32 i = 0; i < num_fields; ++i) {
+    size_t bytes_requested = shapes[i].num_elements() * DataTypeSize(dtype);
+    auto* field = &((*fields)[i]);
+    field->scope_id = scope_id + 1 + i;
+    field->bytes_requested = bytes_requested;
+    field->offset = offset;
+    offset += bytes_requested;
+
+    // Compute actual #bytes allocated, which may include padding due to
+    // alignment.
+    size_t bytes_allocated = bytes_requested;
     size_t overshoot = offset % Allocator::kAllocatorAlignment;
     if (overshoot > 0) {
-      offset += (Allocator::kAllocatorAlignment - overshoot);
+      size_t alignment_bytes = Allocator::kAllocatorAlignment - overshoot;
+      bytes_allocated += alignment_bytes;
+      offset += alignment_bytes;
     }
-    size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
-    (*fields)[i].scope_id = scope_id + 1 + i;
-    (*fields)[i].bytes = bytes;
-    (*fields)[i].offset = offset;
-    VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
-            << " bytes=" << (*fields)[i].bytes
-            << " offset=" << (*fields)[i].offset;
-    offset += bytes;
+    field->bytes_allocated = bytes_allocated;
+
+    VLOG(1) << "field=" << i << " scope_id=" << field->scope_id
+            << " bytes_requested=" << field->bytes_requested
+            << " offset=" << field->offset
+            << " bytes_allocated=" << field->bytes_allocated;
   }
+
   return offset;
 }
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
index 38e07e47f24..a359924f056 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
@@ -110,13 +110,13 @@ TEST_F(ScopedAllocatorMgrTest, PopulateFields) {
   InitTensor();
   PopulateFields();
   EXPECT_EQ(0, fields_[0].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes_requested);
   EXPECT_EQ(scope_id_ + 1, fields_[0].scope_id);
   EXPECT_EQ(512 * sizeof(float), fields_[1].offset);
-  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes);
+  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes_requested);
   EXPECT_EQ(scope_id_ + 2, fields_[1].scope_id);
   EXPECT_EQ(521 * sizeof(float) + AlignmentPadding(), fields_[2].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes_requested);
   EXPECT_EQ(scope_id_ + 3, fields_[2].scope_id);
 }
 
@@ -185,9 +185,10 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorInitFail) {
   fields_.resize(1);
   fields_[0].scope_id = scope_id_ + 1;
   fields_[0].offset = 0;
-  fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
-  // fields[0].offset + fields[0].bytes is larger than the size of the backing
-  // tensor, so this check should fail
+  fields_[0].bytes_requested =
+      backing_tensor_shape_.num_elements() * 2 * sizeof(float);
+  // fields[0].offset + fields[0].bytes_requested is larger than the size of the
+  // backing tensor, so this check should fail
   EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
 }
 
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
index 79320e08cb2..881d728f632 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -39,7 +39,7 @@ class ScopedAllocatorOp : public OpKernel {
     // the subtensors to be allocated from it, taking into account
    // alignment considerations.
     ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
-    size_t num_bytes = fields_.back().offset + fields_.back().bytes;
+    size_t num_bytes = fields_.back().offset + fields_.back().bytes_allocated;
     num_elements_ = num_bytes / DataTypeSize(dtype_);
     OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
                 errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 634f9ba8876..531089decf2 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -91,8 +91,8 @@ void PrepOp(DataType dtype, int32 id,
   ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
   // We don't simply allocate a tensor with shape as backing_tensor_shape,
   // because we need to account for padding in the fields. We actually need a
-  // tensor of size at least (fields[-1].offset + fields[-1].bytes).
-  size_t num_bytes = fields->back().offset + fields->back().bytes;
+  // tensor of size at least (fields[-1].offset + fields[-1].bytes_allocated).
+  size_t num_bytes = fields->back().offset + fields->back().bytes_allocated;
   int32_t num_elements = num_bytes / DataTypeSize(dtype);
   CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);
 
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index a626f1c2594..c7fe931a0f7 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -158,7 +158,7 @@ class CollectiveOpTest(test.TestCase):
           # to `all_reduce` has an explicit device string. We don't use
          # `identity` because `cast` is more resilient to getting optimized
          # away by various optimization passes.
-          input_tensor = math_ops.cast(device_tensors[j], dtypes.float64)
+          input_tensor = math_ops.cast(device_tensors[j], dtypes.float16)
           collective_op = collective_ops.all_reduce(
               input_tensor, group_size, group_key, instances[j], 'Add', 'Id')
 
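Note on the layout arithmetic in this change: PopulateFields now records, per field, both the bytes the caller asked for (bytes_requested) and the aligned footprint actually reserved for it (bytes_allocated), and AllocateRaw marks the padding between the two as initialized for MSAN. The standalone C++ sketch below is not TensorFlow code; it only mirrors that bookkeeping to show where the padding comes from. The names FieldLayout and Layout are invented for the illustration, and the 64-byte alignment is an assumption standing in for Allocator::kAllocatorAlignment.

#include <cstddef>
#include <cstdio>
#include <vector>

// Per-field bookkeeping mirroring ScopedAllocator::Field: the bytes the
// caller requested and the (possibly larger) aligned footprint reserved.
struct FieldLayout {
  size_t offset;
  size_t bytes_requested;
  size_t bytes_allocated;
};

// Lay requests out back to back, rounding the end of each field up to
// `alignment` and charging the padding to that field's bytes_allocated.
std::vector<FieldLayout> Layout(const std::vector<size_t>& requests,
                                size_t alignment) {
  std::vector<FieldLayout> fields;
  size_t offset = 0;
  for (size_t bytes_requested : requests) {
    FieldLayout f;
    f.offset = offset;
    f.bytes_requested = bytes_requested;
    f.bytes_allocated = bytes_requested;
    offset += bytes_requested;
    size_t overshoot = offset % alignment;
    if (overshoot > 0) {
      size_t padding = alignment - overshoot;
      f.bytes_allocated += padding;  // padding trails the field
      offset += padding;
    }
    fields.push_back(f);
  }
  return fields;
}

int main() {
  // Same field sizes as the PopulateFields test: 512, 9, and 512 floats.
  const size_t kAlignment = 64;  // assumed value of kAllocatorAlignment
  const std::vector<size_t> requests = {512 * 4, 9 * 4, 512 * 4};
  for (const FieldLayout& f : Layout(requests, kAlignment)) {
    std::printf("offset=%zu requested=%zu allocated=%zu padding=%zu\n",
                f.offset, f.bytes_requested, f.bytes_allocated,
                f.bytes_allocated - f.bytes_requested);
  }
  // With 64-byte alignment this prints:
  //   offset=0 requested=2048 allocated=2048 padding=0
  //   offset=2048 requested=36 allocated=64 padding=28
  //   offset=2112 requested=2048 allocated=2048 padding=0
  // The 28 padding bytes after the 9-float field are the kind of region
  // AllocateRaw now hands to TF_ANNOTATE_MEMORY_IS_INITIALIZED.
  return 0;
}

Under those assumptions the backing buffer needs last.offset + last.bytes_allocated bytes (4160 here), which matches why ScopedAllocatorOp and the ops test now size the backing tensor with bytes_allocated rather than the old bytes field.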