Annotate alignment bytes in scoped allocator as initialized.

Padding in the scoped allocator may be uninitialized. This is okay because the uninitialized memory is not consumed meaningfully by a downstream op. This change annotates the padding with TF_ANNOTATE_MEMORY_IS_INITIALIZED to avoid msan warnings.

PiperOrigin-RevId: 268051135
parent 24888a277e
commit 0e5f75d3be
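Before the diff, a minimal standalone sketch of the pattern this commit introduces in ScopedAllocator::AllocateRaw: compute how many padding bytes the alignment round-up added, then tell MSAN that region is initialized. The MARK_INITIALIZED_FOR_MSAN stand-in, the 64-byte alignment value, and the 36-byte request are illustrative assumptions; inside TensorFlow the real macro is TF_ANNOTATE_MEMORY_IS_INITIALIZED from tensorflow/core/platform/dynamic_annotations.h and the alignment is Allocator::kAllocatorAlignment.

// Standalone sketch, not TensorFlow code.
#include <cstddef>
#include <cstdio>
#include <new>

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define MARK_INITIALIZED_FOR_MSAN(p, n) __msan_unpoison((p), (n))
#endif
#endif
#ifndef MARK_INITIALIZED_FOR_MSAN
#define MARK_INITIALIZED_FOR_MSAN(p, n) ((void)(p), (void)(n))  // no-op without MSAN
#endif

int main() {
  const std::size_t kAlignment = 64;       // assumed stand-in for Allocator::kAllocatorAlignment
  const std::size_t bytes_requested = 36;  // e.g. a field of 9 floats
  const std::size_t overshoot = bytes_requested % kAlignment;
  const std::size_t bytes_allocated =
      overshoot == 0 ? bytes_requested : bytes_requested + (kAlignment - overshoot);

  // Uninitialized storage, standing in for the slice of the backing tensor.
  char* ptr = static_cast<char*>(::operator new(bytes_allocated));

  // The caller only ever writes bytes_requested bytes; the trailing padding
  // stays uninitialized. Marking the padding as initialized silences MSAN
  // when a downstream op reads the whole aligned region.
  if (bytes_allocated > bytes_requested) {
    const std::size_t extra_bytes = bytes_allocated - bytes_requested;
    MARK_INITIALIZED_FOR_MSAN(ptr + bytes_requested, extra_bytes);
    std::printf("annotated %zu padding bytes after %zu requested bytes\n",
                extra_bytes, bytes_requested);
  }

  ::operator delete(ptr);
  return 0;
}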
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/scoped_allocator.h"
+
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace tensorflow {
 
@@ -34,7 +36,7 @@ ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
   tbuf_->Ref();
   // Hold this until all expected_calls have been made.
   container->Ref();
-  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes);
+  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes_requested);
 }
 
 ScopedAllocator::~ScopedAllocator() {
@@ -56,6 +58,9 @@ ScopedAllocator::~ScopedAllocator() {
 void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
   VLOG(1) << "ScopedAllocator index " << id_ << " AllocateRaw "
           << "field " << field_index << " num_bytes " << num_bytes;
+  void* ptr = nullptr;
+  const Field* field = nullptr;
+  {
     mutex_lock l(mu_);
     if (expected_call_count_ <= 0) {
       LOG(ERROR) << "Scoped allocator " << name_
@@ -71,16 +76,16 @@ void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
       return nullptr;
     }
 
-    const Field& f = fields_[field_index];
-    if (num_bytes != f.bytes) {
+    field = &fields_[field_index];
+    if (num_bytes != field->bytes_requested) {
       LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
                  << num_bytes << " bytes from field " << field_index
-                 << " which has precalculated size " << f.bytes << " and offset "
-                 << f.offset;
+                 << " which has precalculated size " << field->bytes_requested
+                 << " and offset " << field->offset;
       return nullptr;
     }
 
-    void* ptr = static_cast<void*>((tbuf_->template base<char>() + f.offset));
+    ptr = static_cast<void*>((tbuf_->template base<char>() + field->offset));
 
     ++live_alloc_count_;
     --expected_call_count_;
@@ -92,7 +97,27 @@ void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
       container_->Unref();
       container_ = nullptr;
     }
-  VLOG(1) << "AllocateRaw returning " << ptr;
+  }
+  VLOG(2) << "AllocateRaw returning " << ptr << " bytes_requested "
+          << field->bytes_requested << " bytes_allocated "
+          << field->bytes_allocated;
+
+  // If there is overshoot due to alignment, let MSAN believe that the padding
+  // is initialized. This is okay because we do not use this memory region for
+  // anything meaningful.
+  if (field->bytes_allocated > field->bytes_requested) {
+    size_t extra_bytes = field->bytes_allocated - field->bytes_requested;
+    void* extra_buf = static_cast<void*>(static_cast<char*>(ptr) +
+                                         field->bytes_allocated - extra_bytes);
+    VLOG(2) << "AllocateRaw requested " << num_bytes
+            << " bytes which is not divisible by kAllocatorAlignment="
+            << Allocator::kAllocatorAlignment << " and hence we allocated "
+            << field->bytes_allocated << ". Annotating " << extra_bytes
+            << " bytes starting at " << extra_buf
+            << " with TF_ANNOTATE_MEMORY_IS_INITIALIZED";
+    TF_ANNOTATE_MEMORY_IS_INITIALIZED(extra_buf, extra_bytes);
+  }
 
   return ptr;
 }
@@ -35,7 +35,8 @@ class ScopedAllocator {
   struct Field {
     int32 scope_id;
     size_t offset;
-    size_t bytes;
+    size_t bytes_requested;
+    size_t bytes_allocated;
   };
   // Field index that refers to backing tensor, not any aliased field.
   static const int32 kBackingIndex = -1;
@@ -166,21 +166,35 @@ size_t ScopedAllocatorMgr::PopulateFields(
     const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
   const int32 num_fields = static_cast<int32>(shapes.size());
   fields->resize(num_fields);
+  // At the end of iteration `i`, `offset` points to the offset from the start
+  // of the backing buffer until the end of `field[i].bytes_allocated`. This
+  // is aligned to `kAllocatorAlignment`.
   size_t offset = 0;
   for (int32 i = 0; i < num_fields; ++i) {
+    size_t bytes_requested = shapes[i].num_elements() * DataTypeSize(dtype);
+    auto* field = &((*fields)[i]);
+    field->scope_id = scope_id + 1 + i;
+    field->bytes_requested = bytes_requested;
+    field->offset = offset;
+    offset += bytes_requested;
+
+    // Compute actual #bytes allocated, which may include padding due to
+    // alignment.
+    size_t bytes_allocated = bytes_requested;
     size_t overshoot = offset % Allocator::kAllocatorAlignment;
     if (overshoot > 0) {
-      offset += (Allocator::kAllocatorAlignment - overshoot);
+      size_t alignment_bytes = Allocator::kAllocatorAlignment - overshoot;
+      bytes_allocated += alignment_bytes;
+      offset += alignment_bytes;
     }
-    size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
-    (*fields)[i].scope_id = scope_id + 1 + i;
-    (*fields)[i].bytes = bytes;
-    (*fields)[i].offset = offset;
-    VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
-            << " bytes=" << (*fields)[i].bytes
-            << " offset=" << (*fields)[i].offset;
-    offset += bytes;
+    field->bytes_allocated = bytes_allocated;
+
+    VLOG(1) << "field=" << i << " scope_id=" << field->scope_id
+            << " bytes_requested=" << field->bytes_requested
+            << " offset=" << field->offset
+            << " bytes_allocated=" << field->bytes_allocated;
   }
 
   return offset;
 }
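A self-contained sketch of the layout rule implemented by PopulateFields above, not the TensorFlow code itself: each field records the bytes it needs (bytes_requested) and the bytes it occupies once the running offset is rounded up to the allocator alignment (bytes_allocated). The FieldLayout struct, the 64-byte alignment value, and 4-byte floats are assumptions for illustration; the example sizes mirror the 512/9/512-float shapes used by the PopulateFields unit test that follows.

// Standalone sketch of the field layout computation.
#include <cstddef>
#include <cstdio>
#include <vector>

struct FieldLayout {
  std::size_t offset;
  std::size_t bytes_requested;
  std::size_t bytes_allocated;
};

std::vector<FieldLayout> Layout(const std::vector<std::size_t>& byte_sizes,
                                std::size_t alignment) {
  std::vector<FieldLayout> fields;
  std::size_t offset = 0;
  for (std::size_t bytes_requested : byte_sizes) {
    FieldLayout f;
    f.offset = offset;
    f.bytes_requested = bytes_requested;
    offset += bytes_requested;
    // Pad the end of the field so the next field starts on an aligned offset.
    std::size_t bytes_allocated = bytes_requested;
    std::size_t overshoot = offset % alignment;
    if (overshoot > 0) {
      std::size_t alignment_bytes = alignment - overshoot;
      bytes_allocated += alignment_bytes;
      offset += alignment_bytes;
    }
    f.bytes_allocated = bytes_allocated;
    fields.push_back(f);
  }
  return fields;
}

int main() {
  const std::size_t kAlignment = 64;  // assumed value of Allocator::kAllocatorAlignment
  // 512, 9 and 512 floats -> 2048, 36 and 2048 bytes (assuming 4-byte floats).
  for (const FieldLayout& f : Layout({2048, 36, 2048}, kAlignment)) {
    std::printf("offset=%zu requested=%zu allocated=%zu\n", f.offset,
                f.bytes_requested, f.bytes_allocated);
  }
  // Prints: offset=0    requested=2048 allocated=2048
  //         offset=2048 requested=36   allocated=64
  //         offset=2112 requested=2048 allocated=2048
  return 0;
}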
@@ -110,13 +110,13 @@ TEST_F(ScopedAllocatorMgrTest, PopulateFields) {
   InitTensor();
   PopulateFields();
   EXPECT_EQ(0, fields_[0].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes_requested);
   EXPECT_EQ(scope_id_ + 1, fields_[0].scope_id);
   EXPECT_EQ(512 * sizeof(float), fields_[1].offset);
-  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes);
+  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes_requested);
   EXPECT_EQ(scope_id_ + 2, fields_[1].scope_id);
   EXPECT_EQ(521 * sizeof(float) + AlignmentPadding(), fields_[2].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes_requested);
   EXPECT_EQ(scope_id_ + 3, fields_[2].scope_id);
 }
 
|
|||||||
fields_.resize(1);
|
fields_.resize(1);
|
||||||
fields_[0].scope_id = scope_id_ + 1;
|
fields_[0].scope_id = scope_id_ + 1;
|
||||||
fields_[0].offset = 0;
|
fields_[0].offset = 0;
|
||||||
fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
|
fields_[0].bytes_requested =
|
||||||
// fields[0].offset + fields[0].bytes is larger than the size of the backing
|
backing_tensor_shape_.num_elements() * 2 * sizeof(float);
|
||||||
// tensor, so this check should fail
|
// fields[0].offset + fields[0].bytes_requested is larger than the size of the
|
||||||
|
// backing tensor, so this check should fail
|
||||||
EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
|
EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -39,7 +39,7 @@ class ScopedAllocatorOp : public OpKernel {
     // the subtensors to be allocated from it, taking into account
     // alignment considerations.
     ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
-    size_t num_bytes = fields_.back().offset + fields_.back().bytes;
+    size_t num_bytes = fields_.back().offset + fields_.back().bytes_allocated;
     num_elements_ = num_bytes / DataTypeSize(dtype_);
     OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
                 errors::InvalidArgument(
@@ -91,8 +91,8 @@ void PrepOp(DataType dtype, int32 id,
   ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
   // We don't simply allocate a tensor with shape as backing_tensor_shape,
   // because we need to account for padding in the fields. We actually need a
-  // tensor of size at least (fields[-1].offset + fields[-1].bytes).
-  size_t num_bytes = fields->back().offset + fields->back().bytes;
+  // tensor of size at least (fields[-1].offset + fields[-1].bytes_allocated).
+  size_t num_bytes = fields->back().offset + fields->back().bytes_allocated;
   int32_t num_elements = num_bytes / DataTypeSize(dtype);
   CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);
 
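To make the comment in the hunk above concrete: padding inside earlier fields pushes later offsets past the plain sum of the requested sizes, which is why both the op and the test size the backing buffer from the last field's offset plus bytes_allocated. The tiny check below reuses the illustrative 512/9/512-float layout from the earlier sketch; the 64-byte alignment, 4-byte floats, and resulting byte counts are assumptions of this note, not values stated in the diff.

// Illustrative arithmetic only (standalone sketch).
#include <cstddef>

constexpr std::size_t kAlignment = 64;                   // assumed Allocator::kAllocatorAlignment
constexpr std::size_t kRequested[3] = {2048, 36, 2048};  // 512, 9 and 512 floats
constexpr std::size_t kPadding = kAlignment - (2048 + 36) % kAlignment;  // 28 bytes after the middle field
constexpr std::size_t kLastOffset = 2048 + 36 + kPadding;                // last field starts at 2112
constexpr std::size_t kNumBytes = kLastOffset + 2048;    // offset + bytes_allocated of the last field

static_assert(kPadding == 28, "middle field is padded from 36 to 64 bytes");
static_assert(kNumBytes == 4160, "backing buffer needs 4160 bytes (1040 floats)");
static_assert(kNumBytes > kRequested[0] + kRequested[1] + kRequested[2],
              "padding makes the backing buffer larger than the raw sum of requests");

int main() { return 0; }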
@@ -158,7 +158,7 @@ class CollectiveOpTest(test.TestCase):
         # to `all_reduce` has an explicit device string. We don't use
         # `identity` because `cast` is more resilient to getting optimized
         # away by various optimization passes.
-        input_tensor = math_ops.cast(device_tensors[j], dtypes.float64)
+        input_tensor = math_ops.cast(device_tensors[j], dtypes.float16)
         collective_op = collective_ops.all_reduce(
             input_tensor, group_size, group_key, instances[j],
             'Add', 'Id')