Annotate alignment bytes in scoped allocator as initialized.

Padding in the scoped allocator may be uninitialized.  This is okay because the
uninitialized memory is not consumed meaningfully by any downstream op.  This
change annotates the padding with TF_ANNOTATE_MEMORY_IS_INITIALIZED to avoid
MSAN warnings.

PiperOrigin-RevId: 268051135
Author: Ayush Dubey, 2019-09-09 12:26:11 -07:00 (committed by TensorFlower Gardener)
parent 24888a277e
commit 0e5f75d3be
7 changed files with 97 additions and 56 deletions
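As a rough illustration of the MSAN report this change avoids (a standalone sketch, not TensorFlow code; the 36/64-byte sizes and the direct sanitizer calls are illustrative assumptions): if only the requested bytes of a field are ever written, any check that covers the field's full allocated span flags the trailing alignment padding, and unpoisoning the padding first, which is effectively what the TF_ANNOTATE_MEMORY_IS_INITIALIZED call in the change below requests, suppresses exactly that report.

#include <cstddef>
#include <cstring>

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define BUILT_WITH_MSAN 1
#endif
#endif

int main() {
  constexpr size_t kBytesRequested = 36;  // e.g. 9 floats
  constexpr size_t kBytesAllocated = 64;  // rounded up to a 64-byte alignment
  char* backing = new char[kBytesAllocated];  // padding bytes stay uninitialized
  // Simulate the producer: only the requested bytes are written.
  std::memset(backing, 0, kBytesRequested);
#ifdef BUILT_WITH_MSAN
  // Without the unpoison call, checking the full allocation reports
  // use-of-uninitialized-value for the 28 trailing padding bytes.
  __msan_unpoison(backing + kBytesRequested, kBytesAllocated - kBytesRequested);
  __msan_check_mem_is_initialized(backing, kBytesAllocated);
#endif
  delete[] backing;
  return 0;
}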


@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/scoped_allocator.h"
#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
namespace tensorflow {
@@ -34,7 +36,7 @@ ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
tbuf_->Ref();
// Hold this until all expected_calls have been made.
container->Ref();
CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes);
CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes_requested);
}
ScopedAllocator::~ScopedAllocator() {
@@ -56,43 +58,66 @@ ScopedAllocator::~ScopedAllocator() {
void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
VLOG(1) << "ScopedAllocator index " << id_ << " AllocateRaw "
<< "field " << field_index << " num_bytes " << num_bytes;
mutex_lock l(mu_);
if (expected_call_count_ <= 0) {
LOG(ERROR) << "Scoped allocator " << name_
<< " could not satisfy request for " << num_bytes
<< " bytes, expected uses exhausted. ";
return nullptr;
}
int32_t num_fields = static_cast<int32>(fields_.size());
if (field_index >= num_fields) {
LOG(ERROR) << "ScopedAllocator " << name_
<< " received unexpected field number " << field_index;
return nullptr;
}
const Field& f = fields_[field_index];
if (num_bytes != f.bytes) {
LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
<< num_bytes << " bytes from field " << field_index
<< " which has precalculated size " << f.bytes << " and offset "
<< f.offset;
return nullptr;
}
void* ptr = static_cast<void*>((tbuf_->template base<char>() + f.offset));
++live_alloc_count_;
--expected_call_count_;
if (0 == expected_call_count_) {
for (auto& f : fields_) {
container_->Drop(f.scope_id, this);
void* ptr = nullptr;
const Field* field = nullptr;
{
mutex_lock l(mu_);
if (expected_call_count_ <= 0) {
LOG(ERROR) << "Scoped allocator " << name_
<< " could not satisfy request for " << num_bytes
<< " bytes, expected uses exhausted. ";
return nullptr;
}
int32_t num_fields = static_cast<int32>(fields_.size());
if (field_index >= num_fields) {
LOG(ERROR) << "ScopedAllocator " << name_
<< " received unexpected field number " << field_index;
return nullptr;
}
field = &fields_[field_index];
if (num_bytes != field->bytes_requested) {
LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
<< num_bytes << " bytes from field " << field_index
<< " which has precalculated size " << field->bytes_requested
<< " and offset " << field->offset;
return nullptr;
}
ptr = static_cast<void*>((tbuf_->template base<char>() + field->offset));
++live_alloc_count_;
--expected_call_count_;
if (0 == expected_call_count_) {
for (auto& f : fields_) {
container_->Drop(f.scope_id, this);
}
container_->Drop(id_, this);
container_->Unref();
container_ = nullptr;
}
container_->Drop(id_, this);
container_->Unref();
container_ = nullptr;
}
VLOG(1) << "AllocateRaw returning " << ptr;
VLOG(2) << "AllocateRaw returning " << ptr << " bytes_requested "
<< field->bytes_requested << " bytes_allocated "
<< field->bytes_allocated;
// If there is overshoot due to alignment, let MSAN believe that the padding
// is initialized. This is okay because we do not use this memory region for
// anything meaningful.
if (field->bytes_allocated > field->bytes_requested) {
size_t extra_bytes = field->bytes_allocated - field->bytes_requested;
void* extra_buf = static_cast<void*>(static_cast<char*>(ptr) +
field->bytes_allocated - extra_bytes);
VLOG(2) << "AllocateRaw requested " << num_bytes
<< " bytes which is not divisible by kAllocatorAlignment="
<< Allocator::kAllocatorAlignment << " and hence we allocated "
<< field->bytes_allocated << ". Annotating " << extra_bytes
<< " bytes starting at " << extra_buf
<< " with TF_ANNOTATE_MEMORY_IS_INITIALIZED";
TF_ANNOTATE_MEMORY_IS_INITIALIZED(extra_buf, extra_bytes);
}
return ptr;
}
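Factored out of the code above (a sketch with illustrative names, relying only on the TF_ANNOTATE_MEMORY_IS_INITIALIZED macro from the dynamic_annotations.h header this change includes), the annotation step reduces to the observation that the padding begins right after the requested bytes, since bytes_allocated - extra_bytes == bytes_requested:

#include <cstddef>
#include "tensorflow/core/platform/dynamic_annotations.h"

// Tell MSAN that the alignment padding at the end of a field is initialized.
// `ptr` is the field's base address inside the backing tensor.
static void AnnotateAlignmentPadding(void* ptr, size_t bytes_requested,
                                     size_t bytes_allocated) {
  if (bytes_allocated <= bytes_requested) return;  // no padding to annotate
  char* padding_start = static_cast<char*>(ptr) + bytes_requested;
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(padding_start,
                                    bytes_allocated - bytes_requested);
}

Note also the lock scope in the new code: the field lookup and reference-count bookkeeping stay under mu_, while the logging and the annotation run after the mutex_lock block ends, touching only the already-computed ptr and field pointer.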


@@ -35,7 +35,8 @@ class ScopedAllocator {
struct Field {
int32 scope_id;
size_t offset;
size_t bytes;
size_t bytes_requested;
size_t bytes_allocated;
};
// Field index that refers to backing tensor, not any aliased field.
static const int32 kBackingIndex = -1;


@@ -166,21 +166,35 @@ size_t ScopedAllocatorMgr::PopulateFields(
const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
const int32 num_fields = static_cast<int32>(shapes.size());
fields->resize(num_fields);
// At the end of iteration `i`, `offset` points to the offset from the start
// of the backing buffer until the end of `field[i].bytes_allocated`. This
// is aligned to `kAllocatorAlignment`.
size_t offset = 0;
for (int32 i = 0; i < num_fields; ++i) {
size_t bytes_requested = shapes[i].num_elements() * DataTypeSize(dtype);
auto* field = &((*fields)[i]);
field->scope_id = scope_id + 1 + i;
field->bytes_requested = bytes_requested;
field->offset = offset;
offset += bytes_requested;
// Compute actual #bytes allocated, which may include padding due to
// alignment.
size_t bytes_allocated = bytes_requested;
size_t overshoot = offset % Allocator::kAllocatorAlignment;
if (overshoot > 0) {
offset += (Allocator::kAllocatorAlignment - overshoot);
size_t alignment_bytes = Allocator::kAllocatorAlignment - overshoot;
bytes_allocated += alignment_bytes;
offset += alignment_bytes;
}
size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
(*fields)[i].scope_id = scope_id + 1 + i;
(*fields)[i].bytes = bytes;
(*fields)[i].offset = offset;
VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
<< " bytes=" << (*fields)[i].bytes
<< " offset=" << (*fields)[i].offset;
offset += bytes;
field->bytes_allocated = bytes_allocated;
VLOG(1) << "field=" << i << " scope_id=" << field->scope_id
<< " bytes_requested=" << field->bytes_requested
<< " offset=" << field->offset
<< " bytes_allocated=" << field->bytes_allocated;
}
return offset;
}
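The layout rule above can be condensed into the sketch below (illustrative names; it assumes, as in stock TensorFlow, that Allocator::kAllocatorAlignment is 64 bytes). Because the running offset is aligned at the top of every iteration, rounding the request itself up to the alignment is equivalent to the offset-based overshoot computation in PopulateFields. The numbers in main() match the PopulateFields test that follows.

#include <cstddef>
#include <iostream>
#include <vector>

constexpr size_t kAllocatorAlignment = 64;  // assumed value of Allocator::kAllocatorAlignment

struct FieldLayout {
  size_t offset;           // start of the field in the backing buffer
  size_t bytes_requested;  // exact payload size
  size_t bytes_allocated;  // payload plus alignment padding
};

// Lay the requests out back to back, padding each field's end up to the next
// alignment boundary, as ScopedAllocatorMgr::PopulateFields does.
std::vector<FieldLayout> LayOutFields(const std::vector<size_t>& requests) {
  std::vector<FieldLayout> fields;
  size_t offset = 0;
  for (size_t bytes_requested : requests) {
    size_t overshoot = bytes_requested % kAllocatorAlignment;
    size_t bytes_allocated =
        overshoot == 0 ? bytes_requested
                       : bytes_requested + (kAllocatorAlignment - overshoot);
    fields.push_back({offset, bytes_requested, bytes_allocated});
    offset += bytes_allocated;
  }
  return fields;
}

int main() {
  // The shapes from the PopulateFields test below: 512, 9, and 512 floats.
  for (const FieldLayout& f : LayOutFields(
           {512 * sizeof(float), 9 * sizeof(float), 512 * sizeof(float)})) {
    std::cout << "offset=" << f.offset << " requested=" << f.bytes_requested
              << " allocated=" << f.bytes_allocated << "\n";
  }
  // With 4-byte floats this prints offsets 0, 2048, and 2112: the 36-byte
  // middle field is padded to 64 bytes, so the third field starts at
  // 521 * sizeof(float) + 28 bytes, matching the AlignmentPadding() term below.
  return 0;
}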


@@ -110,13 +110,13 @@ TEST_F(ScopedAllocatorMgrTest, PopulateFields) {
InitTensor();
PopulateFields();
EXPECT_EQ(0, fields_[0].offset);
EXPECT_EQ(512 * sizeof(float), fields_[0].bytes);
EXPECT_EQ(512 * sizeof(float), fields_[0].bytes_requested);
EXPECT_EQ(scope_id_ + 1, fields_[0].scope_id);
EXPECT_EQ(512 * sizeof(float), fields_[1].offset);
EXPECT_EQ(9 * sizeof(float), fields_[1].bytes);
EXPECT_EQ(9 * sizeof(float), fields_[1].bytes_requested);
EXPECT_EQ(scope_id_ + 2, fields_[1].scope_id);
EXPECT_EQ(521 * sizeof(float) + AlignmentPadding(), fields_[2].offset);
EXPECT_EQ(512 * sizeof(float), fields_[2].bytes);
EXPECT_EQ(512 * sizeof(float), fields_[2].bytes_requested);
EXPECT_EQ(scope_id_ + 3, fields_[2].scope_id);
}
@@ -185,9 +185,10 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorInitFail) {
fields_.resize(1);
fields_[0].scope_id = scope_id_ + 1;
fields_[0].offset = 0;
fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
// fields[0].offset + fields[0].bytes is larger than the size of the backing
// tensor, so this check should fail
fields_[0].bytes_requested =
backing_tensor_shape_.num_elements() * 2 * sizeof(float);
// fields[0].offset + fields[0].bytes_requested is larger than the size of the
// backing tensor, so this check should fail
EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
}


@@ -39,7 +39,7 @@ class ScopedAllocatorOp : public OpKernel {
// the subtensors to be allocated from it, taking into account
// alignment considerations.
ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
size_t num_bytes = fields_.back().offset + fields_.back().bytes;
size_t num_bytes = fields_.back().offset + fields_.back().bytes_allocated;
num_elements_ = num_bytes / DataTypeSize(dtype_);
OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
errors::InvalidArgument(
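As a worked example (assuming 64-byte Allocator::kAllocatorAlignment and reusing the three float shapes from the PopulateFields test above): the last field starts at offset 2112 with bytes_allocated = 2048, so num_bytes = 4160 and num_elements_ = 1040. Using bytes_allocated rather than bytes_requested here sizes the backing tensor to cover the final field's padding, so the region annotated in AllocateRaw stays inside the buffer; the divisibility check still passes because every offset and allocation is a multiple of the alignment, which is itself a multiple of DataTypeSize(dtype_) for the standard dtypes.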


@@ -91,8 +91,8 @@ void PrepOp(DataType dtype, int32 id,
ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
// We don't simply allocate a tensor with shape as backing_tensor_shape,
// because we need to account for padding in the fields. We actually need a
// tensor of size at least (fields[-1].offset + fields[-1].bytes).
size_t num_bytes = fields->back().offset + fields->back().bytes;
// tensor of size at least (fields[-1].offset + fields[-1].bytes_allocated).
size_t num_bytes = fields->back().offset + fields->back().bytes_allocated;
int32_t num_elements = num_bytes / DataTypeSize(dtype);
CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);


@@ -158,7 +158,7 @@ class CollectiveOpTest(test.TestCase):
# to `all_reduce` has an explicit device string. We don't use
# `identity` because `cast` is more resilient to getting optimized
# away by various optimization passes.
input_tensor = math_ops.cast(device_tensors[j], dtypes.float64)
input_tensor = math_ops.cast(device_tensors[j], dtypes.float16)
collective_op = collective_ops.all_reduce(
input_tensor, group_size, group_key, instances[j],
'Add', 'Id')