diff --git a/tensorflow/core/common_runtime/scoped_allocator.cc b/tensorflow/core/common_runtime/scoped_allocator.cc
index 3c3e4ffd6d9..e61df192c39 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/scoped_allocator.h"
+
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace tensorflow {
 
@@ -34,7 +36,7 @@ ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32 scope_id,
   tbuf_->Ref();
   // Hold this until all expected_calls have been made.
   container->Ref();
-  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes);
+  CHECK_GE(tbuf_->size(), fields.back().offset + fields.back().bytes_requested);
 }
 
 ScopedAllocator::~ScopedAllocator() {
@@ -56,43 +58,66 @@ ScopedAllocator::~ScopedAllocator() {
 void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
   VLOG(1) << "ScopedAllocator index " << id_ << " AllocateRaw "
           << "field " << field_index << " num_bytes " << num_bytes;
-  mutex_lock l(mu_);
-  if (expected_call_count_ <= 0) {
-    LOG(ERROR) << "Scoped allocator " << name_
-               << " could not satisfy request for " << num_bytes
-               << " bytes, expected uses exhausted. ";
-    return nullptr;
-  }
-
-  int32_t num_fields = static_cast<int32>(fields_.size());
-  if (field_index >= num_fields) {
-    LOG(ERROR) << "ScopedAllocator " << name_
-               << " received unexpected field number " << field_index;
-    return nullptr;
-  }
-
-  const Field& f = fields_[field_index];
-  if (num_bytes != f.bytes) {
-    LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
-               << num_bytes << " bytes from field " << field_index
-               << " which has precalculated size " << f.bytes << " and offset "
-               << f.offset;
-    return nullptr;
-  }
-
-  void* ptr = static_cast<void*>((tbuf_->template base<char>() + f.offset));
-
-  ++live_alloc_count_;
-  --expected_call_count_;
-  if (0 == expected_call_count_) {
-    for (auto& f : fields_) {
-      container_->Drop(f.scope_id, this);
+  void* ptr = nullptr;
+  const Field* field = nullptr;
+  {
+    mutex_lock l(mu_);
+    if (expected_call_count_ <= 0) {
+      LOG(ERROR) << "Scoped allocator " << name_
+                 << " could not satisfy request for " << num_bytes
+                 << " bytes, expected uses exhausted. ";
+      return nullptr;
+    }
+
+    int32_t num_fields = static_cast<int32>(fields_.size());
+    if (field_index >= num_fields) {
+      LOG(ERROR) << "ScopedAllocator " << name_
+                 << " received unexpected field number " << field_index;
+      return nullptr;
+    }
+
+    field = &fields_[field_index];
+    if (num_bytes != field->bytes_requested) {
+      LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
+                 << num_bytes << " bytes from field " << field_index
+                 << " which has precalculated size " << field->bytes_requested
+                 << " and offset " << field->offset;
+      return nullptr;
+    }
+
+    ptr = static_cast<void*>((tbuf_->template base<char>() + field->offset));
+
+    ++live_alloc_count_;
+    --expected_call_count_;
+    if (0 == expected_call_count_) {
+      for (auto& f : fields_) {
+        container_->Drop(f.scope_id, this);
+      }
+      container_->Drop(id_, this);
+      container_->Unref();
+      container_ = nullptr;
     }
-    container_->Drop(id_, this);
-    container_->Unref();
-    container_ = nullptr;
   }
-  VLOG(1) << "AllocateRaw returning " << ptr;
+  VLOG(2) << "AllocateRaw returning " << ptr << " bytes_requested "
+          << field->bytes_requested << " bytes_allocated "
+          << field->bytes_allocated;
+
+  // If there is overshoot due to alignment, let MSAN believe that the padding
+  // is initialized. This is okay because we do not use this memory region for
+  // anything meaningful.
+  if (field->bytes_allocated > field->bytes_requested) {
+    size_t extra_bytes = field->bytes_allocated - field->bytes_requested;
+    void* extra_buf = static_cast<void*>(static_cast<char*>(ptr) +
+                                         field->bytes_allocated - extra_bytes);
+    VLOG(2) << "AllocateRaw requested " << num_bytes
+            << " bytes which is not divisible by kAllocatorAlignment="
+            << Allocator::kAllocatorAlignment << " and hence we allocated "
+            << field->bytes_allocated << ". Annotating " << extra_bytes
+            << " bytes starting at " << extra_buf
+            << " with TF_ANNOTATE_MEMORY_IS_INITIALIZED";
+    TF_ANNOTATE_MEMORY_IS_INITIALIZED(extra_buf, extra_bytes);
+  }
+
   return ptr;
 }
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h
index 683bbc7e9ed..1b458aacb23 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.h
+++ b/tensorflow/core/common_runtime/scoped_allocator.h
@@ -35,7 +35,8 @@ class ScopedAllocator {
   struct Field {
     int32 scope_id;
    size_t offset;
-    size_t bytes;
+    size_t bytes_requested;
+    size_t bytes_allocated;
   };
   // Field index that refers to backing tensor, not any aliased field.
   static const int32 kBackingIndex = -1;
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index 8ac6adc2e48..96f55aa4f4d 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -166,21 +166,35 @@ size_t ScopedAllocatorMgr::PopulateFields(
     const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
   const int32 num_fields = static_cast<int32>(shapes.size());
   fields->resize(num_fields);
+  // At the end of iteration `i`, `offset` points to the offset from the start
+  // of the backing buffer until the end of `field[i].bytes_allocated`. This
+  // is aligned to `kAllocatorAlignment`.
   size_t offset = 0;
   for (int32 i = 0; i < num_fields; ++i) {
+    size_t bytes_requested = shapes[i].num_elements() * DataTypeSize(dtype);
+    auto* field = &((*fields)[i]);
+    field->scope_id = scope_id + 1 + i;
+    field->bytes_requested = bytes_requested;
+    field->offset = offset;
+    offset += bytes_requested;
+
+    // Compute actual #bytes allocated, which may include padding due to
+    // alignment.
+    size_t bytes_allocated = bytes_requested;
     size_t overshoot = offset % Allocator::kAllocatorAlignment;
     if (overshoot > 0) {
-      offset += (Allocator::kAllocatorAlignment - overshoot);
+      size_t alignment_bytes = Allocator::kAllocatorAlignment - overshoot;
+      bytes_allocated += alignment_bytes;
+      offset += alignment_bytes;
     }
-    size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
-    (*fields)[i].scope_id = scope_id + 1 + i;
-    (*fields)[i].bytes = bytes;
-    (*fields)[i].offset = offset;
-    VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
-            << " bytes=" << (*fields)[i].bytes
-            << " offset=" << (*fields)[i].offset;
-    offset += bytes;
+    field->bytes_allocated = bytes_allocated;
+
+    VLOG(1) << "field=" << i << " scope_id=" << field->scope_id
+            << " bytes_requested=" << field->bytes_requested
+            << " offset=" << field->offset
+            << " bytes_allocated=" << field->bytes_allocated;
   }
+
   return offset;
 }
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
index 38e07e47f24..a359924f056 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
@@ -110,13 +110,13 @@ TEST_F(ScopedAllocatorMgrTest, PopulateFields) {
   InitTensor();
   PopulateFields();
   EXPECT_EQ(0, fields_[0].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[0].bytes_requested);
   EXPECT_EQ(scope_id_ + 1, fields_[0].scope_id);
   EXPECT_EQ(512 * sizeof(float), fields_[1].offset);
-  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes);
+  EXPECT_EQ(9 * sizeof(float), fields_[1].bytes_requested);
   EXPECT_EQ(scope_id_ + 2, fields_[1].scope_id);
   EXPECT_EQ(521 * sizeof(float) + AlignmentPadding(), fields_[2].offset);
-  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes);
+  EXPECT_EQ(512 * sizeof(float), fields_[2].bytes_requested);
   EXPECT_EQ(scope_id_ + 3, fields_[2].scope_id);
 }
 
@@ -185,9 +185,10 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorInitFail) {
   fields_.resize(1);
   fields_[0].scope_id = scope_id_ + 1;
   fields_[0].offset = 0;
-  fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
-  // fields[0].offset + fields[0].bytes is larger than the size of the backing
-  // tensor, so this check should fail
+  fields_[0].bytes_requested =
+      backing_tensor_shape_.num_elements() * 2 * sizeof(float);
+  // fields[0].offset + fields[0].bytes_requested is larger than the size of the
+  // backing tensor, so this check should fail
   EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
 }
 
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
index 79320e08cb2..881d728f632 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -39,7 +39,7 @@ class ScopedAllocatorOp : public OpKernel {
     // the subtensors to be allocated from it, taking into account
    // alignment considerations.
     ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
-    size_t num_bytes = fields_.back().offset + fields_.back().bytes;
+    size_t num_bytes = fields_.back().offset + fields_.back().bytes_allocated;
     num_elements_ = num_bytes / DataTypeSize(dtype_);
     OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
                 errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 634f9ba8876..531089decf2 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -91,8 +91,8 @@ void PrepOp(DataType dtype, int32 id,
   ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
   // We don't simply allocate a tensor with shape as backing_tensor_shape,
   // because we need to account for padding in the fields. We actually need a
-  // tensor of size at least (fields[-1].offset + fields[-1].bytes).
-  size_t num_bytes = fields->back().offset + fields->back().bytes;
+  // tensor of size at least (fields[-1].offset + fields[-1].bytes_allocated).
+  size_t num_bytes = fields->back().offset + fields->back().bytes_allocated;
   int32_t num_elements = num_bytes / DataTypeSize(dtype);
   CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);
 
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index a626f1c2594..c7fe931a0f7 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -158,7 +158,7 @@ class CollectiveOpTest(test.TestCase):
           # to `all_reduce` has an explicit device string. We don't use
          # `identity` because `cast` is more resilient to getting optimized
          # away by various optimization passes.
-          input_tensor = math_ops.cast(device_tensors[j], dtypes.float64)
+          input_tensor = math_ops.cast(device_tensors[j], dtypes.float16)
           collective_op = collective_ops.all_reduce(
               input_tensor, group_size, group_key, instances[j], 'Add', 'Id')
 
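Note on the layout arithmetic in this change: PopulateFields now records, per field, both the bytes the caller asked for (bytes_requested) and the aligned footprint actually reserved for it (bytes_allocated), and AllocateRaw marks the padding between the two as initialized for MSAN. The standalone C++ sketch below is not TensorFlow code; it only mirrors that bookkeeping to show where the padding comes from. The names FieldLayout and Layout are invented for the illustration, and the 64-byte alignment is an assumption standing in for Allocator::kAllocatorAlignment.

#include <cstddef>
#include <cstdio>
#include <vector>

// Per-field bookkeeping mirroring ScopedAllocator::Field: the bytes the
// caller requested and the (possibly larger) aligned footprint reserved.
struct FieldLayout {
  size_t offset;
  size_t bytes_requested;
  size_t bytes_allocated;
};

// Lay requests out back to back, rounding the end of each field up to
// `alignment` and charging the padding to that field's bytes_allocated.
std::vector<FieldLayout> Layout(const std::vector<size_t>& requests,
                                size_t alignment) {
  std::vector<FieldLayout> fields;
  size_t offset = 0;
  for (size_t bytes_requested : requests) {
    FieldLayout f;
    f.offset = offset;
    f.bytes_requested = bytes_requested;
    f.bytes_allocated = bytes_requested;
    offset += bytes_requested;
    size_t overshoot = offset % alignment;
    if (overshoot > 0) {
      size_t padding = alignment - overshoot;
      f.bytes_allocated += padding;  // padding trails the field
      offset += padding;
    }
    fields.push_back(f);
  }
  return fields;
}

int main() {
  // Same field sizes as the PopulateFields test: 512, 9, and 512 floats.
  const size_t kAlignment = 64;  // assumed value of kAllocatorAlignment
  const std::vector<size_t> requests = {512 * 4, 9 * 4, 512 * 4};
  for (const FieldLayout& f : Layout(requests, kAlignment)) {
    std::printf("offset=%zu requested=%zu allocated=%zu padding=%zu\n",
                f.offset, f.bytes_requested, f.bytes_allocated,
                f.bytes_allocated - f.bytes_requested);
  }
  // With 64-byte alignment this prints:
  //   offset=0 requested=2048 allocated=2048 padding=0
  //   offset=2048 requested=36 allocated=64 padding=28
  //   offset=2112 requested=2048 allocated=2048 padding=0
  // The 28 padding bytes after the 9-float field are the kind of region
  // AllocateRaw now hands to TF_ANNOTATE_MEMORY_IS_INITIALIZED.
  return 0;
}

Under those assumptions the backing buffer needs last.offset + last.bytes_allocated bytes (4160 here), which matches why ScopedAllocatorOp and the ops test now size the backing tensor with bytes_allocated rather than the old bytes field.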