diff --git a/tensorflow/lite/micro/examples/hello_world/hello_world_test.cc b/tensorflow/lite/micro/examples/hello_world/hello_world_test.cc index 75dd607f75c..3d1155ef41e 100644 --- a/tensorflow/lite/micro/examples/hello_world/hello_world_test.cc +++ b/tensorflow/lite/micro/examples/hello_world/hello_world_test.cc @@ -43,8 +43,8 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { tflite::ops::micro::AllOpsResolver resolver; // Create an area of memory to use for input, output, and intermediate arrays. - // Finding the minimum value for your model may require some trial and error. - const int tensor_arena_size = 2 * 1024; + // `arena_used_bytes` can be used to retrieve the optimal size. + const int tensor_arena_size = 2208 + 16 + 100 /* some reserved space */; uint8_t tensor_arena[tensor_arena_size]; // Build an interpreter to run the model with @@ -53,6 +53,10 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { // Allocate memory from the tensor_arena for the model's tensors TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + // At the time of writing, the hello world model uses 2208 bytes; we leave + // 100 bytes head room here to make the test less fragile and at the same + // time, alert on a substantial increase. 
+ TF_LITE_MICRO_EXPECT_LE(interpreter.arena_used_bytes(), 2208 + 100); // Obtain a pointer to the model's input tensor TfLiteTensor* input = interpreter.input(0); diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc index faea73e9169..c5e2d579ccd 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc @@ -41,13 +41,8 @@ void ReverseSortInPlace(int* values, int* ids, int size) { GreedyMemoryPlanner::GreedyMemoryPlanner(unsigned char* scratch_buffer, int scratch_buffer_size) : buffer_count_(0), need_to_calculate_offsets_(true) { - const int per_buffer_size = sizeof(BufferRequirements) + // requirements_ - sizeof(int) + // buffer_sizes_sorted_by_size_ - sizeof(int) + // buffer_ids_sorted_by_size_ - sizeof(ListEntry) + // buffers_sorted_by_offset_ - sizeof(int); // buffer_offsets_; // Allocate the arrays we need within the scratch buffer arena. - max_buffer_count_ = scratch_buffer_size / per_buffer_size; + max_buffer_count_ = scratch_buffer_size / per_buffer_size(); unsigned char* next_free = scratch_buffer; requirements_ = reinterpret_cast<BufferRequirements*>(next_free); diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h index f2c77ed94f3..0cb81093596 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h @@ -86,6 +86,17 @@ class GreedyMemoryPlanner : public MemoryPlanner { int next_entry_index; }; + // Number of bytes required in order to plan a buffer. 
+ static size_t per_buffer_size() { + const int per_buffer_size = + sizeof(BufferRequirements) + // requirements_ + sizeof(int) + // buffer_sizes_sorted_by_size_ + sizeof(int) + // buffer_ids_sorted_by_size_ + sizeof(ListEntry) + // buffers_sorted_by_offset_ + sizeof(int); // buffer_offsets_; + return per_buffer_size; + } + private: // Whether a buffer is active in a given time range. bool DoesEntryOverlapInTime(const ListEntry* entry, const int first_time_used, diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 1bbcadf110e..573ac2e0b11 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -440,6 +440,13 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, ErrorReporter* error_reporter) : model_(model), error_reporter_(error_reporter), context_(context) { uint8_t* aligned_arena = AlignPointerUp(tensor_arena, kBufferAlignment); + if (aligned_arena != tensor_arena) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "%d bytes lost due to alignment. To avoid this loss, please make sure " + "the tensor_arena is 16 bytes aligned.", + aligned_arena - tensor_arena); + } size_t aligned_arena_size = tensor_arena + arena_size - aligned_arena; // Creates a root memory allocator managing the arena. The allocator itself // also locates in the arena buffer. This allocator doesn't need to be diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index e7dd4f3e34e..a846b0c63ba 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -64,9 +64,10 @@ typedef struct { // This information could change in the future version. 
// ************** .memory_allocator->GetBuffer() // Tensors/Scratch buffers (head) -// ************** +// ************** .head_watermark // unused memory -// ************** .memory_allocator->GetBuffer() + ->GetDataSize() +// ************** .memory_allocator->GetBuffer() + ->GetMaxBufferSize() +// - ->GetDataSize() // persistent area (tail) // ************** .memory_allocator->GetBuffer() + ->GetMaxBufferSize() class MicroAllocator { @@ -88,6 +89,15 @@ class MicroAllocator { // called in this class. TfLiteStatus FinishTensorAllocation(); + // Returns the arena usage in bytes, only available after + // `FinishTensorAllocation`. Otherwise, it will return 0. + size_t used_bytes() const { + if (active_) { + return 0; + } + return memory_allocator_->GetUsedBytes(); + } + // Run through the model to allocate nodes and registrations. We need to keep // them for the entire life time of the model to allow persistent tensors. // This method needs to be called before FinishTensorAllocation method. @@ -115,6 +125,7 @@ class MicroAllocator { TfLiteStatus Init(); const Model* model_; + // A simple memory allocator that always allocates from the arena tail. 
SimpleMemoryAllocator* memory_allocator_; ErrorReporter* error_reporter_; TfLiteContext* context_; diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 03aa7c0bd39..78419edbbf9 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -142,11 +142,15 @@ TF_LITE_MICRO_TEST(TestMissingQuantization) { TF_LITE_MICRO_TEST(TestFinishTensorAllocation) { const tflite::Model* model = tflite::testing::GetSimpleMockModel(); TfLiteContext context; - constexpr size_t arena_size = 1024; + constexpr size_t arena_size = + 760 /* minimal arena size at the time of writing */ + + 16 /* alignment */ + 100 /* leave some headroom for future proofing */; uint8_t arena[arena_size]; tflite::MicroAllocator allocator(&context, model, arena, arena_size, micro_test::reporter); TF_LITE_MICRO_EXPECT_EQ(4, context.tensors_size); + // Memory planning hasn't been finalized, so the number of used bytes is unknown. + TF_LITE_MICRO_EXPECT_EQ(0, allocator.used_bytes()); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocator.FinishTensorAllocation()); // No allocation to be done afterwards. @@ -170,6 +174,7 @@ TF_LITE_MICRO_TEST(TestFinishTensorAllocation) { context.tensors[1].data.raw); TF_LITE_MICRO_EXPECT_NE(context.tensors[3].data.raw, context.tensors[2].data.raw); + TF_LITE_MICRO_EXPECT_LE(allocator.used_bytes(), 760 + 100); } TF_LITE_MICRO_TEST(TestAllocationForModelsWithBranches) { diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index e41f2e3dc0f..b2046128c78 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -139,6 +139,14 @@ class MicroInterpreter { return node_and_registrations_[node_index]; } + // For debugging only. + // Returns the actual used arena in bytes. This method gives the optimal arena + // size. It's only available after `AllocateTensors` has been called. 
+ // Note that normally `tensor_arena` requires 16-byte alignment to fully + // utilize the space. If it's not the case, the optimal arena size would be + // arena_used_bytes() + 16. + size_t arena_used_bytes() const { return allocator_.used_bytes(); } + private: void CorrectTensorEndianness(TfLiteTensor* tensorCorr); diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index 9517a806f3b..36e8c009b96 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -174,7 +174,9 @@ TF_LITE_MICRO_TEST(TestInterpreter) { const tflite::Model* model = tflite::testing::GetSimpleMockModel(); TF_LITE_MICRO_EXPECT_NE(nullptr, model); tflite::MockOpResolver mock_resolver; - constexpr size_t allocator_buffer_size = 1024; + constexpr size_t allocator_buffer_size = + 928 /* optimal arena size at the time of writing. */ + + 16 /* alignment */ + 100 /* some headroom */; uint8_t allocator_buffer[allocator_buffer_size]; // Create a new scope so that we can test the destructor. @@ -183,6 +185,7 @@ TF_LITE_MICRO_TEST(TestInterpreter) { allocator_buffer_size, micro_test::reporter); TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + TF_LITE_MICRO_EXPECT_LE(interpreter.arena_used_bytes(), 928 + 100); TF_LITE_MICRO_EXPECT_EQ(1, interpreter.inputs_size()); TF_LITE_MICRO_EXPECT_EQ(2, interpreter.outputs_size()); @@ -266,12 +269,15 @@ TF_LITE_MICRO_TEST(TestVariableTensorReset) { TF_LITE_MICRO_EXPECT_NE(nullptr, model); tflite::MockOpResolver mock_resolver; - constexpr size_t allocator_buffer_size = 2048; + constexpr size_t allocator_buffer_size = + 2096 /* optimal arena size at the time of writing. 
*/ + + 16 /* alignment */ + 100 /* some headroom */; uint8_t allocator_buffer[allocator_buffer_size]; tflite::MicroInterpreter interpreter(model, mock_resolver, allocator_buffer, allocator_buffer_size, micro_test::reporter); TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + TF_LITE_MICRO_EXPECT_LE(interpreter.arena_used_bytes(), 2096 + 100); TF_LITE_MICRO_EXPECT_EQ(1, interpreter.inputs_size()); TF_LITE_MICRO_EXPECT_EQ(1, interpreter.outputs_size()); diff --git a/tensorflow/lite/micro/simple_memory_allocator.h b/tensorflow/lite/micro/simple_memory_allocator.h index 42c7d963ff7..cf1818609f6 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.h +++ b/tensorflow/lite/micro/simple_memory_allocator.h @@ -31,6 +31,8 @@ class SimpleMemoryAllocator { SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer_head, uint8_t* buffer_tail) : error_reporter_(error_reporter), + buffer_head_(buffer_head), + buffer_tail_(buffer_tail), head_(buffer_head), tail_(buffer_tail) {} SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer, @@ -47,9 +49,14 @@ class SimpleMemoryAllocator { uint8_t* GetHead() const { return head_; } uint8_t* GetTail() const { return tail_; } size_t GetAvailableMemory() const { return tail_ - head_; } + size_t GetUsedBytes() const { return GetBufferSize() - GetAvailableMemory(); } private: + size_t GetBufferSize() const { return buffer_tail_ - buffer_head_; } + ErrorReporter* error_reporter_; + uint8_t* buffer_head_; + uint8_t* buffer_tail_; uint8_t* head_; uint8_t* tail_; };