diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 78730cbdcb8..74f2e38b8ab 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -2100,10 +2100,14 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody() {
   return builder.Build();
 }
 
-void BM_SequentialWhiles(int num_iters, int num_whiles) {
+void BM_SequentialWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a chain of sequential while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  // Timer starts automatically at the first iteration of this loop
+  // and ends after the last one.
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2131,19 +2135,22 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // The entry computation should have three copies, and each body has one.
     ASSERT_EQ(CountCopies(module), 3 + num_whiles);
+    state.ResumeTiming();
   }
 }
 
-void BM_ParallelWhiles(int num_iters, int num_whiles) {
+void BM_ParallelWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a fan-out of parallel while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2182,9 +2189,9 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // Each body receives of copy of two of the parameters (the corresponding
     // elements in the body are modified), and there is one copy in each body.
@@ -2209,14 +2216,15 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
   return builder.Build();
 }
 
-void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
-  tensorflow::testing::StopTiming();
+void BM_ManyElementTuple(::testing::benchmark::State& state) {
+  const int num_tuple_inputs = state.range(0);
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsFromFlags());
   CopyInsertion copy_insertion;
   const Shape element_shape = ShapeUtil::MakeShape(F32, {});
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
     HloModule module("BM_ManyElementTuple", config);
     for (int j = 0; j < num_tuple_inputs; ++j) {
@@ -2234,9 +2242,8 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
     builder.AddInstruction(HloInstruction::CreateGetTupleElement(
         ShapeUtil::MakeShape(F32, {}), xla_while, 0));
     module.AddEntryComputation(builder.Build());
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index b04635dda03..b01caca0b1d 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -2545,8 +2545,7 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
 
 // Reducing many numbers should be fast because it doesn't create
 // intermediate Literals; the microbenchmark should finish in < 1 msec.
-void BM_ReducePrecisely(int num_iters) {
-  tensorflow::testing::StopTiming();
+void BM_ReducePrecisely(::testing::benchmark::State& state) {
   HloComputation::Builder b("BM_ReducePrecisely");
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsFromFlags());
@@ -2574,10 +2573,11 @@ void BM_ReducePrecisely(int num_iters) {
                                    /*dimensions_to_reduce=*/{0}, add_func));
   module.AddEntryComputation(b.Build());
 
-  HloEvaluator hlo_eval;
-  tensorflow::testing::StartTiming();
-  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
-  tensorflow::testing::StopTiming();
+  // Benchmark loop
+  for (auto s : state) {
+    HloEvaluator hlo_eval;
+    hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  }
 }
 
 BENCHMARK(BM_ReducePrecisely);
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
index 49751d10c5a..763d89e57fa 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -173,8 +173,10 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) {
 
 // Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
 // (cardinality of each non-leaf node's children).
-void BM_TakeSubTree(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_TakeSubTree(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   TestAllocator allocator;
   xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
@@ -183,13 +185,11 @@ void BM_TakeSubTree(int iters, int depth, int fan_out) {
   }
   xla::ScopedShapedBuffer shaped_buffer(shape, /*allocator=*/&allocator,
                                         /*device_ordinal=*/0);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     // Extract a buffer from approximately the middle of the first level of the
     // tree.
     (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release();
   }
-  tensorflow::testing::StopTiming();
 }
 
 BENCHMARK(BM_TakeSubTree)
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index c294355e269..9078f674fa0 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -535,94 +535,100 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) {
                }));
 }
 
-void BM_Construct(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Construct(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(shape);
   }
 }
 
-void BM_ConstructUnowned(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ConstructUnowned(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(&shape);
   }
 }
 
-void BM_Copy(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Copy(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = shape_tree;
     tensorflow::testing::DoNotOptimize(copy);
   }
 }
 
-void BM_Move(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Move(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = std::move(shape_tree);
     shape_tree = std::move(copy);
   }
 }
 
-void BM_ForEach(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ForEach(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) {
       tensorflow::testing::DoNotOptimize(index);
     });
   }
 }
 
-void BM_Iterate(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Iterate(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     for (auto& iter : shape_tree) {
       tensorflow::testing::DoNotOptimize(iter.second);
     }
diff --git a/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc
index 2a1eed7c7a7..c884fcca25b 100644
--- a/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc
@@ -824,9 +824,8 @@ XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
   ComputeAndCompare(&b, {});
 }
 
-void BM_ParallelFusion(int num_iters) {
+void BM_ParallelFusion(::testing::benchmark::State& state) {
   // Simple element-wise computation to benchmark parallel task partitioning.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -915,17 +914,16 @@ void BM_ParallelFusion(int num_iters) {
   const int64 total_bytes = param0_dim0 * param0_dim0 +
                             param1_dim0 * param1_dim0 +
                             param2_dim0 * param2_dim0;
-  tensorflow::testing::BytesProcessed(static_cast<int64>(num_iters) *
-                                      total_bytes * sizeof(float));
-  tensorflow::testing::UseRealTime();
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+
+  for (auto s : state) {
     auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * total_bytes *
+                          sizeof(float));
 }
 
-BENCHMARK(BM_ParallelFusion);
+BENCHMARK(BM_ParallelFusion)->UseRealTime();
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 0974d37779e..0362d5fe1a5 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -750,9 +750,7 @@ XLA_TEST_F(HloTestBase, AddOfDUS) {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0}));
 }
 
-void BM_DynamicSlice(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_DynamicSlice(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -817,8 +815,7 @@ void BM_DynamicSlice(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index fab1a53611f..e5fe7793769 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -946,9 +946,7 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
 
 // Benchmark that measures the overhead of the LocalClient API when running a
 // trivial computation
-void BM_LocalClientOverhead(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_LocalClientOverhead(benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -990,8 +988,7 @@ void BM_LocalClientOverhead(int num_iters) {
     ASSERT_IS_OK(result);
   }
 
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 697c24e6587..a343184d66e 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -357,8 +357,8 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferDeviceToHostBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
-    tensorflow::testing::StopTiming();
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     SetUp();
 
     std::vector<Literal> tuple_elements;
@@ -370,13 +370,11 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                            device_buffer));
-    tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
       TF_ASSERT_OK_AND_ASSIGN(
           Literal result,
           transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer));
     }
-    tensorflow::testing::StopTiming();
     TearDown();
   }
 
@@ -388,7 +386,8 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferHostToDeviceBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     tensorflow::testing::StopTiming();
     SetUp();
 
@@ -400,7 +399,7 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
     Literal literal = LiteralUtil::MakeTupleOwned(std::move(tuple_elements));
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
       TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                              device_buffer));
     }
@@ -411,16 +410,20 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   void TestBody() override {}
 };
 
-void BM_TransferDeviceToHost(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferDeviceToHost(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferDeviceToHostBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
-void BM_TransferHostToDevice(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferHostToDevice(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferHostToDeviceBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
 BENCHMARK(BM_TransferHostToDevice)
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 8e8c3605cc7..73bb30f46d0 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -1259,9 +1259,8 @@ XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) {
   ComputeAndCompareR0<int32>(&builder, 2, {});
 }
 
-void BM_WhileLoop(int num_iters) {
+void BM_WhileLoop(::testing::benchmark::State& state) {
   // Benchmark a simple kernel to measure while loop overheads.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -1330,8 +1329,7 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result =
         executable->Run(absl::Span<const ShapedBuffer* const>(), options);
     ASSERT_TRUE(result.ok());