diff --git a/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
index 97148945331..0be09d8e614 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
@@ -116,12 +116,15 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
 #define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING,    \
                            type)                                               \
   static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH,   \
-                      FW, FC, SH, SW, PADDING)(int iters) {                    \
-    testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *      \
-                            (C));                                              \
-    test::Benchmark(#type, Conv2DBackpropFilter(N, H, W, C, FH, FW, FC, SH,    \
-                                                SW, PADDING, FORMAT_##FMT))    \
-        .Run(iters);                                                           \
+                      FW, FC, SH, SW,                                          \
+                      PADDING)(::testing::benchmark::State & state) {          \
+    test::Benchmark(#type,                                                     \
+                    Conv2DBackpropFilter(N, H, W, C, FH, FW, FC, SH, SW,       \
+                                         PADDING, FORMAT_##FMT),               \
+                    /*old_benchmark_api*/ false)                               \
+        .Run(state);                                                           \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) *     \
+                            (H) * (W) * (C));                                  \
   }                                                                            \
   BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \
                     FC, SH, SW, PADDING));
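Every file in this patch applies the same mechanical rewrite shown above: the benchmark body takes a `::testing::benchmark::State&` instead of `int iters`, the graph goes to `test::Benchmark` together with the `/*old_benchmark_api*/ false` flag, `.Run(state)` replaces `.Run(iters)`, and the throughput counter moves after the run as `state.SetItemsProcessed(...)` computed from `state.iterations()`. A rough standalone sketch of that shape, written against the open-source Google Benchmark API that the TF shim mirrors (the function name and item count below are illustrative, not taken from the patch):

    #include <cstdint>

    #include "benchmark/benchmark.h"

    // New-style benchmark body: the framework drives the iteration count via
    // `state`, and throughput counters are reported after the timed loop.
    static void BM_Example(benchmark::State& state) {
      for (auto _ : state) {
        // ... the operation under test runs here ...
      }
      state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * 64 * 14 * 14);
    }
    BENCHMARK(BM_Example);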
"VALID" + : "N/A") .Attr("data_format", ToString(data_format)) .Finalize(graph, &conv2d)); @@ -115,12 +115,14 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width, #define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING, \ type) \ static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, \ - FW, FC, SH, SW, PADDING)(int iters) { \ - testing::ItemsProcessed(static_cast(iters) * (N) * (H) * (W) * \ - (C)); \ - test::Benchmark(#type, Conv2DBackpropInput(N, H, W, C, FH, FW, FC, SH, \ - SW, PADDING, FORMAT_##FMT)) \ - .Run(iters); \ + FW, FC, SH, SW, \ + PADDING)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, \ + Conv2DBackpropInput(N, H, W, C, FH, FW, FC, SH, SW, \ + PADDING, FORMAT_##FMT), \ + /*old_benchmark_api*/ false) \ + .Run(state); \ + state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C)); \ } \ BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \ FC, SH, SW, PADDING)); diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc b/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc index e253014bf94..83a5b40b24b 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc @@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) { SnapshotRoundTrip(io::compression::kSnappy, 2); } -void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type, - int version) { - tensorflow::testing::StopTiming(); - +void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state, + std::string compression_type, int version) { tensorflow::DataTypeVector dtypes; std::vector tensors; GenerateTensorVector(dtypes, tensors); @@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type, TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename, compression_type, version, dtypes, &writer)); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { writer->WriteTensors(tensors).IgnoreError(); } TF_ASSERT_OK(writer->Close()); @@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type, TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type, version, dtypes, &reader)); - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { std::vector read_tensors; reader->ReadTensors(&read_tensors).IgnoreError(); } - tensorflow::testing::StopTiming(); TF_ASSERT_OK(Env::Default()->DeleteFile(filename)); } -void SnapshotCustomReaderNoneBenchmark(int iters) { - SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1); +void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) { + SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1); } -void SnapshotCustomReaderGzipBenchmark(int iters) { - SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1); +void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) { + SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1); } -void SnapshotCustomReaderSnappyBenchmark(int iters) { - SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1); +void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) { + SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1); } -void SnapshotTFRecordReaderNoneBenchmark(int iters) { - SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2); +void 
diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc b/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc
index e253014bf94..83a5b40b24b 100644
--- a/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc
+++ b/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc
@@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) {
   SnapshotRoundTrip(io::compression::kSnappy, 2);
 }
 
-void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
-                                 int version) {
-  tensorflow::testing::StopTiming();
-
+void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state,
+                                 std::string compression_type, int version) {
   tensorflow::DataTypeVector dtypes;
   std::vector<Tensor> tensors;
   GenerateTensorVector(dtypes, tensors);
@@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
   TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
                               compression_type, version, dtypes, &writer));
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     writer->WriteTensors(tensors).IgnoreError();
   }
   TF_ASSERT_OK(writer->Close());
@@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
   TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type,
                               version, dtypes, &reader));
 
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     std::vector<Tensor> read_tensors;
     reader->ReadTensors(&read_tensors).IgnoreError();
   }
-  tensorflow::testing::StopTiming();
 
   TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
 }
 
-void SnapshotCustomReaderNoneBenchmark(int iters) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1);
+void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) {
+  SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1);
 }
 
-void SnapshotCustomReaderGzipBenchmark(int iters) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1);
+void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) {
+  SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1);
 }
 
-void SnapshotCustomReaderSnappyBenchmark(int iters) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1);
+void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) {
+  SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1);
 }
 
-void SnapshotTFRecordReaderNoneBenchmark(int iters) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2);
+void SnapshotTFRecordReaderNoneBenchmark(::testing::benchmark::State& state) {
+  SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 2);
 }
 
-void SnapshotTFRecordReaderGzipBenchmark(int iters) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 2);
+void SnapshotTFRecordReaderGzipBenchmark(::testing::benchmark::State& state) {
+  SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 2);
 }
 
 BENCHMARK(SnapshotCustomReaderNoneBenchmark);
@@ -151,10 +147,8 @@ BENCHMARK(SnapshotCustomReaderSnappyBenchmark);
 BENCHMARK(SnapshotTFRecordReaderNoneBenchmark);
 BENCHMARK(SnapshotTFRecordReaderGzipBenchmark);
 
-void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
-                                 int version) {
-  tensorflow::testing::StopTiming();
-
+void SnapshotWriterBenchmarkLoop(::testing::benchmark::State& state,
+                                 std::string compression_type, int version) {
   tensorflow::DataTypeVector dtypes;
   std::vector<Tensor> tensors;
   GenerateTensorVector(dtypes, tensors);
@@ -166,38 +160,36 @@ void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
   TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
                               compression_type, version, dtypes, &writer));
 
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     writer->WriteTensors(tensors).IgnoreError();
   }
   writer->Close().IgnoreError();
-  tensorflow::testing::StopTiming();
 
   TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
 }
 
-void SnapshotCustomWriterNoneBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 1);
+void SnapshotCustomWriterNoneBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 1);
 }
 
-void SnapshotCustomWriterGzipBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 1);
+void SnapshotCustomWriterGzipBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 1);
 }
 
-void SnapshotCustomWriterSnappyBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 1);
+void SnapshotCustomWriterSnappyBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 1);
 }
 
-void SnapshotTFRecordWriterNoneBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 2);
+void SnapshotTFRecordWriterNoneBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 2);
 }
 
-void SnapshotTFRecordWriterGzipBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 2);
+void SnapshotTFRecordWriterGzipBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 2);
 }
 
-void SnapshotTFRecordWriterSnappyBenchmark(int iters) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 2);
+void SnapshotTFRecordWriterSnappyBenchmark(::testing::benchmark::State& state) {
+  SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 2);
 }
 
 BENCHMARK(SnapshotCustomWriterNoneBenchmark);
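One consequence of the `for (auto s : state)` loop, visible throughout the snapshot benchmarks above, is that the explicit `StopTiming()`/`StartTiming()` bracketing becomes unnecessary: only the iterations of the range-for loop are measured, so setup before the loop and teardown after it stay out of the timing automatically. A small illustrative sketch, again against the open-source Google Benchmark API; the vector here merely stands in for the tensor/writer setup and is not from the patch:

    #include <vector>

    #include "benchmark/benchmark.h"

    static void BM_WriteRecords(benchmark::State& state) {
      // Setup outside the loop is not timed, which is why the old
      // StopTiming()/StartTiming() calls could simply be deleted.
      std::vector<int> records(1024, 42);

      for (auto s : state) {  // only this loop body is measured
        auto* p = records.data();
        benchmark::DoNotOptimize(p);
      }
      // Teardown after the loop is likewise excluded from the measurement.
    }
    BENCHMARK(BM_WriteRecords);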
diff --git a/tensorflow/core/kernels/eigen_benchmark.h b/tensorflow/core/kernels/eigen_benchmark.h
index 87e41b89b3d..8b35bfdcd64 100644
--- a/tensorflow/core/kernels/eigen_benchmark.h
+++ b/tensorflow/core/kernels/eigen_benchmark.h
@@ -35,8 +35,9 @@ class SpatialConvolutionBenchmarksSuite {
 
   using Dimensions = Eigen::DSizes;
 
-  SpatialConvolutionBenchmarksSuite(int iters, Device& device)
-      : iters_(iters), device_(device) {}
+  SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state,
+                                    Device& device)
+      : state_(state), device_(device) {}
 
   Eigen::Index BufferSize(const Dimensions& dims) {
     return dims.TotalSize() * sizeof(Scalar);
@@ -62,12 +63,10 @@ class SpatialConvolutionBenchmarksSuite {
     Filter filter(filter_data, filter_dims);
     Output output(output_data, output_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
       output.device(device_) = Eigen::SpatialConvolution(input, filter);
       tensorflow::testing::DoNotOptimize(output);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(input_data);
     device_.deallocate(filter_data);
@@ -102,13 +101,11 @@ class SpatialConvolutionBenchmarksSuite {
     OutputBackward output_backward(output_backward_data, output_dims);
     InputBackward input_backward(input_backward_data, input_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
       input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput(
           filter, output_backward, input_rows, input_cols);
       tensorflow::testing::DoNotOptimize(input_backward);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(filter_data);
     device_.deallocate(output_backward_data);
@@ -143,13 +140,11 @@ class SpatialConvolutionBenchmarksSuite {
     OutputBackward output_backward(output_backward_data, input_dims);
     FilterBackward filter_backward(filter_backward_data, filter_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
       filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel(
           input, output_backward, filter_rows, filter_cols);
       tensorflow::testing::DoNotOptimize(filter_backward);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(input_data);
     device_.deallocate(output_backward_data);
@@ -157,7 +152,8 @@ class SpatialConvolutionBenchmarksSuite {
     device_.deallocate(filter_backward_data);
   }
 
  private:
-  int iters_;
+  ::testing::benchmark::State& state_;
+
   Device& device_;
 };
 
@@ -170,8 +166,9 @@ class CuboidConvolutionBenchmarksSuite {
 
   using Dimensions = Eigen::DSizes;
 
-  CuboidConvolutionBenchmarksSuite(int iters, Device& device)
-      : iters_(iters), device_(device) {}
+  CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state,
+                                   Device& device)
+      : state_(state), device_(device) {}
 
   Eigen::Index BufferSize(const Dimensions& dims) {
     return dims.TotalSize() * sizeof(Scalar);
@@ -198,12 +195,10 @@ class CuboidConvolutionBenchmarksSuite {
     Filter filter(filter_data, filter_dims);
     Output output(output_data, output_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
      output.device(device_) = Eigen::CuboidConvolution(input, filter);
       tensorflow::testing::DoNotOptimize(output);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(input_data);
     device_.deallocate(filter_data);
@@ -240,13 +235,11 @@ class CuboidConvolutionBenchmarksSuite {
     OutputBackward output_backward(output_backward_data, output_dims);
     InputBackward input_backward(input_backward_data, input_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
       input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput(
           filter, output_backward, input_planes, input_rows, input_cols);
       tensorflow::testing::DoNotOptimize(input_backward);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(filter_data);
     device_.deallocate(output_backward_data);
@@ -283,13 +276,11 @@ class CuboidConvolutionBenchmarksSuite {
     OutputBackward output_backward(output_backward_data, output_dims);
     FilterBackward filter_backward(filter_backward_data, filter_dims);
 
-    ::tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters_; ++i) {
+    for (auto s : state_) {
       filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel(
           input, output_backward, filter_planes, filter_rows, filter_cols);
       tensorflow::testing::DoNotOptimize(filter_backward);
     }
-    ::tensorflow::testing::StopTiming();
 
     device_.deallocate(input_data);
     device_.deallocate(output_backward_data);
@@ -297,7 +288,7 @@ class CuboidConvolutionBenchmarksSuite {
     device_.deallocate(filter_backward_data);
   }
 
  private:
-  int iters_;
+  ::testing::benchmark::State& state_;
   Device& device_;
 };
diff --git a/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc b/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
index 12fa7f3409d..2abc2e99912 100644
--- a/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
+++ b/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
@@ -27,19 +27,17 @@ limitations under the License.
 // Spatial Convolutions                                                       //
 // -------------------------------------------------------------------------- //
 
-void SpatialConvolution(int iters, int num_threads,
+void SpatialConvolution(::testing::benchmark::State& state, int num_threads,
                         /* Input dimensions: */
                         int input_batches, int input_height, int input_width,
                         int input_depth,
                         /* Filter (kernel) dimensions: */
                         int filter_count, int filter_height, int filter_width) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = SpatialConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                             input_width, input_depth);
@@ -52,23 +50,22 @@ void SpatialConvolution(int iters, int num_threads,
       (input_dims.TotalSize() / input_depth) * filter_count;
   auto flops =
       num_computed_elements * (input_depth * filter_height * filter_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
-void SpatialConvolutionBackwardInput(int iters, int num_threads,
+void SpatialConvolutionBackwardInput(::testing::benchmark::State& state,
+                                     int num_threads,
                                      /* Input dimensions: */
                                      int input_batches, int input_height,
                                      int input_width, int input_depth,
                                      /* Filter (kernel) dimensions: */
                                      int filter_count, int filter_height,
                                      int filter_width) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = SpatialConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                             input_width, input_depth);
@@ -80,23 +77,22 @@ void SpatialConvolutionBackwardInput(int iters, int num_threads,
   auto num_computed_elements = input_dims.TotalSize();
   auto flops =
       num_computed_elements * (input_depth * filter_height * filter_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
-void SpatialConvolutionBackwardKernel(int iters, int num_threads,
+void SpatialConvolutionBackwardKernel(::testing::benchmark::State& state,
+                                      int num_threads,
                                       /* Input dimensions: */
                                       int input_batches, int input_height,
                                       int input_width, int input_depth,
                                       /* Filter (kernel) dimensions: */
                                       int filter_count, int filter_height,
                                       int filter_width) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = SpatialConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                             input_width, input_depth);
@@ -108,7 +104,7 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
   auto num_computed_elements = filter_dims.TotalSize();
   auto flops =
       num_computed_elements * (input_batches * input_height * input_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
 // Macro arguments names: --------------------------------------------------- //
@@ -126,26 +122,26 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
 
 #define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL)              \
   static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH,     \
-                              FW)(int iters) {                                \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    SpatialConvolution(iters, NT, N, H, W, C, FC, FH, FW);                    \
+                              FW)(::testing::benchmark::State & state) {      \
+    state.SetLabel(LABEL);                                                    \
+    SpatialConvolution(state, NT, N, H, W, C, FC, FH, FW);                    \
   }                                                                           \
   BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW))
 
 #define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL)      \
   static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \
-                              FH, FW)(int iters) {                            \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    SpatialConvolutionBackwardInput(iters, NT, N, H, W, C, FC, FH, FW);       \
+                              FH, FW)(::testing::benchmark::State & state) {  \
+    state.SetLabel(LABEL);                                                    \
+    SpatialConvolutionBackwardInput(state, NT, N, H, W, C, FC, FH, FW);       \
   }                                                                           \
   BENCHMARK(                                                                  \
       BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW))
 
 #define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL)     \
   static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
-                              FH, FW)(int iters) {                            \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    SpatialConvolutionBackwardKernel(iters, NT, N, H, W, C, FC, FH, FW);      \
+                              FH, FW)(::testing::benchmark::State & state) {  \
+    state.SetLabel(LABEL);                                                    \
+    SpatialConvolutionBackwardKernel(state, NT, N, H, W, C, FC, FH, FW);      \
   }                                                                           \
   BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC,  \
                             FH, FW))
@@ -248,20 +244,18 @@ BM_SpatialConvolutionsBwdKernel(32, 7, 7, 192, 384, 3, 3, "conv5_00_3x3");
 // Cuboid Convolutions                                                        //
 // -------------------------------------------------------------------------- //
 
-void CuboidConvolution(int iters, int num_threads,
+void CuboidConvolution(::testing::benchmark::State& state, int num_threads,
                        /* Input dimensions: */
                        int input_batches, int input_height, int input_width,
                        int input_planes, int input_depth,
                        /* Filter (kernel) dimensions: */
                        int filter_count, int filter_height, int filter_width,
                        int filter_planes) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = CuboidConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(
       input_batches, input_height, input_width, input_planes, input_depth);
@@ -274,10 +268,11 @@ void CuboidConvolution(int iters, int num_threads,
       (input_dims.TotalSize() / input_depth) * filter_count;
   auto flops = num_computed_elements *
                (input_depth * filter_height * filter_width * filter_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
-void CuboidConvolutionBackwardInput(int iters, int num_threads,
+void CuboidConvolutionBackwardInput(::testing::benchmark::State& state,
+                                    int num_threads,
                                     /* Input dimensions: */
                                     int input_batches, int input_height,
                                     int input_width, int input_planes,
@@ -285,13 +280,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
                                     /* Filter (kernel) dimensions: */
                                     int filter_count, int filter_height,
                                     int filter_width, int filter_planes) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = CuboidConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(
       input_batches, input_height, input_width, input_planes, input_depth);
@@ -303,10 +296,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
   auto num_computed_elements = input_dims.TotalSize();
   auto flops = num_computed_elements *
                (input_depth * filter_height * filter_width * filter_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
-void CuboidConvolutionBackwardKernel(int iters, int num_threads,
+void CuboidConvolutionBackwardKernel(::testing::benchmark::State& state,
+                                     int num_threads,
                                      /* Input dimensions: */
                                      int input_batches, int input_height,
                                      int input_width, int input_planes,
@@ -314,13 +308,11 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
                                      /* Filter (kernel) dimensions: */
                                      int filter_count, int filter_height,
                                      int filter_width, int filter_planes) {
-  ::tensorflow::testing::StopTiming();
-
   CREATE_THREAD_POOL(num_threads);
 
   using Benchmark = CuboidConvolutionBenchmarksSuite;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
 
   typename Benchmark::Dimensions input_dims(
       input_batches, input_height, input_width, input_planes, input_depth);
@@ -332,9 +324,16 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
   auto num_computed_elements = filter_dims.TotalSize();
   auto flops = num_computed_elements *
                (input_batches * input_height * input_width * input_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 
+// The multiple #'s in the function names + the `::testing::benchmark::State&`
+// as parameters apparently confuses clang if they are not on the same line. So
+// we need to turn off LINT and clang-format for this block.
+//
+// clang-format off
+// NOLINTBEGIN
+
 // Macro arguments names: --------------------------------------------------- //
 // NT: num threads
 // N:  batch size
@@ -354,33 +353,33 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
                  _f_##FC##_##FH##_##FW##_##FP)
 
 #define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL)        \
-  static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, \
-                             FP)(int iters) {                                 \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    CuboidConvolution(iters, NT, N, H, W, P, C, FC, FH, FW, FP);              \
+  static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
+    state.SetLabel(LABEL);                                                    \
+    CuboidConvolution(state, NT, N, H, W, P, C, FC, FH, FW, FP);              \
   }                                                                           \
   BENCHMARK(                                                                  \
       BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP))
 
 #define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
-  static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
-                             FH, FW, FP)(int iters) {                         \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    CuboidConvolutionBackwardInput(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
+  static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
+    state.SetLabel(LABEL);                                                    \
+    CuboidConvolutionBackwardInput(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
   }                                                                           \
   BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC,  \
                            FH, FW, FP))
 
 #define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP,      \
                                       LABEL)                                  \
-  static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C,   \
-                             FC, FH, FW, FP)(int iters) {                     \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
-    CuboidConvolutionBackwardKernel(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
+  static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
+    state.SetLabel(LABEL);                                                    \
+    CuboidConvolutionBackwardKernel(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
   }                                                                           \
   BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, \
                            FH, FW, FP))
 
+// NOLINTEND
+// clang-format on
+
 #define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL)           \
   BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL);              \
   BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL);              \
diff --git a/tensorflow/core/kernels/fused_batch_norm_op_test.cc b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
index 734fb294135..989fbc27b7c 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
@@ -283,18 +283,23 @@ static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
 // -------------------------------------------------------------------------- //
 // FusedBatchNorm inference
 // -------------------------------------------------------------------------- //
+// clang-format off
+// NOLINTBEGIN
+#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)          \
+  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) { \
+    test::Benchmark(                                                           \
+        #DEVICE,                                                               \
+        FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT),  \
+        /*old_benchmark_api*/ false)                                           \
+        .Run(state);                                                           \
+    state.SetItemsProcessed(state.iterations() * N * H * W * C);               \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE))     \
+      ->UseRealTime();
 
-#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)        \
-  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT,    \
-                      DEVICE)(int iters) {                                   \
-    testing::UseRealTime();                                                  \
-    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);      \
-    test::Benchmark(#DEVICE, FusedBatchNormInference<T>(                     \
-                                 N, H, W, C, IS_TRAINING, FORMAT_##FORMAT))  \
-        .Run(iters);                                                         \
-  }                                                                          \
-  BENCHMARK(                                                                 \
-      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
+// NOLINTEND
+// clang-format on
 
 BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
 BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
@@ -320,17 +325,19 @@ BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
 // FusedBatchNorm gradient
 // -------------------------------------------------------------------------- //
 
-#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)     \
-  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
-                      DEVICE)(int iters) {                                    \
-    testing::UseRealTime();                                                   \
-    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);       \
-    test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING,   \
-                                                   FORMAT_##FORMAT))          \
-        .Run(iters);                                                          \
-  }                                                                           \
-  BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,   \
-                    DEVICE));
+#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)      \
+  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,  \
+                      DEVICE)(::testing::benchmark::State & state) {           \
+    test::Benchmark(                                                           \
+        #DEVICE,                                                               \
+        FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT),       \
+        /*old_benchmark_api*/ false)                                           \
+        .Run(state);                                                           \
+    state.SetItemsProcessed(state.iterations() * N * H * W * C);               \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
+      ->UseRealTime();
 
 #define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE)    \
   BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);     \
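Also worth noting in the FusedBatchNorm hunks: with the new API, per-benchmark options such as real-time measurement are no longer requested by calling `testing::UseRealTime()` inside the body; they are chained onto the `BENCHMARK(...)` registration, which is why these macros now end in `->UseRealTime();`. A minimal sketch of that registration pattern, illustrative only and written against the open-source Google Benchmark API:

    #include "benchmark/benchmark.h"

    static void BM_DeviceLaunch(benchmark::State& state) {
      for (auto _ : state) {
        // ... enqueue asynchronous/device work here ...
      }
    }
    // Measurement options hang off the registration object rather than being
    // toggled from inside the benchmark body.
    BENCHMARK(BM_DeviceLaunch)->UseRealTime();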
diff --git a/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc b/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc
index 7c20b88845f..f4b54fb1c6a 100644
--- a/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc
+++ b/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc
@@ -98,14 +98,16 @@ static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m,
 // BS: boolean indicating whether to use the banded solver
 // T: C++ type of scalars (e.g. float, std::complex<double>)
 // TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
-#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D)                   \
-  static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT(      \
-      int iters) {                                                           \
-    testing::UseRealTime();                                                  \
-    testing::ItemsProcessed(static_cast<int64>(iters) * K * N + N * M);      \
-    test::Benchmark(#D, BandedTriangularSolve(K, N, M, BS, TT)).Run(iters);  \
-  }                                                                          \
-  BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT);
+#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D)                  \
+  static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT(     \
+      ::testing::benchmark::State& state) {                                 \
+    test::Benchmark(#D, BandedTriangularSolve(K, N, M, BS, TT),             \
+                    /*old_benchmark_api*/ false)                            \
+        .Run(state);                                                        \
+    state.SetItemsProcessed(state.iterations() * K * N + N * M);            \
+  }                                                                         \
+  BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT)       \
+      ->UseRealTime();
 
 #define BM_BandedTriangularSolve(K, N, M, BS, D)                \
   BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \
diff --git a/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc b/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc
index 7bb71ae8b68..e03f29340ae 100644
--- a/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc
+++ b/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc
@@ -101,18 +101,18 @@ static Graph* MatrixTriangularSolveWithBroadcast(int64 b0, int64 b1, int64 m,
 // T: C++ type of scalars (e.g. float, std::complex<double>)
 // TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
 // D: Device (e.g. cpu, gpu)
-#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)               \
-  static void                                                                 \
-      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
-          int iters) {                                                        \
-    testing::UseRealTime();                                                   \
-    testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \
-                            M * N * 2);                                       \
-    test::Benchmark(                                                          \
-        #D, MatrixTriangularSolveWithBroadcast(B1, B2, M, N, MB, TT))         \
-        .Run(iters);                                                          \
-  }                                                                           \
-  BENCHMARK(                                                                  \
+#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)               \
+  static void                                                                 \
+      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
+          ::testing::benchmark::State& state) {                               \
+    state.SetItemsProcessed(state.iterations() * std::max(B1, B2) * M * M *   \
+                            N * 2);                                           \
+    test::Benchmark(                                                          \
+        #D, MatrixTriangularSolveWithBroadcast(B1, B2, M, N, MB, TT),         \
+        /*old_benchmark_api*/ false)                                          \
+        .Run(state);                                                          \
+  }                                                                           \
+  BENCHMARK(                                                                  \
       BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index 92ddf8edbfb..a321f4739eb 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -56,20 +56,25 @@ static Graph* ConstructSpaceToBatchGraph(
 
 // The BM_Expand macro is needed for this to build with VC++.
 #define BM_Expand(x) x
+
+// Macro is already longer than 80 chars.
+// NOLINTBEGIN
 #define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10,  \
                            P11)                                               \
   static void                                                                 \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
-          int iters) {                                                        \
-    testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) * \
+          ::testing::benchmark::State& state) {                               \
+    test::Benchmark(                                                          \
+        #DEVICE,                                                              \
+        ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
+                                   {{P00, P01}, {P10, P11}}),                 \
+        /*old_benchmark_api*/ false)                                          \
+        .Run(state);                                                          \
+    state.SetItemsProcessed(state.iterations() * B * (H + P00 + P01) *        \
                             (W + P10 + P11) * D);                             \
-    test::Benchmark(#DEVICE, ConstructSpaceToBatchGraph(                      \
-                                 #OP, TensorShape({B, H, W, D}), BS, DTYPE,   \
-                                 {{P00, P01}, {P10, P11}}))                   \
-        .Run(iters);                                                          \
   }                                                                           \
   BENCHMARK(                                                                  \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
+// NOLINTEND
 
 #define BM_SpaceToBatch(OP, ...)                                   \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__));   \
   BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__));   \
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index 1dc51cd804c..a0f07d4c4de 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -107,36 +107,30 @@ static Graph* ReplicatedSparseMatMul(int m, int n, int d, float sparsity_1,
 #define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB)                          \
   static void                                                                 \
       BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \
-          int iters) {                                                        \
-    testing::StopTiming();                                                    \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2);       \
+          ::testing::benchmark::State& state) {                               \
     auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f", \
                                  TRA, TRB, S1 / 100.0, S2 / 100.0);           \
-    testing::SetLabel(label);                                                 \
-    testing::UseRealTime();                                                   \
+    state.SetLabel(label);                                                    \
     auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB); \
-    testing::StartTiming();                                                   \
-    test::Benchmark("cpu", g).Run(iters);                                     \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);        \
   }                                                                           \
   BENCHMARK(                                                                  \
-      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB);
+      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB) \
+      ->UseRealTime();
 
 #define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies)                         \
   static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
-      int iters) {                                                            \
-    testing::StopTiming();                                                    \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * Copies *  \
-                            2);                                               \
+      ::testing::benchmark::State& state) {                                   \
     auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f",        \
                                  (Copies), S1 / 100.0, S2 / 100.0);           \
-    testing::SetLabel(label);                                                 \
-    testing::UseRealTime();                                                   \
+    state.SetLabel(label);                                                    \
     auto g =                                                                  \
         ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies));    \
-    testing::StartTiming();                                                   \
-    test::Benchmark("cpu", g).Run(iters);                                     \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);        \
+    state.SetItemsProcessed(state.iterations() * M * K * N * Copies * 2);     \
   }                                                                           \
-  BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
+  BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies)  \
+      ->UseRealTime();
 
 #define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \
   BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float)
@@ -219,22 +213,21 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_1,
   return g;
 }
 
-#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)                              \
-  static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(     \
-      int iters) {                                                            \
-    testing::StopTiming();                                                    \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 2 *   \
-                            Copies);                                          \
-    auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies,  \
-                                 S1 / 100.0, S2 / 100.0);                     \
-    testing::SetLabel(label);                                                 \
-    testing::UseRealTime();                                                   \
-    auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies);      \
-    testing::StartTiming();                                                   \
-    test::Benchmark("cpu", g).Run(iters);                                     \
-  }                                                                           \
-  BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
-
+// clang-format off
+// NOLINTBEGIN
+#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)                              \
+  static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(::testing::benchmark::State& state) { \
+    auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies,  \
+                                 S1 / 100.0, S2 / 100.0);                     \
+    state.SetLabel(label);                                                    \
+    auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies);      \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);        \
+    state.SetItemsProcessed(state.iterations() * M * K * N * 2 * 2 * Copies); \
+  }                                                                           \
+  BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies)       \
+      ->UseRealTime();
+// NOLINTEND
+// clang-format on
 BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1);
 BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1);
 BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1);
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc
index 249ddbe8e63..b06f72d42e0 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc
@@ -68,19 +68,22 @@ static Graph* SparseTensorDenseMatmul(int nnz, int m, int k, int n,
   return g;
 }
 
+// NOLINTBEGIN
 #define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE)           \
   static void                                                                 \
       BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
-          int iters) {                                                        \
+          ::testing::benchmark::State& state) {                               \
     int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N));          \
-    testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter);      \
-    testing::BytesProcessed(static_cast<int64>(iters) * items_per_iter *      \
+    test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB),   \
+                    /*old_benchmark_api*/ false)                              \
+        .Run(state);                                                          \
+    state.SetItemsProcessed(state.iterations() * items_per_iter);             \
+    state.SetBytesProcessed(state.iterations() * items_per_iter *             \
                             sizeof(float));                                   \
-    test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB))   \
-        .Run(iters);                                                          \
   }                                                                           \
   BENCHMARK(                                                                  \
       BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
+// NOLINTEND
 
 #define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB)     \
   BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu);  \
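Finally, the `NOLINTBEGIN`/`clang-format off` fences added in several of these files exist because the macros both define and register a benchmark whose name is token-pasted from every parameter, and with the long `::testing::benchmark::State&` parameter the resulting signature lines cannot be wrapped to 80 columns. A cut-down sketch of that pattern, with a hypothetical macro and sizes that are not from the patch:

    #include "benchmark/benchmark.h"

    // Defines BM_Matmul_<M>_<N> and registers it in one go; the signature has
    // to stay on a single line, hence the lint/format suppressions upstream.
    #define BM_MATMUL(M, N)                                            \
      static void BM_Matmul_##M##_##N(benchmark::State& state) {       \
        for (auto _ : state) {                                         \
          int work = (M) * (N);                                        \
          benchmark::DoNotOptimize(work);                              \
        }                                                              \
        state.SetItemsProcessed(state.iterations() * (M) * (N));       \
      }                                                                \
      BENCHMARK(BM_Matmul_##M##_##N)

    BM_MATMUL(128, 256);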