Updated benchmarks to newer API

PiperOrigin-RevId: 358013972 Change-Id: I99f0f538a39845408fbc29dcd60652c42eaf652e
2021-02-17 12:58:26 -08:00 · 2021-02-17 12:58:26 -08:00 · 1178262a2a
commit 1178262a2a
parent 7d45aa8560
11 changed files with 210 additions and 213 deletions
--- a/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
@ -116,12 +116,15 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
 #define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING,    \
                           type)                                               \
  static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH,   \
-                      FW, FC, SH, SW, PADDING)(int iters) {                    \
+                      FW, FC, SH, SW,                                          \
-    testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *      \
+                      PADDING)(::testing::benchmark::State & state) {          \
-                            (C));                                              \
+    test::Benchmark(#type,                                                     \
-    test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \
+                    Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, SW,    \
-                                                   SW, PADDING, FORMAT_##FMT)) \
+                                            PADDING, FORMAT_##FMT),            \
-        .Run(iters);                                                           \
+                    /*old_benchmark_api*/ false)                               \
        .Run(state);                                                           \
    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) *     \
                            (H) * (W) * (C));                                  \
  }                                                                            \
  BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \
                    FC, SH, SW, PADDING));
--- a/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc
@ -84,9 +84,9 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
          .Input(backprop)
          .Attr("T", DataTypeToEnum<T>::value)
          .Attr("strides", {1, stride_h, stride_w, 1})
-          .Attr("padding", padding == Padding::SAME
+          .Attr("padding", padding == Padding::SAME    ? "SAME"
-                               ? "SAME"
+                           : padding == Padding::VALID ? "VALID"
-                               : padding == Padding::VALID ? "VALID" : "N/A")
+                                                       : "N/A")
          .Attr("data_format", ToString(data_format))
          .Finalize(graph, &conv2d));
@ -115,12 +115,14 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
 #define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING,    \
                          type)                                               \
  static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH,   \
-                      FW, FC, SH, SW, PADDING)(int iters) {                   \
+                      FW, FC, SH, SW,                                         \
-    testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *     \
+                      PADDING)(::testing::benchmark::State & state) {         \
-                            (C));                                             \
+    test::Benchmark(#type,                                                    \
-    test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, \
+                    Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, SW,    \
-                                                  SW, PADDING, FORMAT_##FMT)) \
+                                           PADDING, FORMAT_##FMT),            \
-        .Run(iters);                                                          \
+                    /*old_benchmark_api*/ false)                              \
        .Run(state);                                                          \
    state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C));      \
  }                                                                           \
  BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \
                    FC, SH, SW, PADDING));
--- a/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc
+++ b/tensorflow/core/kernels/data/experimental/snapshot_util_test.cc
@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) {
  SnapshotRoundTrip(io::compression::kSnappy, 2);
 }
-void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
+void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state,
-                                 int version) {
+                                 std::string compression_type, int version) {
  tensorflow::testing::StopTiming();
  tensorflow::DataTypeVector dtypes;
  std::vector<Tensor> tensors;
  GenerateTensorVector(dtypes, tensors);
@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
  TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
                              compression_type, version, dtypes, &writer));
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    writer->WriteTensors(tensors).IgnoreError();
  }
  TF_ASSERT_OK(writer->Close());
@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
  TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type,
                              version, dtypes, &reader));
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    std::vector<Tensor> read_tensors;
    reader->ReadTensors(&read_tensors).IgnoreError();
  }
  tensorflow::testing::StopTiming();
  TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
 }
-void SnapshotCustomReaderNoneBenchmark(int iters) {
+void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1);
+  SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1);
 }
-void SnapshotCustomReaderGzipBenchmark(int iters) {
+void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1);
+  SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1);
 }
-void SnapshotCustomReaderSnappyBenchmark(int iters) {
+void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1);
+  SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1);
 }
-void SnapshotTFRecordReaderNoneBenchmark(int iters) {
+void SnapshotTFRecordReaderNoneBenchmark(::testing::benchmark::State& state) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2);
+  SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 2);
 }
-void SnapshotTFRecordReaderGzipBenchmark(int iters) {
+void SnapshotTFRecordReaderGzipBenchmark(::testing::benchmark::State& state) {
-  SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 2);
+  SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 2);
 }
 BENCHMARK(SnapshotCustomReaderNoneBenchmark);
@ -151,10 +147,8 @@ BENCHMARK(SnapshotCustomReaderSnappyBenchmark);
 BENCHMARK(SnapshotTFRecordReaderNoneBenchmark);
 BENCHMARK(SnapshotTFRecordReaderGzipBenchmark);
-void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
+void SnapshotWriterBenchmarkLoop(::testing::benchmark::State& state,
-                                 int version) {
+                                 std::string compression_type, int version) {
  tensorflow::testing::StopTiming();
  tensorflow::DataTypeVector dtypes;
  std::vector<Tensor> tensors;
  GenerateTensorVector(dtypes, tensors);
@ -166,38 +160,36 @@ void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
  TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
                              compression_type, version, dtypes, &writer));
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    writer->WriteTensors(tensors).IgnoreError();
  }
  writer->Close().IgnoreError();
  tensorflow::testing::StopTiming();
  TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
 }
-void SnapshotCustomWriterNoneBenchmark(int iters) {
+void SnapshotCustomWriterNoneBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 1);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 1);
 }
-void SnapshotCustomWriterGzipBenchmark(int iters) {
+void SnapshotCustomWriterGzipBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 1);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 1);
 }
-void SnapshotCustomWriterSnappyBenchmark(int iters) {
+void SnapshotCustomWriterSnappyBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 1);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 1);
 }
-void SnapshotTFRecordWriterNoneBenchmark(int iters) {
+void SnapshotTFRecordWriterNoneBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 2);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 2);
 }
-void SnapshotTFRecordWriterGzipBenchmark(int iters) {
+void SnapshotTFRecordWriterGzipBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 2);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 2);
 }
-void SnapshotTFRecordWriterSnappyBenchmark(int iters) {
+void SnapshotTFRecordWriterSnappyBenchmark(::testing::benchmark::State& state) {
-  SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 2);
+  SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 2);
 }
 BENCHMARK(SnapshotCustomWriterNoneBenchmark);
--- a/tensorflow/core/kernels/eigen_benchmark.h
+++ b/tensorflow/core/kernels/eigen_benchmark.h
@ -35,8 +35,9 @@ class SpatialConvolutionBenchmarksSuite {
  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
-  SpatialConvolutionBenchmarksSuite(int iters, Device& device)
+  SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state,
-      : iters_(iters), device_(device) {}
+                                    Device& device)
      : state_(state), device_(device) {}
  Eigen::Index BufferSize(const Dimensions& dims) {
    return dims.TotalSize() * sizeof(Scalar);
@ -62,12 +63,10 @@ class SpatialConvolutionBenchmarksSuite {
    Filter filter(filter_data, filter_dims);
    Output output(output_data, output_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      output.device(device_) = Eigen::SpatialConvolution(input, filter);
      tensorflow::testing::DoNotOptimize(output);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(input_data);
    device_.deallocate(filter_data);
@ -102,13 +101,11 @@ class SpatialConvolutionBenchmarksSuite {
    OutputBackward output_backward(output_backward_data, output_dims);
    InputBackward input_backward(input_backward_data, input_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput(
          filter, output_backward, input_rows, input_cols);
      tensorflow::testing::DoNotOptimize(input_backward);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(filter_data);
    device_.deallocate(output_backward_data);
@ -143,13 +140,11 @@ class SpatialConvolutionBenchmarksSuite {
    OutputBackward output_backward(output_backward_data, input_dims);
    FilterBackward filter_backward(filter_backward_data, filter_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel(
          input, output_backward, filter_rows, filter_cols);
      tensorflow::testing::DoNotOptimize(filter_backward);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(input_data);
    device_.deallocate(output_backward_data);
@ -157,7 +152,8 @@ class SpatialConvolutionBenchmarksSuite {
  }
 private:
-  int iters_;
+  ::testing::benchmark::State& state_;
  Device& device_;
 };
@ -170,8 +166,9 @@ class CuboidConvolutionBenchmarksSuite {
  using Dimensions = Eigen::DSizes<Eigen::Index, 5>;
-  CuboidConvolutionBenchmarksSuite(int iters, Device& device)
+  CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state,
-      : iters_(iters), device_(device) {}
+                                   Device& device)
      : state_(state), device_(device) {}
  Eigen::Index BufferSize(const Dimensions& dims) {
    return dims.TotalSize() * sizeof(Scalar);
@ -198,12 +195,10 @@ class CuboidConvolutionBenchmarksSuite {
    Filter filter(filter_data, filter_dims);
    Output output(output_data, output_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      output.device(device_) = Eigen::CuboidConvolution(input, filter);
      tensorflow::testing::DoNotOptimize(output);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(input_data);
    device_.deallocate(filter_data);
@ -240,13 +235,11 @@ class CuboidConvolutionBenchmarksSuite {
    OutputBackward output_backward(output_backward_data, output_dims);
    InputBackward input_backward(input_backward_data, input_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput(
          filter, output_backward, input_planes, input_rows, input_cols);
      tensorflow::testing::DoNotOptimize(input_backward);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(filter_data);
    device_.deallocate(output_backward_data);
@ -283,13 +276,11 @@ class CuboidConvolutionBenchmarksSuite {
    OutputBackward output_backward(output_backward_data, output_dims);
    FilterBackward filter_backward(filter_backward_data, filter_dims);
-    ::tensorflow::testing::StartTiming();
+    for (auto s : state_) {
    for (int i = 0; i < iters_; ++i) {
      filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel(
          input, output_backward, filter_planes, filter_rows, filter_cols);
      tensorflow::testing::DoNotOptimize(filter_backward);
    }
    ::tensorflow::testing::StopTiming();
    device_.deallocate(input_data);
    device_.deallocate(output_backward_data);
@ -297,7 +288,7 @@ class CuboidConvolutionBenchmarksSuite {
  }
 private:
-  int iters_;
+  ::testing::benchmark::State& state_;
  Device& device_;
 };
--- a/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
+++ b/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
@ -27,19 +27,17 @@ limitations under the License.
 // Spatial Convolutions                                                       //
 // -------------------------------------------------------------------------- //
-void SpatialConvolution(int iters, int num_threads,
+void SpatialConvolution(::testing::benchmark::State& state, int num_threads,
                        /* Input dimensions: */
                        int input_batches, int input_height, int input_width,
                        int input_depth,
                        /* Filter (kernel) dimensions: */
                        int filter_count, int filter_height, int filter_width) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                            input_width, input_depth);
@ -52,23 +50,22 @@ void SpatialConvolution(int iters, int num_threads,
      (input_dims.TotalSize() / input_depth) * filter_count;
  auto flops =
      num_computed_elements * (input_depth * filter_height * filter_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
-void SpatialConvolutionBackwardInput(int iters, int num_threads,
+void SpatialConvolutionBackwardInput(::testing::benchmark::State& state,
                                     int num_threads,
                                     /* Input dimensions: */
                                     int input_batches, int input_height,
                                     int input_width, int input_depth,
                                     /* Filter (kernel) dimensions: */
                                     int filter_count, int filter_height,
                                     int filter_width) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                            input_width, input_depth);
@ -80,23 +77,22 @@ void SpatialConvolutionBackwardInput(int iters, int num_threads,
  auto num_computed_elements = input_dims.TotalSize();
  auto flops =
      num_computed_elements * (input_depth * filter_height * filter_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
-void SpatialConvolutionBackwardKernel(int iters, int num_threads,
+void SpatialConvolutionBackwardKernel(::testing::benchmark::State& state,
                                      int num_threads,
                                      /* Input dimensions: */
                                      int input_batches, int input_height,
                                      int input_width, int input_depth,
                                      /* Filter (kernel) dimensions: */
                                      int filter_count, int filter_height,
                                      int filter_width) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(input_batches, input_height,
                                            input_width, input_depth);
@ -108,7 +104,7 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
  auto num_computed_elements = filter_dims.TotalSize();
  auto flops =
      num_computed_elements * (input_batches * input_height * input_width);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 // Macro arguments names: --------------------------------------------------- //
@ -126,26 +122,26 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
 #define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL)          \
  static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, \
-                              FW)(int iters) {                            \
+                              FW)(::testing::benchmark::State & state) {  \
-    ::tensorflow::testing::SetLabel(LABEL);                               \
+    state.SetLabel(LABEL);                                                \
-    SpatialConvolution(iters, NT, N, H, W, C, FC, FH, FW);                \
+    SpatialConvolution(state, NT, N, H, W, C, FC, FH, FW);                \
  }                                                                       \
  BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW))
 #define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL)      \
  static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \
-                              FH, FW)(int iters) {                            \
+                              FH, FW)(::testing::benchmark::State & state) {  \
-    ::tensorflow::testing::SetLabel(LABEL);                                   \
+    state.SetLabel(LABEL);                                                    \
-    SpatialConvolutionBackwardInput(iters, NT, N, H, W, C, FC, FH, FW);       \
+    SpatialConvolutionBackwardInput(state, NT, N, H, W, C, FC, FH, FW);       \
  }                                                                           \
  BENCHMARK(                                                                  \
      BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW))
 #define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL)      \
  static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
-                              FH, FW)(int iters) {                             \
+                              FH, FW)(::testing::benchmark::State & state) {   \
-    ::tensorflow::testing::SetLabel(LABEL);                                    \
+    state.SetLabel(LABEL);                                                     \
-    SpatialConvolutionBackwardKernel(iters, NT, N, H, W, C, FC, FH, FW);       \
+    SpatialConvolutionBackwardKernel(state, NT, N, H, W, C, FC, FH, FW);       \
  }                                                                            \
  BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC,   \
                            FH, FW))
@ -248,20 +244,18 @@ BM_SpatialConvolutionsBwdKernel(32, 7, 7, 192, 384, 3, 3, "conv5_00_3x3");
 // Cuboid Convolutions                                                        //
 // -------------------------------------------------------------------------- //
-void CuboidConvolution(int iters, int num_threads,
+void CuboidConvolution(::testing::benchmark::State& state, int num_threads,
                       /* Input dimensions: */
                       int input_batches, int input_height, int input_width,
                       int input_planes, int input_depth,
                       /* Filter (kernel) dimensions: */
                       int filter_count, int filter_height, int filter_width,
                       int filter_planes) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(
      input_batches, input_height, input_width, input_planes, input_depth);
@ -274,10 +268,11 @@ void CuboidConvolution(int iters, int num_threads,
      (input_dims.TotalSize() / input_depth) * filter_count;
  auto flops = num_computed_elements *
               (input_depth * filter_height * filter_width * filter_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
-void CuboidConvolutionBackwardInput(int iters, int num_threads,
+void CuboidConvolutionBackwardInput(::testing::benchmark::State& state,
                                    int num_threads,
                                    /* Input dimensions: */
                                    int input_batches, int input_height,
                                    int input_width, int input_planes,
@ -285,13 +280,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
                                    /* Filter (kernel) dimensions: */
                                    int filter_count, int filter_height,
                                    int filter_width, int filter_planes) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(
      input_batches, input_height, input_width, input_planes, input_depth);
@ -303,10 +296,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
  auto num_computed_elements = input_dims.TotalSize();
  auto flops = num_computed_elements *
               (input_depth * filter_height * filter_width * filter_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
-void CuboidConvolutionBackwardKernel(int iters, int num_threads,
+void CuboidConvolutionBackwardKernel(::testing::benchmark::State& state,
                                     int num_threads,
                                     /* Input dimensions: */
                                     int input_batches, int input_height,
                                     int input_width, int input_planes,
@ -314,13 +308,11 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
                                     /* Filter (kernel) dimensions: */
                                     int filter_count, int filter_height,
                                     int filter_width, int filter_planes) {
  ::tensorflow::testing::StopTiming();
  CREATE_THREAD_POOL(num_threads);
  using Benchmark =
      CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
-  auto benchmark = Benchmark(iters, device);
+  auto benchmark = Benchmark(state, device);
  typename Benchmark::Dimensions input_dims(
      input_batches, input_height, input_width, input_planes, input_depth);
@ -332,9 +324,16 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
  auto num_computed_elements = filter_dims.TotalSize();
  auto flops = num_computed_elements *
               (input_batches * input_height * input_width * input_planes);
-  ::tensorflow::testing::ItemsProcessed(flops * iters);
+  state.SetItemsProcessed(flops * state.iterations());
 }
 // The multiple #'s in the function names, together with the
 // `::testing::benchmark::State&` parameter, apparently confuse clang when they
 // are not on the same line, so LINT and clang-format are turned off for this
 // block.
 //
 // clang-format off
 // NOLINTBEGIN
 // Macro arguments names: --------------------------------------------------- //
 //   NT: num threads
 //    N: batch size
@ -354,33 +353,33 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
            _f_##FC##_##FH##_##FW##_##FP)
 #define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL)         \
-  static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, \
+  static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) {                   \
-                             FP)(int iters) {                                  \
+    state.SetLabel(LABEL);                                    \
-    ::tensorflow::testing::SetLabel(LABEL);                                    \
+    CuboidConvolution(state, NT, N, H, W, P, C, FC, FH, FW, FP);               \
    CuboidConvolution(iters, NT, N, H, W, P, C, FC, FH, FW, FP);               \
  }                                                                            \
  BENCHMARK(                                                                   \
      BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP))
 #define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
-  static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
+  static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) {           \
-                             FH, FW, FP)(int iters) {                          \
+    state.SetLabel(LABEL);                                    \
-    ::tensorflow::testing::SetLabel(LABEL);                                    \
+    CuboidConvolutionBackwardInput(state, NT, N, H, W, P, C, FC, FH, FW, FP);  \
    CuboidConvolutionBackwardInput(iters, NT, N, H, W, P, C, FC, FH, FW, FP);  \
  }                                                                            \
  BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC,   \
                           FH, FW, FP))
 #define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP,       \
                                      LABEL)                                   \
-  static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C,    \
+  static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) {       \
-                             FC, FH, FW, FP)(int iters) {                      \
+    state.SetLabel(LABEL);                                    \
-    ::tensorflow::testing::SetLabel(LABEL);                                    \
+    CuboidConvolutionBackwardKernel(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
    CuboidConvolutionBackwardKernel(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
  }                                                                            \
  BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC,  \
                           FH, FW, FP))
 // NOLINTEND
 // clang-format on
 #define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL) \
  BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL);    \
  BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL);    \
--- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
@ -283,18 +283,23 @@ static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
 // -------------------------------------------------------------------------- //
 // FusedBatchNorm inference
 // -------------------------------------------------------------------------- //
 // clang-format off
 // NOLINTBEGIN
 #define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)         \
  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) {                     \
    test::Benchmark(                                                          \
        #DEVICE,                                                              \
        FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
        /*old_benchmark_api*/ false)                                          \
        .Run(state);                                                          \
    state.SetItemsProcessed(state.iterations() * N * H * W * C);              \
  }                                                                           \
  BENCHMARK(                                                                  \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE))    \
      ->UseRealTime();
-#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)       \
+// NOLINTEND
-  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT,   \
+// clang-format on
                      DEVICE)(int iters) {                                  \
    testing::UseRealTime();                                                 \
    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);     \
    test::Benchmark(#DEVICE, FusedBatchNormInference<T>(                    \
                                 N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
        .Run(iters);                                                        \
  }                                                                         \
  BENCHMARK(                                                                \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
 BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
 BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
@ -320,17 +325,19 @@ BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
 // FusedBatchNorm gradient
 // -------------------------------------------------------------------------- //
 // BM_FusedBatchNormGrad defines a static benchmark function (named via
 // BM_NAME) that runs the FusedBatchNormGrad<T> graph on DEVICE through
 // test::Benchmark(...).Run(state), then reports
 // iterations * N * H * W * C items processed. The function is registered
 // with BENCHMARK(...)->UseRealTime().
-#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)     \
+#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)      \
-  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
+  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,  \
-                      DEVICE)(int iters) {                                    \
+                      DEVICE)(::testing::benchmark::State & state) {           \
-    testing::UseRealTime();                                                   \
+    test::Benchmark(                                                           \
-    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);       \
+        #DEVICE,                                                               \
-    test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING,   \
+        FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT),       \
-                                                   FORMAT_##FORMAT))          \
+        /*old_benchmark_api*/ false)                                           \
-        .Run(iters);                                                          \
+        .Run(state);                                                           \
-  }                                                                           \
+    state.SetItemsProcessed(state.iterations() * N * H * W * C);               \
-  BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,   \
+  }                                                                            \
-                    DEVICE));
+  BENCHMARK(                                                                   \
      BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
      ->UseRealTime();
 #define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
  BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);  \
--- a/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc
+++ b/tensorflow/core/kernels/linalg/banded_triangular_solve_op_test.cc
@ -98,14 +98,16 @@ static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m,
 //   BS: boolean indicating whether to use the banded solver
 //    T: C++ type of scalars (e.g. float, std::complex)
 //   TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
 // The generated benchmark runs the BandedTriangularSolve<T> graph on device D
 // and then reports iterations * (K * N + N * M) items processed. The sum is
 // parenthesised so that both terms scale with the iteration count.
-#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D)                     \
+#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D)              \
-  static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT(        \
+  static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
-      int iters) {                                                             \
+      ::testing::benchmark::State& state) {                             \
-    testing::UseRealTime();                                                    \
+    test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT),      \
-    testing::ItemsProcessed(static_cast<int64>(iters) * K * N + N * M);        \
+                    /*old_benchmark_api*/ false)                        \
-    test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT)).Run(iters); \
+        .Run(state);                                                    \
-  }                                                                            \
+    state.SetItemsProcessed(state.iterations() * (K * N + N * M));      \
-  BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT);
+  }                                                                     \
  BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT)   \
      ->UseRealTime();
 #define BM_BandedTriangularSolve(K, N, M, BS, D)                \
  BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \
--- a/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc
+++ b/tensorflow/core/kernels/linalg/matrix_triangular_solve_op_test.cc
@ -101,18 +101,18 @@ static Graph* MatrixTriangularSolveWithBroadcast(int64 b0, int64 b1, int64 m,
 //    T: C++ type of scalars (e.g. float, std::complex)
 //   TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
 //    D: Device (e.g. cpu, gpu)
 // The generated benchmark runs the MatrixTriangularSolveWithBroadcast<T>
 // graph on device D. Items processed are reported after Run(), as in the
 // other migrated benchmark macros, so that state.iterations() reflects the
 // iterations that were executed. ->UseRealTime() takes the place of the old
 // testing::UseRealTime() call.
-#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)                \
+#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)               \
-  static void                                                                  \
+  static void                                                                 \
-      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D(  \
+      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
-          int iters) {                                                         \
+          ::testing::benchmark::State& state) {                               \
-    testing::UseRealTime();                                                    \
+    test::Benchmark(                                                          \
-    testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \
+        #D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT),      \
-                            M * N * 2);                                        \
+        /*old_benchmark_api*/ false)                                          \
-    test::Benchmark(                                                           \
+        .Run(state);                                                          \
-        #D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT))       \
+    state.SetItemsProcessed(state.iterations() * std::max(B1, B2) * M * M *   \
-        .Run(iters);                                                           \
+                            N * 2);                                           \
-  }                                                                            \
+  }                                                                           \
-  BENCHMARK(                                                                   \
+  BENCHMARK(                                                                  \
-      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D);
+      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D) \
+      ->UseRealTime();
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@ -56,20 +56,25 @@ static Graph* ConstructSpaceToBatchGraph(
 // The BM_Expand macro is needed for this to build with VC++.
 #define BM_Expand(x) x
 // Macro is already longer than 80 chars.
 // NOLINTBEGIN
 // Defines and registers one benchmark that runs the graph built by
 // ConstructSpaceToBatchGraph for op OP on DEVICE, then reports
 // iterations * B * (H + P00 + P01) * (W + P10 + P11) * D items processed.
 #define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10,                            \
                           P11)                                                                         \
  static void                                                                                           \
      BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
-          int iters) {                                                                                  \
+          ::testing::benchmark::State& state) {                                                         \
-    testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) *                           \
+    test::Benchmark(                                                                                    \
        #DEVICE,                                                                                        \
        ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}), BS, DTYPE,                           \
                                   {{P00, P01}, {P10, P11}}),                                           \
        /*old_benchmark_api*/ false)                                                                    \
        .Run(state);                                                                                    \
    state.SetItemsProcessed(state.iterations() * B * (H + P00 + P01) *                                  \
                            (W + P10 + P11) * D);                                                       \
-    test::Benchmark(#DEVICE, ConstructSpaceToBatchGraph(                                                \
-                                 #OP, TensorShape({B, H, W, D}), BS, DTYPE,                             \
-                                 {{P00, P01}, {P10, P11}}))                                             \
-        .Run(iters);                                                                                    \
  }                                                                                                     \
  BENCHMARK(                                                                                            \
      BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
 // NOLINTEND
 #define BM_SpaceToBatch(OP, ...)                                 \
  BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
  BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@ -107,36 +107,30 @@ static Graph* ReplicatedSparseMatMul(int m, int n, int d, float sparsity_1,
 // Defines and registers one SparseMatMul<TA, TB> benchmark on "cpu". The
 // label records the transpose flags and the sparsities (S1 and S2 are divided
 // by 100.0). After Run() it reports iterations * M * K * N * 2 items
 // processed, and the benchmark is registered with ->UseRealTime().
 #define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB)                           \
  static void                                                                  \
      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \
-          int iters) {                                                         \
+          ::testing::benchmark::State& state) {                                \
-    testing::StopTiming();                                                     \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2);        \
    auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f",  \
                                 TRA, TRB, S1 / 100.0, S2 / 100.0);            \
-    testing::SetLabel(label);                                                  \
+    state.SetLabel(label);                                                     \
-    testing::UseRealTime();                                                    \
    auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB);  \
-    testing::StartTiming();                                                    \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);         \
-    test::Benchmark("cpu", g).Run(iters);                                      \
+    state.SetItemsProcessed(state.iterations() * M * K * N * 2);               \
  }                                                                            \
  BENCHMARK(                                                                   \
-      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB);
+      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB) \
      ->UseRealTime();
 // Defines and registers one ReplicatedSparseMatMul benchmark on "cpu". The
 // label records the copy count and the sparsities (S1 and S2 are divided by
 // 100.0). After Run() it reports iterations * M * K * N * Copies * 2 items
 // processed, and the benchmark is registered with ->UseRealTime().
 #define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies)                          \
  static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
-      int iters) {                                                             \
+      ::testing::benchmark::State& state) {                                    \
-    testing::StopTiming();                                                     \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * Copies *   \
-                            2);                                                \
    auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f",         \
                                 (Copies), S1 / 100.0, S2 / 100.0);            \
-    testing::SetLabel(label);                                                  \
+    state.SetLabel(label);                                                     \
-    testing::UseRealTime();                                                    \
    auto g =                                                                   \
        ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies));     \
-    testing::StartTiming();                                                    \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);         \
-    test::Benchmark("cpu", g).Run(iters);                                      \
+    state.SetItemsProcessed(state.iterations() * M * K * N * Copies * 2);      \
  }                                                                            \
-  BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
+  BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies)   \
      ->UseRealTime();
 // Shorthand for BM_SPARSE with both type arguments (TA, TB) set to float.
 #define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \
  BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float)
@ -219,22 +213,21 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_1,
  return g;
 }
-#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)                             \
+// clang-format off
-  static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(    \
+// NOLINTBEGIN
 // BM_SPARSE_MULTI defines and registers one MultiSparseMatMul benchmark on
 // "cpu". The label is built from M, K, N, Copies and the sparsities (S1 and
 // S2 are divided by 100.0). After Run() it reports
 // iterations * M * K * N * 2 * 2 * Copies items processed, and the benchmark
 // is registered with ->UseRealTime().
-      int iters) {                                                           \
+#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)                              \
-    testing::StopTiming();                                                   \
+  static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(::testing::benchmark::State& state) {                                              \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 2 *  \
+    auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies,  \
-                            Copies);                                         \
+                                 S1 / 100.0, S2 / 100.0);                     \
-    auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
+    state.SetLabel(label);                                                    \
-                                 S1 / 100.0, S2 / 100.0);                    \
+    auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies);      \
-    testing::SetLabel(label);                                                \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);        \
-    testing::UseRealTime();                                                  \
+    state.SetItemsProcessed(state.iterations() * M * K * N * 2 * 2 * Copies); \
-    auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies);     \
+  }                                                                           \
-    testing::StartTiming();                                                  \
+  BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies)       \
-    test::Benchmark("cpu", g).Run(iters);                                    \
+      ->UseRealTime();
-  }                                                                          \
+// NOLINTEND
-  BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
+// clang-format on
 BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1);
 BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1);
 BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1);
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc
@ -68,19 +68,22 @@ static Graph* SparseTensorDenseMatmul(int nnz, int m, int k, int n,
  return g;
 }
 // NOLINTBEGIN
 // Defines and registers one SparseTensorDenseMatmul benchmark on DEVICE.
 // items_per_iter is NNZ * (TB ? K : N). After Run() the benchmark reports
 // iterations * items_per_iter items processed and that count times
 // sizeof(float) as bytes processed.
 #define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE)                  \
  static void                                                                        \
      BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
-          int iters) {                                                               \
+          ::testing::benchmark::State& state) {                                      \
    int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N));                 \
-    testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter);             \
+    test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB),          \
-    testing::BytesProcessed(static_cast<int64>(iters) * items_per_iter *             \
+                    /*old_benchmark_api*/ false)                                     \
        .Run(state);                                                                 \
    state.SetItemsProcessed(state.iterations() * items_per_iter);                    \
    state.SetBytesProcessed(state.iterations() * items_per_iter *                    \
                            sizeof(float));                                          \
-    test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB))          \
-        .Run(iters);                                                                 \
  }                                                                                  \
  BENCHMARK(                                                                         \
      BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
 // NOLINTEND
 #define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB)    \
  BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu); \