diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 9cd61d7a089..88ef1482a6a 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -291,11 +291,11 @@ struct ResultCandidate {
   float box_coord[4];
 };
 
-void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
-           const float* scores_data, int num_boxes, int q, int num_classes,
-           const int size_per_class, const float score_threshold,
-           const float iou_threshold,
-           std::vector<ResultCandidate>& result_candidate_vec) {
+void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data,
+                   const float* scores_data, int num_boxes, int q,
+                   int num_classes, const int size_per_class,
+                   const float score_threshold, const float iou_threshold,
+                   std::vector<ResultCandidate>& result_candidate_vec) {
   std::vector<float> class_scores_data;
   class_scores_data.reserve(num_boxes);
   std::vector<float> class_boxes_data;
@@ -341,7 +341,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
 
   std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
   const Tensor const_boxes = boxes;
-  typename TTypes<float, 2>::ConstTensor boxes_data_1 =
+  typename TTypes<float, 2>::ConstTensor boxes_data_t =
       const_boxes.tensor<float, 2>();
   int candidate_idx = 0;
   float iou;
@@ -354,7 +354,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
     // in order to see if `next_candidate` should be suppressed.
     bool should_select = true;
     for (int j = selected.size() - 1; j >= 0; --j) {
-      iou = IOU<float>(boxes_data_1, next_candidate.box_index, selected[j]);
+      iou = IOU<float>(boxes_data_t, next_candidate.box_index, selected[j]);
       if (iou > iou_threshold) {
         should_select = false;
         break;
@@ -364,16 +364,13 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
     if (should_select) {
       // Add the selected box to the result candidate. Sorted by score
       int id = next_candidate.box_index;
-      auto& rc =
-          result_candidate_vec[selected.size() + size_per_class * class_idx];
+      result_candidate_vec[selected.size() + size_per_class * class_idx] = {
+          next_candidate.box_index,
+          next_candidate.score,
+          class_idx,
+          {boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2),
+           boxes_data_t(id, 3)}};
       selected.push_back(next_candidate.box_index);
-      rc.box_index = next_candidate.box_index;
-      rc.score = next_candidate.score;
-      rc.class_idx = class_idx;
-      rc.box_coord[0] = boxes_data_1(id, 0);
-      rc.box_coord[1] = boxes_data_1(id, 1);
-      rc.box_coord[2] = boxes_data_1(id, 2);
-      rc.box_coord[3] = boxes_data_1(id, 3);
     }
   }
 }
@@ -473,23 +470,24 @@ void BatchedNonMaxSuppressionOp(
     for (int idx = begin; idx < end; ++idx) {
       int batch_idx = idx / num_classes;
       int class_idx = idx % num_classes;
-      DoNMS(batch_idx, class_idx, boxes_data + boxes_per_batch * batch_idx,
-            scores_data + scores_per_batch * batch_idx, num_boxes, q,
-            num_classes, size_per_class, score_threshold, iou_threshold,
-            result_candidate_vec[batch_idx]);
+      DoNMSPerClass(batch_idx, class_idx,
+                    boxes_data + boxes_per_batch * batch_idx,
+                    scores_data + scores_per_batch * batch_idx, num_boxes, q,
+                    num_classes, size_per_class, score_threshold, iou_threshold,
+                    result_candidate_vec[batch_idx]);
     }
   };
 
   int length = num_batches * num_classes;
   // Input data boxes_data, scores_data
-  int input_bytes = length * num_boxes * 5;
-  int output_bytes = length * num_boxes * 5;
-  int compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
-                        Eigen::TensorOpCost::MulCost<int>() * 2 +
-                        Eigen::TensorOpCost::AddCost<float>() * 10 +
-                        Eigen::TensorOpCost::MulCost<float>() * 6 +
-                        Eigen::TensorOpCost::DivCost<float>()) *
-                       length;
+  int input_bytes = num_boxes * 10 * sizeof(float);
+  int output_bytes = num_boxes * 10 * sizeof(float);
+  int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
+                       Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
   const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
   const CPUDevice& d = context->eigen_device<CPUDevice>();
   d.parallelFor(length, cost, shard_nms);
@@ -519,14 +517,14 @@ void BatchedNonMaxSuppressionOp(
   };
   length = num_batches;
   // Input data boxes_data, scores_data
-  input_bytes = length * num_boxes * 5;
-  output_bytes = length * num_boxes * 5;
-  compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
-                    Eigen::TensorOpCost::MulCost<int>() * 2 +
-                    Eigen::TensorOpCost::AddCost<float>() * 10 +
-                    Eigen::TensorOpCost::MulCost<float>() * 6 +
-                    Eigen::TensorOpCost::DivCost<float>()) *
-                   length;
+  input_bytes =
+      num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  output_bytes =
+      num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
+                   Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
   const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
                                         compute_cycles);
   d.parallelFor(length, cost_result, shard_result);
@@ -561,14 +559,11 @@ void BatchedNonMaxSuppressionOp(
   };
   length = num_batches * per_batch_size;
   // Input data boxes_data, scores_data
-  input_bytes = length * per_batch_size * 6;
-  output_bytes = length * per_batch_size * 6;
-  compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
-                    Eigen::TensorOpCost::MulCost<int>() * 2 +
-                    Eigen::TensorOpCost::AddCost<float>() * 10 +
-                    Eigen::TensorOpCost::MulCost<float>() * 6 +
-                    Eigen::TensorOpCost::DivCost<float>()) *
-                   length;
+  input_bytes = 6 * sizeof(float);
+  output_bytes = 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
+                   Eigen::TensorOpCost::MulCost<int>() * 2 +
+                   Eigen::TensorOpCost::DivCost<float>() * 2;
   const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
                                              compute_cycles);
   d.parallelFor(length, cost_copy_result, shard_copy_result);
diff --git a/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
index 920b2af40c4..40c8d77ec9d 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -56,9 +56,23 @@ static Graph* BM_CombinedNonMaxSuppression(int batches, int box_num,
   }                                                                          \
   BENCHMARK(BM_CombinedNMS_##DEVICE##_##B##_##BN##_##CN##_##Q);
 
-BM_CombinedNonMaxSuppressionDev(cpu, 1, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 28, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 32, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 64, 1917, 90, 1);
+#define BM_Batch(BN, CN, Q)                            \
+  BM_CombinedNonMaxSuppressionDev(cpu, 1, BN, CN, Q);  \
+  BM_CombinedNonMaxSuppressionDev(cpu, 28, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 32, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 64, BN, CN, Q);
+
+#define BN_Boxes_Number(CN, Q) \
+  BM_Batch(500, CN, Q);        \
+  BM_Batch(1000, CN, Q);       \
+  BM_Batch(1917, CN, Q);       \
+  BM_Batch(2500, CN, Q);
+
+BN_Boxes_Number(25, 1);
+BN_Boxes_Number(25, 25);
+BN_Boxes_Number(90, 1);
+BN_Boxes_Number(90, 90);
+BN_Boxes_Number(200, 1);
+BN_Boxes_Number(200, 200);
 
 }  // namespace tensorflow