From b843ed5862358a75200c60b30e2a468b1309ec72 Mon Sep 17 00:00:00 2001
From: "Li, Guizi"
Date: Mon, 24 Feb 2020 21:17:21 +0800
Subject: [PATCH] refine the cost in NMS and add more test

---
 .../core/kernels/non_max_suppression_op.cc    | 83 +++++++++----------
 .../non_max_suppression_op_benchmark_test.cc  | 24 ++++--
 2 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 9cd61d7a089..88ef1482a6a 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -291,11 +291,11 @@ struct ResultCandidate {
   float box_coord[4];
 };
 
-void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
-           const float* scores_data, int num_boxes, int q, int num_classes,
-           const int size_per_class, const float score_threshold,
-           const float iou_threshold,
-           std::vector<ResultCandidate>& result_candidate_vec) {
+void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data,
+                   const float* scores_data, int num_boxes, int q,
+                   int num_classes, const int size_per_class,
+                   const float score_threshold, const float iou_threshold,
+                   std::vector<ResultCandidate>& result_candidate_vec) {
   std::vector<float> class_scores_data;
   class_scores_data.reserve(num_boxes);
   std::vector<float> class_boxes_data;
@@ -341,7 +341,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
   std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
 
   const Tensor const_boxes = boxes;
-  typename TTypes<float, 2>::ConstTensor boxes_data_1 =
+  typename TTypes<float, 2>::ConstTensor boxes_data_t =
       const_boxes.tensor<float, 2>();
   int candidate_idx = 0;
   float iou;
@@ -354,7 +354,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
     // in order to see if `next_candidate` should be suppressed.
     bool should_select = true;
     for (int j = selected.size() - 1; j >= 0; --j) {
-      iou = IOU(boxes_data_1, next_candidate.box_index, selected[j]);
+      iou = IOU(boxes_data_t, next_candidate.box_index, selected[j]);
       if (iou > iou_threshold) {
         should_select = false;
         break;
@@ -364,16 +364,13 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
     if (should_select) {
       // Add the selected box to the result candidate. Sorted by score
      int id = next_candidate.box_index;
-      auto& rc =
-          result_candidate_vec[selected.size() + size_per_class * class_idx];
+      result_candidate_vec[selected.size() + size_per_class * class_idx] = {
+          next_candidate.box_index,
+          next_candidate.score,
+          class_idx,
+          {boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2),
+           boxes_data_t(id, 3)}};
       selected.push_back(next_candidate.box_index);
-      rc.box_index = next_candidate.box_index;
-      rc.score = next_candidate.score;
-      rc.class_idx = class_idx;
-      rc.box_coord[0] = boxes_data_1(id, 0);
-      rc.box_coord[1] = boxes_data_1(id, 1);
-      rc.box_coord[2] = boxes_data_1(id, 2);
-      rc.box_coord[3] = boxes_data_1(id, 3);
     }
   }
 }
@@ -473,23 +470,24 @@ void BatchedNonMaxSuppressionOp(
     for (int idx = begin; idx < end; ++idx) {
       int batch_idx = idx / num_classes;
       int class_idx = idx % num_classes;
-      DoNMS(batch_idx, class_idx, boxes_data + boxes_per_batch * batch_idx,
-            scores_data + scores_per_batch * batch_idx, num_boxes, q,
-            num_classes, size_per_class, score_threshold, iou_threshold,
-            result_candidate_vec[batch_idx]);
+      DoNMSPerClass(batch_idx, class_idx,
+                    boxes_data + boxes_per_batch * batch_idx,
+                    scores_data + scores_per_batch * batch_idx, num_boxes, q,
+                    num_classes, size_per_class, score_threshold, iou_threshold,
+                    result_candidate_vec[batch_idx]);
     }
   };
 
   int length = num_batches * num_classes;
   // Input data boxes_data, scores_data
-  int input_bytes = length * num_boxes * 5;
-  int output_bytes = length * num_boxes * 5;
-  int compute_cycles = (Eigen::TensorOpCost::AddCost<float>() * 5 +
-                        Eigen::TensorOpCost::MulCost<float>() * 2 +
-                        Eigen::TensorOpCost::AddCost<float>() * 10 +
-                        Eigen::TensorOpCost::MulCost<float>() * 6 +
-                        Eigen::TensorOpCost::DivCost<float>()) *
-                       length;
+  int input_bytes = num_boxes * 10 * sizeof(float);
+  int output_bytes = num_boxes * 10 * sizeof(float);
+  int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
+                       Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
   const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
   const CPUDevice& d = context->eigen_device<CPUDevice>();
   d.parallelFor(length, cost, shard_nms);
@@ -519,14 +517,14 @@ void BatchedNonMaxSuppressionOp(
   };
   length = num_batches;
   // Input data boxes_data, scores_data
-  input_bytes = length * num_boxes * 5;
-  output_bytes = length * num_boxes * 5;
-  compute_cycles = (Eigen::TensorOpCost::AddCost<float>() * 5 +
-                    Eigen::TensorOpCost::MulCost<float>() * 2 +
-                    Eigen::TensorOpCost::AddCost<float>() * 10 +
-                    Eigen::TensorOpCost::MulCost<float>() * 6 +
-                    Eigen::TensorOpCost::DivCost<float>()) *
-                   length;
+  input_bytes =
+      num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  output_bytes =
+      num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
+                   Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
   const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
                                         compute_cycles);
   d.parallelFor(length, cost_result, shard_result);
@@ -561,14 +559,11 @@ void BatchedNonMaxSuppressionOp(
   };
   length = num_batches * per_batch_size;
   // Input data boxes_data, scores_data
-  input_bytes = length * per_batch_size * 6;
-  output_bytes = length * per_batch_size * 6;
-  compute_cycles = (Eigen::TensorOpCost::AddCost<float>() * 5 +
-                    Eigen::TensorOpCost::MulCost<float>() * 2 +
-                    Eigen::TensorOpCost::AddCost<float>() * 10 +
-                    Eigen::TensorOpCost::MulCost<float>() * 6 +
-                    Eigen::TensorOpCost::DivCost<float>()) *
-                   length;
+  input_bytes = 6 * sizeof(float);
+  output_bytes = 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
+                   Eigen::TensorOpCost::MulCost<int>() * 2 +
+                   Eigen::TensorOpCost::DivCost<float>() * 2;
   const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
                                              compute_cycles);
   d.parallelFor(length, cost_copy_result, shard_copy_result);
diff --git a/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
index 920b2af40c4..40c8d77ec9d 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -56,9 +56,23 @@ static Graph* BM_CombinedNonMaxSuppression(int batches, int box_num,
   }                                                                      \
   BENCHMARK(BM_CombinedNMS_##DEVICE##_##B##_##BN##_##CN##_##Q);
 
-BM_CombinedNonMaxSuppressionDev(cpu, 1, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 28, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 32, 1917, 90, 1);
-BM_CombinedNonMaxSuppressionDev(cpu, 64, 1917, 90, 1);
+#define BM_Batch(BN, CN, Q)                            \
+  BM_CombinedNonMaxSuppressionDev(cpu, 1, BN, CN, Q);  \
+  BM_CombinedNonMaxSuppressionDev(cpu, 28, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 32, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 64, BN, CN, Q);
+
+#define BN_Boxes_Number(CN, Q) \
+  BM_Batch(500, CN, Q);        \
+  BM_Batch(1000, CN, Q);       \
+  BM_Batch(1917, CN, Q);       \
+  BM_Batch(2500, CN, Q);
+
+BN_Boxes_Number(25, 1);
+BN_Boxes_Number(25, 25);
+BN_Boxes_Number(90, 1);
+BN_Boxes_Number(90, 90);
+BN_Boxes_Number(200, 1);
+BN_Boxes_Number(200, 200);
 
 }  // namespace tensorflow
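Background note (not part of the patch): each of the three refined cost expressions above builds an Eigen::TensorOpCost from estimated bytes read, bytes written, and compute cycles per work item, and ThreadPoolDevice::parallelFor uses that per-item cost to decide how coarsely to shard the `length` work items across threads. The standalone sketch below shows that mechanism in isolation; the thread count, problem sizes, and cycle constants are illustrative placeholders, not values taken from TensorFlow.

// Standalone, illustrative sketch (not TensorFlow code): how a per-item
// TensorOpCost guides shard sizing in Eigen::ThreadPoolDevice::parallelFor.
// Assumes Eigen's unsupported Tensor module is available.
#define EIGEN_USE_THREADS
#include <cstdio>
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads (illustrative)
  Eigen::ThreadPoolDevice device(&pool, 4);

  const int num_boxes = 1917;                // illustrative box count
  const int length = 8 * 90;                 // e.g. num_batches * num_classes

  // Per-work-item cost estimate in the same style as the patch:
  // bytes read, bytes written, and an approximate cycle count.
  const int input_bytes = num_boxes * 10 * sizeof(float);
  const int output_bytes = num_boxes * 10 * sizeof(float);
  const int compute_cycles =
      Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
      Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9;
  const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);

  // parallelFor derives a block size from the cost: the more expensive one
  // work item looks, the smaller the [begin, end) ranges handed to threads.
  std::vector<int> visited(length, 0);
  device.parallelFor(length, cost,
                     [&visited](Eigen::Index begin, Eigen::Index end) {
                       for (Eigen::Index i = begin; i < end; ++i) {
                         visited[i] = 1;  // stand-in for per-class NMS work
                       }
                     });

  int total = 0;
  for (int v : visited) total += v;
  std::printf("processed %d of %d work items\n", total, length);
  return 0;
}

Roughly, the higher the per-item cost estimate, the smaller the blocks parallelFor hands to each thread; overestimating the cost of a cheap shard (as the removed expressions did by scaling the estimate with length) therefore tends to create more, smaller tasks than the actual work justifies.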
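Benchmark coverage note (not part of the patch): each BN_Boxes_Number(CN, Q) invocation expands to BM_Batch for 500, 1000, 1917 and 2500 boxes, and each BM_Batch registers the benchmark for batch sizes 1, 28, 32 and 64. The six invocations therefore replace the previous four fixed cases (1917 boxes, 90 classes, q = 1) with 4 x 4 x 6 = 96 CPU benchmark variants covering more class counts and both q = 1 and q equal to the class count.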