Refine the cost estimate in NMS and add more tests

This commit is contained in:
Li, Guizi 2020-02-24 21:17:21 +08:00
parent 0be4b608c0
commit b843ed5862
2 changed files with 58 additions and 49 deletions

View File

@ -291,11 +291,11 @@ struct ResultCandidate {
float box_coord[4];
};
void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
const float* scores_data, int num_boxes, int q, int num_classes,
const int size_per_class, const float score_threshold,
const float iou_threshold,
std::vector<ResultCandidate>& result_candidate_vec) {
void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data,
const float* scores_data, int num_boxes, int q,
int num_classes, const int size_per_class,
const float score_threshold, const float iou_threshold,
std::vector<ResultCandidate>& result_candidate_vec) {
std::vector<float> class_scores_data;
class_scores_data.reserve(num_boxes);
std::vector<float> class_boxes_data;
@ -341,7 +341,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
const Tensor const_boxes = boxes;
typename TTypes<float, 2>::ConstTensor boxes_data_1 =
typename TTypes<float, 2>::ConstTensor boxes_data_t =
const_boxes.tensor<float, 2>();
int candidate_idx = 0;
float iou;
@ -354,7 +354,7 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
// in order to see if `next_candidate` should be suppressed.
bool should_select = true;
for (int j = selected.size() - 1; j >= 0; --j) {
iou = IOU<float>(boxes_data_1, next_candidate.box_index, selected[j]);
iou = IOU<float>(boxes_data_t, next_candidate.box_index, selected[j]);
if (iou > iou_threshold) {
should_select = false;
break;
@ -364,16 +364,13 @@ void DoNMS(int batch_idx, int class_idx, const float* boxes_data,
if (should_select) {
// Add the selected box to the result candidate. Sorted by score
int id = next_candidate.box_index;
auto& rc =
result_candidate_vec[selected.size() + size_per_class * class_idx];
result_candidate_vec[selected.size() + size_per_class * class_idx] = {
next_candidate.box_index,
next_candidate.score,
class_idx,
{boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2),
boxes_data_t(id, 3)}};
selected.push_back(next_candidate.box_index);
rc.box_index = next_candidate.box_index;
rc.score = next_candidate.score;
rc.class_idx = class_idx;
rc.box_coord[0] = boxes_data_1(id, 0);
rc.box_coord[1] = boxes_data_1(id, 1);
rc.box_coord[2] = boxes_data_1(id, 2);
rc.box_coord[3] = boxes_data_1(id, 3);
}
}
}
@ -473,23 +470,24 @@ void BatchedNonMaxSuppressionOp(
for (int idx = begin; idx < end; ++idx) {
int batch_idx = idx / num_classes;
int class_idx = idx % num_classes;
DoNMS(batch_idx, class_idx, boxes_data + boxes_per_batch * batch_idx,
scores_data + scores_per_batch * batch_idx, num_boxes, q,
num_classes, size_per_class, score_threshold, iou_threshold,
result_candidate_vec[batch_idx]);
DoNMSPerClass(batch_idx, class_idx,
boxes_data + boxes_per_batch * batch_idx,
scores_data + scores_per_batch * batch_idx, num_boxes, q,
num_classes, size_per_class, score_threshold, iou_threshold,
result_candidate_vec[batch_idx]);
}
};
int length = num_batches * num_classes;
// Input data boxes_data, scores_data
int input_bytes = length * num_boxes * 5;
int output_bytes = length * num_boxes * 5;
int compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
Eigen::TensorOpCost::MulCost<int>() * 2 +
Eigen::TensorOpCost::AddCost<float>() * 10 +
Eigen::TensorOpCost::MulCost<float>() * 6 +
Eigen::TensorOpCost::DivCost<float>()) *
length;
int input_bytes = num_boxes * 10 * sizeof(float);
int output_bytes = num_boxes * 10 * sizeof(float);
int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
// The cost here is not the actual number of cycles, but rather a set of
// hand-tuned numbers that seem to work best.
const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
const CPUDevice& d = context->eigen_device<CPUDevice>();
d.parallelFor(length, cost, shard_nms);
@ -519,14 +517,14 @@ void BatchedNonMaxSuppressionOp(
};
length = num_batches;
// Input data boxes_data, scores_data
input_bytes = length * num_boxes * 5;
output_bytes = length * num_boxes * 5;
compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
Eigen::TensorOpCost::MulCost<int>() * 2 +
Eigen::TensorOpCost::AddCost<float>() * 10 +
Eigen::TensorOpCost::MulCost<float>() * 6 +
Eigen::TensorOpCost::DivCost<float>()) *
length;
input_bytes =
num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
output_bytes =
num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
// The cost here is not the actual number of cycles, but rather a set of
// hand-tuned numbers that seem to work best.
const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
compute_cycles);
d.parallelFor(length, cost_result, shard_result);
@ -561,14 +559,11 @@ void BatchedNonMaxSuppressionOp(
};
length = num_batches * per_batch_size;
// Input data boxes_data, scores_data
input_bytes = length * per_batch_size * 6;
output_bytes = length * per_batch_size * 6;
compute_cycles = (Eigen::TensorOpCost::AddCost<int>() * 5 +
Eigen::TensorOpCost::MulCost<int>() * 2 +
Eigen::TensorOpCost::AddCost<float>() * 10 +
Eigen::TensorOpCost::MulCost<float>() * 6 +
Eigen::TensorOpCost::DivCost<float>()) *
length;
input_bytes = 6 * sizeof(float);
output_bytes = 6 * sizeof(float);
compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
Eigen::TensorOpCost::MulCost<int>() * 2 +
Eigen::TensorOpCost::DivCost<float>() * 2;
const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
compute_cycles);
d.parallelFor(length, cost_copy_result, shard_copy_result);

View File

@ -1,4 +1,4 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -56,9 +56,23 @@ static Graph* BM_CombinedNonMaxSuppression(int batches, int box_num,
} \
BENCHMARK(BM_CombinedNMS_##DEVICE##_##B##_##BN##_##CN##_##Q);
BM_CombinedNonMaxSuppressionDev(cpu, 1, 1917, 90, 1);
BM_CombinedNonMaxSuppressionDev(cpu, 28, 1917, 90, 1);
BM_CombinedNonMaxSuppressionDev(cpu, 32, 1917, 90, 1);
BM_CombinedNonMaxSuppressionDev(cpu, 64, 1917, 90, 1);
#define BM_Batch(BN, CN, Q) \
BM_CombinedNonMaxSuppressionDev(cpu, 1, BN, CN, Q); \
BM_CombinedNonMaxSuppressionDev(cpu, 28, BN, CN, Q); \
BM_CombinedNonMaxSuppressionDev(cpu, 32, BN, CN, Q); \
BM_CombinedNonMaxSuppressionDev(cpu, 64, BN, CN, Q);
#define BN_Boxes_Number(CN, Q) \
BM_Batch(500, CN, Q); \
BM_Batch(1000, CN, Q); \
BM_Batch(1917, CN, Q); \
BM_Batch(2500, CN, Q);
BN_Boxes_Number(25, 1);
BN_Boxes_Number(25, 25);
BN_Boxes_Number(90, 1);
BN_Boxes_Number(90, 90);
BN_Boxes_Number(200, 1);
BN_Boxes_Number(200, 200);
} // namespace tensorflow