benchmark_model: add optional inter-benchmark delay and max benchmark length params to help reduce the effects of thermal throttling and standardize run length between graphs.
PiperOrigin-RevId: 164996768
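As an illustration, an invocation using the new flags might look like the following (a sketch: the four new flag names come from the diff below, while --graph, --input_layer, and the bazel target path are the tool's pre-existing conventions and are assumed here):

    bazel run -c opt //tensorflow/tools/benchmark:benchmark_model -- \
        --graph=/tmp/my_graph.pb --input_layer=input:0 --output_layer=output:0 \
        --max_num_runs=1000 --max_time=10.0 \
        --inference_delay=-1.0 --inter_benchmark_delay=2.0

max_num_runs and max_time bound each benchmark pass by iteration count and wall time respectively (whichever is hit first), inference_delay sleeps between individual inference runs, and inter_benchmark_delay inserts a cool-down pause before each timed pass so the device can recover from thermal throttling.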
commit 5d1792f994
parent 8d75705ffa
@@ -266,27 +266,47 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
   return s;
 }
 
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
-                        const std::vector<InputLayerInfo>& inputs,
-                        const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us) {
-  // Convert the run_delay string into a timespec.
+void SleepSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
+  }
+#ifdef PLATFORM_WINDOWS
+  Sleep(sleep_seconds * 1000);
+#else
+  // Convert the inference_delay string into a timespec.
   timespec req;
   req.tv_sec = static_cast<time_t>(sleep_seconds);
   req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+  nanosleep(&req, nullptr);
+#endif
+}
+
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
+                        const std::vector<InputLayerInfo>& inputs,
+                        const std::vector<string>& outputs, Session* session,
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs) {
   *total_time_us = 0;
 
-  LOG(INFO) << "Running benchmark for " << num_runs << " iterations "
+  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
+            << max_time_s << " seconds "
             << (stats != nullptr ? "with" : "without")
-            << " detailed stat logging:";
+            << " detailed stat logging, with " << sleep_seconds
+            << "s sleep between inferences";
 
   Stat<int64> stat;
-  for (int i = 0; i < num_runs; ++i) {
+  const bool until_max_time = num_runs <= 0;
+  for (int i = 0; until_max_time || i < num_runs; ++i) {
     int64 time;
     Status run_status = RunBenchmark(inputs, outputs, session, stats, &time);
     stat.UpdateStat(time);
-    *total_time_us += time;
+    (*total_time_us) += time;
+    ++(*actual_num_runs);
+
+    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
+      break;
+    }
 
     if (!run_status.ok()) {
       LOG(INFO) << "Failed on run " << i;
       return run_status;
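The termination semantics introduced here are worth spelling out: num_runs <= 0 makes the loop purely time-bounded, while a positive max_time_s caps total wall time even when an iteration budget is given. A self-contained sketch of just that logic (an illustration mirroring the loop above; SimulateRuns is a hypothetical stand-in, not part of the tool):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the loop above: run up to `num_runs` iterations, or indefinitely
    // if num_runs <= 0, breaking once accumulated time exceeds max_time_s.
    int64_t SimulateRuns(int num_runs, double max_time_s, int64_t run_cost_us) {
      int64_t total_time_us = 0;
      int64_t actual_num_runs = 0;
      const bool until_max_time = num_runs <= 0;
      for (int i = 0; until_max_time || i < num_runs; ++i) {
        total_time_us += run_cost_us;  // stand-in for one RunBenchmark call
        ++actual_num_runs;
        if (max_time_s > 0.0 && (total_time_us / 1000000.0) > max_time_s) {
          break;
        }
      }
      return actual_num_runs;
    }

    int main() {
      // 1000-run cap, 10 s cap, 50 ms per run: the time cap wins at run 201.
      std::printf("%lld\n", static_cast<long long>(SimulateRuns(1000, 10.0, 50000)));
      // num_runs <= 0: purely time-bounded, stops after ~1 s of simulated work.
      std::printf("%lld\n", static_cast<long long>(SimulateRuns(0, 1.0, 50000)));
      return 0;
    }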
@@ -296,11 +316,7 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs,
     // This can be helpful to determine the effect of mobile processor
     // scaling and thermal throttling.
     if (sleep_seconds > 0.0) {
-#ifdef PLATFORM_WINDOWS
-      Sleep(sleep_seconds * 1000);
-#else
-      nanosleep(&req, nullptr);
-#endif
+      SleepSeconds(sleep_seconds);
     }
   }
   std::stringstream stream;
@@ -317,8 +333,10 @@ int Main(int argc, char** argv) {
   string input_layer_type_string = "float";
   string input_layer_values_string = "";
   string output_layer_string = "output:0";
-  int num_runs = 50;
-  string run_delay = "-1.0";
+  int max_num_runs = 1000;
+  string max_time = "10.0";
+  string inference_delay = "-1.0";
+  string inter_benchmark_delay = "-1.0";
   int num_threads = -1;
   string benchmark_name = "";
   string output_prefix = "";
@@ -342,8 +360,12 @@ int Main(int argc, char** argv) {
       Flag("input_layer_values", &input_layer_values_string,
           "values to initialize the inputs with"),
       Flag("output_layer", &output_layer_string, "output layer name"),
-      Flag("num_runs", &num_runs, "number of runs"),
-      Flag("run_delay", &run_delay, "delay between runs in seconds"),
+      Flag("max_num_runs", &max_num_runs, "number of runs max"),
+      Flag("max_time", &max_time, "length to run max"),
+      Flag("inference_delay", &inference_delay,
+           "delay between runs in seconds"),
+      Flag("inter_benchmark_delay", &inter_benchmark_delay,
+           "delay between benchmarks in seconds"),
       Flag("num_threads", &num_threads, "number of threads"),
       Flag("benchmark_name", &benchmark_name, "benchmark name"),
       Flag("output_prefix", &output_prefix, "benchmark output prefix"),
@@ -406,8 +428,10 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
   LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
   LOG(INFO) << "Output layers: [" << output_layer_string << "]";
-  LOG(INFO) << "Num runs: [" << num_runs << "]";
-  LOG(INFO) << "Inter-run delay (seconds): [" << run_delay << "]";
+  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
+  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
+  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
+            << "]";
   LOG(INFO) << "Num threads: [" << num_threads << "]";
   LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
   LOG(INFO) << "Output prefix: [" << output_prefix << "]";
@@ -434,7 +458,12 @@ int Main(int argc, char** argv) {
     stats_options.show_summary = show_summary;
     stats.reset(new tensorflow::StatSummarizer(stats_options));
 
-  const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
+  const double inter_inference_sleep_seconds =
+      std::strtod(inference_delay.c_str(), nullptr);
+  const double inter_benchmark_sleep_seconds =
+      std::strtod(inter_benchmark_delay.c_str(), nullptr);
+  const double max_benchmark_time_seconds =
+      std::strtod(max_time.c_str(), nullptr);
 
   std::vector<InputLayerInfo> inputs;
   for (int n = 0; n < inputs_count; ++n) {
@@ -466,10 +495,11 @@ int Main(int argc, char** argv) {
   // If requested, run through the graph first to preinitialize everything
   // before the benchmarking runs.
   int64 warmup_time_us = 0;
+  int64 num_warmup_runs = 0;
   if (warmup_runs > 0) {
-    Status warmup_time_status =
-        TimeMultipleRuns(sleep_seconds, warmup_runs, inputs, output_layers,
-                         session.get(), nullptr, &warmup_time_us);
+    Status warmup_time_status = TimeMultipleRuns(
+        inter_inference_sleep_seconds, warmup_runs, -1.0, inputs, output_layers,
+        session.get(), nullptr, &warmup_time_us, &num_warmup_runs);
     if (!warmup_time_status.ok()) {
       LOG(ERROR) << "Timing failed with " << warmup_time_status;
       return -1;
@@ -478,10 +508,13 @@ int Main(int argc, char** argv) {
 
   // Capture overall inference time without stat logging overhead. This is the
   // timing data that can be compared to other libaries.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
-  Status no_stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), nullptr, &no_stat_time_us);
+  int64 no_stat_num_runs = 0;
+  Status no_stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), nullptr, &no_stat_time_us,
+      &no_stat_num_runs);
   const double no_stat_wall_time = no_stat_time_us / 1000000.0;
   if (!no_stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << no_stat_time_status;
@@ -490,10 +523,13 @@ int Main(int argc, char** argv) {
 
   // Run again to gather detailed log stats to get a better idea of where
   // relative time is going within the graph.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 stat_time_us = 0;
-  Status stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), stats.get(), &stat_time_us);
+  int64 stat_num_runs = 0;
+  Status stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), stats.get(), &stat_time_us,
+      &stat_num_runs);
   if (!stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << stat_time_status;
     return -1;
@@ -502,8 +538,8 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Average inference timings in us: "
             << "Warmup: "
             << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
-            << "no stats: " << no_stat_time_us / num_runs << ", "
-            << "with stats: " << stat_time_us / num_runs;
+            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
+            << "with stats: " << stat_time_us / stat_num_runs;
 
   stats->PrintStepStats();
 
@@ -535,7 +571,7 @@ int Main(int argc, char** argv) {
       pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
     }
     LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
-    const double mean_run_time = no_stat_wall_time / num_runs;
+    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
     LOG(INFO) << "FLOPs/second: "
               << strings::HumanReadableNum(
                      static_cast<int64>(total_flops / mean_run_time));
@@ -547,14 +583,14 @@ int Main(int argc, char** argv) {
 
   // Throughput in MB/s
   const double throughput =
-      DataTypeSize(inputs[0].data_type) * total_size * num_runs /
+      DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
       static_cast<double>(no_stat_wall_time) / (1024 * 1024);
 
   // Report the stats.
   TestReporter reporter(output_prefix, benchmark_name);
   TF_QCHECK_OK(reporter.Initialize());
-  TF_QCHECK_OK(
-      reporter.Benchmark(num_runs, -1.0, no_stat_wall_time, throughput));
+  TF_QCHECK_OK(reporter.Benchmark(no_stat_num_runs, -1.0, no_stat_wall_time,
+                                  throughput));
   TF_QCHECK_OK(reporter.Close());
 
   std::map<string, int64> node_type_map_count;
@@ -575,7 +611,8 @@ int Main(int argc, char** argv) {
 
       TF_QCHECK_OK(node_reporter.Initialize());
       TF_QCHECK_OK(node_reporter.Benchmark(
-          num_runs, -1.0, (time.second * num_runs) / 1000000.0f, -1.0));
+          stat_num_runs, -1.0, (time.second * stat_num_runs) / 1000000.0f,
+          -1.0));
       TF_QCHECK_OK(node_reporter.Close());
     }
   }
The declarations in the companion header are updated to match:

@@ -41,10 +41,11 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                     StatSummarizer* stats, int64* inference_time_us);
 
 // Runs the model multiple time, keeping track of timing information.
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                         const std::vector<InputLayerInfo>& inputs,
                         const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us);
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs);
 
 // Handles all setup and argument parsing.
 int Main(int argc, char** argv);
And the unit test is extended to exercise the new signature and verify the reported run count:

@@ -62,8 +62,11 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
   std::unique_ptr<StatSummarizer> stats;
   stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get())));
   int64 time;
+  int64 num_runs;
   TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
-      0.0, 10, {input}, {output_name}, session.get(), stats.get(), &time));
+      0.0, 10, 0.0, {input}, {output_name}, session.get(), stats.get(), &time,
+      &num_runs));
+  ASSERT_EQ(num_runs, 10);
 }
 
 }  // namespace