Detailed report for performance profiling.

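Adds ProfilingInfo::GetTotalTime() and ProfilingInfo::GetDetailedReport() to the OpenCL delegate and stores per-dispatch timing as absl::Duration. A minimal sketch of how the new report is exercised, modeled on the updated performance_profiling sample further down (setup of env, context and the model graph is elided; variable names are illustrative):

auto* queue = env.profiling_queue();
ProfilingInfo profiling_info;
RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
// Per-kernel timings plus accumulated time per operation type (GPU time only).
std::cout << profiling_info.GetDetailedReport() << std::endl;
// Sum of all dispatch durations, i.e. the "ideal" GPU time for one inference.
const double total_ms =
    absl::ToDoubleMilliseconds(profiling_info.GetTotalTime());
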
PiperOrigin-RevId: 287975140
Change-Id: I864bce22a4969acc5c485bf5925b242f2a1fa052
A. Unique TensorFlower 2020-01-03 04:37:53 -08:00 committed by TensorFlower Gardener
parent 4d1c59f8e9
commit c8871a18a9
4 changed files with 78 additions and 18 deletions

View File

@@ -89,6 +89,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
],
)

View File

@@ -15,6 +15,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include <map>
#include <string>
#include <vector>
#include "absl/strings/str_cat.h"
@@ -208,7 +210,8 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
result.dispatches.resize(events_.size());
for (int i = 0; i < events_.size(); ++i) {
result.dispatches[i].label = events_[i].GetName();
result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
result.dispatches[i].duration =
absl::Nanoseconds(events_[i].GetEventTimeNs());
}
return result;
}
@@ -321,6 +324,42 @@ Status CreateProfilingCommandQueue(const CLDevice& device,
return OkStatus();
}
absl::Duration ProfilingInfo::GetTotalTime() const {
absl::Duration total_time;
for (auto dispatch : dispatches) {
total_time += dispatch.duration;
}
return total_time;
}
std::string ProfilingInfo::GetDetailedReport() const {
std::string result;
std::map<std::string, double> timing;
result +=
"Per kernel timing(" + std::to_string(dispatches.size()) + " kernels):\n";
for (auto dispatch : dispatches) {
result += " " + dispatch.label + " - " +
std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
"ms\n";
auto name = dispatch.label.substr(0, dispatch.label.find(" "));
if (timing.find(name) != timing.end()) {
timing[name] += absl::ToDoubleMilliseconds(dispatch.duration);
} else {
timing[name] = absl::ToDoubleMilliseconds(dispatch.duration);
}
}
result += "--------------------\n";
result += "Accumulated time per operation type:\n";
for (auto& t : timing) {
result += " " + t.first + " - " + std::to_string(t.second) + "ms\n";
}
result += "--------------------\n";
result += "Ideal total time: " +
std::to_string(absl::ToDoubleMilliseconds(GetTotalTime())) + "\n";
result += "--------------------\n";
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite
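
GetDetailedReport() above groups kernels by the part of the label before the first space, so every dispatch with the same prefix accumulates into one per-operation total. A minimal equivalent of that accumulation, with hypothetical labels, relying on std::map value-initializing missing double entries to 0.0 (which is why the find/else branch can collapse to a single +=):

std::map<std::string, double> timing;
for (const auto& dispatch : dispatches) {
  // A label such as "conv2d 3x3_s1" contributes to the "conv2d" bucket.
  const std::string op_type =
      dispatch.label.substr(0, dispatch.label.find(' '));
  timing[op_type] += absl::ToDoubleMilliseconds(dispatch.duration);
}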

View File

@@ -20,6 +20,7 @@ limitations under the License.
#include <string>
#include <vector>
#include "absl/time/time.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
@@ -35,11 +36,26 @@ namespace cl {
struct ProfilingInfo {
struct DispatchInfo {
std::string label;
uint64_t time_ns;
double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
absl::Duration duration;
};
std::vector<DispatchInfo> dispatches;
absl::Duration GetTotalTime() const;
// Returns the report as a string of lines delimited by '\n'.
// This method uses GPU counters and measures GPU time only.
// The report has the following structure:
// Per kernel timing(K kernels):
// conv2d - 3.2ms
// ...
// --------------------
// Accumulated time per operation type:
// conv2d - 14.5ms
// ....
// --------------------
// Ideal total time: 23.4ms // Total time for all kernels
std::string GetDetailedReport() const;
};
// A wrapper around opencl command queue

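DispatchInfo now carries an absl::Duration instead of a raw uint64_t nanosecond count, so the old GetTimeMs() helper goes away. A short sketch of the conversions this relies on (the event variable is illustrative; GetEventTimeNs() is the existing CLEvent accessor used above):

// From the profiling counter (nanoseconds) to a typed duration.
const absl::Duration duration = absl::Nanoseconds(event.GetEventTimeNs());
// Back to milliseconds for reporting, replacing the old time_ns * 1e-6.
const double ms = absl::ToDoubleMilliseconds(duration);
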
View File

@@ -123,7 +123,10 @@ Status RunModelSample(const std::string& model_name) {
InferenceContext::CreateInferenceInfo create_info;
create_info.precision = CalculationsPrecision::F16;
create_info.storage_type = TensorStorageType::TEXTURE_2D;
create_info.storage_type = GetFastestStorageType(env.device());
std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
std::cout << "Storage type: " << ToString(create_info.storage_type)
<< std::endl;
InferenceContext context;
RETURN_IF_ERROR(
context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
@@ -131,26 +134,27 @@ Status RunModelSample(const std::string& model_name) {
auto* queue = env.profiling_queue();
ProfilingInfo profiling_info;
RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
double total_ms_time = 0.0;
for (auto dispatch : profiling_info.dispatches) {
std::cout << dispatch.label << " - " << dispatch.GetTimeMs() << "ms"
<< std::endl;
total_ms_time += dispatch.GetTimeMs();
}
std::cout << "Ideal total time - " << total_ms_time << std::endl;
std::cout << profiling_info.GetDetailedReport() << std::endl;
uint64_t mem_bytes = context.GetSizeOfMemoryAllocatedForIntermediateTensors();
std::cout << "Memory for intermediate tensors - "
<< mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
int runs1000ms = std::max(1, static_cast<int>(1000.0f / total_ms_time));
const int num_runs_per_sec = std::max(
1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
profiling_info.GetTotalTime())));
for (int i = 0; i < 10; ++i) {
const int kNumRuns = 10;
for (int i = 0; i < kNumRuns; ++i) {
const auto start = absl::Now();
for (int k = 0; k < runs1000ms; ++k) {
for (int k = 0; k < num_runs_per_sec; ++k) {
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
}
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
const auto end = absl::Now();
const double time_ms =
const double total_time_ms =
static_cast<double>((end - start) / absl::Nanoseconds(1)) * 1e-6;
std::cout << "Total time - " << time_ms / runs1000ms << "ms" << std::endl;
const double average_inference_time = total_time_ms / num_runs_per_sec;
std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
}
return OkStatus();
@@ -168,13 +172,13 @@ int main(int argc, char** argv) {
auto load_status = tflite::gpu::cl::LoadOpenCL();
if (!load_status.ok()) {
std::cerr << load_status.error_message();
std::cerr << load_status.message();
return -1;
}
auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.error_message();
std::cerr << run_status.message();
return -1;
}
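
The benchmark loop now sizes each timed batch from the profiled total instead of a hand-accumulated sum. With illustrative numbers: if profiling_info.GetTotalTime() is about 12.5 ms, then num_runs_per_sec = max(1, int(1000.0f / 12.5)) = 80; if those 80 enqueues take 1040 ms of wall-clock time, the reported "Total time" is 1040 / 80 = 13 ms per inference.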