Detailed report for performance profiling.
PiperOrigin-RevId: 287975140
Change-Id: I864bce22a4969acc5c485bf5925b242f2a1fa052
parent 4d1c59f8e9
commit c8871a18a9
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -89,6 +89,7 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
     ],
 )
 
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 
+#include <map>
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
@@ -208,7 +210,8 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
   result.dispatches.resize(events_.size());
   for (int i = 0; i < events_.size(); ++i) {
     result.dispatches[i].label = events_[i].GetName();
-    result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
+    result.dispatches[i].duration =
+        absl::Nanoseconds(events_[i].GetEventTimeNs());
   }
   return result;
 }
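Replacing the raw nanosecond field with absl::Duration makes the unit part of the type rather than a naming convention. A minimal standalone sketch of the conversion (the timestamp value is hypothetical, standing in for what GetEventTimeNs() reads from the OpenCL event counters):

    #include <cstdint>
    #include <iostream>
    #include "absl/time/time.h"

    int main() {
      // Hypothetical raw GPU timestamp delta, as GetEventTimeNs() would return.
      const uint64_t event_time_ns = 3200000;  // 3.2 ms
      const absl::Duration duration = absl::Nanoseconds(event_time_ns);
      // The unit now travels with the value; conversions are explicit.
      std::cout << absl::ToDoubleMilliseconds(duration) << " ms\n";  // 3.2
    }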
@@ -321,6 +324,42 @@ Status CreateProfilingCommandQueue(const CLDevice& device,
   return OkStatus();
 }
 
+absl::Duration ProfilingInfo::GetTotalTime() const {
+  absl::Duration total_time;
+  for (auto dispatch : dispatches) {
+    total_time += dispatch.duration;
+  }
+  return total_time;
+}
+
+std::string ProfilingInfo::GetDetailedReport() const {
+  std::string result;
+  std::map<std::string, double> timing;
+  result +=
+      "Per kernel timing(" + std::to_string(dispatches.size()) + " kernels):\n";
+  for (auto dispatch : dispatches) {
+    result += "  " + dispatch.label + " - " +
+              std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
+              "ms\n";
+    auto name = dispatch.label.substr(0, dispatch.label.find(" "));
+    if (timing.find(name) != timing.end()) {
+      timing[name] += absl::ToDoubleMilliseconds(dispatch.duration);
+    } else {
+      timing[name] = absl::ToDoubleMilliseconds(dispatch.duration);
+    }
+  }
+  result += "--------------------\n";
+  result += "Accumulated time per operation type:\n";
+  for (auto& t : timing) {
+    result += "  " + t.first + " - " + std::to_string(t.second) + "ms\n";
+  }
+  result += "--------------------\n";
+  result += "Ideal total time: " +
+            std::to_string(absl::ToDoubleMilliseconds(GetTotalTime())) + "\n";
+  result += "--------------------\n";
+  return result;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
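Given this string building, a report for a small model would look roughly as follows (kernel labels and timings illustrative; note that std::to_string on a double always prints six decimal places, that the per-type accumulation groups labels by their prefix up to the first space, and that the implementation appends no "ms" to the final line, unlike the header comment's example):

    Per kernel timing(3 kernels):
      conv2d first_layer - 1.200000ms
      conv2d body - 2.000000ms
      add residual - 0.300000ms
    --------------------
    Accumulated time per operation type:
      add - 0.300000ms
      conv2d - 3.200000ms
    --------------------
    Ideal total time: 3.500000
    --------------------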
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/time/time.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
@@ -35,11 +36,26 @@ namespace cl {
 struct ProfilingInfo {
   struct DispatchInfo {
     std::string label;
-    uint64_t time_ns;
-    double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
+    absl::Duration duration;
   };
 
   std::vector<DispatchInfo> dispatches;
+
+  absl::Duration GetTotalTime() const;
+
+  // Returns a report as a string of lines delimited by '\n'.
+  // This method uses GPU counters and measures GPU time only.
+  // The report has the following structure:
+  //   Per kernel timing(K kernels):
+  //     conv2d 3.2ms
+  //     ...
+  //   --------------------
+  //   Accumulated time per operation type:
+  //     conv2d - 14.5ms
+  //     ....
+  //   --------------------
+  //   Ideal total time: 23.4ms  // Total time for all kernels
+  std::string GetDetailedReport() const;
 };
 
 // A wrapper around opencl command queue
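Since DispatchInfo now carries an absl::Duration, callers can compare kernel times directly against other durations instead of juggling raw counts. A consumer sketch, assuming a ProfilingCommandQueue* named queue whose profiled run has completed (the 5 ms threshold is arbitrary):

    ProfilingInfo info = queue->GetProfilingInfo();
    for (const auto& dispatch : info.dispatches) {
      if (dispatch.duration > absl::Milliseconds(5)) {  // illustrative cutoff
        std::cout << "Slow kernel: " << dispatch.label << " - "
                  << absl::ToDoubleMilliseconds(dispatch.duration) << "ms\n";
      }
    }
    std::cout << info.GetDetailedReport();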
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -123,7 +123,10 @@ Status RunModelSample(const std::string& model_name) {
 
   InferenceContext::CreateInferenceInfo create_info;
   create_info.precision = CalculationsPrecision::F16;
-  create_info.storage_type = TensorStorageType::TEXTURE_2D;
+  create_info.storage_type = GetFastestStorageType(env.device());
+  std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
+  std::cout << "Storage type: " << ToString(create_info.storage_type)
+            << std::endl;
   InferenceContext context;
   RETURN_IF_ERROR(
       context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
@@ -131,26 +134,27 @@ Status RunModelSample(const std::string& model_name) {
   auto* queue = env.profiling_queue();
   ProfilingInfo profiling_info;
   RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
-  double total_ms_time = 0.0;
-  for (auto dispatch : profiling_info.dispatches) {
-    std::cout << dispatch.label << " - " << dispatch.GetTimeMs() << "ms"
-              << std::endl;
-    total_ms_time += dispatch.GetTimeMs();
-  }
-  std::cout << "Ideal total time - " << total_ms_time << std::endl;
+  std::cout << profiling_info.GetDetailedReport() << std::endl;
   uint64_t mem_bytes = context.GetSizeOfMemoryAllocatedForIntermediateTensors();
   std::cout << "Memory for intermediate tensors - "
             << mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
 
-  int runs1000ms = std::max(1, static_cast<int>(1000.0f / total_ms_time));
+  const int num_runs_per_sec = std::max(
+      1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
+                                        profiling_info.GetTotalTime())));
 
-  for (int i = 0; i < 10; ++i) {
+  const int kNumRuns = 10;
+  for (int i = 0; i < kNumRuns; ++i) {
     const auto start = absl::Now();
-    for (int k = 0; k < runs1000ms; ++k) {
+    for (int k = 0; k < num_runs_per_sec; ++k) {
       RETURN_IF_ERROR(context.AddToQueue(env.queue()));
     }
     RETURN_IF_ERROR(env.queue()->WaitForCompletion());
     const auto end = absl::Now();
-    const double time_ms =
+    const double total_time_ms =
         static_cast<double>((end - start) / absl::Nanoseconds(1)) * 1e-6;
-    std::cout << "Total time - " << time_ms / runs1000ms << "ms" << std::endl;
+    const double average_inference_time = total_time_ms / num_runs_per_sec;
+    std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
   }
 
   return OkStatus();
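The pacing arithmetic here (and in the old runs1000ms name) sizes each timed block to roughly one second of GPU work. A worked example under an assumed profile, using the same formula as the sample (the 25 ms figure is hypothetical):

    #include <algorithm>
    #include <iostream>

    int main() {
      // Hypothetical per-inference time, standing in for
      // absl::ToDoubleMilliseconds(profiling_info.GetTotalTime()).
      const double inference_ms = 25.0;
      // Enough back-to-back runs to fill roughly 1000 ms of queue time.
      const int num_runs_per_sec =
          std::max(1, static_cast<int>(1000.0f / inference_ms));
      std::cout << num_runs_per_sec << "\n";  // prints 40
    }

With 40 inferences per block, the wall-clock average divided by num_runs_per_sec amortizes queue submission overhead across the block.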
@@ -168,13 +172,13 @@ int main(int argc, char** argv) {
 
   auto load_status = tflite::gpu::cl::LoadOpenCL();
   if (!load_status.ok()) {
-    std::cerr << load_status.error_message();
+    std::cerr << load_status.message();
     return -1;
   }
 
   auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
   if (!run_status.ok()) {
-    std::cerr << run_status.error_message();
+    std::cerr << run_status.message();
     return -1;
   }