diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index 1749e3b4ba0..b58d40f960d 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -89,6 +89,7 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
     ],
 )
 
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
index 5371baae6d6..a18f627e240 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 
+#include <map>
+#include <string>
 #include <vector>
 
 #include "absl/strings/str_cat.h"
@@ -208,7 +210,8 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
   result.dispatches.resize(events_.size());
   for (int i = 0; i < events_.size(); ++i) {
     result.dispatches[i].label = events_[i].GetName();
-    result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
+    result.dispatches[i].duration =
+        absl::Nanoseconds(events_[i].GetEventTimeNs());
   }
   return result;
 }
@@ -321,6 +324,42 @@ Status CreateProfilingCommandQueue(const CLDevice& device,
   return OkStatus();
 }
 
+absl::Duration ProfilingInfo::GetTotalTime() const {
+  absl::Duration total_time;
+  for (auto dispatch : dispatches) {
+    total_time += dispatch.duration;
+  }
+  return total_time;
+}
+
+std::string ProfilingInfo::GetDetailedReport() const {
+  std::string result;
+  std::map<std::string, double> timing;
+  result +=
+      "Per kernel timing(" + std::to_string(dispatches.size()) + " kernels):\n";
+  for (auto dispatch : dispatches) {
+    result += "  " + dispatch.label + " - " +
+              std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
+              "ms\n";
+    auto name = dispatch.label.substr(0, dispatch.label.find(" "));
+    if (timing.find(name) != timing.end()) {
+      timing[name] += absl::ToDoubleMilliseconds(dispatch.duration);
+    } else {
+      timing[name] = absl::ToDoubleMilliseconds(dispatch.duration);
+    }
+  }
+  result += "--------------------\n";
+  result += "Accumulated time per operation type:\n";
+  for (auto& t : timing) {
+    result += "  " + t.first + " - " + std::to_string(t.second) + "ms\n";
+  }
+  result += "--------------------\n";
+  result += "Ideal total time: " +
+            std::to_string(absl::ToDoubleMilliseconds(GetTotalTime())) + "\n";
+  result += "--------------------\n";
+  return result;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
index caea7c41628..915dbaf4dfb 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/time/time.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
@@ -35,11 +36,26 @@ namespace cl {
 struct ProfilingInfo {
   struct DispatchInfo {
     std::string label;
-    uint64_t time_ns;
-    double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
+    absl::Duration duration;
   };
 
   std::vector<DispatchInfo> dispatches;
+
+  absl::Duration GetTotalTime() const;
+
+  // Returns a report (a string of lines delimited by \n).
+  // This method uses GPU counters and measures GPU time only.
+  // The report has the following structure:
+  // Per kernel timing(K kernels):
+  //   conv2d - 3.2ms
+  //   ...
+  // --------------------
+  // Accumulated time per operation type:
+  //   conv2d - 14.5ms
+  //   ...
+  // --------------------
+  // Ideal total time: 23.4ms  // Total time for all kernels
+  std::string GetDetailedReport() const;
 };
 
 // A wrapper around opencl command queue
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
index 4c64bb8309c..4f67e3d4a2d 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -123,7 +123,10 @@ Status RunModelSample(const std::string& model_name) {
 
   InferenceContext::CreateInferenceInfo create_info;
   create_info.precision = CalculationsPrecision::F16;
-  create_info.storage_type = TensorStorageType::TEXTURE_2D;
+  create_info.storage_type = GetFastestStorageType(env.device());
+  std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
+  std::cout << "Storage type: " << ToString(create_info.storage_type)
+            << std::endl;
   InferenceContext context;
   RETURN_IF_ERROR(
       context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
@@ -131,26 +134,27 @@ Status RunModelSample(const std::string& model_name) {
   auto* queue = env.profiling_queue();
   ProfilingInfo profiling_info;
   RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
-  double total_ms_time = 0.0;
-  for (auto dispatch : profiling_info.dispatches) {
-    std::cout << dispatch.label << " - " << dispatch.GetTimeMs() << "ms"
-              << std::endl;
-    total_ms_time += dispatch.GetTimeMs();
-  }
-  std::cout << "Ideal total time - " << total_ms_time << std::endl;
+  std::cout << profiling_info.GetDetailedReport() << std::endl;
+  uint64_t mem_bytes = context.GetSizeOfMemoryAllocatedForIntermediateTensors();
+  std::cout << "Memory for intermediate tensors - "
+            << mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
 
-  int runs1000ms = std::max(1, static_cast<int>(1000.0f / total_ms_time));
+  const int num_runs_per_sec = std::max(
+      1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
+                                        profiling_info.GetTotalTime())));
 
-  for (int i = 0; i < 10; ++i) {
+  const int kNumRuns = 10;
+  for (int i = 0; i < kNumRuns; ++i) {
     const auto start = absl::Now();
-    for (int k = 0; k < runs1000ms; ++k) {
+    for (int k = 0; k < num_runs_per_sec; ++k) {
       RETURN_IF_ERROR(context.AddToQueue(env.queue()));
     }
     RETURN_IF_ERROR(env.queue()->WaitForCompletion());
     const auto end = absl::Now();
-    const double time_ms =
+    const double total_time_ms =
         static_cast<double>((end - start) / absl::Nanoseconds(1)) * 1e-6;
-    std::cout << "Total time - " << time_ms / runs1000ms << "ms" << std::endl;
+    const double average_inference_time = total_time_ms / num_runs_per_sec;
+    std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
   }
 
   return OkStatus();
@@ -168,13 +172,13 @@ int main(int argc, char** argv) {
 
   auto load_status = tflite::gpu::cl::LoadOpenCL();
   if (!load_status.ok()) {
-    std::cerr << load_status.error_message();
+    std::cerr << load_status.message();
     return -1;
   }
 
   auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
   if (!run_status.ok()) {
-    std::cerr << run_status.error_message();
+    std::cerr << run_status.message();
     return -1;
   }
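For callers that still consume the old DispatchInfo::time_ns / GetTimeMs() API, here is a minimal migration sketch (it assumes a ProfilingInfo already filled in by ProfilingCommandQueue::GetProfilingInfo(); the PrintProfilingSummary helper itself is hypothetical and only illustrates the new fields, it is not part of this change):

#include <iostream>

#include "absl/time/time.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"

// Hypothetical helper: prints per-dispatch timings using the new
// absl::Duration field instead of the removed time_ns / GetTimeMs().
void PrintProfilingSummary(const tflite::gpu::cl::ProfilingInfo& info) {
  for (const auto& dispatch : info.dispatches) {
    // Old: dispatch.GetTimeMs(); new: convert the absl::Duration explicitly.
    std::cout << dispatch.label << " - "
              << absl::ToDoubleMilliseconds(dispatch.duration) << "ms\n";
  }
  // Or rely on the aggregate helpers added in this change.
  std::cout << info.GetDetailedReport();
  std::cout << "Ideal total time: "
            << absl::ToDoubleMilliseconds(info.GetTotalTime()) << "ms\n";
}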