Detailed report for performance profiling.

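Adds ProfilingInfo::GetTotalTime() and ProfilingInfo::GetDetailedReport() to the OpenCL delegate and stores per-dispatch timing as absl::Duration. A minimal sketch of how the new report is exercised, modeled on the updated performance_profiling sample further down (setup of env, context and the model graph is elided; variable names are illustrative):

auto* queue = env.profiling_queue();
ProfilingInfo profiling_info;
RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
// Per-kernel timings plus accumulated time per operation type (GPU time only).
std::cout << profiling_info.GetDetailedReport() << std::endl;
// Sum of all dispatch durations, i.e. the "ideal" GPU time for one inference.
const double total_ms =
    absl::ToDoubleMilliseconds(profiling_info.GetTotalTime());
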
PiperOrigin-RevId: 287975140
Change-Id: I864bce22a4969acc5c485bf5925b242f2a1fa052
A. Unique TensorFlower 2020-01-03 04:37:53 -08:00 committed by TensorFlower Gardener
parent 4d1c59f8e9
commit c8871a18a9
4 changed files with 78 additions and 18 deletions

View File

@@ -89,6 +89,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
],
)

View File

@@ -15,6 +15,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include <map>
#include <string>
#include <vector>
#include "absl/strings/str_cat.h"
@@ -208,7 +210,8 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
result.dispatches.resize(events_.size());
for (int i = 0; i < events_.size(); ++i) {
result.dispatches[i].label = events_[i].GetName();
result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
result.dispatches[i].duration =
absl::Nanoseconds(events_[i].GetEventTimeNs());
}
return result;
}
@@ -321,6 +324,42 @@ Status CreateProfilingCommandQueue(const CLDevice& device,
return OkStatus();
}
absl::Duration ProfilingInfo::GetTotalTime() const {
absl::Duration total_time;
for (auto dispatch : dispatches) {
total_time += dispatch.duration;
}
return total_time;
}
std::string ProfilingInfo::GetDetailedReport() const {
std::string result;
std::map<std::string, double> timing;
result +=
"Per kernel timing(" + std::to_string(dispatches.size()) + " kernels):\n";
for (auto dispatch : dispatches) {
result += " " + dispatch.label + " - " +
std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
"ms\n";
auto name = dispatch.label.substr(0, dispatch.label.find(" "));
if (timing.find(name) != timing.end()) {
timing[name] += absl::ToDoubleMilliseconds(dispatch.duration);
} else {
timing[name] = absl::ToDoubleMilliseconds(dispatch.duration);
}
}
result += "--------------------\n";
result += "Accumulated time per operation type:\n";
for (auto& t : timing) {
result += " " + t.first + " - " + std::to_string(t.second) + "ms\n";
}
result += "--------------------\n";
result += "Ideal total time: " +
std::to_string(absl::ToDoubleMilliseconds(GetTotalTime())) + "\n";
result += "--------------------\n";
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite
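
GetDetailedReport() above groups kernels by the part of the label before the first space, so every dispatch with the same prefix accumulates into one per-operation total. A minimal equivalent of that accumulation, with hypothetical labels, relying on std::map value-initializing missing double entries to 0.0 (which is why the find/else branch can collapse to a single +=):

std::map<std::string, double> timing;
for (const auto& dispatch : dispatches) {
  // A label such as "conv2d 3x3_s1" contributes to the "conv2d" bucket.
  const std::string op_type =
      dispatch.label.substr(0, dispatch.label.find(' '));
  timing[op_type] += absl::ToDoubleMilliseconds(dispatch.duration);
}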

View File

@@ -20,6 +20,7 @@ limitations under the License.
#include <string>
#include <vector>
#include "absl/time/time.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
@@ -35,11 +36,26 @@ namespace cl {
struct ProfilingInfo {
struct DispatchInfo {
std::string label;
uint64_t time_ns;
double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
absl::Duration duration;
};
std::vector<DispatchInfo> dispatches;
absl::Duration GetTotalTime() const;
// Returns the report as a string of lines delimited by '\n'.
// This method uses GPU counters and measures GPU time only.
// The report has the following structure:
// Per kernel timing(K kernels):
// conv2d - 3.2ms
// ...
// --------------------
// Accumulated time per operation type:
// conv2d - 14.5ms
// ....
// --------------------
// Ideal total time: 23.4ms // Total time for all kernels
std::string GetDetailedReport() const;
};
// A wrapper around opencl command queue

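DispatchInfo now carries an absl::Duration instead of a raw uint64_t nanosecond count, so the old GetTimeMs() helper goes away. A short sketch of the conversions this relies on (the event variable is illustrative; GetEventTimeNs() is the existing CLEvent accessor used above):

// From the profiling counter (nanoseconds) to a typed duration.
const absl::Duration duration = absl::Nanoseconds(event.GetEventTimeNs());
// Back to milliseconds for reporting, replacing the old time_ns * 1e-6.
const double ms = absl::ToDoubleMilliseconds(duration);
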
View File

@@ -123,7 +123,10 @@ Status RunModelSample(const std::string& model_name) {
InferenceContext::CreateInferenceInfo create_info;
create_info.precision = CalculationsPrecision::F16;
create_info.storage_type = TensorStorageType::TEXTURE_2D;
create_info.storage_type = GetFastestStorageType(env.device());
std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
std::cout << "Storage type: " << ToString(create_info.storage_type)
<< std::endl;
InferenceContext context;
RETURN_IF_ERROR(
context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
@@ -131,26 +134,27 @@ Status RunModelSample(const std::string& model_name) {
auto* queue = env.profiling_queue();
ProfilingInfo profiling_info;
RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
double total_ms_time = 0.0;
for (auto dispatch : profiling_info.dispatches) {
std::cout << dispatch.label << " - " << dispatch.GetTimeMs() << "ms"
<< std::endl;
total_ms_time += dispatch.GetTimeMs();
}
std::cout << "Ideal total time - " << total_ms_time << std::endl;
std::cout << profiling_info.GetDetailedReport() << std::endl;
uint64_t mem_bytes = context.GetSizeOfMemoryAllocatedForIntermediateTensors();
std::cout << "Memory for intermediate tensors - "
<< mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
int runs1000ms = std::max(1, static_cast<int>(1000.0f / total_ms_time));
const int num_runs_per_sec = std::max(
1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
profiling_info.GetTotalTime())));
for (int i = 0; i < 10; ++i) {
const int kNumRuns = 10;
for (int i = 0; i < kNumRuns; ++i) {
const auto start = absl::Now();
for (int k = 0; k < runs1000ms; ++k) {
for (int k = 0; k < num_runs_per_sec; ++k) {
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
}
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
const auto end = absl::Now();
const double time_ms =
const double total_time_ms =
static_cast<double>((end - start) / absl::Nanoseconds(1)) * 1e-6;
std::cout << "Total time - " << time_ms / runs1000ms << "ms" << std::endl;
const double average_inference_time = total_time_ms / num_runs_per_sec;
std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
}
return OkStatus();
@@ -168,13 +172,13 @@ int main(int argc, char** argv) {
auto load_status = tflite::gpu::cl::LoadOpenCL();
if (!load_status.ok()) {
std::cerr << load_status.error_message();
std::cerr << load_status.message();
return -1;
}
auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.error_message();
std::cerr << run_status.message();
return -1;
}
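
The benchmark loop now sizes each timed batch from the profiled total instead of a hand-accumulated sum. With illustrative numbers: if profiling_info.GetTotalTime() is about 12.5 ms, then num_runs_per_sec = max(1, int(1000.0f / 12.5)) = 80; if those 80 enqueues take 1040 ms of wall-clock time, the reported "Total time" is 1040 / 80 = 13 ms per inference.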