diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index 1749e3b4ba0..b58d40f960d 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -89,6 +89,7 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
     ],
 )
 
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
index 5371baae6d6..a18f627e240 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 
+#include <map>
+#include <string>
 #include <vector>
 
 #include "absl/strings/str_cat.h"
@@ -208,7 +210,8 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
   result.dispatches.resize(events_.size());
   for (int i = 0; i < events_.size(); ++i) {
     result.dispatches[i].label = events_[i].GetName();
-    result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
+    result.dispatches[i].duration =
+        absl::Nanoseconds(events_[i].GetEventTimeNs());
   }
   return result;
 }
@@ -321,6 +324,42 @@ Status CreateProfilingCommandQueue(const CLDevice& device,
   return OkStatus();
 }
 
+absl::Duration ProfilingInfo::GetTotalTime() const {
+  absl::Duration total_time;
+  for (const auto& dispatch : dispatches) {
+    total_time += dispatch.duration;
+  }
+  return total_time;
+}
+
+std::string ProfilingInfo::GetDetailedReport() const {
+  std::string result;
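+  // Accumulated time in milliseconds per operation type.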
+  std::map<std::string, double> timing;
+  result +=
+      "Per kernel timing (" + std::to_string(dispatches.size()) + " kernels):\n";
+  for (const auto& dispatch : dispatches) {
+    result += "  " + dispatch.label + " - " +
+              std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
+              "ms\n";
+    auto name = dispatch.label.substr(0, dispatch.label.find(" "));
+    timing[name] += absl::ToDoubleMilliseconds(dispatch.duration);
+  }
+  result += "--------------------\n";
+  result += "Accumulated time per operation type:\n";
+  for (auto& t : timing) {
+    result += "  " + t.first + " - " + std::to_string(t.second) + "ms\n";
+  }
+  result += "--------------------\n";
+  result += "Ideal total time: " +
+            std::to_string(absl::ToDoubleMilliseconds(GetTotalTime())) + "\n";
+  result += "--------------------\n";
+  return result;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
index caea7c41628..915dbaf4dfb 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/time/time.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
@@ -35,11 +36,26 @@ namespace cl {
 struct ProfilingInfo {
   struct DispatchInfo {
     std::string label;
-    uint64_t time_ns;
-    double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
+    absl::Duration duration;
   };
 
   std::vector<DispatchInfo> dispatches;
+
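+  // Returns the sum of GPU execution times of all dispatches.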
+  absl::Duration GetTotalTime() const;
+
+  // Returns a report as a string of lines delimited by '\n'.
+  // This method uses GPU counters and measures GPU time only.
+  // The report has the following structure:
+  // Per kernel timing (K kernels):
+  //   conv2d - 3.2ms
+  //   ...
+  // --------------------
+  // Accumulated time per operation type:
+  //   conv2d - 14.5ms
+  //   ...
+  // --------------------
+  // Ideal total time: 23.4ms  // Total time for all kernels
+  std::string GetDetailedReport() const;
 };
 
 // A wrapper around opencl command queue
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
index 4c64bb8309c..4f67e3d4a2d 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -123,7 +123,10 @@ Status RunModelSample(const std::string& model_name) {
 
   InferenceContext::CreateInferenceInfo create_info;
   create_info.precision = CalculationsPrecision::F16;
-  create_info.storage_type = TensorStorageType::TEXTURE_2D;
+  create_info.storage_type = GetFastestStorageType(env.device());
+  std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
+  std::cout << "Storage type: " << ToString(create_info.storage_type)
+            << std::endl;
   InferenceContext context;
   RETURN_IF_ERROR(
       context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
@@ -131,26 +134,27 @@ Status RunModelSample(const std::string& model_name) {
   auto* queue = env.profiling_queue();
   ProfilingInfo profiling_info;
   RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
-  double total_ms_time = 0.0;
-  for (auto dispatch : profiling_info.dispatches) {
-    std::cout << dispatch.label << " - " << dispatch.GetTimeMs() << "ms"
-              << std::endl;
-    total_ms_time += dispatch.GetTimeMs();
-  }
-  std::cout << "Ideal total time - " << total_ms_time << std::endl;
+  std::cout << profiling_info.GetDetailedReport() << std::endl;
+  uint64_t mem_bytes = context.GetSizeOfMemoryAllocatedForIntermediateTensors();
+  std::cout << "Memory for intermediate tensors - "
+            << mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
 
-  int runs1000ms = std::max(1, static_cast<int>(1000.0f / total_ms_time));
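+  // Run enough inferences per timed batch to fill roughly one second.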
+  const int num_runs_per_sec = std::max(
+      1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
+                                        profiling_info.GetTotalTime())));
 
-  for (int i = 0; i < 10; ++i) {
+  const int kNumRuns = 10;
+  for (int i = 0; i < kNumRuns; ++i) {
     const auto start = absl::Now();
-    for (int k = 0; k < runs1000ms; ++k) {
+    for (int k = 0; k < num_runs_per_sec; ++k) {
       RETURN_IF_ERROR(context.AddToQueue(env.queue()));
     }
     RETURN_IF_ERROR(env.queue()->WaitForCompletion());
     const auto end = absl::Now();
-    const double time_ms =
+    const double total_time_ms =
         static_cast<double>((end - start) / absl::Nanoseconds(1)) * 1e-6;
-    std::cout << "Total time - " << time_ms / runs1000ms << "ms" << std::endl;
+    const double average_inference_time = total_time_ms / num_runs_per_sec;
+    std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
   }
 
   return OkStatus();
@@ -168,13 +172,13 @@ int main(int argc, char** argv) {
 
   auto load_status = tflite::gpu::cl::LoadOpenCL();
   if (!load_status.ok()) {
-    std::cerr << load_status.error_message();
+    std::cerr << load_status.message();
     return -1;
   }
 
   auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
   if (!run_status.ok()) {
-    std::cerr << run_status.error_message();
+    std::cerr << run_status.message();
     return -1;
   }