From 90e6bdca1ffa37bec962034cc10d1655451fd6c3 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Sat, 25 Jan 2020 12:53:33 -0800 Subject: [PATCH] Instrument XRT with metrics and add op to fetch them from client side. PiperOrigin-RevId: 291547054 Change-Id: Ia44b4d724805912961cf4f1fae165df9bad0c3b2 --- tensorflow/compiler/xrt/BUILD | 5 +- .../compiler/xrt/kernels/xrt_compile_ops.cc | 4 + .../compiler/xrt/kernels/xrt_execute_op.cc | 4 + .../compiler/xrt/kernels/xrt_state_ops.cc | 37 ++- .../compiler/xrt/kernels/xrt_state_ops.h | 18 ++ tensorflow/compiler/xrt/ops/xrt_state_ops.cc | 12 + tensorflow/compiler/xrt/tests/raw_api_test.cc | 21 ++ tensorflow/compiler/xrt/xrt.proto | 50 ++++ tensorflow/compiler/xrt/xrt_memory_manager.cc | 8 + tensorflow/compiler/xrt/xrt_metrics.cc | 255 ++++++++++++++++++ tensorflow/compiler/xrt/xrt_metrics.h | 55 ++++ tensorflow/core/BUILD | 1 + tensorflow/core/lib/monitoring/BUILD | 13 + .../core/lib/monitoring/percentile_sampler.cc | 4 +- tensorflow/core/lib/monitoring/timed.h | 48 ++++ 15 files changed, 530 insertions(+), 5 deletions(-) create mode 100644 tensorflow/compiler/xrt/xrt_metrics.cc create mode 100644 tensorflow/compiler/xrt/xrt_metrics.h create mode 100644 tensorflow/core/lib/monitoring/timed.h diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index a3f6dafbffb..93ad08fbfdf 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -45,6 +45,7 @@ cc_library( "xrt_compilation_cache.cc", "xrt_device.cc", "xrt_memory_manager.cc", + "xrt_metrics.cc", "xrt_state.cc", "xrt_util.cc", ], @@ -52,6 +53,7 @@ cc_library( "xrt_compilation_cache.h", "xrt_device.h", "xrt_memory_manager.h", + "xrt_metrics.h", "xrt_refptr.h", "xrt_state.h", "xrt_util.h", @@ -75,10 +77,11 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:regexp_internal", + "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor", 
"//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", ], ) diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index 32030d851c8..99fb092335e 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/compiler/xrt/xrt_device.h" +#include "tensorflow/compiler/xrt/xrt_metrics.h" #include "tensorflow/compiler/xrt/xrt_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -41,6 +42,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/timed.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/fingerprint.h" #include "tensorflow/core/platform/types.h" @@ -137,6 +139,7 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, void XRTCompileOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XRTCompileOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetCompileCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); @@ -207,6 +210,7 @@ XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default; void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc 
b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index a612f9950ad..8e54afd02ab 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/compiler/xrt/xrt_device.h" #include "tensorflow/compiler/xrt/xrt_memory_manager.h" +#include "tensorflow/compiler/xrt/xrt_metrics.h" #include "tensorflow/compiler/xrt/xrt_state.h" #include "tensorflow/compiler/xrt/xrt_util.h" #include "tensorflow/core/framework/op_kernel.h" @@ -35,6 +36,7 @@ limitations under the License. #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/monitoring/timed.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -248,6 +250,7 @@ void XRTExecuteOp::ComputeAsync(OpKernelContext* context, DoneCallback done) { Status XRTExecuteOp::DoWork(OpKernelContext* context) { VLOG(1) << "XRTExecuteOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteCell()); ResourceMgr* rm; TF_RETURN_IF_ERROR( XRTGenericDeviceAccessor::GetResourceManager(context, &rm)); @@ -333,6 +336,7 @@ void XRTExecuteChainedOp::ComputeAsync(OpKernelContext* context, Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { VLOG(1) << "XRTExecuteChainedOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteChainedCell()); ResourceMgr* rm; TF_RETURN_IF_ERROR( XRTGenericDeviceAccessor::GetResourceManager(context, &rm)); diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc index 6eab3716391..02b9a2e068b 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc @@ 
-16,15 +16,45 @@ limitations under the License. // Classes for allocating XLA literals in device memory and managing handles // that refer to them. +#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h" + #include #include -#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h" - #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xrt/xrt_metrics.h" namespace tensorflow { +namespace { + +class XRTMetricsCollectOp : public OpKernel { + public: + explicit XRTMetricsCollectOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + VLOG(1) << "XRTMetricsCollectOp::Compute"; + + const Tensor& metrics_proto = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(metrics_proto.shape()), + errors::Internal("request input should be a string scalar")); + xrt::XRTMetricsCollect metrics; + OP_REQUIRES(ctx, metrics.ParseFromString(metrics_proto.scalar()()), + errors::InvalidArgument( + "Unable to parse request input to XRTMetricsCollect")); + + xla::StatusOr collected_metrics_or = + CollectMetrics(metrics); + OP_REQUIRES_OK(ctx, collected_metrics_or.status()); + xrt::MetricsReport collected_metrics = + collected_metrics_or.ConsumeValueOrDie(); + Tensor output(DT_STRING, TensorShape({})); + output.scalar()() = collected_metrics.SerializeAsString(); + ctx->set_output(0, output); + } +}; + +} // namespace REGISTER_KERNEL_BUILDER(Name("XRTAllocate") .Device(DEVICE_XLA_GPU) @@ -161,4 +191,7 @@ REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_GPU), REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_CPU), XRTCompactAllocationsOp); +REGISTER_KERNEL_BUILDER(Name("XRTMetricsCollect").Device(DEVICE_CPU), + XRTMetricsCollectOp); + } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h index 769ec188349..ffb5a3e8db3 
100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/compiler/xrt/xrt_device.h" #include "tensorflow/compiler/xrt/xrt_memory_manager.h" +#include "tensorflow/compiler/xrt/xrt_metrics.h" #include "tensorflow/compiler/xrt/xrt_state.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/op_kernel.h" @@ -46,6 +47,8 @@ limitations under the License. #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/monitoring/percentile_sampler.h" +#include "tensorflow/core/lib/monitoring/timed.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -170,6 +173,7 @@ class XRTAllocateOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTAllocateOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetAllocateCell()); const Tensor& allocation_info = ctx->input(0); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_info.shape()), @@ -223,6 +227,8 @@ class XRTAllocateUninitializedOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTAllocateUninitializedOp::Compute"; + auto timed = + monitoring::MakeTimed(xrt_metrics::GetAllocateUninitializedCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); @@ -294,6 +300,8 @@ class XRTAllocateFromTensorOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTAllocateFromTensorOp::Compute"; + auto timed = + monitoring::MakeTimed(xrt_metrics::GetAllocateFromTensorCell()); OpInputList values; OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values)); @@ -362,6 +370,7 @@ class XRTSubTupleOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << 
"XRTSubTupleOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetSubTupleCell()); const Tensor& handle_tensor = ctx->input(0); OP_REQUIRES( @@ -412,6 +421,7 @@ class XRTMakeTupleOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTMakeTupleOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetMakeTupleCell()); const Tensor& tuple_info = ctx->input(0); OP_REQUIRES( @@ -482,6 +492,7 @@ class XRTReadLiteralOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTReadLiteralOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetReadLiteralCell()); const Tensor& handle_tensor = ctx->input(0); OP_REQUIRES( @@ -532,6 +543,7 @@ class XRTReadToTensorOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTReadToTensorOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetReadToTensorCell()); const Tensor& handle_tensor = ctx->input(0); // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not @@ -615,6 +627,7 @@ class XRTWriteLiteralOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTWriteLiteralOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetWriteLiteralCell()); const Tensor& handle_tensor = ctx->input(0); OP_REQUIRES( @@ -665,6 +678,7 @@ class XRTReleaseAllocationOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTReleaseAllocationOp::Compute"; + auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseAllocationCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); @@ -693,6 +707,8 @@ class XRTReleaseAllAllocationsOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTReleaseAllAllocationsOp::Compute"; + auto timed = + monitoring::MakeTimed(xrt_metrics::GetReleaseAllAllocationsCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); @@ 
-710,6 +726,8 @@ class XRTCompactAllocationsOp : public OpKernel { void Compute(OpKernelContext* ctx) override { VLOG(1) << "XRTCompactAllocationsOp::Compute"; + auto timed = + monitoring::MakeTimed(xrt_metrics::GetCompactAllocationsCell()); ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc index 49a2656a0f9..dca757bec3a 100644 --- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc @@ -216,4 +216,16 @@ backing the handles, and re-allocate and send back the data to the device. This operation helps with device memory fragmentation. )"); +REGISTER_OP("XRTMetricsCollect") + .Input("request: string") + .Output("result: string") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc( + R"( +Reads the selected metric values from the metrics collection registry. + +'request' is a serialized xrt::XRTMetricsCollect proto. +'result' is a serialized xrt::MetricsReport proto. 
+)"); + } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 68f56a52d0e..08a99756426 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -1675,6 +1675,27 @@ TEST(RawApiTest, TestDeviceMemorySwap) { } } +TEST(RawApiTest, TestMetricsFetch) { + xrt::XRTMetricsCollect metrics; + metrics.add_metrics_regex("/tensorflow/xrt/.*"); + + Scope root = Scope::NewRootScope().WithDevice("/device:CPU:0"); + auto metrics_value = ops::Const(root, metrics.SerializeAsString()); + Output result = ops::XRTMetricsCollect(root, metrics_value); + TF_ASSERT_OK(root.status()); + + ClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({result}, &outputs)); + ASSERT_EQ(outputs.size(), 1); + + xrt::MetricsReport report; + EXPECT_TRUE(report.ParseFromString(outputs[0].scalar()())); + for (auto& metric : report.metrics()) { + EXPECT_EQ(metric.name().compare(0, 16, "/tensorflow/xrt/"), 0); + } +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 0a123a9a48a..1cf9a0b650f 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -191,3 +191,53 @@ message XRTChainedExecutePlan { // The post order with the XRT computations to be executed. repeated XRTChainedExecuteOp ops = 1; } + +// The message used to encode the options for the XRTMetricsCollect operation. +message XRTMetricsCollect { + // A list of regular expressions to match the metric names. Empty means to + // return all the metrics reported by the collection registry. + repeated string metrics_regex = 1; +} + +message Percentiles { + message Point { + // In the [0, 100] range. + double percentile = 1; + double value = 2; + } + + // The time (in nanoseconds) of the first sample within the samples buffer. 
+ uint64 start_nstime = 1; + // The time (in nanoseconds) of the last sample within the samples buffer. + uint64 end_nstime = 2; + // The minimum value of the samples within the samples buffer. + double min_value = 3; + // The maximum value of the samples within the samples buffer. + double max_value = 4; + // The mean value of the samples within the samples buffer. + double mean = 5; + // The standard deviation of the samples within the samples buffer. + double stddev = 6; + // The number of samples within the samples buffer. + uint64 num_samples = 7; + // The total number of times this metric has been posted a value to. + uint64 total_samples = 8; + // The sum of all the posted values. + double accumulator = 9; + // The percentile points reported by the metric. + repeated Point points = 10; +} + +message MetricValues { + // The metric name. + string name = 1; + + oneof values_oneof { + Percentiles percentiles_value = 2; + int64 int64_value = 3; + } +} + +message MetricsReport { + repeated MetricValues metrics = 1; +} diff --git a/tensorflow/compiler/xrt/xrt_memory_manager.cc b/tensorflow/compiler/xrt/xrt_memory_manager.cc index 14986be3d1e..7042e35a98e 100644 --- a/tensorflow/compiler/xrt/xrt_memory_manager.cc +++ b/tensorflow/compiler/xrt/xrt_memory_manager.cc @@ -20,7 +20,10 @@ limitations under the License. 
#include #include "absl/memory/memory.h" +#include "tensorflow/compiler/xrt/xrt_metrics.h" +#include "tensorflow/core/lib/monitoring/timed.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/profiler/lib/traceme.h" namespace tensorflow { namespace { @@ -97,6 +100,9 @@ class XRTMemoryManager::DeviceContext { Status CompactAllocations(XRTMemoryManager* memory_manager, xla::Backend* backend) { + profiler::TraceMe trace_me("XRTMemoryManager::CompactAllocations", + /*level=*/2); + auto timed = monitoring::MakeTimed(xrt_metrics::GetMemoryCompactCell()); VLOG(4) << "CompactAllocations started"; mutex_lock lock(lock_); Status status; @@ -143,6 +149,8 @@ class XRTMemoryManager::DeviceContext { // Tries to free size bytes by freeing some unpinned device memory. Returns // the amount of memory which was able to free. xla::StatusOr TryFreeMemory(xla::Backend* backend, size_t size) { + profiler::TraceMe trace_me("XRTMemoryManager::TryFreeMemory", /*level=*/2); + auto timed = monitoring::MakeTimed(xrt_metrics::GetTryFreeMemoryCell()); mutex_lock lock(lock_); size_t swapped_size = 0; for (auto it = allocs_.rbegin(); it != allocs_.rend(); ++it) { diff --git a/tensorflow/compiler/xrt/xrt_metrics.cc b/tensorflow/compiler/xrt/xrt_metrics.cc new file mode 100644 index 00000000000..ec4ac774b68 --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_metrics.cc @@ -0,0 +1,255 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xrt/xrt_metrics.h" + +#include "tensorflow/core/lib/monitoring/collection_registry.h" +#include "tensorflow/core/platform/regexp.h" + +namespace tensorflow { +namespace { + +static const size_t kMaxSamples = 1024; + +std::vector<double> GetDefaultPercentiles() { + return {25.0, 50.0, 80.0, 90.0, 95.0, 99.0}; +} + +bool IsSelectedMetric(const xrt::XRTMetricsCollect& metrics, + const string& name) { + if (metrics.metrics_regex_size() == 0) { + return true; + } + for (auto& metric_regex : metrics.metrics_regex()) { + if (RE2::FullMatch(name, metric_regex)) { + return true; + } + } + return false; +} + +Status AddMetrics(xrt::MetricsReport* report, + const monitoring::PointSet& point_set) { + for (auto& point : point_set.points) { + xrt::MetricValues* metrics = report->add_metrics(); + metrics->set_name(point_set.metric_name); + if (point->value_type == monitoring::ValueType::kPercentiles) { + xrt::Percentiles* percentiles = metrics->mutable_percentiles_value(); + percentiles->set_start_nstime(point->percentiles_value.start_nstime); + percentiles->set_end_nstime(point->percentiles_value.end_nstime); + percentiles->set_min_value(point->percentiles_value.min_value); + percentiles->set_max_value(point->percentiles_value.max_value); + percentiles->set_mean(point->percentiles_value.mean); + percentiles->set_stddev(point->percentiles_value.stddev); + percentiles->set_num_samples(point->percentiles_value.num_samples); + percentiles->set_total_samples(point->percentiles_value.total_samples); + percentiles->set_accumulator(point->percentiles_value.accumulator); + for (auto& pct_point : point->percentiles_value.points) { + xrt::Percentiles::Point* xpoint = percentiles->add_points(); + xpoint->set_percentile(pct_point.percentile); + xpoint->set_value(pct_point.value); + } + } else if (point->value_type == monitoring::ValueType::kInt64) { + 
metrics->set_int64_value(point->int64_value); + } + } + return Status::OK(); +} + +} // namespace + +namespace xrt_metrics { + +monitoring::PercentileSamplerCell* GetAllocateCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/allocate", "Tracks XRTAllocate times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetAllocateUninitializedCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/allocate_uninitialized", + "Tracks XRTAllocateUninitialized times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetAllocateFromTensorCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/allocate_from_tensor", + "Tracks XRTAllocateFromTensor times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetSubTupleCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/sub_tuple", "Tracks XRTSubTuple times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetMakeTupleCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/make_tuple", "Tracks XRTMakeTuple times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetReadLiteralCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/read_literal", "Tracks XRTReadLiteral times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetReadToTensorCell() { 
+ static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/read_tensor", "Tracks XRTReadToTensor times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetWriteLiteralCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/write_literal", "Tracks XRTWriteLiteral times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetReleaseAllocationCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/release_allocation", + "Tracks XRTReleaseAllocation times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/release_all_allocations", + "Tracks XRTReleaseAllAllocations times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetCompactAllocationsCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/compact_allocations", + "Tracks XRTCompactAllocations times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetCompileCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/compile", "Tracks XRTCompile times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetReleaseCompilationCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/release_compilation", + 
"Tracks XRTReleaseCompilationRef times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetExecuteCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/execute", "Tracks XRTExecute times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetExecuteChainedCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/ops/execute_chained", + "Tracks XRTExecuteChained times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetMemoryCompactCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/memory_manager/compaction", + "Tracks XRT memory manager memory compaction times"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +monitoring::PercentileSamplerCell* GetTryFreeMemoryCell() { + static monitoring::PercentileSamplerCell* cell = + monitoring::PercentileSampler<0>::New( + {"/tensorflow/xrt/memory_manager/try_free_memory", + "Tracks XRT memory manager times in trying to " + "free memory by swapping device memory to host memory"}, + GetDefaultPercentiles(), kMaxSamples) + ->GetCell(); + return cell; +} + +} // namespace xrt_metrics + +xla::StatusOr<xrt::MetricsReport> CollectMetrics( + const xrt::XRTMetricsCollect& metrics) { + auto* collection_registry = monitoring::CollectionRegistry::Default(); + monitoring::CollectionRegistry::CollectMetricsOptions options; + options.collect_metric_descriptors = false; + auto collected_metrics = collection_registry->CollectMetrics(options); + xrt::MetricsReport report; + for (auto& name_pointset : collected_metrics->point_set_map) { + if (IsSelectedMetric(metrics, name_pointset.first)) { + TF_RETURN_IF_ERROR(AddMetrics(&report, 
*name_pointset.second)); + } + } + return std::move(report); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_metrics.h b/tensorflow/compiler/xrt/xrt_metrics.h new file mode 100644 index 00000000000..3e61e817ebd --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_metrics.h @@ -0,0 +1,55 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ +#define TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ + +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xrt/xrt.pb.h" +#include "tensorflow/core/lib/monitoring/percentile_sampler.h" + +namespace tensorflow { +namespace xrt_metrics { + +// Defines the singletons of the metrics populated by the XRT op framework. +// Since for a single XRT op there can be many device specific versions (CPU, +// GPU, TPU), and since the monitoring subsystem does not allow multiple +// registrations of the same metric name, we define them all in this file. 
+monitoring::PercentileSamplerCell* GetAllocateCell(); +monitoring::PercentileSamplerCell* GetAllocateUninitializedCell(); +monitoring::PercentileSamplerCell* GetAllocateFromTensorCell(); +monitoring::PercentileSamplerCell* GetSubTupleCell(); +monitoring::PercentileSamplerCell* GetMakeTupleCell(); +monitoring::PercentileSamplerCell* GetReadLiteralCell(); +monitoring::PercentileSamplerCell* GetReadToTensorCell(); +monitoring::PercentileSamplerCell* GetWriteLiteralCell(); +monitoring::PercentileSamplerCell* GetReleaseAllocationCell(); +monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell(); +monitoring::PercentileSamplerCell* GetCompactAllocationsCell(); +monitoring::PercentileSamplerCell* GetCompileCell(); +monitoring::PercentileSamplerCell* GetReleaseCompilationCell(); +monitoring::PercentileSamplerCell* GetExecuteCell(); +monitoring::PercentileSamplerCell* GetExecuteChainedCell(); +monitoring::PercentileSamplerCell* GetMemoryCompactCell(); +monitoring::PercentileSamplerCell* GetTryFreeMemoryCell(); + +} // namespace xrt_metrics + +xla::StatusOr CollectMetrics( + const xrt::XRTMetricsCollect& metrics); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 422df45c797..e9a2480c1bf 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1895,6 +1895,7 @@ cc_library( "//tensorflow/core/lib/monitoring:mobile_sampler", "//tensorflow/core/lib/monitoring:percentile_sampler", "//tensorflow/core/lib/monitoring:sampler", + "//tensorflow/core/lib/monitoring:timed", "//tensorflow/core/lib/random:exact_uniform_int", "//tensorflow/core/lib/random:philox", "//tensorflow/core/lib/random:philox_random", diff --git a/tensorflow/core/lib/monitoring/BUILD b/tensorflow/core/lib/monitoring/BUILD index 9fa3f2d75f3..fd74298eae0 100644 --- a/tensorflow/core/lib/monitoring/BUILD +++ b/tensorflow/core/lib/monitoring/BUILD @@ -25,6 +25,17 @@ cc_library( ], ) +cc_library( + name = 
"timed", + hdrs = [ + "timed.h", + ], + deps = [ + "//tensorflow/core/platform:env_time", + "//tensorflow/core/platform:types", + ], +) + cc_library( name = "collected_metrics", hdrs = [ @@ -210,6 +221,7 @@ filegroup( "mobile_gauge.h", "mobile_sampler.h", "sampler.h", + "timed.h", "types.h", ], visibility = ["//tensorflow/core:__pkg__"], @@ -225,6 +237,7 @@ filegroup( "metric_def.h", "percentile_sampler.h", "sampler.h", + "timed.h", "types.h", ], visibility = ["//tensorflow/core:__pkg__"], diff --git a/tensorflow/core/lib/monitoring/percentile_sampler.cc b/tensorflow/core/lib/monitoring/percentile_sampler.cc index 3d9c644cc0d..988e50ded52 100644 --- a/tensorflow/core/lib/monitoring/percentile_sampler.cc +++ b/tensorflow/core/lib/monitoring/percentile_sampler.cc @@ -31,10 +31,10 @@ void PercentileSamplerCell::Add(double sample) { mutex_lock l(mu_); samples_[next_position_] = {nstime, sample}; ++next_position_; - if (next_position_ >= samples_.size()) { + if (TF_PREDICT_FALSE(next_position_ >= samples_.size())) { next_position_ = 0; } - if (num_samples_ < samples_.size()) { + if (TF_PREDICT_FALSE(num_samples_ < samples_.size())) { ++num_samples_; } ++total_samples_; diff --git a/tensorflow/core/lib/monitoring/timed.h b/tensorflow/core/lib/monitoring/timed.h new file mode 100644 index 00000000000..09b412676ee --- /dev/null +++ b/tensorflow/core/lib/monitoring/timed.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_ +#define TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_ + +#include "tensorflow/core/platform/env_time.h" + +namespace tensorflow { +namespace monitoring { + +// Takes a Sampler, PercentileSampler or Gauge cell, and posts timing values +// (default in milliseconds) according to its scope lifetime. +template <typename T> +class Timed { + public: + explicit Timed(T* cell, double scale = 1e-6) + : cell_(cell), scale_(scale), start_(EnvTime::NowNanos()) {} + + ~Timed() { cell_->Add(scale_ * (EnvTime::NowNanos() - start_)); } + + private: + T* cell_ = nullptr; + double scale_ = 1e-6; + uint64 start_ = 0; +}; + +template <typename T> +Timed<T> MakeTimed(T* cell, double scale = 1e-6) { + return Timed<T>(cell, scale); +} + +} // namespace monitoring +} // namespace tensorflow + +#endif  // TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_