Instrument XRT with metrics and add op to fetch them from client side.

PiperOrigin-RevId: 291547054 Change-Id: Ia44b4d724805912961cf4f1fae165df9bad0c3b2
2020-01-25 12:53:33 -08:00 · 2020-01-25 12:53:33 -08:00 · 90e6bdca1f
commit 90e6bdca1f
parent c71fb79cbc
15 changed files with 530 additions and 5 deletions
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@ -45,6 +45,7 @@ cc_library(
        "xrt_compilation_cache.cc",
        "xrt_device.cc",
        "xrt_memory_manager.cc",
+        "xrt_metrics.cc",
        "xrt_state.cc",
        "xrt_util.cc",
    ],
@ -52,6 +53,7 @@ cc_library(
        "xrt_compilation_cache.h",
        "xrt_device.h",
        "xrt_memory_manager.h",
+        "xrt_metrics.h",
        "xrt_refptr.h",
        "xrt_state.h",
        "xrt_util.h",
@ -75,10 +77,11 @@ cc_library(
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/core/profiler/lib:traceme",
        "//tensorflow/stream_executor",
        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/synchronization",
    ],
 )
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
+#include "tensorflow/compiler/xrt/xrt_metrics.h"
 #include "tensorflow/compiler/xrt/xrt_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@ -41,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/monitoring/timed.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/types.h"
@ -137,6 +139,7 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx,

 void XRTCompileOp::Compute(OpKernelContext* ctx) {
  VLOG(1) << "XRTCompileOp::Compute";
+  auto timed = monitoring::MakeTimed(xrt_metrics::GetCompileCell());

  ResourceMgr* rm;
  OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm));
@ -207,6 +210,7 @@ XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default;

 void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
  VLOG(1) << "XRTReleaseCompilationRefOp::Compute";
+  auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell());

  ResourceMgr* rm;
  OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm));
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
 #include "tensorflow/compiler/xrt/xrt_memory_manager.h"
+#include "tensorflow/compiler/xrt/xrt_metrics.h"
 #include "tensorflow/compiler/xrt/xrt_state.h"
 #include "tensorflow/compiler/xrt/xrt_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@ -35,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/monitoring/timed.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
@ -248,6 +250,7 @@ void XRTExecuteOp::ComputeAsync(OpKernelContext* context, DoneCallback done) {

 Status XRTExecuteOp::DoWork(OpKernelContext* context) {
  VLOG(1) << "XRTExecuteOp::Compute";
+  auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteCell());
  ResourceMgr* rm;
  TF_RETURN_IF_ERROR(
      XRTGenericDeviceAccessor::GetResourceManager(context, &rm));
@ -333,6 +336,7 @@ void XRTExecuteChainedOp::ComputeAsync(OpKernelContext* context,

 Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) {
  VLOG(1) << "XRTExecuteChainedOp::Compute";
+  auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteChainedCell());
  ResourceMgr* rm;
  TF_RETURN_IF_ERROR(
      XRTGenericDeviceAccessor::GetResourceManager(context, &rm));
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@ -16,15 +16,45 @@ limitations under the License.
 // Classes for allocating XLA literals in device memory and managing handles
 // that refer to them.

+#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h"
+
 #include <memory>
 #include <string>

-#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h"
-
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xrt/xrt_metrics.h"

 namespace tensorflow {
+namespace {
+
+// Kernel backing the XRTMetricsCollect op. It parses the serialized
+// xrt::XRTMetricsCollect request held by input 0, gathers the selected
+// metrics through CollectMetrics(), and emits the resulting
+// xrt::MetricsReport as a serialized string scalar on output 0.
+class XRTMetricsCollectOp : public OpKernel {
+ public:
+  explicit XRTMetricsCollectOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTMetricsCollectOp::Compute";
+
+    // The request has to be a single serialized proto.
+    const Tensor& request_tensor = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(request_tensor.shape()),
+                errors::Internal("request input should be a string scalar"));
+    xrt::XRTMetricsCollect request;
+    OP_REQUIRES(ctx,
+                request.ParseFromString(request_tensor.scalar<tstring>()()),
+                errors::InvalidArgument(
+                    "Unable to parse request input to XRTMetricsCollect"));
+
+    xla::StatusOr<xrt::MetricsReport> report_or = CollectMetrics(request);
+    OP_REQUIRES_OK(ctx, report_or.status());
+
+    // Serialize the report straight into the scalar output tensor.
+    Tensor result(DT_STRING, TensorShape({}));
+    result.scalar<tstring>()() =
+        report_or.ConsumeValueOrDie().SerializeAsString();
+    ctx->set_output(0, result);
+  }
+};
+
+}  // namespace

 REGISTER_KERNEL_BUILDER(Name("XRTAllocate")
                            .Device(DEVICE_XLA_GPU)
@ -161,4 +191,7 @@ REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_GPU),
 REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_CPU),
                        XRTCompactAllocationsOp<XRTGenericDeviceAccessor>);

+// Registers XRTMetricsCollectOp as the XRTMetricsCollect kernel for the plain
+// CPU device.
+REGISTER_KERNEL_BUILDER(Name("XRTMetricsCollect").Device(DEVICE_CPU),
+                        XRTMetricsCollectOp);
+
 }  // namespace tensorflow
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
 #include "tensorflow/compiler/xrt/xrt_memory_manager.h"
+#include "tensorflow/compiler/xrt/xrt_metrics.h"
 #include "tensorflow/compiler/xrt/xrt_state.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/op_kernel.h"
@ -46,6 +47,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/monitoring/percentile_sampler.h"
+#include "tensorflow/core/lib/monitoring/timed.h"
 #include "tensorflow/core/platform/types.h"

 namespace tensorflow {
@ -170,6 +173,7 @@ class XRTAllocateOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTAllocateOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetAllocateCell());

    const Tensor& allocation_info = ctx->input(0);
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_info.shape()),
@ -223,6 +227,8 @@ class XRTAllocateUninitializedOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTAllocateUninitializedOp::Compute";
+    auto timed =
+        monitoring::MakeTimed(xrt_metrics::GetAllocateUninitializedCell());
    ResourceMgr* rm;
    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));

@ -294,6 +300,8 @@ class XRTAllocateFromTensorOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTAllocateFromTensorOp::Compute";
+    auto timed =
+        monitoring::MakeTimed(xrt_metrics::GetAllocateFromTensorCell());

    OpInputList values;
    OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values));
@ -362,6 +370,7 @@ class XRTSubTupleOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTSubTupleOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetSubTupleCell());

    const Tensor& handle_tensor = ctx->input(0);
    OP_REQUIRES(
@ -412,6 +421,7 @@ class XRTMakeTupleOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTMakeTupleOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetMakeTupleCell());

    const Tensor& tuple_info = ctx->input(0);
    OP_REQUIRES(
@ -482,6 +492,7 @@ class XRTReadLiteralOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTReadLiteralOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetReadLiteralCell());

    const Tensor& handle_tensor = ctx->input(0);
    OP_REQUIRES(
@ -532,6 +543,7 @@ class XRTReadToTensorOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTReadToTensorOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetReadToTensorCell());

    const Tensor& handle_tensor = ctx->input(0);
    // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not
@ -615,6 +627,7 @@ class XRTWriteLiteralOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTWriteLiteralOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetWriteLiteralCell());

    const Tensor& handle_tensor = ctx->input(0);
    OP_REQUIRES(
@ -665,6 +678,7 @@ class XRTReleaseAllocationOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTReleaseAllocationOp::Compute";
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseAllocationCell());

    ResourceMgr* rm;
    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
@ -693,6 +707,8 @@ class XRTReleaseAllAllocationsOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTReleaseAllAllocationsOp::Compute";
+    auto timed =
+        monitoring::MakeTimed(xrt_metrics::GetReleaseAllAllocationsCell());

    ResourceMgr* rm;
    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
@ -710,6 +726,8 @@ class XRTCompactAllocationsOp : public OpKernel {

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "XRTCompactAllocationsOp::Compute";
+    auto timed =
+        monitoring::MakeTimed(xrt_metrics::GetCompactAllocationsCell());

    ResourceMgr* rm;
    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@ -216,4 +216,16 @@ backing the handles, and re-allocate and send back the data to the device.
 This operation helps with device memory fragmentation.
 )");

+// Op definition used by clients to fetch metrics: a single string input (the
+// request) and a single string output (the result), with shape inference
+// delegated to ScalarShape.
+REGISTER_OP("XRTMetricsCollect")
+    .Input("request: string")
+    .Output("result: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Reads the selected metric values from the metrics collection registry.
+
+'request' is a serialized xrt::XRTMetricsCollect proto.
+'result' is a serialized xrt::MetricsReport proto.
+)");
+
 }  // namespace tensorflow
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@ -1675,6 +1675,27 @@ TEST(RawApiTest, TestDeviceMemorySwap) {
  }
 }

+// Requests every metric matching "/tensorflow/xrt/.*" through the
+// XRTMetricsCollect op and verifies that each reported metric name starts
+// with the "/tensorflow/xrt/" prefix.
+TEST(RawApiTest, TestMetricsFetch) {
+  xrt::XRTMetricsCollect request;
+  request.add_metrics_regex("/tensorflow/xrt/.*");
+
+  Scope root = Scope::NewRootScope().WithDevice("/device:CPU:0");
+  auto request_const = ops::Const(root, request.SerializeAsString());
+  Output collect_output = ops::XRTMetricsCollect(root, request_const);
+  TF_ASSERT_OK(root.status());
+
+  std::vector<Tensor> fetched;
+  ClientSession session(root);
+  TF_EXPECT_OK(session.Run({collect_output}, &fetched));
+  ASSERT_EQ(fetched.size(), 1);
+
+  xrt::MetricsReport report;
+  EXPECT_TRUE(report.ParseFromString(fetched[0].scalar<tstring>()()));
+  for (int i = 0; i < report.metrics_size(); ++i) {
+    const auto& reported = report.metrics(i);
+    // 16 is the length of the "/tensorflow/xrt/" prefix.
+    EXPECT_EQ(reported.name().compare(0, 16, "/tensorflow/xrt/"), 0);
+  }
+}
+
 }  // namespace

 }  // namespace tensorflow
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@ -191,3 +191,53 @@ message XRTChainedExecutePlan {
  // The post order with the XRT computations to be executed.
  repeated XRTChainedExecuteOp ops = 1;
 }
+
+// The message used to encode the options for the XRTMetricsCollect operation.
+message XRTMetricsCollect {
+  // A list of regular expressions to match the metric names. Empty means to
+  // return all the metrics reported by the collection registry. A metric is
+  // selected when any one expression matches its whole name (the collector
+  // uses RE2::FullMatch).
+  repeated string metrics_regex = 1;
+}
+
+// Snapshot of a percentile sampler metric, as copied from the monitoring
+// subsystem's percentiles value.
+message Percentiles {
+  // A single (percentile, value) pair reported by the sampler.
+  message Point {
+    // In the [0, 100] range.
+    double percentile = 1;
+    double value = 2;
+  }
+
+  // The time (in nanoseconds) of the first sample within the samples buffer.
+  uint64 start_nstime = 1;
+  // The time (in nanoseconds) of the last sample within the samples buffer.
+  uint64 end_nstime = 2;
+  // The minimum value of the samples within the samples buffer.
+  double min_value = 3;
+  // The maximum value of the samples within the samples buffer.
+  double max_value = 4;
+  // The mean value of the samples within the samples buffer.
+  double mean = 5;
+  // The standard deviation of the samples within the samples buffer.
+  double stddev = 6;
+  // The number of samples within the samples buffer.
+  uint64 num_samples = 7;
+  // The total number of times a value has been posted to this metric.
+  uint64 total_samples = 8;
+  // The sum of all the posted values.
+  double accumulator = 9;
+  // The percentile points reported by the metric.
+  repeated Point points = 10;
+}
+
+// One collected metric point. The collector fills values_oneof only for
+// percentile and int64 metrics; points of other value types are reported
+// with just the name set.
+message MetricValues {
+  // The metric name.
+  string name = 1;
+
+  // The value of the metric point, when its type is one of the supported ones.
+  oneof values_oneof {
+    Percentiles percentiles_value = 2;
+    int64 int64_value = 3;
+  }
+}
+
+// The result of the XRTMetricsCollect operation: one entry per collected
+// metric point.
+message MetricsReport {
+  repeated MetricValues metrics = 1;
+}
--- a/tensorflow/compiler/xrt/xrt_memory_manager.cc
+++ b/tensorflow/compiler/xrt/xrt_memory_manager.cc
@ -20,7 +20,10 @@ limitations under the License.
 #include <unordered_map>

 #include "absl/memory/memory.h"
+#include "tensorflow/compiler/xrt/xrt_metrics.h"
+#include "tensorflow/core/lib/monitoring/timed.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/profiler/lib/traceme.h"

 namespace tensorflow {
 namespace {
@ -97,6 +100,9 @@ class XRTMemoryManager::DeviceContext {

  Status CompactAllocations(XRTMemoryManager* memory_manager,
                            xla::Backend* backend) {
+    profiler::TraceMe trace_me("XRTMemoryManager::CompactAllocations",
+                               /*level=*/2);
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetMemoryCompactCell());
    VLOG(4) << "CompactAllocations started";
    mutex_lock lock(lock_);
    Status status;
@ -143,6 +149,8 @@ class XRTMemoryManager::DeviceContext {
  // Tries to free size bytes by freeing some unpinned device memory. Returns
  // the amount of memory which was able to free.
  xla::StatusOr<size_t> TryFreeMemory(xla::Backend* backend, size_t size) {
+    profiler::TraceMe trace_me("XRTMemoryManager::TryFreeMemory", /*level=*/2);
+    auto timed = monitoring::MakeTimed(xrt_metrics::GetTryFreeMemoryCell());
    mutex_lock lock(lock_);
    size_t swapped_size = 0;
    for (auto it = allocs_.rbegin(); it != allocs_.rend(); ++it) {
--- a/tensorflow/compiler/xrt/xrt_metrics.cc
+++ b/tensorflow/compiler/xrt/xrt_metrics.cc
@ -0,0 +1,255 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xrt/xrt_metrics.h"
+
+#include "tensorflow/core/lib/monitoring/collection_registry.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace tensorflow {
+namespace {
+
+// Size of the samples buffer handed to every XRT percentile sampler.
+constexpr size_t kMaxSamples = 1024;
+
+// Returns the percentile points handed to every XRT percentile sampler.
+std::vector<double> GetDefaultPercentiles() {
+  return std::vector<double>{25.0, 50.0, 80.0, 90.0, 95.0, 99.0};
+}
+
+// Returns true if the metric called `name` is selected by `metrics`: either
+// the request carries no regular expressions at all, or at least one of them
+// matches the whole name.
+bool IsSelectedMetric(const xrt::XRTMetricsCollect& metrics,
+                      const string& name) {
+  const int regex_count = metrics.metrics_regex_size();
+  // An empty list selects everything.
+  bool selected = regex_count == 0;
+  for (int i = 0; !selected && i < regex_count; ++i) {
+    selected = RE2::FullMatch(name, metrics.metrics_regex(i));
+  }
+  return selected;
+}
+
+// Appends one xrt::MetricValues entry to `report` for every point within
+// `point_set`, named after the point set's metric. Percentile and int64
+// points get their value copied over; points of any other value type are
+// added with only the name set. Always returns OK.
+Status AddMetrics(xrt::MetricsReport* report,
+                  const monitoring::PointSet& point_set) {
+  for (auto& point : point_set.points) {
+    xrt::MetricValues* entry = report->add_metrics();
+    entry->set_name(point_set.metric_name);
+    switch (point->value_type) {
+      case monitoring::ValueType::kPercentiles: {
+        const auto& source = point->percentiles_value;
+        xrt::Percentiles* target = entry->mutable_percentiles_value();
+        target->set_start_nstime(source.start_nstime);
+        target->set_end_nstime(source.end_nstime);
+        target->set_min_value(source.min_value);
+        target->set_max_value(source.max_value);
+        target->set_mean(source.mean);
+        target->set_stddev(source.stddev);
+        target->set_num_samples(source.num_samples);
+        target->set_total_samples(source.total_samples);
+        target->set_accumulator(source.accumulator);
+        for (auto& source_point : source.points) {
+          xrt::Percentiles::Point* target_point = target->add_points();
+          target_point->set_percentile(source_point.percentile);
+          target_point->set_value(source_point.value);
+        }
+        break;
+      }
+      case monitoring::ValueType::kInt64:
+        entry->set_int64_value(point->int64_value);
+        break;
+      default:
+        // Other value types carry no value in the report.
+        break;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+namespace xrt_metrics {
+namespace {
+
+// Creates a label-less percentile sampler called `name`, described by
+// `description`, using the default XRT percentiles and samples buffer size,
+// and returns its only cell. Every getter below calls this exactly once, from
+// the initializer of its function-local static, so each metric name is
+// registered a single time.
+monitoring::PercentileSamplerCell* NewTimingCell(const char* name,
+                                                 const char* description) {
+  return monitoring::PercentileSampler<0>::New(
+             {name, description}, GetDefaultPercentiles(), kMaxSamples)
+      ->GetCell();
+}
+
+}  // namespace
+
+// Each getter lazily creates its sampler on first use and hands back the
+// same cell on every subsequent call.
+
+monitoring::PercentileSamplerCell* GetAllocateCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/allocate", "Tracks XRTAllocate times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetAllocateUninitializedCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/allocate_uninitialized",
+                    "Tracks XRTAllocateUninitialized times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetAllocateFromTensorCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/allocate_from_tensor",
+                    "Tracks XRTAllocateFromTensor times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetSubTupleCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/sub_tuple", "Tracks XRTSubTuple times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetMakeTupleCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/make_tuple", "Tracks XRTMakeTuple times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetReadLiteralCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/read_literal", "Tracks XRTReadLiteral times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetReadToTensorCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/read_tensor", "Tracks XRTReadToTensor times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetWriteLiteralCell() {
+  static monitoring::PercentileSamplerCell* cell = NewTimingCell(
+      "/tensorflow/xrt/ops/write_literal", "Tracks XRTWriteLiteral times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetReleaseAllocationCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/release_allocation",
+                    "Tracks XRTReleaseAllocation times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/release_all_allocations",
+                    "Tracks XRTReleaseAllAllocations times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetCompactAllocationsCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/compact_allocations",
+                    "Tracks XRTCompactAllocations times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetCompileCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/compile", "Tracks XRTCompile times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetReleaseCompilationCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/release_compilation",
+                    "Tracks XRTReleaseCompilationRef times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetExecuteCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/execute", "Tracks XRTExecute times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetExecuteChainedCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/ops/execute_chained",
+                    "Tracks XRTExecuteChained times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetMemoryCompactCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/memory_manager/compaction",
+                    "Tracks XRT memory manager memory compaction times");
+  return cell;
+}
+
+monitoring::PercentileSamplerCell* GetTryFreeMemoryCell() {
+  static monitoring::PercentileSamplerCell* cell =
+      NewTimingCell("/tensorflow/xrt/memory_manager/try_free_memory",
+                    "Tracks XRT memory manager times in trying to "
+                    "free memory by swpping device memory to host memory");
+  return cell;
+}
+
+}  // namespace xrt_metrics
+
+// Snapshots the default monitoring collection registry (without metric
+// descriptors) and builds a report out of the point sets whose metric name is
+// selected by `metrics`.
+xla::StatusOr<xrt::MetricsReport> CollectMetrics(
+    const xrt::XRTMetricsCollect& metrics) {
+  monitoring::CollectionRegistry::CollectMetricsOptions collect_options;
+  collect_options.collect_metric_descriptors = false;
+  auto* registry = monitoring::CollectionRegistry::Default();
+  auto snapshot = registry->CollectMetrics(collect_options);
+
+  xrt::MetricsReport report;
+  for (auto& entry : snapshot->point_set_map) {
+    // entry.first is the metric name, entry.second its point set.
+    if (!IsSelectedMetric(metrics, entry.first)) {
+      continue;
+    }
+    TF_RETURN_IF_ERROR(AddMetrics(&report, *entry.second));
+  }
+  return std::move(report);
+}
+
+}  // namespace tensorflow
--- a/tensorflow/compiler/xrt/xrt_metrics.h
+++ b/tensorflow/compiler/xrt/xrt_metrics.h
@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_
+#define TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/core/lib/monitoring/percentile_sampler.h"
+
+namespace tensorflow {
+namespace xrt_metrics {
+
+// Defines the singletons of the metrics populated by the XRT op framework.
+// Since for a single XRT op there can be many device specific versions (CPU,
+// GPU, TPU), and since the monitoring subsystem does not allow multiple
+// registrations of the same metric name, we define them all in this file.
+monitoring::PercentileSamplerCell* GetAllocateCell();
+monitoring::PercentileSamplerCell* GetAllocateUninitializedCell();
+monitoring::PercentileSamplerCell* GetAllocateFromTensorCell();
+monitoring::PercentileSamplerCell* GetSubTupleCell();
+monitoring::PercentileSamplerCell* GetMakeTupleCell();
+monitoring::PercentileSamplerCell* GetReadLiteralCell();
+monitoring::PercentileSamplerCell* GetReadToTensorCell();
+monitoring::PercentileSamplerCell* GetWriteLiteralCell();
+monitoring::PercentileSamplerCell* GetReleaseAllocationCell();
+monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell();
+monitoring::PercentileSamplerCell* GetCompactAllocationsCell();
+monitoring::PercentileSamplerCell* GetCompileCell();
+monitoring::PercentileSamplerCell* GetReleaseCompilationCell();
+monitoring::PercentileSamplerCell* GetExecuteCell();
+monitoring::PercentileSamplerCell* GetExecuteChainedCell();
+monitoring::PercentileSamplerCell* GetMemoryCompactCell();
+monitoring::PercentileSamplerCell* GetTryFreeMemoryCell();
+
+}  // namespace xrt_metrics
+
+// Collects the metrics currently reported by the default monitoring
+// collection registry and returns a report with those selected by `metrics`.
+// A metric is selected when one of the metrics_regex entries matches its
+// whole name; an empty metrics_regex list selects every metric.
+xla::StatusOr<xrt::MetricsReport> CollectMetrics(
+    const xrt::XRTMetricsCollect& metrics);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -1895,6 +1895,7 @@ cc_library(
        "//tensorflow/core/lib/monitoring:mobile_sampler",
        "//tensorflow/core/lib/monitoring:percentile_sampler",
        "//tensorflow/core/lib/monitoring:sampler",
+        "//tensorflow/core/lib/monitoring:timed",
        "//tensorflow/core/lib/random:exact_uniform_int",
        "//tensorflow/core/lib/random:philox",
        "//tensorflow/core/lib/random:philox_random",
--- a/tensorflow/core/lib/monitoring/BUILD
+++ b/tensorflow/core/lib/monitoring/BUILD
@ -25,6 +25,17 @@ cc_library(
    ],
 )

+# Header-only target exporting timed.h; it declares no srcs.
+cc_library(
+    name = "timed",
+    hdrs = [
+        "timed.h",
+    ],
+    deps = [
+        "//tensorflow/core/platform:env_time",
+        "//tensorflow/core/platform:types",
+    ],
+)
+
 cc_library(
    name = "collected_metrics",
    hdrs = [
@ -210,6 +221,7 @@ filegroup(
        "mobile_gauge.h",
        "mobile_sampler.h",
        "sampler.h",
+        "timed.h",
        "types.h",
    ],
    visibility = ["//tensorflow/core:__pkg__"],
@ -225,6 +237,7 @@ filegroup(
        "metric_def.h",
        "percentile_sampler.h",
        "sampler.h",
+        "timed.h",
        "types.h",
    ],
    visibility = ["//tensorflow/core:__pkg__"],
--- a/tensorflow/core/lib/monitoring/percentile_sampler.cc
+++ b/tensorflow/core/lib/monitoring/percentile_sampler.cc
@ -31,10 +31,10 @@ void PercentileSamplerCell::Add(double sample) {
  mutex_lock l(mu_);
  samples_[next_position_] = {nstime, sample};
  ++next_position_;
-  if (next_position_ >= samples_.size()) {
+  if (TF_PREDICT_FALSE(next_position_ >= samples_.size())) {
    next_position_ = 0;
  }
-  if (num_samples_ < samples_.size()) {
+  if (TF_PREDICT_FALSE(num_samples_ < samples_.size())) {
    ++num_samples_;
  }
  ++total_samples_;
--- a/tensorflow/core/lib/monitoring/timed.h
+++ b/tensorflow/core/lib/monitoring/timed.h
@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_
+
+#include "tensorflow/core/platform/env_time.h"
+
+namespace tensorflow {
+namespace monitoring {
+
+// Takes a Sampler, PercentileSampler or Gauge cell, and posts timing values
+// (default in milliseconds) according to its scope lifetime.
+//
+// The elapsed time is posted from the destructor, so every live instance
+// produces one sample. To keep that at exactly one sample per timed scope the
+// class is move-only: copying is disabled, and a moved-from instance is
+// disarmed and posts nothing.
+template <typename T>
+class Timed {
+ public:
+  // Starts the timer. On destruction `cell` receives the elapsed nanoseconds
+  // multiplied by `scale`; the 1e-6 default converts them to milliseconds.
+  explicit Timed(T* cell, double scale = 1e-6)
+      : cell_(cell), scale_(scale), start_(EnvTime::NowNanos()) {}
+
+  // Transfers the running timer; `other` will no longer post a sample.
+  Timed(Timed&& other) noexcept
+      : cell_(other.cell_), scale_(other.scale_), start_(other.start_) {
+    other.cell_ = nullptr;
+  }
+
+  // A copy would post a second sample for the same scope.
+  Timed(const Timed&) = delete;
+  Timed& operator=(const Timed&) = delete;
+  Timed& operator=(Timed&&) = delete;
+
+  ~Timed() {
+    // A null cell means this instance has been moved from (or was created
+    // without a cell), hence there is nothing to post.
+    if (cell_ != nullptr) {
+      cell_->Add(scale_ * (EnvTime::NowNanos() - start_));
+    }
+  }
+
+ private:
+  T* cell_ = nullptr;
+  double scale_ = 1e-6;
+  uint64 start_ = 0;
+};
+
+// Builds a Timed for `cell`, deducing the cell type so that call sites can
+// simply write `auto timed = MakeTimed(cell);`. `scale` multiplies the elapsed
+// nanoseconds before they are posted (the 1e-6 default yields milliseconds).
+template <typename T>
+Timed<T> MakeTimed(T* cell, double scale = 1e-6) {
+  return Timed<T>(cell, scale);
+}
+
+}  // namespace monitoring
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_TIMED_H_