Expose device memory information via XRT API.
PiperOrigin-RevId: 318893328
Change-Id: I1bdfc8c6fcabe7b4f9a662272aa0c40a795299da
Parent: 1b53b995da
Commit: 1e1ce81457
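For context, the new op is consumed from a regular TensorFlow C++ client session, exactly as the test added below does. The following is a minimal sketch, not part of the commit: the helper name QueryDeviceMemory and the include path of the generated op wrapper are assumptions, and error handling is reduced to status propagation.

#include <string>
#include <vector>

#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"  // assumed path of the ops::XRTMemoryInfo wrapper
#include "tensorflow/compiler/xrt/xrt.pb.h"
#include "tensorflow/core/platform/errors.h"

// Runs XRTMemoryInfo on the given device and decodes the serialized proto.
tensorflow::Status QueryDeviceMemory(const std::string& device,
                                     xrt::MemoryInfo* mem_info) {
  tensorflow::Scope root =
      tensorflow::Scope::NewRootScope().WithDevice(device);
  tensorflow::Output result = tensorflow::ops::XRTMemoryInfo(root);
  TF_RETURN_IF_ERROR(root.status());

  tensorflow::ClientSession session(root);
  std::vector<tensorflow::Tensor> outputs;
  TF_RETURN_IF_ERROR(session.Run({result}, &outputs));

  // The op emits a scalar string tensor holding a serialized xrt::MemoryInfo.
  const auto& serialized = outputs[0].scalar<tensorflow::tstring>()();
  if (!mem_info->ParseFromArray(serialized.data(), serialized.size())) {
    return tensorflow::errors::Internal("Failed to parse xrt::MemoryInfo");
  }
  return tensorflow::Status::OK();
}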
@@ -195,4 +195,9 @@ REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_CPU),
 REGISTER_KERNEL_BUILDER(Name("XRTMetricsCollect").Device(DEVICE_CPU),
                         XRTMetricsCollectOp);
 
+REGISTER_KERNEL_BUILDER(Name("XRTMemoryInfo").Device(DEVICE_XLA_GPU),
+                        XRTMemoryInfoOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTMemoryInfo").Device(DEVICE_XLA_CPU),
+                        XRTMemoryInfoOp<XRTGenericDeviceAccessor>);
+
 }  // namespace tensorflow
@@ -739,6 +739,42 @@ class XRTCompactAllocationsOp : public OpKernel {
   }
 };
 
+template <class DeviceAccessor>
+class XRTMemoryInfoOp : public OpKernel {
+ public:
+  explicit XRTMemoryInfoOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  ~XRTMemoryInfoOp() override = default;
+  XRTMemoryInfoOp(const XRTMemoryInfoOp&) = delete;
+  XRTMemoryInfoOp& operator=(const XRTMemoryInfoOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    auto kernel_fn = [&]() -> Status {
+      VLOG(1) << "XRTMemoryInfoOp::Compute";
+
+      class DeviceAccessor::ScopedRef device_ref;
+      TF_RETURN_IF_ERROR(DeviceAccessor::InitScopedRef(ctx, &device_ref));
+      TF_ASSIGN_OR_RETURN(
+          se::StreamExecutor * stream_executor,
+          device_ref.backend()->stream_executor(device_ref.device_ordinal()));
+      int64 mem_free = -1;
+      int64 mem_total = -1;
+      if (!stream_executor->DeviceMemoryUsage(&mem_free, &mem_total)) {
+        VLOG(2) << "Device " << ctx->device()->name()
+                << " does not expose memory information";
+      }
+      xrt::MemoryInfo mem_info;
+      mem_info.set_kb_total((mem_total >= 0) ? mem_total / 1024 : -1);
+      mem_info.set_kb_free((mem_free >= 0) ? mem_free / 1024 : -1);
+
+      Tensor output(DT_STRING, TensorShape({}));
+      output.scalar<tstring>()() = mem_info.SerializeAsString();
+      ctx->set_output(0, output);
+      return Status::OK();
+    };
+    OP_REQUIRES_OK(ctx, kernel_fn());
+  }
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
@@ -228,4 +228,14 @@ Reads the selected metric values from the metrics collection registry.
 'result' is a serialized xrt::MetricsReport proto.
 )");
 
+REGISTER_OP("XRTMemoryInfo")
+    .Output("result: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Returns the memory information of the device this op executes on.
+
+'result' is a serialized xrt::MemoryInfo proto.
+)");
+
 }  // namespace tensorflow
@@ -2206,6 +2206,22 @@ TEST(RawApiTest, TestMetricsFetch) {
   }
 }
 
+TEST(RawApiTest, TestMemoryInfo) {
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  Output result = ops::XRTMemoryInfo(root);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({result}, &outputs));
+  ASSERT_EQ(outputs.size(), 1);
+
+  xrt::MemoryInfo mem_info;
+  EXPECT_TRUE(ParseFromTString(outputs[0].scalar<tstring>()(), &mem_info));
+  EXPECT_GT(mem_info.kb_total(), 0);
+  EXPECT_GT(mem_info.kb_free(), 0);
+}
+
 }  // namespace
 
 }  // namespace tensorflow
@@ -268,3 +268,10 @@ message MetricValues {
 message MetricsReport {
   repeated MetricValues metrics = 1;
 }
+
+message MemoryInfo {
+  // The total memory on a device, in KB.
+  int64 kb_total = 1;
+  // The free memory on a device, in KB.
+  int64 kb_free = 2;
+}
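Since the XRTMemoryInfoOp kernel above stores -1 in these fields when the device does not expose memory information, consumers should treat negative values as unknown. A small illustrative helper, not part of the commit:

#include <cstdint>

#include "tensorflow/compiler/xrt/xrt.pb.h"

// Returns the used memory in KB, or -1 when either field is unknown
// (the XRTMemoryInfo kernel writes -1 for unavailable values).
int64_t UsedKb(const xrt::MemoryInfo& mem_info) {
  if (mem_info.kb_total() < 0 || mem_info.kb_free() < 0) {
    return -1;
  }
  return mem_info.kb_total() - mem_info.kb_free();
}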
@@ -348,15 +348,17 @@ double NominalCPUFrequency() {
   return absl::base_internal::NominalCPUFrequency();
 }
 
-int64 AvailableRam() {
+MemoryInfo GetMemoryInfo() {
+  MemoryInfo mem_info = {INT64_MAX, INT64_MAX};
 #if defined(__linux__) && !defined(__ANDROID__)
   struct sysinfo info;
   int err = sysinfo(&info);
   if (err == 0) {
-    return info.freeram;
+    mem_info.free = info.freeram;
+    mem_info.total = info.totalram;
   }
 #endif
-  return INT64_MAX;
+  return mem_info;
 }
 
 }  // namespace port
@@ -59,8 +59,18 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
+struct MemoryInfo {
+  int64 total = 0;
+  int64 free = 0;
+};
+
+// Retrieves the host memory information. If any of the fields in the returned
+// MemoryInfo structure is INT64_MAX, it means such information is not
+// available.
+MemoryInfo GetMemoryInfo();
+
 // Returns the amount of RAM available in bytes, or INT64_MAX if unknown.
-int64 AvailableRam();
+static inline int64 AvailableRam() { return GetMemoryInfo().free; }
 
 }  // namespace port
 }  // namespace tensorflow
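Outside of StreamExecutor, the new helper can also be called directly. A minimal sketch, assuming the declarations above live in tensorflow/core/platform/mem.h (the header path is not shown in this diff) and using logging purely for illustration:

#include <cstdint>

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"  // assumed location of port::GetMemoryInfo()

// Logs host memory, honoring the INT64_MAX "not available" convention.
void LogHostMemory() {
  tensorflow::port::MemoryInfo info = tensorflow::port::GetMemoryInfo();
  if (info.total == INT64_MAX || info.free == INT64_MAX) {
    LOG(INFO) << "Host memory information is not available on this platform";
    return;
  }
  LOG(INFO) << "Host RAM: " << info.free / (1024 * 1024) << " MiB free of "
            << info.total / (1024 * 1024) << " MiB total";
}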
@@ -17,6 +17,7 @@ limitations under the License.
 // class declaration].
 #include "tensorflow/stream_executor/host/host_gpu_executor.h"
 
+#include <stdint.h>
 #include <string.h>
 
 #include "absl/strings/numbers.h"
@@ -58,6 +59,13 @@ port::Status HostExecutor::Init(int device_ordinal,
   return port::Status::OK();
 }
 
+bool HostExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
+  tensorflow::port::MemoryInfo mem_info = tensorflow::port::GetMemoryInfo();
+  *free = (mem_info.free != INT64_MAX) ? mem_info.free : -1;
+  *total = (mem_info.total != INT64_MAX) ? mem_info.total : -1;
+  return true;
+}
+
 DeviceMemoryBase HostExecutor::Allocate(uint64 size, int64 memory_space) {
   CHECK_EQ(memory_space, 0);
   // Use a minimum alignment of 64 bytes to be friendly to AVX512 code.
@@ -130,9 +130,7 @@ class HostExecutor : public internal::StreamExecutorInterface {
 
   int PlatformDeviceCount() override { return 1; }
 
-  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
-    return false;
-  }
+  bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
 
   port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
       const override {