Add function tf.config.experimental.get_memory_info.

PiperOrigin-RevId: 353324579 Change-Id: I71bfeb9fd08a6834ef07ccabb359065aec1ba641
2021-01-22 14:54:00 -08:00 · 2021-01-22 14:54:00 -08:00 · 697f117f36
commit 697f117f36
parent 6bb7f19c3d
7 changed files with 108 additions and 13 deletions
--- a/RELEASE.md
+++ b/RELEASE.md
@ -97,6 +97,9 @@
        `tf.while_loop`, and compositions like `tf.foldl`) computed with
        `tf.GradientTape` inside a `tf.function`.
    *   Changed the default step size in `gradient_checker_v2.compute_gradients` to be exactly representable as a binary floating point numbers. This avoids poluting gradient approximations needlessly, which is some cases leads to false negatives in op gradient tests.
+    * Added `tf.config.experimental.get_memory_info`, returning a dict with the
+      current and peak memory usage. Deprecated 
+      `tf.config.experimental.get_memory_usage` in favor of this new function.

 *   `tf.summary`:
  *   New `tf.summary.graph` allows manual write of TensorFlow graph
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@ -1438,11 +1438,16 @@ class Context(object):

    self._visible_device_list = visible_device_list

-  def get_total_memory_usage(self, dev):
-    """Returns total memory usage in bytes for the current device."""
+  def get_memory_info(self, dev):
+    """Returns a dict of memory info for the device."""
    self._initialize_physical_devices()
    self.ensure_initialized()
-    return pywrap_tfe.TFE_GetTotalMemoryUsage(self._context_handle, dev)
+    return pywrap_tfe.TFE_GetMemoryInfo(self._context_handle, dev)
+
+  # TODO(reedwm): Remove this function
+  def get_total_memory_usage(self, dev):
+    """Returns total memory usage in bytes for the current device."""
+    return self.get_memory_info(dev)["current"]

  def get_memory_growth(self, dev):
    """Get if memory growth is enabled for a PhysicalDevice."""
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@ -510,9 +510,58 @@ def set_visible_devices(devices, device_type=None):
  context.context().set_visible_devices(devices, device_type)


+@tf_export('config.experimental.get_memory_info')
+def get_memory_info(device):
+  """Get memory info for the chosen device, as a dict.
+
+  This function returns a dict containing information about the device's memory
+  usage. For example:
+
+  >>> if tf.config.list_physical_devices('GPU'):
+  ...   # Returns a dict in the form {'current': <current mem usage>,
+  ...   #                             'peak': <peak mem usage>}
+  ...   tf.config.experimental.get_memory_info('GPU:0')
+
+  Currently returns the following keys:
+    `'current'`: The current memory used by the device, in bytes.
+    `'peak'`: The peak memory used by the device across the run of the program,
+        in bytes.
+
+  More keys may be added in the future, including device-specific keys.
+
+  Currently raises an exception for the CPU.
+
+  For GPUs, TensorFlow will allocate all the memory by default, unless changed
+  with `tf.config.experimental.set_memory_growth`. The dict specifies only the
+  current and peak memory that TensorFlow is actually using, not the memory that
+  TensorFlow has allocated on the GPU.
+
+  Args:
+    device: Device string to get the memory information for, e.g. `"GPU:0"`. See
+      https://www.tensorflow.org/api_docs/python/tf/device for specifying device
+      strings.
+
+  Returns:
+    A dict with keys `'current'` and `'peak'`, specifying the current and peak
+    memory usage respectively.
+
+  Raises:
+    ValueError: Non-existent or CPU device specified.
+
+  """
+  return context.context().get_memory_info(device)
+
+
+@deprecation.deprecated(
+    None,
+    "Use tf.config.experimental.get_memory_info(device)['current'] instead.")
@tf_export('config.experimental.get_memory_usage')
 def get_memory_usage(device):
-  """Get the memory usage, in bytes, for the chosen device.
+  """Get the current memory usage, in bytes, for the chosen device.
+
+  This function is deprecated in favor of
+  `tf.config.experimental.get_memory_info`. Calling this function is equivalent
+  to calling `tf.config.experimental.get_memory_info()['current']`.

  See https://www.tensorflow.org/api_docs/python/tf/device for specifying device
  strings.
@ -525,8 +574,13 @@ def get_memory_usage(device):

  Does not work for CPU.

+  For GPUs, TensorFlow will allocate all the memory by default, unless changed
+  with `tf.config.experimental.set_memory_growth`. This function only returns
+  the memory that TensorFlow is actually using, not the memory that TensorFlow
+  has allocated on the GPU.
+
  Args:
-    device: Device string to get the bytes in use for.
+    device: Device string to get the bytes in use for, e.g. `"GPU:0"`

  Returns:
    Total memory usage in bytes.
@ -534,7 +588,7 @@ def get_memory_usage(device):
  Raises:
    ValueError: Non-existent or CPU device specified.
  """
-  return context.context().get_total_memory_usage(device)
+  return get_memory_info(device)['current']


@tf_export('config.experimental.get_memory_growth')
--- a/tensorflow/python/framework/config_test.py
+++ b/tensorflow/python/framework/config_test.py
@ -599,25 +599,48 @@ class DeviceTest(test.TestCase):

  @test_util.run_gpu_only
  @reset_eager
-  def testGetMemoryUsage(self):
+  def testGetMemoryInfoBasic(self):
    device = array_ops.zeros([]).backing_device
-    self.assertGreater(config.get_memory_usage(device), 0)
+    info = config.get_memory_info(device)
+    self.assertGreater(info['current'], 0)
+    self.assertGreater(info['peak'], 0)
+    self.assertEqual(info.keys(), {'current', 'peak'})
+    self.assertEqual(config.get_memory_usage(device), info['current'])

  @test_util.run_gpu_only
  @reset_eager
  def testGetMemoryUsageSubstring(self):
-    self.assertGreater(config.get_memory_usage('GPU:0'), 0)
+    info = config.get_memory_info('GPU:0')
+    self.assertGreater(info['current'], 0)

  @reset_eager
-  def testGetMemoryUsageCPU(self):
+  def testGetMemoryInfoCPU(self):
+    with self.assertRaisesRegex(ValueError, 'CPU does not support'):
+      config.get_memory_info('CPU:0')
    with self.assertRaisesRegex(ValueError, 'CPU does not support'):
      config.get_memory_usage('CPU:0')

  @reset_eager
-  def testGetMemoryUsageUnknownDevice(self):
+  def testGetMemoryInfoUnknownDevice(self):
+    with self.assertRaisesRegex(ValueError, 'Failed parsing device name'):
+      config.get_memory_info('unknown_device')
    with self.assertRaisesRegex(ValueError, 'Failed parsing device name'):
      config.get_memory_usage('unknown_device')

+  @test_util.run_gpu_only
+  @reset_eager
+  def testPeakMemoryUsage(self):
+    x1 = array_ops.zeros((1000, 1000))
+    peak1 = config.get_memory_info('GPU:0')['peak']
+    self.assertGreaterEqual(peak1, 4 * 1000 * 1000)
+    x2 = array_ops.ones((1000, 1000))
+    peak2 = config.get_memory_info('GPU:0')['peak']
+    self.assertGreaterEqual(peak2, peak1 + 4 * 1000 * 1000)
+    del x1, x2  # With CPython, causes tensor memory to be immediately freed
+    peak3 = config.get_memory_info('GPU:0')['peak']
+    self.assertGreaterEqual(peak3, peak2)
+    self.assertGreaterEqual(peak3, config.get_memory_info('GPU:0')['current'])
+
  @test_util.run_gpu_only
  @reset_eager
  def testGetMemoryUsageAmbiguousDevice(self):
--- a/tensorflow/python/tfe_wrapper.cc
+++ b/tensorflow/python/tfe_wrapper.cc
@ -517,7 +517,7 @@ PYBIND11_MODULE(_pywrap_tfe, m) {
  });

  m.def(
-      "TFE_GetTotalMemoryUsage", [](py::handle& ctx, const char* device_name) {
+      "TFE_GetMemoryInfo", [](py::handle& ctx, const char* device_name) {
        tensorflow::EagerContext* context = tensorflow::ContextFromInterface(
            reinterpret_cast<tensorflow::ImmediateExecutionContext*>(
                tensorflow::InputTFE_Context(ctx)));
@ -568,7 +568,9 @@ PYBIND11_MODULE(_pywrap_tfe, m) {

        if (absl::optional<tensorflow::AllocatorStats> stats =
                allocator->GetStats()) {
-          return stats->bytes_in_use;
+          return std::map<std::string, int64_t>{
+              {"current", stats->bytes_in_use},
+              {"peak", stats->peak_bytes_in_use}};
        }

        tensorflow::ThrowTypeError(
--- a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.pbtxt
@ -40,6 +40,10 @@ tf_module {
    name: "get_memory_growth"
    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"
  }
+  member_method {
+    name: "get_memory_info"
+    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"
+  }
  member_method {
    name: "get_memory_usage"
    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"
--- a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.pbtxt
@ -40,6 +40,10 @@ tf_module {
    name: "get_memory_growth"
    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"
  }
+  member_method {
+    name: "get_memory_info"
+    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"
+  }
  member_method {
    name: "get_memory_usage"
    argspec: "args=[\'device\'], varargs=None, keywords=None, defaults=None"