Remove tf.keras.utils.HDF5Matrix now that its scheduled removal date (2020-05-30) has passed.

PiperOrigin-RevId: 331848144 Change-Id: I72dbb6bf9aef527edf35b6d18278a5c1cf53fcda
2020-09-15 13:58:47 -07:00 · 2020-09-15 13:58:47 -07:00 · 67548eff59
commit 67548eff59
parent 94b9e540f4
8 changed files with 2 additions and 329 deletions
--- a/tensorflow/python/keras/engine/data_adapter.py
+++ b/tensorflow/python/keras/engine/data_adapter.py
@@ -423,8 +423,8 @@ class TensorLikeDataAdapter(DataAdapter):
 class GenericArrayLikeDataAdapter(TensorLikeDataAdapter):
  """Adapter that handles array-like data without forcing it into memory.

-  As an example, this adapter handles `keras.utils.HDF5Matrix` which holds
-  datasets that may be too big to fully fit into memory.
+  This adapter handles array-like datasets that may be too big to fully
+  fit into memory.

  Specifically, this adapter handles any Python class which implements:
  `__getitem__`, `__len__`, `shape`, and `dtype` with the same meanings
--- a/tensorflow/python/keras/utils/all_utils.py
+++ b/tensorflow/python/keras/utils/all_utils.py
@@ -34,7 +34,6 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import get_custom_objects
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
--- a/tensorflow/python/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/utils/io_utils.py
@@ -18,21 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import collections
 import os
 import sys

-import numpy as np
 import six
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import type_spec
-from tensorflow.python.util import deprecation
-from tensorflow.python.util.tf_export import keras_export
-
-try:
-  import h5py
-except ImportError:
-  h5py = None


 if sys.version_info >= (3, 6):
@@ -77,162 +66,6 @@ def path_to_string(path):
  return _path_to_string(path)


-@keras_export('keras.utils.HDF5Matrix')
-class HDF5Matrix(object):
-  """Representation of HDF5 dataset to be used instead of a Numpy array.
-
-  THIS CLASS IS DEPRECATED.
-  Training with HDF5Matrix may not be optimized for performance, and might
-  not work with every distribution strategy.
-
-  We recommend using https://github.com/tensorflow/io to load your
-  HDF5 data into a tf.data Dataset and passing that dataset to Keras.
-  """
-  refs = collections.defaultdict(int)
-
-  @deprecation.deprecated('2020-05-30', 'Training with '
-                          'HDF5Matrix is not optimized for performance. '
-                          'Instead, we recommend using '
-                          'https://github.com/tensorflow/io to load your '
-                          'HDF5 data into a tf.data Dataset and passing '
-                          'that dataset to Keras.')
-  def __init__(self, datapath, dataset, start=0, end=None, normalizer=None):
-    """Representation of HDF5 dataset to be used instead of a Numpy array.
-
-    Example:
-
-    ```python
-        x_data = HDF5Matrix('input/file.hdf5', 'data')
-        model.predict(x_data)
-    ```
-
-    Providing `start` and `end` allows use of a slice of the dataset.
-
-    Optionally, a normalizer function (or lambda) can be given. This will
-    be called on every slice of data retrieved.
-
-    Arguments:
-        datapath: string, path to a HDF5 file
-        dataset: string, name of the HDF5 dataset in the file specified
-            in datapath
-        start: int, start of desired slice of the specified dataset
-        end: int, end of desired slice of the specified dataset
-        normalizer: function to be called on data when retrieved
-
-    Returns:
-        An array-like HDF5 dataset.
-
-    Raises:
-      ImportError if HDF5 & h5py are not installed
-    """
-    if h5py is None:
-      raise ImportError('The use of HDF5Matrix requires '
-                        'HDF5 and h5py installed.')
-
-    if datapath not in list(self.refs.keys()):
-      f = h5py.File(datapath)
-      self.refs[datapath] = f
-    else:
-      f = self.refs[datapath]
-    self.data = f[dataset]
-    self.start = start
-    if end is None:
-      self.end = self.data.shape[0]
-    else:
-      self.end = end
-    self.normalizer = normalizer
-
-  def __len__(self):
-    return self.end - self.start
-
-  def __getitem__(self, key):
-    if isinstance(key, slice):
-      start, stop = key.start, key.stop
-      if start is None:
-        start = 0
-      if stop is None:
-        stop = self.shape[0]
-      if stop + self.start <= self.end:
-        idx = slice(start + self.start, stop + self.start)
-      else:
-        raise IndexError
-    elif isinstance(key, (int, np.integer)):
-      if key + self.start < self.end:
-        idx = key + self.start
-      else:
-        raise IndexError
-    elif isinstance(key, np.ndarray):
-      if np.max(key) + self.start < self.end:
-        idx = (self.start + key).tolist()
-      else:
-        raise IndexError
-    else:
-      # Assume list/iterable
-      if max(key) + self.start < self.end:
-        idx = [x + self.start for x in key]
-      else:
-        raise IndexError
-    if self.normalizer is not None:
-      return self.normalizer(self.data[idx])
-    else:
-      return self.data[idx]
-
-  @property
-  def shape(self):
-    """Gets a numpy-style shape tuple giving the dataset dimensions.
-
-    Returns:
-        A numpy-style shape tuple.
-    """
-    return (self.end - self.start,) + self.data.shape[1:]
-
-  @property
-  def dtype(self):
-    """Gets the datatype of the dataset.
-
-    Returns:
-        A numpy dtype string.
-    """
-    return self.data.dtype
-
-  @property
-  def ndim(self):
-    """Gets the number of dimensions (rank) of the dataset.
-
-    Returns:
-        An integer denoting the number of dimensions (rank) of the dataset.
-    """
-    return self.data.ndim
-
-  @property
-  def size(self):
-    """Gets the total dataset size (number of elements).
-
-    Returns:
-        An integer denoting the number of elements in the dataset.
-    """
-    return np.prod(self.shape)
-
-  @staticmethod
-  def _to_type_spec(value):
-    """Gets the Tensorflow TypeSpec corresponding to the passed dataset.
-
-    Args:
-      value: A HDF5Matrix object.
-
-    Returns:
-      A tf.TensorSpec.
-    """
-    if not isinstance(value, HDF5Matrix):
-      raise TypeError('Expected value to be a HDF5Matrix, but saw: {}'.format(
-          type(value)))
-    return tensor_spec.TensorSpec(shape=value.shape, dtype=value.dtype)
-
-
-type_spec.register_type_spec_from_value_converter(HDF5Matrix,
-                                                  HDF5Matrix._to_type_spec)  # pylint: disable=protected-access
-
-
 def ask_to_proceed_with_overwrite(filepath):
  """Produces a prompt asking about overwriting a file.

--- a/tensorflow/python/keras/utils/io_utils_test.py
+++ b/tensorflow/python/keras/utils/io_utils_test.py
@@ -18,110 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
-import shutil
 import sys

-import numpy as np
 import six

-from tensorflow.python import keras
 from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.utils import io_utils
 from tensorflow.python.platform import test

-try:
-  import h5py  # pylint:disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
-
-def create_dataset(h5_path='test.h5'):
-  x = np.random.randn(200, 10).astype('float32')
-  y = np.random.randint(0, 2, size=(200, 1))
-  f = h5py.File(h5_path, 'w')
-  # Creating dataset to store features
-  x_dset = f.create_dataset('my_data', (200, 10), dtype='f')
-  x_dset[:] = x
-  # Creating dataset to store labels
-  y_dset = f.create_dataset('my_labels', (200, 1), dtype='i')
-  y_dset[:] = y
-  f.close()
-

 class TestIOUtils(keras_parameterized.TestCase):

-  @keras_parameterized.run_all_keras_modes
-  def test_HDF5Matrix(self):
-    if h5py is None:
-      return
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir)
-
-    h5_path = os.path.join(temp_dir, 'test.h5')
-    create_dataset(h5_path)
-
-    # Instantiating HDF5Matrix for the training set,
-    # which is a slice of the first 150 elements
-    x_train = io_utils.HDF5Matrix(h5_path, 'my_data', start=0, end=150)
-    y_train = io_utils.HDF5Matrix(h5_path, 'my_labels', start=0, end=150)
-
-    # Likewise for the test set
-    x_test = io_utils.HDF5Matrix(h5_path, 'my_data', start=150, end=200)
-    y_test = io_utils.HDF5Matrix(h5_path, 'my_labels', start=150, end=200)
-
-    # HDF5Matrix behave more or less like Numpy matrices
-    # with regard to indexing
-    self.assertEqual(y_train.shape, (150, 1))
-    # But they do not support negative indices, so don't try print(x_train[-1])
-
-    self.assertEqual(y_train.dtype, np.dtype('i'))
-    self.assertEqual(y_train.ndim, 2)
-    self.assertEqual(y_train.size, 150)
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu'))
-    model.add(keras.layers.Dense(1, activation='sigmoid'))
-    model.compile(
-        loss='binary_crossentropy',
-        optimizer='sgd',
-        run_eagerly=testing_utils.should_run_eagerly())
-
-    # Note: you have to use shuffle='batch' or False with HDF5Matrix
-    model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False)
-    # test that evaluation and prediction
-    # don't crash and return reasonable results
-    out_pred = model.predict(x_test, batch_size=32, verbose=False)
-    out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False)
-
-    self.assertEqual(out_pred.shape, (50, 1))
-    self.assertGreater(out_eval, 0)
-
-    # test slicing for shortened array
-    self.assertEqual(len(x_train[0:]), len(x_train))
-
-    # test __getitem__ invalid use cases
-    with self.assertRaises(IndexError):
-      _ = x_train[1000]
-    with self.assertRaises(IndexError):
-      _ = x_train[1000: 1001]
-    with self.assertRaises(IndexError):
-      _ = x_train[[1000, 1001]]
-    with self.assertRaises(IndexError):
-      _ = x_train[six.moves.range(1000, 1001)]
-    with self.assertRaises(IndexError):
-      _ = x_train[np.array([1000])]
-    with self.assertRaises(TypeError):
-      _ = x_train[None]
-
-    # test normalizer
-    normalizer = lambda x: x + 1
-    normalized_x_train = io_utils.HDF5Matrix(
-        h5_path, 'my_data', start=0, end=150, normalizer=normalizer)
-    self.assertAllClose(normalized_x_train[0][0], x_train[0][0] + 1)
-
  def test_ask_to_proceed_with_overwrite(self):
    with test.mock.patch.object(six.moves, 'input') as mock_log:
      mock_log.return_value = 'y'
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
@@ -1,29 +0,0 @@
-path: "tensorflow.keras.utils.HDF5Matrix"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.utils.io_utils.HDF5Matrix\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ndim"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "refs"
-    mtype: "<type \'collections.defaultdict\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "size"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'datapath\', \'dataset\', \'start\', \'end\', \'normalizer\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
-  }
-}
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
@@ -8,10 +8,6 @@ tf_module {
    name: "GeneratorEnqueuer"
    mtype: "<type \'type\'>"
  }
-  member {
-    name: "HDF5Matrix"
-    mtype: "<type \'type\'>"
-  }
  member {
    name: "OrderedEnqueuer"
    mtype: "<type \'type\'>"
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
@@ -1,29 +0,0 @@
-path: "tensorflow.keras.utils.HDF5Matrix"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.utils.io_utils.HDF5Matrix\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ndim"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "refs"
-    mtype: "<type \'collections.defaultdict\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "size"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'datapath\', \'dataset\', \'start\', \'end\', \'normalizer\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
-  }
-}
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
@@ -8,10 +8,6 @@ tf_module {
    name: "GeneratorEnqueuer"
    mtype: "<type \'type\'>"
  }
-  member {
-    name: "HDF5Matrix"
-    mtype: "<type \'type\'>"
-  }
  member {
    name: "OrderedEnqueuer"
    mtype: "<type \'type\'>"