Remove tf.keras.utils.HDF5Matrix as its deprecation date is overdue.

PiperOrigin-RevId: 331848144
Change-Id: I72dbb6bf9aef527edf35b6d18278a5c1cf53fcda
Yanhui Liang 2020-09-15 13:58:47 -07:00 committed by TensorFlower Gardener
parent 94b9e540f4
commit 67548eff59
8 changed files with 2 additions and 329 deletions
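The deprecation notice on the deleted class recommends loading HDF5 data into a `tf.data.Dataset` via https://github.com/tensorflow/io and passing that dataset to Keras. As a rough sketch of what such a migration can look like without taking on the tensorflow-io dependency, plain `h5py` plus `tf.data.Dataset.from_generator` also works; the file name, dataset keys, shapes, and model below mirror the deleted test in this commit and are otherwise illustrative:

```python
import h5py
import tensorflow as tf


def hdf5_rows(path, data_key, label_key):
  """Yields one (features, label) row at a time, keeping the file on disk."""
  with h5py.File(path, 'r') as f:
    data, labels = f[data_key], f[label_key]
    for i in range(data.shape[0]):
      yield data[i], labels[i]


# Rows stream lazily, so the dataset never has to fit in memory.
dataset = tf.data.Dataset.from_generator(
    lambda: hdf5_rows('test.h5', 'my_data', 'my_labels'),
    output_types=(tf.float32, tf.int32),
    output_shapes=((10,), (1,)))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape=(10,), activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='sgd')
model.fit(dataset.batch(32))
```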

View File

@@ -423,8 +423,8 @@ class TensorLikeDataAdapter(DataAdapter):
 class GenericArrayLikeDataAdapter(TensorLikeDataAdapter):
   """Adapter that handles array-like data without forcing it into memory.
 
-  As an example, this adapter handles `keras.utils.HDF5Matrix` which holds
-  datasets that may be too big to fully fit into memory.
+  This adapter handles array-like datasets that may be too big to fully
+  fit into memory.
 
   Specifically, this adapter handles any Python class which implements:
   `__get_item__`, `__len__`, `shape`, and `dtype` with the same meanings
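The contract this docstring describes is small: any object exposing `__getitem__`, `__len__`, `shape`, and `dtype` can flow through the adapter. A hypothetical minimal example (the `MemmapArray` name and the `np.memmap` backing are illustrative assumptions, not part of this change):

```python
import numpy as np


class MemmapArray(object):
  """Array-like view of an on-disk buffer; rows are read lazily."""

  def __init__(self, path, shape, dtype='float32'):
    # memmap maps the file into the address space instead of reading it.
    self._data = np.memmap(path, dtype=dtype, mode='r', shape=shape)

  def __getitem__(self, key):
    return self._data[key]

  def __len__(self):
    return self._data.shape[0]

  @property
  def shape(self):
    return self._data.shape

  @property
  def dtype(self):
    return self._data.dtype
```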

View File

@@ -34,7 +34,6 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import get_custom_objects
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize

View File

@@ -18,21 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import os
 import sys
 
-import numpy as np
 import six
 
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import type_spec
-from tensorflow.python.util import deprecation
-from tensorflow.python.util.tf_export import keras_export
-
-try:
-  import h5py
-except ImportError:
-  h5py = None
-
 if sys.version_info >= (3, 6):
@@ -77,162 +66,6 @@ def path_to_string(path):
   return _path_to_string(path)
 
-@keras_export('keras.utils.HDF5Matrix')
-class HDF5Matrix(object):
-  """Representation of HDF5 dataset to be used instead of a Numpy array.
-
-  THIS CLASS IS DEPRECATED.
-  Training with HDF5Matrix may not be optimized for performance, and might
-  not work with every distribution strategy.
-
-  We recommend using https://github.com/tensorflow/io to load your
-  HDF5 data into a tf.data Dataset and passing that dataset to Keras.
-  """
-  refs = collections.defaultdict(int)
-
-  @deprecation.deprecated('2020-05-30', 'Training with '
-                          'HDF5Matrix is not optimized for performance. '
-                          'Instead, we recommend using '
-                          'https://github.com/tensorflow/io to load your '
-                          'HDF5 data into a tf.data Dataset and passing '
-                          'that dataset to Keras.')
-  def __init__(self, datapath, dataset, start=0, end=None, normalizer=None):
-    """Representation of HDF5 dataset to be used instead of a Numpy array.
-
-    Example:
-
-    ```python
-        x_data = HDF5Matrix('input/file.hdf5', 'data')
-        model.predict(x_data)
-    ```
-
-    Providing `start` and `end` allows use of a slice of the dataset.
-
-    Optionally, a normalizer function (or lambda) can be given. This will
-    be called on every slice of data retrieved.
-
-    Arguments:
-        datapath: string, path to a HDF5 file
-        dataset: string, name of the HDF5 dataset in the file specified
-            in datapath
-        start: int, start of desired slice of the specified dataset
-        end: int, end of desired slice of the specified dataset
-        normalizer: function to be called on data when retrieved
-
-    Returns:
-        An array-like HDF5 dataset.
-
-    Raises:
-      ImportError if HDF5 & h5py are not installed
-    """
-    if h5py is None:
-      raise ImportError('The use of HDF5Matrix requires '
-                        'HDF5 and h5py installed.')
-
-    if datapath not in list(self.refs.keys()):
-      f = h5py.File(datapath)
-      self.refs[datapath] = f
-    else:
-      f = self.refs[datapath]
-    self.data = f[dataset]
-    self.start = start
-    if end is None:
-      self.end = self.data.shape[0]
-    else:
-      self.end = end
-    self.normalizer = normalizer
-
-  def __len__(self):
-    return self.end - self.start
-
-  def __getitem__(self, key):
-    if isinstance(key, slice):
-      start, stop = key.start, key.stop
-      if start is None:
-        start = 0
-      if stop is None:
-        stop = self.shape[0]
-      if stop + self.start <= self.end:
-        idx = slice(start + self.start, stop + self.start)
-      else:
-        raise IndexError
-    elif isinstance(key, (int, np.integer)):
-      if key + self.start < self.end:
-        idx = key + self.start
-      else:
-        raise IndexError
-    elif isinstance(key, np.ndarray):
-      if np.max(key) + self.start < self.end:
-        idx = (self.start + key).tolist()
-      else:
-        raise IndexError
-    else:
-      # Assume list/iterable
-      if max(key) + self.start < self.end:
-        idx = [x + self.start for x in key]
-      else:
-        raise IndexError
-    if self.normalizer is not None:
-      return self.normalizer(self.data[idx])
-    else:
-      return self.data[idx]
-
-  @property
-  def shape(self):
-    """Gets a numpy-style shape tuple giving the dataset dimensions.
-
-    Returns:
-        A numpy-style shape tuple.
-    """
-    return (self.end - self.start,) + self.data.shape[1:]
-
-  @property
-  def dtype(self):
-    """Gets the datatype of the dataset.
-
-    Returns:
-        A numpy dtype string.
-    """
-    return self.data.dtype
-
-  @property
-  def ndim(self):
-    """Gets the number of dimensions (rank) of the dataset.
-
-    Returns:
-        An integer denoting the number of dimensions (rank) of the dataset.
-    """
-    return self.data.ndim
-
-  @property
-  def size(self):
-    """Gets the total dataset size (number of elements).
-
-    Returns:
-        An integer denoting the number of elements in the dataset.
-    """
-    return np.prod(self.shape)
-
-  @staticmethod
-  def _to_type_spec(value):
-    """Gets the Tensorflow TypeSpec corresponding to the passed dataset.
-
-    Args:
-      value: A HDF5Matrix object.
-
-    Returns:
-      A tf.TensorSpec.
-    """
-    if not isinstance(value, HDF5Matrix):
-      raise TypeError('Expected value to be a HDF5Matrix, but saw: {}'.format(
-          type(value)))
-    return tensor_spec.TensorSpec(shape=value.shape, dtype=value.dtype)
-
-
-type_spec.register_type_spec_from_value_converter(HDF5Matrix,
-                                                  HDF5Matrix._to_type_spec)  # pylint: disable=protected-access
 def ask_to_proceed_with_overwrite(filepath):
   """Produces a prompt asking about overwriting a file.

View File

@@ -18,110 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import shutil
 import sys
 
-import numpy as np
 import six
 
-from tensorflow.python import keras
 from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.utils import io_utils
 from tensorflow.python.platform import test
 
-try:
-  import h5py  # pylint:disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
-
-def create_dataset(h5_path='test.h5'):
-  x = np.random.randn(200, 10).astype('float32')
-  y = np.random.randint(0, 2, size=(200, 1))
-  f = h5py.File(h5_path, 'w')
-  # Creating dataset to store features
-  x_dset = f.create_dataset('my_data', (200, 10), dtype='f')
-  x_dset[:] = x
-  # Creating dataset to store labels
-  y_dset = f.create_dataset('my_labels', (200, 1), dtype='i')
-  y_dset[:] = y
-  f.close()
-
-
 class TestIOUtils(keras_parameterized.TestCase):
 
-  @keras_parameterized.run_all_keras_modes
-  def test_HDF5Matrix(self):
-    if h5py is None:
-      return
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir)
-
-    h5_path = os.path.join(temp_dir, 'test.h5')
-    create_dataset(h5_path)
-
-    # Instantiating HDF5Matrix for the training set,
-    # which is a slice of the first 150 elements
-    x_train = io_utils.HDF5Matrix(h5_path, 'my_data', start=0, end=150)
-    y_train = io_utils.HDF5Matrix(h5_path, 'my_labels', start=0, end=150)
-
-    # Likewise for the test set
-    x_test = io_utils.HDF5Matrix(h5_path, 'my_data', start=150, end=200)
-    y_test = io_utils.HDF5Matrix(h5_path, 'my_labels', start=150, end=200)
-
-    # HDF5Matrix behave more or less like Numpy matrices
-    # with regard to indexing
-    self.assertEqual(y_train.shape, (150, 1))
-    # But they do not support negative indices, so don't try print(x_train[-1])
-
-    self.assertEqual(y_train.dtype, np.dtype('i'))
-    self.assertEqual(y_train.ndim, 2)
-    self.assertEqual(y_train.size, 150)
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu'))
-    model.add(keras.layers.Dense(1, activation='sigmoid'))
-    model.compile(
-        loss='binary_crossentropy',
-        optimizer='sgd',
-        run_eagerly=testing_utils.should_run_eagerly())
-
-    # Note: you have to use shuffle='batch' or False with HDF5Matrix
-    model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False)
-    # test that evaluation and prediction
-    # don't crash and return reasonable results
-    out_pred = model.predict(x_test, batch_size=32, verbose=False)
-    out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False)
-
-    self.assertEqual(out_pred.shape, (50, 1))
-    self.assertGreater(out_eval, 0)
-
-    # test slicing for shortened array
-    self.assertEqual(len(x_train[0:]), len(x_train))
-
-    # test __getitem__ invalid use cases
-    with self.assertRaises(IndexError):
-      _ = x_train[1000]
-    with self.assertRaises(IndexError):
-      _ = x_train[1000: 1001]
-    with self.assertRaises(IndexError):
-      _ = x_train[[1000, 1001]]
-    with self.assertRaises(IndexError):
-      _ = x_train[six.moves.range(1000, 1001)]
-    with self.assertRaises(IndexError):
-      _ = x_train[np.array([1000])]
-    with self.assertRaises(TypeError):
-      _ = x_train[None]
-
-    # test normalizer
-    normalizer = lambda x: x + 1
-    normalized_x_train = io_utils.HDF5Matrix(
-        h5_path, 'my_data', start=0, end=150, normalizer=normalizer)
-    self.assertAllClose(normalized_x_train[0][0], x_train[0][0] + 1)
-
   def test_ask_to_proceed_with_overwrite(self):
     with test.mock.patch.object(six.moves, 'input') as mock_log:
       mock_log.return_value = 'y'

View File

@ -1,29 +0,0 @@
path: "tensorflow.keras.utils.HDF5Matrix"
tf_class {
is_instance: "<class \'tensorflow.python.keras.utils.io_utils.HDF5Matrix\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "ndim"
mtype: "<type \'property\'>"
}
member {
name: "refs"
mtype: "<type \'collections.defaultdict\'>"
}
member {
name: "shape"
mtype: "<type \'property\'>"
}
member {
name: "size"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'datapath\', \'dataset\', \'start\', \'end\', \'normalizer\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
}
}

View File

@@ -8,10 +8,6 @@ tf_module {
     name: "GeneratorEnqueuer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "HDF5Matrix"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "OrderedEnqueuer"
     mtype: "<type \'type\'>"

View File

@ -1,29 +0,0 @@
path: "tensorflow.keras.utils.HDF5Matrix"
tf_class {
is_instance: "<class \'tensorflow.python.keras.utils.io_utils.HDF5Matrix\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "ndim"
mtype: "<type \'property\'>"
}
member {
name: "refs"
mtype: "<type \'collections.defaultdict\'>"
}
member {
name: "shape"
mtype: "<type \'property\'>"
}
member {
name: "size"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'datapath\', \'dataset\', \'start\', \'end\', \'normalizer\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
}
}

View File

@@ -8,10 +8,6 @@ tf_module {
     name: "GeneratorEnqueuer"
     mtype: "<type \'type\'>"
  }
-  member {
-    name: "HDF5Matrix"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "OrderedEnqueuer"
     mtype: "<type \'type\'>"