Fix the ImageDataGenerator `flow*` methods to return Keras `Sequence` objects instead of plain generators. This prevents Keras
from looping forever when users pass the result of `ImageDataGenerator.flow*` directly to `fit`/`evaluate`/`predict`.
PiperOrigin-RevId: 311028701 Change-Id: Ia5c3b01b3c8fa6b842bddb881ced64e4b89fe2ba
This commit is contained in:
parent
22a24beeee
commit
b53ed4d560
@ -85,6 +85,7 @@ tf_py_test(
|
||||
deps = [
|
||||
":image",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python/keras",
|
||||
"//third_party/py/numpy",
|
||||
],
|
||||
)
|
||||
|
@ -14,6 +14,7 @@
|
||||
# ==============================================================================
|
||||
# pylint: disable=invalid-name
|
||||
# pylint: disable=g-import-not-at-top
|
||||
# pylint: disable=g-classes-have-attributes
|
||||
"""Set of tools for real-time data augmentation on image data.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
@ -35,6 +36,7 @@ from tensorflow.python.keras.utils import data_utils
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import image_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.platform import tf_logging
|
||||
from tensorflow.python.util import tf_inspect
|
||||
from tensorflow.python.util.tf_export import keras_export
|
||||
|
||||
@ -459,6 +461,123 @@ class NumpyArrayIterator(image.NumpyArrayIterator, Iterator):
|
||||
**kwargs)
|
||||
|
||||
|
||||
class DataFrameIterator(image.DataFrameIterator, Iterator):
  """Batch iterator over images whose file paths are listed in a dataframe.

  Arguments:
      dataframe: Pandas dataframe holding, in a string column, the image file
        paths relative to `directory` (or absolute paths if `directory` is
        None). Which other column/s it must include depends on `class_mode`:
        - `"categorical"` (default value): the `y_col` column with the
          class/es of each image. Values can be string/list/tuple for a
          single class, or list/tuple for multiple classes.
        - `"binary"` or `"sparse"`: the given `y_col` column with class
          values as strings.
        - `"raw"` or `"multi_output"`: the columns specified in `y_col`.
        - `"input"` or `None`: no extra column is needed.
      directory: string, path to the directory to read images from. If
        `None`, the data in the `x_col` column should be absolute paths.
      image_data_generator: Instance of `ImageDataGenerator` used for random
        transformations and normalization. If None, no transformations and
        normalizations are made.
      x_col: string, column in `dataframe` that contains the filenames (or
        absolute paths if `directory` is `None`).
      y_col: string or list, column/s in `dataframe` that has the target
        data.
      weight_col: string, column in `dataframe` that contains the sample
        weights. Default: `None`.
      target_size: tuple of integers, dimensions to resize input images to.
      color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode used to
        read images.
      classes: Optional list of strings, classes to use (e.g. `["dogs",
        "cats"]`). If None, all classes in `y_col` will be used.
      class_mode: one of "binary", "categorical", "input", "multi_output",
        "raw", "sparse" or None. Default: "categorical". Mode for yielding
        the targets:
        - `"binary"`: 1D numpy array of binary labels,
        - `"categorical"`: 2D numpy array of one-hot encoded labels.
          Supports multi-label output.
        - `"input"`: images identical to input images (mainly used to work
          with autoencoders),
        - `"multi_output"`: list with the values of the different columns,
        - `"raw"`: numpy array of values in `y_col` column(s),
        - `"sparse"`: 1D numpy array of integer labels,
        - `None`: no targets are returned (the generator will only yield
          batches of image data, which is useful to use in
          `model.predict_generator()`).
      batch_size: Integer, size of a batch.
      shuffle: Boolean, whether to shuffle the data between epochs.
      seed: Random seed for data shuffling.
      data_format: String, one of `channels_first`, `channels_last`.
      save_to_dir: Optional directory where to save the pictures being
        yielded, in a viewable format. This is useful for visualizing the
        random transformations being applied, for debugging purposes.
      save_prefix: String prefix to use for saving sample images (if
        `save_to_dir` is set).
      save_format: Format to use for saving sample images (if `save_to_dir`
        is set).
      subset: Subset of data (`"training"` or `"validation"`) if
        validation_split is set in ImageDataGenerator.
      interpolation: Interpolation method used to resample the image if the
        target size is different from that of the loaded image. Supported
        methods are "nearest", "bilinear", and "bicubic". If PIL version
        1.1.3 or newer is installed, "lanczos" is also supported. If PIL
        version 3.4.0 or newer is installed, "box" and "hamming" are also
        supported. By default, "nearest" is used.
      dtype: Dtype to use for the generated arrays.
      validate_filenames: Boolean, whether to validate image filenames in
        `x_col`. If `True`, invalid images will be ignored. Disabling this
        option can lead to speed-up in the instantiation of this class.
        Default: `True`.
  """

  def __init__(
      self,
      dataframe,
      directory=None,
      image_data_generator=None,
      x_col='filename',
      y_col='class',
      weight_col=None,
      target_size=(256, 256),
      color_mode='rgb',
      classes=None,
      class_mode='categorical',
      batch_size=32,
      shuffle=True,
      seed=None,
      data_format='channels_last',
      save_to_dir=None,
      save_prefix='',
      save_format='png',
      subset=None,
      interpolation='nearest',
      dtype='float32',
      validate_filenames=True):
    # Every option is handed to the parent initializer unchanged and by
    # keyword; this subclass adds no state of its own here.
    parent_options = dict(
        dataframe=dataframe,
        directory=directory,
        image_data_generator=image_data_generator,
        x_col=x_col,
        y_col=y_col,
        weight_col=weight_col,
        target_size=target_size,
        color_mode=color_mode,
        classes=classes,
        class_mode=class_mode,
        batch_size=batch_size,
        shuffle=shuffle,
        seed=seed,
        data_format=data_format,
        save_to_dir=save_to_dir,
        save_prefix=save_prefix,
        save_format=save_format,
        subset=subset,
        interpolation=interpolation,
        dtype=dtype,
        validate_filenames=validate_filenames,
    )
    super(DataFrameIterator, self).__init__(**parent_options)
|
||||
|
||||
|
||||
@keras_export('keras.preprocessing.image.ImageDataGenerator')
|
||||
class ImageDataGenerator(image.ImageDataGenerator):
|
||||
"""Generate batches of tensor image data with real-time data augmentation.
|
||||
@ -686,6 +805,302 @@ class ImageDataGenerator(image.ImageDataGenerator):
|
||||
validation_split=validation_split,
|
||||
**kwargs)
|
||||
|
||||
def flow(self,
         x,
         y=None,
         batch_size=32,
         shuffle=True,
         sample_weight=None,
         seed=None,
         save_to_dir=None,
         save_prefix='',
         save_format='png',
         subset=None):
  """Builds an iterator of augmented batches from in-memory arrays.

  Arguments:
      x: Input data. Numpy array of rank 4 or a tuple. If tuple, the first
        element should contain the images and the second element another
        numpy array or a list of numpy arrays that gets passed to the output
        without any modifications. Can be used to feed the model
        miscellaneous data along with the images. In case of grayscale data,
        the channels axis of the image array should have value 1, in case of
        RGB data, it should have value 3, and in case of RGBA data, it
        should have value 4.
      y: Labels.
      batch_size: Int (default: 32).
      shuffle: Boolean (default: True).
      sample_weight: Sample weights.
      seed: Int (default: None).
      save_to_dir: None or str (default: None). Optional directory to which
        the augmented pictures being generated are saved (useful for
        visualizing what you are doing).
      save_prefix: Str (default: `''`). Prefix to use for filenames of saved
        pictures (only relevant if `save_to_dir` is set).
      save_format: one of "png", "jpeg" (only relevant if `save_to_dir` is
        set). Default: "png".
      subset: Subset of data (`"training"` or `"validation"`) if
        `validation_split` is set in `ImageDataGenerator`.

  Returns:
      A `NumpyArrayIterator` yielding tuples of `(x, y)` where `x` is a
      numpy array of image data (in the case of a single image input) or a
      list of numpy arrays (in the case with additional inputs) and `y` is a
      numpy array of corresponding labels. If 'sample_weight' is not None,
      the yielded tuples are of the form `(x, y, sample_weight)`. If `y` is
      None, only the numpy array `x` is returned.
  """
  # The generator itself (`self`) is passed along so the iterator can apply
  # its transformations; the data format always comes from this generator.
  iterator_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      sample_weight=sample_weight,
      seed=seed,
      data_format=self.data_format,
      save_to_dir=save_to_dir,
      save_prefix=save_prefix,
      save_format=save_format,
      subset=subset,
  )
  return NumpyArrayIterator(x, y, self, **iterator_options)
|
||||
|
||||
def flow_from_directory(self,
                        directory,
                        target_size=(256, 256),
                        color_mode='rgb',
                        classes=None,
                        class_mode='categorical',
                        batch_size=32,
                        shuffle=True,
                        seed=None,
                        save_to_dir=None,
                        save_prefix='',
                        save_format='png',
                        follow_links=False,
                        subset=None,
                        interpolation='nearest'):
  """Builds an iterator of augmented batches from images in a directory.

  Arguments:
      directory: string, path to the target directory. It should contain one
        subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images inside
        each of the subdirectories directory tree will be included in the
        generator. See [this script](
        https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d)
        for more details.
      target_size: Tuple of integers `(height, width)`, defaults to `(256,
        256)`. The dimensions to which all images found will be resized.
      color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". Whether
        the images will be converted to have 1, 3, or 4 channels.
      classes: Optional list of class subdirectories (e.g. `['dogs',
        'cats']`). Default: None. If not provided, the list of classes will
        be automatically inferred from the subdirectory names/structure
        under `directory`, where each subdirectory will be treated as a
        different class (and the order of the classes, which will map to the
        label indices, will be alphanumeric). The dictionary containing the
        mapping from class names to class indices can be obtained via the
        attribute `class_indices`.
      class_mode: One of "categorical", "binary", "sparse", "input", or
        None. Default: "categorical". Determines the type of label arrays
        that are returned:
        - "categorical" will be 2D one-hot encoded labels,
        - "binary" will be 1D binary labels,
        - "sparse" will be 1D integer labels,
        - "input" will be images identical to input images (mainly used to
          work with autoencoders).
        - If None, no labels are returned (the generator will only yield
          batches of image data, which is useful to use with
          `model.predict_generator()`). Please note that in case of
          class_mode None, the data still needs to reside in a subdirectory
          of `directory` for it to work correctly.
      batch_size: Size of the batches of data (default: 32).
      shuffle: Whether to shuffle the data (default: True). If set to False,
        sorts the data in alphanumeric order.
      seed: Optional random seed for shuffling and transformations.
      save_to_dir: None or str (default: None). Optional directory to which
        the augmented pictures being generated are saved (useful for
        visualizing what you are doing).
      save_prefix: Str. Prefix to use for filenames of saved pictures (only
        relevant if `save_to_dir` is set).
      save_format: One of "png", "jpeg" (only relevant if `save_to_dir` is
        set). Default: "png".
      follow_links: Whether to follow symlinks inside class subdirectories
        (default: False).
      subset: Subset of data (`"training"` or `"validation"`) if
        `validation_split` is set in `ImageDataGenerator`.
      interpolation: Interpolation method used to resample the image if the
        target size is different from that of the loaded image. Supported
        methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL
        version 1.1.3 or newer is installed, `"lanczos"` is also supported.
        If PIL version 3.4.0 or newer is installed, `"box"` and `"hamming"`
        are also supported. By default, `"nearest"` is used.

  Returns:
      A `DirectoryIterator` yielding tuples of `(x, y)` where `x` is a numpy
      array containing a batch of images with shape
      `(batch_size, *target_size, channels)` and `y` is a numpy array of
      corresponding labels.
  """
  # The generator itself (`self`) is passed along so the iterator can apply
  # its transformations; the data format always comes from this generator.
  iterator_options = dict(
      target_size=target_size,
      color_mode=color_mode,
      classes=classes,
      class_mode=class_mode,
      data_format=self.data_format,
      batch_size=batch_size,
      shuffle=shuffle,
      seed=seed,
      save_to_dir=save_to_dir,
      save_prefix=save_prefix,
      save_format=save_format,
      follow_links=follow_links,
      subset=subset,
      interpolation=interpolation,
  )
  return DirectoryIterator(directory, self, **iterator_options)
|
||||
|
||||
def flow_from_dataframe(self,
                        dataframe,
                        directory=None,
                        x_col='filename',
                        y_col='class',
                        weight_col=None,
                        target_size=(256, 256),
                        color_mode='rgb',
                        classes=None,
                        class_mode='categorical',
                        batch_size=32,
                        shuffle=True,
                        seed=None,
                        save_to_dir=None,
                        save_prefix='',
                        save_format='png',
                        subset=None,
                        interpolation='nearest',
                        validate_filenames=True,
                        **kwargs):
  """Takes the dataframe and the path to a directory + generates batches.

  The generated batches contain augmented/normalized data.

  **A simple tutorial can be found **[here](
                              http://bit.ly/keras_flow_from_dataframe).

  Arguments:
      dataframe: Pandas dataframe containing the filepaths relative to
        `directory` (or absolute paths if `directory` is None) of the images
        in a string column. It should include other column/s depending on
        the `class_mode`:
        - if `class_mode` is `"categorical"` (default value) it must include
          the `y_col` column with the class/es of each image. Values in
          column can be string/list/tuple if a single class or list/tuple if
          multiple classes.
        - if `class_mode` is `"binary"` or `"sparse"` it must include the
          given `y_col` column with class values as strings.
        - if `class_mode` is `"raw"` or `"multi_output"` it should contain
          the columns specified in `y_col`.
        - if `class_mode` is `"input"` or `None` no extra column is needed.
      directory: string, path to the directory to read images from. If
        `None`, data in `x_col` column should be absolute paths.
      x_col: string, column in `dataframe` that contains the filenames (or
        absolute paths if `directory` is `None`).
      y_col: string or list, column/s in `dataframe` that has the target
        data.
      weight_col: string, column in `dataframe` that contains the sample
        weights. Default: `None`.
      target_size: tuple of integers `(height, width)`, default: `(256,
        256)`. The dimensions to which all images found will be resized.
      color_mode: one of "grayscale", "rgb", "rgba". Default: "rgb". Whether
        the images will be converted to have 1, 3, or 4 color channels.
      classes: optional list of classes (e.g. `['dogs', 'cats']`). Default is
        None. If not provided, the list of classes will be automatically
        inferred from the `y_col` (the order of the classes, which will map
        to the label indices, will be alphanumeric). The dictionary
        containing the mapping from class names to class indices can be
        obtained via the attribute `class_indices`.
      class_mode: one of "binary", "categorical", "input", "multi_output",
        "raw", "sparse" or None. Default: "categorical". The deprecated
        value "other" is still accepted and is treated as "raw". Mode for
        yielding the targets:
        - `"binary"`: 1D numpy array of binary labels,
        - `"categorical"`: 2D numpy array of one-hot encoded labels.
          Supports multi-label output.
        - `"input"`: images identical to input images (mainly used to work
          with autoencoders),
        - `"multi_output"`: list with the values of the different columns,
        - `"raw"`: numpy array of values in `y_col` column(s),
        - `"sparse"`: 1D numpy array of integer labels,
        - `None`, no targets are returned (the generator will only yield
          batches of image data, which is useful to use in
          `model.predict_generator()`).
      batch_size: size of the batches of data (default: 32).
      shuffle: whether to shuffle the data (default: True)
      seed: optional random seed for shuffling and transformations.
      save_to_dir: None or str (default: None). This allows you to optionally
        specify a directory to which to save the augmented pictures being
        generated (useful for visualizing what you are doing).
      save_prefix: str. Prefix to use for filenames of saved pictures (only
        relevant if `save_to_dir` is set).
      save_format: one of "png", "jpeg" (only relevant if `save_to_dir` is
        set). Default: "png".
      subset: Subset of data (`"training"` or `"validation"`) if
        `validation_split` is set in `ImageDataGenerator`.
      interpolation: Interpolation method used to resample the image if the
        target size is different from that of the loaded image. Supported
        methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL
        version 1.1.3 or newer is installed, `"lanczos"` is also supported.
        If PIL version 3.4.0 or newer is installed, `"box"` and `"hamming"`
        are also supported. By default, `"nearest"` is used.
      validate_filenames: Boolean, whether to validate image filenames in
        `x_col`. If `True`, invalid images will be ignored. Disabling this
        option can lead to speed-up in the execution of this function.
        Defaults to `True`.
      **kwargs: legacy arguments (`has_ext`, `sort`, `drop_duplicates`); they
        only trigger a deprecation warning and are otherwise ignored.

  Returns:
      A `DataFrameIterator` yielding tuples of `(x, y)`
      where `x` is a numpy array containing a batch
      of images with shape `(batch_size, *target_size, channels)`
      and `y` is a numpy array of corresponding labels.
  """
  # `tf_logging.warn` follows the `logging` call convention: positional
  # arguments after the message are %-format arguments. These messages have
  # no placeholders, so passing `DeprecationWarning` as an extra argument
  # (as was done before) makes the logger fail while formatting instead of
  # emitting the warning. Only the message is passed.
  if 'has_ext' in kwargs:
    tf_logging.warn(
        'has_ext is deprecated, filenames in the dataframe have '
        'to match the exact filenames in disk.')
  if 'sort' in kwargs:
    # Each fragment ends with a space so the concatenated message reads
    # correctly.
    tf_logging.warn(
        'sort is deprecated, batches will be created in the '
        'same order as the filenames provided if shuffle '
        'is set to False.')
  if class_mode == 'other':
    tf_logging.warn(
        '`class_mode` "other" is deprecated, please use '
        '`class_mode` "raw".')
    # Keep legacy callers working by mapping the old name to its successor.
    class_mode = 'raw'
  if 'drop_duplicates' in kwargs:
    tf_logging.warn(
        'drop_duplicates is deprecated, you can drop duplicates '
        'by using the pandas.DataFrame.drop_duplicates method.')

  return DataFrameIterator(
      dataframe,
      directory,
      self,
      x_col=x_col,
      y_col=y_col,
      weight_col=weight_col,
      target_size=target_size,
      color_mode=color_mode,
      classes=classes,
      class_mode=class_mode,
      data_format=self.data_format,
      batch_size=batch_size,
      shuffle=shuffle,
      seed=seed,
      save_to_dir=save_to_dir,
      save_prefix=save_prefix,
      save_format=save_format,
      subset=subset,
      interpolation=interpolation,
      validate_filenames=validate_filenames)
|
||||
|
||||
|
||||
keras_export('keras.preprocessing.image.random_rotation')(random_rotation)
|
||||
keras_export('keras.preprocessing.image.random_shift')(random_shift)
|
||||
keras_export('keras.preprocessing.image.random_shear')(random_shear)
|
||||
|
@ -25,6 +25,9 @@ import tempfile
|
||||
import numpy as np
|
||||
|
||||
from tensorflow.python.framework import test_util
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import layers
|
||||
from tensorflow.python.keras.engine import sequential
|
||||
from tensorflow.python.keras.preprocessing import image as preprocessing_image
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
@ -52,7 +55,7 @@ def _generate_test_images():
|
||||
return [rgb_images, gray_images]
|
||||
|
||||
|
||||
class TestImage(test.TestCase):
|
||||
class TestImage(keras_parameterized.TestCase):
|
||||
|
||||
@test_util.run_v2_only
|
||||
def test_smart_resize(self):
|
||||
@ -319,14 +322,21 @@ class TestImage(test.TestCase):
|
||||
self.assertEqual(
|
||||
len(set(train_iterator.filenames) & set(filenames)), num_training)
|
||||
|
||||
model = sequential.Sequential([layers.Flatten(), layers.Dense(2)])
|
||||
model.compile(optimizer='sgd', loss='mse')
|
||||
model.fit(train_iterator, epochs=1)
|
||||
|
||||
shutil.rmtree(tmp_folder)
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
def test_directory_iterator_with_validation_split_25_percent(self):
  # Delegates to the shared validation-split helper with a split of 0.25.
  validation_split = 0.25
  self.directory_iterator_with_validation_split_test_helper(validation_split)
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
def test_directory_iterator_with_validation_split_40_percent(self):
  # Delegates to the shared validation-split helper with a split of 0.40.
  validation_split = 0.40
  self.directory_iterator_with_validation_split_test_helper(validation_split)
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
def test_directory_iterator_with_validation_split_50_percent(self):
  # Delegates to the shared validation-split helper with a split of 0.50.
  validation_split = 0.50
  self.directory_iterator_with_validation_split_test_helper(validation_split)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user