-spectrogram augmentations
This commit is contained in:
parent
5d5ef15ab7
commit
0cc5ff230f
|
@ -16,6 +16,7 @@ from util.config import Config
|
||||||
from util.logging import log_error
|
from util.logging import log_error
|
||||||
from util.text import text_to_char_array
|
from util.text import text_to_char_array
|
||||||
from util.flags import FLAGS
|
from util.flags import FLAGS
|
||||||
|
from util.spectrogram_augmentations import augment_sparse_deform, augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up
|
||||||
|
|
||||||
def read_csvs(csv_files):
|
def read_csvs(csv_files):
|
||||||
source_data = None
|
source_data = None
|
||||||
|
@ -36,6 +37,32 @@ def samples_to_mfccs(samples, sample_rate):
|
||||||
window_size=Config.audio_window_samples,
|
window_size=Config.audio_window_samples,
|
||||||
stride=Config.audio_step_samples,
|
stride=Config.audio_step_samples,
|
||||||
magnitude_squared=True)
|
magnitude_squared=True)
|
||||||
|
|
||||||
|
if FLAGS.augmention_sparse_deform:
|
||||||
|
spectrogram = augment_sparse_deform(spectrogram,
|
||||||
|
time_warping_para=FLAGS.augmentation_time_warp_max_warping,
|
||||||
|
normal_around_warping_std=FLAGS.augmentation_sparse_deform_std_warp)
|
||||||
|
|
||||||
|
if FLAGS.augmentation_spec_dropout_keeprate < 1:
|
||||||
|
spectrogram = augment_dropout(spectrogram,
|
||||||
|
keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
|
||||||
|
|
||||||
|
if FLAGS.augmentation_freq_and_time_masking:
|
||||||
|
spectrogram = augment_freq_time_mask(spectrogram,
|
||||||
|
frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
|
||||||
|
time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
|
||||||
|
frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
|
||||||
|
time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
|
||||||
|
|
||||||
|
if FLAGS.augmentation_pitch_and_tempo_scaling:
|
||||||
|
spectrogram = augment_pitch_and_tempo(spectrogram,
|
||||||
|
max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
|
||||||
|
max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
|
||||||
|
min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
|
||||||
|
|
||||||
|
if FLAGS.augmentation_speed_up_std > 0:
|
||||||
|
spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)
|
||||||
|
|
||||||
mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
|
mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
|
||||||
mfccs = tf.reshape(mfccs, [-1, Config.n_input])
|
mfccs = tf.reshape(mfccs, [-1, Config.n_input])
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,25 @@ def create_flags():
|
||||||
f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
|
f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
|
||||||
f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
|
f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
|
||||||
|
|
||||||
|
f.DEFINE_integer('augmention_sparse_deform', 0, 'whether to use time-warping augmentation')
|
||||||
|
f.DEFINE_integer('augmentation_time_warp_max_warping', 12, 'max value for warping')
|
||||||
|
f.DEFINE_float('augmentation_sparse_deform_std_warp', 0.5, 'std for warping different values to different frequencies')
|
||||||
|
|
||||||
|
f.DEFINE_float('augmentation_spec_dropout_keeprate', 1, 'keep rate of dropout augmentation on spectrogram (if 1, no dropout will be performed on spectrogram)')
|
||||||
|
|
||||||
|
f.DEFINE_integer('augmentation_freq_and_time_masking', 0, 'whether to use frequency and time masking augmentation')
|
||||||
|
f.DEFINE_integer('augmentation_freq_and_time_masking_freq_mask_range', 5, 'max range of masks in the frequency domain when performing freqtime-mask augmentation')
|
||||||
|
f.DEFINE_integer('augmentation_freq_and_time_masking_number_freq_masks', 3, 'number of masks in the frequency domain when performing freqtime-mask augmentation')
|
||||||
|
f.DEFINE_integer('augmentation_freq_and_time_masking_time_mask_range', 2, 'max range of masks in the time domain when performing freqtime-mask augmentation')
|
||||||
|
f.DEFINE_integer('augmentation_freq_and_time_masking_number_time_masks', 3, 'number of masks in the time domain when performing freqtime-mask augmentation')
|
||||||
|
|
||||||
|
f.DEFINE_float('augmentation_speed_up_std', 0.5, 'std for speeding-up tempo. If std is 0, this augmentation is not performed')
|
||||||
|
|
||||||
|
f.DEFINE_integer('augmentation_pitch_and_tempo_scaling', 0, 'whether to use spectrogram speed and tempo scaling')
|
||||||
|
f.DEFINE_float('augmentation_pitch_and_tempo_scaling_min_pitch', 0.95, 'min value of pitch scaling')
|
||||||
|
f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling')
|
||||||
|
f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling')
|
||||||
|
|
||||||
|
|
||||||
# Global Constants
|
# Global Constants
|
||||||
# ================
|
# ================
|
||||||
|
|
|
@ -0,0 +1,177 @@
|
||||||
|
## Implementation of sparse_image_warp that handles dynamic shapes
|
||||||
|
from tensorflow.contrib.image.python.ops import dense_image_warp
|
||||||
|
from tensorflow.contrib.image.python.ops import interpolate_spline
|
||||||
|
|
||||||
|
from tensorflow.python.framework import dtypes
|
||||||
|
from tensorflow.python.framework import ops
|
||||||
|
from tensorflow.python.framework import tensor_shape
|
||||||
|
from tensorflow.python.ops import array_ops
|
||||||
|
from tensorflow.python.ops import math_ops
|
||||||
|
|
||||||
|
|
||||||
|
def _get_grid_locations(image_height, image_width):
|
||||||
|
"""Wrapper for array_ops.meshgrid."""
|
||||||
|
|
||||||
|
y_range = math_ops.linspace(0.0, math_ops.to_float(image_height) - 1,
|
||||||
|
image_height)
|
||||||
|
x_range = math_ops.linspace(0.0, math_ops.to_float(image_width) - 1,
|
||||||
|
image_width)
|
||||||
|
y_grid, x_grid = array_ops.meshgrid(y_range, x_range, indexing='ij')
|
||||||
|
return array_ops.stack((y_grid, x_grid), -1)
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_to_minibatch(array, batch_size):
|
||||||
|
"""Tile arbitrarily-sized array to include new batch dimension."""
|
||||||
|
batch_size = array_ops.expand_dims(batch_size, 0)
|
||||||
|
array_ones = array_ops.ones((array_ops.rank(array)), dtype=dtypes.int32)
|
||||||
|
tiles = array_ops.concat([batch_size, array_ones], axis=0)
|
||||||
|
return array_ops.tile(array_ops.expand_dims(array, 0), tiles)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_boundary_locations(image_height, image_width, num_points_per_edge):
|
||||||
|
"""Compute evenly-spaced indices along edge of image."""
|
||||||
|
image_height = math_ops.to_float(image_height)
|
||||||
|
image_width = math_ops.to_float(image_width)
|
||||||
|
y_range = math_ops.linspace(0.0, image_height - 1, num_points_per_edge + 2)
|
||||||
|
x_range = math_ops.linspace(0.0, image_width - 1, num_points_per_edge + 2)
|
||||||
|
ys, xs = array_ops.meshgrid(y_range, x_range, indexing='ij')
|
||||||
|
is_boundary = math_ops.logical_or(
|
||||||
|
math_ops.logical_or(math_ops.equal(xs, 0), # pylint: disable=bad-continuation
|
||||||
|
math_ops.equal(xs, image_width - 1)),
|
||||||
|
math_ops.logical_or(math_ops.equal(ys, 0), # pylint: disable=bad-continuation
|
||||||
|
math_ops.equal(ys, image_height - 1)))
|
||||||
|
return array_ops.stack([array_ops.boolean_mask(ys, is_boundary),
|
||||||
|
array_ops.boolean_mask(xs, is_boundary)], axis=-1)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_zero_flow_controls_at_boundary(control_point_locations,
|
||||||
|
control_point_flows, image_height,
|
||||||
|
image_width, boundary_points_per_edge):
|
||||||
|
"""Add control points for zero-flow boundary conditions.
|
||||||
|
Augment the set of control points with extra points on the
|
||||||
|
boundary of the image that have zero flow.
|
||||||
|
Args:
|
||||||
|
control_point_locations: input control points
|
||||||
|
control_point_flows: their flows
|
||||||
|
image_height: image height
|
||||||
|
image_width: image width
|
||||||
|
boundary_points_per_edge: number of points to add in the middle of each
|
||||||
|
edge (not including the corners).
|
||||||
|
The total number of points added is
|
||||||
|
4 + 4*(boundary_points_per_edge).
|
||||||
|
Returns:
|
||||||
|
merged_control_point_locations: augmented set of control point locations
|
||||||
|
merged_control_point_flows: augmented set of control point flows
|
||||||
|
"""
|
||||||
|
|
||||||
|
batch_size = tensor_shape.dimension_value(control_point_locations.shape[0])
|
||||||
|
|
||||||
|
boundary_point_locations = _get_boundary_locations(image_height, image_width,
|
||||||
|
boundary_points_per_edge)
|
||||||
|
|
||||||
|
boundary_point_flows = array_ops.zeros([array_ops.shape(boundary_point_locations)[0], 2])
|
||||||
|
|
||||||
|
boundary_point_locations = _expand_to_minibatch(boundary_point_locations,
|
||||||
|
batch_size)
|
||||||
|
|
||||||
|
boundary_point_flows = _expand_to_minibatch(boundary_point_flows, batch_size)
|
||||||
|
|
||||||
|
merged_control_point_locations = array_ops.concat([control_point_locations, boundary_point_locations], 1)
|
||||||
|
|
||||||
|
merged_control_point_flows = array_ops.concat([control_point_flows, boundary_point_flows], 1)
|
||||||
|
|
||||||
|
return merged_control_point_locations, merged_control_point_flows
|
||||||
|
|
||||||
|
|
||||||
|
def sparse_image_warp(image,
|
||||||
|
source_control_point_locations,
|
||||||
|
dest_control_point_locations,
|
||||||
|
interpolation_order=2,
|
||||||
|
regularization_weight=0.0,
|
||||||
|
num_boundary_points=0,
|
||||||
|
name='sparse_image_warp'):
|
||||||
|
"""Image warping using correspondences between sparse control points.
|
||||||
|
Apply a non-linear warp to the image, where the warp is specified by
|
||||||
|
the source and destination locations of a (potentially small) number of
|
||||||
|
control points. First, we use a polyharmonic spline
|
||||||
|
(`tf.contrib.image.interpolate_spline`) to interpolate the displacements
|
||||||
|
between the corresponding control points to a dense flow field.
|
||||||
|
Then, we warp the image using this dense flow field
|
||||||
|
(`tf.contrib.image.dense_image_warp`).
|
||||||
|
Let t index our control points. For regularization_weight=0, we have:
|
||||||
|
warped_image[b, dest_control_point_locations[b, t, 0],
|
||||||
|
dest_control_point_locations[b, t, 1], :] =
|
||||||
|
image[b, source_control_point_locations[b, t, 0],
|
||||||
|
source_control_point_locations[b, t, 1], :].
|
||||||
|
For regularization_weight > 0, this condition is met approximately, since
|
||||||
|
regularized interpolation trades off smoothness of the interpolant vs.
|
||||||
|
reconstruction of the interpolant at the control points.
|
||||||
|
See `tf.contrib.image.interpolate_spline` for further documentation of the
|
||||||
|
interpolation_order and regularization_weight arguments.
|
||||||
|
Args:
|
||||||
|
image: `[batch, height, width, channels]` float `Tensor`
|
||||||
|
source_control_point_locations: `[batch, num_control_points, 2]` float
|
||||||
|
`Tensor`
|
||||||
|
dest_control_point_locations: `[batch, num_control_points, 2]` float
|
||||||
|
`Tensor`
|
||||||
|
interpolation_order: polynomial order used by the spline interpolation
|
||||||
|
regularization_weight: weight on smoothness regularizer in interpolation
|
||||||
|
num_boundary_points: How many zero-flow boundary points to include at
|
||||||
|
each image edge.Usage:
|
||||||
|
num_boundary_points=0: don't add zero-flow points
|
||||||
|
num_boundary_points=1: 4 corners of the image
|
||||||
|
num_boundary_points=2: 4 corners and one in the middle of each edge
|
||||||
|
(8 points total)
|
||||||
|
num_boundary_points=n: 4 corners and n-1 along each edge
|
||||||
|
name: A name for the operation (optional).
|
||||||
|
Note that image and offsets can be of type tf.half, tf.float32, or
|
||||||
|
tf.float64, and do not necessarily have to be the same type.
|
||||||
|
Returns:
|
||||||
|
warped_image: `[batch, height, width, channels]` float `Tensor` with same
|
||||||
|
type as input image.
|
||||||
|
flow_field: `[batch, height, width, 2]` float `Tensor` containing the dense
|
||||||
|
flow field produced by the interpolation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
image = ops.convert_to_tensor(image)
|
||||||
|
source_control_point_locations = ops.convert_to_tensor(
|
||||||
|
source_control_point_locations)
|
||||||
|
dest_control_point_locations = ops.convert_to_tensor(
|
||||||
|
dest_control_point_locations)
|
||||||
|
|
||||||
|
control_point_flows = (
|
||||||
|
dest_control_point_locations - source_control_point_locations)
|
||||||
|
|
||||||
|
clamp_boundaries = num_boundary_points > 0
|
||||||
|
boundary_points_per_edge = num_boundary_points - 1
|
||||||
|
|
||||||
|
with ops.name_scope(name):
|
||||||
|
batch_size, image_height, image_width = (array_ops.shape(image)[0],
|
||||||
|
array_ops.shape(image)[1],
|
||||||
|
array_ops.shape(image)[2])
|
||||||
|
# This generates the dense locations where the interpolant
|
||||||
|
# will be evaluated.
|
||||||
|
grid_locations = _get_grid_locations(image_height, image_width)
|
||||||
|
|
||||||
|
flattened_grid_locations = array_ops.reshape(grid_locations,
|
||||||
|
[image_height*image_width, 2])
|
||||||
|
|
||||||
|
flattened_grid_locations = _expand_to_minibatch(flattened_grid_locations,
|
||||||
|
batch_size)
|
||||||
|
|
||||||
|
if clamp_boundaries:
|
||||||
|
(dest_control_point_locations,
|
||||||
|
control_point_flows) = _add_zero_flow_controls_at_boundary(dest_control_point_locations,
|
||||||
|
control_point_flows, image_height,
|
||||||
|
image_width, boundary_points_per_edge)
|
||||||
|
|
||||||
|
flattened_flows = interpolate_spline.interpolate_spline(dest_control_point_locations, control_point_flows,
|
||||||
|
flattened_grid_locations, interpolation_order,
|
||||||
|
regularization_weight)
|
||||||
|
|
||||||
|
dense_flows = array_ops.reshape(flattened_flows,
|
||||||
|
[batch_size, image_height, image_width, 2])
|
||||||
|
|
||||||
|
warped_image = dense_image_warp.dense_image_warp(image, dense_flows)
|
||||||
|
|
||||||
|
return warped_image, dense_flows
|
|
@ -0,0 +1,97 @@
|
||||||
|
import tensorflow as tf
|
||||||
|
from util.sparse_image_warp import sparse_image_warp
|
||||||
|
|
||||||
|
def augment_sparse_deform(mel_spectrogram,
|
||||||
|
time_warping_para=12,
|
||||||
|
normal_around_warping_std=0.5):
|
||||||
|
mel_spectrogram = tf.expand_dims(mel_spectrogram, -1)
|
||||||
|
freq_max = tf.shape(mel_spectrogram)[1]
|
||||||
|
time_max = tf.shape(mel_spectrogram)[2]
|
||||||
|
center_freq = tf.cast(freq_max, tf.float32)/2.0
|
||||||
|
random_time_point = tf.random.uniform(shape=(), minval=time_warping_para, maxval=tf.cast(time_max, tf.float32) - time_warping_para)
|
||||||
|
chosen_warping = tf.random.uniform(shape=(), minval=0, maxval=time_warping_para)
|
||||||
|
#add different warping values to different frequencies
|
||||||
|
normal_around_warping = tf.random.normal(mean=chosen_warping, stddev=normal_around_warping_std, shape=(3,))
|
||||||
|
|
||||||
|
control_point_freqs = tf.stack([0.0, center_freq, tf.cast(freq_max, tf.float32)], axis=0)
|
||||||
|
control_point_times_src = tf.stack([random_time_point, random_time_point, random_time_point], axis=0)
|
||||||
|
control_point_times_dst = control_point_times_src+normal_around_warping
|
||||||
|
|
||||||
|
control_src = tf.expand_dims(tf.stack([control_point_freqs, control_point_times_src], axis=-1), 0)
|
||||||
|
control_dst = tf.expand_dims(tf.stack([control_point_freqs, control_point_times_dst], axis=1), 0)
|
||||||
|
warped_mel_spectrogram, _ = sparse_image_warp(mel_spectrogram,
|
||||||
|
source_control_point_locations=control_src,
|
||||||
|
dest_control_point_locations=control_dst,
|
||||||
|
interpolation_order=2,
|
||||||
|
regularization_weight=0,
|
||||||
|
num_boundary_points=1
|
||||||
|
)
|
||||||
|
warped_mel_spectrogram = warped_mel_spectrogram[:, :, :, 0]
|
||||||
|
return warped_mel_spectrogram
|
||||||
|
|
||||||
|
def augment_freq_time_mask(mel_spectrogram,
|
||||||
|
frequency_masking_para=30,
|
||||||
|
time_masking_para=10,
|
||||||
|
frequency_mask_num=3,
|
||||||
|
time_mask_num=3):
|
||||||
|
freq_max = tf.shape(mel_spectrogram)[1]
|
||||||
|
time_max = tf.shape(mel_spectrogram)[2]
|
||||||
|
# Frequency masking
|
||||||
|
# Testing without loop
|
||||||
|
for _ in range(frequency_mask_num):
|
||||||
|
f = tf.random.uniform(shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
|
||||||
|
f0 = tf.random.uniform(shape=(), minval=0, maxval=freq_max - f, dtype=tf.dtypes.int32)
|
||||||
|
value_ones_freq_prev = tf.ones(shape=[1, f0, time_max])
|
||||||
|
value_zeros_freq = tf.zeros(shape=[1, f, time_max])
|
||||||
|
value_ones_freq_next = tf.ones(shape=[1, freq_max-(f0+f), time_max])
|
||||||
|
freq_mask = tf.concat([value_ones_freq_prev, value_zeros_freq, value_ones_freq_next], axis=1)
|
||||||
|
#mel_spectrogram[:, f0:f0 + f, :] = 0 #can't assign to tensor
|
||||||
|
#mel_spectrogram[:, f0:f0 + f, :] = value_zeros_freq #can't assign to tensor
|
||||||
|
mel_spectrogram = mel_spectrogram*freq_mask
|
||||||
|
|
||||||
|
# Time masking
|
||||||
|
# Testing without loop
|
||||||
|
for _ in range(time_mask_num):
|
||||||
|
t = tf.random.uniform(shape=(), minval=0, maxval=time_masking_para, dtype=tf.dtypes.int32)
|
||||||
|
t0 = tf.random.uniform(shape=(), minval=0, maxval=time_max - t, dtype=tf.dtypes.int32)
|
||||||
|
value_zeros_time_prev = tf.ones(shape=[1, freq_max, t0])
|
||||||
|
value_zeros_time = tf.zeros(shape=[1, freq_max, t])
|
||||||
|
value_zeros_time_next = tf.ones(shape=[1, freq_max, time_max-(t0+t)])
|
||||||
|
time_mask = tf.concat([value_zeros_time_prev, value_zeros_time, value_zeros_time_next], axis=2)
|
||||||
|
#mel_spectrogram[:, :, t0:t0 + t] = 0 #can't assign to tensor
|
||||||
|
#mel_spectrogram[:, :, t0:t0 + t] = value_zeros_time #can't assign to tensor
|
||||||
|
mel_spectrogram = mel_spectrogram*time_mask
|
||||||
|
|
||||||
|
return mel_spectrogram
|
||||||
|
|
||||||
|
def augment_pitch_and_tempo(spectrogram,
|
||||||
|
max_tempo=1.2,
|
||||||
|
max_pitch=1.1,
|
||||||
|
min_pitch=0.95):
|
||||||
|
original_shape = tf.shape(spectrogram)
|
||||||
|
choosen_pitch = tf.random.uniform(shape=(), minval=min_pitch, maxval=max_pitch)
|
||||||
|
choosen_tempo = tf.random.uniform(shape=(), minval=1, maxval=max_tempo)
|
||||||
|
new_height = tf.cast(tf.cast(original_shape[1], tf.float32)*choosen_pitch, tf.int32)
|
||||||
|
new_width = tf.cast(tf.cast(original_shape[2], tf.float32)/(choosen_tempo), tf.int32)
|
||||||
|
spectrogram_aug = tf.image.resize_bilinear(tf.expand_dims(spectrogram, -1), [new_height, new_width])
|
||||||
|
spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.minimum(original_shape[1],new_height), target_width=tf.shape(spectrogram_aug)[2])
|
||||||
|
spectrogram_aug = tf.cond(choosen_pitch < 1,
|
||||||
|
lambda: tf.image.pad_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0,
|
||||||
|
target_height=original_shape[1], target_width=tf.shape(spectrogram_aug)[2]),
|
||||||
|
lambda: spectrogram_aug)
|
||||||
|
return spectrogram_aug[:, :, :, 0]
|
||||||
|
|
||||||
|
|
||||||
|
def augment_speed_up(spectrogram,
|
||||||
|
speed_std=0.1):
|
||||||
|
original_shape = tf.shape(spectrogram)
|
||||||
|
choosen_speed = tf.math.abs(tf.random.normal(shape=(), stddev=speed_std)) # abs makes sure the augmention will only speed up
|
||||||
|
choosen_speed = 1 + choosen_speed
|
||||||
|
new_height = tf.cast(tf.cast(original_shape[1], tf.float32), tf.int32)
|
||||||
|
new_width = tf.cast(tf.cast(original_shape[2], tf.float32)/(choosen_speed), tf.int32)
|
||||||
|
spectrogram_aug = tf.image.resize_bilinear(tf.expand_dims(spectrogram, -1), [new_height, new_width])
|
||||||
|
return spectrogram_aug[:, :, :, 0]
|
||||||
|
|
||||||
|
def augment_dropout(spectrogram,
|
||||||
|
keep_prob=0.95):
|
||||||
|
return tf.nn.dropout(spectrogram, rate=1-keep_prob)
|
Loading…
Reference in New Issue