prepare files for refactoring
This commit is contained in:
parent
271e3639a7
commit
271a58e464
@ -14,7 +14,7 @@ from tensorflow.python.ops import gen_audio_ops as contrib_audio
|
||||
from util.config import Config
|
||||
from util.text import text_to_char_array
|
||||
from util.flags import FLAGS
|
||||
from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up
|
||||
from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up, augment_sparse_warp
|
||||
from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT
|
||||
|
||||
|
||||
@ -58,6 +58,8 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
|
||||
if FLAGS.augmentation_speed_up_std > 0:
|
||||
spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)
|
||||
|
||||
# spectrogram = augment_sparse_warp(spectrogram)
|
||||
|
||||
mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
|
||||
mfccs = tf.reshape(mfccs, [-1, Config.n_input])
|
||||
|
||||
|
203
util/sparse_image_warp.py
Normal file
203
util/sparse_image_warp.py
Normal file
@ -0,0 +1,203 @@
|
||||
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Image warping using sparse flow defined at control points."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.compat import v1 as tfv1
|
||||
from tensorflow.contrib.image.python.ops import dense_image_warp
|
||||
from tensorflow.contrib.image.python.ops import interpolate_spline
|
||||
|
||||
from tensorflow.python.framework import constant_op
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.framework import tensor_shape
|
||||
from tensorflow.python.ops import array_ops
|
||||
|
||||
|
||||
def _get_grid_locations(image_height, image_width):
|
||||
"""Wrapper for np.meshgrid."""
|
||||
|
||||
y_range = np.linspace(0, image_height - 1, image_height)
|
||||
x_range = np.linspace(0, image_width - 1, image_width)
|
||||
y_grid, x_grid = np.meshgrid(y_range, x_range, indexing='ij')
|
||||
return np.stack((y_grid, x_grid), -1)
|
||||
|
||||
|
||||
def _expand_to_minibatch(np_array, batch_size):
|
||||
"""Tile arbitrarily-sized np_array to include new batch dimension."""
|
||||
tiles = [batch_size] + [1] * np_array.ndim
|
||||
return np.tile(np.expand_dims(np_array, 0), tiles)
|
||||
|
||||
|
||||
def _get_boundary_locations(image_height, image_width, num_points_per_edge):
|
||||
"""Compute evenly-spaced indices along edge of image."""
|
||||
y_range = np.linspace(0, image_height - 1, num_points_per_edge + 2)
|
||||
x_range = np.linspace(0, image_width - 1, num_points_per_edge + 2)
|
||||
ys, xs = np.meshgrid(y_range, x_range, indexing='ij')
|
||||
is_boundary = np.logical_or(
|
||||
np.logical_or(xs == 0, xs == image_width - 1),
|
||||
np.logical_or(ys == 0, ys == image_height - 1))
|
||||
return np.stack([ys[is_boundary], xs[is_boundary]], axis=-1)
|
||||
|
||||
|
||||
def _add_zero_flow_controls_at_boundary(control_point_locations,
                                        control_point_flows, image_height,
                                        image_width, boundary_points_per_edge):
    """Append zero-flow control points located on the image boundary.

    The extra points come from `_get_boundary_locations`, are given a flow of
    zero, and are concatenated after the caller's control points along axis 1.

    Args:
      control_point_locations: input control point locations (tensor).
      control_point_flows: flows belonging to those control points (tensor).
      image_height: image height.
      image_width: image width.
      boundary_points_per_edge: number of points to add in the middle of each
        edge (corners excluded). In total
        `4 + 4 * boundary_points_per_edge` points are added.

    Returns:
      merged_control_point_locations: input locations followed by the
        boundary locations.
      merged_control_point_flows: input flows followed by zeros for the
        boundary points.
    """
    dtype = control_point_locations.dtype
    # The batch size is read from the static shape because it is used below
    # to tile numpy arrays.
    batch_size = tensor_shape.dimension_value(control_point_locations.shape[0])

    edge_points = _get_boundary_locations(image_height, image_width,
                                          boundary_points_per_edge)
    zero_flows = np.zeros([edge_points.shape[0], 2])

    edge_points_const = constant_op.constant(
        _expand_to_minibatch(edge_points, batch_size), dtype=dtype)
    zero_flows_const = constant_op.constant(
        _expand_to_minibatch(zero_flows, batch_size), dtype=dtype)

    merged_locations = array_ops.concat(
        [control_point_locations, edge_points_const], 1)
    merged_flows = array_ops.concat([control_point_flows, zero_flows_const], 1)
    return merged_locations, merged_flows
|
||||
|
||||
|
||||
def sparse_image_warp(image,
                      source_control_point_locations,
                      dest_control_point_locations,
                      interpolation_order=2,
                      regularization_weight=0.0,
                      num_boundary_points=0,
                      name='sparse_image_warp'):
    """Image warping using correspondences between sparse control points.

    Apply a non-linear warp to the image, where the warp is specified by
    the source and destination locations of a (potentially small) number of
    control points. First, we use a polyharmonic spline
    (`tf.contrib.image.interpolate_spline`) to interpolate the displacements
    between the corresponding control points to a dense flow field.
    Then, we warp the image using this dense flow field
    (`tf.contrib.image.dense_image_warp`).

    Let t index our control points. For regularization_weight=0, we have:
    warped_image[b, dest_control_point_locations[b, t, 0],
                    dest_control_point_locations[b, t, 1], :] =
    image[b, source_control_point_locations[b, t, 0],
             source_control_point_locations[b, t, 1], :].

    For regularization_weight > 0, this condition is met approximately, since
    regularized interpolation trades off smoothness of the interpolant vs.
    reconstruction of the interpolant at the control points.
    See `tf.contrib.image.interpolate_spline` for further documentation of the
    interpolation_order and regularization_weight arguments.

    Args:
      image: `[batch, height, width, channels]` float `Tensor`
      source_control_point_locations: `[batch, num_control_points, 2]` float
        `Tensor`
      dest_control_point_locations: `[batch, num_control_points, 2]` float
        `Tensor`
      interpolation_order: polynomial order used by the spline interpolation
      regularization_weight: weight on smoothness regularizer in interpolation
      num_boundary_points: How many zero-flow boundary points to include at
        each image edge. Usage:
          num_boundary_points=0: don't add zero-flow points
          num_boundary_points=1: 4 corners of the image
          num_boundary_points=2: 4 corners and one in the middle of each edge
            (8 points total)
          num_boundary_points=n: 4 corners and n-1 along each edge
      name: A name for the operation (optional).

      Note that image and offsets can be of type tf.half, tf.float32, or
      tf.float64, and do not necessarily have to be the same type.

    Returns:
      warped_image: `[batch, height, width, channels]` float `Tensor` with same
        type as input image.
      flow_field: `[batch, height, width, 2]` float `Tensor` containing the dense
        flow field produced by the interpolation.
    """

    image = ops.convert_to_tensor(image)
    source_control_point_locations = ops.convert_to_tensor(
        source_control_point_locations)
    dest_control_point_locations = ops.convert_to_tensor(
        dest_control_point_locations)

    # Per-control-point displacement from source to destination.
    control_point_flows = (
        dest_control_point_locations - source_control_point_locations)

    clamp_boundaries = num_boundary_points > 0
    # The corners are always included by the boundary helper, so only the
    # remaining `num_boundary_points - 1` points per edge are requested.
    boundary_points_per_edge = num_boundary_points - 1

    with ops.name_scope(name):
        # NOTE(review): the static shape is unpacked here and then fed to numpy
        # (grid construction, reshape, tiling), so batch, height and width
        # presumably all need to be known at graph-construction time -- confirm
        # before calling with dynamically shaped inputs.
        batch_size, image_height, image_width, _ = image.get_shape().as_list()

        # This generates the dense locations where the interpolant
        # will be evaluated.
        grid_locations = _get_grid_locations(image_height, image_width)

        # Flatten the grid to a list of (y, x) query points.
        flattened_grid_locations = np.reshape(grid_locations,
                                              [image_height * image_width, 2])

        # Replicate the query points for every batch element and embed them
        # in the graph as a constant of the image's dtype.
        flattened_grid_locations = constant_op.constant(
            _expand_to_minibatch(flattened_grid_locations, batch_size), image.dtype)

        if clamp_boundaries:
            (dest_control_point_locations,
             control_point_flows) = _add_zero_flow_controls_at_boundary(
                 dest_control_point_locations, control_point_flows, image_height,
                 image_width, boundary_points_per_edge)

        # Interpolate the sparse flows (anchored at the destination control
        # points) onto every grid location.
        flattened_flows = interpolate_spline.interpolate_spline(
            dest_control_point_locations, control_point_flows,
            flattened_grid_locations, interpolation_order, regularization_weight)

        dense_flows = array_ops.reshape(flattened_flows,
                                        [batch_size, image_height, image_width, 2])

        warped_image = dense_image_warp.dense_image_warp(image, dense_flows)

        return warped_image, dense_flows
|
@ -1,4 +1,5 @@
|
||||
import tensorflow as tf
|
||||
from util.sparse_image_warp import sparse_image_warp
|
||||
|
||||
def augment_freq_time_mask(mel_spectrogram,
|
||||
frequency_masking_para=30,
|
||||
@ -9,41 +10,53 @@ def augment_freq_time_mask(mel_spectrogram,
|
||||
time_max = tf.shape(mel_spectrogram)[2]
|
||||
# Frequency masking
|
||||
for _ in range(frequency_mask_num):
|
||||
f = tf.random.uniform(shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
|
||||
f0 = tf.random.uniform(shape=(), minval=0, maxval=freq_max - f, dtype=tf.dtypes.int32)
|
||||
f = tf.random.uniform(
|
||||
shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
|
||||
f0 = tf.random.uniform(
|
||||
shape=(), minval=0, maxval=freq_max - f, dtype=tf.dtypes.int32)
|
||||
value_ones_freq_prev = tf.ones(shape=[1, f0, time_max])
|
||||
value_zeros_freq = tf.zeros(shape=[1, f, time_max])
|
||||
value_ones_freq_next = tf.ones(shape=[1, freq_max-(f0+f), time_max])
|
||||
freq_mask = tf.concat([value_ones_freq_prev, value_zeros_freq, value_ones_freq_next], axis=1)
|
||||
#mel_spectrogram[:, f0:f0 + f, :] = 0 #can't assign to tensor
|
||||
#mel_spectrogram[:, f0:f0 + f, :] = value_zeros_freq #can't assign to tensor
|
||||
freq_mask = tf.concat(
|
||||
[value_ones_freq_prev, value_zeros_freq, value_ones_freq_next], axis=1)
|
||||
# mel_spectrogram[:, f0:f0 + f, :] = 0 #can't assign to tensor
|
||||
# mel_spectrogram[:, f0:f0 + f, :] = value_zeros_freq #can't assign to tensor
|
||||
mel_spectrogram = mel_spectrogram*freq_mask
|
||||
|
||||
# Time masking
|
||||
for _ in range(time_mask_num):
|
||||
t = tf.random.uniform(shape=(), minval=0, maxval=time_masking_para, dtype=tf.dtypes.int32)
|
||||
t0 = tf.random.uniform(shape=(), minval=0, maxval=time_max - t, dtype=tf.dtypes.int32)
|
||||
t = tf.random.uniform(shape=(), minval=0,
|
||||
maxval=time_masking_para, dtype=tf.dtypes.int32)
|
||||
t0 = tf.random.uniform(
|
||||
shape=(), minval=0, maxval=time_max - t, dtype=tf.dtypes.int32)
|
||||
value_zeros_time_prev = tf.ones(shape=[1, freq_max, t0])
|
||||
value_zeros_time = tf.zeros(shape=[1, freq_max, t])
|
||||
value_zeros_time_next = tf.ones(shape=[1, freq_max, time_max-(t0+t)])
|
||||
time_mask = tf.concat([value_zeros_time_prev, value_zeros_time, value_zeros_time_next], axis=2)
|
||||
#mel_spectrogram[:, :, t0:t0 + t] = 0 #can't assign to tensor
|
||||
#mel_spectrogram[:, :, t0:t0 + t] = value_zeros_time #can't assign to tensor
|
||||
time_mask = tf.concat(
|
||||
[value_zeros_time_prev, value_zeros_time, value_zeros_time_next], axis=2)
|
||||
# mel_spectrogram[:, :, t0:t0 + t] = 0 #can't assign to tensor
|
||||
# mel_spectrogram[:, :, t0:t0 + t] = value_zeros_time #can't assign to tensor
|
||||
mel_spectrogram = mel_spectrogram*time_mask
|
||||
|
||||
return mel_spectrogram
|
||||
|
||||
|
||||
def augment_pitch_and_tempo(spectrogram,
|
||||
max_tempo=1.2,
|
||||
max_pitch=1.1,
|
||||
min_pitch=0.95):
|
||||
original_shape = tf.shape(spectrogram)
|
||||
choosen_pitch = tf.random.uniform(shape=(), minval=min_pitch, maxval=max_pitch)
|
||||
choosen_pitch = tf.random.uniform(
|
||||
shape=(), minval=min_pitch, maxval=max_pitch)
|
||||
choosen_tempo = tf.random.uniform(shape=(), minval=1, maxval=max_tempo)
|
||||
new_height = tf.cast(tf.cast(original_shape[1], tf.float32)*choosen_pitch, tf.int32)
|
||||
new_width = tf.cast(tf.cast(original_shape[2], tf.float32)/(choosen_tempo), tf.int32)
|
||||
spectrogram_aug = tf.image.resize_bilinear(tf.expand_dims(spectrogram, -1), [new_height, new_width])
|
||||
spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.minimum(original_shape[1], new_height), target_width=tf.shape(spectrogram_aug)[2])
|
||||
new_height = tf.cast(
|
||||
tf.cast(original_shape[1], tf.float32)*choosen_pitch, tf.int32)
|
||||
new_width = tf.cast(
|
||||
tf.cast(original_shape[2], tf.float32)/(choosen_tempo), tf.int32)
|
||||
spectrogram_aug = tf.image.resize_bilinear(
|
||||
tf.expand_dims(spectrogram, -1), [new_height, new_width])
|
||||
spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.minimum(
|
||||
original_shape[1], new_height), target_width=tf.shape(spectrogram_aug)[2])
|
||||
spectrogram_aug = tf.cond(choosen_pitch < 1,
|
||||
lambda: tf.image.pad_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0,
|
||||
target_height=original_shape[1], target_width=tf.shape(spectrogram_aug)[2]),
|
||||
@ -54,13 +67,70 @@ def augment_pitch_and_tempo(spectrogram,
|
||||
def augment_speed_up(spectrogram,
                     speed_std=0.1):
    """Shrink axis 2 of `spectrogram` by a random factor of at least one.

    The factor is `1 + |x|`, where `x` is drawn from a zero-mean normal
    distribution with standard deviation `speed_std`. Axis 1 keeps its size;
    axis 2 is resized to `int(size / factor)` with bilinear interpolation.

    Args:
      spectrogram: 3-D tensor; a trailing channel axis is added internally
        for the resize. NOTE(review): axis 2 is presumably time -- confirm
        against the caller.
      speed_std: standard deviation of the random speed offset.

    Returns:
      The resized 3-D tensor (the temporary channel axis is stripped).
    """
    original_shape = tf.shape(spectrogram)
    # abs() keeps the sampled offset non-negative, so the augmentation can
    # only speed up (factor >= 1), never slow down.
    chosen_speed = 1 + tf.math.abs(
        tf.random.normal(shape=(), stddev=speed_std))
    new_height = original_shape[1]
    new_width = tf.cast(
        tf.cast(original_shape[2], tf.float32) / chosen_speed, tf.int32)
    # resize_bilinear works on 4-D image batches, hence the temporary
    # trailing channel axis.
    spectrogram_aug = tf.image.resize_bilinear(
        tf.expand_dims(spectrogram, -1), [new_height, new_width])
    return spectrogram_aug[:, :, :, 0]
|
||||
|
||||
|
||||
def augment_dropout(spectrogram,
                    keep_prob=0.95):
    """Apply `tf.nn.dropout` to `spectrogram`.

    Args:
      spectrogram: tensor to apply dropout to.
      keep_prob: complement of the dropout rate; the rate handed to
        `tf.nn.dropout` is `1 - keep_prob`.

    Returns:
      The tensor returned by `tf.nn.dropout`.
    """
    drop_rate = 1 - keep_prob
    return tf.nn.dropout(spectrogram, rate=drop_rate)
|
||||
|
||||
|
||||
def augment_sparse_warp(spectrogram: tf.Tensor, time_warping_para=80):
    """Apply the time-warping step of SpecAugment to a spectrogram.

    Only the warp is performed here; no frequency or time masking is applied.
    A random position `pt` along axis 1 is chosen and a line of control
    points at that position is shifted along axis 1 by a random distance `w`
    drawn from `[-time_warping_para, time_warping_para)`. The image is then
    warped with `sparse_image_warp`.

    NOTE(review): this function treats axis 1 as time and axis 2 as
    frequency, whereas `augment_freq_time_mask` in this file reads the time
    length from axis 2 -- confirm the layout expected by the caller.

    Args:
      spectrogram: tensor of rank 2, 3 or 4. Rank-2 and rank-3 inputs are
        reshaped to rank 4 (a trailing channel axis is added; rank-2 inputs
        also get a leading batch axis of 1).
      time_warping_para: the SpecAugment "time warp parameter W"; bounds both
        the position of the warp point and the warp distance. Defaults to 80.

    Returns:
      The warped rank-4 tensor produced by `sparse_image_warp`.
    """
    if spectrogram.get_shape().ndims == 2:
        spectrogram = tf.reshape(
            spectrogram, shape=[1, -1, spectrogram.shape[1], 1])
    elif spectrogram.get_shape().ndims == 3:
        spectrogram = tf.reshape(
            spectrogram,
            shape=[spectrogram.shape[0], -1, spectrogram.shape[2], 1])
    assert spectrogram.get_shape().ndims == 4, (
        'augment_sparse_warp expects a tensor of rank 2, 3 or 4')

    fbank_size = tf.shape(spectrogram)
    n, v = fbank_size[1], fbank_size[2]

    # Step 1 : Time warping
    # Image warping control point setting.
    # Source
    # Random point along the time axis.
    # NOTE(review): the sampling range [W, n - W) is empty when
    # n <= 2 * time_warping_para, which presumably makes this op fail at run
    # time for short inputs -- confirm and guard in the caller if needed.
    pt = tf.random.uniform([], time_warping_para, n - time_warping_para,
                           tf.int32)
    src_ctr_pt_freq = tf.range(v // 2)  # control points on freq-axis
    # control points on time-axis
    src_ctr_pt_time = tf.ones_like(src_ctr_pt_freq) * pt
    src_ctr_pts = tf.stack((src_ctr_pt_time, src_ctr_pt_freq), -1)
    src_ctr_pts = tf.cast(src_ctr_pts, dtype=tf.float32)

    # Destination: same frequency coordinates, time shifted by w.
    w = tf.random.uniform([], -time_warping_para, time_warping_para,
                          tf.int32)  # distance
    dest_ctr_pt_freq = src_ctr_pt_freq
    dest_ctr_pt_time = src_ctr_pt_time + w
    dest_ctr_pts = tf.stack((dest_ctr_pt_time, dest_ctr_pt_freq), -1)
    dest_ctr_pts = tf.cast(dest_ctr_pts, dtype=tf.float32)

    # Warp. The control points carry a batch axis of size 1.
    # NOTE(review): confirm behaviour when the spectrogram batch is > 1.
    source_control_point_locations = tf.expand_dims(
        src_ctr_pts, 0)  # (1, v//2, 2)
    dest_control_point_locations = tf.expand_dims(
        dest_ctr_pts, 0)  # (1, v//2, 2)

    warped_image, _ = sparse_image_warp(spectrogram,
                                        source_control_point_locations,
                                        dest_control_point_locations)
    return warped_image
|
||||
|
Loading…
x
Reference in New Issue
Block a user