prepare files for refactoring

2019-12-02 12:19:32 +08:00 · 2019-12-02 12:19:32 +08:00 · 271a58e464
commit 271a58e464
parent 271e3639a7
3 changed files with 294 additions and 19 deletions
--- a/util/feeding.py
+++ b/util/feeding.py
@ -14,7 +14,7 @@ from tensorflow.python.ops import gen_audio_ops as contrib_audio
 from util.config import Config
 from util.text import text_to_char_array
 from util.flags import FLAGS
-from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up
+from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up, augment_sparse_warp
 from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT


@ -58,6 +58,8 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)

+        # spectrogram = augment_sparse_warp(spectrogram)
+
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

--- a/util/sparse_image_warp.py
+++ b/util/sparse_image_warp.py
@ -0,0 +1,203 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image warping using sparse flow defined at control points."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import tensorflow as tf
+from tensorflow.compat import v1 as tfv1
+from tensorflow.contrib.image.python.ops import dense_image_warp
+from tensorflow.contrib.image.python.ops import interpolate_spline
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+
+
+def _get_grid_locations(image_height, image_width):
+    """Build the (y, x) coordinates of every pixel of an image.
+
+    Args:
+      image_height: number of rows of the image.
+      image_width: number of columns of the image.
+
+    Returns:
+      A float NumPy array of shape `[image_height, image_width, 2]` whose
+      entry `[i, j]` is the pair `(i, j)`.
+    """
+
+    # linspace(0, n - 1, n) yields 0, 1, ..., n - 1 as floats.
+    y_range = np.linspace(0, image_height - 1, image_height)
+    x_range = np.linspace(0, image_width - 1, image_width)
+    # 'ij' indexing keeps the row (y) coordinate on the first axis.
+    y_grid, x_grid = np.meshgrid(y_range, x_range, indexing='ij')
+    return np.stack((y_grid, x_grid), -1)
+
+
+def _expand_to_minibatch(np_array, batch_size):
+    """Replicate a NumPy array along a new leading batch axis.
+
+    Args:
+      np_array: NumPy array of any shape.
+      batch_size: number of copies to stack along the new axis.
+
+    Returns:
+      NumPy array of shape `[batch_size] + list(np_array.shape)` in which
+      every slice along axis 0 equals `np_array`.
+    """
+    # Repeat `batch_size` times along the new axis 0 and once along the rest.
+    tiles = [batch_size] + [1] * np_array.ndim
+    return np.tile(np.expand_dims(np_array, 0), tiles)
+
+
+def _get_boundary_locations(image_height, image_width, num_points_per_edge):
+    """Compute evenly-spaced (y, x) locations along the edges of an image.
+
+    Args:
+      image_height: image height.
+      image_width: image width.
+      num_points_per_edge: number of points placed between the two corners
+        of each edge (the corners themselves are always included).
+
+    Returns:
+      NumPy array of shape `[num_boundary_points, 2]` holding the `(y, x)`
+      coordinates of the corners and of the intermediate edge points.
+    """
+    # The "+ 2" adds both end points (the corners) to each range.
+    y_range = np.linspace(0, image_height - 1, num_points_per_edge + 2)
+    x_range = np.linspace(0, image_width - 1, num_points_per_edge + 2)
+    ys, xs = np.meshgrid(y_range, x_range, indexing='ij')
+    # Keep only the grid points lying on the first/last row or column.
+    is_boundary = np.logical_or(
+        np.logical_or(xs == 0, xs == image_width - 1),
+        np.logical_or(ys == 0, ys == image_height - 1))
+    return np.stack([ys[is_boundary], xs[is_boundary]], axis=-1)
+
+
+def _add_zero_flow_controls_at_boundary(control_point_locations,
+                                        control_point_flows, image_height,
+                                        image_width, boundary_points_per_edge):
+    """Add control points for zero-flow boundary conditions.
+
+    Augment the set of control points with extra points on the
+    boundary of the image that have zero flow.
+
+    Args:
+      control_point_locations: input control points; a `Tensor` whose axis 0
+        is the batch and whose axis 1 indexes the control points.
+      control_point_flows: their flows, laid out like
+        `control_point_locations`.
+      image_height: image height
+      image_width: image width
+      boundary_points_per_edge: number of points to add in the middle of each
+                             edge (not including the corners).
+                             The total number of points added is
+                             4 + 4*(boundary_points_per_edge).
+
+    Returns:
+      merged_control_point_locations: augmented set of control point locations
+      merged_control_point_flows: augmented set of control point flows
+    """
+
+    # NOTE(review): the batch size is taken from the static shape and handed
+    # to NumPy below, so it presumably has to be known when the graph is
+    # built -- confirm against callers.
+    batch_size = tensor_shape.dimension_value(control_point_locations.shape[0])
+
+    boundary_point_locations = _get_boundary_locations(image_height, image_width,
+                                                       boundary_points_per_edge)
+
+    # One (dy, dx) = (0, 0) flow per boundary point pins the image border.
+    boundary_point_flows = np.zeros([boundary_point_locations.shape[0], 2])
+
+    # The constants must share the dtype of the tensors they are concatenated
+    # with.
+    type_to_use = control_point_locations.dtype
+    boundary_point_locations = constant_op.constant(
+        _expand_to_minibatch(boundary_point_locations, batch_size),
+        dtype=type_to_use)
+
+    boundary_point_flows = constant_op.constant(
+        _expand_to_minibatch(boundary_point_flows, batch_size), dtype=type_to_use)
+
+    # Append the boundary points after the caller's points (axis 1).
+    merged_control_point_locations = array_ops.concat(
+        [control_point_locations, boundary_point_locations], 1)
+
+    merged_control_point_flows = array_ops.concat(
+        [control_point_flows, boundary_point_flows], 1)
+
+    return merged_control_point_locations, merged_control_point_flows
+
+
+def sparse_image_warp(image,
+                      source_control_point_locations,
+                      dest_control_point_locations,
+                      interpolation_order=2,
+                      regularization_weight=0.0,
+                      num_boundary_points=0,
+                      name='sparse_image_warp'):
+    """Image warping using correspondences between sparse control points.
+
+    Apply a non-linear warp to the image, where the warp is specified by
+    the source and destination locations of a (potentially small) number of
+    control points. First, we use a polyharmonic spline
+    (`tf.contrib.image.interpolate_spline`) to interpolate the displacements
+    between the corresponding control points to a dense flow field.
+    Then, we warp the image using this dense flow field
+    (`tf.contrib.image.dense_image_warp`).
+
+    Let t index our control points. For regularization_weight=0, we have:
+    warped_image[b, dest_control_point_locations[b, t, 0],
+                    dest_control_point_locations[b, t, 1], :] =
+    image[b, source_control_point_locations[b, t, 0],
+             source_control_point_locations[b, t, 1], :].
+
+    For regularization_weight > 0, this condition is met approximately, since
+    regularized interpolation trades off smoothness of the interpolant vs.
+    reconstruction of the interpolant at the control points.
+    See `tf.contrib.image.interpolate_spline` for further documentation of the
+    interpolation_order and regularization_weight arguments.
+
+
+    Args:
+      image: `[batch, height, width, channels]` float `Tensor`
+      source_control_point_locations: `[batch, num_control_points, 2]` float
+        `Tensor`
+      dest_control_point_locations: `[batch, num_control_points, 2]` float
+        `Tensor`
+      interpolation_order: polynomial order used by the spline interpolation
+      regularization_weight: weight on smoothness regularizer in interpolation
+      num_boundary_points: How many zero-flow boundary points to include at
+        each image edge. Usage:
+          num_boundary_points=0: don't add zero-flow points
+          num_boundary_points=1: 4 corners of the image
+          num_boundary_points=2: 4 corners and one in the middle of each edge
+            (8 points total)
+          num_boundary_points=n: 4 corners and n-1 along each edge
+      name: A name for the operation (optional).
+
+      Note that image and offsets can be of type tf.half, tf.float32, or
+      tf.float64, and do not necessarily have to be the same type.
+
+      Note also that `batch`, `height` and `width` are read from the static
+      shape of `image` and passed to NumPy to build the evaluation grid, so
+      they need to be known when the graph is constructed.
+
+    Returns:
+      warped_image: `[batch, height, width, channels]` float `Tensor` with same
+        type as input image.
+      flow_field: `[batch, height, width, 2]` float `Tensor` containing the dense
+        flow field produced by the interpolation.
+    """
+
+    image = ops.convert_to_tensor(image)
+    source_control_point_locations = ops.convert_to_tensor(
+        source_control_point_locations)
+    dest_control_point_locations = ops.convert_to_tensor(
+        dest_control_point_locations)
+
+    # Displacement of every control point, from its source to its destination.
+    control_point_flows = (
+        dest_control_point_locations - source_control_point_locations)
+
+    # `num_boundary_points` counts the corners, whereas the helper expects the
+    # number of extra points between the corners, hence the "- 1".
+    clamp_boundaries = num_boundary_points > 0
+    boundary_points_per_edge = num_boundary_points - 1
+
+    with ops.name_scope(name):
+        # Static shape: these values are plain Python ints used by NumPy below.
+        batch_size, image_height, image_width, _ = image.get_shape().as_list()
+
+        # This generates the dense locations where the interpolant
+        # will be evaluated.
+        grid_locations = _get_grid_locations(image_height, image_width)
+
+        flattened_grid_locations = np.reshape(grid_locations,
+                                              [image_height * image_width, 2])
+
+        flattened_grid_locations = constant_op.constant(
+            _expand_to_minibatch(flattened_grid_locations, batch_size), image.dtype)
+
+        if clamp_boundaries:
+            # The flows are attached to the destination locations, so the
+            # zero-flow boundary points are appended to those.
+            (dest_control_point_locations,
+             control_point_flows) = _add_zero_flow_controls_at_boundary(
+                 dest_control_point_locations, control_point_flows, image_height,
+                 image_width, boundary_points_per_edge)
+
+        # Interpolate the sparse flows at every pixel location.
+        flattened_flows = interpolate_spline.interpolate_spline(
+            dest_control_point_locations, control_point_flows,
+            flattened_grid_locations, interpolation_order, regularization_weight)
+
+        # Back from a flat list of pixels to an image-shaped flow field.
+        dense_flows = array_ops.reshape(flattened_flows,
+                                        [batch_size, image_height, image_width, 2])
+
+        warped_image = dense_image_warp.dense_image_warp(image, dense_flows)
+
+        return warped_image, dense_flows
--- a/util/spectrogram_augmentations.py
+++ b/util/spectrogram_augmentations.py
@ -1,4 +1,5 @@
 import tensorflow as tf
+from util.sparse_image_warp import sparse_image_warp

 def augment_freq_time_mask(mel_spectrogram,
                           frequency_masking_para=30,
@ -9,41 +10,53 @@ def augment_freq_time_mask(mel_spectrogram,
    time_max = tf.shape(mel_spectrogram)[2]
    # Frequency masking
    for _ in range(frequency_mask_num):
-        f = tf.random.uniform(shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
-        f0 = tf.random.uniform(shape=(), minval=0, maxval=freq_max - f, dtype=tf.dtypes.int32)
+        f = tf.random.uniform(
+            shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
+        f0 = tf.random.uniform(
+            shape=(), minval=0, maxval=freq_max - f, dtype=tf.dtypes.int32)
        value_ones_freq_prev = tf.ones(shape=[1, f0, time_max])
        value_zeros_freq = tf.zeros(shape=[1, f, time_max])
        value_ones_freq_next = tf.ones(shape=[1, freq_max-(f0+f), time_max])
-        freq_mask = tf.concat([value_ones_freq_prev, value_zeros_freq, value_ones_freq_next], axis=1)
-        #mel_spectrogram[:, f0:f0 + f, :] = 0 #can't assign to tensor
-        #mel_spectrogram[:, f0:f0 + f, :] = value_zeros_freq #can't assign to tensor
+        freq_mask = tf.concat(
+            [value_ones_freq_prev, value_zeros_freq, value_ones_freq_next], axis=1)
+        # mel_spectrogram[:, f0:f0 + f, :] = 0 #can't assign to tensor
+        # mel_spectrogram[:, f0:f0 + f, :] = value_zeros_freq #can't assign to tensor
        mel_spectrogram = mel_spectrogram*freq_mask

    # Time masking
    for _ in range(time_mask_num):
-        t = tf.random.uniform(shape=(), minval=0, maxval=time_masking_para, dtype=tf.dtypes.int32)
-        t0 = tf.random.uniform(shape=(), minval=0, maxval=time_max - t, dtype=tf.dtypes.int32)
+        t = tf.random.uniform(shape=(), minval=0,
+                              maxval=time_masking_para, dtype=tf.dtypes.int32)
+        t0 = tf.random.uniform(
+            shape=(), minval=0, maxval=time_max - t, dtype=tf.dtypes.int32)
        value_zeros_time_prev = tf.ones(shape=[1, freq_max, t0])
        value_zeros_time = tf.zeros(shape=[1, freq_max, t])
        value_zeros_time_next = tf.ones(shape=[1, freq_max, time_max-(t0+t)])
-        time_mask = tf.concat([value_zeros_time_prev, value_zeros_time, value_zeros_time_next], axis=2)
-        #mel_spectrogram[:, :, t0:t0 + t] = 0 #can't assign to tensor
-        #mel_spectrogram[:, :, t0:t0 + t] = value_zeros_time #can't assign to tensor
+        time_mask = tf.concat(
+            [value_zeros_time_prev, value_zeros_time, value_zeros_time_next], axis=2)
+        # mel_spectrogram[:, :, t0:t0 + t] = 0 #can't assign to tensor
+        # mel_spectrogram[:, :, t0:t0 + t] = value_zeros_time #can't assign to tensor
        mel_spectrogram = mel_spectrogram*time_mask

    return mel_spectrogram

+
 def augment_pitch_and_tempo(spectrogram,
                            max_tempo=1.2,
                            max_pitch=1.1,
                            min_pitch=0.95):
    original_shape = tf.shape(spectrogram)
-    choosen_pitch = tf.random.uniform(shape=(), minval=min_pitch, maxval=max_pitch)
+    choosen_pitch = tf.random.uniform(
+        shape=(), minval=min_pitch, maxval=max_pitch)
    choosen_tempo = tf.random.uniform(shape=(), minval=1, maxval=max_tempo)
-    new_height = tf.cast(tf.cast(original_shape[1], tf.float32)*choosen_pitch, tf.int32)
-    new_width = tf.cast(tf.cast(original_shape[2], tf.float32)/(choosen_tempo), tf.int32)
-    spectrogram_aug = tf.image.resize_bilinear(tf.expand_dims(spectrogram, -1), [new_height, new_width])
-    spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.minimum(original_shape[1], new_height), target_width=tf.shape(spectrogram_aug)[2])
+    new_height = tf.cast(
+        tf.cast(original_shape[1], tf.float32)*choosen_pitch, tf.int32)
+    new_width = tf.cast(
+        tf.cast(original_shape[2], tf.float32)/(choosen_tempo), tf.int32)
+    spectrogram_aug = tf.image.resize_bilinear(
+        tf.expand_dims(spectrogram, -1), [new_height, new_width])
+    spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.minimum(
+        original_shape[1], new_height), target_width=tf.shape(spectrogram_aug)[2])
    spectrogram_aug = tf.cond(choosen_pitch < 1,
                              lambda: tf.image.pad_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0,
                                                                   target_height=original_shape[1], target_width=tf.shape(spectrogram_aug)[2]),
@ -54,13 +67,70 @@ def augment_pitch_and_tempo(spectrogram,
 def augment_speed_up(spectrogram,
                     speed_std=0.1):
    original_shape = tf.shape(spectrogram)
-    choosen_speed = tf.math.abs(tf.random.normal(shape=(), stddev=speed_std)) # abs makes sure the augmention will only speed up
+    # abs makes sure the augmentation will only speed up
+    choosen_speed = tf.math.abs(tf.random.normal(shape=(), stddev=speed_std))
    choosen_speed = 1 + choosen_speed
    new_height = tf.cast(tf.cast(original_shape[1], tf.float32), tf.int32)
-    new_width = tf.cast(tf.cast(original_shape[2], tf.float32)/(choosen_speed), tf.int32)
-    spectrogram_aug = tf.image.resize_bilinear(tf.expand_dims(spectrogram, -1), [new_height, new_width])
+    new_width = tf.cast(
+        tf.cast(original_shape[2], tf.float32)/(choosen_speed), tf.int32)
+    spectrogram_aug = tf.image.resize_bilinear(
+        tf.expand_dims(spectrogram, -1), [new_height, new_width])
    return spectrogram_aug[:, :, :, 0]

+
 def augment_dropout(spectrogram,
                    keep_prob=0.95):
    return tf.nn.dropout(spectrogram, rate=1-keep_prob)
+
+
+def augment_sparse_warp(spectrogram: tf.Tensor, time_warping_para=80):
+    """Apply the time-warping step of SpecAugment to a spectrogram.
+
+    One random position on the time axis is picked and moved by a random
+    distance. The same displacement is used for a column of control points
+    covering the lower half of the frequency axis, and `sparse_image_warp`
+    interpolates it over the whole spectrogram. No frequency or time masking
+    is performed by this function.
+
+    Args:
+      spectrogram: `[time, freq]`, `[batch, time, freq]` or
+        `[batch, time, freq, channels]` float `Tensor`. Missing batch and
+        channel axes are added with size 1.
+      time_warping_para: the "time warp parameter W". The warped position is
+        drawn from `[W, time - W)` and its shift from `[-W, W)`, so the
+        spectrogram presumably has to be longer than `2 * W` frames.
+
+    Returns:
+      The warped spectrogram as a 4-D `[batch, time, freq, channels]` `Tensor`.
+
+    Raises:
+      ValueError: if `spectrogram` does not have a static rank of 2, 3 or 4.
+    """
+    rank = spectrogram.get_shape().ndims
+    if rank == 2:
+        # [time, freq] -> [1, time, freq, 1]
+        spectrogram = tf.expand_dims(tf.expand_dims(spectrogram, 0), -1)
+    elif rank == 3:
+        # [batch, time, freq] -> [batch, time, freq, 1]
+        spectrogram = tf.expand_dims(spectrogram, -1)
+    elif rank != 4:
+        raise ValueError(
+            'spectrogram must have rank 2, 3 or 4, got rank {}'.format(rank))
+
+    fbank_size = tf.shape(spectrogram)
+    n, v = fbank_size[1], fbank_size[2]
+
+    # Step 1 : Time warping
+    # Image warping control point setting.
+    # Source
+    # random point along the time axis
+    pt = tf.random.uniform(shape=[],
+                           minval=time_warping_para,
+                           maxval=n - time_warping_para,
+                           dtype=tf.int32)
+    src_ctr_pt_freq = tf.range(tf.floordiv(v, 2))  # control points on freq-axis
+    # control points on time-axis
+    src_ctr_pt_time = tf.ones_like(src_ctr_pt_freq) * pt
+    src_ctr_pts = tf.stack((src_ctr_pt_time, src_ctr_pt_freq), -1)
+    src_ctr_pts = tf.cast(src_ctr_pts, dtype=tf.float32)
+
+    # Destination: same frequencies, time shifted by a random distance.
+    w = tf.random.uniform(shape=[],
+                          minval=-time_warping_para,
+                          maxval=time_warping_para,
+                          dtype=tf.int32)  # distance
+    dest_ctr_pt_freq = src_ctr_pt_freq
+    dest_ctr_pt_time = src_ctr_pt_time + w
+    dest_ctr_pts = tf.stack((dest_ctr_pt_time, dest_ctr_pt_freq), -1)
+    dest_ctr_pts = tf.cast(dest_ctr_pts, dtype=tf.float32)
+
+    # warp
+    # NOTE(review): the control points get a batch dimension of 1; confirm
+    # that this is what `sparse_image_warp` needs for batched input.
+    source_control_point_locations = tf.expand_dims(
+        src_ctr_pts, 0)  # (1, v//2, 2)
+    dest_control_point_locations = tf.expand_dims(
+        dest_ctr_pts, 0)  # (1, v//2, 2)
+
+    warped_image, _ = sparse_image_warp(spectrogram,
+                                        source_control_point_locations,
+                                        dest_control_point_locations)
+    return warped_image