Normalize sample rate of train_files by default

2021-01-18 12:10:00 +00:00 · 2021-01-18 12:10:00 +00:00 · 8c0d46cb7f
commit 8c0d46cb7f
parent d4152f6e67
5 changed files with 29 additions and 25 deletions
--- a/setup.py
+++ b/setup.py
@@ -53,9 +53,6 @@ def main():
        'absl-py',
        'attrdict',
        'bs4',
-        'librosa',
-        'llvmlite == 0.31.0', # for numba==0.47.0
-        'numba == 0.47.0', # ships py3.5 wheel
        'numpy',
        'optuna',
        'opuslib == 2.0.0',
@@ -63,6 +60,7 @@ def main():
        'progressbar2',
        'pyogg >= 0.6.14a1',
        'pyxdg',
+        'resampy >= 0.2.2',
        'requests',
        'semver',
        'six',
--- a/training/deepspeech_training/util/augmentations.py
+++ b/training/deepspeech_training/util/augmentations.py
@@ -3,6 +3,7 @@ import os
 import re
 import math
 import random
+import resampy
 import numpy as np

 from multiprocessing import Queue, Process
@@ -129,7 +130,7 @@ def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, cl
    Tensor of type float32
        The augmented spectrogram
    """
-    if augmentations is not None:
+    if augmentations:
        for augmentation in augmentations:
            if isinstance(augmentation, GraphAugmentation):
                tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
@@ -348,24 +349,25 @@ class Resample(SampleAugmentation):
        self.rate = int_range(rate)

    def apply(self, sample, clock=0.0):
-        # late binding librosa and its dependencies
-        # pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
-        import sklearn  # pylint: disable=import-outside-toplevel
-        from librosa.core import resample  # pylint: disable=import-outside-toplevel
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
        rate = pick_value_from_range(self.rate, clock=clock)
-        audio = sample.audio
-        orig_len = len(audio)
-        audio = np.swapaxes(audio, 0, 1)
-        if audio.shape[0] < 2:
-            # since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
-            resampled = resample(audio[0], sample.audio_format.rate, rate)
-            audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
-        else:
-            audio = resample(audio, sample.audio_format.rate, rate)
-            audio = resample(audio, rate, sample.audio_format.rate)
-        audio = np.swapaxes(audio, 0, 1)[0:orig_len]
-        sample.audio = audio
+        orig_len = len(sample.audio)
+        resampled = resampy.resample(sample.audio, sample.audio_format.rate, rate, axis=0, filter='kaiser_fast')
+        sample.audio = resampy.resample(resampled, rate, sample.audio_format.rate, axis=0, filter='kaiser_fast')[:orig_len]
+
+
+class NormalizeSampleRate(SampleAugmentation):
+    def __init__(self, rate):
+        super().__init__(p=1.0)
+        self.rate = rate
+
+    def apply(self, sample, clock=0.0):
+        if sample.audio_format.rate == self.rate:
+            return
+
+        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
+        sample.audio = resampy.resample(sample.audio, sample.audio_format.rate, self.rate, axis=0, filter='kaiser_fast')
+        sample.audio_format = sample.audio_format._replace(rate=self.rate)


 class Volume(SampleAugmentation):
--- a/training/deepspeech_training/util/config.py
+++ b/training/deepspeech_training/util/config.py
@@ -12,7 +12,7 @@ from .flags import FLAGS
 from .gpu import get_available_gpus
 from .logging import log_error, log_warn
 from .helpers import parse_file_size
-from .augmentations import parse_augmentations
+from .augmentations import parse_augmentations, NormalizeSampleRate
 from .io import path_exists_remote

 class ConfigSingleton:
@@ -33,11 +33,14 @@ def initialize_globals():

    # Augmentations
    c.augmentations = parse_augmentations(FLAGS.augment)
-    if len(c.augmentations) > 0 and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
+    if c.augmentations and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
        log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
                 'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
                 'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')

+    if FLAGS.normalize_sample_rate:
+        c.augmentations = [NormalizeSampleRate(FLAGS.audio_sample_rate)] + c['augmentations']
+
    # Caching
    if FLAGS.cache_for_epochs == 1:
        log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')
--- a/training/deepspeech_training/util/feeding.py
+++ b/training/deepspeech_training/util/feeding.py
@@ -28,7 +28,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
                lambda: tf.no_op(),
                name='matching_sample_rate')

-    if train_phase and augmentations is not None:
+    if train_phase and augmentations:
        audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)

    spectrogram = contrib_audio.audio_spectrogram(audio,
@@ -36,7 +36,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)

-    if train_phase and augmentations is not None:
+    if train_phase and augmentations:
        spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)

    features = contrib_audio.mfcc(spectrogram=spectrogram,
@@ -45,7 +45,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
                                  upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    features = tf.reshape(features, [-1, Config.n_input])

-    if train_phase and augmentations is not None:
+    if train_phase and augmentations:
        features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)

    return features, tf.shape(input=features)[0]
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@@ -24,6 +24,7 @@ def create_flags():
    f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
    f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
    f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
+    f.DEFINE_boolean('normalize_sample_rate', True, 'normalize sample rate of all train_files to --audio_sample_rate')

    # Data Augmentation
    # ================