Normalize sample rate of train_files by default

This commit is contained in:
Reuben Morais 2021-01-18 12:10:00 +00:00
parent d4152f6e67
commit 8c0d46cb7f
5 changed files with 29 additions and 25 deletions

View File

@ -53,9 +53,6 @@ def main():
'absl-py', 'absl-py',
'attrdict', 'attrdict',
'bs4', 'bs4',
'librosa',
'llvmlite == 0.31.0', # for numba==0.47.0
'numba == 0.47.0', # ships py3.5 wheel
'numpy', 'numpy',
'optuna', 'optuna',
'opuslib == 2.0.0', 'opuslib == 2.0.0',
@ -63,6 +60,7 @@ def main():
'progressbar2', 'progressbar2',
'pyogg >= 0.6.14a1', 'pyogg >= 0.6.14a1',
'pyxdg', 'pyxdg',
'resampy >= 0.2.2',
'requests', 'requests',
'semver', 'semver',
'six', 'six',

View File

@ -3,6 +3,7 @@ import os
import re import re
import math import math
import random import random
import resampy
import numpy as np import numpy as np
from multiprocessing import Queue, Process from multiprocessing import Queue, Process
@ -129,7 +130,7 @@ def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, cl
Tensor of type float32 Tensor of type float32
The augmented spectrogram The augmented spectrogram
""" """
if augmentations is not None: if augmentations:
for augmentation in augmentations: for augmentation in augmentations:
if isinstance(augmentation, GraphAugmentation): if isinstance(augmentation, GraphAugmentation):
tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock) tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
@ -348,24 +349,25 @@ class Resample(SampleAugmentation):
self.rate = int_range(rate) self.rate = int_range(rate)
def apply(self, sample, clock=0.0): def apply(self, sample, clock=0.0):
# late binding librosa and its dependencies
# pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
import sklearn # pylint: disable=import-outside-toplevel
from librosa.core import resample # pylint: disable=import-outside-toplevel
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
rate = pick_value_from_range(self.rate, clock=clock) rate = pick_value_from_range(self.rate, clock=clock)
audio = sample.audio orig_len = len(sample.audio)
orig_len = len(audio) resampled = resampy.resample(sample.audio, sample.audio_format.rate, rate, axis=0, filter='kaiser_fast')
audio = np.swapaxes(audio, 0, 1) sample.audio = resampy.resample(resampled, rate, sample.audio_format.rate, axis=0, filter='kaiser_fast')[:orig_len]
if audio.shape[0] < 2:
# since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
resampled = resample(audio[0], sample.audio_format.rate, rate) class NormalizeSampleRate(SampleAugmentation):
audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len] def __init__(self, rate):
else: super().__init__(p=1.0)
audio = resample(audio, sample.audio_format.rate, rate) self.rate = rate
audio = resample(audio, rate, sample.audio_format.rate)
audio = np.swapaxes(audio, 0, 1)[0:orig_len] def apply(self, sample, clock=0.0):
sample.audio = audio if sample.audio_format.rate == self.rate:
return
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
sample.audio = resampy.resample(sample.audio, sample.audio_format.rate, self.rate, axis=0, filter='kaiser_fast')
sample.audio_format = sample.audio_format._replace(rate=self.rate)
class Volume(SampleAugmentation): class Volume(SampleAugmentation):

View File

@ -12,7 +12,7 @@ from .flags import FLAGS
from .gpu import get_available_gpus from .gpu import get_available_gpus
from .logging import log_error, log_warn from .logging import log_error, log_warn
from .helpers import parse_file_size from .helpers import parse_file_size
from .augmentations import parse_augmentations from .augmentations import parse_augmentations, NormalizeSampleRate
from .io import path_exists_remote from .io import path_exists_remote
class ConfigSingleton: class ConfigSingleton:
@ -33,11 +33,14 @@ def initialize_globals():
# Augmentations # Augmentations
c.augmentations = parse_augmentations(FLAGS.augment) c.augmentations = parse_augmentations(FLAGS.augment)
if len(c.augmentations) > 0 and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0: if c.augmentations and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
log_warn('Due to current feature-cache settings the exact same sample augmentations of the first ' log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. ' 'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.') 'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')
if FLAGS.normalize_sample_rate:
c.augmentations = [NormalizeSampleRate(FLAGS.audio_sample_rate)] + c['augmentations']
# Caching # Caching
if FLAGS.cache_for_epochs == 1: if FLAGS.cache_for_epochs == 1:
log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.') log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')

View File

@ -28,7 +28,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
lambda: tf.no_op(), lambda: tf.no_op(),
name='matching_sample_rate') name='matching_sample_rate')
if train_phase and augmentations is not None: if train_phase and augmentations:
audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock) audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)
spectrogram = contrib_audio.audio_spectrogram(audio, spectrogram = contrib_audio.audio_spectrogram(audio,
@ -36,7 +36,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
stride=Config.audio_step_samples, stride=Config.audio_step_samples,
magnitude_squared=True) magnitude_squared=True)
if train_phase and augmentations is not None: if train_phase and augmentations:
spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock) spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)
features = contrib_audio.mfcc(spectrogram=spectrogram, features = contrib_audio.mfcc(spectrogram=spectrogram,
@ -45,7 +45,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
upper_frequency_limit=FLAGS.audio_sample_rate / 2) upper_frequency_limit=FLAGS.audio_sample_rate / 2)
features = tf.reshape(features, [-1, Config.n_input]) features = tf.reshape(features, [-1, Config.n_input])
if train_phase and augmentations is not None: if train_phase and augmentations:
features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock) features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)
return features, tf.shape(input=features)[0] return features, tf.shape(input=features)[0]

View File

@ -24,6 +24,7 @@ def create_flags():
f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds') f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds') f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model') f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
f.DEFINE_boolean('normalize_sample_rate', True, 'normalize sample rate of all train_files to --audio_sample_rate')
# Data Augmentation # Data Augmentation
# ================ # ================