Normalize sample rate of train_files by default

This commit is contained in:
Reuben Morais 2021-01-18 12:10:00 +00:00
parent d4152f6e67
commit 8c0d46cb7f
5 changed files with 29 additions and 25 deletions

View File

@ -53,9 +53,6 @@ def main():
'absl-py',
'attrdict',
'bs4',
'librosa',
'llvmlite == 0.31.0', # for numba==0.47.0
'numba == 0.47.0', # ships py3.5 wheel
'numpy',
'optuna',
'opuslib == 2.0.0',
@ -63,6 +60,7 @@ def main():
'progressbar2',
'pyogg >= 0.6.14a1',
'pyxdg',
'resampy >= 0.2.2',
'requests',
'semver',
'six',

View File

@ -3,6 +3,7 @@ import os
import re
import math
import random
import resampy
import numpy as np
from multiprocessing import Queue, Process
@ -129,7 +130,7 @@ def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, cl
Tensor of type float32
The augmented spectrogram
"""
if augmentations is not None:
if augmentations:
for augmentation in augmentations:
if isinstance(augmentation, GraphAugmentation):
tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
@ -348,24 +349,25 @@ class Resample(SampleAugmentation):
self.rate = int_range(rate)
def apply(self, sample, clock=0.0):
    """Round-trip resample ``sample`` through an intermediate sample rate.

    The audio is converted to a NumPy array, resampled from its current
    rate to a rate picked from ``self.rate``, then resampled back to the
    original rate. ``sample.audio_format`` is not modified; only
    ``sample.audio`` is replaced.

    Args:
        sample: Sample whose ``audio`` is replaced in place.
        clock: Forwarded to ``pick_value_from_range`` when choosing the
            intermediate rate.
    """
    sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
    rate = pick_value_from_range(self.rate, clock=clock)
    orig_len = len(sample.audio)
    # Assumes axis 0 is the sample (time) axis, consistent with using
    # len(sample.audio) as the length to restore -- TODO confirm.
    resampled = resampy.resample(sample.audio, sample.audio_format.rate, rate, axis=0, filter='kaiser_fast')
    # The slice can only truncate: if the round trip yields fewer than
    # orig_len frames, the result is left shorter rather than padded.
    sample.audio = resampy.resample(resampled, rate, sample.audio_format.rate, axis=0, filter='kaiser_fast')[:orig_len]
class NormalizeSampleRate(SampleAugmentation):
    """Sample augmentation that resamples audio to one fixed sample rate.

    The base class is initialised with ``p=1.0`` (presumably the probability
    of applying the augmentation -- confirm against ``SampleAugmentation``).
    """

    def __init__(self, rate):
        """Store ``rate`` as the target sample rate."""
        super().__init__(p=1.0)
        self.rate = rate

    def apply(self, sample, clock=0.0):
        """Resample ``sample`` to ``self.rate`` unless it already has that rate.

        When the rates differ, the audio is converted to a NumPy array,
        resampled along axis 0, and ``sample.audio_format`` is replaced by a
        copy carrying the new rate. ``clock`` is accepted but not used.
        """
        if sample.audio_format.rate != self.rate:
            sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
            # The source rate is read only after the type change, exactly as
            # the statement order of the original requires.
            converted = resampy.resample(sample.audio, sample.audio_format.rate, self.rate, axis=0, filter='kaiser_fast')
            sample.audio = converted
            sample.audio_format = sample.audio_format._replace(rate=self.rate)
class Volume(SampleAugmentation):

View File

@ -12,7 +12,7 @@ from .flags import FLAGS
from .gpu import get_available_gpus
from .logging import log_error, log_warn
from .helpers import parse_file_size
from .augmentations import parse_augmentations
from .augmentations import parse_augmentations, NormalizeSampleRate
from .io import path_exists_remote
class ConfigSingleton:
@ -33,11 +33,14 @@ def initialize_globals():
# Augmentations
c.augmentations = parse_augmentations(FLAGS.augment)
if len(c.augmentations) > 0 and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
if c.augmentations and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')
if FLAGS.normalize_sample_rate:
c.augmentations = [NormalizeSampleRate(FLAGS.audio_sample_rate)] + c['augmentations']
# Caching
if FLAGS.cache_for_epochs == 1:
log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')

View File

@ -28,7 +28,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
lambda: tf.no_op(),
name='matching_sample_rate')
if train_phase and augmentations is not None:
if train_phase and augmentations:
audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)
spectrogram = contrib_audio.audio_spectrogram(audio,
@ -36,7 +36,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
stride=Config.audio_step_samples,
magnitude_squared=True)
if train_phase and augmentations is not None:
if train_phase and augmentations:
spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)
features = contrib_audio.mfcc(spectrogram=spectrogram,
@ -45,7 +45,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
upper_frequency_limit=FLAGS.audio_sample_rate / 2)
features = tf.reshape(features, [-1, Config.n_input])
if train_phase and augmentations is not None:
if train_phase and augmentations:
features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)
return features, tf.shape(input=features)[0]

View File

@ -24,6 +24,7 @@ def create_flags():
f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
f.DEFINE_boolean('normalize_sample_rate', True, 'normalize sample rate of all train_files to --audio_sample_rate')
# Data Augmentation
# ================