Normalize sample rate of train_files by default
This commit is contained in:
parent
d4152f6e67
commit
8c0d46cb7f
4
setup.py
4
setup.py
|
@ -53,9 +53,6 @@ def main():
|
|||
'absl-py',
|
||||
'attrdict',
|
||||
'bs4',
|
||||
'librosa',
|
||||
'llvmlite == 0.31.0', # for numba==0.47.0
|
||||
'numba == 0.47.0', # ships py3.5 wheel
|
||||
'numpy',
|
||||
'optuna',
|
||||
'opuslib == 2.0.0',
|
||||
|
@ -63,6 +60,7 @@ def main():
|
|||
'progressbar2',
|
||||
'pyogg >= 0.6.14a1',
|
||||
'pyxdg',
|
||||
'resampy >= 0.2.2',
|
||||
'requests',
|
||||
'semver',
|
||||
'six',
|
||||
|
|
|
@ -3,6 +3,7 @@ import os
|
|||
import re
|
||||
import math
|
||||
import random
|
||||
import resampy
|
||||
import numpy as np
|
||||
|
||||
from multiprocessing import Queue, Process
|
||||
|
@ -129,7 +130,7 @@ def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, cl
|
|||
Tensor of type float32
|
||||
The augmented spectrogram
|
||||
"""
|
||||
if augmentations is not None:
|
||||
if augmentations:
|
||||
for augmentation in augmentations:
|
||||
if isinstance(augmentation, GraphAugmentation):
|
||||
tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
|
||||
|
@ -348,24 +349,25 @@ class Resample(SampleAugmentation):
|
|||
self.rate = int_range(rate)
|
||||
|
||||
def apply(self, sample, clock=0.0):
|
||||
# late binding librosa and its dependencies
|
||||
# pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
|
||||
import sklearn # pylint: disable=import-outside-toplevel
|
||||
from librosa.core import resample # pylint: disable=import-outside-toplevel
|
||||
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
|
||||
rate = pick_value_from_range(self.rate, clock=clock)
|
||||
audio = sample.audio
|
||||
orig_len = len(audio)
|
||||
audio = np.swapaxes(audio, 0, 1)
|
||||
if audio.shape[0] < 2:
|
||||
# since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
|
||||
resampled = resample(audio[0], sample.audio_format.rate, rate)
|
||||
audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
|
||||
else:
|
||||
audio = resample(audio, sample.audio_format.rate, rate)
|
||||
audio = resample(audio, rate, sample.audio_format.rate)
|
||||
audio = np.swapaxes(audio, 0, 1)[0:orig_len]
|
||||
sample.audio = audio
|
||||
orig_len = len(sample.audio)
|
||||
resampled = resampy.resample(sample.audio, sample.audio_format.rate, rate, axis=0, filter='kaiser_fast')
|
||||
sample.audio = resampy.resample(resampled, rate, sample.audio_format.rate, axis=0, filter='kaiser_fast')[:orig_len]
|
||||
|
||||
|
||||
class NormalizeSampleRate(SampleAugmentation):
    """Sample augmentation that converts a sample's audio to a fixed sample rate.

    The base class is initialized with ``p=1.0``
    (presumably the probability of applying the augmentation -- confirm against
    ``SampleAugmentation``).
    """

    def __init__(self, rate):
        """Remember the target sample rate.

        Parameters
        ----------
        rate
            Sample rate that ``apply`` resamples audio to.
        """
        super().__init__(p=1.0)
        self.rate = rate

    def apply(self, sample, clock=0.0):
        """Resample ``sample`` in place when its rate differs from the target rate.

        Parameters
        ----------
        sample
            Object exposing ``audio``, ``audio_format`` and ``change_audio_type``;
            its ``audio`` and ``audio_format`` attributes are overwritten.
        clock
            Not used by this augmentation.
        """
        target_rate = self.rate
        if sample.audio_format.rate != target_rate:
            # Switch to the numpy representation before resampling; the source
            # rate is read only after the conversion, as in the original code.
            sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
            converted = resampy.resample(sample.audio,
                                         sample.audio_format.rate,
                                         target_rate,
                                         axis=0,
                                         filter='kaiser_fast')
            sample.audio = converted
            # Record the new rate so the format matches the resampled data.
            sample.audio_format = sample.audio_format._replace(rate=target_rate)
|
||||
|
||||
|
||||
class Volume(SampleAugmentation):
|
||||
|
|
|
@ -12,7 +12,7 @@ from .flags import FLAGS
|
|||
from .gpu import get_available_gpus
|
||||
from .logging import log_error, log_warn
|
||||
from .helpers import parse_file_size
|
||||
from .augmentations import parse_augmentations
|
||||
from .augmentations import parse_augmentations, NormalizeSampleRate
|
||||
from .io import path_exists_remote
|
||||
|
||||
class ConfigSingleton:
|
||||
|
@ -33,11 +33,14 @@ def initialize_globals():
|
|||
|
||||
# Augmentations
|
||||
c.augmentations = parse_augmentations(FLAGS.augment)
|
||||
if len(c.augmentations) > 0 and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
|
||||
if c.augmentations and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
|
||||
log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
|
||||
'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
|
||||
'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')
|
||||
|
||||
if FLAGS.normalize_sample_rate:
|
||||
c.augmentations = [NormalizeSampleRate(FLAGS.audio_sample_rate)] + c['augmentations']
|
||||
|
||||
# Caching
|
||||
if FLAGS.cache_for_epochs == 1:
|
||||
log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')
|
||||
|
|
|
@ -28,7 +28,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
|||
lambda: tf.no_op(),
|
||||
name='matching_sample_rate')
|
||||
|
||||
if train_phase and augmentations is not None:
|
||||
if train_phase and augmentations:
|
||||
audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)
|
||||
|
||||
spectrogram = contrib_audio.audio_spectrogram(audio,
|
||||
|
@ -36,7 +36,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
|||
stride=Config.audio_step_samples,
|
||||
magnitude_squared=True)
|
||||
|
||||
if train_phase and augmentations is not None:
|
||||
if train_phase and augmentations:
|
||||
spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)
|
||||
|
||||
features = contrib_audio.mfcc(spectrogram=spectrogram,
|
||||
|
@ -45,7 +45,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
|||
upper_frequency_limit=FLAGS.audio_sample_rate / 2)
|
||||
features = tf.reshape(features, [-1, Config.n_input])
|
||||
|
||||
if train_phase and augmentations is not None:
|
||||
if train_phase and augmentations:
|
||||
features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)
|
||||
|
||||
return features, tf.shape(input=features)[0]
|
||||
|
|
|
@ -24,6 +24,7 @@ def create_flags():
|
|||
f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
|
||||
f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
|
||||
f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
|
||||
f.DEFINE_boolean('normalize_sample_rate', True, 'normalize sample rate of all train_files to --audio_sample_rate')
|
||||
|
||||
# Data Augmentation
|
||||
# ================
|
||||
|
|
Loading…
Reference in New Issue