Normalize sample rate of train_files by default
This commit is contained in:
parent
d4152f6e67
commit
8c0d46cb7f
4
setup.py
4
setup.py
|
@@ -53,9 +53,6 @@ def main():
|
||||||
'absl-py',
|
'absl-py',
|
||||||
'attrdict',
|
'attrdict',
|
||||||
'bs4',
|
'bs4',
|
||||||
'librosa',
|
|
||||||
'llvmlite == 0.31.0', # for numba==0.47.0
|
|
||||||
'numba == 0.47.0', # ships py3.5 wheel
|
|
||||||
'numpy',
|
'numpy',
|
||||||
'optuna',
|
'optuna',
|
||||||
'opuslib == 2.0.0',
|
'opuslib == 2.0.0',
|
||||||
|
@@ -63,6 +60,7 @@ def main():
|
||||||
'progressbar2',
|
'progressbar2',
|
||||||
'pyogg >= 0.6.14a1',
|
'pyogg >= 0.6.14a1',
|
||||||
'pyxdg',
|
'pyxdg',
|
||||||
|
'resampy >= 0.2.2',
|
||||||
'requests',
|
'requests',
|
||||||
'semver',
|
'semver',
|
||||||
'six',
|
'six',
|
||||||
|
|
|
@@ -3,6 +3,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
|
import resampy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from multiprocessing import Queue, Process
|
from multiprocessing import Queue, Process
|
||||||
|
@@ -129,7 +130,7 @@ def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, cl
|
||||||
Tensor of type float32
|
Tensor of type float32
|
||||||
The augmented spectrogram
|
The augmented spectrogram
|
||||||
"""
|
"""
|
||||||
if augmentations is not None:
|
if augmentations:
|
||||||
for augmentation in augmentations:
|
for augmentation in augmentations:
|
||||||
if isinstance(augmentation, GraphAugmentation):
|
if isinstance(augmentation, GraphAugmentation):
|
||||||
tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
|
tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
|
||||||
|
@@ -348,24 +349,25 @@ class Resample(SampleAugmentation):
|
||||||
self.rate = int_range(rate)
|
self.rate = int_range(rate)
|
||||||
|
|
||||||
def apply(self, sample, clock=0.0):
|
def apply(self, sample, clock=0.0):
|
||||||
# late binding librosa and its dependencies
|
|
||||||
# pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
|
|
||||||
import sklearn # pylint: disable=import-outside-toplevel
|
|
||||||
from librosa.core import resample # pylint: disable=import-outside-toplevel
|
|
||||||
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
|
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
|
||||||
rate = pick_value_from_range(self.rate, clock=clock)
|
rate = pick_value_from_range(self.rate, clock=clock)
|
||||||
audio = sample.audio
|
orig_len = len(sample.audio)
|
||||||
orig_len = len(audio)
|
resampled = resampy.resample(sample.audio, sample.audio_format.rate, rate, axis=0, filter='kaiser_fast')
|
||||||
audio = np.swapaxes(audio, 0, 1)
|
sample.audio = resampy.resample(resampled, rate, sample.audio_format.rate, axis=0, filter='kaiser_fast')[:orig_len]
|
||||||
if audio.shape[0] < 2:
|
|
||||||
# since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
|
|
||||||
resampled = resample(audio[0], sample.audio_format.rate, rate)
|
class NormalizeSampleRate(SampleAugmentation):
|
||||||
audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
|
def __init__(self, rate):
|
||||||
else:
|
super().__init__(p=1.0)
|
||||||
audio = resample(audio, sample.audio_format.rate, rate)
|
self.rate = rate
|
||||||
audio = resample(audio, rate, sample.audio_format.rate)
|
|
||||||
audio = np.swapaxes(audio, 0, 1)[0:orig_len]
|
def apply(self, sample, clock=0.0):
|
||||||
sample.audio = audio
|
if sample.audio_format.rate == self.rate:
|
||||||
|
return
|
||||||
|
|
||||||
|
sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
|
||||||
|
sample.audio = resampy.resample(sample.audio, sample.audio_format.rate, self.rate, axis=0, filter='kaiser_fast')
|
||||||
|
sample.audio_format = sample.audio_format._replace(rate=self.rate)
|
||||||
|
|
||||||
|
|
||||||
class Volume(SampleAugmentation):
|
class Volume(SampleAugmentation):
|
||||||
|
|
|
@@ -12,7 +12,7 @@ from .flags import FLAGS
|
||||||
from .gpu import get_available_gpus
|
from .gpu import get_available_gpus
|
||||||
from .logging import log_error, log_warn
|
from .logging import log_error, log_warn
|
||||||
from .helpers import parse_file_size
|
from .helpers import parse_file_size
|
||||||
from .augmentations import parse_augmentations
|
from .augmentations import parse_augmentations, NormalizeSampleRate
|
||||||
from .io import path_exists_remote
|
from .io import path_exists_remote
|
||||||
|
|
||||||
class ConfigSingleton:
|
class ConfigSingleton:
|
||||||
|
@@ -33,11 +33,14 @@ def initialize_globals():
|
||||||
|
|
||||||
# Augmentations
|
# Augmentations
|
||||||
c.augmentations = parse_augmentations(FLAGS.augment)
|
c.augmentations = parse_augmentations(FLAGS.augment)
|
||||||
if len(c.augmentations) > 0 and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
|
if c.augmentations and FLAGS.feature_cache and FLAGS.cache_for_epochs == 0:
|
||||||
log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
|
log_warn('Due to current feature-cache settings the exact same sample augmentations of the first '
|
||||||
'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
|
'epoch will be repeated on all following epochs. This could lead to unintended over-fitting. '
|
||||||
'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')
|
'You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs.')
|
||||||
|
|
||||||
|
if FLAGS.normalize_sample_rate:
|
||||||
|
c.augmentations = [NormalizeSampleRate(FLAGS.audio_sample_rate)] + c['augmentations']
|
||||||
|
|
||||||
# Caching
|
# Caching
|
||||||
if FLAGS.cache_for_epochs == 1:
|
if FLAGS.cache_for_epochs == 1:
|
||||||
log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')
|
log_warn('--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it.')
|
||||||
|
|
|
@@ -28,7 +28,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
||||||
lambda: tf.no_op(),
|
lambda: tf.no_op(),
|
||||||
name='matching_sample_rate')
|
name='matching_sample_rate')
|
||||||
|
|
||||||
if train_phase and augmentations is not None:
|
if train_phase and augmentations:
|
||||||
audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)
|
audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)
|
||||||
|
|
||||||
spectrogram = contrib_audio.audio_spectrogram(audio,
|
spectrogram = contrib_audio.audio_spectrogram(audio,
|
||||||
|
@@ -36,7 +36,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
||||||
stride=Config.audio_step_samples,
|
stride=Config.audio_step_samples,
|
||||||
magnitude_squared=True)
|
magnitude_squared=True)
|
||||||
|
|
||||||
if train_phase and augmentations is not None:
|
if train_phase and augmentations:
|
||||||
spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)
|
spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)
|
||||||
|
|
||||||
features = contrib_audio.mfcc(spectrogram=spectrogram,
|
features = contrib_audio.mfcc(spectrogram=spectrogram,
|
||||||
|
@@ -45,7 +45,7 @@ def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phas
|
||||||
upper_frequency_limit=FLAGS.audio_sample_rate / 2)
|
upper_frequency_limit=FLAGS.audio_sample_rate / 2)
|
||||||
features = tf.reshape(features, [-1, Config.n_input])
|
features = tf.reshape(features, [-1, Config.n_input])
|
||||||
|
|
||||||
if train_phase and augmentations is not None:
|
if train_phase and augmentations:
|
||||||
features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)
|
features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)
|
||||||
|
|
||||||
return features, tf.shape(input=features)[0]
|
return features, tf.shape(input=features)[0]
|
||||||
|
|
|
@@ -24,6 +24,7 @@ def create_flags():
|
||||||
f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
|
f.DEFINE_integer('feature_win_len', 32, 'feature extraction audio window length in milliseconds')
|
||||||
f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
|
f.DEFINE_integer('feature_win_step', 20, 'feature extraction window step length in milliseconds')
|
||||||
f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
|
f.DEFINE_integer('audio_sample_rate', 16000, 'sample rate value expected by model')
|
||||||
|
f.DEFINE_boolean('normalize_sample_rate', True, 'normalize sample rate of all train_files to --audio_sample_rate')
|
||||||
|
|
||||||
# Data Augmentation
|
# Data Augmentation
|
||||||
# ================
|
# ================
|
||||||
|
|
Loading…
Reference in New Issue