From c5ceee26dddf0e41486a09566d99c0128fe0476b Mon Sep 17 00:00:00 2001 From: Tilman Kamp <5991088+tilmankamp@users.noreply.github.com> Date: Thu, 9 Apr 2020 14:23:38 +0200 Subject: [PATCH] Live audio augmentation --- .gitignore | 1 + bin/build_sdb.py | 13 +- bin/compare_samples.py | 66 +++++ bin/play.py | 110 +++++--- bin/run-tc-signal_augmentations.sh | 66 +++++ doc/TRAINING.rst | 130 +++++++++- taskcluster/tc-train-tests.sh | 2 + training/deepspeech_training/train.py | 2 + training/deepspeech_training/util/audio.py | 61 +++-- training/deepspeech_training/util/feeding.py | 17 +- training/deepspeech_training/util/flags.py | 3 + training/deepspeech_training/util/helpers.py | 64 ++++- .../util/sample_collections.py | 235 ++++++++++++++---- .../util/signal_augmentations.py | 200 +++++++++++++++ 14 files changed, 857 insertions(+), 113 deletions(-) create mode 100755 bin/compare_samples.py create mode 100755 bin/run-tc-signal_augmentations.sh create mode 100644 training/deepspeech_training/util/signal_augmentations.py diff --git a/.gitignore b/.gitignore index 464c21dd..560e67c1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc *.swp *.DS_Store +*.egg-info .pit* /.run /werlog.js diff --git a/bin/build_sdb.py b/bin/build_sdb.py index d0e2f8e4..ef4810e7 100755 --- a/bin/build_sdb.py +++ b/bin/build_sdb.py @@ -15,7 +15,7 @@ from deepspeech_training.util.audio import ( from deepspeech_training.util.downloader import SIMPLE_BAR from deepspeech_training.util.sample_collections import ( DirectSDBWriter, - samples_from_files, + samples_from_sources, ) AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS} @@ -26,12 +26,10 @@ def build_sdb(): with DirectSDBWriter( CLI_ARGS.target, audio_type=audio_type, labeled=not CLI_ARGS.unlabeled ) as sdb_writer: - samples = samples_from_files(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled) + samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled) bar = progressbar.ProgressBar(max_value=len(samples), widgets=SIMPLE_BAR) for sample in bar( - change_audio_types( - samples, audio_type=audio_type, processes=CLI_ARGS.workers - ) + change_audio_types(samples, audio_type=audio_type, bitrate=CLI_ARGS.bitrate, processes=CLI_ARGS.workers) ): sdb_writer.add(sample) @@ -55,6 +53,11 @@ def handle_args(): choices=AUDIO_TYPE_LOOKUP.keys(), help="Audio representation inside target SDB", ) + parser.add_argument( + "--bitrate", + type=int, + help="Bitrate for lossy compressed SDB samples like in case of --audio-type opus", + ) parser.add_argument( "--workers", type=int, default=None, help="Number of encoding SDB workers" ) diff --git a/bin/compare_samples.py b/bin/compare_samples.py new file mode 100755 index 00000000..94108a7a --- /dev/null +++ b/bin/compare_samples.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +""" +Tool for comparing two wav samples +""" +import sys +import argparse + +from deepspeech_training.util.audio import AUDIO_TYPE_NP, mean_dbfs +from deepspeech_training.util.sample_collections import load_sample + + +def fail(message): + print(message, file=sys.stderr, flush=True) + sys.exit(1) + + +def compare_samples(): + sample1 = load_sample(CLI_ARGS.sample1) + sample2 = load_sample(CLI_ARGS.sample2) + if sample1.audio_format != sample2.audio_format: + fail('Samples differ on: audio-format ({} and {})'.format(sample1.audio_format, sample2.audio_format)) + if sample1.duration != sample2.duration: + fail('Samples differ on: duration ({} and {})'.format(sample1.duration, sample2.duration)) + 
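# The remaining checks compare the raw signals: both samples are converted
+    # to numpy arrays and subtracted, and the mean dBFS of the residual is
+    # measured against --threshold.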
+    sample1.change_audio_type(AUDIO_TYPE_NP)
+    sample2.change_audio_type(AUDIO_TYPE_NP)
+    audio_diff = sample1.audio - sample2.audio
+    diff_dbfs = mean_dbfs(audio_diff)
+    differ_msg = 'Samples differ on: sample data ({:0.2f} dB difference)'.format(diff_dbfs)
+    equal_msg = 'Samples are considered equal ({:0.2f} dB difference)'.format(diff_dbfs)
+    if CLI_ARGS.if_differ:
+        if diff_dbfs <= CLI_ARGS.threshold:
+            fail(equal_msg)
+        if not CLI_ARGS.no_success_output:
+            print(differ_msg, file=sys.stderr, flush=True)
+    else:
+        if diff_dbfs > CLI_ARGS.threshold:
+            fail(differ_msg)
+        if not CLI_ARGS.no_success_output:
+            print(equal_msg, file=sys.stderr, flush=True)
+
+
+def handle_args():
+    parser = argparse.ArgumentParser(
+        description="Tool for checking similarity of two samples"
+    )
+    parser.add_argument("sample1", help="Filename of sample 1 to compare")
+    parser.add_argument("sample2", help="Filename of sample 2 to compare")
+    parser.add_argument("--threshold", type=float, default=-60.0,
+                        help="Mean dBFS of the sample delta above which the samples are considered different")
+    parser.add_argument(
+        "--if-differ",
+        action="store_true",
+        help="Succeed and return status code 0 on different signals and fail on equal ones (inverse check). "
+        "This will still fail on different formats or durations.",
+    )
+    parser.add_argument(
+        "--no-success-output",
+        action="store_true",
+        help="Stay silent on success (if samples are equal or - with --if-differ - samples are not equal)",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    CLI_ARGS = handle_args()
+    compare_samples()
diff --git a/bin/play.py b/bin/play.py
index 9774732e..8f985da9 100755
--- a/bin/play.py
+++ b/bin/play.py
@@ -1,54 +1,72 @@
 #!/usr/bin/env python
 """
-Tool for playing samples from Sample Databases (SDB files) and DeepSpeech CSV files
+Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files
 Use "python3 play.py -h" for help
 """
-import argparse
-import random
+import os
 import sys
+import random
+import argparse
 
-from deepspeech_training.util.audio import AUDIO_TYPE_PCM
-from deepspeech_training.util.sample_collections import LabeledSample, samples_from_file
+from deepspeech_training.util.audio import LOADABLE_AUDIO_EXTENSIONS, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV
+from deepspeech_training.util.sample_collections import SampleList, LabeledSample, samples_from_source, prepare_samples
 
 
-def play_sample(samples, index):
-    if index < 0:
-        index = len(samples) + index
-    if CLI_ARGS.random:
-        index = random.randint(0, len(samples))
-    elif index >= len(samples):
-        print("No sample with index {}".format(CLI_ARGS.start))
-        sys.exit(1)
-    sample = samples[index]
-    print('Sample "{}"'.format(sample.sample_id))
-    if isinstance(sample, LabeledSample):
-        print(' "{}"'.format(sample.transcript))
-    sample.change_audio_type(AUDIO_TYPE_PCM)
-    rate, channels, width = sample.audio_format
-    wave_obj = simpleaudio.WaveObject(sample.audio, channels, width, rate)
-    play_obj = wave_obj.play()
-    play_obj.wait_done()
-
-
-def play_collection():
-    samples = samples_from_file(CLI_ARGS.collection, buffering=0)
+def get_samples_in_play_order():
+    ext = os.path.splitext(CLI_ARGS.source)[1].lower()
+    if ext in LOADABLE_AUDIO_EXTENSIONS:
+        samples = SampleList([(CLI_ARGS.source, 0)], labeled=False)
+    else:
+        samples = samples_from_source(CLI_ARGS.source, buffering=0)
     played = 0
     index = CLI_ARGS.start
     while True:
         if 0 <= CLI_ARGS.number <= played:
            return
-        play_sample(samples, index)
+        if CLI_ARGS.random:
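+            # --random: draw a uniformly distributed random index on each turn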
+            yield samples[random.randint(0, len(samples) - 1)]
+        elif index < 0:
+            yield samples[len(samples) + index]
+        elif index >= len(samples):
+            print("No sample with index {}".format(CLI_ARGS.start))
+            sys.exit(1)
+        else:
+            yield samples[index]
         played += 1
         index = (index + 1) % len(samples)
 
 
+def play_collection():
+    samples = get_samples_in_play_order()
+    samples = prepare_samples(samples,
+                              audio_type=AUDIO_TYPE_PCM,
+                              augmentation_specs=CLI_ARGS.augment,
+                              process_ahead=0,
+                              fixed_clock=CLI_ARGS.clock)
+    for sample in samples:
+        if not CLI_ARGS.quiet:
+            print('Sample "{}"'.format(sample.sample_id), file=sys.stderr)
+            if isinstance(sample, LabeledSample):
+                print(' "{}"'.format(sample.transcript), file=sys.stderr)
+        if CLI_ARGS.pipe:
+            sample.change_audio_type(AUDIO_TYPE_WAV)
+            sys.stdout.buffer.write(sample.audio.getvalue())
+            return
+        wave_obj = simpleaudio.WaveObject(sample.audio,
+                                          sample.audio_format.channels,
+                                          sample.audio_format.width,
+                                          sample.audio_format.rate)
+        play_obj = wave_obj.play()
+        play_obj.wait_done()
+
+
 def handle_args():
     parser = argparse.ArgumentParser(
-        description="Tool for playing samples from Sample Databases (SDB files) "
+        description="Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) "
         "and DeepSpeech CSV files"
     )
-    parser.add_argument("collection", help="Sample DB or CSV file to play samples from")
+    parser.add_argument("source", help="Sample DB, CSV or WAV file to play samples from")
     parser.add_argument(
         "--start",
         type=int,
@@ -66,16 +84,40 @@ def handle_args():
         action="store_true",
         help="If samples should be played in random order",
     )
+    parser.add_argument(
+        "--augment",
+        action='append',
+        help="Add an augmentation operation",
+    )
+    parser.add_argument(
+        "--clock",
+        type=float,
+        default=0.5,
+        help="Simulates clock value used for augmentations during training. "
+        "Ranges from 0.0 (representing parameter start values) to "
+        "1.0 (representing parameter end values)",
+    )
+    parser.add_argument(
+        "--pipe",
+        action="store_true",
+        help="Pipe first sample as wav file to stdout. Forces --number to 1.",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="No info logging to console",
+    )
     return parser.parse_args()
 
 
 if __name__ == "__main__":
-    try:
-        import simpleaudio
-    except ModuleNotFoundError:
-        print('play.py requires Python package "simpleaudio"')
-        sys.exit(1)
     CLI_ARGS = handle_args()
+    if not CLI_ARGS.pipe:
+        try:
+            import simpleaudio
+        except ModuleNotFoundError:
+            print('Unless using the --pipe flag, play.py requires Python package "simpleaudio" for playing samples')
+            sys.exit(1)
     try:
         play_collection()
     except KeyboardInterrupt:
diff --git a/bin/run-tc-signal_augmentations.sh b/bin/run-tc-signal_augmentations.sh
new file mode 100755
index 00000000..5be8ed12
--- /dev/null
+++ b/bin/run-tc-signal_augmentations.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+
+set -xe
+
+ldc93s1_dir=`cd data/smoke_test; pwd`
+ldc93s1_csv="${ldc93s1_dir}/LDC93S1.csv"
+ldc93s1_wav="${ldc93s1_dir}/LDC93S1.wav"
+ldc93s1_overlay_csv="${ldc93s1_dir}/LDC93S1_overlay.csv"
+ldc93s1_overlay_wav="${ldc93s1_dir}/LDC93S1_reversed.wav"
+
+play="python bin/play.py --number 1 --quiet"
+compare="python bin/compare_samples.py --no-success-output"
+
+if [ ! -f "${ldc93s1_csv}" ]; then
+  echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
+  python -u bin/import_ldc93s1.py ${ldc93s1_dir}
+fi;
+
+if [ ! -f "${ldc93s1_overlay_csv}" ]; then
+  echo "Reversing ${ldc93s1_wav} to ${ldc93s1_overlay_wav}."
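+  # Reversing the file with sox guarantees an overlay signal that differs
+  # from the original while keeping format and duration identical.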
+  sox "${ldc93s1_wav}" "${ldc93s1_overlay_wav}" reverse
+
+  echo "Creating ${ldc93s1_overlay_csv}."
+  printf "wav_filename\n${ldc93s1_overlay_wav}" > "${ldc93s1_overlay_csv}"
+fi;
+
+if ! $compare --if-differ "${ldc93s1_wav}" "${ldc93s1_overlay_wav}"; then
+  echo "Sample comparison tool not working correctly"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment overlay[source="${ldc93s1_overlay_csv}",snr=20] --pipe >/tmp/overlay-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/overlay-test.wav; then
+  echo "Overlay augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment reverb[delay=50.0,decay=2.0] --pipe >/tmp/reverb-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/reverb-test.wav; then
+  echo "Reverb augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment gaps[n=10,size=100.0] --pipe >/tmp/gaps-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/gaps-test.wav; then
+  echo "Gaps augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment resample[rate=4000] --pipe >/tmp/resample-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/resample-test.wav; then
+  echo "Resample augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment codec[bitrate=4000] --pipe >/tmp/codec-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/codec-test.wav; then
+  echo "Codec augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
+
+$play ${ldc93s1_wav} --augment volume --pipe >/tmp/volume-test.wav
+if ! $compare --if-differ "${ldc93s1_wav}" /tmp/volume-test.wav; then
+  echo "Volume augmentation had no effect or changed basic sample properties"
+  exit 1
+fi
diff --git a/doc/TRAINING.rst b/doc/TRAINING.rst
index fe31ccd1..756c38f2 100644
--- a/doc/TRAINING.rst
+++ b/doc/TRAINING.rst
@@ -263,21 +263,139 @@ UTF-8 mode
 
 DeepSpeech includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`.
 
-Training with augmentation
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Augmentation
+^^^^^^^^^^^^
 
 Augmentation is a useful technique for better generalization of machine learning models. Thus, a pre-processing pipeline with various augmentation techniques on raw pcm and spectrogram has been implemented and can be used while training the model. Following are the available augmentation techniques that can be enabled at training time by using the corresponding flags in the command line.
 
-Audio Augmentation
-~~~~~~~~~~~~~~~~~~
+Audio Augmentation before feature caching
+-----------------------------------------
+
+Augmentations that are applied before potential feature caching can be specified through the ``--augment`` multi-flag.
+
+Each sample of the training data gets treated by every specified augmentation in the given order. Whether an augmentation is actually applied to a sample is decided by chance, based on the augmentation's probability value: a value of ``p=0.1``, for example, applies the augmentation to just 10% of all samples. This also means that augmentations are not mutually exclusive on a per-sample basis.
+
+The ``--augment`` flag's value follows a common format (shown here for an overlay example):
+
+.. code-block:: bash
+
+    python3 DeepSpeech.py --augment overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0] ...
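+
+Note that, depending on your shell, specifications containing brackets may have to be quoted (e.g. ``--augment "overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0]"``).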
+
+
+Values specified in the following as ``<float-range>`` or ``<int-range>`` support the following formats:
+
+ * ``<value>``: A constant value.
+
+ * ``<value>~<r>``: A center value with a randomization radius ``<r>`` around it. E.g. ``1.2~0.4`` will result in picking a random value between 0.8 and 1.6 on each sample augmentation.
+
+ * ``<start>:<end>``: The value will range from ``<start>`` at the beginning of an epoch to ``<end>`` at the end of an epoch. E.g. ``-0.2:1.2`` (float) or ``2000:4000`` (int).
+
+ * ``<start>:<end>~<r>``: Combination of the latter two cases with a ranging center value. E.g. ``4:6~2`` would pick values between 2 and 6 at the beginning of an epoch and values between 4 and 8 at the end of an epoch.
+
+
+The flag ``--augmentations_per_epoch`` allows specifying how often the whole training set should be repeated per epoch for re-augmenting all its samples. Be aware: this will also multiply the required size of the feature cache (if enabled).
+
+
+**Overlay augmentation** ``--augment overlay[p=<float>,source=<str>,snr=<float-range>,layers=<int-range>]``
+  Layers another audio source (multiple times) onto augmented samples.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **source**: path to the sample collection to use for augmenting (*.sdb or *.csv file)
+
+  * **snr**: signal to noise ratio in dB - positive values for lowering volume of the overlay in relation to the sample
+
+  * **layers**: number of layers of the overlay signal (e.g. 10 layers of speech to get a "cocktail-party effect")
+
+
+**Reverb augmentation** ``--augment reverb[p=<float>,delay=<float-range>,decay=<float-range>]``
+  Adds reverberation to the augmented samples.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **delay**: time delay in ms for the first signal reflection - higher values are widening the perceived "room"
+
+  * **decay**: sound decay in dB per reflection - higher values will result in a less reflective perceived "room"
+
+
+**Gaps augmentation** ``--augment gaps[p=<float>,n=<int-range>,size=<float-range>]``
+  Zeros time-intervals within the augmented samples.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **n**: number of intervals to zero
+
+  * **size**: interval durations in ms
+
+
+**Resample augmentation** ``--augment resample[p=<float>,rate=<int-range>]``
+  Re-samples augmented samples to another sample-rate and back.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **rate**: sample-rate to re-sample to
+
+
+**Codec augmentation** ``--augment codec[p=<float>,bitrate=<int-range>]``
+  Compresses and re-expands augmented samples using the lossy Opus audio codec.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **bitrate**: bitrate used during compression
+
+
+**Volume augmentation** ``--augment volume[p=<float>,dbfs=<float-range>]``
+  Measures and levels augmented samples to a target dBFS value.
+
+  * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+
+  * **dbfs**: target volume in dBFS (the default value of 3.0103 will normalize min and max amplitudes to -1.0/1.0)
+
+
+Example training with all augmentations:
+
+.. 
code-block:: bash
+
+    python -u DeepSpeech.py \
+        --train_files "train.sdb" \
+        --augmentations_per_epoch 10 \
+        --augment overlay[p=0.5,source=noise.sdb,layers=1,snr=50:20~10] \
+        --augment overlay[p=0.2,source=voices.sdb,layers=10:6,snr=50:20~10] \
+        --augment reverb[p=0.1,delay=50.0~30.0,decay=10.0:2.0~1.0] \
+        --augment gaps[p=0.05,n=1:3~2,size=10:100] \
+        --augment resample[p=0.1,rate=12000:8000~4000] \
+        --augment codec[p=0.1,bitrate=48000:16000] \
+        --augment volume[p=0.1,dbfs=-10:-40] \
+        [...]
+
+
+The ``bin/play.py`` tool also supports ``--augment`` parameters and can be used for experimenting with different configurations.
+
+Example of playing all samples with reverberation and maximized volume:
+
+.. code-block:: bash
+
+    bin/play.py --augment reverb[delay=50.0,decay=2.0] --augment volume --random test.sdb
+
+Example simulation of the codec augmentation of a wav-file first at the beginning and then at the end of an epoch:
+
+.. code-block:: bash
+
+    bin/play.py --augment codec[bitrate=48000:16000] --clock 0.0 test.wav
+    bin/play.py --augment codec[bitrate=48000:16000] --clock 1.0 test.wav
+
+
+Audio Augmentation after feature caching
+----------------------------------------
 
 #. **Standard deviation for Gaussian additive noise:** ``--data_aug_features_additive``
 
 #. **Standard deviation for Normal distribution around 1 for multiplicative noise:** ``--data_aug_features_multiplicative``
 
 #. **Standard deviation for speeding-up tempo. If Standard deviation is 0, this augmentation is not performed:** ``--augmentation_speed_up_std``
 
-Spectrogram Augmentation
-~~~~~~~~~~~~~~~~~~~~~~~~
+Spectrogram Augmentation after feature caching
+----------------------------------------------
 
 Inspired by Google Paper on `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition <https://arxiv.org/abs/1904.08779>`_
diff --git a/taskcluster/tc-train-tests.sh b/taskcluster/tc-train-tests.sh
index c8708f50..88d5ffbb 100644
--- a/taskcluster/tc-train-tests.sh
+++ b/taskcluster/tc-train-tests.sh
@@ -39,6 +39,8 @@ echo "Moving ${sample_name} to LDC93S1.wav"
 mv "${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/${sample_name}" "${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/LDC93S1.wav"
 
 pushd ${HOME}/DeepSpeech/ds/
+    # Testing signal augmentations
+    time ./bin/run-tc-signal_augmentations.sh
     # Run twice to test preprocessed features
     time ./bin/run-tc-ldc93s1_new.sh 249 "${sample_rate}"
     time ./bin/run-tc-ldc93s1_new.sh 1 "${sample_rate}"
diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py
index 69b2870f..f1832e62 100644
--- a/training/deepspeech_training/train.py
+++ b/training/deepspeech_training/train.py
@@ -424,6 +424,8 @@ def train():
     # Create training and validation datasets
     train_set = create_dataset(FLAGS.train_files.split(','),
                                batch_size=FLAGS.train_batch_size,
+                               repetitions=FLAGS.augmentations_per_epoch,
+                               augmentation_specs=FLAGS.augment,
                                enable_cache=FLAGS.feature_cache and do_cache_dataset,
                                cache_path=FLAGS.feature_cache,
                                train_phase=True,
diff --git a/training/deepspeech_training/util/audio.py b/training/deepspeech_training/util/audio.py
index a6eb2593..a1f0a93e 100644
--- a/training/deepspeech_training/util/audio.py
+++ b/training/deepspeech_training/util/audio.py
@@ -1,14 +1,16 @@
 import os
 import io
 import wave
+import math
 import tempfile
 import collections
 import numpy as np
 
-from .helpers import LimitingPool
+from .helpers import LimitingPool, np_capped_squares
 from collections import namedtuple
 
 AudioFormat = namedtuple('AudioFormat', 'rate channels 
width') +dBFS = namedtuple('dBFS', 'mean max') DEFAULT_RATE = 16000 DEFAULT_CHANNELS = 1 @@ -20,6 +22,7 @@ AUDIO_TYPE_PCM = 'application/vnd.mozilla.pcm' AUDIO_TYPE_WAV = 'audio/wav' AUDIO_TYPE_OPUS = 'application/vnd.mozilla.opus' SERIALIZABLE_AUDIO_TYPES = [AUDIO_TYPE_WAV, AUDIO_TYPE_OPUS] +LOADABLE_AUDIO_EXTENSIONS = {'.wav': AUDIO_TYPE_WAV} OPUS_PCM_LEN_SIZE = 4 OPUS_RATE_SIZE = 4 @@ -81,7 +84,7 @@ class Sample: else: raise ValueError('Unsupported audio type: {}'.format(self.audio_type)) - def change_audio_type(self, new_audio_type): + def change_audio_type(self, new_audio_type, bitrate=None): """ In-place conversion of audio data into a different representation. @@ -89,6 +92,8 @@ class Sample: ---------- new_audio_type : str New audio-type - see `__init__`. + bitrate : int + Bitrate to use in case of converting to a lossy audio-type. """ if self.audio_type == new_audio_type: return @@ -104,7 +109,7 @@ class Sample: elif new_audio_type in SERIALIZABLE_AUDIO_TYPES: self.change_audio_type(AUDIO_TYPE_PCM) audio_bytes = io.BytesIO() - write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format) + write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format, bitrate=bitrate) audio_bytes.seek(0) self.audio = audio_bytes else: @@ -114,14 +119,20 @@ class Sample: def _change_audio_type(sample_and_audio_type): - sample, audio_type = sample_and_audio_type - sample.change_audio_type(audio_type) + sample, audio_type, bitrate = sample_and_audio_type + sample.change_audio_type(audio_type, bitrate=bitrate) return sample -def change_audio_types(samples, audio_type=AUDIO_TYPE_PCM, processes=None, process_ahead=None): +def change_audio_types(samples, audio_type=AUDIO_TYPE_PCM, bitrate=None, processes=None, process_ahead=None): with LimitingPool(processes=processes, process_ahead=process_ahead) as pool: - yield from pool.imap(_change_audio_type, map(lambda s: (s, audio_type), samples)) + yield from pool.imap(_change_audio_type, map(lambda s: (s, audio_type, bitrate), samples)) + + +def get_audio_type_from_extension(ext): + if ext in LOADABLE_AUDIO_EXTENSIONS: + return LOADABLE_AUDIO_EXTENSIONS[ext] + return None def read_audio_format_from_wav_file(wav_file): @@ -264,10 +275,12 @@ def get_opus_frame_size(rate): return 60 * rate // 1000 -def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT): +def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT, bitrate=None): frame_size = get_opus_frame_size(audio_format.rate) import opuslib # pylint: disable=import-outside-toplevel encoder = opuslib.Encoder(audio_format.rate, audio_format.channels, 'audio') + if bitrate is not None: + encoder.bitrate = bitrate chunk_size = frame_size * audio_format.channels * audio_format.width opus_file.write(pack_number(len(audio_data), OPUS_PCM_LEN_SIZE)) opus_file.write(pack_number(audio_format.rate, OPUS_RATE_SIZE)) @@ -277,7 +290,7 @@ def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT): chunk = audio_data[i:i + chunk_size] # Preventing non-deterministic encoding results from uninitialized remainder of the encoder buffer if len(chunk) < chunk_size: - chunk = chunk + bytearray(chunk_size - len(chunk)) + chunk = chunk + b'\0' * (chunk_size - len(chunk)) encoded = encoder.encode(chunk, frame_size) opus_file.write(pack_number(len(encoded), OPUS_CHUNK_LEN_SIZE)) opus_file.write(encoded) @@ -304,7 +317,7 @@ def read_opus(opus_file): decoded = decoder.decode(chunk, frame_size) audio_data.extend(decoded) audio_data = audio_data[:pcm_buffer_size] - return 
audio_format, audio_data + return audio_format, bytes(audio_data) def write_wav(wav_file, pcm_data, audio_format=DEFAULT_FORMAT): @@ -331,11 +344,11 @@ def read_audio(audio_type, audio_file): raise ValueError('Unsupported audio type: {}'.format(audio_type)) -def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT): +def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT, bitrate=None): if audio_type == AUDIO_TYPE_WAV: return write_wav(audio_file, pcm_data, audio_format=audio_format) if audio_type == AUDIO_TYPE_OPUS: - return write_opus(audio_file, pcm_data, audio_format=audio_format) + return write_opus(audio_file, pcm_data, audio_format=audio_format, bitrate=bitrate) raise ValueError('Unsupported audio type: {}'.format(audio_type)) @@ -376,6 +389,26 @@ def np_to_pcm(np_data, audio_format=DEFAULT_FORMAT): assert audio_format.channels == 1 # only mono supported for now dtype = get_dtype(audio_format) np_data = np_data.squeeze() - np_data *= dtype.max + np_data = np_data * np.iinfo(dtype).max np_data = np_data.astype(dtype) - return bytearray(np_data.tobytes()) + return np_data.tobytes() + + +def rms_to_dbfs(rms): + return 20.0 * math.log10(max(1e-16, rms)) + 3.0103 + + +def mean_dbfs(sample_data): + return rms_to_dbfs(math.sqrt(np.mean(np_capped_squares(sample_data)))) + + +def max_dbfs(sample_data): + return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data)))) + + +def gain_db_to_ratio(gain_db): + return math.pow(10.0, gain_db / 20.0) + + +def normalize_audio(sample_data, dbfs=3.0103): + return np.maximum(np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)), 1.0), -1.0) diff --git a/training/deepspeech_training/util/feeding.py b/training/deepspeech_training/util/feeding.py index b1ef7c20..1044c8f6 100644 --- a/training/deepspeech_training/util/feeding.py +++ b/training/deepspeech_training/util/feeding.py @@ -12,8 +12,8 @@ from .config import Config from .text import text_to_char_array from .flags import FLAGS from .spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up, augment_sparse_warp -from .audio import change_audio_types, read_frames_from_file, vad_split, pcm_to_np, DEFAULT_FORMAT, AUDIO_TYPE_NP -from .sample_collections import samples_from_files +from .audio import read_frames_from_file, vad_split, pcm_to_np, DEFAULT_FORMAT +from .sample_collections import samples_from_sources, prepare_samples from .helpers import remember_exception, MEGABYTE @@ -109,6 +109,8 @@ def to_sparse_tuple(sequence): def create_dataset(sources, batch_size, + repetitions=1, + augmentation_specs=None, enable_cache=False, cache_path=None, train_phase=False, @@ -116,10 +118,13 @@ def create_dataset(sources, process_ahead=None, buffering=1 * MEGABYTE): def generate_values(): - samples = samples_from_files(sources, buffering=buffering, labeled=True) - for sample in change_audio_types(samples, - AUDIO_TYPE_NP, - process_ahead=2 * batch_size if process_ahead is None else process_ahead): + samples = samples_from_sources(sources, buffering=buffering, labeled=True) + samples = prepare_samples(samples, + repetitions=repetitions, + augmentation_specs=augmentation_specs, + buffering=buffering, + process_ahead=2 * batch_size if process_ahead is None else process_ahead) + for sample in samples: transcript = text_to_char_array(sample.transcript, Config.alphabet, context=sample.sample_id) transcript = to_sparse_tuple(transcript) yield sample.sample_id, sample.audio, 
sample.audio_format.rate, transcript
diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py
index 2bd65be0..69774c78 100644
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@@ -26,6 +26,9 @@ def create_flags():
     # Data Augmentation
     # ================
 
+    f.DEFINE_multi_string('augment', None, 'specifies an augmentation of the training samples. Format is "--augment operation[param1=value1, ...]"')
+    f.DEFINE_integer('augmentations_per_epoch', 1, 'how often the train set should be repeated and re-augmented per epoch')
+
     f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
     f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
 
diff --git a/training/deepspeech_training/util/helpers.py b/training/deepspeech_training/util/helpers.py
index c158066d..5100f2b6 100644
--- a/training/deepspeech_training/util/helpers.py
+++ b/training/deepspeech_training/util/helpers.py
@@ -1,10 +1,14 @@
 import os
 import sys
 import time
+import math
 import heapq
 import semver
+import random
+import numpy as np
 
 from multiprocessing import Pool
+from collections import namedtuple
 
 KILO = 1024
 KILOBYTE = 1 * KILO
@@ -13,6 +17,8 @@ GIGABYTE = KILO * MEGABYTE
 TERABYTE = KILO * GIGABYTE
 SIZE_PREFIX_LOOKUP = {'k': KILOBYTE, 'm': MEGABYTE, 'g': GIGABYTE, 't': TERABYTE}
 
+ValueRange = namedtuple('ValueRange', 'start end r')
+
 
 def parse_file_size(file_size):
     file_size = file_size.lower().strip()
@@ -79,11 +85,11 @@ class LimitingPool:
     """Limits unbound ahead-processing of multiprocessing.Pool's imap method
     before items get consumed by the iteration caller.
     This prevents OOM issues in situations where items represent larger memory allocations."""
-    def __init__(self, processes=None, process_ahead=None, sleeping_for=0.1):
+    def __init__(self, processes=None, initializer=None, initargs=None, process_ahead=None, sleeping_for=0.1):
         self.process_ahead = os.cpu_count() if process_ahead is None else process_ahead
         self.sleeping_for = sleeping_for
         self.processed = 0
-        self.pool = Pool(processes=processes)
+        self.pool = Pool(processes=processes, initializer=initializer, initargs=initargs)
 
     def __enter__(self):
         return self
@@ -100,6 +106,9 @@ class LimitingPool:
             self.processed -= 1
             yield obj
 
+    def terminate(self):
+        self.pool.terminate()
+
     def __exit__(self, exc_type, exc_value, traceback):
         self.pool.close()
 
@@ -128,3 +137,54 @@ def remember_exception(iterable, exception_box=None):
         except Exception as ex:  # pylint: disable = broad-except
             exception_box.exception = ex
     return iterable if exception_box is None else do_iterate
+
+
+def get_value_range(value, target_type):
+    if isinstance(value, str):
+        r = target_type(0)
+        parts = value.split('~')
+        if len(parts) == 2:
+            value = parts[0]
+            r = target_type(parts[1])
+        elif len(parts) > 2:
+            raise ValueError('Cannot parse value range')
+        parts = value.split(':')
+        if len(parts) == 1:
+            parts.append(parts[0])
+        elif len(parts) > 2:
+            raise ValueError('Cannot parse value range')
+        return ValueRange(target_type(parts[0]), target_type(parts[1]), r)
+    if isinstance(value, tuple):
+        if len(value) == 2:
+            return ValueRange(target_type(value[0]), target_type(value[1]), 0)
+        if len(value) == 3:
+            # the third tuple entry is the randomization radius
+            return ValueRange(target_type(value[0]), target_type(value[1]), target_type(value[2]))
+        raise ValueError('Cannot convert to ValueRange: Wrong tuple size')
+    return ValueRange(target_type(value), target_type(value), 0)
+
+
+def 
int_range(value): + return get_value_range(value, int) + + +def float_range(value): + return get_value_range(value, float) + + +def pick_value_from_range(value_range, clock=None): + clock = random.random() if clock is None else max(0.0, min(1.0, float(clock))) + value = value_range.start + clock * (value_range.end - value_range.start) + value = random.uniform(value - value_range.r, value + value_range.r) + return round(value) if isinstance(value_range.start, int) else value + + +def call_if_exists(o, name, *args, **kwargs): + method = getattr(o, name, None) + if callable(method): + method(*args, **kwargs) + + +def np_capped_squares(data): + sqrt_max = math.sqrt(np.finfo(data.dtype).max) + data = np.minimum(np.maximum(data, -sqrt_max), sqrt_max) # prevent overflow during squaring + return data ** 2 diff --git a/training/deepspeech_training/util/sample_collections.py b/training/deepspeech_training/util/sample_collections.py index 2a78daea..97dd0b6f 100644 --- a/training/deepspeech_training/util/sample_collections.py +++ b/training/deepspeech_training/util/sample_collections.py @@ -2,11 +2,14 @@ import os import csv import json +import random from pathlib import Path from functools import partial -from .helpers import MEGABYTE, GIGABYTE, Interleaved -from .audio import Sample, DEFAULT_FORMAT, AUDIO_TYPE_WAV, AUDIO_TYPE_OPUS, SERIALIZABLE_AUDIO_TYPES + +from .signal_augmentations import parse_augmentation +from .helpers import MEGABYTE, GIGABYTE, Interleaved, LimitingPool, call_if_exists +from .audio import Sample, DEFAULT_FORMAT, AUDIO_TYPE_OPUS, AUDIO_TYPE_NP, SERIALIZABLE_AUDIO_TYPES, get_audio_type_from_extension BIG_ENDIAN = 'big' INT_SIZE = 4 @@ -47,9 +50,42 @@ class LabeledSample(Sample): self.transcript = transcript +def load_sample(filename, label=None): + """ + Loads audio-file as a (labeled or unlabeled) sample + + Parameters + ---------- + filename : str + Filename of the audio-file to load as sample + label : str + Label (transcript) of the sample. + If None: return util.audio.Sample instance + Otherwise: return util.sample_collections.LabeledSample instance + + Returns + ------- + util.audio.Sample instance if label is None, else util.sample_collections.LabeledSample instance + """ + ext = os.path.splitext(filename)[1].lower() + audio_type = get_audio_type_from_extension(ext) + if audio_type is None: + raise ValueError('Unknown audio type extension "{}"'.format(ext)) + with open(filename, 'rb') as audio_file: + if label is None: + return Sample(audio_type, audio_file.read(), sample_id=filename) + return LabeledSample(audio_type, audio_file.read(), label, sample_id=filename) + + class DirectSDBWriter: """Sample collection writer for creating a Sample DB (SDB) file""" - def __init__(self, sdb_filename, buffering=BUFFER_SIZE, audio_type=AUDIO_TYPE_OPUS, id_prefix=None, labeled=True): + def __init__(self, + sdb_filename, + buffering=BUFFER_SIZE, + audio_type=AUDIO_TYPE_OPUS, + bitrate=None, + id_prefix=None, + labeled=True): """ Parameters ---------- @@ -59,6 +95,8 @@ class DirectSDBWriter: Write-buffer size to use while writing the SDB file audio_type : str See util.audio.Sample.__init__ . + bitrate : int + Bitrate for sample-compression in case of lossy audio_type (e.g. 
AUDIO_TYPE_OPUS)
         id_prefix : str
             Prefix for IDs of written samples - defaults to sdb_filename
         labeled : bool or None
@@ -71,6 +109,7 @@ class DirectSDBWriter:
         if audio_type not in SERIALIZABLE_AUDIO_TYPES:
             raise ValueError('Audio type "{}" not supported'.format(audio_type))
         self.audio_type = audio_type
+        self.bitrate = bitrate
         self.sdb_file = open(sdb_filename, 'wb', buffering=buffering)
         self.offsets = []
         self.num_samples = 0
@@ -100,7 +139,7 @@ class DirectSDBWriter:
     def add(self, sample):
         def to_bytes(n):
             return n.to_bytes(INT_SIZE, BIG_ENDIAN)
-        sample.change_audio_type(self.audio_type)
+        sample.change_audio_type(self.audio_type, bitrate=self.bitrate)
         opus = sample.audio.getbuffer()
         opus_len = to_bytes(len(opus))
         if self.labeled:
@@ -260,7 +299,36 @@ class SDB:  # pylint: disable=too-many-instance-attributes
         self.close()
 
 
-class CSV:
+class SampleList:
+    """Sample collection reader for lists of (filename, filesize [, transcript]) tuples.
+    Automatically orders samples by the provided file-size."""
+    def __init__(self, samples, labeled=True):
+        """
+        Parameters
+        ----------
+        samples : iterable of tuples of the form (sample_filename, filesize [, transcript])
+            File-size is used for ordering the samples; a transcript has to be provided if labeled=True
+        labeled : bool or None
+            If True: Reads util.sample_collections.LabeledSample instances. Fails if no transcript is provided.
+            If False: Ignores transcripts (if available) and reads (unlabeled) util.audio.Sample instances.
+        """
+        self.labeled = labeled
+        self.samples = list(samples)
+        self.samples.sort(key=lambda r: r[1])
+
+    def __getitem__(self, i):
+        sample_spec = self.samples[i]
+        return load_sample(sample_spec[0], label=sample_spec[2] if self.labeled else None)
+
+    def __iter__(self):
+        for i in range(len(self.samples)):
+            yield self[i]
+
+    def __len__(self):
+        return len(self.samples)
+
+
+class CSV(SampleList):
     """Sample collection reader for reading a DeepSpeech CSV file
     Automatically orders samples by CSV column wav_filesize (if available)."""
     def __init__(self, csv_filename, labeled=None):
@@ -275,16 +343,14 @@ class CSV:
             If None: Automatically determines if CSV file has a transcript column
             (reading util.sample_collections.LabeledSample instances) or not (reading util.audio.Sample instances).
""" - self.csv_filename = csv_filename - self.labeled = labeled - self.rows = [] + rows = [] csv_dir = Path(csv_filename).parent with open(csv_filename, 'r', encoding='utf8') as csv_file: reader = csv.DictReader(csv_file) if 'transcript' in reader.fieldnames: - if self.labeled is None: - self.labeled = True - elif self.labeled: + if labeled is None: + labeled = True + elif labeled: raise RuntimeError('No transcript data (missing CSV column)') for row in reader: wav_filename = Path(row['wav_filename']) @@ -292,36 +358,20 @@ class CSV: wav_filename = csv_dir / wav_filename wav_filename = str(wav_filename) wav_filesize = int(row['wav_filesize']) if 'wav_filesize' in row else 0 - if self.labeled: - self.rows.append((wav_filename, wav_filesize, row['transcript'])) + if labeled: + rows.append((wav_filename, wav_filesize, row['transcript'])) else: - self.rows.append((wav_filename, wav_filesize)) - self.rows.sort(key=lambda r: r[1]) - - def __getitem__(self, i): - row = self.rows[i] - wav_filename = row[0] - with open(wav_filename, 'rb') as wav_file: - if self.labeled: - return LabeledSample(AUDIO_TYPE_WAV, wav_file.read(), row[2], sample_id=wav_filename) - return Sample(AUDIO_TYPE_WAV, wav_file.read(), sample_id=wav_filename) - - def __iter__(self): - for i in range(len(self.rows)): - yield self[i] - - def __len__(self): - return len(self.rows) + rows.append((wav_filename, wav_filesize)) + super(CSV, self).__init__(rows, labeled=labeled) -def samples_from_file(filename, buffering=BUFFER_SIZE, labeled=None): +def samples_from_source(sample_source, buffering=BUFFER_SIZE, labeled=None): """ - Returns an iterable of util.sample_collections.LabeledSample or util.audio.Sample instances - loaded from a sample source file. + Loads samples from a sample source file. Parameters ---------- - filename : str + sample_source : str Path to the sample source file (SDB or CSV) buffering : int Read-buffer size to use while reading files @@ -330,23 +380,27 @@ def samples_from_file(filename, buffering=BUFFER_SIZE, labeled=None): If False: Ignores transcripts (if available) and reads (unlabeled) util.audio.Sample instances. If None: Automatically determines if source provides transcripts (reading util.sample_collections.LabeledSample instances) or not (reading util.audio.Sample instances). + + Returns + ------- + iterable of util.sample_collections.LabeledSample or util.audio.Sample instances supporting len. """ - ext = os.path.splitext(filename)[1].lower() + ext = os.path.splitext(sample_source)[1].lower() if ext == '.sdb': - return SDB(filename, buffering=buffering, labeled=labeled) + return SDB(sample_source, buffering=buffering, labeled=labeled) if ext == '.csv': - return CSV(filename, labeled=labeled) + return CSV(sample_source, labeled=labeled) raise ValueError('Unknown file type: "{}"'.format(ext)) -def samples_from_files(filenames, buffering=BUFFER_SIZE, labeled=None): +def samples_from_sources(sample_sources, buffering=BUFFER_SIZE, labeled=None): """ - Returns an iterable of util.sample_collections.LabeledSample or util.audio.Sample instances - loaded from a collection of sample source files. + Loads and combines samples from a list of source files. Sources are combined in an interleaving way to + keep default sample order from shortest to longest. 
Parameters ---------- - filenames : list of str + sample_sources : list of str Paths to sample source files (SDBs or CSVs) buffering : int Read-buffer size to use while reading files @@ -355,11 +409,100 @@ def samples_from_files(filenames, buffering=BUFFER_SIZE, labeled=None): If False: Ignores transcripts (if available) and always reads (unlabeled) util.audio.Sample instances. If None: Reads util.sample_collections.LabeledSample instances from sources with transcripts and util.audio.Sample instances from sources with no transcripts. + + Returns + ------- + iterable of util.sample_collections.LabeledSample (labeled=True) or util.audio.Sample (labeled=False) supporting len """ - filenames = list(filenames) - if len(filenames) == 0: + sample_sources = list(sample_sources) + if len(sample_sources) == 0: raise ValueError('No files') - if len(filenames) == 1: - return samples_from_file(filenames[0], buffering=buffering, labeled=labeled) - cols = list(map(partial(samples_from_file, buffering=buffering, labeled=labeled), filenames)) + if len(sample_sources) == 1: + return samples_from_source(sample_sources[0], buffering=buffering, labeled=labeled) + cols = list(map(partial(samples_from_source, buffering=buffering, labeled=labeled), sample_sources)) return Interleaved(*cols, key=lambda s: s.duration) + + +class PreparationContext: + def __init__(self, target_audio_type, augmentations): + self.target_audio_type = target_audio_type + self.augmentations = augmentations + + +PREPARATION_CONTEXT = None + + +def _init_preparation_worker(preparation_context): + global PREPARATION_CONTEXT # pylint: disable=global-statement + PREPARATION_CONTEXT = preparation_context + + +def _prepare_sample(timed_sample, context=None): + context = PREPARATION_CONTEXT if context is None else context + sample, clock = timed_sample + for augmentation in context.augmentations: + if random.random() < augmentation.probability: + augmentation.apply(sample, clock) + sample.change_audio_type(new_audio_type=context.target_audio_type) + return sample + + +def prepare_samples(samples, + audio_type=AUDIO_TYPE_NP, + augmentation_specs=None, + buffering=BUFFER_SIZE, + process_ahead=None, + repetitions=1, + fixed_clock=None): + """ + Prepares samples for being used during training. + This includes parallel and buffered application of augmentations and a conversion to a specified audio-type. + + Parameters + ---------- + samples : Sample enumeration + Typically produced by samples_from_sources. + audio_type : str + Target audio-type to convert samples to. See util.audio.Sample.__init__ . + augmentation_specs : list of str + Augmentation specifications like ["reverb[delay=20.0,decay=-20]", "volume"]. See TRAINING.rst. + buffering : int + Read-buffer size to use while reading files. + process_ahead : int + Number of samples to pre-process ahead of time. + repetitions : int + How often the input sample enumeration should get repeated for being re-augmented. + fixed_clock : float + Sets the internal clock to a value between 0.0 (beginning of epoch) and 1.0 (end of epoch). + Setting this to a number is used for simulating augmentations at a certain epoch-time. + If kept at None (default), the internal clock will run regularly from 0.0 to 1.0, + hence preparing them for training. 
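(bin/play.py sets this via its --clock flag to preview augmentations at a simulated epoch-time.)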
+ + Returns + ------- + iterable of util.sample_collections.LabeledSample or util.audio.Sample + """ + def timed_samples(): + for repetition in range(repetitions): + for sample_index, sample in enumerate(samples): + if fixed_clock is None: + yield sample, (repetition * len(samples) + sample_index) / (repetitions * len(samples)) + else: + yield sample, fixed_clock + + augmentations = [] if augmentation_specs is None else list(map(parse_augmentation, augmentation_specs)) + try: + for augmentation in augmentations: + call_if_exists(augmentation, 'start', buffering=buffering) + context = PreparationContext(audio_type, augmentations) + if process_ahead == 0: + for timed_sample in timed_samples(): + yield _prepare_sample(timed_sample, context=context) + else: + with LimitingPool(process_ahead=process_ahead, + initializer=_init_preparation_worker, + initargs=(context,)) as pool: + yield from pool.imap(_prepare_sample, timed_samples()) + finally: + for augmentation in augmentations: + call_if_exists(augmentation, 'stop') diff --git a/training/deepspeech_training/util/signal_augmentations.py b/training/deepspeech_training/util/signal_augmentations.py new file mode 100644 index 00000000..15027623 --- /dev/null +++ b/training/deepspeech_training/util/signal_augmentations.py @@ -0,0 +1,200 @@ + +import os +import re +import math +import random +import numpy as np + +from multiprocessing import Queue, Process +from .audio import gain_db_to_ratio, max_dbfs, normalize_audio, AUDIO_TYPE_NP, AUDIO_TYPE_PCM, AUDIO_TYPE_OPUS +from .helpers import int_range, float_range, pick_value_from_range, MEGABYTE + +SPEC_PARSER = re.compile(r'^([a-z]+)(\[(.*)\])?$') +BUFFER_SIZE = 1 * MEGABYTE + + +def _enqueue_overlay_samples(sample_source, queue, buffering=BUFFER_SIZE): + # preventing cyclic import problems + from .sample_collections import samples_from_source # pylint: disable=import-outside-toplevel + samples = samples_from_source(sample_source, buffering=buffering, labeled=False) + while True: + for sample in samples: + queue.put(sample) + + +class Overlay: + """See "Overlay augmentation" in TRAINING.rst""" + def __init__(self, source, p=1.0, snr=3.0, layers=1): + self.source = source + self.probability = float(p) + self.snr = float_range(snr) + self.layers = int_range(layers) + self.queue = Queue(max(1, math.floor(self.probability * self.layers[1] * os.cpu_count()))) + self.current_sample = None + self.enqueue_process = None + + def start(self, buffering=BUFFER_SIZE): + self.enqueue_process = Process(target=_enqueue_overlay_samples, + args=(self.source, self.queue), + kwargs={'buffering': buffering}) + self.enqueue_process.start() + + def apply(self, sample, clock): + sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) + n_layers = pick_value_from_range(self.layers, clock=clock) + audio = sample.audio + overlay_data = np.zeros_like(audio) + for _ in range(n_layers): + overlay_offset = 0 + while overlay_offset < len(audio): + if self.current_sample is None: + next_overlay_sample = self.queue.get() + next_overlay_sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) + self.current_sample = next_overlay_sample.audio + n_required = len(audio) - overlay_offset + n_current = len(self.current_sample) + if n_required >= n_current: # take it completely + overlay_data[overlay_offset:overlay_offset + n_current] += self.current_sample + overlay_offset += n_current + self.current_sample = None + else: # take required slice from head and keep tail for next layer or sample + overlay_data[overlay_offset:overlay_offset + 
n_required] += self.current_sample[0:n_required] + overlay_offset += n_required + self.current_sample = self.current_sample[n_required:] + snr_db = pick_value_from_range(self.snr, clock=clock) + orig_dbfs = max_dbfs(audio) + overlay_gain = orig_dbfs - max_dbfs(overlay_data) - snr_db + audio += overlay_data * gain_db_to_ratio(overlay_gain) + sample.audio = normalize_audio(audio, dbfs=orig_dbfs) + + def stop(self): + if self.enqueue_process is not None: + self.enqueue_process.terminate() + + +class Reverb: + """See "Reverb augmentation" in TRAINING.rst""" + def __init__(self, p=1.0, delay=20.0, decay=10.0): + self.probability = float(p) + self.delay = float_range(delay) + self.decay = float_range(decay) + + def apply(self, sample, clock): + sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) + audio = np.array(sample.audio, dtype=np.float64) + orig_dbfs = max_dbfs(audio) + delay = pick_value_from_range(self.delay, clock=clock) + decay = pick_value_from_range(self.decay, clock=clock) + decay = gain_db_to_ratio(-decay) + result = np.copy(audio) + primes = [17, 19, 23, 29, 31] + for delay_prime in primes: # primes to minimize comb filter interference + layer = np.copy(audio) + n_delay = math.floor(delay * (delay_prime / primes[0]) * sample.audio_format.rate / 1000.0) + n_delay = max(16, n_delay) # 16 samples minimum to avoid performance trap and risk of division by zero + for w_index in range(0, math.floor(len(audio) / n_delay)): + w1 = w_index * n_delay + w2 = (w_index + 1) * n_delay + width = min(len(audio) - w2, n_delay) # last window could be smaller + layer[w2:w2 + width] += decay * layer[w1:w1 + width] + result += layer + audio = normalize_audio(result, dbfs=orig_dbfs) + sample.audio = np.array(audio, dtype=np.float32) + + +class Resample: + """See "Resample augmentation" in TRAINING.rst""" + def __init__(self, p=1.0, rate=8000): + self.probability = float(p) + self.rate = int_range(rate) + + def apply(self, sample, clock): + # late binding librosa and its dependencies + from librosa.core import resample # pylint: disable=import-outside-toplevel + sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) + rate = pick_value_from_range(self.rate, clock=clock) + audio = sample.audio + orig_len = len(audio) + audio = np.swapaxes(audio, 0, 1) + audio = resample(audio, sample.audio_format.rate, rate) + audio = resample(audio, rate, sample.audio_format.rate) + audio = np.swapaxes(audio, 0, 1)[0:orig_len] + sample.audio = audio + + +class Codec: + """See "Codec augmentation" in TRAINING.rst""" + def __init__(self, p=1.0, bitrate=3200): + self.probability = float(p) + self.bitrate = int_range(bitrate) + + def apply(self, sample, clock): + bitrate = pick_value_from_range(self.bitrate, clock=clock) + sample.change_audio_type(new_audio_type=AUDIO_TYPE_PCM) # decoding to ensure it has to get encoded again + sample.change_audio_type(new_audio_type=AUDIO_TYPE_OPUS, bitrate=bitrate) # will get decoded again downstream + + +class Gaps: + """See "Gaps augmentation" in TRAINING.rst""" + def __init__(self, p=1.0, n=1, size=50.0): + self.probability = float(p) + self.n_gaps = int_range(n) + self.size = float_range(size) + + def apply(self, sample, clock): + sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP) + audio = sample.audio + n_gaps = pick_value_from_range(self.n_gaps, clock=clock) + for _ in range(n_gaps): + size = pick_value_from_range(self.size, clock=clock) + size = int(size * sample.audio_format.rate / 1000.0) + size = min(size, len(audio) // 10) # a gap should never exceed 10 percent of the 
audio
+            offset = random.randint(0, max(0, len(audio) - size - 1))
+            audio[offset:offset + size] = 0
+        sample.audio = audio
+
+
+class Volume:
+    """See "Volume augmentation" in TRAINING.rst"""
+    def __init__(self, p=1.0, dbfs=3.0103):
+        self.probability = float(p)
+        self.target_dbfs = float_range(dbfs)
+
+    def apply(self, sample, clock):
+        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
+        target_dbfs = pick_value_from_range(self.target_dbfs, clock=clock)
+        sample.audio = normalize_audio(sample.audio, dbfs=target_dbfs)
+
+
+def parse_augmentation(augmentation_spec):
+    """
+    Parses an augmentation specification.
+
+    Parameters
+    ----------
+    augmentation_spec : str
+        Augmentation specification like "reverb[delay=20.0,decay=-20]".
+
+    Returns
+    -------
+    Instance of an augmentation class from util.signal_augmentations.*.
+    """
+    match = SPEC_PARSER.match(augmentation_spec)
+    if not match:
+        raise ValueError('Augmentation specification has wrong format')
+    cls_name = match.group(1)[0].upper() + match.group(1)[1:]
+    if cls_name not in globals():
+        raise ValueError('Unknown augmentation: {}'.format(cls_name))
+    augmentation_cls = globals()[cls_name]
+    parameters = [] if match.group(3) is None else match.group(3).split(',')
+    args = []
+    kwargs = {}
+    for parameter in parameters:
+        pair = tuple(map(str.strip, parameter.split('=')))
+        if len(pair) == 1:
+            args.append(pair[0])  # a bare value is passed as positional argument
+        elif len(pair) == 2:
+            kwargs[pair[0]] = pair[1]
+        else:
+            raise ValueError('Unable to parse augmentation value assignment')
+    return augmentation_cls(*args, **kwargs)
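+
+
+# A quick sanity check of the spec parser (hypothetical usage, not part of the
+# training pipeline): each spec maps to exactly one augmentation instance, e.g.
+#
+#     augmentation = parse_augmentation('reverb[p=0.1,delay=50.0~30.0,decay=10.0:2.0~1.0]')
+#     assert isinstance(augmentation, Reverb)
+#     assert augmentation.probability == 0.1
+#
+# Ranged values like "50.0~30.0" are resolved later, per sample, by
+# pick_value_from_range() using the current epoch clock.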