2021-01-03 10:17:53 +00:00

560 lines
25 KiB

import os
import re
import math
import random
import numpy as np
from multiprocessing import Queue, Process
from .audio import gain_db_to_ratio, max_dbfs, normalize_audio, AUDIO_TYPE_NP, AUDIO_TYPE_PCM, AUDIO_TYPE_OPUS
from .helpers import LimitingPool, int_range, float_range, pick_value_from_range, tf_pick_value_from_range, MEGABYTE
from .sample_collections import samples_from_source, unpack_maybe
# Default read-buffer size used by the augmentation entry points of this module.
# NOTE(review): BUFFER_SIZE is referenced as a default argument further down but its definition is missing
# from this copy of the file; one megabyte matches the otherwise unused MEGABYTE import - confirm.
BUFFER_SIZE = 1 * MEGABYTE
# Matches "name" or "name[params]" where name consists of lower-case letters and underscores only.
SPEC_PARSER = re.compile(r'^(?P<cls>[a-z_]+)(\[(?P<params>.*)\])?$')
class Augmentation:
    """Common base of all augmentations; holds the probability of an augmentation getting applied."""

    def __init__(self, p=1.0):
        # explicit conversion, as the value may be passed as a string (e.g. "0.5")
        self.probability = float(p)
class SampleAugmentation(Augmentation):
    """Base class of augmentations that get applied to individual samples."""

    def start(self, buffering=BUFFER_SIZE):
        """Hook called before samples get processed; does nothing unless overridden."""

    def apply(self, sample, clock=0.0):
        """Augments the passed sample; has to be implemented by sub-classes."""
        raise NotImplementedError

    def stop(self):
        """Hook called after processing of samples ended; does nothing unless overridden."""
class GraphAugmentation(Augmentation):
    """Base class of augmentations that operate on tensors of exactly one domain."""

    def __init__(self, p=1.0, domain='spectrogram'):
        super().__init__(p)
        if domain not in ('signal', 'spectrogram', 'features'):
            raise ValueError('Unsupported augmentation domain: {}'.format(domain))
        self.domain = domain

    def apply(self, tensor, transcript=None, clock=0.0):
        """Returns the augmented tensor; has to be implemented by sub-classes."""
        raise NotImplementedError

    def apply_with_probability(self, tensor, transcript=None, clock=0.0):
        """Applies the augmentation if a random value (seeded from clock) falls below self.probability."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        seed = (clock * tf.int32.min, clock * tf.int32.max)
        random_value = tf.random.stateless_uniform([], seed=seed)

        def augmented():
            return self.apply(tensor, transcript=transcript, clock=clock)

        def unchanged():
            return tensor

        return tf.cond(tf.less(random_value, self.probability), augmented, unchanged)

    def maybe_apply(self, domain, tensor, transcript=None, clock=0.0):
        """Passes tensors of foreign domains through untouched."""
        if domain != self.domain:
            return tensor
        return self.apply_with_probability(tensor, transcript=transcript, clock=clock)

    def units_per_ms(self):
        """Returns the number of tensor units along the time axis that correspond to one millisecond."""
        from .flags import FLAGS  # pylint: disable=import-outside-toplevel
        if self.domain == 'signal':
            return FLAGS.audio_sample_rate / 1000.0
        return 1.0 / FLAGS.feature_win_step
def parse_augmentation(augmentation_spec):
    """
    Parses an augmentation specification.

    Parameters
    ----------
    augmentation_spec : str
        Augmentation specification like "reverb[delay=20.0,decay=1.0]".

    Returns
    -------
    Instance of an augmentation class from util.augmentations.*.

    Raises
    ------
    ValueError
        If the specification has the wrong format, names no known augmentation
        or contains a parameter with more than one "=".
    """
    match = SPEC_PARSER.match(augmentation_spec)
    if not match:
        raise ValueError('Augmentation specification has wrong format')
    # snake_case name of the specification -> CamelCase class name ("frequency_mask" -> "FrequencyMask");
    # slicing instead of indexing keeps empty parts (e.g. from "__") from raising an IndexError
    cls_name = ''.join(part[:1].upper() + part[1:] for part in match.group('cls').split('_'))
    augmentation_cls = globals().get(cls_name)
    # the isinstance check protects issubclass from module globals that are no classes at all
    if (not isinstance(augmentation_cls, type)
            or not issubclass(augmentation_cls, Augmentation)
            or augmentation_cls is Augmentation):
        raise ValueError('Unknown augmentation: {}'.format(cls_name))
    parameters = match.group('params')
    # "name" and "name[]" both mean: no parameters
    parameters = parameters.split(',') if parameters and parameters.strip() else []
    args = []
    kwargs = {}
    for parameter in parameters:
        pair = [part.strip() for part in parameter.split('=')]
        if len(pair) == 1:
            # positional value; values stay strings and get converted by the augmentation itself
            args.append(pair[0])
        elif len(pair) == 2:
            kwargs[pair[0]] = pair[1]
        else:
            raise ValueError('Unable to parse augmentation value assignment')
    return augmentation_cls(*args, **kwargs)
def parse_augmentations(augmentation_specs):
    """
    Parses a list of augmentation specifications.

    Parameters
    ----------
    augmentation_specs : list of str
        List of augmentation specifications like ["reverb[delay=20.0,decay=1.0]", "volume"].
        None is treated like an empty list.

    Returns
    -------
    List of augmentation class instances from util.augmentations.*.
    """
    if augmentation_specs is None:
        return []
    return [parse_augmentation(spec) for spec in augmentation_specs]
def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, clock=0.0):
    """
    Augments training sample tensor of a certain domain with matching augmentations of passed list.

    Parameters
    ----------
    domain : str
        Domain of the tensor to apply augmentations to. One of "signal", "spectrogram" or "features"
    tensor : Tensor of type float32
        Tensor to apply augmentations to.
    augmentations : list of augmentation class instances from util.augmentations.*.
        List of augmentations of which only the graph augmentations of the passed domain will get applied.
        None is treated like an empty list.
    transcript : SparseTensor
        Passed on to each graph augmentation.
    clock : Tensor of type float32
        Time indicator for augmentation value-ranges. Running from 0.0 (start of training) to 1.0 (end of training).

    Returns
    -------
    Tensor of type float32
        The augmented tensor
    """
    if augmentations is None:
        return tensor
    for augmentation in augmentations:
        # sample augmentations are not part of the graph and get skipped here
        if isinstance(augmentation, GraphAugmentation):
            tensor = augmentation.maybe_apply(domain, tensor, transcript=transcript, clock=clock)
    return tensor
class AugmentationContext:
    """Bundles what is handed to the sample augmentation code: target audio-type and augmentations."""

    def __init__(self, target_audio_type, augmentations):
        # audio-type the samples are supposed to end up in
        self.target_audio_type = target_audio_type
        # augmentations to apply, in the given order
        self.augmentations = augmentations
def _init_augmentation_worker(preparation_context):
global AUGMENTATION_CONTEXT # pylint: disable=global-statement
AUGMENTATION_CONTEXT = preparation_context
def _load_and_augment_sample(timed_sample, context=None):
    """Realizes the sample of a (sample, clock) pair via unpack_maybe and augments the result."""
    maybe_packed, sample_clock = timed_sample
    return _augment_sample((unpack_maybe(maybe_packed), sample_clock), context)
def _augment_sample(timed_sample, context=None):
context = AUGMENTATION_CONTEXT if context is None else context
sample, clock = timed_sample
for augmentation in context.augmentations:
if random.random() < augmentation.probability:
augmentation.apply(sample, clock)
return sample
def apply_sample_augmentations(samples,
                               augmentations=None,
                               audio_type=AUDIO_TYPE_NP,
                               buffering=BUFFER_SIZE,
                               process_ahead=None,
                               clock=0.0,
                               final_clock=None):
    """
    Prepares samples for being used during training.
    This includes parallel and buffered application of augmentations and a conversion to a specified audio-type.

    Parameters
    ----------
    samples : Sample enumeration
        Typically produced by util.sample_collections.samples_from_sources.
    augmentations : list of augmentation class instances from util.augmentations.*.
        List of augmentations of which only the sample augmentations will get applied to the samples.
    audio_type : str
        Target audio-type to convert samples to. One of the AUDIO_TYPE_* constants of the audio module.
    buffering : int
        Read-buffer size to use while reading files.
    process_ahead : int
        Number of samples to pre-process ahead of time. 0 disables the worker pool and
        processes all samples within the calling process.
    clock : float
        Start or fixed clock value between 0.0 and 1.0 for the first or all samples. Has to be <= than final_clock.
    final_clock : float
        Final clock value between 0.0 and 1.0 for the last sample. Has to be >= than clock.
        Requires samples.__len__ attribute.

    Returns
    -------
    Generator of the augmented samples.
    """
    def timed_samples():
        if final_clock is None:
            for sample in samples:
                yield sample, clock
        else:
            n_samples = len(samples)
            for sample_index, sample in enumerate(samples):
                # linear interpolation between clock and final_clock
                sample_clock = clock + (final_clock - clock) * (sample_index / n_samples)
                yield sample, sample_clock

    assert 0.0 <= clock <= 1.0
    if final_clock is not None:
        assert 0.0 <= final_clock <= 1.0
        assert clock <= final_clock
    augmentations = [aug for aug in augmentations if isinstance(aug, SampleAugmentation)] if augmentations else []
    try:
        for augmentation in augmentations:
            augmentation.start(buffering=buffering)
        context = AugmentationContext(audio_type, augmentations)
        if process_ahead == 0:
            for timed_sample in timed_samples():
                yield _load_and_augment_sample(timed_sample, context=context)
        else:
            # NOTE(review): the initializer argument was missing in this copy; it is implied by initargs
            with LimitingPool(process_ahead=process_ahead,
                              initializer=_init_augmentation_worker,
                              initargs=(context,)) as pool:
                yield from pool.imap(_load_and_augment_sample, timed_samples())
    finally:
        # also reached when the consumer abandons the generator, so started augmentations always get stopped
        for augmentation in augmentations:
            augmentation.stop()
def _enqueue_overlay_samples(sample_source, queue, buffering=BUFFER_SIZE):
    """
    As the central distribution point for overlay samples this function is supposed to run in one process only.
    This ensures that samples are not used twice if not required.
    It loads the (raw and still compressed) data and provides it to the actual augmentation workers.
    These are then doing decompression, potential conversion and overlaying in parallel.

    Never returns: the sample source gets iterated over and over again.
    """
    samples = samples_from_source(sample_source, buffering=buffering, labeled=False)
    while True:
        for sample in samples:
            queue.put(sample)
class Overlay(SampleAugmentation):
    """See "Overlay augmentation" in training documentation"""

    def __init__(self, source, p=1.0, snr=3.0, layers=1):
        super(Overlay, self).__init__(p)
        self.source = source
        self.snr = float_range(snr)
        self.layers = int_range(layers)
        # not yet consumed audio data of the overlay sample that got fetched last
        self.current_sample = None
        self.queue = None
        self.enqueue_process = None

    def start(self, buffering=BUFFER_SIZE):
        """Starts the process that feeds overlay samples of self.source into a bounded queue."""
        # os.cpu_count() is documented to return None if the number of CPUs is undetermined
        n_cpus = os.cpu_count() or 1
        self.queue = Queue(max(1, math.floor(self.probability * self.layers[1] * n_cpus)))
        self.enqueue_process = Process(target=_enqueue_overlay_samples,
                                       args=(self.source, self.queue),
                                       kwargs={'buffering': buffering})
        self.enqueue_process.start()

    def apply(self, sample, clock=0.0):
        """Mixes one or more layers of overlay audio into the sample, keeping its original peak level."""
        sample = unpack_maybe(sample)
        # NOTE(review): conversions to AUDIO_TYPE_NP restored; NumPy arithmetic below requires array data
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
        n_layers = pick_value_from_range(self.layers, clock=clock)
        audio = sample.audio
        overlay_data = np.zeros_like(audio)
        for _ in range(n_layers):
            overlay_offset = 0
            while overlay_offset < len(audio):
                if self.current_sample is None:
                    next_overlay_sample = self.queue.get()
                    next_overlay_sample = unpack_maybe(next_overlay_sample)
                    next_overlay_sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
                    self.current_sample = next_overlay_sample.audio
                n_required = len(audio) - overlay_offset
                n_current = len(self.current_sample)
                if n_required >= n_current:  # take it completely
                    overlay_data[overlay_offset:overlay_offset + n_current] += self.current_sample
                    overlay_offset += n_current
                    self.current_sample = None
                else:  # take required slice from head and keep tail for next layer or sample
                    overlay_data[overlay_offset:overlay_offset + n_required] += self.current_sample[0:n_required]
                    overlay_offset += n_required
                    self.current_sample = self.current_sample[n_required:]
        snr_db = pick_value_from_range(self.snr, clock=clock)
        orig_dbfs = max_dbfs(audio)
        # gain that puts the overlay's peak snr_db below the peak of the original audio
        overlay_gain = orig_dbfs - max_dbfs(overlay_data) - snr_db
        audio += overlay_data * gain_db_to_ratio(overlay_gain)
        sample.audio = normalize_audio(audio, dbfs=orig_dbfs)

    def stop(self):
        """Terminates the feeding process (if any) and drops queue and left-over overlay data."""
        if self.enqueue_process is not None:
            self.enqueue_process.terminate()
            self.enqueue_process = None
        self.current_sample = None
        self.queue = None
class Codec(SampleAugmentation):
    """See "Codec augmentation" in training documentation"""

    def __init__(self, p=1.0, bitrate=3200):
        super().__init__(p)
        self.bitrate = int_range(bitrate)

    def apply(self, sample, clock=0.0):
        """Sends the sample's audio through an Opus encoding of the picked bitrate."""
        target_bitrate = pick_value_from_range(self.bitrate, clock=clock)
        # decoding to ensure it has to get encoded again
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_PCM)
        # will get decoded again downstream
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_OPUS, bitrate=target_bitrate)
class Reverb(SampleAugmentation):
    """See "Reverb augmentation" in training documentation"""

    def __init__(self, p=1.0, delay=20.0, decay=10.0):
        super(Reverb, self).__init__(p)
        self.delay = float_range(delay)
        self.decay = float_range(decay)

    def apply(self, sample, clock=0.0):
        """Adds five delayed and decaying echo layers to the sample's audio, keeping its original peak level."""
        # NOTE(review): conversion to AUDIO_TYPE_NP restored; the array code below requires NumPy data
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
        audio = np.array(sample.audio, dtype=np.float64)
        orig_dbfs = max_dbfs(audio)
        delay = pick_value_from_range(self.delay, clock=clock)
        decay = pick_value_from_range(self.decay, clock=clock)
        decay = gain_db_to_ratio(-decay)
        result = np.copy(audio)
        primes = [17, 19, 23, 29, 31]
        for delay_prime in primes:  # primes to minimize comb filter interference
            layer = np.copy(audio)
            # delay of this layer in samples; the first prime corresponds to the picked delay
            n_delay = math.floor(delay * (delay_prime / primes[0]) * sample.audio_format.rate / 1000.0)
            n_delay = max(16, n_delay)  # 16 samples minimum to avoid performance trap and risk of division by zero
            for w_index in range(0, math.floor(len(audio) / n_delay)):
                w1 = w_index * n_delay
                w2 = (w_index + 1) * n_delay
                width = min(len(audio) - w2, n_delay)  # last window could be smaller
                # feeding each window into its successor makes the echo repeat with decaying level
                layer[w2:w2 + width] += decay * layer[w1:w1 + width]
            result += layer
        audio = normalize_audio(result, dbfs=orig_dbfs)
        sample.audio = np.array(audio, dtype=np.float32)
class Resample(SampleAugmentation):
    """See "Resample augmentation" in training documentation"""

    def __init__(self, p=1.0, rate=8000):
        super(Resample, self).__init__(p)
        self.rate = int_range(rate)

    def apply(self, sample, clock=0.0):
        """Resamples the audio to the picked rate and back to its original rate."""
        # late binding librosa and its dependencies
        # pre-importing sklearn fixes an import issue (NOTE(review): the issue link is missing in this copy)
        import sklearn  # pylint: disable=import-outside-toplevel
        from librosa.core import resample  # pylint: disable=import-outside-toplevel
        # NOTE(review): conversion to AUDIO_TYPE_NP restored; the array code below requires NumPy data
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
        rate = pick_value_from_range(self.rate, clock=clock)
        orig_rate = sample.audio_format.rate
        audio = sample.audio
        orig_len = len(audio)
        audio = np.swapaxes(audio, 0, 1)
        # sample rates are passed by keyword, as newer librosa versions no longer accept them positionally
        if audio.shape[0] < 2:
            # since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
            resampled = resample(audio[0], orig_sr=orig_rate, target_sr=rate)
            audio[0] = resample(resampled, orig_sr=rate, target_sr=orig_rate)[:orig_len]
        else:
            audio = resample(audio, orig_sr=orig_rate, target_sr=rate)
            audio = resample(audio, orig_sr=rate, target_sr=orig_rate)
        audio = np.swapaxes(audio, 0, 1)[0:orig_len]
        sample.audio = audio
class Volume(SampleAugmentation):
    """See "Volume augmentation" in training documentation"""

    def __init__(self, p=1.0, dbfs=3.0103):
        super(Volume, self).__init__(p)
        self.target_dbfs = float_range(dbfs)

    def apply(self, sample, clock=0.0):
        """Normalizes the sample's audio to the picked target dBFS value."""
        # NOTE(review): conversion to AUDIO_TYPE_NP restored, as in the other signal augmentations - confirm
        sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
        target_dbfs = pick_value_from_range(self.target_dbfs, clock=clock)
        sample.audio = normalize_audio(sample.audio, dbfs=target_dbfs)
class Pitch(GraphAugmentation):
    """See "Pitch augmentation" in training documentation"""

    def __init__(self, p=1.0, pitch=(1.075, 1.075, 0.125)):
        super(Pitch, self).__init__(p, domain='spectrogram')
        self.pitch = float_range(pitch)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Scales the spectrogram along its last axis by the picked factor, keeping the original shape."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        import tensorflow.compat.v1 as tfv1  # pylint: disable=import-outside-toplevel
        original_shape = tf.shape(tensor)
        pitch = tf_pick_value_from_range(self.pitch, clock=clock)
        new_freq_size = tf.cast(tf.cast(original_shape[2], tf.float32) * pitch, tf.int32)
        # a channel axis is added, as the image operations expect 4-D input
        spectrogram_aug = tfv1.image.resize_bilinear(tf.expand_dims(tensor, -1), [original_shape[1], new_freq_size])
        # NOTE(review): offset and target_height arguments of both bounding-box calls were missing in this
        # copy and got restored so that the result keeps the input's shape - confirm against the original.
        # factor > 1: cut off what exceeds the original size
        spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug,
                                                        offset_height=0,
                                                        offset_width=0,
                                                        target_height=original_shape[1],
                                                        target_width=tf.math.minimum(original_shape[2], new_freq_size))
        # factor < 1: zero-pad up to the original size
        spectrogram_aug = tf.cond(pitch < 1,
                                  lambda: tf.image.pad_to_bounding_box(spectrogram_aug,
                                                                       offset_height=0,
                                                                       offset_width=0,
                                                                       target_height=original_shape[1],
                                                                       target_width=original_shape[2]),
                                  lambda: spectrogram_aug)
        return spectrogram_aug[:, :, :, 0]
class Tempo(GraphAugmentation):
    """See "Tempo augmentation" in training documentation"""

    def __init__(self, p=1.0, factor=1.1, max_time=-1):
        super().__init__(p, domain='spectrogram')
        self.factor = float_range(factor)
        self.max_time = float(max_time)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Resizes the spectrogram along axis 1 to its original length divided by the picked factor."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        import tensorflow.compat.v1 as tfv1  # pylint: disable=import-outside-toplevel
        factor = tf_pick_value_from_range(self.factor, clock=clock)
        original_shape = tf.shape(tensor)
        original_time_size = tf.cast(original_shape[1], tf.float32)
        new_time_size = tf.cast(original_time_size / factor, tf.int32)
        if transcript is not None:
            # lower bound: the size of the transcript's axis 1
            transcript_size = tf.shape(transcript)[1]
            new_time_size = tf.math.maximum(new_time_size, transcript_size)
        if self.max_time > 0:
            # upper bound: max_time milliseconds, expressed in tensor units
            max_time_size = tf.cast(self.max_time * self.units_per_ms(), tf.int32)
            new_time_size = tf.math.minimum(new_time_size, max_time_size)
        as_image = tf.expand_dims(tensor, -1)
        resized = tfv1.image.resize_bilinear(as_image, [new_time_size, original_shape[2]])
        return resized[:, :, :, 0]
class Warp(GraphAugmentation):
    """See "Warp augmentation" in training documentation"""

    def __init__(self, p=1.0, nt=1, nf=1, wt=0.1, wf=0.0):
        super().__init__(p, domain='spectrogram')
        self.num_t = int_range(nt)
        self.num_f = int_range(nf)
        self.warp_t = float_range(wt)
        self.warp_f = float_range(wf)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Warps the spectrogram with a random, smoothly interpolated flow field."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        import tensorflow.compat.v1 as tfv1  # pylint: disable=import-outside-toplevel
        import tensorflow_addons as tfa  # pylint: disable=import-outside-toplevel
        original_shape = tf.shape(tensor)
        size_t, size_f = original_shape[1], original_shape[2]
        seed = (clock * tf.int32.min, clock * tf.int32.max)
        num_t = tf_pick_value_from_range(self.num_t, clock=clock)
        num_f = tf_pick_value_from_range(self.num_f, clock=clock)

        def get_flows(n, size, warp):
            # standard deviation of the displacements, derived from the axis size and the number of points
            warp = tf_pick_value_from_range(warp, clock=clock)
            stddev = warp * tf.cast(size, dtype=tf.float32) / tf.cast(2 * (n + 1), dtype=tf.float32)
            flow = tf.random.stateless_normal([num_t, num_f], seed, mean=0.0, stddev=stddev, dtype=tf.float32)
            return tf.pad(flow, tf.constant([[1, 1], [1, 1]]), 'CONSTANT')  # zero flow at all edges

        flows_t = get_flows(num_t, size_t, self.warp_t)
        flows_f = get_flows(num_f, size_f, self.warp_f)
        flows = tf.stack([flows_t, flows_f], axis=2)
        # scale the coarse grid of displacements up to the size of the spectrogram
        flows = tfv1.image.resize_bicubic(tf.expand_dims(flows, 0), [size_t, size_f])
        spectrogram_aug = tfa.image.dense_image_warp(tf.expand_dims(tensor, -1), flows)
        return tf.reshape(spectrogram_aug, shape=(1, -1, size_f))
class FrequencyMask(GraphAugmentation):
    """See "Frequency mask augmentation" in training documentation"""

    def __init__(self, p=1.0, n=3, size=2):
        super().__init__(p, domain='spectrogram')
        self.n = int_range(n)  # pylint: disable=invalid-name
        self.size = int_range(size)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Zeroes the picked number of randomly placed intervals along axis 2 of the tensor."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        time_max = tf.shape(tensor)[1]
        freq_max = tf.shape(tensor)[2]
        n = tf_pick_value_from_range(self.n, clock=clock)

        def masks_pending(index, masked):  # pylint: disable=unused-argument
            return index < n

        def add_mask(index, masked):
            size = tf_pick_value_from_range(self.size, clock=clock)
            # at least one bin gets masked and at least one bin survives
            size = tf.math.maximum(1, tf.math.minimum(freq_max - 1, size))
            # the loop index makes every mask use a different seed
            seed = tf.cast(clock * tf.int32.max, tf.int32) - index
            f0 = tf.random.stateless_uniform((), (-seed, seed), minval=0, maxval=freq_max - size, dtype=tf.dtypes.int32)
            before = tf.ones([1, time_max, f0])
            masked_out = tf.zeros([1, time_max, size])
            after = tf.ones([1, time_max, freq_max - f0 - size])
            freq_mask = tf.concat([before, masked_out, after], axis=2)
            return index + 1, masked * freq_mask

        return tf.while_loop(masks_pending, add_mask, (0, tensor))[1]
class TimeMask(GraphAugmentation):
    """See "Time mask augmentation" in training documentation"""

    def __init__(self, p=1.0, domain='spectrogram', n=3, size=10.0):
        super(TimeMask, self).__init__(p, domain=domain)
        self.n = int_range(n)  # pylint: disable=invalid-name
        self.size = float_range(size)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Zeroes the picked number of randomly placed intervals along the time axis of the tensor."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        # the time axis is axis 0 for signals and axis 1 otherwise
        time_max = tf.shape(tensor)[0 if self.domain == 'signal' else 1]
        n = tf_pick_value_from_range(self.n, clock=clock)

        def body(i, augmented):
            # the size is picked in milliseconds and converted into units of the tensor's time axis
            size = tf.cast(tf_pick_value_from_range(self.size, clock=clock) * self.units_per_ms(), dtype=tf.int32)
            size = tf.math.maximum(1, tf.math.minimum(time_max - 1, size))
            # the loop index makes every mask use a different seed
            seed = tf.cast(clock * tf.int32.max, tf.int32) - i
            t0 = tf.random.stateless_uniform((), (-seed, seed), minval=0, maxval=time_max - size, dtype=tf.dtypes.int32)
            rest = time_max - t0 - size
            if self.domain == 'spectrogram':
                fm = tf.shape(tensor)[2]
                time_mask = tf.concat([tf.ones([1, t0, fm]), tf.zeros([1, size, fm]), tf.ones([1, rest, fm])], axis=1)
            elif self.domain == 'signal':
                time_mask = tf.concat([tf.ones([t0, 1]), tf.zeros([size, 1]), tf.ones([rest, 1])], axis=0)
            else:
                # remaining domain ('features'); without this else the mask below replaced the other two
                time_mask = tf.concat([tf.ones([1, t0]), tf.zeros([1, size]), tf.ones([1, rest])], axis=1)
            return i + 1, augmented * time_mask

        return tf.while_loop(lambda i, augmented: i < n, body, (0, tensor))[1]
class Dropout(GraphAugmentation):
    """See "Dropout augmentation" in training documentation"""

    def __init__(self, p=1.0, domain='spectrogram', rate=0.05):
        super(Dropout, self).__init__(p, domain=domain)
        self.rate = float_range(rate)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Zeroes each value of the tensor with a probability of the picked rate."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        rate = tf_pick_value_from_range(self.rate, clock=clock)
        rate = tf.math.maximum(0.0, rate)
        factors = tf.random.stateless_uniform(tf.shape(tensor),
                                              (clock * tf.int32.min, clock * tf.int32.max),
                                              minval=0.0,
                                              maxval=1.0,
                                              dtype=tf.float32)
        # A value survives if its uniform factor reaches the rate, which happens with probability 1 - rate.
        # The former floor(factors + rate) kept values with probability rate, the inverse of a dropout rate.
        keep_mask = tf.cast(factors >= rate, tensor.dtype)
        return tensor * keep_mask
class Add(GraphAugmentation):
    """See "Add augmentation" in training documentation"""

    def __init__(self, p=1.0, domain='features', stddev=5):
        super().__init__(p, domain=domain)
        self.stddev = float_range(stddev)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Adds normally distributed values (mean 0.0, picked standard deviation) to the tensor."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        stddev = tf_pick_value_from_range(self.stddev, clock=clock)
        seed = (clock * tf.int32.min, clock * tf.int32.max)
        noise = tf.random.stateless_normal(tf.shape(tensor), seed, mean=0.0, stddev=stddev)
        return tensor + noise
class Multiply(GraphAugmentation):
    """See "Multiply augmentation" in training documentation"""

    def __init__(self, p=1.0, domain='features', stddev=5):
        super().__init__(p, domain=domain)
        self.stddev = float_range(stddev)

    def apply(self, tensor, transcript=None, clock=0.0):
        """Multiplies the tensor with normally distributed values (mean 1.0, picked standard deviation)."""
        import tensorflow as tf  # pylint: disable=import-outside-toplevel
        stddev = tf_pick_value_from_range(self.stddev, clock=clock)
        seed = (clock * tf.int32.min, clock * tf.int32.max)
        factors = tf.random.stateless_normal(tf.shape(tensor), seed, mean=1.0, stddev=stddev)
        return tensor * factors