Log parsed augmentations and tighten augmentation value-range handling.

augmentations.py: report each parsed augmentation class and its keyword
settings through log_info; parse_augmentations maps over
"augmentation_specs or []".
helpers.py: get_value_range converts the radius with target_type in every
branch and raises ValueError for values that are not str, tuple, int or float;
tf_pick_value_from_range only resamples around the value when r is non-zero.

diff --git a/training/coqui_stt_training/util/augmentations.py b/training/coqui_stt_training/util/augmentations.py
index be88c05b..dee7a725 100644
--- a/training/coqui_stt_training/util/augmentations.py
+++ b/training/coqui_stt_training/util/augmentations.py
@@ -1,4 +1,3 @@
-
 import os
 import re
 import math
@@ -10,6 +9,7 @@ from multiprocessing import Queue, Process
 from .audio import gain_db_to_ratio, max_dbfs, normalize_audio, AUDIO_TYPE_NP, AUDIO_TYPE_PCM, AUDIO_TYPE_OPUS
 from .helpers import LimitingPool, int_range, float_range, pick_value_from_range, tf_pick_value_from_range, MEGABYTE
 from .sample_collections import samples_from_source, unpack_maybe
+from .logging import log_info
 
 BUFFER_SIZE = 1 * MEGABYTE
 SPEC_PARSER = re.compile(r'^(?P<cls>[a-z_]+)(\[(?P<params>.*)\])?$')
@@ -90,6 +90,7 @@ def parse_augmentation(augmentation_spec):
             kwargs[pair[0]] = pair[1]
         else:
             raise ValueError('Unable to parse augmentation value assignment')
+    log_info('Processed augmentation type: [{}] with parameter settings: {}'.format(augmentation_cls.__name__, kwargs))
     return augmentation_cls(*args, **kwargs)
 
 
@@ -106,7 +107,7 @@ def parse_augmentations(augmentation_specs):
     -------
     List of augmentation class instances from util.augmentations.*.
     """
-    return [] if augmentation_specs is None else list(map(parse_augmentation, augmentation_specs))
+    return list(map(parse_augmentation, augmentation_specs or []))
 
 
 def apply_graph_augmentations(domain, tensor, augmentations, transcript=None, clock=0.0):
diff --git a/training/coqui_stt_training/util/helpers.py b/training/coqui_stt_training/util/helpers.py
index 77372360..5f434480 100644
--- a/training/coqui_stt_training/util/helpers.py
+++ b/training/coqui_stt_training/util/helpers.py
@@ -163,27 +163,41 @@ def remember_exception(iterable, exception_box=None):
 
 
 def get_value_range(value, target_type):
+    """
+    This function converts all possible supplied values for augmentation
+    into the [start,end,r] ValueRange type. The expected inputs are of the form:
+
+    <number>
+    <number>~<number>
+    <number>:<number>~<number>
+
+    Any "missing" values are filled so that ValueRange always includes [start,end,r].
+    """
     if isinstance(value, str):
-        r = target_type(0)
-        parts = value.split('~')
-        if len(parts) == 2:
+        if '~' in value:
+            parts = value.split('~')
+            if len(parts) != 2:
+                raise ValueError('Cannot parse value range')
             value = parts[0]
-            r = target_type(parts[1])
-        elif len(parts) > 2:
-            raise ValueError('Cannot parse value range')
+            r = parts[1]
+        else:
+            r = 0  # if no <r> supplied, use 0
         parts = value.split(':')
         if len(parts) == 1:
-            parts.append(parts[0])
-        elif len(parts) > 2:
+            parts.append(parts[0])  # only one <value> given, so double it
+        if len(parts) != 2:
             raise ValueError('Cannot parse value range')
-        return ValueRange(target_type(parts[0]), target_type(parts[1]), r)
+        return ValueRange(target_type(parts[0]), target_type(parts[1]), target_type(r))
     if isinstance(value, tuple):
         if len(value) == 2:
-            return ValueRange(target_type(value[0]), target_type(value[1]), 0)
+            return ValueRange(target_type(value[0]), target_type(value[1]), target_type(0))
         if len(value) == 3:
             return ValueRange(target_type(value[0]), target_type(value[1]), target_type(value[2]))
-        raise ValueError('Cannot convert to ValueRange: Wrong tuple size')
-    return ValueRange(target_type(value), target_type(value), 0)
+        else:
+            raise ValueError('Cannot convert to ValueRange: Wrong tuple size')
+    if isinstance(value, int) or isinstance(value, float):
+        return ValueRange(target_type(value), target_type(value), target_type(0))
+    raise ValueError('Cannot convert to ValueRange: Wrong tuple size')
 
 
 def int_range(value):
@@ -203,14 +217,20 @@ def pick_value_from_range(value_range, clock=None):
 
 def tf_pick_value_from_range(value_range, clock=None, double_precision=False):
     import tensorflow as tf  # pylint: disable=import-outside-toplevel
-    clock = (tf.random.stateless_uniform([], seed=(-1, 1), dtype=tf.float64) if clock is None
-             else tf.maximum(tf.constant(0.0, dtype=tf.float64), tf.minimum(tf.constant(1.0, dtype=tf.float64), clock)))
+    if clock is None:
+        clock = tf.random.stateless_uniform([], seed=(-1, 1), dtype=tf.float64)
+    else:
+        clock = tf.maximum(tf.constant(0.0, dtype=tf.float64),
+                           tf.minimum(tf.constant(1.0, dtype=tf.float64), clock))
     value = value_range.start + clock * (value_range.end - value_range.start)
-    value = tf.random.stateless_uniform([],
-                                        minval=value - value_range.r,
-                                        maxval=value + value_range.r,
-                                        seed=(clock * tf.int32.min, clock * tf.int32.max),
-                                        dtype=tf.float64)
+    if value_range.r:
+        # if the option <r> (<value>~<r>, randomization radius) is supplied,
+        # sample the value from a uniform distribution with "radius" <r>
+        value = tf.random.stateless_uniform([],
+                                            minval=value - value_range.r,
+                                            maxval=value + value_range.r,
+                                            seed=(clock * tf.int32.min, clock * tf.int32.max),
+                                            dtype=tf.float64)
     if isinstance(value_range.start, int):
         return tf.cast(tf.math.round(value), tf.int64 if double_precision else tf.int32)
     return tf.cast(value, tf.float64 if double_precision else tf.float32)