From ecbdf46940d2497307d1c616b37f63a1e14d81ef Mon Sep 17 00:00:00 2001
From: Tilman Kamp <5991088+tilmankamp@users.noreply.github.com>
Date: Thu, 23 Jul 2020 17:18:40 +0200
Subject: [PATCH] Fixes #3178 - Librosa requires 1-dimensional array for mono samples

---
 training/deepspeech_training/util/augmentations.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/training/deepspeech_training/util/augmentations.py b/training/deepspeech_training/util/augmentations.py
index 7ac52c41..941c17f2 100644
--- a/training/deepspeech_training/util/augmentations.py
+++ b/training/deepspeech_training/util/augmentations.py
@@ -349,8 +349,13 @@ class Resample(SampleAugmentation):
         audio = sample.audio
         orig_len = len(audio)
         audio = np.swapaxes(audio, 0, 1)
-        audio = resample(audio, sample.audio_format.rate, rate)
-        audio = resample(audio, rate, sample.audio_format.rate)
+        if audio.shape[0] < 2:
+            # since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
+            resampled = resample(audio[0], sample.audio_format.rate, rate)
+            audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
+        else:
+            audio = resample(audio, sample.audio_format.rate, rate)
+            audio = resample(audio, rate, sample.audio_format.rate)
         audio = np.swapaxes(audio, 0, 1)[0:orig_len]
         sample.audio = audio
 