From 9aa23ed387439257579ac5bd9869b43ecd432248 Mon Sep 17 00:00:00 2001 From: kdavis-mozilla Date: Fri, 16 Nov 2018 14:46:57 +0100 Subject: [PATCH 1/4] Fixed #1726 (Imported 8khz training audio compromised by unfiltered upsampling) --- bin/import_fisher.py | 35 +++++++++++++++-------------------- requirements.txt | 2 ++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/bin/import_fisher.py b/bin/import_fisher.py index 9a9a2eef..50247e1a 100755 --- a/bin/import_fisher.py +++ b/bin/import_fisher.py @@ -16,8 +16,8 @@ import os import pandas import subprocess import unicodedata -import wave -import audioop +import librosa +import resampy from util.text import validate_label @@ -142,7 +142,7 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data print("splitting {} according to {}".format(wav_files, trans_file)) - origAudios = [wave.open(wav_file, "r") for wav_file in wav_files] + origAudios = [librosa.load(wav_file, sr=None, mono=False) for wav_file in wav_files] # Loop over segments and split wav_file for each segment for segment in segments: @@ -160,26 +160,21 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data if transcript != None: files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript)) - # Close origAudios - for origAudio in origAudios: - origAudio.close() - return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"]) +def _split_audio(origAudio, start_time, stop_time): + audioData = origAudio[0] + frameRate = origAudio[1] + nChannels = len(audioData.shape) + startIndex = int(start_time * frameRate) + stopIndex = int(stop_time * frameRate) + return audioData[startIndex: stopIndex] if 1 == nChannels else audioData[:, startIndex: stopIndex] + def _split_and_resample_wav(origAudio, start_time, stop_time, new_wav_file): - nChannels = origAudio.getnchannels() - sampleWidth = origAudio.getsampwidth() - frameRate = origAudio.getframerate() - origAudio.setpos(int(start_time * frameRate)) - chunkData = origAudio.readframes(int((stop_time - start_time) * frameRate)) - # by doubling the frame-rate we effectively go from 8 kHz to 16 kHz - chunkData, _ = audioop.ratecv(chunkData, sampleWidth, nChannels, frameRate, 2 * frameRate, None) - chunkAudio = wave.open(new_wav_file, "w") - chunkAudio.setnchannels(nChannels) - chunkAudio.setsampwidth(sampleWidth) - chunkAudio.setframerate(2 * frameRate) - chunkAudio.writeframes(chunkData) - chunkAudio.close() + frameRate = origAudio[1] + chunkData = _split_audio(origAudio, start_time, stop_time) + chunkData = resampy.resample(chunkData, frameRate, 16000) + librosa.output.write_wav(new_wav_file, chunkData, 16000) def _split_sets(filelist): # We initially split the entire set into 80% train and 20% test, then diff --git a/requirements.txt b/requirements.txt index 8feeb860..22f03afd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,5 @@ requests tables attrdict setuptools +librosa +resampy From ce943dd65c73a53c024cba8be4d196e51839cc7c Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Sat, 17 Nov 2018 14:03:30 -0200 Subject: [PATCH 2/4] Disable StepCounterHook to avoid useless warning during validation --- DeepSpeech.py | 1 + 1 file changed, 1 insertion(+) diff --git a/DeepSpeech.py b/DeepSpeech.py index 76b3e6ce..72ed9173 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -540,6 +540,7 @@ def train(server=None): hooks=hooks, checkpoint_dir=FLAGS.checkpoint_dir, save_checkpoint_secs=None, # already taken care of by a hook + log_step_count_steps=0, # disable logging of steps/s to avoid TF warning in validation sets config=Config.session_config) as session: tf.get_default_graph().finalize() From f03669171f1563e3087999d74f26290db10a8ccd Mon Sep 17 00:00:00 2001 From: kdavis-mozilla Date: Fri, 16 Nov 2018 14:46:57 +0100 Subject: [PATCH 3/4] Fixed #1726 (Imported 8khz training audio compromised by unfiltered upsampling) --- bin/import_fisher.py | 35 +++++++++++++++-------------------- requirements.txt | 2 ++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/bin/import_fisher.py b/bin/import_fisher.py index 9a9a2eef..50247e1a 100755 --- a/bin/import_fisher.py +++ b/bin/import_fisher.py @@ -16,8 +16,8 @@ import os import pandas import subprocess import unicodedata -import wave -import audioop +import librosa +import resampy from util.text import validate_label @@ -142,7 +142,7 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data print("splitting {} according to {}".format(wav_files, trans_file)) - origAudios = [wave.open(wav_file, "r") for wav_file in wav_files] + origAudios = [librosa.load(wav_file, sr=None, mono=False) for wav_file in wav_files] # Loop over segments and split wav_file for each segment for segment in segments: @@ -160,26 +160,21 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data if transcript != None: files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript)) - # Close origAudios - for origAudio in origAudios: - origAudio.close() - return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"]) +def _split_audio(origAudio, start_time, stop_time): + audioData = origAudio[0] + frameRate = origAudio[1] + nChannels = len(audioData.shape) + startIndex = int(start_time * frameRate) + stopIndex = int(stop_time * frameRate) + return audioData[startIndex: stopIndex] if 1 == nChannels else audioData[:, startIndex: stopIndex] + def _split_and_resample_wav(origAudio, start_time, stop_time, new_wav_file): - nChannels = origAudio.getnchannels() - sampleWidth = origAudio.getsampwidth() - frameRate = origAudio.getframerate() - origAudio.setpos(int(start_time * frameRate)) - chunkData = origAudio.readframes(int((stop_time - start_time) * frameRate)) - # by doubling the frame-rate we effectively go from 8 kHz to 16 kHz - chunkData, _ = audioop.ratecv(chunkData, sampleWidth, nChannels, frameRate, 2 * frameRate, None) - chunkAudio = wave.open(new_wav_file, "w") - chunkAudio.setnchannels(nChannels) - chunkAudio.setsampwidth(sampleWidth) - chunkAudio.setframerate(2 * frameRate) - chunkAudio.writeframes(chunkData) - chunkAudio.close() + frameRate = origAudio[1] + chunkData = _split_audio(origAudio, start_time, stop_time) + chunkData = resampy.resample(chunkData, frameRate, 16000) + librosa.output.write_wav(new_wav_file, chunkData, 16000) def _split_sets(filelist): # We initially split the entire set into 80% train and 20% test, then diff --git a/requirements.txt b/requirements.txt index 8feeb860..22f03afd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,5 @@ requests tables attrdict setuptools +librosa +resampy From b2f967ac4ddab33fda9255990bbb734f7bbb72f2 Mon Sep 17 00:00:00 2001 From: kdavis-mozilla Date: Sat, 17 Nov 2018 20:24:01 +0100 Subject: [PATCH 4/4] Addressed review comments --- bin/import_fisher.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/import_fisher.py b/bin/import_fisher.py index 50247e1a..e4f95983 100755 --- a/bin/import_fisher.py +++ b/bin/import_fisher.py @@ -163,8 +163,7 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"]) def _split_audio(origAudio, start_time, stop_time): - audioData = origAudio[0] - frameRate = origAudio[1] + audioData, frameRate = origAudio nChannels = len(audioData.shape) startIndex = int(start_time * frameRate) stopIndex = int(stop_time * frameRate)