Merge pull request #1956 from tilmankamp/fix1955

Fix #1955
This commit is contained in:
Tilman Kamp 2019-03-14 15:01:35 +01:00 committed by GitHub
commit 35ebcd2075
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -8,11 +8,11 @@ import sys
sys.path.insert(1, os.path.join(sys.path[0], '..')) sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv import csv
import sox
import subprocess import subprocess
import progressbar import progressbar
from os import path from os import path
from sox import Transformer
from threading import RLock from threading import RLock
from multiprocessing.dummy import Pool from multiprocessing.dummy import Pool
from multiprocessing import cpu_count from multiprocessing import cpu_count
@ -62,7 +62,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
samples.append((row['path'], row['sentence'])) samples.append((row['path'], row['sentence']))
# Keep track of how many samples are good vs. problematic # Keep track of how many samples are good vs. problematic
counter = { 'all': 0, 'too_short': 0, 'too_long': 0 } counter = { 'all': 0, 'failed': 0, 'too_short': 0, 'too_long': 0 }
lock = RLock() lock = RLock()
num_samples = len(samples) num_samples = len(samples)
rows = [] rows = []
@ -75,10 +75,15 @@ def _maybe_convert_set(audio_dir, input_tsv):
# Storing wav files next to the mp3 ones - just with a different suffix # Storing wav files next to the mp3 ones - just with a different suffix
wav_filename = path.splitext(mp3_filename)[0] + ".wav" wav_filename = path.splitext(mp3_filename)[0] + ".wav"
_maybe_convert_wav(mp3_filename, wav_filename) _maybe_convert_wav(mp3_filename, wav_filename)
frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT)) file_size = -1
if path.exists(wav_filename):
file_size = path.getsize(wav_filename) file_size = path.getsize(wav_filename)
frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
with lock: with lock:
if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])): if file_size == -1:
# Excluding samples that failed upon conversion
counter['failed'] += 1
elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
# Excluding samples that are too short to fit the transcript # Excluding samples that are too short to fit the transcript
counter['too_short'] += 1 counter['too_short'] += 1
elif frames/SAMPLE_RATE > MAX_SECS: elif frames/SAMPLE_RATE > MAX_SECS:
@ -106,7 +111,9 @@ def _maybe_convert_set(audio_dir, input_tsv):
for filename, file_size, transcript in bar(rows): for filename, file_size, transcript in bar(rows):
writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript }) writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long'])) print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
if counter['failed'] > 0:
print('Skipped %d samples that failed upon conversion.' % counter['failed'])
if counter['too_short'] > 0: if counter['too_short'] > 0:
print('Skipped %d samples that were too short to match the transcript.' % counter['too_short']) print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
if counter['too_long'] > 0: if counter['too_long'] > 0:
@ -114,9 +121,13 @@ def _maybe_convert_set(audio_dir, input_tsv):
def _maybe_convert_wav(mp3_filename, wav_filename): def _maybe_convert_wav(mp3_filename, wav_filename):
if not path.exists(wav_filename): if not path.exists(wav_filename):
transformer = Transformer() transformer = sox.Transformer()
transformer.convert(samplerate=SAMPLE_RATE) transformer.convert(samplerate=SAMPLE_RATE)
try:
transformer.build(mp3_filename, wav_filename) transformer.build(mp3_filename, wav_filename)
except sox.core.SoxError:
pass
if __name__ == "__main__": if __name__ == "__main__":
audio_dir = sys.argv[1] audio_dir = sys.argv[1]