From 179ba1b5339b24087065ae0d344208d3542c2f0a Mon Sep 17 00:00:00 2001 From: Quentin Brunet Date: Sun, 3 Mar 2019 17:27:50 +0100 Subject: [PATCH] Fix import_cv2.py binary file + permissions --- bin/import_cv2.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) mode change 100644 => 100755 bin/import_cv2.py diff --git a/bin/import_cv2.py b/bin/import_cv2.py old mode 100644 new mode 100755 index c20b0069..d8f94bf3 --- a/bin/import_cv2.py +++ b/bin/import_cv2.py @@ -20,16 +20,16 @@ from util.downloader import SIMPLE_BAR ''' Broadly speaking, this script takes the audio downloaded from Common Voice -for a certain language, in addition to the *.tsv files output by CorporaCeator, +for a certain language, in addition to the *.tsv files output by CorporaCreator, and the script formats the data and transcripts to be in a state usable by DeepSpeech.py Usage: $ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir -Input: +Input: (1) audio_dir (string) path to dir of audio downloaded from Common Voice - (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files + (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files which were generated by CorporaCreator Ouput: @@ -53,23 +53,25 @@ def _preprocess_data(audio_dir, tsv_dir): def _maybe_convert_set(audio_dir, input_tsv): output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv')) print("Saving new DeepSpeech-formatted CSV file to: ", output_csv) - + # Get audiofile path and transcript for each sentence in tsv samples = [] with open(input_tsv) as input_tsv_file: reader = csv.DictReader(input_tsv_file, delimiter='\t') for row in reader: samples.append((row['path'], row['sentence'])) - + # Keep track of how many samples are good vs. problematic counter = { 'all': 0, 'too_short': 0, 'too_long': 0 } lock = RLock() num_samples = len(samples) rows = [] - + def one_sample(sample): """ Take a audio file, and optionally convert it to 16kHz WAV """ mp3_filename = path.join(audio_dir, sample[0]) + if not path.splitext(mp3_filename.lower())[1] == '.mp3': + mp3_filename += ".mp3" # Storing wav files next to the mp3 ones - just with a different suffix wav_filename = path.splitext(mp3_filename)[0] + ".wav" _maybe_convert_wav(mp3_filename, wav_filename) @@ -86,7 +88,7 @@ def _maybe_convert_set(audio_dir, input_tsv): # This one is good - keep it for the target CSV rows.append((wav_filename, file_size, sample[1])) counter['all'] += 1 - + print("Importing mp3 files...") pool = Pool(cpu_count()) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) @@ -95,7 +97,7 @@ def _maybe_convert_set(audio_dir, input_tsv): bar.update(num_samples) pool.close() pool.join() - + with open(output_csv, 'w') as output_csv_file: print('Writing CSV file for DeepSpeech.py as: ', output_csv) writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES) @@ -103,7 +105,7 @@ def _maybe_convert_set(audio_dir, input_tsv): bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR) for filename, file_size, transcript in bar(rows): writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript }) - + print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long'])) if counter['too_short'] > 0: print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])