From e26aefc39bcfd61be39a1413c5cdfb1f427febfc Mon Sep 17 00:00:00 2001
From: josh <jmeyer@mozilla.com>
Date: Fri, 1 Feb 2019 13:32:27 +0100
Subject: [PATCH] better handling of empty TSV dir / comments

---
 bin/import_cv2.py | 69 +++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/bin/import_cv2.py b/bin/import_cv2.py
index 59cc3e4c..ce70eb6e 100644
--- a/bin/import_cv2.py
+++ b/bin/import_cv2.py
@@ -21,46 +21,54 @@ from util.downloader import SIMPLE_BAR
 
 
 '''
-This script will do the following:
+Broadly speaking, this script takes the audio downloaded from Common Voice
+for a certain language, in addition to the *.tsv files output by CorporaCeator,
+and the script formats the data and transcripts to be in a state usable by
+DeepSpeech.py
 
 Input: 
-        audio_dir (string) path to dir of audio downloaded from Common Voice
-        tsv_dir (string) path to dir containing tsv files generated by CorporaCreator
+        (1) audio_dir (string) path to dir of audio downloaded from Common Voice
+        (2) tsv_dir (string) path to dir containing tsv files generated by CorporaCreator
 
 Ouput:
-        csv files in format needed by DeepSpeech.py, saved into audio_dir
-        optionally converted wav files, saved into audio_dir alongside their mp3s
+        (1) csv files in format needed by DeepSpeech.py, saved into audio_dir
+        (2) wav files, saved into audio_dir alongside their mp3s
 '''
 
 FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
 SAMPLE_RATE = 16000
 MAX_SECS = 10
 
-
 def _preprocess_data(audio_dir, tsv_dir):
-    for source_tsv in glob(path.join(path.abspath(tsv_dir), '*.tsv')):
-        if not os.path.isfile(source_tsv):
-            print("ERROR: TSV file not found:", source_tsv)
-        else:
-            _maybe_convert_set(audio_dir, source_tsv)
+    try:
+        # Check if there is at least one TSV file in tsv_dir
+        os.path.isfile(glob(path.join(path.abspath(tsv_dir), '*.tsv'))[0])
+        for input_tsv in glob(path.join(path.abspath(tsv_dir), '*.tsv')):
+            print("Loading in TSV file: ", input_tsv)
+            _maybe_convert_set(audio_dir, input_tsv)
+    except IndexError:
+        print("ERROR: no TSV file found in: ", tsv_dir)
 
-def _maybe_convert_set(audio_dir, source_tsv):
-    # save new csv to the dir where the data is
-    target_csv =  path.join(audio_dir,os.path.split(source_tsv)[-1].replace('tsv', 'csv'))
-    print("Saving new DeepSpeech-formatted CSV file to: ", target_csv)
+
+def _maybe_convert_set(audio_dir, input_tsv):
+    output_csv =  path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
+    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
+    
+    # Get audiofile path and transcript for each sentence in tsv
     samples = []
-    with open(source_tsv) as source_tsv_file:
-        reader = csv.DictReader(source_tsv_file, delimiter='\t')
+    with open(input_tsv) as input_tsv_file:
+        reader = csv.DictReader(input_tsv_file, delimiter='\t')
         for row in reader:
             samples.append((row['path'], row['sentence']))
-
-    # Mutable counters for the concurrent embedded routine
+            
+    # Keep track of how many samples are good vs. problematic
     counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
     lock = RLock()
     num_samples = len(samples)
     rows = []
-
+    
     def one_sample(sample):
+        """ Take a audio file, and optionally convert it to 16kHz WAV """
         mp3_filename = path.join(audio_dir, sample[0])
         # Storing wav files next to the mp3 ones - just with a different suffix
         wav_filename = path.splitext(mp3_filename)[0] + ".wav"
@@ -78,8 +86,8 @@ def _maybe_convert_set(audio_dir, source_tsv):
                 # This one is good - keep it for the target CSV
                 rows.append((wav_filename, file_size, sample[1]))
             counter['all'] += 1
-
-    print('Importing mp3 files...')
+            
+    print("Importing mp3 files...")
     pool = Pool(cpu_count())
     bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
     for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
@@ -87,15 +95,15 @@ def _maybe_convert_set(audio_dir, source_tsv):
     bar.update(num_samples)
     pool.close()
     pool.join()
-
-    with open(target_csv, 'w') as target_csv_file:
-        print('Writing out ', target_csv)
-        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
+    
+    with open(output_csv, 'w') as output_csv_file:
+        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
+        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
         writer.writeheader()
         bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
         for filename, file_size, transcript in bar(rows):
             writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
-
+            
     print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
     if counter['too_short'] > 0:
         print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
@@ -107,13 +115,10 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
         transformer = Transformer()
         transformer.convert(samplerate=SAMPLE_RATE)
         transformer.build(mp3_filename, wav_filename)
-        
 
 if __name__ == "__main__":
     audio_dir = sys.argv[1]
     tsv_dir = sys.argv[2]
-    print('Expecting your audio from Common Voice to be in ', audio_dir)
-    print('Looking for *.tsv files (generated by CorporaCreator) in ', tsv_dir)
+    print('Expecting your audio from Common Voice to be in: ', audio_dir)
+    print('Looking for *.tsv files (generated by CorporaCreator) in: ', tsv_dir)
     _preprocess_data(audio_dir, tsv_dir)
-
-