Fix import_cv2.py binary file + permissions

2019-03-03 17:27:50 +01:00 · 2019-03-03 17:27:50 +01:00 · 179ba1b533
commit 179ba1b533
parent 3c3401ec60
1 changed files with 11 additions and 9 deletions
--- a/bin/import_cv2.py
+++ b/bin/import_cv2.py
@@ -20,16 +20,16 @@ from util.downloader import SIMPLE_BAR

 '''
 Broadly speaking, this script takes the audio downloaded from Common Voice
-for a certain language, in addition to the *.tsv files output by CorporaCeator,
+for a certain language, in addition to the *.tsv files output by CorporaCreator,
 and the script formats the data and transcripts to be in a state usable by
 DeepSpeech.py

 Usage:
        $ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir

-Input: 
+Input:
        (1) audio_dir (string) path to dir of audio downloaded from Common Voice
-        (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files 
+        (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
            which were generated by CorporaCreator

 Ouput:
@@ -53,23 +53,25 @@ def _preprocess_data(audio_dir, tsv_dir):
 def _maybe_convert_set(audio_dir, input_tsv):
    output_csv =  path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
-    
+
    # Get audiofile path and transcript for each sentence in tsv
    samples = []
    with open(input_tsv) as input_tsv_file:
        reader = csv.DictReader(input_tsv_file, delimiter='\t')
        for row in reader:
            samples.append((row['path'], row['sentence']))
-            
+
    # Keep track of how many samples are good vs. problematic
    counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
    lock = RLock()
    num_samples = len(samples)
    rows = []
-    
+
    def one_sample(sample):
        """ Take a audio file, and optionally convert it to 16kHz WAV """
        mp3_filename = path.join(audio_dir, sample[0])
+        if not path.splitext(mp3_filename.lower())[1] == '.mp3':
+            mp3_filename += ".mp3"
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
@@ -86,7 +88,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            counter['all'] += 1
-            
+
    print("Importing mp3 files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
@@ -95,7 +97,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
    bar.update(num_samples)
    pool.close()
    pool.join()
-    
+
    with open(output_csv, 'w') as output_csv_file:
        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
@@ -103,7 +105,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in bar(rows):
            writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
-            
+
    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])