Merge pull request #2478 from tilmankamp/fixmailab

Fix the empty skip-list case; make the linter happy
2019-10-30 13:08:45 +01:00 · 2019-10-30 13:08:45 +01:00 · 0ba549b83f
commit 0ba549b83f
parent 31ec7a71f2 df1df83720
1 changed files with 14 additions and 14 deletions
--- a/bin/import_m-ailabs.py
+++ b/bin/import_m-ailabs.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# pylint: disable=invalid-name
 from __future__ import absolute_import, division, print_function

 # Make sure we can import stuff from util/
@ -7,13 +8,9 @@ import argparse
 import os
 import sys

-
 sys.path.insert(1, os.path.join(sys.path[0], '..'))

 import csv
-import re
-import sox
-import zipfile
 import subprocess
 import progressbar
 import unicodedata
@ -39,7 +36,6 @@ ARCHIVE_DIR_NAME = '{language}'
 ARCHIVE_NAME = '{language}.tgz'
 ARCHIVE_URL = 'http://www.caito.de/data/Training/stt_tts/' + ARCHIVE_NAME

-SKIP_LIST = []

 def _download_and_preprocess_data(target_dir):
    # Making path absolute
@ -51,6 +47,7 @@ def _download_and_preprocess_data(target_dir):
    # Produce CSV files
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)

+
 def _maybe_extract(target_dir, extracted_data, archive_path):
    # If target_dir/extracted_data does not exist, extract archive in target_dir
    extracted_path = path.join(target_dir, extracted_data)
@ -64,6 +61,7 @@ def _maybe_extract(target_dir, extracted_data, archive_path):
    else:
        print('Found directory "%s" - not extracting it from archive.' % archive_path)

+
 def _maybe_convert_sets(target_dir, extracted_data):
    extracted_dir = path.join(target_dir, extracted_data)
    # override existing CSV with normalized one
@ -77,14 +75,14 @@ def _maybe_convert_sets(target_dir, extracted_data):
    samples = []
    glob_dir = os.path.join(wav_root_dir, '**/metadata.csv')
    for record in glob(glob_dir, recursive=True):
-        for sk in SKIP_LIST:
-            if not (sk in record):
-                with open(record, 'r') as rec:
-                    for re in rec.readlines():
-                        re = re.strip().split('|')
-                        audio = os.path.join(os.path.dirname(record), 'wavs', re[0] + '.wav')
-                        transcript = re[2]
-                        samples.append((audio, transcript))
+        if any(map(lambda sk: sk in record, SKIP_LIST)):  # pylint: disable=cell-var-from-loop
+            continue
+        with open(record, 'r') as rec:
+            for re in rec.readlines():
+                re = re.strip().split('|')
+                audio = os.path.join(os.path.dirname(record), 'wavs', re[0] + '.wav')
+                transcript = re[2]
+                samples.append((audio, transcript))

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0, 'total_time': 0}
@ -168,6 +166,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))

+
 def handle_args():
    parser = argparse.ArgumentParser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
    parser.add_argument(dest='target_dir')
@ -177,10 +176,11 @@ def handle_args():
    parser.add_argument('--language', required=True, type=str, help='Dataset language to use')
    return parser.parse_args()

+
 if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
-    SKIP_LIST = CLI_ARGS.skiplist.split(',')
+    SKIP_LIST = filter(None, CLI_ARGS.skiplist.split(','))

    def label_filter(label):
        if CLI_ARGS.normalize: