Merge pull request #2148 from lissyx/lingua-libre-bogus

Do not import known bogus Lingua Libre records
2019-06-04 18:48:56 +02:00 · 2019-06-04 18:48:56 +02:00 · 8174f3f6db
commit 8174f3f6db
parent 10d98e1df9 3a17896463
1 changed files with 14 additions and 1 deletions
--- a/bin/import_lingua_libre.py
+++ b/bin/import_lingua_libre.py
@ -11,6 +11,7 @@ import sys
 sys.path.insert(1, os.path.join(sys.path[0], '..'))

 import csv
+import re
 import sox
 import zipfile
 import subprocess
@ -74,7 +75,8 @@ def _maybe_convert_sets(target_dir, extracted_data):
    glob_dir = os.path.join(ogg_root_dir, '**/*.ogg')
    for record in glob(glob_dir, recursive=True):
        record_file = record.replace(ogg_root_dir + os.path.sep, '')
-        samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0]))
+        if record_filter(record_file):
+            samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0]))

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0, 'total_time': 0}
@ -177,12 +179,23 @@ def handle_args():
    parser.add_argument('--english-name', type=str, required=True, help='Enligh name of the language')
    parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
    parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')
+    parser.add_argument('--bogus-records', type=argparse.FileType('r'), required=False, help='Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items')
    return parser.parse_args()

 if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None

+    # --bogus-records is optional: argparse leaves it as None when the flag is
+    # omitted, so fall back to an empty list. Blank lines are skipped because
+    # an empty pattern matches (and would therefore reject) every record.
+    bogus_regexes = []
+    for line in CLI_ARGS.bogus_records or []:
+        pattern = line.strip()
+        if pattern:
+            bogus_regexes.append(re.compile(pattern))
+
+    def record_filter(path):
+        """Return False, and print the path, when it matches a bogus pattern.
+
+        Patterns are applied with re.match, i.e. anchored at the start of
+        the path handed in by the caller. Returns True otherwise.
+        """
+        if any(regex.match(path) for regex in bogus_regexes):
+            print('Reject', path)
+            return False
+        return True
+
    def label_filter(label):
        if CLI_ARGS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \