Merge pull request #2148 from lissyx/lingua-libre-bogus

Do not import known bogus Lingua Libre records
This commit is contained in:
lissyx 2019-06-04 18:48:56 +02:00 committed by GitHub
commit 8174f3f6db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -11,6 +11,7 @@ import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv
import re
import sox
import zipfile
import subprocess
@ -74,7 +75,8 @@ def _maybe_convert_sets(target_dir, extracted_data):
glob_dir = os.path.join(ogg_root_dir, '**/*.ogg')
for record in glob(glob_dir, recursive=True):
record_file = record.replace(ogg_root_dir + os.path.sep, '')
samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0]))
if record_filter(record_file):
samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0]))
# Keep track of how many samples are good vs. problematic
counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0, 'total_time': 0}
@ -177,12 +179,23 @@ def handle_args():
parser.add_argument('--english-name', type=str, required=True, help='Enligh name of the language')
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')
parser.add_argument('--bogus-records', type=argparse.FileType('r'), required=False, help='Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items')
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
# Compile one regex per non-blank line of the optional --bogus-records file.
# The option is declared with required=False, so CLI_ARGS.bogus_records is
# None when it is omitted; iterating None would raise TypeError, hence the
# guard. Blank lines are skipped because an empty pattern matches every path
# and would cause all records to be rejected.
bogus_regexes = []
if CLI_ARGS.bogus_records:
    for line in CLI_ARGS.bogus_records:
        pattern = line.strip()
        if pattern:
            bogus_regexes.append(re.compile(pattern))
def record_filter(path):
    """Tell whether the record at *path* should be kept.

    Each compiled pattern in ``bogus_regexes`` is tried against the start of
    *path* (``match`` semantics). On the first hit the path is reported on
    stdout with a ``Reject`` prefix and ``False`` is returned; when nothing
    matches the record is accepted and ``True`` is returned.
    """
    for bogus_pattern in bogus_regexes:
        if bogus_pattern.match(path):
            print('Reject', path)
            return False
    return True
def label_filter(label):
if CLI_ARGS.normalize:
label = unicodedata.normalize("NFKD", label.strip()) \