From 084da3724d0dc87eb8f74d87a6ab515e16d01e04 Mon Sep 17 00:00:00 2001 From: Tilman Kamp <5991088+tilmankamp@users.noreply.github.com> Date: Tue, 7 Jul 2020 10:37:12 +0200 Subject: [PATCH] Fix: #3130 - Missing deepspeech_training.util.text.Alphabet --- bin/import_cv2.py | 91 ++++++++++++++++++-------------------- bin/import_lingua_libre.py | 11 ++--- bin/import_m-ailabs.py | 9 ++-- bin/import_slr57.py | 12 ++--- bin/import_swc.py | 17 +++---- bin/import_tuda.py | 21 ++++----- 6 files changed, 68 insertions(+), 93 deletions(-) diff --git a/bin/import_cv2.py b/bin/import_cv2.py index 9db63656..d6c8c192 100755 --- a/bin/import_cv2.py +++ b/bin/import_cv2.py @@ -7,7 +7,6 @@ DeepSpeech.py Use "python3 import_cv2.py -h" for help """ import csv -import itertools import os import subprocess import unicodedata @@ -24,27 +23,39 @@ from deepspeech_training.util.importers import ( get_validate_label, print_import_report, ) -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 MAX_SECS = 10 +PARAMS = None +FILTER_OBJ = None -def _preprocess_data(tsv_dir, audio_dir, filter_obj, space_after_every_character=False): - exclude = [] - for dataset in ["test", "dev", "train", "validated", "other"]: - set_samples = _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_every_character) - if dataset in ["test", "dev"]: - exclude += set_samples - if dataset == "validated": - _maybe_convert_set("train-all", tsv_dir, audio_dir, filter_obj, space_after_every_character, - rows=set_samples, exclude=exclude) +class LabelFilter: + def __init__(self, normalize, alphabet, validate_fun): + self.normalize = normalize + self.alphabet = alphabet + self.validate_fun = validate_fun + + def filter(self, label): + if self.normalize: + label = unicodedata.normalize("NFKD", label.strip()).encode("ascii", "ignore").decode("ascii", "ignore") + label = self.validate_fun(label) + if self.alphabet and label and not self.alphabet.CanEncode(label): + label = None + return label -def one_sample(args): +def init_worker(params): + global FILTER_OBJ # pylint: disable=global-statement + validate_label = get_validate_label(params) + alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None + FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_label) + + +def one_sample(sample): """ Take an audio file, and optionally convert it to 16kHz WAV """ - sample, filter_obj = args mp3_filename = sample[0] if not os.path.splitext(mp3_filename.lower())[1] == ".mp3": mp3_filename += ".mp3" @@ -60,7 +71,7 @@ def one_sample(args): ["soxi", "-s", wav_filename], stderr=subprocess.STDOUT ) ) - label = filter_obj.filter(sample[1]) + label = FILTER_OBJ.filter(sample[1]) rows = [] counter = get_counter() if file_size == -1: @@ -110,10 +121,9 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever num_samples = len(samples) print("Importing mp3 files...") - pool = Pool() + pool = Pool(initializer=init_worker, initargs=(PARAMS,)) bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR) - samples_with_context = itertools.zip_longest(samples, [], fillvalue=filter_obj) - for i, processed in enumerate(pool.imap_unordered(one_sample, samples_with_context), start=1): + for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1): counter += processed[0] rows += processed[1] bar.update(i) @@ -155,6 +165,17 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever return rows +def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False): + exclude = [] + for dataset in ["test", "dev", "train", "validated", "other"]: + set_samples = _maybe_convert_set(dataset, tsv_dir, audio_dir, space_after_every_character) + if dataset in ["test", "dev"]: + exclude += set_samples + if dataset == "validated": + _maybe_convert_set("train-all", tsv_dir, audio_dir, space_after_every_character, + rows=set_samples, exclude=exclude) + + def _maybe_convert_wav(mp3_filename, wav_filename): if not os.path.exists(wav_filename): transformer = sox.Transformer() @@ -164,28 +185,8 @@ def _maybe_convert_wav(mp3_filename, wav_filename): except sox.core.SoxError: pass -class LabelFilter: - def __init__(self, normalize, alphabet, validate_fun): - self.normalize = normalize - self.alphabet = alphabet - self.validate_fun = validate_fun - def filter(self, label): - if self.normalize: - label = ( - unicodedata.normalize("NFKD", label.strip()) - .encode("ascii", "ignore") - .decode("ascii", "ignore") - ) - label = self.validate_fun(label) - if self.alphabet and label: - try: - self.alphabet.encode(label) - except KeyError: - label = None - return label - -def main(): +def parse_args(): parser = get_importers_parser(description="Import CommonVoice v2.0 corpora") parser.add_argument("tsv_dir", help="Directory containing tsv files") parser.add_argument( @@ -206,18 +207,14 @@ def main(): action="store_true", help="To help transcript join by white space", ) + return parser.parse_args() - params = parser.parse_args() - validate_label = get_validate_label(params) - audio_dir = ( - params.audio_dir if params.audio_dir else os.path.join(params.tsv_dir, "clips") - ) - alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None +def main(): + audio_dir = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(PARAMS.tsv_dir, "clips") + _preprocess_data(PARAMS.tsv_dir, audio_dir, PARAMS.space_after_every_character) - filter_obj = LabelFilter(params.normalize, alphabet, validate_label) - _preprocess_data(params.tsv_dir, audio_dir, filter_obj, - params.space_after_every_character) if __name__ == "__main__": + PARAMS = parse_args() main() diff --git a/bin/import_lingua_libre.py b/bin/import_lingua_libre.py index 2273aae6..956d7a0b 100755 --- a/bin/import_lingua_libre.py +++ b/bin/import_lingua_libre.py @@ -20,7 +20,7 @@ from deepspeech_training.util.importers import ( get_validate_label, print_import_report, ) -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 @@ -198,7 +198,7 @@ def handle_args(): "--iso639-3", type=str, required=True, help="ISO639-3 language code" ) parser.add_argument( - "--english-name", type=str, required=True, help="Enligh name of the language" + "--english-name", type=str, required=True, help="English name of the language" ) parser.add_argument( "--filter_alphabet", @@ -242,11 +242,8 @@ if __name__ == "__main__": .decode("ascii", "ignore") ) label = validate_label(label) - if ALPHABET and label: - try: - ALPHABET.encode(label) - except KeyError: - label = None + if ALPHABET and label and not ALPHABET.CanEncode(label): + label = None return label ARCHIVE_NAME = ARCHIVE_NAME.format( diff --git a/bin/import_m-ailabs.py b/bin/import_m-ailabs.py index 63bb1f4f..bbaa744b 100755 --- a/bin/import_m-ailabs.py +++ b/bin/import_m-ailabs.py @@ -18,7 +18,7 @@ from deepspeech_training.util.importers import ( get_validate_label, print_import_report, ) -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 @@ -215,11 +215,8 @@ if __name__ == "__main__": .decode("ascii", "ignore") ) label = validate_label(label) - if ALPHABET and label: - try: - ALPHABET.encode(label) - except KeyError: - label = None + if ALPHABET and label and not ALPHABET.CanEncode(label): + label = None return label ARCHIVE_DIR_NAME = ARCHIVE_DIR_NAME.format(language=CLI_ARGS.language) diff --git a/bin/import_slr57.py b/bin/import_slr57.py index 4edb3654..57588696 100755 --- a/bin/import_slr57.py +++ b/bin/import_slr57.py @@ -1,16 +1,13 @@ #!/usr/bin/env python3 import csv import os -import re import subprocess import tarfile import unicodedata -import zipfile from glob import glob from multiprocessing import Pool import progressbar -import sox from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download from deepspeech_training.util.importers import ( @@ -20,7 +17,7 @@ from deepspeech_training.util.importers import ( get_validate_label, print_import_report, ) -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 @@ -227,11 +224,8 @@ if __name__ == "__main__": .decode("ascii", "ignore") ) label = validate_label(label) - if ALPHABET and label: - try: - ALPHABET.encode(label) - except KeyError: - label = None + if ALPHABET and label and not ALPHABET.CanEncode(label): + label = None return label _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir) diff --git a/bin/import_swc.py b/bin/import_swc.py index 0c0ceb64..6ccf256c 100755 --- a/bin/import_swc.py +++ b/bin/import_swc.py @@ -24,7 +24,7 @@ import sox from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download from deepspeech_training.util.importers import validate_label_eng as validate_label -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.tar" SWC_ARCHIVE = "SWC_{language}.tar" @@ -170,7 +170,8 @@ def read_token(token): def in_alphabet(alphabet, c): - return True if alphabet is None else alphabet.has_char(c) + return alphabet.CanEncode(c) if alphabet else True + ALPHABETS = {} @@ -201,16 +202,8 @@ def label_filter(label, language): dont_normalize = DONT_NORMALIZE[language] if language in DONT_NORMALIZE else "" alphabet = get_alphabet(language) for c in label: - if ( - CLI_ARGS.normalize - and c not in dont_normalize - and not in_alphabet(alphabet, c) - ): - c = ( - unicodedata.normalize("NFKD", c) - .encode("ascii", "ignore") - .decode("ascii", "ignore") - ) + if CLI_ARGS.normalize and c not in dont_normalize and not in_alphabet(alphabet, c): + c = unicodedata.normalize("NFKD", c).encode("ascii", "ignore").decode("ascii", "ignore") for sc in c: if not in_alphabet(alphabet, sc): return None, "illegal character" diff --git a/bin/import_tuda.py b/bin/import_tuda.py index 5facd073..d56efa58 100755 --- a/bin/import_tuda.py +++ b/bin/import_tuda.py @@ -16,7 +16,7 @@ import progressbar from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download from deepspeech_training.util.importers import validate_label_eng as validate_label -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet TUDA_VERSION = "v2" TUDA_PACKAGE = "german-speechdata-package-{}".format(TUDA_VERSION) @@ -46,22 +46,18 @@ def maybe_extract(archive): return extracted +def in_alphabet(c): + return ALPHABET.CanEncode(c) if ALPHABET else True + + def check_and_prepare_sentence(sentence): sentence = sentence.lower().replace("co2", "c o zwei") chars = [] for c in sentence: - if ( - CLI_ARGS.normalize - and c not in "äöüß" - and (ALPHABET is None or not ALPHABET.has_char(c)) - ): - c = ( - unicodedata.normalize("NFKD", c) - .encode("ascii", "ignore") - .decode("ascii", "ignore") - ) + if CLI_ARGS.normalize and c not in "äöüß" and not in_alphabet(c): + c = unicodedata.normalize("NFKD", c).encode("ascii", "ignore").decode("ascii", "ignore") for sc in c: - if ALPHABET is not None and not ALPHABET.has_char(c): + if not in_alphabet(c): return None chars.append(sc) return validate_label("".join(chars)) @@ -122,6 +118,7 @@ def write_csvs(extracted): sentence = list(meta.iter("cleaned_sentence"))[0].text sentence = check_and_prepare_sentence(sentence) if sentence is None: + reasons['alphabet filter'] += 1 continue for wav_name in wav_names: sample_counter += 1