diff --git a/bin/import_aidatatang.py b/bin/import_aidatatang.py index d1367281..703c570f 100755 --- a/bin/import_aidatatang.py +++ b/bin/import_aidatatang.py @@ -7,7 +7,7 @@ import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse +from util.importers import get_importers_parser import glob import pandas import tarfile @@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir): def main(): # https://www.openslr.org/62/ - parser = argparse.ArgumentParser(description='Import aidatatang_200zh corpus') + parser = get_importers_parser(description='Import aidatatang_200zh corpus') parser.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz') parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.') params = parser.parse_args() diff --git a/bin/import_aishell.py b/bin/import_aishell.py index 5de1121b..939b5c92 100755 --- a/bin/import_aishell.py +++ b/bin/import_aishell.py @@ -7,7 +7,7 @@ import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse +from util.importers import get_importers_parser import glob import tarfile import pandas @@ -80,7 +80,7 @@ def preprocess_data(tgz_file, target_dir): def main(): # http://www.openslr.org/33/ - parser = argparse.ArgumentParser(description='Import AISHELL corpus') + parser = get_importers_parser(description='Import AISHELL corpus') parser.add_argument('aishell_tgz_file', help='Path to data_aishell.tgz') parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.') params = parser.parse_args() diff --git a/bin/import_cv2.py b/bin/import_cv2.py index acea122b..a4aba6bc 100755 --- a/bin/import_cv2.py +++ b/bin/import_cv2.py @@ -16,7 +16,6 @@ sys.path.insert(1, os.path.join(sys.path[0], '..')) import csv import sox -import argparse import subprocess import progressbar import unicodedata @@ -26,7 +25,8 @@ from threading import RLock from multiprocessing.dummy import Pool from multiprocessing import cpu_count from util.downloader import SIMPLE_BAR -from util.text import Alphabet, validate_label +from util.text import Alphabet +from util.importers import get_importers_parser, validate_label_eng as validate_label from util.helpers import secs_to_hours @@ -136,7 +136,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename): if __name__ == "__main__": - PARSER = argparse.ArgumentParser(description='Import CommonVoice v2.0 corpora') + PARSER = get_importers_parser(description='Import CommonVoice v2.0 corpora') PARSER.add_argument('tsv_dir', help='Directory containing tsv files') PARSER.add_argument('--audio_dir', help='Directory containing the audio clips - defaults to "/clips"') PARSER.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet') diff --git a/bin/import_freestmandarin.py b/bin/import_freestmandarin.py index e600befb..8e6f5615 100755 --- a/bin/import_freestmandarin.py +++ b/bin/import_freestmandarin.py @@ -7,7 +7,7 @@ import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse +from util.importers import get_importers_parser import glob import numpy as np import pandas @@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir): def main(): # https://www.openslr.org/38/ - parser = argparse.ArgumentParser(description='Import Free ST Chinese Mandarin corpus') + parser = get_importers_parser(description='Import Free ST Chinese Mandarin corpus') parser.add_argument('tgz_file', help='Path to ST-CMDS-20170001_1-OS.tar.gz') parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.') params = parser.parse_args() diff --git a/bin/import_gram_vaani.py b/bin/import_gram_vaani.py index e1fdd078..500ed5de 100755 --- a/bin/import_gram_vaani.py +++ b/bin/import_gram_vaani.py @@ -1,12 +1,16 @@ #!/usr/bin/env python +# Make sure we can import stuff from util/ +# This script needs to be run from the root of the DeepSpeech repository import os -import csv import sys +sys.path.insert(1, os.path.join(sys.path[0], '..')) + +import csv import math import urllib import logging -import argparse +from util.importers import get_importers_parser import subprocess from os import path from pathlib import Path @@ -38,7 +42,7 @@ def parse_args(args): Returns: :obj:`argparse.Namespace`: command line parameters namespace """ - parser = argparse.ArgumentParser( + parser = get_importers_parser( description="Imports GramVaani data for Deep Speech" ) parser.add_argument( diff --git a/bin/import_lingua_libre.py b/bin/import_lingua_libre.py index ae893350..b9d6106a 100755 --- a/bin/import_lingua_libre.py +++ b/bin/import_lingua_libre.py @@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -import argparse import os import sys - - sys.path.insert(1, os.path.join(sys.path[0], '..')) +from util.importers import get_importers_parser + import csv import re import sox @@ -173,7 +172,7 @@ def _maybe_convert_wav(ogg_filename, wav_filename): print('SoX processing error', ex, ogg_filename, wav_filename) def handle_args(): - parser = argparse.ArgumentParser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.') + parser = get_importers_parser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.') parser.add_argument(dest='target_dir') parser.add_argument('--qId', type=int, required=True, help='LinguaLibre language qId') parser.add_argument('--iso639-3', type=str, required=True, help='ISO639-3 language code') diff --git a/bin/import_m-ailabs.py b/bin/import_m-ailabs.py index 060e8f2a..16d1bf54 100755 --- a/bin/import_m-ailabs.py +++ b/bin/import_m-ailabs.py @@ -4,12 +4,13 @@ from __future__ import absolute_import, division, print_function # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -import argparse import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) +from util.importers import get_importers_parser + import csv import subprocess import progressbar @@ -168,7 +169,7 @@ def _maybe_convert_sets(target_dir, extracted_data): def handle_args(): - parser = argparse.ArgumentParser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.') + parser = get_importers_parser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.') parser.add_argument(dest='target_dir') parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet') parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones') diff --git a/bin/import_magicdata.py b/bin/import_magicdata.py index 2ec01549..27dbf74a 100755 --- a/bin/import_magicdata.py +++ b/bin/import_magicdata.py @@ -7,7 +7,7 @@ import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse +from util.importers import get_importers_parser import glob import pandas import tarfile @@ -99,7 +99,7 @@ def preprocess_data(folder_with_archives, target_dir): def main(): # https://openslr.org/68/ - parser = argparse.ArgumentParser(description='Import MAGICDATA corpus') + parser = get_importers_parser(description='Import MAGICDATA corpus') parser.add_argument('folder_with_archives', help='Path to folder containing magicdata_{train,dev,test}.tar.gz') parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives') params = parser.parse_args() diff --git a/bin/import_primewords.py b/bin/import_primewords.py index 63f21cf7..0d6fdc52 100755 --- a/bin/import_primewords.py +++ b/bin/import_primewords.py @@ -7,7 +7,7 @@ import os import sys sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse +from util.importers import get_importers_parser import glob import json import numpy as np @@ -93,7 +93,7 @@ def preprocess_data(tgz_file, target_dir): def main(): # https://www.openslr.org/47/ - parser = argparse.ArgumentParser(description='Import Primewords Chinese corpus set 1') + parser = get_importers_parser(description='Import Primewords Chinese corpus set 1') parser.add_argument('tgz_file', help='Path to primewords_md_2018_set1.tar.gz') parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.') params = parser.parse_args() diff --git a/bin/import_slr57.py b/bin/import_slr57.py index 5dde767a..16bac05b 100755 --- a/bin/import_slr57.py +++ b/bin/import_slr57.py @@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -import argparse import os import sys - - sys.path.insert(1, os.path.join(sys.path[0], '..')) +from util.importers import get_importers_parser + import csv import re import sox @@ -195,7 +194,7 @@ def _maybe_convert_sets(target_dir, extracted_data): print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE)) def handle_args(): - parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.') + parser = get_importers_parser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.') parser.add_argument(dest='target_dir') parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet') parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones') diff --git a/bin/import_ts.py b/bin/import_ts.py index 4aaa058c..363e639e 100755 --- a/bin/import_ts.py +++ b/bin/import_ts.py @@ -3,14 +3,13 @@ from __future__ import absolute_import, division, print_function # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -import argparse import os import re import sys - - sys.path.insert(1, os.path.join(sys.path[0], '..')) +from util.importers import get_importers_parser + import csv import unidecode import zipfile @@ -186,7 +185,7 @@ def cleanup_transcript(text, english_compatible=False): def handle_args(): - parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.') + parser = get_importers_parser(description='Importer for TrainingSpeech dataset.') parser.add_argument(dest='target_dir') parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.') return parser.parse_args() diff --git a/util/importers.py b/util/importers.py new file mode 100644 index 00000000..9f7ba8df --- /dev/null +++ b/util/importers.py @@ -0,0 +1,28 @@ +import argparse +import re + +def get_importers_parser(description): + parser = argparse.ArgumentParser(description=description) + return parser + +# Validate and normalize transcriptions. Returns a cleaned version of the label +# or None if it's invalid. +def validate_label_eng(label): + # For now we can only handle [a-z '] + if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None: + return None + + label = label.replace("-", " ") + label = label.replace("_", " ") + label = re.sub("[ ]{2,}", " ", label) + label = label.replace(".", "") + label = label.replace(",", "") + label = label.replace(";", "") + label = label.replace("?", "") + label = label.replace("!", "") + label = label.replace(":", "") + label = label.replace("\"", "") + label = label.strip() + label = label.lower() + + return label if label else None diff --git a/util/test_importers.py b/util/test_importers.py new file mode 100644 index 00000000..884c2193 --- /dev/null +++ b/util/test_importers.py @@ -0,0 +1,12 @@ +import unittest + +from .importers import validate_label_eng + +class TestValidateLabelEng(unittest.TestCase): + + def test_numbers(self): + label = validate_label_eng("this is a 1 2 3 test") + self.assertEqual(label, None) + +if __name__ == '__main__': + unittest.main() diff --git a/util/text.py b/util/text.py index af958191..60bfe9f1 100644 --- a/util/text.py +++ b/util/text.py @@ -1,7 +1,6 @@ from __future__ import absolute_import, division, print_function import numpy as np -import re import struct from six.moves import range @@ -166,25 +165,3 @@ def levenshtein(a, b): current[j] = min(add, delete, change) return current[n] - -# Validate and normalize transcriptions. Returns a cleaned version of the label -# or None if it's invalid. -def validate_label(label): - # For now we can only handle [a-z '] - if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None: - return None - - label = label.replace("-", " ") - label = label.replace("_", " ") - label = re.sub("[ ]{2,}", " ", label) - label = label.replace(".", "") - label = label.replace(",", "") - label = label.replace(";", "") - label = label.replace("?", "") - label = label.replace("!", "") - label = label.replace(":", "") - label = label.replace("\"", "") - label = label.strip() - label = label.lower() - - return label if label else None