Share argparser amongst importers

This commit is contained in:
Alexandre Lissy 2020-03-10 12:13:12 +01:00 committed by Alexandre Lissy
parent 29a2ac37f0
commit f9e05fe0c3
14 changed files with 72 additions and 53 deletions

View File

@ -7,7 +7,7 @@ import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
from util.importers import get_importers_parser
import glob
import pandas
import tarfile
@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir):
def main():
# https://www.openslr.org/62/
parser = argparse.ArgumentParser(description='Import aidatatang_200zh corpus')
parser = get_importers_parser(description='Import aidatatang_200zh corpus')
parser.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz')
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
params = parser.parse_args()

View File

@ -7,7 +7,7 @@ import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
from util.importers import get_importers_parser
import glob
import tarfile
import pandas
@ -80,7 +80,7 @@ def preprocess_data(tgz_file, target_dir):
def main():
# http://www.openslr.org/33/
parser = argparse.ArgumentParser(description='Import AISHELL corpus')
parser = get_importers_parser(description='Import AISHELL corpus')
parser.add_argument('aishell_tgz_file', help='Path to data_aishell.tgz')
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
params = parser.parse_args()

View File

@ -16,7 +16,6 @@ sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv
import sox
import argparse
import subprocess
import progressbar
import unicodedata
@ -26,7 +25,8 @@ from threading import RLock
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from util.downloader import SIMPLE_BAR
from util.text import Alphabet, validate_label
from util.text import Alphabet
from util.importers import get_importers_parser, validate_label_eng as validate_label
from util.helpers import secs_to_hours
@ -136,7 +136,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(description='Import CommonVoice v2.0 corpora')
PARSER = get_importers_parser(description='Import CommonVoice v2.0 corpora')
PARSER.add_argument('tsv_dir', help='Directory containing tsv files')
PARSER.add_argument('--audio_dir', help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
PARSER.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')

View File

@ -7,7 +7,7 @@ import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
from util.importers import get_importers_parser
import glob
import numpy as np
import pandas
@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir):
def main():
# https://www.openslr.org/38/
parser = argparse.ArgumentParser(description='Import Free ST Chinese Mandarin corpus')
parser = get_importers_parser(description='Import Free ST Chinese Mandarin corpus')
parser.add_argument('tgz_file', help='Path to ST-CMDS-20170001_1-OS.tar.gz')
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
params = parser.parse_args()

View File

@ -1,12 +1,16 @@
#!/usr/bin/env python
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import os
import csv
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv
import math
import urllib
import logging
import argparse
from util.importers import get_importers_parser
import subprocess
from os import path
from pathlib import Path
@ -38,7 +42,7 @@ def parse_args(args):
Returns:
:obj:`argparse.Namespace`: command line parameters namespace
"""
parser = argparse.ArgumentParser(
parser = get_importers_parser(
description="Imports GramVaani data for Deep Speech"
)
parser.add_argument(

View File

@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import argparse
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util.importers import get_importers_parser
import csv
import re
import sox
@ -173,7 +172,7 @@ def _maybe_convert_wav(ogg_filename, wav_filename):
print('SoX processing error', ex, ogg_filename, wav_filename)
def handle_args():
parser = argparse.ArgumentParser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
parser = get_importers_parser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
parser.add_argument(dest='target_dir')
parser.add_argument('--qId', type=int, required=True, help='LinguaLibre language qId')
parser.add_argument('--iso639-3', type=str, required=True, help='ISO639-3 language code')

View File

@ -4,12 +4,13 @@ from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import argparse
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util.importers import get_importers_parser
import csv
import subprocess
import progressbar
@ -168,7 +169,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
def handle_args():
parser = argparse.ArgumentParser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
parser = get_importers_parser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
parser.add_argument(dest='target_dir')
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')

View File

@ -7,7 +7,7 @@ import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
from util.importers import get_importers_parser
import glob
import pandas
import tarfile
@ -99,7 +99,7 @@ def preprocess_data(folder_with_archives, target_dir):
def main():
# https://openslr.org/68/
parser = argparse.ArgumentParser(description='Import MAGICDATA corpus')
parser = get_importers_parser(description='Import MAGICDATA corpus')
parser.add_argument('folder_with_archives', help='Path to folder containing magicdata_{train,dev,test}.tar.gz')
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives')
params = parser.parse_args()

View File

@ -7,7 +7,7 @@ import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse
from util.importers import get_importers_parser
import glob
import json
import numpy as np
@ -93,7 +93,7 @@ def preprocess_data(tgz_file, target_dir):
def main():
# https://www.openslr.org/47/
parser = argparse.ArgumentParser(description='Import Primewords Chinese corpus set 1')
parser = get_importers_parser(description='Import Primewords Chinese corpus set 1')
parser.add_argument('tgz_file', help='Path to primewords_md_2018_set1.tar.gz')
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
params = parser.parse_args()

View File

@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import argparse
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util.importers import get_importers_parser
import csv
import re
import sox
@ -195,7 +194,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))
def handle_args():
parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
parser = get_importers_parser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
parser.add_argument(dest='target_dir')
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')

View File

@ -3,14 +3,13 @@ from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import argparse
import os
import re
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util.importers import get_importers_parser
import csv
import unidecode
import zipfile
@ -186,7 +185,7 @@ def cleanup_transcript(text, english_compatible=False):
def handle_args():
parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
parser = get_importers_parser(description='Importer for TrainingSpeech dataset.')
parser.add_argument(dest='target_dir')
parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.')
return parser.parse_args()

28
util/importers.py Normal file
View File

@ -0,0 +1,28 @@
import argparse
import re
def get_importers_parser(description):
    """Return the argparse parser shared by every importer script.

    Central place to later attach options common to all importers;
    for now it simply builds a plain ``ArgumentParser``.
    """
    return argparse.ArgumentParser(description=description)
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def validate_label_eng(label):
    """Clean up an English transcription.

    Rejects (returns ``None``) labels containing digits or bracket-like
    characters; otherwise maps dashes/underscores to spaces, squeezes runs
    of spaces, drops punctuation, trims, and lower-cases the result.
    Returns ``None`` when nothing is left after cleaning.
    """
    # For now we can only handle [a-z ']
    if re.search(r"[0-9]|[(<\[\]&*{]", label):
        return None
    # Dashes and underscores act as word separators.
    label = label.translate(str.maketrans("-_", "  "))
    # Squeeze repeated spaces *before* stripping punctuation, matching the
    # original pass order (punctuation removal may leave double spaces).
    label = re.sub("[ ]{2,}", " ", label)
    # Punctuation carries no transcription value — delete it in one pass.
    label = label.translate(str.maketrans("", "", ".,;?!:\""))
    label = label.strip().lower()
    return label if label else None

12
util/test_importers.py Normal file
View File

@ -0,0 +1,12 @@
import unittest
from .importers import validate_label_eng
class TestValidateLabelEng(unittest.TestCase):
    """Sanity checks for the English transcription validator."""

    def test_numbers(self):
        # Labels containing digits cannot be transcribed and are rejected.
        self.assertIsNone(validate_label_eng("this is a 1 2 3 test"))
# Allow running this test module directly (outside a test runner).
if __name__ == '__main__':
    unittest.main()

View File

@ -1,7 +1,6 @@
from __future__ import absolute_import, division, print_function
import numpy as np
import re
import struct
from six.moves import range
@ -166,25 +165,3 @@ def levenshtein(a, b):
current[j] = min(add, delete, change)
return current[n]
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def validate_label(label):
    """Normalize *label* down to roughly the [a-z '] alphabet.

    Returns the cleaned label, or ``None`` if the input contains digits
    or bracket-like characters, or if nothing remains after cleaning.
    """
    # For now we can only handle [a-z ']
    if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None:
        return None
    # Word-separator characters become spaces.
    for separator in "-_":
        label = label.replace(separator, " ")
    # Collapse space runs before punctuation removal (original pass order).
    label = re.sub("[ ]{2,}", " ", label)
    # Strip punctuation entirely.
    for punctuation in ".,;?!:\"":
        label = label.replace(punctuation, "")
    label = label.strip().lower()
    return label if label else None