Share argparser amongst importers
This commit is contained in:
parent
29a2ac37f0
commit
f9e05fe0c3
@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import glob
|
||||
import pandas
|
||||
import tarfile
|
||||
@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir):
|
||||
|
||||
def main():
|
||||
# https://www.openslr.org/62/
|
||||
parser = argparse.ArgumentParser(description='Import aidatatang_200zh corpus')
|
||||
parser = get_importers_parser(description='Import aidatatang_200zh corpus')
|
||||
parser.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz')
|
||||
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
|
||||
params = parser.parse_args()
|
||||
|
@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import glob
|
||||
import tarfile
|
||||
import pandas
|
||||
@ -80,7 +80,7 @@ def preprocess_data(tgz_file, target_dir):
|
||||
|
||||
def main():
|
||||
# http://www.openslr.org/33/
|
||||
parser = argparse.ArgumentParser(description='Import AISHELL corpus')
|
||||
parser = get_importers_parser(description='Import AISHELL corpus')
|
||||
parser.add_argument('aishell_tgz_file', help='Path to data_aishell.tgz')
|
||||
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
|
||||
params = parser.parse_args()
|
||||
|
@ -16,7 +16,6 @@ sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import csv
|
||||
import sox
|
||||
import argparse
|
||||
import subprocess
|
||||
import progressbar
|
||||
import unicodedata
|
||||
@ -26,7 +25,8 @@ from threading import RLock
|
||||
from multiprocessing.dummy import Pool
|
||||
from multiprocessing import cpu_count
|
||||
from util.downloader import SIMPLE_BAR
|
||||
from util.text import Alphabet, validate_label
|
||||
from util.text import Alphabet
|
||||
from util.importers import get_importers_parser, validate_label_eng as validate_label
|
||||
from util.helpers import secs_to_hours
|
||||
|
||||
|
||||
@ -136,7 +136,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
PARSER = argparse.ArgumentParser(description='Import CommonVoice v2.0 corpora')
|
||||
PARSER = get_importers_parser(description='Import CommonVoice v2.0 corpora')
|
||||
PARSER.add_argument('tsv_dir', help='Directory containing tsv files')
|
||||
PARSER.add_argument('--audio_dir', help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
|
||||
PARSER.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
|
||||
|
@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import glob
|
||||
import numpy as np
|
||||
import pandas
|
||||
@ -81,7 +81,7 @@ def preprocess_data(tgz_file, target_dir):
|
||||
|
||||
def main():
|
||||
# https://www.openslr.org/38/
|
||||
parser = argparse.ArgumentParser(description='Import Free ST Chinese Mandarin corpus')
|
||||
parser = get_importers_parser(description='Import Free ST Chinese Mandarin corpus')
|
||||
parser.add_argument('tgz_file', help='Path to ST-CMDS-20170001_1-OS.tar.gz')
|
||||
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
|
||||
params = parser.parse_args()
|
||||
|
@ -1,12 +1,16 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Make sure we can import stuff from util/
|
||||
# This script needs to be run from the root of the DeepSpeech repository
|
||||
import os
|
||||
import csv
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import csv
|
||||
import math
|
||||
import urllib
|
||||
import logging
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import subprocess
|
||||
from os import path
|
||||
from pathlib import Path
|
||||
@ -38,7 +42,7 @@ def parse_args(args):
|
||||
Returns:
|
||||
:obj:`argparse.Namespace`: command line parameters namespace
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = get_importers_parser(
|
||||
description="Imports GramVaani data for Deep Speech"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function
|
||||
|
||||
# Make sure we can import stuff from util/
|
||||
# This script needs to be run from the root of the DeepSpeech repository
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
from util.importers import get_importers_parser
|
||||
|
||||
import csv
|
||||
import re
|
||||
import sox
|
||||
@ -173,7 +172,7 @@ def _maybe_convert_wav(ogg_filename, wav_filename):
|
||||
print('SoX processing error', ex, ogg_filename, wav_filename)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
|
||||
parser = get_importers_parser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
|
||||
parser.add_argument(dest='target_dir')
|
||||
parser.add_argument('--qId', type=int, required=True, help='LinguaLibre language qId')
|
||||
parser.add_argument('--iso639-3', type=str, required=True, help='ISO639-3 language code')
|
||||
|
@ -4,12 +4,13 @@ from __future__ import absolute_import, division, print_function
|
||||
|
||||
# Make sure we can import stuff from util/
|
||||
# This script needs to be run from the root of the DeepSpeech repository
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
from util.importers import get_importers_parser
|
||||
|
||||
import csv
|
||||
import subprocess
|
||||
import progressbar
|
||||
@ -168,7 +169,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
||||
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
|
||||
parser = get_importers_parser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
|
||||
parser.add_argument(dest='target_dir')
|
||||
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
|
||||
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')
|
||||
|
@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import glob
|
||||
import pandas
|
||||
import tarfile
|
||||
@ -99,7 +99,7 @@ def preprocess_data(folder_with_archives, target_dir):
|
||||
|
||||
def main():
|
||||
# https://openslr.org/68/
|
||||
parser = argparse.ArgumentParser(description='Import MAGICDATA corpus')
|
||||
parser = get_importers_parser(description='Import MAGICDATA corpus')
|
||||
parser.add_argument('folder_with_archives', help='Path to folder containing magicdata_{train,dev,test}.tar.gz')
|
||||
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives')
|
||||
params = parser.parse_args()
|
||||
|
@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
import argparse
|
||||
from util.importers import get_importers_parser
|
||||
import glob
|
||||
import json
|
||||
import numpy as np
|
||||
@ -93,7 +93,7 @@ def preprocess_data(tgz_file, target_dir):
|
||||
|
||||
def main():
|
||||
# https://www.openslr.org/47/
|
||||
parser = argparse.ArgumentParser(description='Import Primewords Chinese corpus set 1')
|
||||
parser = get_importers_parser(description='Import Primewords Chinese corpus set 1')
|
||||
parser.add_argument('tgz_file', help='Path to primewords_md_2018_set1.tar.gz')
|
||||
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
|
||||
params = parser.parse_args()
|
||||
|
@ -3,13 +3,12 @@ from __future__ import absolute_import, division, print_function
|
||||
|
||||
# Make sure we can import stuff from util/
|
||||
# This script needs to be run from the root of the DeepSpeech repository
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
from util.importers import get_importers_parser
|
||||
|
||||
import csv
|
||||
import re
|
||||
import sox
|
||||
@ -195,7 +194,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
||||
print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
|
||||
parser = get_importers_parser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
|
||||
parser.add_argument(dest='target_dir')
|
||||
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
|
||||
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')
|
||||
|
@ -3,14 +3,13 @@ from __future__ import absolute_import, division, print_function
|
||||
|
||||
# Make sure we can import stuff from util/
|
||||
# This script needs to be run from the root of the DeepSpeech repository
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
|
||||
from util.importers import get_importers_parser
|
||||
|
||||
import csv
|
||||
import unidecode
|
||||
import zipfile
|
||||
@ -186,7 +185,7 @@ def cleanup_transcript(text, english_compatible=False):
|
||||
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
|
||||
parser = get_importers_parser(description='Importer for TrainingSpeech dataset.')
|
||||
parser.add_argument(dest='target_dir')
|
||||
parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.')
|
||||
return parser.parse_args()
|
||||
|
28
util/importers.py
Normal file
28
util/importers.py
Normal file
@ -0,0 +1,28 @@
|
||||
import argparse
|
||||
import re
|
||||
|
||||
def get_importers_parser(description):
    """Build the ArgumentParser shared by all importer scripts.

    Args:
        description: human-readable description shown in ``--help``.

    Returns:
        argparse.ArgumentParser: a fresh parser; callers add their own
        importer-specific arguments to it.
    """
    return argparse.ArgumentParser(description=description)
|
||||
|
||||
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def validate_label_eng(label):
    """Clean an English transcription label.

    Args:
        label: raw transcription text.

    Returns:
        The normalized label (lowercase, punctuation removed, dashes and
        underscores turned into spaces), or ``None`` when the label contains
        digits or other unsupported characters, or cleans down to nothing.
    """
    # For now we can only handle [a-z ']
    if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None:
        return None

    # Dashes and underscores act as word separators.
    cleaned = label.replace("-", " ").replace("_", " ")
    # Collapse runs of spaces *before* punctuation removal — same processing
    # order as the per-character replace chain this rewrites.
    cleaned = re.sub("[ ]{2,}", " ", cleaned)
    # Drop all punctuation in a single C-level pass.
    cleaned = cleaned.translate(str.maketrans("", "", ".,;?!:\""))
    cleaned = cleaned.strip().lower()

    return cleaned if cleaned else None
|
12
util/test_importers.py
Normal file
12
util/test_importers.py
Normal file
@ -0,0 +1,12 @@
|
||||
import unittest
|
||||
|
||||
from .importers import validate_label_eng
|
||||
|
||||
class TestValidateLabelEng(unittest.TestCase):
    """Unit tests for ``validate_label_eng``."""

    def test_numbers(self):
        # Labels containing digits cannot be transcribed, so they are rejected.
        self.assertIsNone(validate_label_eng("this is a 1 2 3 test"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
23
util/text.py
23
util/text.py
@ -1,7 +1,6 @@
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import numpy as np
|
||||
import re
|
||||
import struct
|
||||
|
||||
from six.moves import range
|
||||
@ -166,25 +165,3 @@ def levenshtein(a, b):
|
||||
current[j] = min(add, delete, change)
|
||||
|
||||
return current[n]
|
||||
|
||||
# Validate and normalize transcriptions. Returns a cleaned version of the label
# or None if it's invalid.
def validate_label(label):
    """Clean a transcription label for training.

    Args:
        label: raw transcription text.

    Returns:
        The normalized label (lowercase, punctuation removed, dashes and
        underscores converted to spaces), or ``None`` when the label holds
        digits or other unsupported characters, or ends up empty.
    """
    # For now we can only handle [a-z ']
    if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None:
        return None

    # Word-separator substitutions first, then collapse repeated spaces.
    text = re.sub("[ ]{2,}", " ", label.replace("-", " ").replace("_", " "))
    # Strip punctuation one character class at a time.
    for ch in ('.', ',', ';', '?', '!', ':', '"'):
        text = text.replace(ch, "")
    text = text.strip().lower()

    return text or None
|
||||
|
Loading…
x
Reference in New Issue
Block a user