Merge pull request #1647 from nicolaspanel/master
[TrainingSpeech importer] generate english-compatible dataset
Commit 5118f7ffed
@@ -3,6 +3,7 @@ from __future__ import absolute_import, division, print_function
 
 # Make sure we can import stuff from util/
 # This script needs to be run from the root of the DeepSpeech repository
+import argparse
 import os
 import re
 import sys
@@ -11,6 +12,7 @@ import sys
 sys.path.insert(1, os.path.join(sys.path[0], '..'))
 
 import csv
+import unidecode
 import zipfile
 
 from os import path
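Note for reviewers: unidecode is a third-party package from PyPI, not part of the standard library, so it has to be installed wherever this importer runs (whether the project's requirements files were updated is not visible on this page). A quick sanity check, with the accented word chosen only for illustration:

# Confirm the new dependency is available (install with: pip install unidecode).
import unidecode

print(unidecode.unidecode('française'))  # -> 'francaise'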
@@ -25,7 +27,7 @@ ARCHIVE_DIR_NAME = 'ts_' + ARCHIVE_NAME
 ARCHIVE_URL = 'https://s3.eu-west-3.amazonaws.com/audiocorp/releases/' + ARCHIVE_NAME + '.zip'
 
 
-def _download_and_preprocess_data(target_dir):
+def _download_and_preprocess_data(target_dir, english_compatible=False):
     # Making path absolute
     target_dir = path.abspath(target_dir)
     # Conditionally download data
@@ -33,7 +35,8 @@ def _download_and_preprocess_data(target_dir):
     # Conditionally extract archive data
     _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
     # Conditionally convert TrainingSpeech data to DeepSpeech CSVs and wav
-    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
+    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME, english_compatible=english_compatible)
 
+
 def _maybe_extract(target_dir, extracted_data, archive_path):
     # If target_dir/extracted_data does not exist, extract archive in target_dir
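Note that english_compatible defaults to False at every level, so running the importer without the new option produces the same CSVs as before; the conversion only changes when the flag is passed down explicitly. A minimal sketch of a direct call, where the module name and the target path are assumptions rather than something shown in this diff:

# Hypothetical direct use of the changed entry point; 'import_ts' (module name)
# and '/tmp/ts_data' (target directory) are placeholders, not taken from this page.
import import_ts

import_ts._download_and_preprocess_data('/tmp/ts_data', english_compatible=True)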
@@ -47,7 +50,8 @@ def _maybe_extract(target_dir, extracted_data, archive_path):
     else:
         print('Found directory "%s" - not extracting it from archive.' % archive_path)
 
-def _maybe_convert_sets(target_dir, extracted_data):
+
+def _maybe_convert_sets(target_dir, extracted_data, english_compatible=False):
     extracted_dir = path.join(target_dir, extracted_data)
     # override existing CSV with normalized one
     target_csv_template = os.path.join(target_dir, 'ts_' + ARCHIVE_NAME + '_{}.csv')
@@ -70,7 +74,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
                 test_writer.writeheader()
 
                 for i, item in enumerate(data):
-                    transcript = validate_label(cleanup_transcript(item['text']))
+                    transcript = validate_label(cleanup_transcript(item['text'], english_compatible=english_compatible))
                     if not transcript:
                         continue
                     wav_filename = os.path.join(target_dir, extracted_data, item['path'])
@@ -92,12 +96,22 @@ PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
 MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
 
 
-def cleanup_transcript(text):
+def cleanup_transcript(text, english_compatible=False):
     text = text.replace('’', "'").replace('\u00A0', ' ')
     text = PUNCTUATIONS_REG.sub(' ', text)
     text = MULTIPLE_SPACES_REG.sub(' ', text)
+    if english_compatible:
+        text = unidecode.unidecode(text)
     return text.strip().lower()
 
 
+def handle_args():
+    parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
+    parser.add_argument(dest='target_dir')
+    parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.')
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
-    _download_and_preprocess_data(sys.argv[1])
+    cli_args = handle_args()
+    _download_and_preprocess_data(cli_args.target_dir, cli_args.english_compatible)
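To see what the new branch actually does to French transcripts, the snippet below re-creates cleanup_transcript exactly as it appears after this change so it can be run outside the importer; the sample sentence is made up for illustration:

# Standalone copy of cleanup_transcript from this diff, plus one made-up sample.
import re

import unidecode

PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')


def cleanup_transcript(text, english_compatible=False):
    text = text.replace('’', "'").replace('\u00A0', ' ')
    text = PUNCTUATIONS_REG.sub(' ', text)
    text = MULTIPLE_SPACES_REG.sub(' ', text)
    if english_compatible:
        # Transliterate accented characters to plain ASCII (é -> e, ç -> c, œ -> oe).
        text = unidecode.unidecode(text)
    return text.strip().lower()


sample = 'Ça a déjà été écrit, n’est-ce pas ?'
print(cleanup_transcript(sample))                           # ça a déjà été écrit n'est-ce pas
print(cleanup_transcript(sample, english_compatible=True))  # ca a deja ete ecrit n'est-ce pas

Without the flag the transcripts keep their French diacritics; with it they fall back to the plain ASCII range, which is what the PR title refers to as an english-compatible dataset.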
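Finally, the new command-line surface: handle_args() reads sys.argv, so the sketch below feeds the parser an explicit argument list to show what the importer's main block now receives. The target path is a placeholder, and the script path mentioned in the comment (bin/import_ts.py) is an assumption, since the file name is not shown on this page.

# Same parser as handle_args() in this diff, exercised with an explicit
# argument list so it runs outside the importer; '/tmp/ts_data' is a placeholder.
import argparse

parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
parser.add_argument(dest='target_dir')
parser.add_argument('--english-compatible', action='store_true', dest='english_compatible',
                    help='Remove diactrics and other non-ascii chars.')

cli_args = parser.parse_args(['/tmp/ts_data', '--english-compatible'])
print(cli_args.target_dir, cli_args.english_compatible)  # /tmp/ts_data True

# Equivalent shell invocation, assuming the script lives at bin/import_ts.py
# and is run from the repository root as the header comment requires:
#   python bin/import_ts.py --english-compatible /tmp/ts_data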