Merge pull request #2625 from tilmankamp/swc_debug

Implements #2624 - SWC importer: CSV columns for article and speaker
This commit is contained in:
Tilman Kamp 2020-01-02 13:19:05 +01:00 committed by GitHub
commit 242d70dc8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -34,8 +34,10 @@ SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.ta
SWC_ARCHIVE = "SWC_{language}.tar" SWC_ARCHIVE = "SWC_{language}.tar"
LANGUAGES = ['dutch', 'english', 'german'] LANGUAGES = ['dutch', 'english', 'german']
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript'] FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
FIELDNAMES_EXT = FIELDNAMES + ['article', 'speaker']
CHANNELS = 1 CHANNELS = 1
SAMPLE_RATE = 16000 SAMPLE_RATE = 16000
UNKNOWN = '<unknown>'
AUDIO_PATTERN = 'audio*.ogg' AUDIO_PATTERN = 'audio*.ogg'
WAV_NAME = 'audio.wav' WAV_NAME = 'audio.wav'
ALIGNED_NAME = 'aligned.swc' ALIGNED_NAME = 'aligned.swc'
@ -65,11 +67,12 @@ PRE_FILTER = str.maketrans(dict.fromkeys('/()[]{}<>:'))
class Sample: class Sample:
def __init__(self, wav_path, start, end, text, speaker, sub_set=None): def __init__(self, wav_path, start, end, text, article, speaker, sub_set=None):
self.wav_path = wav_path self.wav_path = wav_path
self.start = start self.start = start
self.end = end self.end = end
self.text = text self.text = text
self.article = article
self.speaker = speaker self.speaker = speaker
self.sub_set = sub_set self.sub_set = sub_set
@ -203,7 +206,7 @@ def collect_samples(base_dir, language):
samples = [] samples = []
reasons = Counter() reasons = Counter()
def add_sample(p_wav_path, p_speaker, p_start, p_end, p_text, p_reason='complete'): def add_sample(p_wav_path, p_article, p_speaker, p_start, p_end, p_text, p_reason='complete'):
if p_start is not None and p_end is not None and p_text is not None: if p_start is not None and p_end is not None and p_text is not None:
duration = p_end - p_start duration = p_end - p_start
text, filter_reason = label_filter(p_text, language) text, filter_reason = label_filter(p_text, language)
@ -211,6 +214,12 @@ def collect_samples(base_dir, language):
if filter_reason is not None: if filter_reason is not None:
skip = True skip = True
p_reason = filter_reason p_reason = filter_reason
elif CLI_ARGS.exclude_unknown_speakers and p_speaker == UNKNOWN:
skip = True
p_reason = 'unknown speaker'
elif CLI_ARGS.exclude_unknown_articles and p_article == UNKNOWN:
skip = True
p_reason = 'unknown article'
elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long: elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long:
skip = True skip = True
p_reason = 'exceeded duration' p_reason = 'exceeded duration'
@ -223,7 +232,7 @@ def collect_samples(base_dir, language):
if skip: if skip:
reasons[p_reason] += 1 reasons[p_reason] += 1
else: else:
samples.append(Sample(p_wav_path, p_start, p_end, text, p_speaker)) samples.append(Sample(p_wav_path, p_start, p_end, text, p_article, p_speaker))
elif p_start is None or p_end is None: elif p_start is None or p_end is None:
reasons['missing timestamps'] += 1 reasons['missing timestamps'] += 1
else: else:
@ -234,12 +243,15 @@ def collect_samples(base_dir, language):
for root in bar(roots): for root in bar(roots):
wav_path = path.join(root, WAV_NAME) wav_path = path.join(root, WAV_NAME)
aligned = ET.parse(path.join(root, ALIGNED_NAME)) aligned = ET.parse(path.join(root, ALIGNED_NAME))
speaker = '<unknown>' article = UNKNOWN
speaker = UNKNOWN
for prop in aligned.iter('prop'): for prop in aligned.iter('prop'):
attributes = prop.attrib attributes = prop.attrib
if 'key' in attributes and 'value' in attributes and attributes['key'] == 'reader.name': if 'key' in attributes and 'value' in attributes:
speaker = attributes['value'] if attributes['key'] == 'DC.identifier':
break article = attributes['value']
elif attributes['key'] == 'reader.name':
speaker = attributes['value']
for sentence in aligned.iter('s'): for sentence in aligned.iter('s'):
if ignored(sentence): if ignored(sentence):
continue continue
@ -248,7 +260,7 @@ def collect_samples(base_dir, language):
sample_start, sample_end, token_texts, sample_texts = None, None, [], [] sample_start, sample_end, token_texts, sample_texts = None, None, [], []
for token_start, token_end, token_text in tokens: for token_start, token_end, token_text in tokens:
if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text): if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text):
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
p_reason='has numbers') p_reason='has numbers')
sample_start, sample_end, token_texts, sample_texts = None, None, [], [] sample_start, sample_end, token_texts, sample_texts = None, None, [], []
continue continue
@ -259,7 +271,7 @@ def collect_samples(base_dir, language):
token_texts.append(token_text) token_texts.append(token_text)
if token_end is not None: if token_end is not None:
if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0: if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0:
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
p_reason='split') p_reason='split')
sample_start = sample_end sample_start = sample_end
sample_texts = [] sample_texts = []
@ -267,7 +279,7 @@ def collect_samples(base_dir, language):
sample_end = token_end sample_end = token_end
sample_texts.extend(token_texts) sample_texts.extend(token_texts)
token_texts = [] token_texts = []
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
p_reason='split' if split else 'complete') p_reason='split' if split else 'complete')
print('Skipped samples:') print('Skipped samples:')
for reason, n in reasons.most_common(): for reason, n in reasons.most_common():
@ -382,15 +394,19 @@ def write_csvs(samples, language):
csv_path = path.join(base_dir, language + '-' + sub_set + '.csv') csv_path = path.join(base_dir, language + '-' + sub_set + '.csv')
print('Writing "{}"...'.format(csv_path)) print('Writing "{}"...'.format(csv_path))
with open(csv_path, 'w') as csv_file: with open(csv_path, 'w') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES) writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES_EXT if CLI_ARGS.add_meta else FIELDNAMES)
writer.writeheader() writer.writeheader()
bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR) bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR)
for sample in bar(set_samples): for sample in bar(set_samples):
writer.writerow({ row = {
'wav_filename': path.relpath(sample.wav_path, base_dir), 'wav_filename': path.relpath(sample.wav_path, base_dir),
'wav_filesize': path.getsize(sample.wav_path), 'wav_filesize': path.getsize(sample.wav_path),
'transcript': sample.text 'transcript': sample.text
}) }
if CLI_ARGS.add_meta:
row['article'] = sample.article
row['speaker'] = sample.speaker
writer.writerow(row)
def cleanup(archive, language): def cleanup(archive, language):
@ -428,6 +444,9 @@ def handle_args():
for language in LANGUAGES: for language in LANGUAGES:
parser.add_argument('--{}_alphabet'.format(language), parser.add_argument('--{}_alphabet'.format(language),
help='Exclude {} samples with characters not in provided alphabet file'.format(language)) help='Exclude {} samples with characters not in provided alphabet file'.format(language))
parser.add_argument('--add_meta', action='store_true', help='Adds article and speaker CSV columns')
parser.add_argument('--exclude_unknown_speakers', action='store_true', help='Exclude unknown speakers')
parser.add_argument('--exclude_unknown_articles', action='store_true', help='Exclude unknown articles')
parser.add_argument('--keep_archive', type=bool, default=True, parser.add_argument('--keep_archive', type=bool, default=True,
help='If downloaded archives should be kept') help='If downloaded archives should be kept')
parser.add_argument('--keep_intermediate', type=bool, default=False, parser.add_argument('--keep_intermediate', type=bool, default=False,