From 259a60b7b1fd0ef348cebf1da446d221e8eafcf5 Mon Sep 17 00:00:00 2001 From: Tilman Kamp <5991088+tilmankamp@users.noreply.github.com> Date: Mon, 30 Dec 2019 18:09:44 +0100 Subject: [PATCH] Implements #2624 - SWC importer: CSV columns for article and speaker --- bin/import_swc.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/bin/import_swc.py b/bin/import_swc.py index 6b9273cf..93410805 100755 --- a/bin/import_swc.py +++ b/bin/import_swc.py @@ -34,8 +34,10 @@ SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.ta SWC_ARCHIVE = "SWC_{language}.tar" LANGUAGES = ['dutch', 'english', 'german'] FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript'] +FIELDNAMES_EXT = FIELDNAMES + ['article', 'speaker'] CHANNELS = 1 SAMPLE_RATE = 16000 +UNKNOWN = '' AUDIO_PATTERN = 'audio*.ogg' WAV_NAME = 'audio.wav' ALIGNED_NAME = 'aligned.swc' @@ -65,11 +67,12 @@ PRE_FILTER = str.maketrans(dict.fromkeys('/()[]{}<>:')) class Sample: - def __init__(self, wav_path, start, end, text, speaker, sub_set=None): + def __init__(self, wav_path, start, end, text, article, speaker, sub_set=None): self.wav_path = wav_path self.start = start self.end = end self.text = text + self.article = article self.speaker = speaker self.sub_set = sub_set @@ -203,7 +206,7 @@ def collect_samples(base_dir, language): samples = [] reasons = Counter() - def add_sample(p_wav_path, p_speaker, p_start, p_end, p_text, p_reason='complete'): + def add_sample(p_wav_path, p_article, p_speaker, p_start, p_end, p_text, p_reason='complete'): if p_start is not None and p_end is not None and p_text is not None: duration = p_end - p_start text, filter_reason = label_filter(p_text, language) @@ -211,6 +214,12 @@ def collect_samples(base_dir, language): if filter_reason is not None: skip = True p_reason = filter_reason + elif CLI_ARGS.exclude_unknown_speakers and p_speaker == UNKNOWN: + skip = True + p_reason = 'unknown speaker' + elif CLI_ARGS.exclude_unknown_articles and p_article == UNKNOWN: + skip = True + p_reason = 'unknown article' elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long: skip = True p_reason = 'exceeded duration' @@ -223,7 +232,7 @@ def collect_samples(base_dir, language): if skip: reasons[p_reason] += 1 else: - samples.append(Sample(p_wav_path, p_start, p_end, text, p_speaker)) + samples.append(Sample(p_wav_path, p_start, p_end, text, p_article, p_speaker)) elif p_start is None or p_end is None: reasons['missing timestamps'] += 1 else: @@ -234,12 +243,15 @@ def collect_samples(base_dir, language): for root in bar(roots): wav_path = path.join(root, WAV_NAME) aligned = ET.parse(path.join(root, ALIGNED_NAME)) - speaker = '' + article = UNKNOWN + speaker = UNKNOWN for prop in aligned.iter('prop'): attributes = prop.attrib - if 'key' in attributes and 'value' in attributes and attributes['key'] == 'reader.name': - speaker = attributes['value'] - break + if 'key' in attributes and 'value' in attributes: + if attributes['key'] == 'DC.identifier': + article = attributes['value'] + elif attributes['key'] == 'reader.name': + speaker = attributes['value'] for sentence in aligned.iter('s'): if ignored(sentence): continue @@ -248,7 +260,7 @@ def collect_samples(base_dir, language): sample_start, sample_end, token_texts, sample_texts = None, None, [], [] for token_start, token_end, token_text in tokens: if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text): - add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), + add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts), p_reason='has numbers') sample_start, sample_end, token_texts, sample_texts = None, None, [], [] continue @@ -259,7 +271,7 @@ def collect_samples(base_dir, language): token_texts.append(token_text) if token_end is not None: if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0: - add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), + add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts), p_reason='split') sample_start = sample_end sample_texts = [] @@ -267,7 +279,7 @@ def collect_samples(base_dir, language): sample_end = token_end sample_texts.extend(token_texts) token_texts = [] - add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts), + add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts), p_reason='split' if split else 'complete') print('Skipped samples:') for reason, n in reasons.most_common(): @@ -382,15 +394,19 @@ def write_csvs(samples, language): csv_path = path.join(base_dir, language + '-' + sub_set + '.csv') print('Writing "{}"...'.format(csv_path)) with open(csv_path, 'w') as csv_file: - writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES) + writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES_EXT if CLI_ARGS.add_meta else FIELDNAMES) writer.writeheader() bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR) for sample in bar(set_samples): - writer.writerow({ + row = { 'wav_filename': path.relpath(sample.wav_path, base_dir), 'wav_filesize': path.getsize(sample.wav_path), 'transcript': sample.text - }) + } + if CLI_ARGS.add_meta: + row['article'] = sample.article + row['speaker'] = sample.speaker + writer.writerow(row) def cleanup(archive, language): @@ -428,6 +444,9 @@ def handle_args(): for language in LANGUAGES: parser.add_argument('--{}_alphabet'.format(language), help='Exclude {} samples with characters not in provided alphabet file'.format(language)) + parser.add_argument('--add_meta', action='store_true', help='Adds article and speaker CSV columns') + parser.add_argument('--exclude_unknown_speakers', action='store_true', help='Exclude unknown speakers') + parser.add_argument('--exclude_unknown_articles', action='store_true', help='Exclude unknown articles') parser.add_argument('--keep_archive', type=bool, default=True, help='If downloaded archives should be kept') parser.add_argument('--keep_intermediate', type=bool, default=False,