Merge pull request #2625 from tilmankamp/swc_debug

Implements #2624 - SWC importer: CSV columns for article and speaker
2020-01-02 13:19:05 +01:00 · 2020-01-02 13:19:05 +01:00 · 242d70dc8c
commit 242d70dc8c
parent 6fa2babdfd 259a60b7b1
1 changed file with 32 additions and 13 deletions
--- a/bin/import_swc.py
+++ b/bin/import_swc.py
@@ -34,8 +34,10 @@ SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.ta
 SWC_ARCHIVE = "SWC_{language}.tar"
 LANGUAGES = ['dutch', 'english', 'german']
 FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
+FIELDNAMES_EXT = FIELDNAMES + ['article', 'speaker']
 CHANNELS = 1
 SAMPLE_RATE = 16000
+UNKNOWN = '<unknown>'
 AUDIO_PATTERN = 'audio*.ogg'
 WAV_NAME = 'audio.wav'
 ALIGNED_NAME = 'aligned.swc'
@@ -65,11 +67,12 @@ PRE_FILTER = str.maketrans(dict.fromkeys('/()[]{}<>:'))


 class Sample:
-    def __init__(self, wav_path, start, end, text, speaker, sub_set=None):
+    def __init__(self, wav_path, start, end, text, article, speaker, sub_set=None):
        self.wav_path = wav_path
        self.start = start
        self.end = end
        self.text = text
+        self.article = article
        self.speaker = speaker
        self.sub_set = sub_set

@@ -203,7 +206,7 @@ def collect_samples(base_dir, language):
    samples = []
    reasons = Counter()

-    def add_sample(p_wav_path, p_speaker, p_start, p_end, p_text, p_reason='complete'):
+    def add_sample(p_wav_path, p_article, p_speaker, p_start, p_end, p_text, p_reason='complete'):
        if p_start is not None and p_end is not None and p_text is not None:
            duration = p_end - p_start
            text, filter_reason = label_filter(p_text, language)
@@ -211,6 +214,12 @@ def collect_samples(base_dir, language):
            if filter_reason is not None:
                skip = True
                p_reason = filter_reason
+            elif CLI_ARGS.exclude_unknown_speakers and p_speaker == UNKNOWN:
+                skip = True
+                p_reason = 'unknown speaker'
+            elif CLI_ARGS.exclude_unknown_articles and p_article == UNKNOWN:
+                skip = True
+                p_reason = 'unknown article'
            elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long:
                skip = True
                p_reason = 'exceeded duration'
@@ -223,7 +232,7 @@ def collect_samples(base_dir, language):
            if skip:
                reasons[p_reason] += 1
            else:
-                samples.append(Sample(p_wav_path, p_start, p_end, text, p_speaker))
+                samples.append(Sample(p_wav_path, p_start, p_end, text, p_article, p_speaker))
        elif p_start is None or p_end is None:
            reasons['missing timestamps'] += 1
        else:
@@ -234,12 +243,15 @@ def collect_samples(base_dir, language):
    for root in bar(roots):
        wav_path = path.join(root, WAV_NAME)
        aligned = ET.parse(path.join(root, ALIGNED_NAME))
-        speaker = '<unknown>'
+        article = UNKNOWN
+        speaker = UNKNOWN
        for prop in aligned.iter('prop'):
            attributes = prop.attrib
-            if 'key' in attributes and 'value' in attributes and attributes['key'] == 'reader.name':
-                speaker = attributes['value']
-                break
+            if 'key' in attributes and 'value' in attributes:
+                if attributes['key'] == 'DC.identifier':
+                    article = attributes['value']
+                elif attributes['key'] == 'reader.name':
+                    speaker = attributes['value']
        for sentence in aligned.iter('s'):
            if ignored(sentence):
                continue
@@ -248,7 +260,7 @@ def collect_samples(base_dir, language):
            sample_start, sample_end, token_texts, sample_texts = None, None, [], []
            for token_start, token_end, token_text in tokens:
                if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text):
-                    add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
+                    add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
                               p_reason='has numbers')
                    sample_start, sample_end, token_texts, sample_texts = None, None, [], []
                    continue
@@ -259,7 +271,7 @@ def collect_samples(base_dir, language):
                token_texts.append(token_text)
                if token_end is not None:
                    if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0:
-                        add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
+                        add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
                                   p_reason='split')
                        sample_start = sample_end
                        sample_texts = []
@@ -267,7 +279,7 @@ def collect_samples(base_dir, language):
                    sample_end = token_end
                    sample_texts.extend(token_texts)
                    token_texts = []
-            add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
+            add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
                       p_reason='split' if split else 'complete')
    print('Skipped samples:')
    for reason, n in reasons.most_common():
@@ -382,15 +394,19 @@ def write_csvs(samples, language):
        csv_path = path.join(base_dir, language + '-' + sub_set + '.csv')
        print('Writing "{}"...'.format(csv_path))
        with open(csv_path, 'w') as csv_file:
-            writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
+            writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES_EXT if CLI_ARGS.add_meta else FIELDNAMES)
            writer.writeheader()
            bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR)
            for sample in bar(set_samples):
-                writer.writerow({
+                row = {
                    'wav_filename': path.relpath(sample.wav_path, base_dir),
                    'wav_filesize': path.getsize(sample.wav_path),
                    'transcript': sample.text
-                })
+                }
+                if CLI_ARGS.add_meta:
+                    row['article'] = sample.article
+                    row['speaker'] = sample.speaker
+                writer.writerow(row)


 def cleanup(archive, language):
@@ -428,6 +444,9 @@ def handle_args():
    for language in LANGUAGES:
        parser.add_argument('--{}_alphabet'.format(language),
                            help='Exclude {} samples with characters not in provided alphabet file'.format(language))
+    parser.add_argument('--add_meta', action='store_true', help='Adds article and speaker CSV columns')
+    parser.add_argument('--exclude_unknown_speakers', action='store_true', help='Exclude unknown speakers')
+    parser.add_argument('--exclude_unknown_articles', action='store_true', help='Exclude unknown articles')
    parser.add_argument('--keep_archive', type=bool, default=True,
                        help='If downloaded archives should be kept')
    parser.add_argument('--keep_intermediate', type=bool, default=False,