Merge pull request #2625 from tilmankamp/swc_debug
Implements #2624 - SWC importer: CSV columns for article and speaker
This commit is contained in:
commit
242d70dc8c
@ -34,8 +34,10 @@ SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.ta
|
|||||||
SWC_ARCHIVE = "SWC_{language}.tar"
|
SWC_ARCHIVE = "SWC_{language}.tar"
|
||||||
LANGUAGES = ['dutch', 'english', 'german']
|
LANGUAGES = ['dutch', 'english', 'german']
|
||||||
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
|
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
|
||||||
|
FIELDNAMES_EXT = FIELDNAMES + ['article', 'speaker']
|
||||||
CHANNELS = 1
|
CHANNELS = 1
|
||||||
SAMPLE_RATE = 16000
|
SAMPLE_RATE = 16000
|
||||||
|
UNKNOWN = '<unknown>'
|
||||||
AUDIO_PATTERN = 'audio*.ogg'
|
AUDIO_PATTERN = 'audio*.ogg'
|
||||||
WAV_NAME = 'audio.wav'
|
WAV_NAME = 'audio.wav'
|
||||||
ALIGNED_NAME = 'aligned.swc'
|
ALIGNED_NAME = 'aligned.swc'
|
||||||
@ -65,11 +67,12 @@ PRE_FILTER = str.maketrans(dict.fromkeys('/()[]{}<>:'))
|
|||||||
|
|
||||||
|
|
||||||
class Sample:
|
class Sample:
|
||||||
def __init__(self, wav_path, start, end, text, speaker, sub_set=None):
|
def __init__(self, wav_path, start, end, text, article, speaker, sub_set=None):
|
||||||
self.wav_path = wav_path
|
self.wav_path = wav_path
|
||||||
self.start = start
|
self.start = start
|
||||||
self.end = end
|
self.end = end
|
||||||
self.text = text
|
self.text = text
|
||||||
|
self.article = article
|
||||||
self.speaker = speaker
|
self.speaker = speaker
|
||||||
self.sub_set = sub_set
|
self.sub_set = sub_set
|
||||||
|
|
||||||
@ -203,7 +206,7 @@ def collect_samples(base_dir, language):
|
|||||||
samples = []
|
samples = []
|
||||||
reasons = Counter()
|
reasons = Counter()
|
||||||
|
|
||||||
def add_sample(p_wav_path, p_speaker, p_start, p_end, p_text, p_reason='complete'):
|
def add_sample(p_wav_path, p_article, p_speaker, p_start, p_end, p_text, p_reason='complete'):
|
||||||
if p_start is not None and p_end is not None and p_text is not None:
|
if p_start is not None and p_end is not None and p_text is not None:
|
||||||
duration = p_end - p_start
|
duration = p_end - p_start
|
||||||
text, filter_reason = label_filter(p_text, language)
|
text, filter_reason = label_filter(p_text, language)
|
||||||
@ -211,6 +214,12 @@ def collect_samples(base_dir, language):
|
|||||||
if filter_reason is not None:
|
if filter_reason is not None:
|
||||||
skip = True
|
skip = True
|
||||||
p_reason = filter_reason
|
p_reason = filter_reason
|
||||||
|
elif CLI_ARGS.exclude_unknown_speakers and p_speaker == UNKNOWN:
|
||||||
|
skip = True
|
||||||
|
p_reason = 'unknown speaker'
|
||||||
|
elif CLI_ARGS.exclude_unknown_articles and p_article == UNKNOWN:
|
||||||
|
skip = True
|
||||||
|
p_reason = 'unknown article'
|
||||||
elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long:
|
elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long:
|
||||||
skip = True
|
skip = True
|
||||||
p_reason = 'exceeded duration'
|
p_reason = 'exceeded duration'
|
||||||
@ -223,7 +232,7 @@ def collect_samples(base_dir, language):
|
|||||||
if skip:
|
if skip:
|
||||||
reasons[p_reason] += 1
|
reasons[p_reason] += 1
|
||||||
else:
|
else:
|
||||||
samples.append(Sample(p_wav_path, p_start, p_end, text, p_speaker))
|
samples.append(Sample(p_wav_path, p_start, p_end, text, p_article, p_speaker))
|
||||||
elif p_start is None or p_end is None:
|
elif p_start is None or p_end is None:
|
||||||
reasons['missing timestamps'] += 1
|
reasons['missing timestamps'] += 1
|
||||||
else:
|
else:
|
||||||
@ -234,12 +243,15 @@ def collect_samples(base_dir, language):
|
|||||||
for root in bar(roots):
|
for root in bar(roots):
|
||||||
wav_path = path.join(root, WAV_NAME)
|
wav_path = path.join(root, WAV_NAME)
|
||||||
aligned = ET.parse(path.join(root, ALIGNED_NAME))
|
aligned = ET.parse(path.join(root, ALIGNED_NAME))
|
||||||
speaker = '<unknown>'
|
article = UNKNOWN
|
||||||
|
speaker = UNKNOWN
|
||||||
for prop in aligned.iter('prop'):
|
for prop in aligned.iter('prop'):
|
||||||
attributes = prop.attrib
|
attributes = prop.attrib
|
||||||
if 'key' in attributes and 'value' in attributes and attributes['key'] == 'reader.name':
|
if 'key' in attributes and 'value' in attributes:
|
||||||
speaker = attributes['value']
|
if attributes['key'] == 'DC.identifier':
|
||||||
break
|
article = attributes['value']
|
||||||
|
elif attributes['key'] == 'reader.name':
|
||||||
|
speaker = attributes['value']
|
||||||
for sentence in aligned.iter('s'):
|
for sentence in aligned.iter('s'):
|
||||||
if ignored(sentence):
|
if ignored(sentence):
|
||||||
continue
|
continue
|
||||||
@ -248,7 +260,7 @@ def collect_samples(base_dir, language):
|
|||||||
sample_start, sample_end, token_texts, sample_texts = None, None, [], []
|
sample_start, sample_end, token_texts, sample_texts = None, None, [], []
|
||||||
for token_start, token_end, token_text in tokens:
|
for token_start, token_end, token_text in tokens:
|
||||||
if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text):
|
if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text):
|
||||||
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
||||||
p_reason='has numbers')
|
p_reason='has numbers')
|
||||||
sample_start, sample_end, token_texts, sample_texts = None, None, [], []
|
sample_start, sample_end, token_texts, sample_texts = None, None, [], []
|
||||||
continue
|
continue
|
||||||
@ -259,7 +271,7 @@ def collect_samples(base_dir, language):
|
|||||||
token_texts.append(token_text)
|
token_texts.append(token_text)
|
||||||
if token_end is not None:
|
if token_end is not None:
|
||||||
if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0:
|
if token_start != sample_start and token_end - sample_start > CLI_ARGS.max_duration > 0:
|
||||||
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
||||||
p_reason='split')
|
p_reason='split')
|
||||||
sample_start = sample_end
|
sample_start = sample_end
|
||||||
sample_texts = []
|
sample_texts = []
|
||||||
@ -267,7 +279,7 @@ def collect_samples(base_dir, language):
|
|||||||
sample_end = token_end
|
sample_end = token_end
|
||||||
sample_texts.extend(token_texts)
|
sample_texts.extend(token_texts)
|
||||||
token_texts = []
|
token_texts = []
|
||||||
add_sample(wav_path, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
add_sample(wav_path, article, speaker, sample_start, sample_end, ' '.join(sample_texts),
|
||||||
p_reason='split' if split else 'complete')
|
p_reason='split' if split else 'complete')
|
||||||
print('Skipped samples:')
|
print('Skipped samples:')
|
||||||
for reason, n in reasons.most_common():
|
for reason, n in reasons.most_common():
|
||||||
@ -382,15 +394,19 @@ def write_csvs(samples, language):
|
|||||||
csv_path = path.join(base_dir, language + '-' + sub_set + '.csv')
|
csv_path = path.join(base_dir, language + '-' + sub_set + '.csv')
|
||||||
print('Writing "{}"...'.format(csv_path))
|
print('Writing "{}"...'.format(csv_path))
|
||||||
with open(csv_path, 'w') as csv_file:
|
with open(csv_path, 'w') as csv_file:
|
||||||
writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
|
writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES_EXT if CLI_ARGS.add_meta else FIELDNAMES)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR)
|
bar = progressbar.ProgressBar(max_value=len(set_samples), widgets=SIMPLE_BAR)
|
||||||
for sample in bar(set_samples):
|
for sample in bar(set_samples):
|
||||||
writer.writerow({
|
row = {
|
||||||
'wav_filename': path.relpath(sample.wav_path, base_dir),
|
'wav_filename': path.relpath(sample.wav_path, base_dir),
|
||||||
'wav_filesize': path.getsize(sample.wav_path),
|
'wav_filesize': path.getsize(sample.wav_path),
|
||||||
'transcript': sample.text
|
'transcript': sample.text
|
||||||
})
|
}
|
||||||
|
if CLI_ARGS.add_meta:
|
||||||
|
row['article'] = sample.article
|
||||||
|
row['speaker'] = sample.speaker
|
||||||
|
writer.writerow(row)
|
||||||
|
|
||||||
|
|
||||||
def cleanup(archive, language):
|
def cleanup(archive, language):
|
||||||
@ -428,6 +444,9 @@ def handle_args():
|
|||||||
for language in LANGUAGES:
|
for language in LANGUAGES:
|
||||||
parser.add_argument('--{}_alphabet'.format(language),
|
parser.add_argument('--{}_alphabet'.format(language),
|
||||||
help='Exclude {} samples with characters not in provided alphabet file'.format(language))
|
help='Exclude {} samples with characters not in provided alphabet file'.format(language))
|
||||||
|
parser.add_argument('--add_meta', action='store_true', help='Adds article and speaker CSV columns')
|
||||||
|
parser.add_argument('--exclude_unknown_speakers', action='store_true', help='Exclude unknown speakers')
|
||||||
|
parser.add_argument('--exclude_unknown_articles', action='store_true', help='Exclude unknown articles')
|
||||||
parser.add_argument('--keep_archive', type=bool, default=True,
|
parser.add_argument('--keep_archive', type=bool, default=True,
|
||||||
help='If downloaded archives should be kept')
|
help='If downloaded archives should be kept')
|
||||||
parser.add_argument('--keep_intermediate', type=bool, default=False,
|
parser.add_argument('--keep_intermediate', type=bool, default=False,
|
||||||
|
Loading…
Reference in New Issue
Block a user