#!/usr/bin/env python
"""Importer for the English subset of Multilingual LibriSpeech (MLS).

For every processed subset this script reads
``<root_dir>/<subset>/transcripts.txt`` (one ``<audio_id>\\t<transcript>``
entry per line), looks up the matching ``.opus`` file and writes
``<root_dir>/<subset>.csv`` with the columns ``wav_filename`` (path relative
to ``root_dir``), ``wav_filesize`` (bytes) and ``transcript``.

Provenance: recovered from the patch "Add importer for English subset of
Multilingual LibriSpeech" (commit 4d764c05591248e363d33086cd04a0c7ad6a5a6f,
Reuben Morais, 2021-04-06), which creates ``bin/import_mls_english.py``.
"""
import argparse
import ctypes
import os
from pathlib import Path

import pandas

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is purely cosmetic

    def tqdm(iterable, *args, **kwargs):
        """Stand-in for ``tqdm.tqdm``: iterate without a progress bar."""
        return iterable


# Opus files are always 48kHz.
OPUS_SAMPLE_RATE = 48000

# Characters stripped from / replaced in transcripts. Built once at import
# time instead of once per transcript line.
# TODO: support other languages
_STRAY_CYRILLIC = (
    "а",
    "в",
    "е",
    "и",
    "к",
    "м",
    "н",
    "о",
    "п",
    "р",
    "т",
    "ы",
    "я",
)
_TRANSCRIPT_TABLE = {ord(ch): None for ch in _STRAY_CYRILLIC}
_TRANSCRIPT_TABLE[ord(".")] = None
_TRANSCRIPT_TABLE[ord("-")] = " "


def read_ogg_opus_duration(ogg_file_path):
    """Return the duration, in seconds, of an Ogg/Opus file.

    Args:
        ogg_file_path: Path of the file, as ``str`` or ``os.PathLike``.

    Returns:
        Total PCM sample count reported by opusfile divided by the 48 kHz
        Opus sample rate.

    Raises:
        ValueError: If opusfile reports a non-zero error code on open.
    """
    # Imported here because this is the only function that needs PyOgg; the
    # rest of the importer works without it being installed.
    import pyogg

    error = ctypes.c_int()
    opusfile = pyogg.opus.op_open_file(
        os.fspath(ogg_file_path).encode("utf-8"), ctypes.pointer(error)
    )

    if error.value != 0:
        raise ValueError(
            "Ogg/Opus file could not be read. Error code: {}".format(error.value)
        )

    try:
        pcm_total = pyogg.opus.op_pcm_total(opusfile, -1)
    finally:
        # Always release the native handle, even if the query above fails.
        pyogg.opus.op_free(opusfile)
    return pcm_total / OPUS_SAMPLE_RATE


def _normalize_transcript(transcript):
    """Return *transcript* with hyphens, periods, "ñ" and stray Cyrillic cleaned up."""
    cleaned = transcript.strip().replace("ñ", "n").translate(_TRANSCRIPT_TABLE)
    # Removing characters can expose new leading/trailing whitespace.
    return cleaned.strip()


def _audio_path(root, subset, audio_id):
    """Return the ``.opus`` path for *audio_id* inside *subset*."""
    # e.g. 4800_10003_000000 -> train/audio/4800/10003/4800_10003_000000.opus
    id_parts = audio_id.split("_")
    return (
        root / subset / "audio" / id_parts[0] / id_parts[1]
        / "{}.opus".format(audio_id)
    )


def main(root_dir, subsets=("test",)):
    """Write one ``<subset>.csv`` index into *root_dir* per subset.

    Args:
        root_dir: Path to the mls_english_opus directory.
        subsets: Names of the subset directories to process. Defaults to
            ``("test",)``.

    Raises:
        ValueError: If a non-blank transcript line has no tab separator.
        OSError: If a transcript file or a referenced audio file is missing.
    """
    root = Path(root_dir)
    for subset in subsets:
        print("Processing {} subset...".format(subset))
        subset_entries = []
        # Transcripts contain non-ASCII characters, so do not rely on the
        # platform's default encoding.
        with open(root / subset / "transcripts.txt", encoding="utf-8") as fin:
            for line in tqdm(fin):
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                # Split on the first tab only: the transcript may contain tabs.
                audio_id, transcript = line.split("\t", 1)
                audio_path = _audio_path(root, subset, audio_id)
                subset_entries.append(
                    (
                        audio_path.relative_to(root),
                        os.path.getsize(audio_path),
                        _normalize_transcript(transcript),
                    )
                )
        df = pandas.DataFrame(
            columns=["wav_filename", "wav_filesize", "transcript"],
            data=subset_entries,
        )
        csv_name = root / "{}.csv".format(subset)
        df.to_csv(csv_name, index=False)
        print("Wrote {}".format(csv_name))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir", help="Path to the mls_english_opus directory.")
    parser.add_argument(
        "--subsets",
        nargs="+",
        default=["test"],
        help="Subset directories under root_dir to import (default: test).",
    )
    args = parser.parse_args()
    main(args.root_dir, args.subsets)