STT/bin/import_ted.py
2021-06-10 10:49:54 -04:00

165 lines
5.4 KiB
Python
Executable File

#!/usr/bin/env python
import sys
import tarfile
import unicodedata
import wave
from glob import glob
from os import makedirs, path, remove, rmdir
import pandas
from coqui_stt_training.util.downloader import maybe_download
from coqui_stt_training.util.stm import parse_stm_file
from sox import Transformer
from tensorflow.python.platform import gfile
def _download_and_preprocess_data(data_dir):
# Conditionally download data
TED_DATA = "TEDLIUM_release2.tar.gz"
TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
local_file = maybe_download(TED_DATA, data_dir, TED_DATA_URL)
# Conditionally extract TED data
TED_DIR = "TEDLIUM_release2"
_maybe_extract(data_dir, TED_DIR, local_file)
# Conditionally convert TED sph data to wav
_maybe_convert_wav(data_dir, TED_DIR)
# Conditionally split TED wav and text data into sentences
train_files, dev_files, test_files = _maybe_split_sentences(data_dir, TED_DIR)
# Write sets to disk as CSV files
train_files.to_csv(path.join(data_dir, "ted-train.csv"), index=False)
dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
def _maybe_extract(data_dir, extracted_data, archive):
# If data_dir/extracted_data does not exist, extract archive in data_dir
if not gfile.Exists(path.join(data_dir, extracted_data)):
tar = tarfile.open(archive)
tar.extractall(data_dir)
tar.close()
def _maybe_convert_wav(data_dir, extracted_data):
# Create extracted_data dir
extracted_dir = path.join(data_dir, extracted_data)
# Conditionally convert dev sph to wav
_maybe_convert_wav_dataset(extracted_dir, "dev")
# Conditionally convert train sph to wav
_maybe_convert_wav_dataset(extracted_dir, "train")
# Conditionally convert test sph to wav
_maybe_convert_wav_dataset(extracted_dir, "test")
def _maybe_convert_wav_dataset(extracted_dir, data_set):
# Create source dir
source_dir = path.join(extracted_dir, data_set, "sph")
# Create target dir
target_dir = path.join(extracted_dir, data_set, "wav")
# Conditionally convert sph files to wav files
if not gfile.Exists(target_dir):
# Create target_dir
makedirs(target_dir)
# Loop over sph files in source_dir and convert each to wav
for sph_file in glob(path.join(source_dir, "*.sph")):
transformer = Transformer()
wav_filename = path.splitext(path.basename(sph_file))[0] + ".wav"
wav_file = path.join(target_dir, wav_filename)
transformer.build(sph_file, wav_file)
remove(sph_file)
# Remove source_dir
rmdir(source_dir)
def _maybe_split_sentences(data_dir, extracted_data):
# Create extracted_data dir
extracted_dir = path.join(data_dir, extracted_data)
# Conditionally split dev wav
dev_files = _maybe_split_dataset(extracted_dir, "dev")
# Conditionally split train wav
train_files = _maybe_split_dataset(extracted_dir, "train")
# Conditionally split test wav
test_files = _maybe_split_dataset(extracted_dir, "test")
return train_files, dev_files, test_files
def _maybe_split_dataset(extracted_dir, data_set):
# Create stm dir
stm_dir = path.join(extracted_dir, data_set, "stm")
# Create wav dir
wav_dir = path.join(extracted_dir, data_set, "wav")
files = []
# Loop over stm files and split corresponding wav
for stm_file in glob(path.join(stm_dir, "*.stm")):
# Parse stm file
stm_segments = parse_stm_file(stm_file)
# Open wav corresponding to stm_file
wav_filename = path.splitext(path.basename(stm_file))[0] + ".wav"
wav_file = path.join(wav_dir, wav_filename)
origAudio = wave.open(wav_file, "r")
# Loop over stm_segments and split wav_file for each segment
for stm_segment in stm_segments:
# Create wav segment filename
start_time = stm_segment.start_time
stop_time = stm_segment.stop_time
new_wav_filename = (
path.splitext(path.basename(stm_file))[0]
+ "-"
+ str(start_time)
+ "-"
+ str(stop_time)
+ ".wav"
)
new_wav_file = path.join(wav_dir, new_wav_filename)
# If the wav segment filename does not exist create it
if not gfile.Exists(new_wav_file):
_split_wav(origAudio, start_time, stop_time, new_wav_file)
new_wav_filesize = path.getsize(new_wav_file)
files.append(
(path.abspath(new_wav_file), new_wav_filesize, stm_segment.transcript)
)
# Close origAudio
origAudio.close()
return pandas.DataFrame(
data=files, columns=["wav_filename", "wav_filesize", "transcript"]
)
def _split_wav(origAudio, start_time, stop_time, new_wav_file):
frameRate = origAudio.getframerate()
origAudio.setpos(int(start_time * frameRate))
chunkData = origAudio.readframes(int((stop_time - start_time) * frameRate))
chunkAudio = wave.open(new_wav_file, "w")
chunkAudio.setnchannels(origAudio.getnchannels())
chunkAudio.setsampwidth(origAudio.getsampwidth())
chunkAudio.setframerate(frameRate)
chunkAudio.writeframes(chunkData)
chunkAudio.close()
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])