751 lines
26 KiB
Python
Executable File
751 lines
26 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
|
|
Importer for dataset published from Centre de Conférence Pierre Mendès-France
|
|
Ministère de l'Économie, des Finances et de la Relance
|
|
"""
|
|
|
|
import csv
|
|
import decimal
|
|
import hashlib
|
|
import math
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import unicodedata
|
|
import xml.etree.ElementTree as ET
|
|
import zipfile
|
|
from glob import glob
|
|
from multiprocessing import Pool
|
|
|
|
import progressbar
|
|
import sox
|
|
|
|
try:
|
|
from num2words import num2words
|
|
except ImportError as ex:
|
|
print("pip install num2words")
|
|
sys.exit(1)
|
|
|
|
import json
|
|
|
|
import requests
|
|
from coqui_stt_ctcdecoder import Alphabet
|
|
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
|
from coqui_stt_training.util.helpers import secs_to_hours
|
|
from coqui_stt_training.util.importers import (
|
|
get_counter,
|
|
get_imported_samples,
|
|
get_importers_parser,
|
|
get_validate_label,
|
|
print_import_report,
|
|
)
|
|
|
|
# Column order shared by every CSV file this importer writes.
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]

# Target format handed to sox when the source MP3 files are converted to WAV.
SAMPLE_RATE = 16000
CHANNELS = 1
BIT_DEPTH = 16

# Bounds, in seconds, compared against ``frames / SAMPLE_RATE`` of each clip.
MIN_SECS = 0.85
MAX_SECS = 10
|
|
|
|
# CSV export of the dataset page; it is fetched to discover the archive parts.
DATASET_RELEASE_CSV = (
    "https://data.economie.gouv.fr/explore/dataset/"
    "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020/download/"
    "?format=csv&timezone=Europe/Berlin&lang=fr"
    "&use_labels_for_header=true&csv_separator=%3B"
)

# File name pattern of the split archive parts (suffixes .001 to .040).
_PART_NAME = "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.{:03d}"

# Expected SHA1 of each part, in part order.
_PART_SHA1 = [
    "863d39a06a388c6491c6ff2f6450b151f38f1b57",  # .001
    "2f3a0305aa04c61220bb00b5a4e553e45dbf12e1",  # .002
    "5e55e9f1f844097349188ac875947e5a3d7fe9f1",  # .003
    "8bf54842cf07948ca5915e27a8bd5fa5139c06ae",  # .004
    "c8963504aadc015ac48f9af80058a0bb3440b94f",  # .005
    "d95e225e908621d83ce4e9795fd108d9d310e244",  # .006
    "de6ed9c2b0ee80ca879aae8ba7923cc93217d811",  # .007
    "234283c47dacfcd4450d836c52c25f3e807fc5f2",  # .008
    "4e6b67a688639bb72f8cd81782eaba604a8d32a6",  # .009
    "4165a51389777c8af8e6253d87bdacb877e8b3b0",  # .010
    "34322e7009780d97ef5bd02bf2f2c7a31f00baff",  # .011
    "48c5be3b2ca9d6108d525da6a03e91d93a95dbac",  # .012
    "87573172f506a189c2ebc633856fe11a2e9cd213",  # .013
    "6ab2c9e508e9278d5129f023e018725c4a7c69e8",  # .014
    "4f84df831ef46dce5d3ab3e21817687a2d8c12d0",  # .015
    "e69bfb079885c299cb81080ef88b1b8b57158aa6",  # .016
    "5f764ba788ee273981cf211b242c29b49ca22c5e",  # .017
    "b6aa81a959525363223494830c1e7307d4c4bae6",  # .018
    "91ddcf43c7bf113a6f2528b857c7ec22a50a148a",  # .019
    "fa1b29273dd77b9a7494983a2f9ae52654b931d7",  # .020
    "1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f",  # .021
    "ce6420d5d0b6b5135ba559f83e1a82d4d615c470",  # .022
    "d0976ed292ac24fcf1590d1ea195077c74b05471",  # .023
    "ec746cd6af066f62d9bf8d3b2f89174783ff4e3c",  # .024
    "570d9e1e84178e32fd867171d4b3aaecda1fd4fb",  # .025
    "c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac",  # .026
    "08406a51146d88e208704ce058c060a1e44efa50",  # .027
    "199aedad733a78ea1e7d47def9c71c6fd5795e02",  # .028
    "db856a068f92fb4f01f410bba42c7271de0f231a",  # .029
    "e3c0135f16c6c9d25a09dcb4f99a685438a84740",  # .030
    "e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac",  # .031
    "be5e80cbc49b59b31ae33c30576ef0e1a162d84e",  # .032
    "501df58e3ff55fcfd75b93dab57566dc536948b8",  # .033
    "1a114875811a8cdcb8d85a9f6dbee78be3e05131",  # .034
    "465d824e7ee46448369182c0c28646d155a2249b",  # .035
    "37f341b1b266d143eb73138c31cfff3201b9d619",  # .036
    "9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694",  # .037
    "54886755630cb080a53098cb1b6c951c6714a143",  # .038
    "4b7cbb0154697be795034f7a49712e882a97197a",  # .039
    "c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2",  # .040
]

# List of (expected SHA1, part file name) tuples, one per archive part.
DATASET_RELEASE_SHA = [
    (digest, _PART_NAME.format(number))
    for number, digest in enumerate(_PART_SHA1, start=1)
]
|
|
|
|
|
|
def _download_and_preprocess_data(csv_url, target_dir):
    """Download, verify and unpack the dataset, then build its source index.

    Nothing is downloaded when the ``data.txt`` index already exists inside
    the extracted dataset directory.

    Args:
        csv_url: URL of the release CSV; each kept line is used as the URL of
            one archive part.
        target_dir: Directory receiving the downloaded parts and the
            extracted dataset.

    Returns:
        Path of the ``data.txt`` index inside the extracted dataset directory.

    Raises:
        requests.HTTPError: If the release CSV request returns an error status.
        RuntimeError: If a part URL exposes no ``Content-Disposition`` header,
            or if a downloaded part does not match its expected SHA1.
    """
    dataset_dirname = "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020"
    dataset_sources = os.path.join(target_dir, dataset_dirname, "data.txt")
    if os.path.exists(dataset_sources):
        return dataset_sources

    # Making path absolute
    target_dir = os.path.abspath(target_dir)

    response = requests.get(csv_url)
    # Fail here rather than trying to use an error page as a list of URLs.
    response.raise_for_status()
    # The first and last CRLF-separated entries are dropped (presumably the
    # header line and the empty string after the final line break).
    csv_ref = response.text.split("\r\n")[1:-1]
    for part in csv_ref:
        disposition = requests.head(part).headers.get("Content-Disposition")
        if disposition is None:
            raise RuntimeError(
                "No Content-Disposition header for {}: cannot name the part".format(
                    part
                )
            )
        part_filename = disposition.split(" ")[1].split("=")[1].replace('"', "")
        if not os.path.exists(os.path.join(target_dir, part_filename)):
            maybe_download(part_filename, target_dir, part)

    def _big_sha1(fname):
        """Return the SHA1 hex digest of ``fname``, read in 64 KiB chunks."""
        digest = hashlib.sha1()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                digest.update(chunk)
        return digest.hexdigest()

    for sha1, filename in DATASET_RELEASE_SHA:
        print("Checking {} SHA1:".format(filename))
        csum = _big_sha1(os.path.join(target_dir, filename))
        if csum != sha1:
            print("\t{}: ERROR: expected {}, computed {}".format(filename, sha1, csum))
            # Explicit raise: an ``assert`` would be stripped under ``python -O``
            # and let a corrupted part through.
            raise RuntimeError(
                "SHA1 mismatch for {}: expected {}, computed {}".format(
                    filename, sha1, csum
                )
            )
        print("\t{}: OK {}".format(filename, sha1))

    # Conditionally extract data
    _maybe_extract(
        target_dir,
        dataset_dirname,
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip",
        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip",
    )

    # Produce source text for extraction / conversion
    return _maybe_create_sources(os.path.join(target_dir, dataset_dirname))
|
|
|
|
|
|
def _maybe_extract(target_dir, extracted_data, archive, final):
    """Extract the dataset unless ``target_dir/extracted_data`` already exists.

    When ``target_dir/archive`` is missing it is first rebuilt by
    concatenating, in sorted order, every file matching ``archive + ".*"``.
    The archive is extracted into ``target_dir/extracted_data``; the nested
    archive ``final`` found there is then extracted into ``target_dir``.

    Args:
        target_dir: Directory holding the archive (or its parts).
        extracted_data: Name of the directory the outer archive is extracted to.
        archive: File name of the outer zip archive.
        final: File name of the nested zip archive inside ``extracted_data``.

    Raises:
        FileNotFoundError: If the archive is missing and no part is found.
    """
    import shutil  # only needed here, to rebuild the archive from its parts

    extracted_path = os.path.join(target_dir, extracted_data)
    archive_path = os.path.join(target_dir, archive)
    final_archive = os.path.join(extracted_path, final)

    if os.path.exists(extracted_path):
        print('Found directory "%s" - not extracting it from archive.' % extracted_path)
        return

    if not os.path.exists(archive_path):
        print('No archive "%s" - building ...' % archive_path)
        all_zip_parts = sorted(glob(archive_path + ".*"))
        if not all_zip_parts:
            raise FileNotFoundError(
                'No archive parts matching "%s.*"' % archive_path
            )
        print('Building from parts "%s"' % " ".join(all_zip_parts))
        # Concatenate in Python rather than through ``cat`` in a shell: paths
        # with spaces or shell metacharacters are handled correctly.
        try:
            with open(archive_path, "wb") as joined:
                for part in all_zip_parts:
                    with open(part, "rb") as chunk:
                        shutil.copyfileobj(chunk, joined)
        except BaseException:
            # Do not leave a truncated archive that a later run would trust.
            if os.path.exists(archive_path):
                os.remove(archive_path)
            raise

    print(
        'No directory "%s" - extracting archive %s ...'
        % (extracted_path, archive_path)
    )
    with zipfile.ZipFile(archive_path) as zip_f:
        zip_f.extractall(extracted_path)

    with zipfile.ZipFile(final_archive) as zip_f:
        zip_f.extractall(target_dir)
|
|
|
|
|
|
def _maybe_create_sources(dir):
    """Pair MP3 and XML files by name and write the pairing to ``data.txt``.

    File stems are split on ``_``. Two files are candidates when their first
    component is equal; the score of a candidate pair is the number of
    positions holding equal components. Candidates are visited by decreasing
    score and each MP3 keeps the first XML it meets. One line
    ``xml,mp3,2.50e-04`` (paths relative to ``dir``) is written per pair whose
    two files are non-empty.

    Args:
        dir: Dataset root, searched with the pattern ``dir/**/*.<ext>``.

    Returns:
        Path of the written ``data.txt`` file.
    """
    dataset_sources = os.path.join(dir, "data.txt")
    unmatched_mp3 = glob(os.path.join(dir, "**", "*.mp3"))
    xml_paths = glob(os.path.join(dir, "**", "*.xml"))

    def name_parts(path):
        # "a/b/20200101_foo_bar.ext" -> ["20200101", "foo", "bar"]
        return os.path.splitext(os.path.basename(path))[0].split("_")

    xml_candidates = [(path, name_parts(path)) for path in xml_paths]

    scored_pairs = []
    for mp3_path in unmatched_mp3:
        mp3_parts = name_parts(mp3_path)
        for xml_path, xml_parts in xml_candidates:
            if mp3_parts[0] != xml_parts[0]:
                continue
            common = sum(1 for m, x in zip(mp3_parts, xml_parts) if m == x)
            if common >= 1:
                scored_pairs.append((mp3_path, xml_path, common))

    # Best scores first; the sort is stable, so ties keep discovery order.
    scored_pairs.sort(key=lambda pair: pair[2], reverse=True)

    pairs_by_score = {}
    for mp3_path, xml_path, common in scored_pairs:
        bucket = pairs_by_score.setdefault(common, {})
        if mp3_path in bucket:
            print("here:", bucket[mp3_path], xml_path, file=sys.stderr)
        elif mp3_path in unmatched_mp3:
            # First (best) candidate for this MP3: claim it.
            unmatched_mp3.remove(mp3_path)
            bucket[mp3_path] = xml_path

    with open(dataset_sources, "w") as ds:
        for bucket in pairs_by_score.values():
            for mp3_path, xml_path in bucket.items():
                if os.path.getsize(mp3_path) > 0 and os.path.getsize(xml_path) > 0:
                    ds.write(
                        "{},{},{:0.2e}\n".format(
                            os.path.relpath(xml_path, dir),
                            os.path.relpath(mp3_path, dir),
                            2.5e-4,
                        )
                    )
                else:
                    print(
                        "Empty file {} or {}".format(mp3_path, xml_path),
                        file=sys.stderr,
                    )

    print("Missing XML pairs:", unmatched_mp3, file=sys.stderr)
    return dataset_sources
|
|
|
|
|
|
def maybe_normalize_for_digits(label):
    """Rewrite the numbers found in ``label`` as French words.

    Three passes are applied:

    1. digit groups separated by whitespace ("50 000") are glued together;
    2. "hh:mm" and "hh:mm:ss" patterns are expanded to
       "hh heures et mm minutes" / "hh heures mm minutes et ss secondes";
    3. every space-separated token holding a digit is passed to ``num2words``.
       Tokens mixing letters and digits (like "3D") are converted one
       character at a time, each digit becoming its word followed by "-".

    Args:
        label: Transcript text.

    Returns:
        The transcript with numeric tokens spelled out.

    Raises:
        decimal.InvalidOperation: Re-raised from ``num2words`` after printing
            the offending token.
    """
    # first, try to identify numbers like "50 000", "260 000"
    if " " in label and any(c.isdigit() for c in label):
        replaced = 1
        while replaced:
            # Repeat until stable: "1 000 000" needs more than one pass.
            label, replaced = re.subn(r"(\d)\s(\d{3})", r"\1\2", label)

    # this might be a time or duration in the form "hh:mm" or "hh:mm:ss"
    if ":" in label:
        date_or_time = re.compile(r"(\d{1,2}):(\d{2}):?(\d{2})?")
        for token in label.split(" "):
            if not any(c.isdigit() for c in token):
                continue
            matches = date_or_time.findall(token)
            if not matches:
                continue
            maybe_hours, maybe_minutes, maybe_seconds = matches[0]
            if maybe_seconds:
                label = label.replace(
                    "{}:{}:{}".format(maybe_hours, maybe_minutes, maybe_seconds),
                    "{} heures {} minutes et {} secondes".format(
                        maybe_hours, maybe_minutes, maybe_seconds
                    ),
                )
            else:
                label = label.replace(
                    "{}:{}".format(maybe_hours, maybe_minutes),
                    "{} heures et {} minutes".format(maybe_hours, maybe_minutes),
                )

    new_label = []
    for token in label.split(" "):
        if any(c.isdigit() for c in token):
            token = token.replace(",", ".")  # num2words requires "." for floats
            token = token.replace('"', "")  # num2words would choke on 1959"

            if not token[-1].isdigit():  # num2words will choke on "0.6.", "24 ?"
                token = token[:-1]

            try:
                if any(c.isalpha() for c in token):
                    # Digits **and** letters, like "3D": convert "3" to
                    # "trois-" and keep every other character. The converted
                    # characters are what gets joined (the previous code
                    # joined the untouched token, discarding the conversion).
                    token = "".join(
                        num2words(c, lang="fr") + "-" if c.isdigit() else c
                        for c in token
                    )
                else:
                    token = num2words(token, lang="fr")
            except decimal.InvalidOperation:
                print("decimal.InvalidOperation: '{}'".format(token))
                raise
        new_label.append(token)
    return " ".join(new_label)
|
|
|
|
|
|
def maybe_normalize_for_specials_chars(label):
    """Replace symbols and known broken fragments of ``label`` by plain text.

    The substitutions are applied one after the other, in the order listed,
    so a later entry sees the output of the earlier ones.
    """
    substitutions = (
        ("%", "pourcents"),
        ("/", ", "),  # intervals like 2019/2022
        ("-", ", "),  # intervals like 70-80
        ("+", " plus "),  # make "+" speakable
        ("€", " euros "),  # make the euro symbol speakable
        ("., ", ", "),  # strange "4.0., " (20181017_Innovation.xml)
        ("°", " degré "),  # strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
        ("...", "."),  # ellipsis
        ("..", "."),  # broken ellipsis
        ("m²", "mètre-carrés"),  # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
        ("[end]", ""),  # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
        (u"\xB8c", " ç"),  # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
        ("C0²", "CO 2"),  # 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
    )
    for needle, spoken in substitutions:
        label = label.replace(needle, spoken)
    return label
|
|
|
|
|
|
def maybe_normalize_for_anglicisms(label):
    """Spell out "B2B", "B2C", "#" and "@" in ``label``."""
    for needle, spoken in (
        ("B2B", "B to B"),
        ("B2C", "B to C"),
        ("#", "hashtag "),
        ("@", "at "),
    ):
        label = label.replace(needle, spoken)
    return label
|
|
|
|
|
|
def maybe_normalize(label):
    """Run the special-character, anglicism and digit passes on ``label``, in that order."""
    passes = (
        maybe_normalize_for_specials_chars,
        maybe_normalize_for_anglicisms,
        maybe_normalize_for_digits,
    )
    for normalize in passes:
        label = normalize(label)
    return label
|
|
|
|
|
|
def one_sample(sample):
    """Cut one clip out of the source audio and decide whether to keep it.

    Args:
        sample: Sequence ``(audio_source, target_dir, dataset_basename,
            start_time, duration, label, sample_id)``. The label goes through
            the module-level ``label_filter_fun`` before use.

    Returns:
        A ``(counter, rows)`` tuple. ``counter`` is a fresh counter from
        ``get_counter()`` updated for this single sample. ``rows`` holds one
        ``(relative wav path, file size, label)`` tuple when the sample is
        kept and is empty otherwise.
    """
    audio_source, target_dir, dataset_basename = sample[0], sample[1], sample[2]
    start_time, duration = sample[3], sample[4]
    label = label_filter_fun(sample[5])
    sample_id = sample[6]

    _wav_filename = os.path.basename(
        audio_source.replace(".wav", "_{:06}.wav".format(sample_id))
    )
    wav_fullname = os.path.join(target_dir, dataset_basename, _wav_filename)

    file_size = -1
    frames = 0
    try:
        if not os.path.exists(wav_fullname):
            subprocess.check_output(
                [
                    "ffmpeg",
                    "-i",
                    audio_source,
                    "-ss",
                    str(start_time),
                    "-t",
                    str(duration),
                    "-c",
                    "copy",
                    wav_fullname,
                ],
                stdin=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )

        file_size = os.path.getsize(wav_fullname)
        frames = int(
            subprocess.check_output(
                ["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT
            )
        )
    except (subprocess.CalledProcessError, ValueError):
        # A clip that cannot be cut or measured is counted as "failed" below
        # instead of aborting the whole worker pool. A missing ffmpeg/soxi
        # binary (OSError) is a setup problem and still propagates.
        file_size = -1
        frames = 0

    _counter = get_counter()
    _rows = []

    if file_size == -1:
        # Excluding samples that failed upon conversion
        _counter["failed"] += 1
    elif label is None:
        # Excluding samples that failed on label validation
        _counter["invalid_label"] += 1
    elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
        # Excluding samples that are too short to fit the transcript
        _counter["too_short"] += 1
    elif frames / SAMPLE_RATE < MIN_SECS:
        # Excluding samples that are too short
        _counter["too_short"] += 1
    elif frames / SAMPLE_RATE > MAX_SECS:
        # Excluding very long samples to keep a reasonable batch-size
        _counter["too_long"] += 1
    else:
        # This one is good - keep it for the target CSV
        _rows.append((os.path.join(dataset_basename, _wav_filename), file_size, label))
        _counter["imported_time"] += frames
    _counter["all"] += 1
    _counter["total_time"] += frames

    return (_counter, _rows)
|
|
|
|
|
|
def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
    """Split one recording into clips following its XML transcript.

    Args:
        xml_file: Transcript file; the ``row`` children of its root element
            carry ``timestamp`` and ``timedur`` attributes and the text.
        audio_source: WAV file the clips are cut from.
        target_dir: Directory under which a sub-directory named after
            ``xml_file`` receives the clips.
        rel_tol: Relative tolerance used to decide whether a row directly
            follows the segment being accumulated.

    Returns:
        A ``(counter, rows)`` tuple aggregated over every clip processed by
        ``one_sample``.
    """
    dataset_basename = os.path.splitext(os.path.split(xml_file)[1])[0]
    wav_root = os.path.join(target_dir, dataset_basename)
    if not os.path.exists(wav_root):
        os.makedirs(wav_root)

    source_frames = int(
        subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT)
    )
    print("Source audio length: %s" % secs_to_hours(source_frames / SAMPLE_RATE))

    # Get start time, duration and transcript for each segment of the XML
    samples = _collect_samples(
        ET.parse(xml_file).getroot(),
        audio_source,
        target_dir,
        dataset_basename,
        rel_tol,
    )

    # Keep track of how many samples are good vs. problematic
    _counter = get_counter()
    num_samples = len(samples)
    _rows = []

    print("Processing XML data: {}".format(xml_file))
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    # The context manager terminates the workers if anything raises.
    with Pool() as pool:
        for i, processed in enumerate(
            pool.imap_unordered(one_sample, samples), start=1
        ):
            _counter += processed[0]
            _rows += processed[1]
            bar.update(i)
        bar.update(num_samples)
        pool.close()
        pool.join()

    imported_samples = get_imported_samples(_counter)
    assert _counter["all"] == num_samples
    assert len(_rows) == imported_samples

    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print(
        "Import efficiency: %.1f%%" % ((_counter["total_time"] / source_frames) * 100)
    )
    print("")

    return _counter, _rows


def _collect_samples(root, audio_source, target_dir, dataset_basename, rel_tol):
    """Merge consecutive ``row`` elements of ``root`` into sample tuples.

    Returns a list of ``(audio_source, target_dir, dataset_basename,
    start_time, duration, text, seq_id)`` tuples, the layout ``one_sample``
    reads.
    """
    samples = []
    seq_id = 0
    this_time = 0.0
    this_duration = 0.0
    prev_time = 0.0
    prev_duration = 0.0
    this_text = ""
    seen_row = False

    def pending_sample():
        return (
            audio_source,
            target_dir,
            dataset_basename,
            this_time,
            this_duration,
            this_text,
            seq_id,
        )

    for child in root:
        if child.tag != "row":
            continue
        seen_row = True

        cur_time = float(child.attrib["timestamp"])
        cur_duration = float(child.attrib["timedur"])
        # An empty <row/> has no text node: treat it as an empty string.
        cur_text = child.text or ""

        if this_time == 0.0:
            this_time = cur_time

        delta = cur_time - (prev_time + prev_duration)
        # rel_tol value is made from trial/error to try and compromise between:
        # - cutting enough to skip missing words
        # - not too short, not too long sentences
        is_close = math.isclose(cur_time, this_time + this_duration, rel_tol=rel_tol)
        is_short = (this_duration + cur_duration + delta) < MAX_SECS

        # when the previous element is close enough **and** this does not
        # go over MAX_SECS, we append content
        if is_close and is_short:
            this_duration += cur_duration + delta
            this_text += cur_text
        else:
            samples.append(pending_sample())

            this_time = cur_time
            this_duration = cur_duration
            this_text = cur_text

            seq_id += 1

        prev_time = cur_time
        prev_duration = cur_duration

    # The segment still being accumulated when the rows run out was never
    # emitted by the loop: flush it so the end of the recording is not lost.
    if seen_row:
        samples.append(pending_sample())

    return samples
|
|
|
|
|
|
def _maybe_convert_wav(mp3_filename, _wav_filename):
    """Convert ``mp3_filename`` to WAV unless ``_wav_filename`` already exists.

    The output uses ``SAMPLE_RATE``, ``CHANNELS`` and ``BIT_DEPTH``. A sox
    failure does not raise: it is reported on stderr and the function returns
    normally.

    Args:
        mp3_filename: Source audio file.
        _wav_filename: Destination WAV file.
    """
    if os.path.exists(_wav_filename):
        return

    print("Converting {} to WAV file: {}".format(mp3_filename, _wav_filename))
    transformer = sox.Transformer()
    transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH)
    try:
        transformer.build(mp3_filename, _wav_filename)
    except sox.core.SoxError as err:
        # Still best-effort, but no longer silent: say which file failed.
        print(
            "Failed to convert {} to WAV: {}".format(mp3_filename, err),
            file=sys.stderr,
        )
|
|
|
|
|
|
def write_general_csv(target_dir, _rows, _counter):
    """Write ``ccpmf_train.csv``, ``ccpmf_dev.csv`` and ``ccpmf_test.csv``.

    Rows are dispatched on their index modulo 10: 0 goes to test, 1 goes to
    dev and every other value goes to train (10% / 10% / 80%). The final
    import report is printed afterwards.

    Args:
        target_dir: Directory receiving the three CSV files.
        _rows: Sequence of ``(wav_filename, wav_filesize, transcript)`` items.
        _counter: Counter handed to ``print_import_report``.
    """
    target_csv_template = os.path.join(target_dir, "ccpmf_{}.csv")

    def open_csv(split):
        # newline="" is what the csv module asks for; utf-8 keeps accented
        # transcripts independent from the locale of the machine.
        return open(
            target_csv_template.format(split), "w", encoding="utf-8", newline=""
        )

    with open_csv("train") as train_csv_file, \
            open_csv("dev") as dev_csv_file, \
            open_csv("test") as test_csv_file:
        train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
        train_writer.writeheader()
        dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
        dev_writer.writeheader()
        test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
        test_writer.writeheader()

        # Index modulo 10 -> writer; anything not listed goes to train.
        held_out = {0: test_writer, 1: dev_writer}

        bar = progressbar.ProgressBar(max_value=len(_rows), widgets=SIMPLE_BAR)
        for i, item in enumerate(bar(_rows)):
            writer = held_out.get(i % 10, train_writer)
            writer.writerow(
                {
                    "wav_filename": item[0],
                    "wav_filesize": item[1],
                    "transcript": item[2],
                }
            )

    print("")
    print("~~~~ FINAL STATISTICS ~~~~")
    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print("~~~~ (FINAL STATISTICS) ~~~~")
    print("")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line: destination directory plus the optional label filters.
    PARSER = get_importers_parser(
        description="Import XML from Conference Centre for Economics, France"
    )
    PARSER.add_argument("target_dir", help="Destination directory")
    PARSER.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    PARSER.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)
    ALPHABET = None
    if PARAMS.filter_alphabet:
        ALPHABET = Alphabet(PARAMS.filter_alphabet)

    def label_filter_fun(label):
        """Return the cleaned-up ``label``, or ``None`` when it must be rejected.

        Defined at module level on purpose: ``one_sample`` looks this name up
        as a global.
        """
        if PARAMS.normalize:
            # Decompose accented characters, then drop whatever is not ASCII.
            decomposed = unicodedata.normalize("NFKD", label.strip())
            label = decomposed.encode("ascii", "ignore").decode("ascii", "ignore")
        label = validate_label(maybe_normalize(label))
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                # A character is missing from the provided alphabet.
                return None
        return label

    dataset_sources = _download_and_preprocess_data(
        csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir
    )
    sources_root_dir = os.path.dirname(dataset_sources)
    all_counter = get_counter()
    all_rows = []
    with open(dataset_sources, "r") as sources:
        # One "xml,mp3,rel_tol" entry per line.
        for entry in sources:
            fields = entry.split(",")
            this_xml = os.path.join(sources_root_dir, fields[0])
            this_mp3 = os.path.join(sources_root_dir, fields[1])
            this_rel = float(fields[2])

            mp3_stem = os.path.splitext(os.path.basename(this_mp3))[0]
            wav_filename = os.path.join(sources_root_dir, mp3_stem + ".wav")
            _maybe_convert_wav(this_mp3, wav_filename)

            counter, rows = _maybe_import_data(
                this_xml, wav_filename, sources_root_dir, this_rel
            )
            all_counter += counter
            all_rows += rows
    write_general_csv(sources_root_dir, _counter=all_counter, _rows=all_rows)
|