751 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			751 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python
 | |
| """
 | |
| Importer for dataset published from Centre de Conférence Pierre Mendès-France
 | |
| Ministère de l'Économie, des Finances et de la Relance
 | |
| """
 | |
| 
 | |
| import csv
 | |
| import decimal
 | |
| import hashlib
 | |
| import math
 | |
| import os
 | |
| import re
 | |
| import subprocess
 | |
| import sys
 | |
| import unicodedata
 | |
| import xml.etree.ElementTree as ET
 | |
| import zipfile
 | |
| from glob import glob
 | |
| from multiprocessing import Pool
 | |
| 
 | |
| import progressbar
 | |
| import sox
 | |
| 
 | |
| try:
 | |
|     from num2words import num2words
 | |
| except ImportError as ex:
 | |
|     print("pip install num2words")
 | |
|     sys.exit(1)
 | |
| 
 | |
| import json
 | |
| 
 | |
| import requests
 | |
| from coqui_stt_ctcdecoder import Alphabet
 | |
| from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
 | |
| from coqui_stt_training.util.helpers import secs_to_hours
 | |
| from coqui_stt_training.util.importers import (
 | |
|     get_counter,
 | |
|     get_imported_samples,
 | |
|     get_importers_parser,
 | |
|     get_validate_label,
 | |
|     print_import_report,
 | |
| )
 | |
| 
 | |
# Column names, in order, of the train/dev/test CSV files written by
# write_general_csv().
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
# Target audio format used when converting the source MP3 files to WAV
# (see _maybe_convert_wav); SAMPLE_RATE is also used to turn frame counts
# into seconds.
SAMPLE_RATE = 16000
CHANNELS = 1
BIT_DEPTH = 16
# Duration bounds, in seconds, for a sample to be kept (see one_sample);
# MAX_SECS also caps how many XML rows are merged into one sample.
MAX_SECS = 10
MIN_SECS = 0.85

# CSV export of the dataset record; _download_and_preprocess_data() treats each
# data line of the response as the URL of one archive part.
DATASET_RELEASE_CSV = "https://data.economie.gouv.fr/explore/dataset/transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
 | |
# Expected (SHA-1 hex digest, file name) pair for every part of the split zip
# archive; each part is verified against this table after download.
DATASET_RELEASE_SHA = [
    (
        "863d39a06a388c6491c6ff2f6450b151f38f1b57",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001",
    ),
    (
        "2f3a0305aa04c61220bb00b5a4e553e45dbf12e1",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.002",
    ),
    (
        "5e55e9f1f844097349188ac875947e5a3d7fe9f1",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.003",
    ),
    (
        "8bf54842cf07948ca5915e27a8bd5fa5139c06ae",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.004",
    ),
    (
        "c8963504aadc015ac48f9af80058a0bb3440b94f",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.005",
    ),
    (
        "d95e225e908621d83ce4e9795fd108d9d310e244",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.006",
    ),
    (
        "de6ed9c2b0ee80ca879aae8ba7923cc93217d811",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.007",
    ),
    (
        "234283c47dacfcd4450d836c52c25f3e807fc5f2",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.008",
    ),
    (
        "4e6b67a688639bb72f8cd81782eaba604a8d32a6",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.009",
    ),
    (
        "4165a51389777c8af8e6253d87bdacb877e8b3b0",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.010",
    ),
    (
        "34322e7009780d97ef5bd02bf2f2c7a31f00baff",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.011",
    ),
    (
        "48c5be3b2ca9d6108d525da6a03e91d93a95dbac",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.012",
    ),
    (
        "87573172f506a189c2ebc633856fe11a2e9cd213",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.013",
    ),
    (
        "6ab2c9e508e9278d5129f023e018725c4a7c69e8",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.014",
    ),
    (
        "4f84df831ef46dce5d3ab3e21817687a2d8c12d0",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.015",
    ),
    (
        "e69bfb079885c299cb81080ef88b1b8b57158aa6",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.016",
    ),
    (
        "5f764ba788ee273981cf211b242c29b49ca22c5e",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.017",
    ),
    (
        "b6aa81a959525363223494830c1e7307d4c4bae6",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.018",
    ),
    (
        "91ddcf43c7bf113a6f2528b857c7ec22a50a148a",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.019",
    ),
    (
        "fa1b29273dd77b9a7494983a2f9ae52654b931d7",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.020",
    ),
    (
        "1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.021",
    ),
    (
        "ce6420d5d0b6b5135ba559f83e1a82d4d615c470",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.022",
    ),
    (
        "d0976ed292ac24fcf1590d1ea195077c74b05471",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.023",
    ),
    (
        "ec746cd6af066f62d9bf8d3b2f89174783ff4e3c",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.024",
    ),
    (
        "570d9e1e84178e32fd867171d4b3aaecda1fd4fb",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.025",
    ),
    (
        "c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.026",
    ),
    (
        "08406a51146d88e208704ce058c060a1e44efa50",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.027",
    ),
    (
        "199aedad733a78ea1e7d47def9c71c6fd5795e02",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.028",
    ),
    (
        "db856a068f92fb4f01f410bba42c7271de0f231a",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.029",
    ),
    (
        "e3c0135f16c6c9d25a09dcb4f99a685438a84740",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.030",
    ),
    (
        "e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.031",
    ),
    (
        "be5e80cbc49b59b31ae33c30576ef0e1a162d84e",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.032",
    ),
    (
        "501df58e3ff55fcfd75b93dab57566dc536948b8",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.033",
    ),
    (
        "1a114875811a8cdcb8d85a9f6dbee78be3e05131",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.034",
    ),
    (
        "465d824e7ee46448369182c0c28646d155a2249b",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.035",
    ),
    (
        "37f341b1b266d143eb73138c31cfff3201b9d619",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.036",
    ),
    (
        "9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.037",
    ),
    (
        "54886755630cb080a53098cb1b6c951c6714a143",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.038",
    ),
    (
        "4b7cbb0154697be795034f7a49712e882a97197a",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.039",
    ),
    (
        "c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.040",
    ),
]
 | |
| 
 | |
| 
 | |
def _download_and_preprocess_data(csv_url, target_dir):
    """Download, verify and unpack the dataset, then build its sources index.

    If ``<target_dir>/transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020/data.txt``
    already exists, it is returned immediately and nothing is downloaded.

    Args:
        csv_url: URL of the CSV export listing one archive-part URL per line.
        target_dir: Directory receiving the downloaded parts and extracted data.

    Returns:
        Path of the ``data.txt`` index pairing XML transcripts with MP3 files.

    Raises:
        AssertionError: If a downloaded part does not match its expected SHA-1.
    """
    dataset_dir_name = "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020"
    dataset_sources = os.path.join(target_dir, dataset_dir_name, "data.txt")
    if os.path.exists(dataset_sources):
        return dataset_sources

    # Making path absolute
    target_dir = os.path.abspath(target_dir)
    # The first line and the last element of the split are dropped; presumably
    # the CSV header and the empty string after the final "\r\n" - TODO confirm.
    csv_ref = requests.get(csv_url).text.split("\r\n")[1:-1]
    for part in csv_ref:
        # The file name of each part is taken from the Content-Disposition
        # header of a HEAD request on the part URL.
        part_filename = (
            requests.head(part)
            .headers.get("Content-Disposition")
            .split(" ")[1]
            .split("=")[1]
            .replace('"', "")
        )
        if not os.path.exists(os.path.join(target_dir, part_filename)):
            maybe_download(part_filename, target_dir, part)

    def _big_sha1(fname):
        """Return the SHA-1 hex digest of ``fname``, read in 64 KiB chunks."""
        s = hashlib.sha1()
        buffer_size = 65536
        with open(fname, "rb") as f:
            while True:
                data = f.read(buffer_size)
                if not data:
                    break
                s.update(data)
        return s.hexdigest()

    for sha1, filename in DATASET_RELEASE_SHA:
        print("Checking {} SHA1:".format(filename))
        csum = _big_sha1(os.path.join(target_dir, filename))
        if csum != sha1:
            message = "\t{}: ERROR: expected {}, computed {}".format(
                filename, sha1, csum
            )
            print(message)
            # Raised explicitly (instead of a bare ``assert``) so the integrity
            # check still runs under ``python -O``; the exception type is kept
            # for backward compatibility.
            raise AssertionError(message.strip())
        print("\t{}: OK {}".format(filename, sha1))

    # Conditionally extract data
    _maybe_extract(
        target_dir,
        dataset_dir_name,
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip",
        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip",
    )

    # Produce source text for extraction / conversion
    return _maybe_create_sources(os.path.join(target_dir, dataset_dir_name))
 | |
| 
 | |
| 
 | |
def _maybe_extract(target_dir, extracted_data, archive, final):
    """Rebuild the split archive if needed and extract it in two stages.

    Nothing is done when ``target_dir/extracted_data`` already exists.
    Otherwise ``target_dir/archive`` is assembled from its ``archive.*`` parts
    (when it is missing), extracted into ``target_dir/extracted_data``, and the
    nested archive ``final`` found there is extracted into ``target_dir``.

    Args:
        target_dir: Directory holding the archive parts.
        extracted_data: Name of the directory receiving the outer archive.
        archive: File name of the outer zip archive (parts are ``archive.*``).
        final: File name of the inner zip archive contained in the outer one.

    Raises:
        FileNotFoundError: If the archive is missing and no part can be found.
    """
    # Local import: only needed here, and keeps this block self-contained.
    import shutil

    extracted_path = os.path.join(target_dir, extracted_data)
    archive_path = os.path.join(target_dir, archive)
    final_archive = os.path.join(extracted_path, final)

    if os.path.exists(extracted_path):
        print('Found directory "%s" - not extracting it from archive.' % extracted_path)
        return

    if not os.path.exists(archive_path):
        print('No archive "%s" - building ...' % archive_path)
        # Parts are numbered with zero-padded suffixes, so a plain sort gives
        # the concatenation order.
        all_zip_parts = sorted(glob(archive_path + ".*"))
        if not all_zip_parts:
            raise FileNotFoundError(
                'No archive parts matching "%s.*"' % archive_path
            )
        print('Building "%s" from %d parts' % (archive_path, len(all_zip_parts)))
        # Concatenate in-process rather than through a shell "cat": this works
        # with paths containing spaces and does not depend on a POSIX shell.
        with open(archive_path, "wb") as joined:
            for part_path in all_zip_parts:
                with open(part_path, "rb") as part:
                    shutil.copyfileobj(part, joined)

    print(
        'No directory "%s" - extracting archive %s ...'
        % (extracted_path, archive_path)
    )
    with zipfile.ZipFile(archive_path) as zip_f:
        zip_f.extractall(extracted_path)

    with zipfile.ZipFile(final_archive) as zip_f:
        zip_f.extractall(target_dir)
 | |
| 
 | |
| 
 | |
def _maybe_create_sources(dir):
    """Pair each MP3 with an XML transcript and write the ``data.txt`` index.

    MP3 and XML files are looked up one directory level below ``dir``. File
    base names are split on "_"; a pair is a candidate only when the first
    token of both names is identical, and its score is the number of
    positions at which the tokens agree. Candidates are consumed from the
    highest score down and each MP3 is assigned at most once.

    Args:
        dir: Root directory of the extracted dataset (the name shadows the
            builtin but is kept for interface compatibility).

    Returns:
        Path of the written ``data.txt``; each line is
        ``<xml relpath>,<mp3 relpath>,<tolerance>``.
    """
    dataset_sources = os.path.join(dir, "data.txt")
    unmatched_mp3 = glob(os.path.join(dir, "**", "*.mp3"))
    xml_files = glob(os.path.join(dir, "**", "*.xml"))

    def _name_tokens(path):
        # "a/b/20200101_Foo_1.mp3" -> ["20200101", "Foo", "1"]
        return os.path.splitext(os.path.basename(path))[0].split("_")

    candidates = []
    for mp3_path in unmatched_mp3:
        mp3_tokens = _name_tokens(mp3_path)
        for xml_path in xml_files:
            xml_tokens = _name_tokens(xml_path)
            if mp3_tokens[0] != xml_tokens[0]:
                continue
            # The leading tokens match, so the score is always at least 1.
            score = sum(1 for m, x in zip(mp3_tokens, xml_tokens) if m == x)
            candidates.append((mp3_path, xml_path, score))

    # Best matches first; the sort is stable, so ties keep discovery order.
    ranked = sorted(candidates, key=lambda entry: entry[2], reverse=True)

    pairs_by_score = {}
    for mp3_path, xml_path, score in ranked:
        bucket = pairs_by_score.setdefault(score, {})
        if mp3_path in bucket:
            print("here:", bucket[mp3_path], xml_path, file=sys.stderr)
            continue
        # An MP3 already claimed at a higher score is no longer in the list.
        if mp3_path in unmatched_mp3:
            unmatched_mp3.remove(mp3_path)
            bucket[mp3_path] = xml_path

    with open(dataset_sources, "w") as ds:
        for bucket in pairs_by_score.values():
            for mp3_path, xml_path in bucket.items():
                if os.path.getsize(mp3_path) <= 0 or os.path.getsize(xml_path) <= 0:
                    print(
                        "Empty file {} or {}".format(mp3_path, xml_path),
                        file=sys.stderr,
                    )
                    continue
                ds.write(
                    "{},{},{:0.2e}\n".format(
                        os.path.relpath(xml_path, dir),
                        os.path.relpath(mp3_path, dir),
                        2.5e-4,
                    )
                )

    print("Missing XML pairs:", unmatched_mp3, file=sys.stderr)
    return dataset_sources
 | |
| 
 | |
| 
 | |
def maybe_normalize_for_digits(label):
    """Spell out the numbers found in ``label`` as French words.

    Three passes are applied:

    1. digit groups separated by whitespace ("50 000") are joined;
    2. "hh:mm" / "hh:mm:ss" tokens are rewritten with "heures", "minutes"
       and "secondes";
    3. every space-separated token containing a digit is converted with
       ``num2words``; tokens mixing letters and digits (like "3D") are
       converted digit by digit, each digit becoming "<word>-".

    Args:
        label: Transcript text.

    Returns:
        The transcript with numbers spelled out.

    Raises:
        decimal.InvalidOperation: If a token cannot be parsed as a number by
            ``num2words`` (the offending token is printed first).
    """
    # first, try to identify numbers like "50 000", "260 000"
    if " " in label:
        while True:
            (label, r) = re.subn(r"(\d)\s(\d{3})", "\\1\\2", label)
            if r == 0:
                break

    # this might be a time or duration in the form "hh:mm" or "hh:mm:ss"
    if ":" in label:
        date_or_time = re.compile(r"(\d{1,2}):(\d{2}):?(\d{2})?")
        for s in label.split(" "):
            if not any(i.isdigit() for i in s):
                continue
            maybe_date_or_time = date_or_time.findall(s)
            if not maybe_date_or_time:
                continue
            maybe_hours, maybe_minutes, maybe_seconds = maybe_date_or_time[0]
            if maybe_seconds:
                label = label.replace(
                    "{}:{}:{}".format(maybe_hours, maybe_minutes, maybe_seconds),
                    "{} heures {} minutes et {} secondes".format(
                        maybe_hours, maybe_minutes, maybe_seconds
                    ),
                )
            else:
                label = label.replace(
                    "{}:{}".format(maybe_hours, maybe_minutes),
                    "{} heures et {} minutes".format(maybe_hours, maybe_minutes),
                )

    new_label = []
    for s in label.split(" "):
        if any(i.isdigit() for i in s):
            s = s.replace(",", ".")  # num2words requires "." for floats
            s = s.replace('"', "")  # clean some data, num2words would choke on 1959"

            if not s[-1].isdigit():  # num2words will choke on "0.6.", "24 ?"
                s = s[:-1]

            try:
                if any(i.isalpha() for i in s):
                    # Digits **and** letters, like "3D": convert each digit on
                    # its own ("3" -> "trois-") and keep the other characters.
                    ns = [
                        num2words(c, lang="fr") + "-" if c.isdigit() else c
                        for c in s
                    ]
                    # Join the converted characters; the original joined the
                    # untouched token and silently dropped the conversion.
                    s = "".join(ns)
                else:
                    s = num2words(s, lang="fr")
            except decimal.InvalidOperation:
                print("decimal.InvalidOperation: '{}'".format(s))
                raise
        new_label.append(s)
    return " ".join(new_label)
 | |
| 
 | |
| 
 | |
def maybe_normalize_for_specials_chars(label):
    """Replace symbols and known transcript glitches with speakable text.

    The replacements are applied one after the other, in the order listed
    below; the order matters (e.g. "-" is rewritten before "mètre-carrés"
    is introduced, and "..." is collapsed before "..").

    Args:
        label: Transcript text.

    Returns:
        The transcript with the replacements applied.
    """
    replacements = (
        # A leading space keeps "50%" from becoming the single token
        # "50pourcents", consistent with how "+" and "€" are handled.
        ("%", " pourcents"),
        ("/", ", "),  # intervals like 2019/2022 become "2019, 2022"
        ("-", ", "),  # intervals like 70-80 become "70, 80"
        ("+", " plus "),  # clean + and make it speakable
        ("€", " euros "),  # clean euro symbol and make it speakable
        ("., ", ", "),  # clean some strange "4.0., " (20181017_Innovation.xml)
        # clean some strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
        ("°", " degré "),
        ("...", "."),  # remove ellipsis
        ("..", "."),  # remove broken ellipsis
        ("m²", "mètre-carrés"),  # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
        # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
        ("[end]", ""),
        # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
        (u"\xB8c", " ç"),
        # "C0²" spelled with a zero in 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
        ("C0²", "CO 2"),
    )
    for old, new in replacements:
        label = label.replace(old, new)
    return label
 | |
| 
 | |
| 
 | |
def maybe_normalize_for_anglicisms(label):
    """Expand a few English abbreviations and symbols into speakable text.

    Args:
        label: Transcript text.

    Returns:
        The transcript with "B2B", "B2C", "#" and "@" replaced, in that order.
    """
    substitutions = (
        ("B2B", "B to B"),
        ("B2C", "B to C"),
        ("#", "hashtag "),
        ("@", "at "),
    )
    for needle, spoken in substitutions:
        label = label.replace(needle, spoken)
    return label
 | |
| 
 | |
| 
 | |
def maybe_normalize(label):
    """Run the full normalization pipeline on a transcript.

    Special characters are handled first, then anglicisms, then digits; the
    digit pass runs last so that it sees the text produced by the other two.

    Args:
        label: Transcript text.

    Returns:
        The normalized transcript.
    """
    pipeline = (
        maybe_normalize_for_specials_chars,
        maybe_normalize_for_anglicisms,
        maybe_normalize_for_digits,
    )
    for step in pipeline:
        label = step(label)
    return label
 | |
| 
 | |
| 
 | |
def one_sample(sample):
    """Cut one utterance out of the source WAV and classify the result.

    Args:
        sample: Tuple ``(audio_source, target_dir, dataset_basename,
            start_time, duration, label, sample_id)`` as built by
            ``_maybe_import_data``; times are passed to ffmpeg's ``-ss``/``-t``.

    Returns:
        ``(counter, rows)`` where ``counter`` is a fresh importer counter
        updated for this sample and ``rows`` holds zero or one
        ``(relative wav path, file size, label)`` tuple.
    """
    (
        audio_source,
        target_dir,
        dataset_basename,
        start_time,
        duration,
        raw_label,
        sample_id,
    ) = sample[:7]
    label = label_filter_fun(raw_label)

    # Derive "<source stem>_<000123>.wav"; splitext only touches the extension,
    # whereas str.replace would rewrite every ".wav" occurring in the name.
    source_stem = os.path.splitext(os.path.basename(audio_source))[0]
    _wav_filename = "{}_{:06}.wav".format(source_stem, sample_id)
    wav_fullname = os.path.join(target_dir, dataset_basename, _wav_filename)

    file_size = -1
    frames = 0
    try:
        if not os.path.exists(wav_fullname):
            subprocess.check_output(
                [
                    "ffmpeg",
                    "-i",
                    audio_source,
                    "-ss",
                    str(start_time),
                    "-t",
                    str(duration),
                    "-c",
                    "copy",
                    wav_fullname,
                ],
                stdin=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )
        frames = int(
            subprocess.check_output(
                ["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT
            )
        )
        file_size = os.path.getsize(wav_fullname)
    except subprocess.CalledProcessError as ex:
        # A failing ffmpeg/soxi call marks the sample as failed instead of
        # aborting the whole worker pool; this makes the "failed" branch
        # below reachable.
        print(
            "Failed to extract {}: {}".format(wav_fullname, ex), file=sys.stderr
        )
        file_size = -1
        frames = 0

    _counter = get_counter()
    _rows = []

    if file_size == -1:
        # Excluding samples that failed upon conversion
        _counter["failed"] += 1
    elif label is None:
        # Excluding samples that failed on label validation
        _counter["invalid_label"] += 1
    elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
        # Excluding samples that are too short to fit the transcript
        _counter["too_short"] += 1
    elif frames / SAMPLE_RATE < MIN_SECS:
        # Excluding samples that are too short
        _counter["too_short"] += 1
    elif frames / SAMPLE_RATE > MAX_SECS:
        # Excluding very long samples to keep a reasonable batch-size
        _counter["too_long"] += 1
    else:
        # This one is good - keep it for the target CSV
        _rows.append((os.path.join(dataset_basename, _wav_filename), file_size, label))
        _counter["imported_time"] += frames
    _counter["all"] += 1
    _counter["total_time"] += frames

    return (_counter, _rows)
 | |
| 
 | |
| 
 | |
def _group_rows_into_samples(root, audio_source, target_dir, dataset_basename, rel_tol):
    """Merge consecutive ``<row>`` elements of ``root`` into sample tuples."""
    samples = []

    def _emit(start, length, text):
        # The sequence id is the position of the sample in the output list.
        samples.append(
            (
                audio_source,
                target_dir,
                dataset_basename,
                start,
                length,
                text,
                len(samples),
            )
        )

    have_pending = False
    this_time = 0.0
    this_duration = 0.0
    this_text = ""
    prev_end = 0.0
    for child in root:
        if child.tag != "row":
            continue
        cur_time = float(child.attrib["timestamp"])
        cur_duration = float(child.attrib["timedur"])
        # A row without text would otherwise break the concatenation below.
        cur_text = child.text or ""

        if have_pending:
            # Gap between the end of the previous row and this one.
            delta = cur_time - prev_end
            # rel_tol value is made from trial/error to try and compromise between:
            # - cutting enough to skip missing words
            # - not too short, not too long sentences
            is_close = math.isclose(
                cur_time, this_time + this_duration, rel_tol=rel_tol
            )
            is_short = (this_duration + cur_duration + delta) < MAX_SECS

            # when the previous element is close enough **and** this does not
            # go over MAX_SECS, we append content
            if is_close and is_short:
                this_duration += cur_duration + delta
                # Rows are concatenated verbatim; assumes the XML text carries
                # its own spacing - TODO confirm.
                this_text += cur_text
            else:
                _emit(this_time, this_duration, this_text)
                have_pending = False

        if not have_pending:
            # Start a new segment at this row. Doing this explicitly for the
            # very first row avoids adding its timestamp to the duration.
            this_time = cur_time
            this_duration = cur_duration
            this_text = cur_text
            have_pending = True

        prev_end = cur_time + cur_duration

    # Flush the segment still being accumulated when the rows run out.
    if have_pending:
        _emit(this_time, this_duration, this_text)

    return samples


def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
    """Split one recording into samples according to its XML transcript.

    Args:
        xml_file: Transcript file whose root holds ``<row>`` elements carrying
            ``timestamp`` and ``timedur`` attributes and the spoken text.
        audio_source: WAV file matching the transcript.
        target_dir: Directory under which ``<xml basename>/`` receives the
            extracted samples.
        rel_tol: Relative tolerance used to decide whether a row directly
            follows the segment being accumulated.

    Returns:
        ``(counter, rows)``: the aggregated importer counter and the list of
        ``(relative wav path, file size, label)`` tuples that were kept.
    """
    dataset_basename = os.path.splitext(os.path.split(xml_file)[1])[0]
    wav_root = os.path.join(target_dir, dataset_basename)
    os.makedirs(wav_root, exist_ok=True)

    source_frames = int(
        subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT)
    )
    print("Source audio length: %s" % secs_to_hours(source_frames / SAMPLE_RATE))

    # Get start time, duration and transcript for each utterance in the XML
    root = ET.parse(xml_file).getroot()
    samples = _group_rows_into_samples(
        root, audio_source, target_dir, dataset_basename, rel_tol
    )

    # Keep track of how many samples are good vs. problematic
    _counter = get_counter()
    num_samples = len(samples)
    _rows = []

    print("Processing XML data: {}".format(xml_file))
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    # The context manager terminates the workers if anything below raises.
    with Pool() as pool:
        for i, processed in enumerate(
            pool.imap_unordered(one_sample, samples), start=1
        ):
            _counter += processed[0]
            _rows += processed[1]
            bar.update(i)
        bar.update(num_samples)
        pool.close()
        pool.join()

    imported_samples = get_imported_samples(_counter)
    assert _counter["all"] == num_samples
    assert len(_rows) == imported_samples

    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    if source_frames:
        print(
            "Import efficiency: %.1f%%"
            % ((_counter["total_time"] / source_frames) * 100)
        )
    print("")

    return _counter, _rows
 | |
| 
 | |
| 
 | |
def _maybe_convert_wav(mp3_filename, _wav_filename):
    """Convert an MP3 file to WAV unless the WAV file already exists.

    The output uses SAMPLE_RATE, CHANNELS and BIT_DEPTH. A conversion error is
    reported on stderr and otherwise ignored, so the WAV file may be missing
    after this call.

    Args:
        mp3_filename: Source MP3 path.
        _wav_filename: Destination WAV path.
    """
    if os.path.exists(_wav_filename):
        return

    print("Converting {} to WAV file: {}".format(mp3_filename, _wav_filename))
    transformer = sox.Transformer()
    transformer.convert(
        samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH
    )
    try:
        transformer.build(mp3_filename, _wav_filename)
    except sox.core.SoxError as ex:
        # Still best-effort, but no longer silent: without this message the
        # failure only surfaces later, when the missing WAV file is read.
        print(
            "WARNING: failed to convert {} to WAV: {}".format(mp3_filename, ex),
            file=sys.stderr,
        )
 | |
| 
 | |
| 
 | |
def write_general_csv(target_dir, _rows, _counter):
    """Write the train/dev/test CSV files and print the final statistics.

    Rows are distributed by position: every 10th row starting at index 0 goes
    to ``ccpmf_test.csv``, every 10th starting at index 1 to ``ccpmf_dev.csv``
    and the remaining 80% to ``ccpmf_train.csv``.

    Args:
        target_dir: Directory receiving the three CSV files.
        _rows: Iterable of ``(wav_filename, wav_filesize, transcript)`` tuples.
        _counter: Importer counter passed to ``print_import_report``.
    """
    target_csv_template = os.path.join(target_dir, "ccpmf_{}.csv")

    def _open_csv(split):
        # newline="" is what the csv module requires of the files it writes;
        # UTF-8 keeps accented French transcripts independent of the locale.
        return open(
            target_csv_template.format(split), "w", encoding="utf-8", newline=""
        )

    with _open_csv("train") as train_csv_file, _open_csv(
        "dev"
    ) as dev_csv_file, _open_csv("test") as test_csv_file:
        train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)  # 80%
        train_writer.writeheader()
        dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)  # 10%
        dev_writer.writeheader()
        test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)  # 10%
        test_writer.writeheader()

        bar = progressbar.ProgressBar(max_value=len(_rows), widgets=SIMPLE_BAR)
        for i, item in enumerate(bar(_rows)):
            i_mod = i % 10
            if i_mod == 0:
                writer = test_writer
            elif i_mod == 1:
                writer = dev_writer
            else:
                writer = train_writer
            writer.writerow(
                {
                    "wav_filename": item[0],
                    "wav_filesize": item[1],
                    "transcript": item[2],
                }
            )

    print("")
    print("~~~~ FINAL STATISTICS ~~~~")
    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print("~~~~ (FINAL STATISTICS) ~~~~")
    print("")
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Command line: a target directory plus optional label filtering switches.
    PARSER = get_importers_parser(
        description="Import XML from Conference Centre for Economics, France"
    )
    PARSER.add_argument("target_dir", help="Destination directory")
    PARSER.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    PARSER.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)
    ALPHABET = None
    if PARAMS.filter_alphabet:
        ALPHABET = Alphabet(PARAMS.filter_alphabet)

    # NOTE(review): one_sample() looks this function up as a module global from
    # the worker processes, and it only exists when the file runs as a script;
    # presumably this relies on a fork-based multiprocessing start method -
    # confirm before running on a platform that spawns workers.
    def label_filter_fun(label):
        """Normalize and validate a transcript; return None when rejected."""
        if PARAMS.normalize:
            # Decompose accented characters, then drop everything non-ASCII.
            decomposed = unicodedata.normalize("NFKD", label.strip())
            label = decomposed.encode("ascii", "ignore").decode("ascii", "ignore")
        label = validate_label(maybe_normalize(label))
        if not (ALPHABET and label):
            return label
        try:
            ALPHABET.encode(label)
        except KeyError:
            # At least one character is outside the provided alphabet.
            return None
        return label

    dataset_sources = _download_and_preprocess_data(
        csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir
    )
    sources_root_dir = os.path.dirname(dataset_sources)
    all_counter = get_counter()
    all_rows = []
    with open(dataset_sources, "r") as sources:
        # Each line reads "<xml relpath>,<mp3 relpath>,<relative tolerance>".
        for line in sources:
            d = line.split(",")
            this_xml = os.path.join(sources_root_dir, d[0])
            this_mp3 = os.path.join(sources_root_dir, d[1])
            this_rel = float(d[2])

            # The full-length WAV sits next to data.txt, named after the MP3.
            mp3_stem = os.path.splitext(os.path.basename(this_mp3))[0]
            wav_filename = os.path.join(sources_root_dir, mp3_stem + ".wav")
            _maybe_convert_wav(this_mp3, wav_filename)

            counter, rows = _maybe_import_data(
                this_xml, wav_filename, sources_root_dir, this_rel
            )
            all_counter += counter
            all_rows += rows
    write_general_csv(sources_root_dir, _counter=all_counter, _rows=all_rows)
 |