Importer for dataset from Centre de Conférences Pierre Mendès-France
Released by Ministère de l'Économie, des Finances et de la Relance
This commit is contained in:
parent ecc48062a7
commit c822a6e875

@@ -0,0 +1,513 @@
#!/usr/bin/env python
"""
Importer for dataset published by the Centre de Conférences Pierre Mendès-France
Ministère de l'Économie, des Finances et de la Relance
"""

import csv
import sys
import os
import progressbar
import subprocess
import zipfile
from glob import glob
from multiprocessing import Pool

import hashlib
import decimal
import math
import unicodedata
import re
import sox
import xml.etree.ElementTree as ET

try:
    from num2words import num2words
except ImportError as ex:
    print("Missing dependency, please run: pip install num2words")
    sys.exit(1)

import requests
import json

from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download
from deepspeech_training.util.helpers import secs_to_hours
from deepspeech_training.util.importers import (
    get_counter,
    get_importers_parser,
    get_imported_samples,
    get_validate_label,
    print_import_report,
)
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
CHANNELS = 1
BIT_DEPTH = 16
MAX_SECS = 10  # segments longer than this are dropped
MIN_SECS = 0.85  # segments shorter than this are dropped

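# The release is distributed as a split zip archive; the CSV below lists the
# download URL of each part, and every part is checked against its expected
# SHA1 before the parts are concatenated and extracted.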
DATASET_RELEASE_CSV = "https://data.economie.gouv.fr/explore/dataset/transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
DATASET_RELEASE_SHA = [
    ("863d39a06a388c6491c6ff2f6450b151f38f1b57", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001"),
    ("2f3a0305aa04c61220bb00b5a4e553e45dbf12e1", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.002"),
    ("5e55e9f1f844097349188ac875947e5a3d7fe9f1", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.003"),
    ("8bf54842cf07948ca5915e27a8bd5fa5139c06ae", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.004"),
    ("c8963504aadc015ac48f9af80058a0bb3440b94f", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.005"),
    ("d95e225e908621d83ce4e9795fd108d9d310e244", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.006"),
    ("de6ed9c2b0ee80ca879aae8ba7923cc93217d811", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.007"),
    ("234283c47dacfcd4450d836c52c25f3e807fc5f2", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.008"),
    ("4e6b67a688639bb72f8cd81782eaba604a8d32a6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.009"),
    ("4165a51389777c8af8e6253d87bdacb877e8b3b0", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.010"),
    ("34322e7009780d97ef5bd02bf2f2c7a31f00baff", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.011"),
    ("48c5be3b2ca9d6108d525da6a03e91d93a95dbac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.012"),
    ("87573172f506a189c2ebc633856fe11a2e9cd213", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.013"),
    ("6ab2c9e508e9278d5129f023e018725c4a7c69e8", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.014"),
    ("4f84df831ef46dce5d3ab3e21817687a2d8c12d0", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.015"),
    ("e69bfb079885c299cb81080ef88b1b8b57158aa6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.016"),
    ("5f764ba788ee273981cf211b242c29b49ca22c5e", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.017"),
    ("b6aa81a959525363223494830c1e7307d4c4bae6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.018"),
    ("91ddcf43c7bf113a6f2528b857c7ec22a50a148a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.019"),
    ("fa1b29273dd77b9a7494983a2f9ae52654b931d7", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.020"),
    ("1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.021"),
    ("ce6420d5d0b6b5135ba559f83e1a82d4d615c470", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.022"),
    ("d0976ed292ac24fcf1590d1ea195077c74b05471", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.023"),
    ("ec746cd6af066f62d9bf8d3b2f89174783ff4e3c", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.024"),
    ("570d9e1e84178e32fd867171d4b3aaecda1fd4fb", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.025"),
    ("c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.026"),
    ("08406a51146d88e208704ce058c060a1e44efa50", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.027"),
    ("199aedad733a78ea1e7d47def9c71c6fd5795e02", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.028"),
    ("db856a068f92fb4f01f410bba42c7271de0f231a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.029"),
    ("e3c0135f16c6c9d25a09dcb4f99a685438a84740", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.030"),
    ("e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.031"),
    ("be5e80cbc49b59b31ae33c30576ef0e1a162d84e", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.032"),
    ("501df58e3ff55fcfd75b93dab57566dc536948b8", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.033"),
    ("1a114875811a8cdcb8d85a9f6dbee78be3e05131", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.034"),
    ("465d824e7ee46448369182c0c28646d155a2249b", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.035"),
    ("37f341b1b266d143eb73138c31cfff3201b9d619", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.036"),
    ("9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.037"),
    ("54886755630cb080a53098cb1b6c951c6714a143", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.038"),
    ("4b7cbb0154697be795034f7a49712e882a97197a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.039"),
    ("c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.040"),
]

def _download_and_preprocess_data(csv_url, target_dir):
    dataset_sources = os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "data.txt")
    if os.path.exists(dataset_sources):
        return dataset_sources

    # Making path absolute
    target_dir = os.path.abspath(target_dir)
    csv_ref = requests.get(csv_url).text.split('\r\n')[1:-1]
    for part in csv_ref:
        part_filename = requests.head(part).headers.get("Content-Disposition").split(" ")[1].split("=")[1].replace('"', "")
        if not os.path.exists(os.path.join(target_dir, part_filename)):
            part_path = maybe_download(part_filename, target_dir, part)

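    # Hash each downloaded part in 64 KiB chunks so multi-GB files never have
    # to be read into memory at once.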
    def _big_sha1(fname):
        s = hashlib.sha1()
        buffer_size = 65536
        with open(fname, "rb") as f:
            while True:
                data = f.read(buffer_size)
                if not data:
                    break
                s.update(data)
        return s.hexdigest()

    for (sha1, filename) in DATASET_RELEASE_SHA:
        print("Checking {} SHA1:".format(filename))
        csum = _big_sha1(os.path.join(target_dir, filename))
        if csum == sha1:
            print("\t{}: OK {}".format(filename, sha1))
        else:
            print("\t{}: ERROR: expected {}, computed {}".format(filename, sha1, csum))
        assert csum == sha1

    # Conditionally extract data
    _maybe_extract(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip", "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip")

    # Produce source text for extraction / conversion
    return _maybe_create_sources(os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020"))

def _maybe_extract(target_dir, extracted_data, archive, final):
    # If target_dir/extracted_data does not exist, extract archive in target_dir
    extracted_path = os.path.join(target_dir, extracted_data)
    archive_path = os.path.join(target_dir, archive)
    final_archive = os.path.join(extracted_path, final)

    if not os.path.exists(extracted_path):
        if not os.path.exists(archive_path):
            print('No archive "%s" - building ...' % archive_path)
            all_zip_parts = glob(archive_path + ".*")
            all_zip_parts.sort()
            cmdline = "cat {} > {}".format(" ".join(all_zip_parts), archive_path)
            print('Building with "%s"' % cmdline)
            subprocess.check_call(cmdline, shell=True, cwd=target_dir)
            assert os.path.exists(archive_path)

        print('No directory "%s" - extracting archive %s ...' % (extracted_path, archive_path))
        with zipfile.ZipFile(archive_path) as zip_f:
            zip_f.extractall(extracted_path)

        with zipfile.ZipFile(final_archive) as zip_f:
            zip_f.extractall(target_dir)
    else:
        print('Found directory "%s" - not extracting it from archive.' % extracted_path)

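# Pair every MP3 with its XML transcript. Both file stems start with a date
# and carry underscore-separated tokens, so candidate pairs sharing the same
# date are scored by the number of matching tokens and best scores win.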
def _maybe_create_sources(dir):
    dataset_sources = os.path.join(dir, "data.txt")
    MP3 = glob(os.path.join(dir, "**", "*.mp3"))
    XML = glob(os.path.join(dir, "**", "*.xml"))

    MP3_XML_Scores = []
    MP3_XML_Fin = {}

    for f_mp3 in MP3:
        for f_xml in XML:
            b_mp3 = os.path.splitext(os.path.basename(f_mp3))[0]
            b_xml = os.path.splitext(os.path.basename(f_xml))[0]
            a_mp3 = b_mp3.split('_')
            a_xml = b_xml.split('_')
            score = 0
            date_mp3 = a_mp3[0]
            date_xml = a_xml[0]

            if date_mp3 != date_xml:
                continue

            for i in range(min(len(a_mp3), len(a_xml))):
                if (a_mp3[i] == a_xml[i]):
                    score += 1

            if score >= 1:
                MP3_XML_Scores.append((f_mp3, f_xml, score))

    # sort by score
    MP3_XML_Scores.sort(key=lambda x: x[2], reverse=True)
    for s_mp3, s_xml, score in MP3_XML_Scores:
        #print(s_mp3, s_xml, score)
        if score not in MP3_XML_Fin:
            MP3_XML_Fin[score] = {}

        if s_mp3 not in MP3_XML_Fin[score]:
            try:
                MP3.index(s_mp3)
                MP3.remove(s_mp3)
                MP3_XML_Fin[score][s_mp3] = s_xml
            except ValueError as ex:
                pass
        else:
            print("here:", MP3_XML_Fin[score][s_mp3], s_xml, file=sys.stderr)

    with open(dataset_sources, "w") as ds:
        for score in MP3_XML_Fin:
            for mp3 in MP3_XML_Fin[score]:
                xml = MP3_XML_Fin[score][mp3]
                if os.path.getsize(mp3) > 0 and os.path.getsize(xml) > 0:
                    mp3 = os.path.relpath(mp3, dir)
                    xml = os.path.relpath(xml, dir)
                    ds.write('{},{},{:0.2e}\n'.format(xml, mp3, 2.5e-4))
                else:
                    print("Empty file {} or {}".format(mp3, xml), file=sys.stderr)

    print("Missing XML pairs:", MP3, file=sys.stderr)
    return dataset_sources

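# Spell numbers out so transcripts only contain alphabet characters: collapse
# thousands separators ("50 000" -> "50000"), expand "hh:mm(:ss)" times into
# words, then convert each remaining numeric token with num2words.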
def maybe_normalize_for_digits(label):
    # first, try to identify numbers like "50 000", "260 000"
    if " " in label:
        if any(s.isdigit() for s in label):
            thousands = re.compile(r"(\d{1,3}(?:\s*\d{3})*(?:,\d+)?)")
            maybe_thousands = thousands.findall(label)
            if len(maybe_thousands) > 0:
                while True:
                    (label, r) = re.subn(r"(\d)\s(\d{3})", "\\1\\2", label)
                    if r == 0:
                        break

    # this might be a time or duration in the form "hh:mm" or "hh:mm:ss"
    if ":" in label:
        for s in label.split(" "):
            if any(i.isdigit() for i in s):
                date_or_time = re.compile(r"(\d{1,2}):(\d{2}):?(\d{2})?")
                maybe_date_or_time = date_or_time.findall(s)
                if len(maybe_date_or_time) > 0:
                    maybe_hours = maybe_date_or_time[0][0]
                    maybe_minutes = maybe_date_or_time[0][1]
                    maybe_seconds = maybe_date_or_time[0][2]
                    if len(maybe_seconds) > 0:
                        label = label.replace("{}:{}:{}".format(maybe_hours, maybe_minutes, maybe_seconds), "{} heures {} minutes et {} secondes".format(maybe_hours, maybe_minutes, maybe_seconds))
                    else:
                        label = label.replace("{}:{}".format(maybe_hours, maybe_minutes), "{} heures et {} minutes".format(maybe_hours, maybe_minutes))

    new_label = []
    # pylint: disable=too-many-nested-blocks
    for s in label.split(" "):
        if any(i.isdigit() for i in s):
            s = s.replace(",", ".")  # num2words requires "." for floats
            s = s.replace("\"", "")  # clean some data, num2words would choke on 1959"

            last_c = s[-1]
            if not last_c.isdigit():  # num2words will choke on "0.6.", "24 ?"
                s = s[:-1]

            if any(i.isalpha() for i in s):  # so we have any(isdigit()) **and** any(isalpha()), like "3D"
                ns = []
                for c in s:
                    nc = c
                    if c.isdigit():  # convert "3" to "trois-"
                        try:
                            nc = num2words(c, lang="fr") + "-"
                        except decimal.InvalidOperation as ex:
                            print("decimal.InvalidOperation: '{}'".format(s))
                            raise ex
                    ns.append(nc)
                s = "".join(ns)  # join the converted characters back into one token
            else:
                try:
                    s = num2words(s, lang="fr")
                except decimal.InvalidOperation as ex:
                    print("decimal.InvalidOperation: '{}'".format(s))
                    raise ex
        new_label.append(s)
    return " ".join(new_label)

def maybe_normalize_for_specials_chars(label):
    label = label.replace("%", "pourcents")
    label = label.replace("/", ", ")  # clean intervals like 2019/2022 to "2019 2022"
    label = label.replace("-", ", ")  # clean intervals like 70-80 to "70 80"
    label = label.replace("+", " plus ")  # clean + and make it speakable
    label = label.replace("€", " euros ")  # clean euro symbol and make it speakable
    label = label.replace("., ", ", ")  # clean some strange "4.0., " (20181017_Innovation.xml)
    label = label.replace("°", " degré ")  # clean some strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
    label = label.replace("...", ".")  # remove ellipsis
    label = label.replace("..", ".")  # remove broken ellipsis
    label = label.replace("m²", "mètre-carrés")  # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
    label = label.replace("[end]", "")  # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
    label = label.replace(u'\xB8c', " ç")  # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
    label = label.replace("C0²", "CO 2")  # 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
    return label

def maybe_normalize_for_anglicisms(label):
    label = label.replace("B2B", "B to B")
    label = label.replace("B2C", "B to C")
    label = label.replace("#", "hashtag ")
    label = label.replace("@", "at ")
    return label

def maybe_normalize(label):
    label = maybe_normalize_for_specials_chars(label)
    label = maybe_normalize_for_anglicisms(label)
    label = maybe_normalize_for_digits(label)
    return label

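# Runs in a worker process: cut one labelled segment out of the converted WAV
# with ffmpeg, then keep it only if the audio is long enough for its
# transcript and within the [MIN_SECS, MAX_SECS] duration bounds.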
def one_sample(sample):
    file_size = -1
    frames = 0

    audio_source = sample[0]
    target_dir = sample[1]
    dataset_basename = sample[2]

    start_time = sample[3]
    duration = sample[4]
    label = label_filter_fun(sample[5])
    sample_id = sample[6]

    _wav_filename = os.path.basename(audio_source.replace(".wav", "_{:06}.wav".format(sample_id)))
    wav_fullname = os.path.join(target_dir, dataset_basename, _wav_filename)

    if not os.path.exists(wav_fullname):
        subprocess.check_output(["ffmpeg", "-i", audio_source, "-ss", str(start_time), "-t", str(duration), "-c", "copy", wav_fullname], stdin=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    file_size = os.path.getsize(wav_fullname)
    frames = int(subprocess.check_output(["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT))

    _counter = get_counter()
    _rows = []

    if file_size == -1:
        # Excluding samples that failed upon conversion
        _counter["failed"] += 1
    elif label is None:
        # Excluding samples that failed on label validation
        _counter["invalid_label"] += 1
    elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)):
        # Excluding samples that are too short to fit the transcript
        _counter["too_short"] += 1
    elif frames/SAMPLE_RATE < MIN_SECS:
        # Excluding samples that are too short
        _counter["too_short"] += 1
    elif frames/SAMPLE_RATE > MAX_SECS:
        # Excluding very long samples to keep a reasonable batch-size
        _counter["too_long"] += 1
    else:
        # This one is good - keep it for the target CSV
        _rows.append((os.path.join(dataset_basename, _wav_filename), file_size, label))
        _counter["imported_time"] += frames
    _counter["all"] += 1
    _counter["total_time"] += frames

    return (_counter, _rows)

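# Each "row" element of the XML carries "timestamp", "timedur" and the spoken
# text. Consecutive rows are merged into a single sample while they remain
# contiguous (within rel_tol) and under MAX_SECS, then cut in parallel via
# one_sample().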
def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
    dataset_basename = os.path.splitext(os.path.split(xml_file)[1])[0]
    wav_root = os.path.join(target_dir, dataset_basename)
    if not os.path.exists(wav_root):
        os.makedirs(wav_root)

    source_frames = int(subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT))
    print("Source audio length: %s" % secs_to_hours(source_frames / SAMPLE_RATE))

    # Get audio segment boundaries and transcript for each sentence in the XML
    samples = []
    tree = ET.parse(xml_file)
    root = tree.getroot()
    seq_id = 0
    this_time = 0.0
    this_duration = 0.0
    prev_time = 0.0
    prev_duration = 0.0
    this_text = ""
    for child in root:
        if child.tag == "row":
            cur_time = float(child.attrib["timestamp"])
            cur_duration = float(child.attrib["timedur"])
            cur_text = child.text

            if this_time == 0.0:
                this_time = cur_time

            delta = cur_time - (prev_time + prev_duration)
            # rel_tol value is made from trial/error to try and compromise between:
            # - cutting enough to skip missing words
            # - not too short, not too long sentences
            is_close = math.isclose(cur_time, this_time + this_duration, rel_tol=rel_tol)
            is_short = ((this_duration + cur_duration + delta) < MAX_SECS)

            # when the previous element is close enough **and** this does not
            # go over MAX_SECS, we append content
            if (is_close and is_short):
                this_duration += cur_duration + delta
                this_text += cur_text
            else:
                samples.append((audio_source, target_dir, dataset_basename, this_time, this_duration, this_text, seq_id))

                this_time = cur_time
                this_duration = cur_duration
                this_text = cur_text

                seq_id += 1

            prev_time = cur_time
            prev_duration = cur_duration

    # Keep track of how many samples are good vs. problematic
    _counter = get_counter()
    num_samples = len(samples)
    _rows = []

    print("Processing XML data: {}".format(xml_file))
    pool = Pool()
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        _counter += processed[0]
        _rows += processed[1]
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    imported_samples = get_imported_samples(_counter)
    assert _counter["all"] == num_samples
    assert len(_rows) == imported_samples

    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print("Import efficiency: %.1f%%" % ((_counter["total_time"] / source_frames)*100))
    print("")

    return _counter, _rows

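# Convert the source MP3 once to 16 kHz mono 16-bit PCM; segments are later
# cut from this WAV. SoX failures are ignored here, so a broken source file
# only costs its own samples.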
def _maybe_convert_wav(mp3_filename, _wav_filename):
    if not os.path.exists(_wav_filename):
        print("Converting {} to WAV file: {}".format(mp3_filename, _wav_filename))
        transformer = sox.Transformer()
        transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH)
        try:
            transformer.build(mp3_filename, _wav_filename)
        except sox.core.SoxError:
            pass

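# Deterministic 80/10/10 split: the row index modulo 10 routes 0 to test,
# 1 to dev, and everything else to train.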
def write_general_csv(target_dir, _rows, _counter):
    target_csv_template = os.path.join(target_dir, "ccpmf_{}.csv")
    with open(target_csv_template.format("train"), "w") as train_csv_file:  # 80%
        with open(target_csv_template.format("dev"), "w") as dev_csv_file:  # 10%
            with open(target_csv_template.format("test"), "w") as test_csv_file:  # 10%
                train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
                train_writer.writeheader()
                dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
                dev_writer.writeheader()
                test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
                test_writer.writeheader()

                bar = progressbar.ProgressBar(max_value=len(_rows), widgets=SIMPLE_BAR)
                for i, item in enumerate(bar(_rows)):
                    i_mod = i % 10
                    if i_mod == 0:
                        writer = test_writer
                    elif i_mod == 1:
                        writer = dev_writer
                    else:
                        writer = train_writer
                    writer.writerow({"wav_filename": item[0], "wav_filesize": item[1], "transcript": item[2]})

    print("")
    print("~~~~ FINAL STATISTICS ~~~~")
    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print("~~~~ (FINAL STATISTICS) ~~~~")
    print("")

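# CLI entry point: download and verify the release, pair MP3/XML files,
# convert audio, slice samples, and emit ccpmf_{train,dev,test}.csv.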
if __name__ == "__main__":
    PARSER = get_importers_parser(description="Import XML from Conference Centre for Economics, France")
    PARSER.add_argument("target_dir", help="Destination directory")
    PARSER.add_argument("--filter_alphabet", help="Exclude samples with characters not in provided alphabet")
    PARSER.add_argument("--normalize", action="store_true", help="Converts diacritic characters to their base ones")

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)
    ALPHABET = Alphabet(PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
        if PARAMS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = maybe_normalize(label)
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                label = None
        return label

    dataset_sources = _download_and_preprocess_data(csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir)
    sources_root_dir = os.path.dirname(dataset_sources)
    all_counter = get_counter()
    all_rows = []
    with open(dataset_sources, "r") as sources:
        for line in sources.readlines():
            d = line.split(",")
            this_xml = os.path.join(sources_root_dir, d[0])
            this_mp3 = os.path.join(sources_root_dir, d[1])
            this_rel = float(d[2])

            wav_filename = os.path.join(sources_root_dir, os.path.splitext(os.path.basename(this_mp3))[0] + ".wav")
            _maybe_convert_wav(this_mp3, wav_filename)
            counter, rows = _maybe_import_data(this_xml, wav_filename, sources_root_dir, this_rel)

            all_counter += counter
            all_rows += rows
    write_general_csv(sources_root_dir, _counter=all_counter, _rows=all_rows)
@@ -19,7 +19,7 @@ def maybe_download(archive_name, target_dir, archive_url):
     total_size = int(req.headers.get('content-length', 0))
     done = 0
     with open(archive_path, 'wb') as f:
-        bar = progressbar.ProgressBar(max_value=total_size, widgets=SIMPLE_BAR)
+        bar = progressbar.ProgressBar(max_value=total_size if total_size > 0 else progressbar.UnknownLength, widgets=SIMPLE_BAR)
         for data in req.iter_content(1024*1024):
             done += len(data)
             f.write(data)