From 609120f3da80d9622fc14803b05faf4bf476013a Mon Sep 17 00:00:00 2001
From: Nicolas Panel <nicolas.panel@live.fr>
Date: Sun, 14 Oct 2018 13:48:59 +0200
Subject: [PATCH 1/2] [TrainingSpeech importer] stick to data/alphabet.txt

aims to generate english-complatible dataset since no FR alphabet.txt for now
see https://github.com/mozilla/DeepSpeech/pull/1599#issuecomment-426544379 for more info
---
 bin/import_ts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/import_ts.py b/bin/import_ts.py
index a2d061ca..51d91153 100755
--- a/bin/import_ts.py
+++ b/bin/import_ts.py
@@ -11,6 +11,7 @@ import sys
 sys.path.insert(1, os.path.join(sys.path[0], '..'))
 
 import csv
+import unidecode
 import zipfile
 
 from os import path
@@ -96,7 +97,7 @@ def cleanup_transcript(text):
     text = text.replace('’', "'").replace('\u00A0', ' ')
     text = PUNCTUATIONS_REG.sub(' ', text)
     text = MULTIPLE_SPACES_REG.sub(' ', text)
-    return text.strip().lower()
+    return unidecode.unidecode(text).strip().lower()
 
 
 if __name__ == "__main__":

From 61d9193ce52503b52954736d1e60761c32d0b339 Mon Sep 17 00:00:00 2001
From: nicolaspanel <nicolas.panel@live.fr>
Date: Mon, 15 Oct 2018 10:18:14 +0200
Subject: [PATCH 2/2] [TrainingSpeech importer] add `--english-compatible`
 option

---
 bin/import_ts.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/bin/import_ts.py b/bin/import_ts.py
index 51d91153..a6f1ff00 100755
--- a/bin/import_ts.py
+++ b/bin/import_ts.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import, division, print_function
 
 # Make sure we can import stuff from util/
 # This script needs to be run from the root of the DeepSpeech repository
+import argparse
 import os
 import re
 import sys
@@ -26,7 +27,7 @@ ARCHIVE_DIR_NAME = 'ts_' + ARCHIVE_NAME
 ARCHIVE_URL = 'https://s3.eu-west-3.amazonaws.com/audiocorp/releases/' + ARCHIVE_NAME + '.zip'
 
 
-def _download_and_preprocess_data(target_dir):
+def _download_and_preprocess_data(target_dir, english_compatible=False):
     # Making path absolute
     target_dir = path.abspath(target_dir)
     # Conditionally download data
@@ -34,7 +35,8 @@ def _download_and_preprocess_data(target_dir):
     # Conditionally extract archive data
     _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
     # Conditionally convert TrainingSpeech data to DeepSpeech CSVs and wav
-    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
+    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME, english_compatible=english_compatible)
+
 
 def _maybe_extract(target_dir, extracted_data, archive_path):
     # If target_dir/extracted_data does not exist, extract archive in target_dir
@@ -48,7 +50,8 @@ def _maybe_extract(target_dir, extracted_data, archive_path):
     else:
         print('Found directory "%s" - not extracting it from archive.' % archive_path)
 
-def _maybe_convert_sets(target_dir, extracted_data):
+
+def _maybe_convert_sets(target_dir, extracted_data, english_compatible=False):
     extracted_dir = path.join(target_dir, extracted_data)
     # override existing CSV with normalized one
     target_csv_template = os.path.join(target_dir, 'ts_' + ARCHIVE_NAME + '_{}.csv')
@@ -71,7 +74,7 @@ def _maybe_convert_sets(target_dir, extracted_data):
                 test_writer.writeheader()
 
                 for i, item in enumerate(data):
-                    transcript = validate_label(cleanup_transcript(item['text']))
+                    transcript = validate_label(cleanup_transcript(item['text'], english_compatible=english_compatible))
                     if not transcript:
                         continue
                     wav_filename = os.path.join(target_dir, extracted_data, item['path'])
@@ -93,12 +96,22 @@ PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
 MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
 
 
-def cleanup_transcript(text):
+def cleanup_transcript(text, english_compatible=False):
     text = text.replace('’', "'").replace('\u00A0', ' ')
     text = PUNCTUATIONS_REG.sub(' ', text)
     text = MULTIPLE_SPACES_REG.sub(' ', text)
-    return unidecode.unidecode(text).strip().lower()
+    if english_compatible:
+        text = unidecode.unidecode(text)
+    return text.strip().lower()
+
+
+def handle_args():
+    parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
+    parser.add_argument(dest='target_dir')
+    parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.')
+    return parser.parse_args()
 
 
 if __name__ == "__main__":
-    _download_and_preprocess_data(sys.argv[1])
+    cli_args = handle_args()
+    _download_and_preprocess_data(cli_args.target_dir, cli_args.english_compatible)