Merge pull request #1874 from mozilla/check-chars

--alphabet-format flag
2019-02-13 01:02:45 +01:00 · 2019-02-13 01:02:45 +01:00 · 4b2e3bc714
commit 4b2e3bc714
parent c8b1e71fed a56d968b73
1 changed file with 22 additions and 15 deletions
--- a/util/check_characters.py
+++ b/util/check_characters.py
@@ -2,36 +2,39 @@ import csv
 import sys
 import glob

-'''
+"""
 Usage: $ python3 check_characters.py "INFILE"
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "/home/data/*.csv" 
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "/home/data/french.csv" 
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "train.csv test.csv" 
+ e.g.  $ python3 check_characters.py -csv /home/data/french.csv
+ e.g.  $ python3 check_characters.py -csv ../train.csv,../test.csv 
+ e.g.  $ python3 check_characters.py -alpha -csv ../train.csv 

 Point this script to your transcripts, and it returns 
 to the terminal the unique set of characters in those 
 files (combined).

-These files are assumed to be comma delimited, 
-with the transcript being the third field.
+These files are assumed to be CSV files, with the transcript being the third field.

 The script simply reads all the text from all the files, 
 storing a set of unique characters that were seen 
 along the way.
-'''
+"""
+import argparse
+import os

-inFiles=sys.argv[1]
-if "*" in inFiles:
-    inFiles = glob.glob(inFiles)
-else:
-    inFiles = inFiles.split()
+parser = argparse.ArgumentParser()
+
+parser.add_argument("-csv", "--csv-files", help="Str. Filenames as a comma separated list", required=True)
+parser.add_argument("-alpha", "--alphabet-format",help="Bool. Print in format for alphabet.txt",action="store_true")
+parser.set_defaults(alphabet_format=False)
+args = parser.parse_args()
+inFiles = [os.path.abspath(i) for i in args.csv_files.split(",")]

 print("### Reading in the following transcript files: ###")
 print(inFiles)

 allText = set()
 for inFile in (inFiles):
-    with open(inFile, 'r') as csvFile:
+    with open(inFile, "r") as csvFile:
        reader = csv.reader(csvFile)
        try:
            for row in reader:
@@ -43,5 +46,9 @@ for inFile in (inFiles):
            csvFile.close()

 print("### The following unique characters were found in your transcripts: ###")
-print(list(allText))
-print("### All these characters should be in your data/alphabet.txt file ###")
+if args.alphabet_format:
+    for char in list(allText):
+        print(char)
+    print("### ^^^ You can copy-paste these into data/alphabet.txt ###")
+else:
+    print(list(allText))