Merge pull request #1874 from mozilla/check-chars

--alphabet-format flag
2019-02-13 01:02:45 +01:00 · 2019-02-13 01:02:45 +01:00 · 4b2e3bc714
commit 4b2e3bc714
parent c8b1e71fed a56d968b73
1 changed file with 22 additions and 15 deletions
--- a/util/check_characters.py
+++ b/util/check_characters.py
@@ -2,36 +2,39 @@ import csv
 import sys
 import glob
-'''
+"""
 Usage: $ python3 check_characters.py "INFILE"
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "/home/data/*.csv" 
+ e.g.  $ python3 check_characters.py -csv /home/data/french.csv
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "/home/data/french.csv" 
+ e.g.  $ python3 check_characters.py -csv ../train.csv,../test.csv 
- e.g.  $ python3 ../DeepSpeech/util/check_characters.py "train.csv test.csv" 
+ e.g.  $ python3 check_characters.py -alpha -csv ../train.csv 
 Point this script to your transcripts, and it returns 
 to the terminal the unique set of characters in those 
 files (combined).
-These files are assumed to be comma delimited, 
+These files are assumed to be csv, with the transcript being the third field.
-with the transcript being the third field.
 The script simply reads all the text from all the files, 
 storing a set of unique characters that were seen 
 along the way.
-'''
+"""
 import argparse
 import os
-inFiles=sys.argv[1]
+parser = argparse.ArgumentParser()
-if "*" in inFiles:
+
-    inFiles = glob.glob(inFiles)
+parser.add_argument("-csv", "--csv-files", help="Str. Filenames as a comma separated list", required=True)
-else:
+parser.add_argument("-alpha", "--alphabet-format",help="Bool. Print in format for alphabet.txt",action="store_true")
-    inFiles = inFiles.split()
+parser.set_defaults(alphabet_format=False)
 args = parser.parse_args()
 inFiles = [os.path.abspath(i) for i in args.csv_files.split(",")]
 print("### Reading in the following transcript files: ###")
 print(inFiles)
 allText = set()
 for inFile in (inFiles):
-    with open(inFile, 'r') as csvFile:
+    with open(inFile, "r") as csvFile:
        reader = csv.reader(csvFile)
        try:
            for row in reader:
@@ -43,5 +46,9 @@ for inFile in (inFiles):
            csvFile.close()
 print("### The following unique characters were found in your transcripts: ###")
 if args.alphabet_format:
    for char in list(allText):
        print(char)
    print("### ^^^ You can copy-paste these into data/alphabet.txt ###")
 else:
    print(list(allText))
 print("### All these characters should be in your data/alphabet.txt file ###")