Merge pull request #1874 from mozilla/check-chars
--alphabet-format flag
This commit is contained in:
commit
4b2e3bc714
@ -2,36 +2,39 @@ import csv
|
|||||||
import sys
|
import sys
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
'''
|
"""
|
||||||
Usage: $ python3 check_characters.py "INFILE"
|
Usage: $ python3 check_characters.py -csv "INFILE"
|
||||||
e.g. $ python3 ../DeepSpeech/util/check_characters.py "/home/data/*.csv"
|
e.g. $ python3 check_characters.py -csv /home/data/french.csv
|
||||||
e.g. $ python3 ../DeepSpeech/util/check_characters.py "/home/data/french.csv"
|
e.g. $ python3 check_characters.py -csv ../train.csv,../test.csv
|
||||||
e.g. $ python3 ../DeepSpeech/util/check_characters.py "train.csv test.csv"
|
e.g. $ python3 check_characters.py -alpha -csv ../train.csv
|
||||||
|
|
||||||
Point this script to your transcripts, and it returns
|
Point this script to your transcripts, and it returns
|
||||||
to the terminal the unique set of characters in those
|
to the terminal the unique set of characters in those
|
||||||
files (combined).
|
files (combined).
|
||||||
|
|
||||||
These files are assumed to be comma delimited,
|
These files are assumed to be comma-delimited CSV, with the transcript in the third field.
|
||||||
with the transcript being the third field.
|
|
||||||
|
|
||||||
The script simply reads all the text from all the files,
|
The script simply reads all the text from all the files,
|
||||||
storing a set of unique characters that were seen
|
storing a set of unique characters that were seen
|
||||||
along the way.
|
along the way.
|
||||||
'''
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
inFiles=sys.argv[1]
|
parser = argparse.ArgumentParser()
|
||||||
if "*" in inFiles:
|
|
||||||
inFiles = glob.glob(inFiles)
|
parser.add_argument("-csv", "--csv-files", help="Str. Filenames as a comma separated list", required=True)
|
||||||
else:
|
parser.add_argument("-alpha", "--alphabet-format",help="Bool. Print in format for alphabet.txt",action="store_true")
|
||||||
inFiles = inFiles.split()
|
parser.set_defaults(alphabet_format=False)
|
||||||
|
args = parser.parse_args()
|
||||||
|
inFiles = [os.path.abspath(i) for i in args.csv_files.split(",")]
|
||||||
|
|
||||||
print("### Reading in the following transcript files: ###")
|
print("### Reading in the following transcript files: ###")
|
||||||
print(inFiles)
|
print(inFiles)
|
||||||
|
|
||||||
allText = set()
|
allText = set()
|
||||||
for inFile in (inFiles):
|
for inFile in (inFiles):
|
||||||
with open(inFile, 'r') as csvFile:
|
with open(inFile, "r") as csvFile:
|
||||||
reader = csv.reader(csvFile)
|
reader = csv.reader(csvFile)
|
||||||
try:
|
try:
|
||||||
for row in reader:
|
for row in reader:
|
||||||
@ -43,5 +46,9 @@ for inFile in (inFiles):
|
|||||||
csvFile.close()
|
csvFile.close()
|
||||||
|
|
||||||
print("### The following unique characters were found in your transcripts: ###")
|
print("### The following unique characters were found in your transcripts: ###")
|
||||||
|
if args.alphabet_format:
|
||||||
|
for char in list(allText):
|
||||||
|
print(char)
|
||||||
|
print("### ^^^ You can copy-paste these into data/alphabet.txt ###")
|
||||||
|
else:
|
||||||
print(list(allText))
|
print(list(allText))
|
||||||
print("### All these characters should be in your data/alphabet.txt file ###")
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user