96 lines
3.2 KiB
Python
96 lines
3.2 KiB
Python
"""
|
|
Usage:
|
|
From within the training/ directory, call this script as a module:
|
|
|
|
$ python3 -m coqui_stt_training.util.check_characters "INFILE"
|
|
e.g. $ python3 -m coqui_stt_training.util.check_characters -csv /home/data/french.csv
|
|
e.g. $ python3 -m coqui_stt_training.util.check_characters -csv ../train.csv,../test.csv
|
|
e.g. $ python3 -m coqui_stt_training.util.check_characters -alpha -csv ../train.csv
|
|
|
|
Point this script to your transcripts, and it returns
|
|
to the terminal the unique set of characters in those
|
|
files (combined).
|
|
|
|
These files are assumed to be csv, with the transcript being the third field.
|
|
|
|
The script simply reads all the text from all the files,
|
|
storing a set of unique characters that were seen
|
|
along the way.
|
|
"""
|
|
import argparse
|
|
import csv
|
|
import os
|
|
import sys
|
|
import unicodedata
|
|
|
|
from .io import open_remote
|
|
|
|
|
|
def main():
    """Print the unique characters found in the transcripts of CSV files.

    Command-line options (parsed from ``sys.argv`` with argparse):

    * ``-csv`` / ``--csv-files`` (required): comma separated list of CSV
      files. The first row of each file is skipped as a header, and the
      third column of every remaining row is read as the transcript.
    * ``-alpha`` / ``--alphabet-format``: print one character per line
      instead of a single Python list.
    * ``-unicode`` / ``--disable-unicode-variants``: skip the check that
      every transcript is unchanged by NFKC normalization.

    The characters are printed in sorted order so that repeated runs over
    the same data give identical output.

    Calls ``sys.exit(-1)`` when a row has fewer than three columns or, unless
    the check is disabled, when a transcript differs from its NFKC-normalized
    form. Exits through ``parser.error`` when no file name is given.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-csv",
        "--csv-files",
        help="Str. Filenames as a comma separated list",
        required=True,
    )
    parser.add_argument(
        "-alpha",
        "--alphabet-format",
        help="Bool. Print in format for alphabet.txt",
        action="store_true",
    )
    parser.add_argument(
        "-unicode",
        "--disable-unicode-variants",
        help="Bool. DISABLE check for unicode consistency (use with --alphabet-format)",
        action="store_true",
    )
    args = parser.parse_args()

    # Tolerate "a.csv, b.csv" and trailing commas: surrounding whitespace is
    # not part of the file name, and an empty entry is not a file at all.
    in_files = [name.strip() for name in args.csv_files.split(",") if name.strip()]
    if not in_files:
        parser.error("--csv-files must name at least one file")

    print("### Reading in the following transcript files: ###")
    print("### {} ###".format(in_files))

    all_text = set()
    for in_file in in_files:
        # The with-statement closes the file on every exit path (including
        # sys.exit below), so no explicit close() is needed.
        with open_remote(in_file, "r") as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the file header (i.e. "transcript")
            try:
                for row in reader:
                    # Raises IndexError when the row has fewer than 3 columns.
                    transcript = row[2]
                    if not args.disable_unicode_variants:
                        normalized = unicodedata.normalize("NFKC", transcript)
                        if transcript != normalized:
                            print(
                                "Your input file",
                                in_file,
                                "contains at least one transcript with unicode chars on more than one code-point: '{}'. Consider using NFKC normalization: unicodedata.normalize('NFKC', str).".format(
                                    transcript
                                ),
                            )
                            sys.exit(-1)
                    all_text.update(transcript)
            except IndexError:
                print(
                    "Your input file",
                    in_file,
                    "is not formatted properly. Check if there are 3 columns with the 3rd containing the transcript",
                )
                sys.exit(-1)

    print("### The following unique characters were found in your transcripts: ###")
    # Set iteration order is arbitrary; sort for reproducible output.
    characters = sorted(all_text)
    if args.alphabet_format:
        for char in characters:
            print(char)
        print("### ^^^ You can copy-paste these into data/alphabet.txt ###")
    else:
        print(characters)
|
|
|
|
|
|
# Run the check only when this file is executed (e.g. via ``python3 -m``),
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|