* Redo remote I/O changes once more; this time without messing with taskcluster
* Add bin changes
* Fix merge-induced issue?
* For the interleaved case with multiple collections, unpack audio on the fly

  To reproduce the previous failure:

  rm data/smoke_test/ldc93s1.csv
  rm data/smoke_test/ldc93s1.sdb
  rm -rf /tmp/ldc93s1_cache_sdb_csv
  rm -rf /tmp/ckpt_sdb_csv
  rm -rf /tmp/train_sdb_csv
  ./bin/run-tc-ldc93s1_new_sdb_csv.sh 109 16000

  python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
    --train_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --train_batch_size 1 \
    --feature_cache /tmp/ldc93s1_cache_sdb_csv \
    --dev_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --dev_batch_size 1 \
    --test_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --test_batch_size 1 \
    --n_hidden 100 --epochs 109 --max_to_keep 1 --checkpoint_dir /tmp/ckpt_sdb_csv \
    --learning_rate 0.001 --dropout_rate 0.05 --export_dir /tmp/train_sdb_csv \
    --scorer_path data/smoke_test/pruned_lm.scorer --audio_sample_rate 16000

* Attempt to preserve length information with a wrapper around `map()`… this gets pretty python-y (see the sketch after this list)
* Call the right `__next__()`
* Properly implement the rest of the map wrappers here……
* Fix trailing whitespace situation and other linter complaints
* Remove data accidentally checked in
* Fix overlay augmentations
* Wavs must be open in rb mode if we're passing in an external file pointer -- this confused me
* Lint whitespace
* Revert "Fix trailing whitespace situation and other linter complaints"

  This reverts commit c3c45397a2f98e9b00d00c18c4ced4fc52475032.

* Fix linter issue but without such an aggressive diff
* Move unpack_maybe into sample_collections
* Use unpack_maybe in place of duplicate lambda
* Fix confusing comment
* Add clarifying comment for on-the-fly unpacking
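One of the trickier items above is the length-preserving wrapper around `map()`: a plain `map()` returns an iterator, so any code that needs `len()` on a mapped sample collection (batching, progress reporting) loses that information. A minimal sketch of the idea, assuming the source collection is sized and indexable; the class name `LenMap` and its exact interface here are illustrative, not necessarily what the change itself uses:

    class LenMap:
        """Lazily apply fn to each item of a collection while preserving len()."""

        def __init__(self, fn, source):
            self.fn = fn
            self.source = source  # any sized, indexable collection

        def __len__(self):
            # unlike map(), the wrapper can still report the collection size
            return len(self.source)

        def __getitem__(self, index):
            return self.fn(self.source[index])

        def __iter__(self):
            for item in self.source:
                yield self.fn(item)

The same lazy principle underlies the on-the-fly unpacking mentioned in the list: samples stay packed until they are actually consumed.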
"""
|
|
Usage: $ python3 check_characters.py "INFILE"
|
|
e.g. $ python3 check_characters.py -csv /home/data/french.csv
|
|
e.g. $ python3 check_characters.py -csv ../train.csv,../test.csv
|
|
e.g. $ python3 check_characters.py -alpha -csv ../train.csv
|
|
|
|
Point this script to your transcripts, and it returns
|
|
to the terminal the unique set of characters in those
|
|
files (combined).
|
|
|
|
These files are assumed to be csv, with the transcript being the third field.
|
|
|
|
The script simply reads all the text from all the files,
|
|
storing a set of unique characters that were seen
|
|
along the way.
|
|
"""
import argparse
import csv
import os
import sys
import unicodedata

from .io import open_remote


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-csv", "--csv-files", help="Str. Filenames as a comma separated list", required=True)
    parser.add_argument("-alpha", "--alphabet-format", help="Bool. Print in format for alphabet.txt", action="store_true")
    parser.add_argument("-unicode", "--disable-unicode-variants", help="Bool. DISABLE check for unicode consistency (use with --alphabet-format)", action="store_true")
    args = parser.parse_args()
    in_files = args.csv_files.split(",")

    print("### Reading in the following transcript files: ###")
    print("### {} ###".format(in_files))

    all_text = set()
    for in_file in in_files:
        with open_remote(in_file, "r") as csv_file:
            reader = csv.reader(csv_file)
            try:
                next(reader, None)  # skip the file header (i.e. "transcript")
                for row in reader:
                    # the transcript lives in the third CSV column (index 2)
                    if not args.disable_unicode_variants:
                        unicode_transcript = unicodedata.normalize("NFKC", row[2])
                        if row[2] != unicode_transcript:
                            print("Your input file", in_file, "contains at least one transcript with unicode chars on more than one code-point: '{}'. Consider using NFKC normalization: unicodedata.normalize('NFKC', str).".format(row[2]))
                            sys.exit(-1)
                    all_text |= set(row[2])
            except IndexError:
                print("Your input file", in_file, "is not formatted properly. Check if there are 3 columns with the 3rd containing the transcript")
                sys.exit(-1)
            finally:
                csv_file.close()

    print("### The following unique characters were found in your transcripts: ###")
    if args.alphabet_format:
        for char in list(all_text):
            print(char)
        print("### ^^^ You can copy-paste these into data/alphabet.txt ###")
    else:
        print(list(all_text))


if __name__ == '__main__':
    main()
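For reference, a hypothetical run of the script. The CSV layout shown (wav_filename, wav_filesize, transcript) follows the usual DeepSpeech convention of the transcript being the third column; the file name, transcript text, and character ordering are made up for illustration, and the set iteration order is arbitrary:

    $ cat train.csv
    wav_filename,wav_filesize,transcript
    /data/clips/a.wav,48000,she had your dark suit
    $ python3 check_characters.py -alpha -csv train.csv
    ### Reading in the following transcript files: ###
    ### ['train.csv'] ###
    ### The following unique characters were found in your transcripts: ###
    s
    h
    e
    ...
    ### ^^^ You can copy-paste these into data/alphabet.txt ###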