Implement some change request.

This commit is contained in:
Daniel 2020-04-01 17:15:52 +02:00
parent b27e0347b1
commit 38afe38f0b

View File

@@ -1,4 +1,6 @@
import argparse
import gzip
import io
import os
import subprocess
from collections import Counter
@@ -10,11 +12,13 @@ def convert_and_filter_topk(args):
""" Convert to lowercase, count word occurrences and save top-k words to a file """
counter = Counter()
data_lower = os.path.join(args.output_dir, "lower.txt")
data_lower = os.path.join(args.output_dir, "lower.txt.gz")
print("\nConverting to lowercase and counting word occurrences ...")
with open(data_lower, "w+", encoding="utf8") as file_out:
with open(args.input_txt, encoding="utf8") as file_in:
with io.TextIOWrapper(
io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
) as file_out:
with open(args.input_txt, encoding="utf-8") as file_in:
for line in progressbar.progressbar(file_in):
line_lower = line.lower()
counter.update(line_lower.split())
@@ -127,7 +131,7 @@ def main():
)
parser.add_argument(
"--top_k",
help="Use top_k most frequent words for the vocab.txt file",
help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
type=int,
default=500000,
)
@@ -151,7 +155,7 @@ def main():
)
parser.add_argument(
"--arpa_prune",
help='ARPA pruning parameters. Separate values with "|"',
help="ARPA pruning parameters. Separate values with '|'",
type=str,
default="0|0|1",
)