From 38afe38f0b7cef5569d70f0643cbd98fb587cd15 Mon Sep 17 00:00:00 2001
From: Daniel <daniel@mail.de>
Date: Wed, 1 Apr 2020 17:15:52 +0200
Subject: [PATCH] Implement change requests: gzip lower.txt output, clarify help texts.

---
 data/lm/generate_lm.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py
index 410dd8a4..755d6921 100644
--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@@ -1,4 +1,6 @@
 import argparse
+import gzip
+import io
 import os
 import subprocess
 from collections import Counter
@@ -10,11 +12,13 @@ def convert_and_filter_topk(args):
     """ Convert to lowercase, count word occurrences and save top-k words to a file """
 
     counter = Counter()
-    data_lower = os.path.join(args.output_dir, "lower.txt")
+    data_lower = os.path.join(args.output_dir, "lower.txt.gz")
 
     print("\nConverting to lowercase and counting word occurrences ...")
-    with open(data_lower, "w+", encoding="utf8") as file_out:
-        with open(args.input_txt, encoding="utf8") as file_in:
+    with io.TextIOWrapper(
+        io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
+    ) as file_out:
+        with open(args.input_txt, encoding="utf-8") as file_in:
             for line in progressbar.progressbar(file_in):
                 line_lower = line.lower()
                 counter.update(line_lower.split())
@@ -127,7 +131,7 @@ def main():
     )
     parser.add_argument(
         "--top_k",
-        help="Use top_k most frequent words for the vocab.txt file",
+        help="Use top_k most frequent words for the vocab.txt file. These will be used to filter the ARPA file.",
         type=int,
         default=500000,
     )
@@ -151,7 +155,7 @@ def main():
     )
     parser.add_argument(
         "--arpa_prune",
-        help='ARPA pruning parameters. Separate values with "|"',
+        help="ARPA pruning parameters. Separate values with '|'",
         type=str,
         default="0|0|1",
     )