Read from input.txt.gz again.
This commit is contained in:
parent
38afe38f0b
commit
e862cd41db
@ -18,12 +18,23 @@ def convert_and_filter_topk(args):
|
|||||||
with io.TextIOWrapper(
|
with io.TextIOWrapper(
|
||||||
io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
|
io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
|
||||||
) as file_out:
|
) as file_out:
|
||||||
with open(args.input_txt, encoding="utf-8") as file_in:
|
|
||||||
|
# Open the input file either from input.txt or input.txt.gz
|
||||||
|
_, file_extension = os.path.splitext(args.input_txt)
|
||||||
|
if file_extension == ".gz":
|
||||||
|
file_in = io.TextIOWrapper(
|
||||||
|
io.BufferedWriter(gzip.open(args.input_txt)), encoding="utf-8"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
file_in = open(args.input_txt, encoding="utf-8")
|
||||||
|
|
||||||
for line in progressbar.progressbar(file_in):
|
for line in progressbar.progressbar(file_in):
|
||||||
line_lower = line.lower()
|
line_lower = line.lower()
|
||||||
counter.update(line_lower.split())
|
counter.update(line_lower.split())
|
||||||
file_out.write(line_lower)
|
file_out.write(line_lower)
|
||||||
|
|
||||||
|
file_in.close()
|
||||||
|
|
||||||
# Save top-k words
|
# Save top-k words
|
||||||
print("\nSaving top {} words ...".format(args.top_k))
|
print("\nSaving top {} words ...".format(args.top_k))
|
||||||
top_counter = counter.most_common(args.top_k)
|
top_counter = counter.most_common(args.top_k)
|
||||||
@ -122,7 +133,7 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--input_txt",
|
"--input_txt",
|
||||||
help="File path to a .txt with sample sentences",
|
help="Path to a file.txt or file.txt.gz with sample sentences",
|
||||||
type=str,
|
type=str,
|
||||||
required=True,
|
required=True,
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user