Add word statistics output when saving the top-k vocabulary.

This commit is contained in:
Daniel 2020-03-03 16:49:52 +01:00
parent c9a433486f
commit c6109c30f3

View File

@ -24,13 +24,29 @@ def convert_and_filter_topk(args):
file_out.write(line_lower)
# Save top-k words
print('\nSaving top {} words'.format(args.top_k))
vocab_str = '\n'.join(word for word, count in counter.most_common(args.top_k))
print('\nSaving top {} words ...'.format(args.top_k))
top_counter = counter.most_common(args.top_k)
vocab_str = '\n'.join(word for word, count in top_counter)
vocab_path = 'vocab-{}.txt'.format(args.top_k)
vocab_path = os.path.join(args.output_dir, vocab_path)
with open(vocab_path, 'w+') as file:
file.write(vocab_str)
print('\nCalculating word statistics ...'.format(args.top_k))
total_words = sum(counter.values())
print(' Your text file has {} words in total'.format(total_words))
print(' It has {} unique words'.format(len(counter)))
top_words_sum = sum(count for word, count in top_counter)
word_fraction = (top_words_sum / total_words) * 100
print(' Your top-{} words are {:.4f} percent of all words'.format(args.top_k, word_fraction))
print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
last_word, last_count = top_counter[-1]
print(' The least common word in your top-k is "{}" with {} times'.format(last_word, last_count))
for i, (w, c) in enumerate(reversed(top_counter)):
if c > last_count:
print(' The first word with {} occurrences is "{}" at place {}'.format(c, w, len(top_counter) - 1 - i))
break
return data_lower, vocab_str