Filter LM by removing very rare words

This commit is contained in:
Reuben Morais 2019-11-13 17:38:40 +01:00
parent f18643b1b9
commit ad2769f479
3 changed files with 22 additions and 9 deletions

View File

@ -8,18 +8,27 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
import os
from urllib import request
from collections import Counter
# Grab corpus.
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
data_upper = '/tmp/upper.txt.gz'
request.urlretrieve(url, data_upper)
# Convert to lowercase and cleanup.
data_lower = '/tmp/lower.txt'
with open(data_lower, 'w', encoding='utf-8') as lower:
# Convert to lowercase and count word occurrences.
counter = Counter()
data_lower = '/tmp/lower.txt.gz'
with io.TextIOWrapper(io.BufferedWriter(gzip.open(data_lower, 'w')), encoding='utf-8') as lower:
with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
for line in upper:
lower.write(line.lower())
line_lower = line.lower()
counter.update(line_lower.split())
lower.write(line_lower)
# Create vocabulary file with top 500k words
vocab_path = '/tmp/vocab-500k.txt'
with open(vocab_path, 'w') as fout:
fout.write('\n'.join(word for word, count in counter.most_common(500000)))
# Build pruned LM.
lm_path = '/tmp/lm.arpa'
@ -30,12 +39,16 @@ lm.binary was generated from the LibriSpeech normalized LM training text, availa
--arpa {lm_path} \
--prune 0 0 0 1
# Filter LM using vocabulary.
filtered_path = '/tmp/lm_filtered.arpa'
!filter single model:{lm_path} {filtered_path} < {vocab_path}
# Quantize and produce trie binary.
binary_path = '/tmp/lm.binary'
!build_binary -a 255 \
-q 8 \
trie \
{lm_path} \
{filtered_path} \
{binary_path}
os.remove(lm_path)

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
size 1800894585
oid sha256:111bf908121c95e6121ebf7e600aad5762c49c9348c6bc652123a2dafd28587b
size 1728766378

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f2024f2e83b252df33b5d6c5cced8186abd2764ce7124970c15c0174034c3f2e
size 24480560
oid sha256:0281e5e784ffccb4aeae5e7d64099058a0c22e42dbb7aa2d3ef2fbbff53db3ab
size 12200736