Merge pull request #1543 from mozilla/update_lm
Update language model to a trie-based LM created from the LibriSpeech LM corpus
This commit is contained in:
commit
b9946fddab
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -1,3 +1,3 @@
|
||||
|
||||
*.binary filter=lfs diff=lfs merge=lfs -crlf
|
||||
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
|
||||
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
|
||||
|
46
data/lm/README.md
Normal file
46
data/lm/README.md
Normal file
@ -0,0 +1,46 @@
|
||||
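`lm.binary` was generated from the LibriSpeech normalized LM training text, available from [OpenSLR resource 11](http://www.openslr.org/11), following the recipe below. It is Jupyter notebook code: lines starting with `!` are shell commands run from the notebook, with `{name}` replaced by the value of the Python variable `name`, and they require the KenLM tools `lmplz` and `build_binary` to be on the `PATH`: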
|
||||
|
||||
```python
|
||||
import gzip
|
||||
import io
|
||||
import os
|
||||
|
||||
from urllib import request
|
||||
|
||||
# Grab corpus.
|
||||
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
|
||||
data_upper = '/tmp/upper.txt.gz'
|
||||
request.urlretrieve(url, data_upper)
|
||||
|
||||
# Convert to lowercase and clean up.
|
||||
data_lower = '/tmp/lower.txt'
|
||||
with open(data_lower, 'w', encoding='utf-8') as lower:
|
||||
with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
|
||||
for line in upper:
|
||||
lower.write(line.lower())
|
||||
|
||||
# Build pruned LM.
|
||||
lm_path = '/tmp/lm.arpa'
|
||||
!lmplz --order 5 \
|
||||
--temp_prefix /tmp/ \
|
||||
--memory 50% \
|
||||
--text {data_lower} \
|
||||
--arpa {lm_path} \
|
||||
--prune 0 0 0 1
|
||||
|
||||
# Quantize and produce trie binary.
|
||||
binary_path = '/tmp/lm.binary'
|
||||
!build_binary -a 255 \
|
||||
-q 8 \
|
||||
trie \
|
||||
{lm_path} \
|
||||
{binary_path}
|
||||
os.remove(lm_path)
|
||||
```
|
||||
|
||||
The trie was then generated from the list of unique words in the corpus (the `data/lm/vocab.txt` file in this repository; the commands below write it to `/tmp/vocab.txt`):
|
||||
|
||||
```bash
|
||||
tr -s '[[:space:]]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
|
||||
./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
|
||||
```
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a9da0224ae9baf92a32fa85dafd24c99b3837a67b880ade0a5a730449d15297b
|
||||
size 327902357
|
||||
oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
|
||||
size 1800894585
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c2f8f1d721eed0ae621160626e803925efa481c8156bb97e72013c0fbf879b75
|
||||
size 43550345
|
||||
oid sha256:00779e53cfcc1e170525b6cc6113096d3984036ddf84dd8f07ce275f7027c47d
|
||||
size 227692934
|
||||
|
94638
data/lm/vocab.txt
94638
data/lm/vocab.txt
File diff suppressed because it is too large
Load Diff
Binary file not shown.
529740
data/smoke_test/vocab.trie
529740
data/smoke_test/vocab.trie
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,7 @@
|
||||
|
||||
#include "kenlm/lm/model.hh"
|
||||
|
||||
typedef lm::ngram::ProbingModel Model;
|
||||
typedef lm::ngram::QuantArrayTrieModel Model;
|
||||
|
||||
struct KenLMBeamState {
|
||||
float language_model_score;
|
||||
|
@ -7,7 +7,7 @@ using namespace std;
|
||||
#include "trie_node.h"
|
||||
#include "alphabet.h"
|
||||
|
||||
typedef lm::ngram::ProbingModel Model;
|
||||
typedef lm::ngram::QuantArrayTrieModel Model;
|
||||
|
||||
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
|
||||
return model.GetVocabulary().Index(word);
|
||||
|
Loading…
Reference in New Issue
Block a user