Merge pull request #1543 from mozilla/update_lm
Update language model to a trie-based LM created from the LibriSpeech LM corpus
This commit is contained in:
commit
b9946fddab
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -1,3 +1,3 @@
|
|||||||
|
|
||||||
*.binary filter=lfs diff=lfs merge=lfs -crlf
|
*.binary filter=lfs diff=lfs merge=lfs -crlf
|
||||||
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
|
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
|
||||||
|
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
46
data/lm/README.md
Normal file
46
data/lm/README.md
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
`lm.binary` was generated from the LibriSpeech normalized LM training text, available from [OpenSLR resource 11](http://www.openslr.org/11), using the following recipe (Jupyter notebook code):
|
||||||
|
|
||||||
|
```python
|
||||||
|
import gzip
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
|
from urllib import request
|
||||||
|
|
||||||
|
# Grab corpus.
|
||||||
|
url = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
|
||||||
|
data_upper = '/tmp/upper.txt.gz'
|
||||||
|
request.urlretrieve(url, data_upper)
|
||||||
|
|
||||||
|
# Convert to lowercase and cleanup.
|
||||||
|
data_lower = '/tmp/lower.txt'
|
||||||
|
with open(data_lower, 'w', encoding='utf-8') as lower:
|
||||||
|
with io.TextIOWrapper(io.BufferedReader(gzip.open(data_upper)), encoding='utf8') as upper:
|
||||||
|
for line in upper:
|
||||||
|
lower.write(line.lower())
|
||||||
|
|
||||||
|
# Build pruned LM.
|
||||||
|
lm_path = '/tmp/lm.arpa'
|
||||||
|
!lmplz --order 5 \
|
||||||
|
--temp_prefix /tmp/ \
|
||||||
|
--memory 50% \
|
||||||
|
--text {data_lower} \
|
||||||
|
--arpa {lm_path} \
|
||||||
|
--prune 0 0 0 1
|
||||||
|
|
||||||
|
# Quantize and produce trie binary.
|
||||||
|
binary_path = '/tmp/lm.binary'
|
||||||
|
!build_binary -a 255 \
|
||||||
|
-q 8 \
|
||||||
|
trie \
|
||||||
|
{lm_path} \
|
||||||
|
{binary_path}
|
||||||
|
os.remove(lm_path)
|
||||||
|
```
|
||||||
|
|
||||||
|
The trie was then generated from the list of unique words in the corpus (data/lm/vocab.txt):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tr -s '[[:space:]]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
|
||||||
|
./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
|
||||||
|
```
|
@ -1,3 +1,3 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:a9da0224ae9baf92a32fa85dafd24c99b3837a67b880ade0a5a730449d15297b
|
oid sha256:e1fa6801b25912a3625f67e0f6cafcdacb24033be9fad5fa272152a0828d7193
|
||||||
size 327902357
|
size 1800894585
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:c2f8f1d721eed0ae621160626e803925efa481c8156bb97e72013c0fbf879b75
|
oid sha256:00779e53cfcc1e170525b6cc6113096d3984036ddf84dd8f07ce275f7027c47d
|
||||||
size 43550345
|
size 227692934
|
||||||
|
94638
data/lm/vocab.txt
94638
data/lm/vocab.txt
File diff suppressed because it is too large
Load Diff
Binary file not shown.
529740
data/smoke_test/vocab.trie
529740
data/smoke_test/vocab.trie
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,7 @@
|
|||||||
|
|
||||||
#include "kenlm/lm/model.hh"
|
#include "kenlm/lm/model.hh"
|
||||||
|
|
||||||
typedef lm::ngram::ProbingModel Model;
|
typedef lm::ngram::QuantArrayTrieModel Model;
|
||||||
|
|
||||||
struct KenLMBeamState {
|
struct KenLMBeamState {
|
||||||
float language_model_score;
|
float language_model_score;
|
||||||
|
@ -7,7 +7,7 @@ using namespace std;
|
|||||||
#include "trie_node.h"
|
#include "trie_node.h"
|
||||||
#include "alphabet.h"
|
#include "alphabet.h"
|
||||||
|
|
||||||
typedef lm::ngram::ProbingModel Model;
|
typedef lm::ngram::QuantArrayTrieModel Model;
|
||||||
|
|
||||||
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
|
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
|
||||||
return model.GetVocabulary().Index(word);
|
return model.GetVocabulary().Index(word);
|
||||||
|
Loading…
Reference in New Issue
Block a user