Address review comments

Reuben Morais 2017-09-10 09:00:45 -03:00
parent 86b0ed612c
commit 1f3d26ddda
15 changed files with 214 additions and 7831 deletions

View File

@ -21,7 +21,7 @@ payload:
{{ SYSTEM_ADD_USER }} &&
echo -e "#!/bin/bash\nset -xe\nexport PATH=/home/build-user/bin:$PATH && env && id && wget https://github.com/git-lfs/git-lfs/releases/download/v2.2.1/git-lfs-linux-amd64-2.2.1.tar.gz -O - | tar -C /tmp -zxf - && PREFIX=/home/build-user/ /tmp/git-lfs-2.2.1/install.sh && mkdir ~/DeepSpeech/ && git clone --quiet {{ GITHUB_HEAD_REPO_URL }} ~/DeepSpeech/ds/ && cd ~/DeepSpeech/ds && git checkout --quiet {{ GITHUB_HEAD_SHA }}" > /tmp/clone.sh && chmod +x /tmp/clone.sh &&
{{ SYSTEM_DO_CLONE }} &&
sudo -H -u build-user TENSORFLOW_WHEEL=${TENSORFLOW_WHEEL} {{ TASK_ENV_VARS }} /bin/bash /home/build-user/DeepSpeech/ds/tc-train-tests.sh 2.7.13
sudo -H -u build-user TENSORFLOW_WHEEL=${TENSORFLOW_WHEEL} DEEPSPEECH_ARTIFACTS_ROOT=${DEEPSPEECH_ARTIFACTS_ROOT} /bin/bash /home/build-user/DeepSpeech/ds/tc-train-tests.sh 2.7.13
artifacts:
"public":
type: "directory"

View File

@ -143,6 +143,12 @@ tf.app.flags.DEFINE_float ('estop_std_thresh', 0.5, 'standard deviatio
tf.app.flags.DEFINE_string ('decoder_library_path', 'native_client/libctc_decoder_with_kenlm.so', 'path to the libctc_decoder_with_kenlm.so library containing the decoder implementation.')
tf.app.flags.DEFINE_string ('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
tf.app.flags.DEFINE_string ('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
tf.app.flags.DEFINE_string ('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
tf.app.flags.DEFINE_integer ('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
tf.app.flags.DEFINE_float ('lm_weight', 2.15, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
tf.app.flags.DEFINE_float ('word_count_weight', -0.10, 'the beta hyperparameter of the CTC decoder. Word insertion weight (penalty).')
tf.app.flags.DEFINE_float ('valid_word_count_weight', 1.10, 'the beta\' hyperparameter of the CTC decoder. Valid word insertion weight.')
for var in ['b1', 'h1', 'b2', 'h2', 'b3', 'h3', 'b5', 'h5', 'b6', 'h6']:
tf.app.flags.DEFINE_float('%s_stddev' % var, None, 'standard deviation to use when initialising %s' % var)
@ -458,47 +464,12 @@ custom_op_module = tf.load_op_library(FLAGS.decoder_library_path)
def decode_with_lm(inputs, sequence_length, beam_width=100,
top_paths=1, merge_repeated=True):
"""Performs beam search decoding on the logits given in input.
**Note** The `ctc_greedy_decoder` is a special case of the
`ctc_beam_search_decoder` with `top_paths=1` and `beam_width=1` (but
that decoder is faster for this special case).
If `merge_repeated` is `True`, merge repeated classes in the output beams.
This means that if consecutive entries in a beam are the same,
only the first of these is emitted. That is, when the top path
is `A B B B B`, the return value is:
* `A B` if `merge_repeated = True`.
* `A B B B B` if `merge_repeated = False`.
Args:
inputs: 3-D `float` `Tensor`, size
`[max_time x batch_size x num_classes]`. The logits.
sequence_length: 1-D `int32` vector containing sequence lengths,
having size `[batch_size]`.
beam_width: An int scalar >= 0 (beam search beam width).
top_paths: An int scalar >= 0, <= beam_width (controls output size).
merge_repeated: Boolean. Default: True.
Returns:
A tuple `(decoded, log_probabilities)` where
decoded: A list of length top_paths, where `decoded[j]`
is a `SparseTensor` containing the decoded outputs:
`decoded[j].indices`: Indices matrix `(total_decoded_outputs[j] x 2)`
The rows store: [batch, time].
`decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
The vector stores the decoded classes for beam j.
`decoded[j].shape`: Shape vector, size `(2)`.
The shape values are: `[batch_size, max_decoded_length[j]]`.
log_probability: A `float` matrix `(batch_size x top_paths)` containing
sequence log-probabilities.
"""
decoded_ixs, decoded_vals, decoded_shapes, log_probabilities = (
custom_op_module.ctc_beam_search_decoder_with_lm(
inputs, sequence_length, model_path="data/lm/lm.binary", trie_path="data/lm/trie", alphabet_path="data/alphabet.txt",
beam_width=beam_width, top_paths=top_paths, merge_repeated=merge_repeated))
inputs, sequence_length, beam_width=beam_width,
model_path=FLAGS.lm_binary_path, trie_path=FLAGS.lm_trie_path, alphabet_path=FLAGS.alphabet_config_path,
lm_weight=FLAGS.lm_weight, word_count_weight=FLAGS.word_count_weight, valid_word_count_weight=FLAGS.valid_word_count_weight,
top_paths=top_paths, merge_repeated=merge_repeated))
return (
[tf.SparseTensor(ix, val, shape) for (ix, val, shape)
@ -539,7 +510,7 @@ def calculate_mean_edit_distance_and_loss(model_feeder, tower, dropout):
avg_loss = tf.reduce_mean(total_loss)
# Beam search decode the batch
decoded, _ = decode_with_lm(logits, batch_seq_len, merge_repeated=False, beam_width=1024)
decoded, _ = decode_with_lm(logits, batch_seq_len, merge_repeated=False, beam_width=FLAGS.beam_width)
# Compute the edit (Levenshtein) distance
distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)
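
For context, here is a minimal usage sketch of the reworked decode_with_lm wrapper (not part of the diff; the logits and seq_len tensors are assumed to already exist, as in calculate_mean_edit_distance_and_loss above):

# Sketch only: `logits` is [max_time, batch_size, num_classes], `seq_len` is [batch_size].
decoded, log_probabilities = decode_with_lm(logits, seq_len,
                                            merge_repeated=False,
                                            beam_width=FLAGS.beam_width)
# decoded[0] holds the best beam as a SparseTensor of label indices; densify it
# before mapping the indices back to characters through the alphabet file.
best_beam = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)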

View File

@ -6876,7 +6876,7 @@ i would like to conclude by reading an email i got from one of them cindy the da
as i worked i couldnt help but think about the individuals and the stories represented in the images one in particular
a photo of women of all ages from grandmother to little girl gathered around a baby struck a chord because a similar photo from my family my grandmother and mother myself and newborn daughter hangs on our wall
across the globe throughout the ages our basic needs are just the same arent they thank you
i along with hundreds of other volunteers knew we couldnt just sit at home so i decided to join them for three weeks on may the thirteenth i made my way to the town of ōfunato its a small fishing town in iwate prefecture
i along with hundreds of other volunteers knew we couldnt just sit at home so i decided to join them for three weeks on may the thirteenth i made my way to the town of ofunato its a small fishing town in iwate prefecture
about fifty thousand people one of the first that was hit by the wave
the waters here have been recorded at reaching over twenty four meters in height
and traveled over two miles inland as you can imagine the town had been devastated
@ -48368,7 +48368,7 @@ what if we thought of fear as an amazing act of the imagination something that c
its easiest to see this link between fear and the imagination in young children whose fears are often extraordinarily vivid
when i was a child i lived in california which is you know mostly a very nice place to live
but at a certain point most of us learn to leave these kinds of visions behind and grow up we learn that there are no monsters hiding under the bed and not every earthquake brings buildings down
but maybe its no coincidence that some of our most creative minds fail to leave these kinds of fears behind as adults the same incredible imaginations that produced the origin of species jane eyre and the remembrance of things past also generated intense worries that haunted the adult lives of charles darwin charlotte brontë
but maybe its no coincidence that some of our most creative minds fail to leave these kinds of fears behind as adults the same incredible imaginations that produced the origin of species jane eyre and the remembrance of things past also generated intense worries that haunted the adult lives of charles darwin charlotte bronte
and marcel proust so the question is what can the rest of us learn
lets take a look at the fears that their imaginations were generating as they drifted in the middle of the pacific twenty four hours had now passed since the capsizing of the ship the time had come for the men to make a plan but they had very few options
in his fascinating account of the disaster nathaniel philbrick wrote that these men were just about as far from land as it was possible to be anywhere on earth

View File

@ -40,7 +40,8 @@ cc_library(
name = "ctc_decoder_with_kenlm",
srcs = ["beam_search.cc",
"alphabet.h",
"trie_node.h"] +
"trie_node.h"
] +
glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]),
@ -59,10 +60,10 @@ cc_binary(
"trie_node.h",
"alphabet.h",
] + glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]),
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]),
includes = ["kenlm"],
copts = ['-std=c++11'],
linkopts = ['-lm'],
copts = ["-std=c++11"],
linkopts = ["-lm"],
defines = ["KENLM_MAX_ORDER=6"],
)

View File

@ -3,6 +3,7 @@
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
@ -46,6 +47,7 @@ public:
if (it != str_to_label_.end()) {
return it->second;
} else {
std::cerr << "Invalid label " << string << std::endl;
abort();
}
}
@ -55,6 +57,7 @@ public:
}
bool IsSpace(unsigned int label) const {
//TODO: we should probably do something more i18n-aware here
const std::string& str = StringFromLabel(label);
return str.size() == 1 && str[0] == ' ';
}

View File

@ -13,14 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test illustrates how to make use of the CTCBeamSearchDecoder using a
// custom BeamScorer and BeamState based on a dictionary with a few artificial
// words.
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include <algorithm>
#include <cmath>
#include <vector>
#include <cmath>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
@ -43,6 +40,9 @@ REGISTER_OP("CTCBeamSearchDecoderWithLM")
.Attr("model_path: string")
.Attr("trie_path: string")
.Attr("alphabet_path: string")
.Attr("lm_weight: float")
.Attr("word_count_weight: float")
.Attr("valid_word_count_weight: float")
.Attr("beam_width: int >= 1 = 100")
.Attr("top_paths: int >= 1 = 1")
.Attr("merge_repeated: bool = true")
@ -91,6 +91,12 @@ returned if merge_repeated = False.
inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
sequence_length: A vector containing sequence lengths, size `(batch)`.
model_path: A string containing the path to the KenLM model file to use.
trie_path: A string containing the path to the trie file built from the vocabulary.
alphabet_path: A string containing the path to the alphabet file (see alphabet.h).
lm_weight: alpha hyperparameter of CTC decoder. LM weight.
word_count_weight: beta hyperparameter of CTC decoder. Word insertion weight.
valid_word_count_weight: beta' hyperparameter of CTC decoder. Valid word insertion weight.
beam_width: A scalar >= 0 (beam search beam width).
top_paths: A scalar >= 0, <= beam_width (controls output size).
merge_repeated: If true, merge repeated classes in output.
@ -107,38 +113,36 @@ log_probability: A matrix, shaped: `(batch_size x top_paths)`. The
sequence log-probabilities.
)doc");
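
For reference, a sketch of how the attributes documented above are supplied from Python; it mirrors the decode_with_lm change earlier in this commit, using the default paths and hyperparameter values from DeepSpeech.py (the logits and seq_len tensors are assumed):

import tensorflow as tf

custom_op_module = tf.load_op_library('native_client/libctc_decoder_with_kenlm.so')
decoded_ixs, decoded_vals, decoded_shapes, log_probabilities = (
    custom_op_module.ctc_beam_search_decoder_with_lm(
        logits, seq_len,                  # assumed tensors: [max_time, batch, classes], [batch]
        model_path='data/lm/lm.binary',
        trie_path='data/lm/trie',
        alphabet_path='data/alphabet.txt',
        lm_weight=2.15,                   # alpha: language model weight
        word_count_weight=-0.10,          # beta: word insertion weight
        valid_word_count_weight=1.10,     # beta': valid word insertion weight
        beam_width=1024,
        top_paths=1,
        merge_repeated=False))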
typedef lm::ngram::ProbingModel Model;
struct KenLMBeamState {
float language_model_score;
float score;
float delta_score;
std::string incomplete_word;
TrieNode *incomplete_word_trie_node;
lm::ngram::ProbingModel::State model_state;
Model::State model_state;
};
class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
public:
typedef lm::ngram::ProbingModel Model;
KenLMBeamScorer(const std::string &kenlm_path, const std::string &trie_path, const std::string &alphabet_path)
: lm_weight(1.0f)
, word_count_weight(-0.1f)
, valid_word_count_weight(1.0f)
KenLMBeamScorer(const std::string &kenlm_path, const std::string &trie_path,
const std::string &alphabet_path, float lm_weight,
float word_count_weight, float valid_word_count_weight)
: model(kenlm_path.c_str(), GetLMConfig())
, alphabet(alphabet_path.c_str())
, lm_weight_(lm_weight)
, word_count_weight_(word_count_weight)
, valid_word_count_weight_(valid_word_count_weight)
{
lm::ngram::Config config;
config.load_method = util::POPULATE_OR_READ;
model = new Model(kenlm_path.c_str(), config);
std::ifstream in(trie_path, std::ios::in);
TrieNode::ReadFromStream(in, trieRoot, alphabet.GetSize());
alphabet = new Alphabet(alphabet_path.c_str());
std::ifstream in;
in.open(trie_path, std::ios::in);
TrieNode::ReadFromStream(in, trieRoot, alphabet->GetSize());
in.close();
Model::State out;
oov_score_ = model.FullScore(model.NullContextState(), model.GetVocabulary().NotFound(), out).prob;
}
virtual ~KenLMBeamScorer() {
delete model;
delete trieRoot;
}
@ -149,7 +153,7 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
root->delta_score = 0.0f;
root->incomplete_word.clear();
root->incomplete_word_trie_node = trieRoot;
root->model_state = model->BeginSentenceState();
root->model_state = model.BeginSentenceState();
}
// ExpandState is called when expanding a beam to one of its children.
// Called at most once per child beam. In the simplest case, no state
@ -158,13 +162,12 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
KenLMBeamState* to_state, int to_label) const {
CopyState(from_state, to_state);
if (!alphabet->IsSpace(to_label)) {
to_state->incomplete_word += alphabet->StringFromLabel(to_label);
if (!alphabet.IsSpace(to_label)) {
to_state->incomplete_word += alphabet.StringFromLabel(to_label);
TrieNode *trie_node = from_state.incomplete_word_trie_node;
// TODO replace with OOV unigram prob?
// If we have no valid prefix we assume a very low log probability
float min_unigram_score = -10.0f;
float min_unigram_score = oov_score_;
// If prefix does exist
if (trie_node != nullptr) {
trie_node = trie_node->GetChildAt(to_label);
@ -185,9 +188,9 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
to_state->model_state);
// Give fixed word bonus
if (!IsOOV(to_state->incomplete_word)) {
to_state->language_model_score += valid_word_count_weight;
to_state->language_model_score += valid_word_count_weight_;
}
to_state->language_model_score += word_count_weight;
to_state->language_model_score += word_count_weight_;
UpdateWithLMScore(to_state, lm_score_delta);
ResetIncompleteWord(to_state);
}
@ -205,8 +208,8 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
ResetIncompleteWord(state);
state->model_state = out;
}
lm_score_delta += model->FullScore(state->model_state,
model->GetVocabulary().EndSentence(),
lm_score_delta += model.FullScore(state->model_state,
model.GetVocabulary().EndSentence(),
out).prob;
UpdateWithLMScore(state, lm_score_delta);
}
@ -219,7 +222,7 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
// there's no state expansion logic, the expansion score is zero.
float GetStateExpansionScore(const KenLMBeamState& state,
float previous_score) const {
return lm_weight * state.delta_score + previous_score;
return lm_weight_ * state.delta_score + previous_score;
}
// GetStateEndExpansionScore should be an inexpensive method to retrieve the
// (cached) expansion score computed within ExpandStateEnd. The score is
@ -227,28 +230,35 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
//
// The score returned should be a log-probability.
float GetStateEndExpansionScore(const KenLMBeamState& state) const {
return lm_weight * state.delta_score;
return lm_weight_ * state.delta_score;
}
void SetLMWeight(float lm_weight) {
this->lm_weight = lm_weight;
this->lm_weight_ = lm_weight;
}
void SetWordCountWeight(float word_count_weight) {
this->word_count_weight = word_count_weight;
this->word_count_weight_ = word_count_weight;
}
void SetValidWordCountWeight(float valid_word_count_weight) {
this->valid_word_count_weight = valid_word_count_weight;
this->valid_word_count_weight_ = valid_word_count_weight;
}
private:
Model *model;
Alphabet *alphabet;
Model model;
Alphabet alphabet;
TrieNode *trieRoot;
float lm_weight;
float word_count_weight;
float valid_word_count_weight;
float lm_weight_;
float word_count_weight_;
float valid_word_count_weight_;
float oov_score_;
lm::ngram::Config GetLMConfig() {
lm::ngram::Config config;
config.load_method = util::POPULATE_OR_READ;
return config;
}
void UpdateWithLMScore(KenLMBeamState *state, float lm_score_delta) const {
float previous_score = state->score;
@ -263,17 +273,15 @@ class KenLMBeamScorer : public tf::ctc::BaseBeamScorer<KenLMBeamState> {
}
bool IsOOV(const std::string& word) const {
auto &vocabulary = model->GetVocabulary();
auto &vocabulary = model.GetVocabulary();
return vocabulary.Index(word) == vocabulary.NotFound();
}
float ScoreIncompleteWord(const Model::State& model_state,
const std::string& word,
Model::State& out) const {
lm::FullScoreReturn full_score_return;
lm::WordIndex vocab = model->GetVocabulary().Index(word);
full_score_return = model->FullScore(model_state, vocab, out);
return full_score_return.prob;
lm::WordIndex word_index = model.GetVocabulary().Index(word);
return model.FullScore(model_state, word_index, out).prob;
}
void CopyState(const KenLMBeamState& from, KenLMBeamState* to) const {
@ -409,56 +417,22 @@ class CTCDecodeHelper {
TF_DISALLOW_COPY_AND_ASSIGN(CTCDecodeHelper);
};
// CTC beam search
class CTCBeamSearchDecoderOp : public tf::OpKernel {
class CTCBeamSearchDecoderWithLMOp : public tf::OpKernel {
public:
explicit CTCBeamSearchDecoderOp(tf::OpKernelConstruction *ctx)
explicit CTCBeamSearchDecoderWithLMOp(tf::OpKernelConstruction *ctx)
: tf::OpKernel(ctx)
, beam_scorer_(GetModelPath(ctx),
GetTriePath(ctx),
GetAlphabetPath(ctx))
GetAlphabetPath(ctx),
GetLMWeight(ctx),
GetWordCountWeight(ctx),
GetValidWordCountWeight(ctx))
{
OP_REQUIRES_OK(ctx, ctx->GetAttr("merge_repeated", &merge_repeated_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("beam_width", &beam_width_));
int top_paths;
OP_REQUIRES_OK(ctx, ctx->GetAttr("top_paths", &top_paths));
decode_helper_.SetTopPaths(top_paths);
// const tf::Tensor* model_tensor;
// tf::Status status = ctx->input("model_path", &model_tensor);
// if (!status.ok()) return status;
// auto model_vec = model_tensor->flat<std::string>();
// *model_path = model_vec(0);
// const tf::Tensor* trie_tensor;
// status = ctx->input("trie_path", &trie_tensor);
// if (!status.ok()) return status;
// auto trie_vec = trie_tensor->flat<std::string>();
// *trie_path = model_vec(0);
// const tf::Tensor* alphabet_tensor;
// status = ctx->input("alphabet_path", &alphabet_tensor);
// if (!status.ok()) return status;
// auto alphabet_vec = alphabet_tensor->flat<std::string>();
// *alphabet_path = alphabet_vec(0);
}
std::string GetModelPath(tf::OpKernelConstruction *ctx) {
std::string model_path;
ctx->GetAttr("model_path", &model_path);
return model_path;
}
std::string GetTriePath(tf::OpKernelConstruction *ctx) {
std::string trie_path;
ctx->GetAttr("trie_path", &trie_path);
return trie_path;
}
std::string GetAlphabetPath(tf::OpKernelConstruction *ctx) {
std::string alphabet_path;
ctx->GetAttr("alphabet_path", &alphabet_path);
return alphabet_path;
}
void Compute(tf::OpKernelContext *ctx) override {
@ -539,8 +513,44 @@ class CTCBeamSearchDecoderOp : public tf::OpKernel {
KenLMBeamScorer beam_scorer_;
bool merge_repeated_;
int beam_width_;
TF_DISALLOW_COPY_AND_ASSIGN(CTCBeamSearchDecoderOp);
TF_DISALLOW_COPY_AND_ASSIGN(CTCBeamSearchDecoderWithLMOp);
std::string GetModelPath(tf::OpKernelConstruction *ctx) {
std::string model_path;
ctx->GetAttr("model_path", &model_path);
return model_path;
}
std::string GetTriePath(tf::OpKernelConstruction *ctx) {
std::string trie_path;
ctx->GetAttr("trie_path", &trie_path);
return trie_path;
}
std::string GetAlphabetPath(tf::OpKernelConstruction *ctx) {
std::string alphabet_path;
ctx->GetAttr("alphabet_path", &alphabet_path);
return alphabet_path;
}
float GetLMWeight(tf::OpKernelConstruction *ctx) {
float lm_weight;
ctx->GetAttr("lm_weight", &lm_weight);
return lm_weight;
}
float GetWordCountWeight(tf::OpKernelConstruction *ctx) {
float word_count_weight;
ctx->GetAttr("word_count_weight", &word_count_weight);
return word_count_weight;
}
float GetValidWordCountWeight(tf::OpKernelConstruction *ctx) {
float valid_word_count_weight;
ctx->GetAttr("valid_word_count_weight", &valid_word_count_weight);
return valid_word_count_weight;
}
};
REGISTER_KERNEL_BUILDER(Name("CTCBeamSearchDecoderWithLM").Device(tf::DEVICE_CPU),
CTCBeamSearchDecoderOp);
CTCBeamSearchDecoderWithLMOp);

View File

@ -10,17 +10,13 @@ using namespace std;
typedef lm::ngram::ProbingModel Model;
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
lm::WordIndex vocab;
vocab = model.GetVocabulary().Index(word);
return vocab;
return model.GetVocabulary().Index(word);
}
float ScoreWord(const Model& model, lm::WordIndex vocab) {
Model::State in_state = model.NullContextState();
float ScoreWord(const Model& model, lm::WordIndex word_index) {
// We don't need to keep state here as we're scoring the words individually.
Model::State out;
lm::FullScoreReturn full_score_return;
full_score_return = model.FullScore(in_state, vocab, out);
return full_score_return.prob;
return model.FullScore(model.NullContextState(), word_index, out).prob;
}
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* vocab_path, const char* trie_path) {
@ -31,36 +27,38 @@ int generate_trie(const char* alphabet_path, const char* kenlm_path, const char*
Model model(kenlm_path, config);
TrieNode root(a.GetSize());
std::ifstream ifs;
ifs.open(vocab_path, std::ifstream::in);
if (!ifs.is_open()) {
std::cout << "unable to open vocabulary" << std::endl;
std::ifstream ifs(vocab_path, std::ifstream::in);
if (!ifs) {
std::cerr << "unable to open vocabulary file " << vocab_path << std::endl;
return -1;
}
std::ofstream ofs;
ofs.open(trie_path);
std::ofstream ofs(trie_path);
if (!ofs) {
std::cerr << "unable to open output file " << trie_path << std::endl;
return -1;
}
std::string word;
while (ifs >> word) {
for_each(word.begin(), word.end(), [](char& a) { a = tolower(a); });
lm::WordIndex vocab = GetWordIndex(model, word);
float unigram_score = ScoreWord(model, vocab);
root.Insert(word.c_str(), [&a](char c) {
return a.LabelFromString(string(1, c));
}, vocab, unigram_score);
lm::WordIndex word_index = GetWordIndex(model, word);
float unigram_score = ScoreWord(model, word_index);
root.Insert(word.c_str(),
[&a](const std::string& c) {
return a.LabelFromString(c);
},
word_index, unigram_score);
}
root.WriteToStream(ofs);
ifs.close();
ofs.close();
return 0;
}
int main(void) {
return generate_trie("/Users/remorais/Development/DeepSpeech/data/alphabet.txt",
"/Users/remorais/Development/DeepSpeech/data/lm/lm.binary",
"/Users/remorais/Development/DeepSpeech/data/lm/vocab.txt",
"/Users/remorais/Development/DeepSpeech/data/lm/trie");
int main(int argc, char** argv) {
if (argc != 5) {
std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <vocabulary> <trie_path>" << std::endl;
return -1;
}
return generate_trie(argv[1], argv[2], argv[3], argv[4]);
}
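
As a usage sketch (the binary location and data paths below are placeholders): the trie consumed by --lm_trie_path is generated from the alphabet, the KenLM binary model, and a vocabulary file, in that argument order.

import subprocess

subprocess.check_call([
    'native_client/generate_trie',  # placeholder path to the cc_binary built above
    'data/alphabet.txt',            # alphabet
    'data/lm/lm.binary',            # KenLM model
    'data/lm/vocab.txt',            # vocabulary, one word per line (lower-cased on insertion)
    'data/lm/trie',                 # output, later passed to DeepSpeech.py as --lm_trie_path
])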

View File

@ -6,4 +6,4 @@ This corresponds to https://github.com/kpu/kenlm/commit/cdd794598ea15dc23a7daaf7
The following procedure was run to remove unneeded files:
cd kenlm
rm -rf windows include lm/filter lm/builder util/stream
rm -rf windows include lm/filter lm/builder util/stream util/getopt.* python

View File

@ -1,50 +0,0 @@
cdef extern from "lm/word_index.hh" namespace "lm":
ctypedef unsigned WordIndex
cdef extern from "lm/return.hh" namespace "lm":
cdef struct FullScoreReturn:
float prob
unsigned char ngram_length
cdef extern from "lm/state.hh" namespace "lm::ngram":
cdef cppclass State :
int Compare(const State &other) const
int hash_value(const State &state)
cdef extern from "lm/virtual_interface.hh" namespace "lm::base":
cdef cppclass Vocabulary:
WordIndex Index(char*)
WordIndex BeginSentence()
WordIndex EndSentence()
WordIndex NotFound()
ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary"
cdef cppclass Model:
void BeginSentenceWrite(void *)
void NullContextWrite(void *)
unsigned int Order()
const_Vocabulary& BaseVocabulary()
float BaseScore(void *in_state, WordIndex new_word, void *out_state)
FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state)
cdef extern from "util/mmap.hh" namespace "util":
cdef enum LoadMethod:
LAZY
POPULATE_OR_LAZY
POPULATE_OR_READ
READ
PARALLEL_READ
cdef extern from "lm/config.hh" namespace "lm::ngram":
cdef cppclass Config:
Config()
float probing_multiplier
LoadMethod load_method
cdef extern from "lm/model.hh" namespace "lm::ngram":
cdef Model *LoadVirtual(char *, Config &config) except +
#default constructor
cdef Model *LoadVirtual(char *) except +

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python
import os
import kenlm
LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
model = kenlm.Model(LM)
print('{0}-gram model'.format(model.order))
sentence = 'language modeling is fun .'
print(sentence)
print(model.score(sentence))
# Check that total full score = direct score
def score(s):
return sum(prob for prob, _, _ in model.full_scores(s))
assert (abs(score(sentence) - model.score(sentence)) < 1e-3)
# Show scores and n-gram matches
words = ['<s>'] + sentence.split() + ['</s>']
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
if oov:
print('\t"{0}" is an OOV'.format(words[i+1]))
# Find out-of-vocabulary words
for w in words:
if not w in model:
print('"{0}" is an OOV'.format(w))
#Stateful query
state = kenlm.State()
state2 = kenlm.State()
#Use <s> as context. If you don't want <s>, use model.NullContextWrite(state).
model.BeginSentenceWrite(state)
accum = 0.0
accum += model.BaseScore(state, "a", state2)
accum += model.BaseScore(state2, "sentence", state)
#score defaults to bos = True and eos = True. Here we'll check without the end
#of sentence marker.
assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3)
accum += model.BaseScore(state, "</s>", state2)
assert (abs(accum - model.score("a sentence")) < 1e-3)

File diff suppressed because it is too large

View File

@ -1,261 +0,0 @@
import os
cimport _kenlm
cdef bytes as_str(data):
if isinstance(data, bytes):
return data
elif isinstance(data, unicode):
return data.encode('utf8')
raise TypeError('Cannot convert %s to string' % type(data))
cdef class FullScoreReturn:
"""
Wrapper around FullScoreReturn.
Notes:
`prob` has been renamed to `log_prob`
`oov` has been added to flag whether the word is OOV
"""
cdef float log_prob
cdef int ngram_length
cdef bint oov
def __cinit__(self, log_prob, ngram_length, oov):
self.log_prob = log_prob
self.ngram_length = ngram_length
self.oov = oov
def __repr__(self):
return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov))
property log_prob:
def __get__(self):
return self.log_prob
property ngram_length:
def __get__(self):
return self.ngram_length
property oov:
def __get__(self):
return self.oov
cdef class State:
"""
Wrapper around lm::ngram::State so that python code can make incremental queries.
Notes:
* rich comparisons
* hashable
"""
cdef _kenlm.State _c_state
def __richcmp__(State qa, State qb, int op):
r = qa._c_state.Compare(qb._c_state)
if op == 0: # <
return r < 0
elif op == 1: # <=
return r <= 0
elif op == 2: # ==
return r == 0
elif op == 3: # !=
return r != 0
elif op == 4: # >
return r > 0
else: # >=
return r >= 0
def __hash__(self):
return _kenlm.hash_value(self._c_state)
class LoadMethod:
LAZY = _kenlm.LAZY
POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY
POPULATE_OR_READ = _kenlm.POPULATE_OR_READ
READ = _kenlm.READ
PARALLEL_READ = _kenlm.PARALLEL_READ
cdef class Config:
"""
Wrapper around lm::ngram::Config.
Pass this to Model's constructor to set the load_method.
"""
cdef _kenlm.Config _c_config
def __init__(self):
self._c_config = _kenlm.Config()
property load_method:
def __get__(self):
return self._c_config.load_method
def __set__(self, to):
self._c_config.load_method = to
cdef class Model:
"""
Wrapper around lm::ngram::Model.
"""
cdef _kenlm.Model* model
cdef public bytes path
cdef _kenlm.const_Vocabulary* vocab
def __init__(self, path, Config config = Config()):
"""
Load the language model.
:param path: path to an arpa file or a kenlm binary file.
:param config: configuration options (see lm/config.hh for documentation)
"""
self.path = os.path.abspath(as_str(path))
try:
self.model = _kenlm.LoadVirtual(self.path, config._c_config)
except RuntimeError as exception:
exception_message = str(exception).replace('\n', ' ')
raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
from exception
self.vocab = &self.model.BaseVocabulary()
def __dealloc__(self):
del self.model
property order:
def __get__(self):
return self.model.Order()
def score(self, sentence, bos = True, eos = True):
"""
Return the log10 probability of a string. By default, the string is
treated as a sentence.
return log10 p(sentence </s> | <s>)
If you do not want to condition on the beginning of sentence, pass
bos = False
Never include <s> as part of the string. That would be predicting the
beginning of sentence. Language models are only supposed to condition
on it as context.
Similarly, the end of sentence token </s> can be omitted with
eos = False
Since language models explicitly predict </s>, it can be part of the
string.
Examples:
#Good: returns log10 p(this is a sentence . </s> | <s>)
model.score("this is a sentence .")
#Good: same as the above but more explicit
model.score("this is a sentence .", bos = True, eos = True)
#Bad: never include <s>
model.score("<s> this is a sentence")
#Bad: never include <s>, even if bos = False.
model.score("<s> this is a sentence", bos = False)
#Good: returns log10 p(a fragment)
model.score("a fragment", bos = False, eos = False)
#Good: returns log10 p(a fragment </s>)
model.score("a fragment", bos = False, eos = True)
#Ok, but bad practice: returns log10 p(a fragment </s>)
#Unlike <s>, the end of sentence token </s> can appear explicitly.
model.score("a fragment </s>", bos = False, eos = False)
"""
cdef list words = as_str(sentence).split()
cdef _kenlm.State state
if bos:
self.model.BeginSentenceWrite(&state)
else:
self.model.NullContextWrite(&state)
cdef _kenlm.State out_state
cdef float total = 0
for word in words:
total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state)
state = out_state
if eos:
total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state)
return total
def perplexity(self, sentence):
"""
Compute perplexity of a sentence.
@param sentence One full sentence to score. Do not include <s> or </s>.
"""
words = len(as_str(sentence).split()) + 1 # For </s>
return 10.0**(-self.score(sentence) / words)
def full_scores(self, sentence, bos = True, eos = True):
"""
full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov)
@param sentence is a string (do not use boundary symbols)
@param bos should kenlm add a bos state
@param eos should kenlm add an eos state
"""
cdef list words = as_str(sentence).split()
cdef _kenlm.State state
if bos:
self.model.BeginSentenceWrite(&state)
else:
self.model.NullContextWrite(&state)
cdef _kenlm.State out_state
cdef _kenlm.FullScoreReturn ret
cdef float total = 0
cdef _kenlm.WordIndex wid
for word in words:
wid = self.vocab.Index(word)
ret = self.model.BaseFullScore(&state, wid, &out_state)
yield (ret.prob, ret.ngram_length, wid == 0)
state = out_state
if eos:
ret = self.model.BaseFullScore(&state,
self.vocab.EndSentence(), &out_state)
yield (ret.prob, ret.ngram_length, False)
def BeginSentenceWrite(self, State state):
"""Change the given state to a BOS state."""
self.model.BeginSentenceWrite(&state._c_state)
def NullContextWrite(self, State state):
"""Change the given state to a NULL state."""
self.model.NullContextWrite(&state._c_state)
def BaseScore(self, State in_state, str word, State out_state):
"""
Return p(word|in_state) and update the output state.
Wrapper around model.BaseScore(in_state, Index(word), out_state)
:param word: the suffix
:param state: the context (defaults to NullContext)
:returns: p(word|state)
"""
cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state)
return total
def BaseFullScore(self, State in_state, str word, State out_state):
"""
Wrapper around model.BaseScore(in_state, Index(word), out_state)
:param word: the suffix
:param state: the context (defaults to NullContext)
:returns: FullScoreReturn(word|state)
"""
cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word))
cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state)
return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0)
def __contains__(self, word):
cdef bytes w = as_str(word)
return (self.vocab.Index(w) != 0)
def __repr__(self):
return '<Model from {0}>'.format(os.path.basename(self.path))
def __reduce__(self):
return (_kenlm.LanguageModel, (self.path,))
class LanguageModel(Model):
"""Backwards compatability stub. Use Model."""

View File

@ -1,78 +0,0 @@
/*
POSIX getopt for Windows
AT&T Public License
Code given out at the 1985 UNIFORUM conference in Dallas.
*/
#ifndef __GNUC__
#include "getopt.hh"
#include <stdio.h>
#include <string.h>
#define NULL 0
#define EOF (-1)
#define ERR(s, c) if(opterr){\
char errbuf[2];\
errbuf[0] = c; errbuf[1] = '\n';\
fputs(argv[0], stderr);\
fputs(s, stderr);\
fputc(c, stderr);}
//(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
//(void) write(2, s, (unsigned)strlen(s));\
//(void) write(2, errbuf, 2);}
int opterr = 1;
int optind = 1;
int optopt;
char *optarg;
int
getopt(argc, argv, opts)
int argc;
char **argv, *opts;
{
static int sp = 1;
register int c;
register char *cp;
if(sp == 1)
if(optind >= argc ||
argv[optind][0] != '-' || argv[optind][1] == '\0')
return(EOF);
else if(strcmp(argv[optind], "--") == NULL) {
optind++;
return(EOF);
}
optopt = c = argv[optind][sp];
if(c == ':' || (cp=strchr(opts, c)) == NULL) {
ERR(": illegal option -- ", c);
if(argv[optind][++sp] == '\0') {
optind++;
sp = 1;
}
return('?');
}
if(*++cp == ':') {
if(argv[optind][sp+1] != '\0')
optarg = &argv[optind++][sp+1];
else if(++optind >= argc) {
ERR(": option requires an argument -- ", c);
sp = 1;
return('?');
} else
optarg = argv[optind++];
sp = 1;
} else {
if(argv[optind][++sp] == '\0') {
sp = 1;
optind++;
}
optarg = NULL;
}
return(c);
}
#endif /* __GNUC__ */

View File

@ -1,33 +0,0 @@
/*
POSIX getopt for Windows
AT&T Public License
Code given out at the 1985 UNIFORUM conference in Dallas.
*/
#ifdef __GNUC__
#include <getopt.h>
#endif
#ifndef __GNUC__
#ifndef UTIL_GETOPT_H
#define UTIL_GETOPT_H
#ifdef __cplusplus
extern "C" {
#endif
extern int opterr;
extern int optind;
extern int optopt;
extern char *optarg;
extern int getopt(int argc, char **argv, char *opts);
#ifdef __cplusplus
}
#endif
#endif /* UTIL_GETOPT_H */
#endif /* __GNUC__ */

View File

@ -15,37 +15,40 @@ limitations under the License.
#include "lm/model.hh"
#include <codecvt>
#include <functional>
#include <istream>
#include <iostream>
#include <istream>
#include <limits>
#include <locale>
#include <string>
class TrieNode {
public:
TrieNode(int vocab_size)
: vocab_size(vocab_size)
, prefixCount(0)
, min_score_word(0)
, min_unigram_score(std::numeric_limits<float>::max())
: vocab_size_(vocab_size)
, prefixCount_(0)
, min_score_word_(0)
, min_unigram_score_(std::numeric_limits<float>::max())
{
children = new TrieNode*[vocab_size]();
children_ = new TrieNode*[vocab_size_]();
}
~TrieNode() {
for (int i = 0; i < vocab_size; i++) {
delete children[i];
for (int i = 0; i < vocab_size_; i++) {
delete children_[i];
}
delete children;
delete children_;
}
void WriteToStream(std::ostream& os) {
void WriteToStream(std::ostream& os) const {
WriteNode(os);
for (int i = 0; i < vocab_size; i++) {
if (children[i] == nullptr) {
for (int i = 0; i < vocab_size_; i++) {
if (children_[i] == nullptr) {
os << -1 << std::endl;
} else {
// Recursive call
children[i]->WriteToStream(os);
children_[i]->WriteToStream(os);
}
}
}
@ -63,61 +66,80 @@ public:
obj = new TrieNode(vocab_size);
obj->ReadNode(is, prefixCount);
for (int i = 0; i < vocab_size; i++) {
// Recursive call
ReadFromStream(is, obj->children[i], vocab_size);
ReadFromStream(is, obj->children_[i], vocab_size);
}
}
void Insert(const char* word, std::function<int (char)> translator,
void Insert(const char* word, std::function<int (const std::string&)> translator,
lm::WordIndex lm_word, float unigram_score) {
char wordCharacter = *word;
prefixCount++;
if (unigram_score < min_unigram_score) {
min_unigram_score = unigram_score;
min_score_word = lm_word;
// All strings are UTF-8 encoded at the API boundaries. We need to iterate
// on codepoints in order to support multi-byte characters, so we convert
// to UCS-4 to extract the first codepoint, then the codepoint back to
// UTF-8 to translate it into a vocabulary index.
//TODO We should normalize the input first, and possibly iterate by grapheme
// instead of codepoint for languages that don't have composed versions
// of multi-codepoint characters. This requires extra dependencies so
// leaving as a future improvement when the need arises.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> ucs4conv;
std::u32string codepoints = ucs4conv.from_bytes(word);
Insert(codepoints.begin(), translator, lm_word, unigram_score);
}
void Insert(const std::u32string::iterator& codepoints,
std::function<int (const std::string&)> translator,
lm::WordIndex lm_word, float unigram_score) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> ucs4conv;
char32_t firstCodepoint = *codepoints;
std::string firstCodepoint_utf8 = ucs4conv.to_bytes(firstCodepoint);
prefixCount_++;
if (unigram_score < min_unigram_score_) {
min_unigram_score_ = unigram_score;
min_score_word_ = lm_word;
}
if (wordCharacter != '\0') {
int vocabIndex = translator(wordCharacter);
TrieNode *child = children[vocabIndex];
if (child == nullptr)
child = children[vocabIndex] = new TrieNode(vocab_size);
child->Insert(word + 1, translator, lm_word, unigram_score);
if (firstCodepoint != 0) {
int vocabIndex = translator(firstCodepoint_utf8);
if (children_[vocabIndex] == nullptr) {
children_[vocabIndex] = new TrieNode(vocab_size_);
}
children_[vocabIndex]->Insert(codepoints+1, translator, lm_word, unigram_score);
}
}
int GetFrequency() {
return prefixCount;
int GetPrefixCount() const {
return prefixCount_;
}
lm::WordIndex GetMinScoreWordIndex() {
return min_score_word;
lm::WordIndex GetMinScoreWordIndex() const {
return min_score_word_;
}
float GetMinUnigramScore() {
return min_unigram_score;
float GetMinUnigramScore() const {
return min_unigram_score_;
}
TrieNode *GetChildAt(int vocabIndex) {
return children[vocabIndex];
return children_[vocabIndex];
}
private:
int vocab_size;
int prefixCount;
lm::WordIndex min_score_word;
float min_unigram_score;
TrieNode **children;
int vocab_size_;
int prefixCount_;
lm::WordIndex min_score_word_;
float min_unigram_score_;
TrieNode **children_;
void WriteNode(std::ostream& os) const {
os << prefixCount << std::endl;
os << min_score_word << std::endl;
os << min_unigram_score << std::endl;
os << prefixCount_ << std::endl;
os << min_score_word_ << std::endl;
os << min_unigram_score_ << std::endl;
}
void ReadNode(std::istream& is, int first_input) {
prefixCount = first_input;
is >> min_score_word;
is >> min_unigram_score;
prefixCount_ = first_input;
is >> min_score_word_;
is >> min_unigram_score_;
}
};
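
To illustrate the codepoint handling described in the comment inside TrieNode::Insert, here is a small sketch in Python (where strings already iterate per codepoint) using a hypothetical alphabet: each Unicode codepoint, not each UTF-8 byte, becomes one trie edge, and the codepoint is handed to the translator as a UTF-8 string for the label lookup.

label_from_string = {'f': 0, 'u': 1, 'r': 2, 'ü': 3}  # hypothetical alphabet mapping

def labels(word):
    # Python iterates strings by codepoint; trie_node.h gets the same effect by
    # converting UTF-8 to UCS-4 with std::wstring_convert before descending.
    return [label_from_string[cp] for cp in word]

print(labels('für'))  # -> [0, 3, 2]: 'ü' is one edge even though it is two bytes in UTF-8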