diff --git a/data/README.rst b/data/README.rst index 9db78c6b..88314843 100644 --- a/data/README.rst +++ b/data/README.rst @@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt` -2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`. +2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`. For more information on how to build these resources from scratch, see `data/lm/README.md` diff --git a/data/lm/README.rst b/data/lm/README.rst index c1666700..cc3e11b7 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -1,8 +1,8 @@ -The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://kheafield.com/code/kenlm/>`_'s built binaries must be in your PATH (lmplz, build_binary, filter). +The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://kheafield.com/code/kenlm/>`_'s built binaries must be in your PATH (lmplz, build_binary, filter). The scorer package was then built using the `generate_package.py` script: .. 
code-block:: bash - - python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer + python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt + python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer diff --git a/evaluate_tflite.py b/evaluate_tflite.py index bdc5f231..aba6fb68 100644 --- a/evaluate_tflite.py +++ b/evaluate_tflite.py @@ -27,7 +27,7 @@ This module should be self-contained: - pip install native_client/python/dist/deepspeech*.whl - pip install -r requirements_eval_tflite.txt -Then run with a TF Lite model, LM and a CSV test file +Then run with a TF Lite model, a scorer and a CSV test file ''' BEAM_WIDTH = 500 diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index 2474741f..e0282ca5 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer): super(Scorer, self).__init__() # Allow bare initialization if alphabet: + assert alpha is not None, 'alpha parameter is required' + assert beta is not None, 'beta parameter is required' + assert scorer_path, 'scorer_path parameter is required' + serialized = alphabet.serialize() native_alphabet = swigwrapper.Alphabet() err = native_alphabet.deserialize(serialized, len(serialized)) if err != 0: - raise ValueError("Error when deserializing alphabet.") + raise ValueError('Error when deserializing alphabet.') err = self.init(scorer_path.encode('utf-8'), native_alphabet) if err != 0: - raise ValueError("Scorer initialization failed with error code {}".format(err), err) + raise ValueError('Scorer initialization failed with error code {}'.format(err)) self.reset_params(alpha, beta) diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp 
b/native_client/ctcdecode/ctc_beam_search_decoder.cpp index 2958dec9..5dadd57f 100644 --- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp +++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp @@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet, size_t beam_size, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer) + std::shared_ptr<Scorer> ext_scorer) { // assign special ids abs_time_step_ = 0; @@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet, prefix_root_.reset(root); prefixes_.push_back(root); - if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) { + if (ext_scorer && (bool)(ext_scorer_->dictionary)) { // no need for std::make_shared<>() since Copy() does 'new' behind the doors auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true)); root->set_dictionary(dict_ptr); @@ -58,7 +58,7 @@ DecoderState::next(const double *probs, float min_cutoff = -NUM_FLT_INF; bool full_beam = false; - if (ext_scorer_ != nullptr) { + if (ext_scorer_) { size_t num_prefixes = std::min(prefixes_.size(), beam_size_); std::partial_sort(prefixes_.begin(), prefixes_.begin() + num_prefixes, @@ -109,7 +109,7 @@ DecoderState::next(const double *probs, log_p = log_prob_c + prefix->score; } - if (ext_scorer_ != nullptr) { + if (ext_scorer_) { // skip scoring the space in word based LMs PathTrie* prefix_to_score; if (ext_scorer_->is_utf8_mode()) { @@ -166,7 +166,7 @@ DecoderState::decode() const } // score the last word of each prefix that doesn't end with space - if (ext_scorer_ != nullptr) { + if (ext_scorer_) { for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) { auto prefix = prefixes_copy[i]; if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) { @@ -200,7 +200,7 @@ DecoderState::decode() const Output output; prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps); double approx_ctc = scores[prefixes_copy[i]]; - if (ext_scorer_ != nullptr) { + if (ext_scorer_) { auto words = 
ext_scorer_->split_labels_into_scored_units(output.tokens); // remove term insertion weight approx_ctc -= words.size() * ext_scorer_->beta; @@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder( size_t beam_size, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer) + std::shared_ptr<Scorer> ext_scorer) { DecoderState state; state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer); @@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch( size_t num_processes, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer) + std::shared_ptr<Scorer> ext_scorer) { VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element"); diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h index 4d6b7ea5..a3d5c480 100644 --- a/native_client/ctcdecode/ctc_beam_search_decoder.h +++ b/native_client/ctcdecode/ctc_beam_search_decoder.h @@ -1,6 +1,7 @@ #ifndef CTC_BEAM_SEARCH_DECODER_H_ #define CTC_BEAM_SEARCH_DECODER_H_ +#include <memory> #include <string> #include <vector> @@ -16,7 +17,7 @@ class DecoderState { double cutoff_prob_; size_t cutoff_top_n_; - Scorer* ext_scorer_; // weak + std::shared_ptr<Scorer> ext_scorer_; std::vector<PathTrie*> prefixes_; std::unique_ptr<PathTrie> prefix_root_; @@ -45,7 +46,7 @@ public: size_t beam_size, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer); + std::shared_ptr<Scorer> ext_scorer); /* Send data to the decoder * @@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder( size_t beam_size, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer); + std::shared_ptr<Scorer> ext_scorer); /* CTC Beam Search Decoder for batch data * Parameters: @@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch( size_t num_processes, double cutoff_prob, size_t cutoff_top_n, - Scorer *ext_scorer); + std::shared_ptr<Scorer> ext_scorer); #endif // CTC_BEAM_SEARCH_DECODER_H_ diff --git a/native_client/ctcdecode/scorer.cpp b/native_client/ctcdecode/scorer.cpp index 
c5ae54a2..7b6c74c9 100644 --- a/native_client/ctcdecode/scorer.cpp +++ b/native_client/ctcdecode/scorer.cpp @@ -71,8 +71,19 @@ void Scorer::setup_char_map() int Scorer::load_lm(const std::string& lm_path) { - // load language model + // Check if file is readable to avoid KenLM throwing an exception const char* filename = lm_path.c_str(); + if (access(filename, R_OK) != 0) { + return 1; + } + + // Check if the file format is valid to avoid KenLM throwing an exception + lm::ngram::ModelType model_type; + if (!lm::ngram::RecognizeBinary(filename, model_type)) { + return 1; + } + + // Load the LM lm::ngram::Config config; config.load_method = util::LoadMethod::LAZY; language_model_.reset(lm::ngram::LoadVirtual(filename, config)); @@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) int magic; fin.read(reinterpret_cast<char*>(&magic), sizeof(magic)); if (magic != MAGIC) { - std::cerr << "Error: Can't parse trie file, invalid header. Try updating " - "your trie file." << std::endl; + std::cerr << "Error: Can't parse scorer file, invalid header. Try updating " - "your scorer file." << std::endl; return 1; } int version; fin.read(reinterpret_cast<char*>(&version), sizeof(version)); if (version != FILE_VERSION) { - std::cerr << "Error: Trie file version mismatch (" << version + std::cerr << "Error: Scorer file version mismatch (" << version << " instead of expected " << FILE_VERSION << "). 
"; if (version < FILE_VERSION) { - std::cerr << "Update your trie file."; + std::cerr << "Update your scorer file."; } else { - std::cerr << "Downgrade your trie file or update your version of DeepSpeech."; + std::cerr << "Downgrade your scorer file or update your version of DeepSpeech."; } std::cerr << std::endl; return 1; diff --git a/native_client/ctcdecode/swigwrapper.i b/native_client/ctcdecode/swigwrapper.i index af3a1952..fd0f4f08 100644 --- a/native_client/ctcdecode/swigwrapper.i +++ b/native_client/ctcdecode/swigwrapper.i @@ -7,9 +7,10 @@ #include "workspace_status.h" %} -%include "pyabc.i" -%include "std_string.i" -%include "std_vector.i" +%include <pyabc.i> +%include <std_string.i> +%include <std_vector.i> +%include <std_shared_ptr.i> %include "numpy.i" %init %{ @@ -20,6 +21,8 @@ namespace std { %template(StringVector) vector<string>; } +%shared_ptr(Scorer); + // Convert NumPy arrays to pointer+lengths %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)}; %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)}; diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index 0a61f3de..274ce41f 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -319,7 +319,7 @@ int DS_DisableExternalScorer(ModelState* aCtx) { if (aCtx->scorer_) { - aCtx->scorer_.reset(nullptr); + aCtx->scorer_.reset(); return DS_ERR_OK; } return DS_ERR_SCORER_NOT_ENABLED; @@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx, aCtx->beam_width_, cutoff_prob, cutoff_top_n, - aCtx->scorer_.get()); + aCtx->scorer_); *retval = ctx.release(); return DS_ERR_OK; diff --git a/native_client/java/README.rst b/native_client/java/README.rst index 7b3e3dcc..626400d0 100644 --- a/native_client/java/README.rst +++ b/native_client/java/README.rst @@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including: * ``output_graph.tflite`` which is the TF Lite model -* ``kenlm.scorer``, if you want 
to use the language model ; please - be aware that too big language model will make the device run out of memory +* ``kenlm.scorer``, if you want to use the scorer; please be aware that too big + scorer will make the device run out of memory Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ : - * ``deepspeech`` * ``libdeepspeech.so`` * ``libc++_shared.so`` diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js index 79561a97..7266b85d 100644 --- a/native_client/javascript/client.js +++ b/native_client/javascript/client.js @@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'}); parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'}); parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'}); -parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'}); -parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'}); +parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'}); +parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). 
If not specified, use default from the scorer package.', type: 'float'}); parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'}); parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'}); var args = parser.parseArgs(); diff --git a/native_client/modelstate.h b/native_client/modelstate.h index ff106a62..d4f11c1c 100644 --- a/native_client/modelstate.h +++ b/native_client/modelstate.h @@ -16,7 +16,7 @@ struct ModelState { static constexpr unsigned int BATCH_SIZE = 1; Alphabet alphabet_; - std::unique_ptr<Scorer> scorer_; + std::shared_ptr<Scorer> scorer_; unsigned int beam_width_; unsigned int n_steps_; unsigned int n_context_; diff --git a/native_client/python/client.py b/native_client/python/client.py index ba5d70b2..2ef88caf 100644 --- a/native_client/python/client.py +++ b/native_client/python/client.py @@ -95,9 +95,9 @@ def main(): parser.add_argument('--beam_width', type=int, default=500, help='Beam width for the CTC decoder') parser.add_argument('--lm_alpha', type=float, - help='Language model weight (lm_alpha)') + help='Language model weight (lm_alpha). If not specified, use default from the scorer package.') parser.add_argument('--lm_beta', type=float, - help='Word insertion bonus (lm_beta)') + help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.') parser.add_argument('--version', action=VersionAction, help='Print version and exits') parser.add_argument('--extended', required=False, action='store_true',