Address review comments
commit 8dedda7759 (parent 1d3b3a31a1)
@@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
 
-2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 
 For more information on how to build these resources from scratch, see `data/lm/README.md`
@@ -1,8 +1,8 @@
-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 
 The scorer package was then built using the `generate_package.py` script:
 
 .. code-block:: bash
 
-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
+    python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
+    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
@@ -27,7 +27,7 @@ This module should be self-contained:
   - pip install native_client/python/dist/deepspeech*.whl
   - pip install -r requirements_eval_tflite.txt
 
-Then run with a TF Lite model, LM and a CSV test file
+Then run with a TF Lite model, a scorer and a CSV test file
 '''
 
 BEAM_WIDTH = 500
@@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
             assert alpha is not None, 'alpha parameter is required'
             assert beta is not None, 'beta parameter is required'
             assert scorer_path, 'scorer_path parameter is required'
 
             serialized = alphabet.serialize()
             native_alphabet = swigwrapper.Alphabet()
             err = native_alphabet.deserialize(serialized, len(serialized))
             if err != 0:
-                raise ValueError("Error when deserializing alphabet.")
+                raise ValueError('Error when deserializing alphabet.')
 
             err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
-                raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+                raise ValueError('Scorer initialization failed with error code {}'.format(err))
 
             self.reset_params(alpha, beta)
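The wrapper's pattern here is worth noting: the SWIG-wrapped native methods report failure through integer return codes (0 for success), and the thin Python layer converts nonzero codes into exceptions. A minimal C++ sketch of that error-code contract, with a stub class and hypothetical names rather than the real SWIG-generated API:

    #include <cstdio>
    #include <string>

    // Stub mirroring the error-code contract; not the real native Scorer.
    struct NativeScorer {
      double alpha = 0.0, beta = 0.0;
      // Returns 0 on success, a nonzero error code otherwise.
      int init(const std::string& scorer_path) {
        if (scorer_path.empty()) return 1;  // e.g. unreadable or invalid file
        return 0;
      }
      void reset_params(double a, double b) { alpha = a; beta = b; }
    };

    int main() {
      NativeScorer scorer;
      int err = scorer.init("kenlm.scorer");
      if (err != 0) {
        // The Python wrapper raises ValueError at this point instead.
        std::fprintf(stderr, "Scorer initialization failed with error code %d\n", err);
        return err;
      }
      scorer.reset_params(0.75, 1.85);  // the defaults baked in by generate_package.py
      return 0;
    }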
@@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet,
                    size_t beam_size,
                    double cutoff_prob,
                    size_t cutoff_top_n,
-                   Scorer *ext_scorer)
+                   std::shared_ptr<Scorer> ext_scorer)
 {
   // assign special ids
   abs_time_step_ = 0;
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
 
-  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
+  if (ext_scorer && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
@@ -58,7 +58,7 @@ DecoderState::next(const double *probs,
 
   float min_cutoff = -NUM_FLT_INF;
   bool full_beam = false;
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     size_t num_prefixes = std::min(prefixes_.size(), beam_size_);
     std::partial_sort(prefixes_.begin(),
                       prefixes_.begin() + num_prefixes,
@@ -109,7 +109,7 @@ DecoderState::next(const double *probs,
         log_p = log_prob_c + prefix->score;
       }
 
-      if (ext_scorer_ != nullptr) {
+      if (ext_scorer_) {
         // skip scoring the space in word based LMs
         PathTrie* prefix_to_score;
         if (ext_scorer_->is_utf8_mode()) {
@@ -166,7 +166,7 @@ DecoderState::decode() const
   }
 
   // score the last word of each prefix that doesn't end with space
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
@@ -200,7 +200,7 @@ DecoderState::decode() const
     Output output;
     prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps);
     double approx_ctc = scores[prefixes_copy[i]];
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       auto words = ext_scorer_->split_labels_into_scored_units(output.tokens);
       // remove term insertion weight
       approx_ctc -= words.size() * ext_scorer_->beta;
@@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
@@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element");
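The signature change from `Scorer *ext_scorer` to `std::shared_ptr<Scorer> ext_scorer` in the hunks above means a decode in flight co-owns the scorer instead of borrowing it, so releasing the scorer elsewhere can no longer leave the decoder with a dangling pointer. It also explains the `ext_scorer != nullptr` to `ext_scorer` simplifications: `shared_ptr` has an explicit boolean conversion, so the plain test is idiomatic. A minimal sketch of the lifetime guarantee, using stub types rather than the real DeepSpeech classes:

    #include <cassert>
    #include <memory>

    struct Scorer { double alpha = 0.75, beta = 1.85; };  // stub

    struct DecoderState {  // stub, not the real class
      std::shared_ptr<Scorer> ext_scorer_;
      void init(std::shared_ptr<Scorer> ext_scorer) {
        ext_scorer_ = std::move(ext_scorer);  // decoder now co-owns the scorer
      }
    };

    int main() {
      auto scorer = std::make_shared<Scorer>();
      DecoderState state;
      state.init(scorer);

      scorer.reset();  // the original owner drops its reference
      assert(state.ext_scorer_ != nullptr);  // the decode still holds a live scorer
      return 0;
    }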
@@ -1,6 +1,7 @@
 #ifndef CTC_BEAM_SEARCH_DECODER_H_
 #define CTC_BEAM_SEARCH_DECODER_H_
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -16,7 +17,7 @@ class DecoderState {
   double cutoff_prob_;
   size_t cutoff_top_n_;
 
-  Scorer* ext_scorer_; // weak
+  std::shared_ptr<Scorer> ext_scorer_;
   std::vector<PathTrie*> prefixes_;
   std::unique_ptr<PathTrie> prefix_root_;
 
@@ -45,7 +46,7 @@ public:
             size_t beam_size,
             double cutoff_prob,
             size_t cutoff_top_n,
-            Scorer *ext_scorer);
+            std::shared_ptr<Scorer> ext_scorer);
 
   /* Send data to the decoder
    *
@@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 /* CTC Beam Search Decoder for batch data
  * Parameters:
@@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 #endif // CTC_BEAM_SEARCH_DECODER_H_
@@ -71,8 +71,19 @@ void Scorer::setup_char_map()
 
 int Scorer::load_lm(const std::string& lm_path)
 {
-  // load language model
+  // Check if file is readable to avoid KenLM throwing an exception
+  const char* filename = lm_path.c_str();
+  if (access(filename, R_OK) != 0) {
+    return 1;
+  }
+
+  // Check if the file format is valid to avoid KenLM throwing an exception
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(filename, model_type)) {
+    return 1;
+  }
+
+  // Load the LM
   lm::ngram::Config config;
   config.load_method = util::LoadMethod::LAZY;
   language_model_.reset(lm::ngram::LoadVirtual(filename, config));
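The two new checks exist because KenLM reports unreadable or unrecognized files by throwing C++ exceptions, while `load_lm` is expected to fail with an error code. Probing the file with `access()` and `RecognizeBinary()` first keeps those exceptions from escaping. A reduced sketch of the guard pattern, with a hypothetical `load_model_file` and the KenLM calls elided:

    #include <string>
    #include <unistd.h>  // access()

    // Hypothetical reduced version; the real code hands the vetted path to
    // lm::ngram::RecognizeBinary() and lm::ngram::LoadVirtual().
    int load_model_file(const std::string& path) {
      // Check readability up front and return an error code rather than
      // letting the parser throw across an error-code-based boundary.
      if (access(path.c_str(), R_OK) != 0) {
        return 1;
      }
      // ... format check and the actual load would follow here ...
      return 0;
    }

    int main() {
      return load_model_file("kenlm.scorer");  // nonzero exit if unreadable
    }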
@@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path)
   int magic;
   fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
   if (magic != MAGIC) {
-    std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
-                 "your trie file." << std::endl;
+    std::cerr << "Error: Can't parse scorer file, invalid header. Try updating "
+                 "your scorer file." << std::endl;
     return 1;
   }
 
   int version;
   fin.read(reinterpret_cast<char*>(&version), sizeof(version));
   if (version != FILE_VERSION) {
-    std::cerr << "Error: Trie file version mismatch (" << version
+    std::cerr << "Error: Scorer file version mismatch (" << version
               << " instead of expected " << FILE_VERSION
               << "). ";
     if (version < FILE_VERSION) {
-      std::cerr << "Update your trie file.";
+      std::cerr << "Update your scorer file.";
     } else {
-      std::cerr << "Downgrade your trie file or update your version of DeepSpeech.";
+      std::cerr << "Downgrade your scorer file or update your version of DeepSpeech.";
     }
     std::cerr << std::endl;
     return 1;
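The reworded messages sit on top of an unchanged file-format check: a magic number rejects files of the wrong type outright, and a version field distinguishes "file too old" from "software too old". A standalone sketch of that header-check pattern, with placeholder MAGIC and FILE_VERSION values rather than DeepSpeech's real constants:

    #include <fstream>
    #include <iostream>

    constexpr int MAGIC = 0x44534352;  // placeholder tag, not the real value
    constexpr int FILE_VERSION = 1;    // placeholder version

    // Returns true if the stream starts with the expected magic and version.
    bool check_header(std::istream& fin) {
      int magic = 0;
      fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
      if (!fin || magic != MAGIC) {
        std::cerr << "Invalid header: not a scorer file." << std::endl;
        return false;
      }
      int version = 0;
      fin.read(reinterpret_cast<char*>(&version), sizeof(version));
      if (!fin || version != FILE_VERSION) {
        // Mismatches cut both ways: an old file needs regenerating,
        // a new file needs newer software.
        std::cerr << (version < FILE_VERSION ? "Update your scorer file."
                                             : "Update your software.")
                  << std::endl;
        return false;
      }
      return true;
    }

    int main() {
      std::ifstream fin("kenlm.scorer", std::ios::binary);
      return fin && check_header(fin) ? 0 : 1;
    }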
@@ -7,9 +7,10 @@
 #include "workspace_status.h"
 %}
 
-%include "pyabc.i"
-%include "std_string.i"
-%include "std_vector.i"
+%include <pyabc.i>
+%include <std_string.i>
+%include <std_vector.i>
+%include <std_shared_ptr.i>
 %include "numpy.i"
 
 %init %{
@@ -20,6 +21,8 @@ namespace std {
   %template(StringVector) vector<string>;
 }
 
+%shared_ptr(Scorer);
+
 // Convert NumPy arrays to pointer+lengths
 %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)};
 %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)};
@@ -319,7 +319,7 @@ int
 DS_DisableExternalScorer(ModelState* aCtx)
 {
   if (aCtx->scorer_) {
-    aCtx->scorer_.reset(nullptr);
+    aCtx->scorer_.reset();
     return DS_ERR_OK;
   }
   return DS_ERR_SCORER_NOT_ENABLED;
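This one-line change is forced by the member's type change from `std::unique_ptr<Scorer>` to `std::shared_ptr<Scorer>` (see the `ModelState` hunk further down): `unique_ptr::reset` accepts a pointer argument, but `shared_ptr`'s templated `reset(Y*)` cannot deduce `Y` from a bare `nullptr`, so the no-argument overload is the way to drop ownership. A small sketch of the difference:

    #include <memory>

    struct Scorer {};  // stub

    int main() {
      std::unique_ptr<Scorer> owned(new Scorer());
      owned.reset(nullptr);  // fine: unique_ptr::reset takes its pointer type

      std::shared_ptr<Scorer> shared = std::make_shared<Scorer>();
      // shared.reset(nullptr);  // ill-formed: reset(Y*) cannot deduce Y from nullptr
      shared.reset();  // the no-argument overload releases this reference
      return (owned == nullptr && shared == nullptr) ? 0 : 1;
    }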
@@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx,
                  aCtx->beam_width_,
                  cutoff_prob,
                  cutoff_top_n,
-                 aCtx->scorer_.get());
+                 aCtx->scorer_);
 
   *retval = ctx.release();
   return DS_ERR_OK;
@@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 
 * ``output_graph.tflite`` which is the TF Lite model
-* ``kenlm.scorer``, if you want to use the language model ; please
-  be aware that too big language model will make the device run out of memory
+* ``kenlm.scorer``, if you want to use the scorer; please be aware that too big
+  a scorer will make the device run out of memory
 
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
 
 * ``deepspeech``
 * ``libdeepspeech.so``
 * ``libc++_shared.so``
@@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
@@ -16,7 +16,7 @@ struct ModelState {
   static constexpr unsigned int BATCH_SIZE = 1;
 
   Alphabet alphabet_;
-  std::unique_ptr<Scorer> scorer_;
+  std::shared_ptr<Scorer> scorer_;
   unsigned int beam_width_;
   unsigned int n_steps_;
   unsigned int n_context_;
@@ -95,9 +95,9 @@ def main():
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
     parser.add_argument('--lm_alpha', type=float,
-                        help='Language model weight (lm_alpha)')
+                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
     parser.add_argument('--lm_beta', type=float,
-                        help='Word insertion bonus (lm_beta)')
+                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
     parser.add_argument('--extended', required=False, action='store_true',