From 1d3b3a31a18e188743fc4c9620306a4cc45c6c10 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Wed, 22 Jan 2020 15:18:17 +0100
Subject: [PATCH] Address review comments and update docs

---
 .gitattributes                                 |  3 ---
 README.rst                                     |  4 ++--
 data/README.rst                                |  4 +---
 data/lm/README.rst                             |  6 +++---
 data/lm/generate_lm.py                         |  7 +++++--
 data/lm/generate_package.py                    |  1 +
 doc/C-Examples.rst                             |  2 +-
 doc/NodeJS-API.rst                             |  6 ++++++
 doc/NodeJS-Examples.rst                        |  4 ++--
 doc/Python-API.rst                             |  6 ++++++
 doc/Python-Examples.rst                        |  4 ++--
 doc/USING.rst                                  |  6 +++---
 native_client/args.h                           | 18 +++++++++---------
 native_client/ctcdecode/__init__.py            |  8 ++++----
 .../ctcdecode/ctc_beam_search_decoder.cpp      |  2 +-
 native_client/java/README.rst                  |  2 +-
 native_client/javascript/index.js              |  5 +++++
 native_client/python/__init__.py               |  4 ++++
 taskcluster/examples-base.tyml                 |  2 +-
 taskcluster/win-opt-base.tyml                  |  2 +-
 20 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/.gitattributes b/.gitattributes
index b2aaede4..4e2fd505 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +1 @@
-*.binary filter=lfs diff=lfs merge=lfs -crlf
-data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
-data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
 data/lm/kenlm.scorer filter=lfs diff=lfs merge=lfs -text
diff --git a/README.rst b/README.rst
index d2ec566b..e0ed5ad8 100644
--- a/README.rst
+++ b/README.rst
@@ -36,7 +36,7 @@ To install and use deepspeech all you have to do is:
    tar xvf audio-0.6.1.tar.gz
 
    # Transcribe an audio file
-   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav
+   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav
 
 A pre-trained English model is available for use and can be downloaded using `the instructions below <doc/USING.rst#using-a-pre-trained-model>`_. A package with some example audio files is available for download in our `release notes <https://github.com/mozilla/DeepSpeech/releases/latest>`_.
 
@@ -52,7 +52,7 @@ Quicker inference can be performed using a supported NVIDIA GPU on Linux. See th
    pip3 install deepspeech-gpu
 
    # Transcribe an audio file.
-   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav
+   deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav
 
 Please ensure you have the required `CUDA dependencies <doc/USING.rst#cuda-dependency>`_.
 
diff --git a/data/README.rst b/data/README.rst
index 54230080..9db78c6b 100644
--- a/data/README.rst
+++ b/data/README.rst
@@ -5,9 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
 
-2. A binary n-gram language model compiled by `kenlm` in `data/lm/lm.binary`
-
-3. A trie model compiled by `generate_trie <https://github.com/mozilla/DeepSpeech#using-the-command-line-client>`_ in `data/lm/trie`
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 
 For more information on how to build these resources from scratch, see `data/lm/README.md`
 
diff --git a/data/lm/README.rst b/data/lm/README.rst
index bd2c2d3b..c1666700 100644
--- a/data/lm/README.rst
+++ b/data/lm/README.rst
@@ -1,8 +1,8 @@
 
-lm.binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary and librispeech-vocab-500k.txt in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 
-The trie was then generated from the vocabulary of the language model:
+The scorer package was then built using the `generate_package.py` script:
 
 .. code-block:: bash
 
-   ./generate_trie ../data/alphabet.txt lm.binary trie
+   python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py
index 6dc320a5..00049996 100644
--- a/data/lm/generate_lm.py
+++ b/data/lm/generate_lm.py
@@ -39,10 +39,13 @@ def main():
                '--prune', '0', '0', '1'
     ])
 
-    # Filter LM using vocabulary of top 500k words
-    filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
     vocab_str = '\n'.join(word for word, count in counter.most_common(500000))
+    with open('librispeech-vocab-500k.txt', 'w') as fout:
+      fout.write(vocab_str)
+
+    # Filter LM using vocabulary of top 500k words
     print('Filtering ARPA file...')
+    filtered_path = os.path.join(tmp, 'lm_filtered.arpa')
     subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True)
 
     # Quantize and produce trie binary.
diff --git a/data/lm/generate_package.py b/data/lm/generate_package.py
index 2b9acf33..3b9aa372 100644
--- a/data/lm/generate_package.py
+++ b/data/lm/generate_package.py
@@ -41,6 +41,7 @@ def create_bundle(
 
     if force_utf8 != None:  # pylint: disable=singleton-comparison
         use_utf8 = force_utf8.value
+        print("Forcing UTF-8 mode = {}".format(use_utf8))
     else:
         use_utf8 = vocab_looks_char_based
 
diff --git a/doc/C-Examples.rst b/doc/C-Examples.rst
index 44ab46ac..5072ba30 100644
--- a/doc/C-Examples.rst
+++ b/doc/C-Examples.rst
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/client.cc
    :language: c
    :linenos:
-   :lines: 370-388
+   :lines: 370-390
 
 Performing inference
 --------------------
diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst
index aa92e361..acdc3ab7 100644
--- a/doc/NodeJS-API.rst
+++ b/doc/NodeJS-API.rst
@@ -7,6 +7,12 @@ Model
 .. js:autoclass:: Model
    :members:
 
+Stream
+------
+
+.. js:autoclass:: Stream
+   :members:
+
 Module exported methods
 -----------------------
 
diff --git a/doc/NodeJS-Examples.rst b/doc/NodeJS-Examples.rst
index 4e8a73b3..a9549525 100644
--- a/doc/NodeJS-Examples.rst
+++ b/doc/NodeJS-Examples.rst
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/javascript/client.js
    :language: javascript
    :linenos:
-   :lines: 57-66
+   :lines: 54-72
 
 Performing inference
 --------------------
@@ -15,7 +15,7 @@ Performing inference
 .. literalinclude:: ../native_client/javascript/client.js
    :language: javascript
    :linenos:
-   :lines: 115-117
+   :lines: 117-121
 
 Full source code
 ----------------
diff --git a/doc/Python-API.rst b/doc/Python-API.rst
index 08851da6..b2b3567f 100644
--- a/doc/Python-API.rst
+++ b/doc/Python-API.rst
@@ -9,6 +9,12 @@ Model
 .. autoclass:: Model
    :members:
 
+Stream
+------
+
+.. autoclass:: Stream
+   :members:
+
 Metadata
 --------
 
diff --git a/doc/Python-Examples.rst b/doc/Python-Examples.rst
index 2cca86a0..26aee69c 100644
--- a/doc/Python-Examples.rst
+++ b/doc/Python-Examples.rst
@@ -7,7 +7,7 @@ Creating a model instance and loading model
 .. literalinclude:: ../native_client/python/client.py
    :language: python
    :linenos:
-   :lines: 69, 78
+   :lines: 111, 120
 
 Performing inference
 --------------------
@@ -15,7 +15,7 @@ Performing inference
 .. literalinclude:: ../native_client/python/client.py
    :language: python
    :linenos:
-   :lines: 95-98
+   :lines: 140-145
 
 Full source code
 ----------------
diff --git a/doc/USING.rst b/doc/USING.rst
index 9769d386..465d4319 100644
--- a/doc/USING.rst
+++ b/doc/USING.rst
@@ -106,9 +106,9 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
 
 .. code-block:: bash
 
-   deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+   deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio my_audio_file.wav
 
-The arguments ``--lm`` and ``--trie`` are optional, and represent a language model.
+The ``--scorer`` argument is optional, and represents an external language model to be used when transcribing the audio.
 
 See :github:`client.py <native_client/python/client.py>` for an example of how to use the package programatically.
 
@@ -162,7 +162,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
 
 .. code-block:: bash
 
-   ./deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio audio_input.wav
+   ./deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio audio_input.wav
 
 See the help output with ``./deepspeech -h`` and the :github:`native client README <native_client/README.rst>` for more details.
 
diff --git a/native_client/args.h b/native_client/args.h
index a158fb18..d5a0f869 100644
--- a/native_client/args.h
+++ b/native_client/args.h
@@ -59,11 +59,11 @@ void PrintHelp(const char* bin)
 
 bool ProcessArgs(int argc, char** argv)
 {
-    const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
+    const char* const short_opts = "m:l:a:b:c:d:tejs:vh";
     const option long_opts[] = {
             {"model", required_argument, nullptr, 'm'},
             {"scorer", required_argument, nullptr, 'l'},
-            {"audio", required_argument, nullptr, 'w'},
+            {"audio", required_argument, nullptr, 'a'},
             {"beam_width", required_argument, nullptr, 'b'},
             {"lm_alpha", required_argument, nullptr, 'c'},
             {"lm_beta", required_argument, nullptr, 'd'},
@@ -71,8 +71,8 @@ bool ProcessArgs(int argc, char** argv)
             {"extended", no_argument, nullptr, 'e'},
             {"json", no_argument, nullptr, 'j'},
             {"stream", required_argument, nullptr, 's'},
-            {"help", no_argument, nullptr, 'h'},
             {"version", no_argument, nullptr, 'v'},
+            {"help", no_argument, nullptr, 'h'},
             {nullptr, no_argument, nullptr, 0}
     };
 
@@ -93,14 +93,14 @@ bool ProcessArgs(int argc, char** argv)
             scorer = optarg;
             break;
 
-        case 'w':
+        case 'a':
             audio = optarg;
             break;
 
         case 'b':
             beam_width = atoi(optarg);
             break;
-        
+
         case 'c':
             set_alphabeta = true;
             lm_alpha = atof(optarg);
@@ -115,10 +115,6 @@ bool ProcessArgs(int argc, char** argv)
             show_times = true;
             break;
 
-        case 'v':
-            has_versions = true;
-            break;
-
         case 'e':
             extended_metadata = true;
             break;
@@ -131,6 +127,10 @@ bool ProcessArgs(int argc, char** argv)
             stream_size = atoi(optarg);
             break;
 
+        case 'v':
+            has_versions = true;
+            break;
+
         case 'h': // -h or --help
         case '?': // Unrecognized option
         default:
diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py
index 8ba2e9b2..2474741f 100644
--- a/native_client/ctcdecode/__init__.py
+++ b/native_client/ctcdecode/__init__.py
@@ -12,11 +12,11 @@ class Scorer(swigwrapper.Scorer):
     :type alpha: float
     :param beta: Word insertion bonus.
     :type beta: float
-    :model_path: Path to load scorer.
+    :param scorer_path: Path to load scorer from.
     :alphabet: Alphabet
-    :type model_path: basestring
+    :type scorer_path: basestring
     """
-    def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
+    def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
@@ -26,7 +26,7 @@ class Scorer(swigwrapper.Scorer):
             if err != 0:
                 raise ValueError("Error when deserializing alphabet.")
 
-            err = self.init(model_path.encode('utf-8'),
+            err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
                 raise ValueError("Scorer initialization failed with error code {}".format(err), err)
diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 852ef34c..2958dec9 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
 
-  if (ext_scorer != nullptr && (bool)ext_scorer_->dictionary) {
+  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
diff --git a/native_client/java/README.rst b/native_client/java/README.rst
index c345c094..7b3e3dcc 100644
--- a/native_client/java/README.rst
+++ b/native_client/java/README.rst
@@ -51,7 +51,7 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 
 
 * ``output_graph.tflite`` which is the TF Lite model
-* ``lm.binary`` and ``trie`` files, if you want to use the language model ; please
+* ``kenlm.scorer``, if you want to use the language model; please
   be aware that too big language model will make the device run out of memory
 
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 2ce039bf..772b1a82 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -123,6 +123,11 @@ Model.prototype.createStream = function() {
     return ctx;
 }
 
+/**
+ * @class
+ * Provides an interface to a DeepSpeech stream. The constructor cannot be called
+ * directly; use :js:func:`Model.createStream` instead.
+ */
 function Stream(nativeStream) {
     this._impl = nativeStream;
 }
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index ee38287f..ccb53fc4 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -131,6 +131,10 @@ class Model(object):
 
 
 class Stream(object):
+    """
+    Class wrapping a DeepSpeech stream. The constructor cannot be called directly.
+    Use :func:`Model.createStream()` instead.
+    """
     def __init__(self, native_stream):
         self._impl = native_stream
 
diff --git a/taskcluster/examples-base.tyml b/taskcluster/examples-base.tyml
index 9739f36a..acee40d9 100644
--- a/taskcluster/examples-base.tyml
+++ b/taskcluster/examples-base.tyml
@@ -34,7 +34,7 @@ then:
       DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
       PIP_DEFAULT_TIMEOUT: "60"
       EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-      EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
+      EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a"
 
     command:
       - "/bin/bash"
diff --git a/taskcluster/win-opt-base.tyml b/taskcluster/win-opt-base.tyml
index e0c12162..6bcc0acd 100644
--- a/taskcluster/win-opt-base.tyml
+++ b/taskcluster/win-opt-base.tyml
@@ -44,7 +44,7 @@ payload:
     MSYS: 'winsymlinks:nativestrict'
     TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
     EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-    EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
+    EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a"
 
   command:
     - >-