From 1d3b3a31a18e188743fc4c9620306a4cc45c6c10 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 22 Jan 2020 15:18:17 +0100 Subject: [PATCH] Address review comments and update docs --- .gitattributes | 3 --- README.rst | 4 ++-- data/README.rst | 4 +--- data/lm/README.rst | 6 +++--- data/lm/generate_lm.py | 7 +++++-- data/lm/generate_package.py | 1 + doc/C-Examples.rst | 2 +- doc/NodeJS-API.rst | 6 ++++++ doc/NodeJS-Examples.rst | 4 ++-- doc/Python-API.rst | 6 ++++++ doc/Python-Examples.rst | 4 ++-- doc/USING.rst | 6 +++--- native_client/args.h | 18 +++++++++--------- native_client/ctcdecode/__init__.py | 8 ++++---- .../ctcdecode/ctc_beam_search_decoder.cpp | 2 +- native_client/java/README.rst | 2 +- native_client/javascript/index.js | 5 +++++ native_client/python/__init__.py | 4 ++++ taskcluster/examples-base.tyml | 2 +- taskcluster/win-opt-base.tyml | 2 +- 20 files changed, 58 insertions(+), 38 deletions(-) diff --git a/.gitattributes b/.gitattributes index b2aaede4..4e2fd505 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1 @@ -*.binary filter=lfs diff=lfs merge=lfs -crlf -data/lm/trie filter=lfs diff=lfs merge=lfs -crlf -data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text data/lm/kenlm.scorer filter=lfs diff=lfs merge=lfs -text diff --git a/README.rst b/README.rst index d2ec566b..e0ed5ad8 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ To install and use deepspeech all you have to do is: tar xvf audio-0.6.1.tar.gz # Transcribe an audio file - deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav + deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav A pre-trained English model is available for use and can be downloaded using `the instructions below `_. 
A package with some example audio files is available for download in our `release notes `_. @@ -52,7 +52,7 @@ Quicker inference can be performed using a supported NVIDIA GPU on Linux. See th pip3 install deepspeech-gpu # Transcribe an audio file. - deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio audio/2830-3980-0043.wav + deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --scorer deepspeech-0.6.1-models/kenlm.scorer --audio audio/2830-3980-0043.wav Please ensure you have the required `CUDA dependencies `_. diff --git a/data/README.rst b/data/README.rst index 54230080..9db78c6b 100644 --- a/data/README.rst +++ b/data/README.rst @@ -5,9 +5,7 @@ This directory contains language-specific data files. Most importantly, you will 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt` -2. A binary n-gram language model compiled by `kenlm` in `data/lm/lm.binary` - -3. A trie model compiled by `generate_trie `_ in `data/lm/trie` +2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`. For more information on how to build these resources from scratch, see `data/lm/README.md` diff --git a/data/lm/README.rst b/data/lm/README.rst index bd2c2d3b..c1666700 100644 --- a/data/lm/README.rst +++ b/data/lm/README.rst @@ -1,8 +1,8 @@ -lm.binary was generated from the LibriSpeech normalized LM training text, available `here `_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM `_'s built binaries must be in your PATH (lmplz, build_binary, filter). +The LM binary was generated from the LibriSpeech normalized LM training text, available `here `_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from).
`KenLM `_'s built binaries must be in your PATH (lmplz, build_binary, filter). -The trie was then generated from the vocabulary of the language model: +The scorer package was then built using the `generate_package.py` script: .. code-block:: bash - ./generate_trie ../data/alphabet.txt lm.binary trie + python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py index 6dc320a5..00049996 100644 --- a/data/lm/generate_lm.py +++ b/data/lm/generate_lm.py @@ -39,10 +39,13 @@ def main(): '--prune', '0', '0', '1' ]) - # Filter LM using vocabulary of top 500k words - filtered_path = os.path.join(tmp, 'lm_filtered.arpa') vocab_str = '\n'.join(word for word, count in counter.most_common(500000)) + with open('librispeech-vocab-500k.txt', 'w') as fout: + fout.write(vocab_str) + + # Filter LM using vocabulary of top 500k words print('Filtering ARPA file...') + filtered_path = os.path.join(tmp, 'lm_filtered.arpa') subprocess.run(['filter', 'single', 'model:{}'.format(lm_path), filtered_path], input=vocab_str.encode('utf-8'), check=True) # Quantize and produce trie binary. diff --git a/data/lm/generate_package.py b/data/lm/generate_package.py index 2b9acf33..3b9aa372 100644 --- a/data/lm/generate_package.py +++ b/data/lm/generate_package.py @@ -41,6 +41,7 @@ def create_bundle( if force_utf8 != None: # pylint: disable=singleton-comparison use_utf8 = force_utf8.value + print("Forcing UTF-8 mode = {}".format(use_utf8)) else: use_utf8 = vocab_looks_char_based diff --git a/doc/C-Examples.rst b/doc/C-Examples.rst index 44ab46ac..5072ba30 100644 --- a/doc/C-Examples.rst +++ b/doc/C-Examples.rst @@ -7,7 +7,7 @@ Creating a model instance and loading model .. 
literalinclude:: ../native_client/client.cc :language: c :linenos: - :lines: 370-388 + :lines: 370-390 Performing inference -------------------- diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst index aa92e361..acdc3ab7 100644 --- a/doc/NodeJS-API.rst +++ b/doc/NodeJS-API.rst @@ -7,6 +7,12 @@ Model .. js:autoclass:: Model :members: +Stream +------ + +.. js:autoclass:: Stream + :members: + Module exported methods ----------------------- diff --git a/doc/NodeJS-Examples.rst b/doc/NodeJS-Examples.rst index 4e8a73b3..a9549525 100644 --- a/doc/NodeJS-Examples.rst +++ b/doc/NodeJS-Examples.rst @@ -7,7 +7,7 @@ Creating a model instance and loading model .. literalinclude:: ../native_client/javascript/client.js :language: javascript :linenos: - :lines: 57-66 + :lines: 54-72 Performing inference -------------------- @@ -15,7 +15,7 @@ Performing inference .. literalinclude:: ../native_client/javascript/client.js :language: javascript :linenos: - :lines: 115-117 + :lines: 117-121 Full source code ---------------- diff --git a/doc/Python-API.rst b/doc/Python-API.rst index 08851da6..b2b3567f 100644 --- a/doc/Python-API.rst +++ b/doc/Python-API.rst @@ -9,6 +9,12 @@ Model .. autoclass:: Model :members: +Stream +------ + +.. autoclass:: Stream + :members: + Metadata -------- diff --git a/doc/Python-Examples.rst b/doc/Python-Examples.rst index 2cca86a0..26aee69c 100644 --- a/doc/Python-Examples.rst +++ b/doc/Python-Examples.rst @@ -7,7 +7,7 @@ Creating a model instance and loading model .. literalinclude:: ../native_client/python/client.py :language: python :linenos: - :lines: 69, 78 + :lines: 111, 120 Performing inference -------------------- @@ -15,7 +15,7 @@ Performing inference .. 
literalinclude:: ../native_client/python/client.py :language: python :linenos: - :lines: 95-98 + :lines: 140-145 Full source code ---------------- diff --git a/doc/USING.rst b/doc/USING.rst index 9769d386..465d4319 100644 --- a/doc/USING.rst +++ b/doc/USING.rst @@ -106,9 +106,9 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett .. code-block:: bash - deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav + deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio my_audio_file.wav -The arguments ``--lm`` and ``--trie`` are optional, and represent a language model. +The ``--scorer`` argument is optional, and represents an external language model to be used when transcribing the audio. See :github:`client.py ` for an example of how to use the package programatically. @@ -162,7 +162,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett .. code-block:: bash - ./deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio audio_input.wav + ./deepspeech --model models/output_graph.pbmm --scorer models/kenlm.scorer --audio audio_input.wav See the help output with ``./deepspeech -h`` and the :github:`native client README ` for more details. 
diff --git a/native_client/args.h b/native_client/args.h index a158fb18..d5a0f869 100644 --- a/native_client/args.h +++ b/native_client/args.h @@ -59,11 +59,11 @@ void PrintHelp(const char* bin) bool ProcessArgs(int argc, char** argv) { - const char* const short_opts = "m:a:s:r:w:c:d:b:tehv"; + const char* const short_opts = "m:l:a:b:c:d:tejs:vh"; const option long_opts[] = { {"model", required_argument, nullptr, 'm'}, {"scorer", required_argument, nullptr, 'l'}, - {"audio", required_argument, nullptr, 'w'}, + {"audio", required_argument, nullptr, 'a'}, {"beam_width", required_argument, nullptr, 'b'}, {"lm_alpha", required_argument, nullptr, 'c'}, {"lm_beta", required_argument, nullptr, 'd'}, @@ -71,8 +71,8 @@ bool ProcessArgs(int argc, char** argv) {"extended", no_argument, nullptr, 'e'}, {"json", no_argument, nullptr, 'j'}, {"stream", required_argument, nullptr, 's'}, - {"help", no_argument, nullptr, 'h'}, {"version", no_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, 'h'}, {nullptr, no_argument, nullptr, 0} }; @@ -93,14 +93,14 @@ bool ProcessArgs(int argc, char** argv) scorer = optarg; break; - case 'w': + case 'a': audio = optarg; break; case 'b': beam_width = atoi(optarg); break; - + case 'c': set_alphabeta = true; lm_alpha = atof(optarg); @@ -115,10 +115,6 @@ bool ProcessArgs(int argc, char** argv) show_times = true; break; - case 'v': - has_versions = true; - break; - case 'e': extended_metadata = true; break; @@ -131,6 +127,10 @@ bool ProcessArgs(int argc, char** argv) stream_size = atoi(optarg); break; + case 'v': + has_versions = true; + break; + case 'h': // -h or --help case '?': // Unrecognized option default: diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index 8ba2e9b2..2474741f 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -12,11 +12,11 @@ class Scorer(swigwrapper.Scorer): :type alpha: float :param beta: Word insertion bonus. 
:type beta: float - :model_path: Path to load scorer. + :param scorer_path: Path to load scorer from. :alphabet: Alphabet - :type model_path: basestring + :type scorer_path: basestring """ - def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None): + def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None): super(Scorer, self).__init__() # Allow bare initialization if alphabet: @@ -26,7 +26,7 @@ class Scorer(swigwrapper.Scorer): if err != 0: raise ValueError("Error when deserializing alphabet.") - err = self.init(model_path.encode('utf-8'), + err = self.init(scorer_path.encode('utf-8'), native_alphabet) if err != 0: raise ValueError("Scorer initialization failed with error code {}".format(err), err) diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp index 852ef34c..2958dec9 100644 --- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp +++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp @@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet, prefix_root_.reset(root); prefixes_.push_back(root); - if (ext_scorer != nullptr && (bool)ext_scorer_->dictionary) { + if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) { // no need for std::make_shared<>() since Copy() does 'new' behind the doors auto dict_ptr = std::shared_ptr(ext_scorer->dictionary->Copy(true)); root->set_dictionary(dict_ptr); diff --git a/native_client/java/README.rst b/native_client/java/README.rst index c345c094..7b3e3dcc 100644 --- a/native_client/java/README.rst +++ b/native_client/java/README.rst @@ -51,7 +51,7 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including: * ``output_graph.tflite`` which is the TF Lite model -* ``lm.binary`` and ``trie`` files, if you want to use the language model ; please +* ``kenlm.scorer``, if you want to use the language model ; please be aware that too big language model will make the device run out of memory Then, push binaries
from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ : diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js index 2ce039bf..772b1a82 100644 --- a/native_client/javascript/index.js +++ b/native_client/javascript/index.js @@ -123,6 +123,11 @@ Model.prototype.createStream = function() { return ctx; } +/** + * @class + * Provides an interface to a DeepSpeech stream. The constructor cannot be called + * directly; use :js:func:`Model.createStream` instead. + */ function Stream(nativeStream) { this._impl = nativeStream; } diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index ee38287f..ccb53fc4 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -131,6 +131,10 @@ class Model(object): class Stream(object): + """ + Class wrapping a DeepSpeech stream. The constructor cannot be called directly. + Use :func:`Model.createStream()` instead. + """ def __init__(self, native_stream): self._impl = native_stream diff --git a/taskcluster/examples-base.tyml b/taskcluster/examples-base.tyml index 9739f36a..acee40d9 100644 --- a/taskcluster/examples-base.tyml +++ b/taskcluster/examples-base.tyml @@ -34,7 +34,7 @@ then: DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz" PIP_DEFAULT_TIMEOUT: "60" EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" - EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05" + EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a" command: - "/bin/bash" diff --git a/taskcluster/win-opt-base.tyml b/taskcluster/win-opt-base.tyml index e0c12162..6bcc0acd 100644 --- a/taskcluster/win-opt-base.tyml +++ b/taskcluster/win-opt-base.tyml @@ -44,7 +44,7 @@ payload: MSYS: 'winsymlinks:nativestrict' TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow} EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" - EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05" +
EXAMPLES_CHECKOUT_TARGET: "4b97ac41d03ca0d23fa92526433db72a90f47d4a" command: - >-