From 4356a2764ba086dc7f4049d4c90a4d07d482b3f0 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Sun, 24 May 2020 15:34:25 +0200 Subject: [PATCH] Add data/lm doc to RTD, and some general doc improvements and fixes --- data/lm/README.rst | 32 --------- doc/C-API.rst | 11 +++- doc/Contributed-Examples.rst | 4 ++ doc/Decoder.rst | 6 +- doc/DotNet-API.rst | 17 +++-- doc/DotNet-Examples.rst | 2 +- doc/DotNet-contrib-examples.rst | 14 ---- doc/Error-Codes.rst | 2 + doc/NodeJS-contrib-Examples.rst | 25 ------- doc/Python-contrib-Examples.rst | 26 -------- doc/Scorer.rst | 57 ++++++++++++++++ doc/TRAINING.rst | 65 ++++++++++--------- doc/conf.py | 4 +- doc/index.rst | 28 +++----- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 2 +- .../Interfaces/IDeepSpeech.cs | 2 +- 16 files changed, 137 insertions(+), 160 deletions(-) delete mode 100644 data/lm/README.rst create mode 100644 doc/Contributed-Examples.rst delete mode 100644 doc/DotNet-contrib-examples.rst delete mode 100644 doc/NodeJS-contrib-Examples.rst delete mode 100644 doc/Python-contrib-Examples.rst create mode 100644 doc/Scorer.rst diff --git a/data/lm/README.rst b/data/lm/README.rst deleted file mode 100644 index ed7f017d..00000000 --- a/data/lm/README.rst +++ /dev/null @@ -1,32 +0,0 @@ -The LM binary was generated from the LibriSpeech normalized LM training text, available `here `_. -It is created with `KenLM `_. - - -You can download the LibriSpeech corpus with the following commands: - -.. code-block:: bash - - wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz - - -Then use the ``generate_lm.py`` script to generate ``lm.binary`` and ``vocab-500000.txt``. - -As input you can use a plain text (e.g. ``file.txt``) or gzipped (e.g. ``file.txt.gz``) text file with one sentence in each line. - -If you are using a container created from the Dockerfile, you can use ``--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/``. -Else you have to build `KenLM `_ first and then pass the build directory to the script. - -.. code-block:: bash - - python3 generate_lm.py --input_txt librispeech-lm-norm.txt.gz --output_dir . \ - --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \ - --arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \ - --binary_a_bits 255 --binary_q_bits 8 --binary_type trie - - -Afterwards you can use ``generate_package.py`` to generate the scorer package using the ``lm.binary`` and ``vocab-500000.txt`` files: - -.. code-block:: bash - - python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ - --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 diff --git a/doc/C-API.rst b/doc/C-API.rst index 2b0e7e05..7713eeb8 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -1,5 +1,12 @@ -C -= +C API +===== + +.. toctree:: + :maxdepth: 2 + + Structs + +See also the list of error codes including descriptions for each error in :ref:`error-codes`. .. doxygenfunction:: DS_CreateModel :project: deepspeech-c diff --git a/doc/Contributed-Examples.rst b/doc/Contributed-Examples.rst new file mode 100644 index 00000000..7eaba452 --- /dev/null +++ b/doc/Contributed-Examples.rst @@ -0,0 +1,4 @@ +User contributed examples +========================= + +There are also several user contributed examples available on a separate examples repository: `https://github.com/mozilla/DeepSpeech-examples `_. diff --git a/doc/Decoder.rst b/doc/Decoder.rst index e337f031..63e3ac2d 100644 --- a/doc/Decoder.rst +++ b/doc/Decoder.rst @@ -1,7 +1,7 @@ .. 
_decoder-docs: -CTC beam search decoder with external scorer -============================================ +CTC beam search decoder +======================= Introduction ^^^^^^^^^^^^ @@ -18,7 +18,7 @@ DeepSpeech clients support OPTIONAL use of an external language model to improve The use of an external scorer is fully optional. When an external scorer is not specified, DeepSpeech still uses a beam search decoding algorithm, but without any outside scoring. -Currently, the DeepSpeech external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. +Currently, the DeepSpeech external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. See :ref:`scorer-scripts` for more details on how to reproduce our scorer file as well as create your own. The scripts are geared towards replicating the language model files we release as part of `DeepSpeech model releases `_, but modifying them to use different datasets or language model construction parameters should be simple. diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 9effbabd..92342ded 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -1,12 +1,6 @@ -.Net Framework +.NET Framework ============== -DeepSpeech Interface --------------------- - -.. doxygeninterface:: DeepSpeechClient::Interfaces::IDeepSpeech - :project: deepspeech-dotnet - :members: DeepSpeech Class ---------------- @@ -25,6 +19,8 @@ DeepSpeechStream Class ErrorCodes ---------- +See also the main definition including descriptions for each error in :ref:`error-codes`. + .. doxygenenum:: DeepSpeechClient::Enums::ErrorCodes :project: deepspeech-dotnet @@ -48,3 +44,10 @@ TokenMetadata .. doxygenclass:: DeepSpeechClient::Models::TokenMetadata :project: deepspeech-dotnet :members: Text, Timestep, StartTime + +DeepSpeech Interface +-------------------- + +.. doxygeninterface:: DeepSpeechClient::Interfaces::IDeepSpeech + :project: deepspeech-dotnet + :members: diff --git a/doc/DotNet-Examples.rst b/doc/DotNet-Examples.rst index 69fff188..a00ee833 100644 --- a/doc/DotNet-Examples.rst +++ b/doc/DotNet-Examples.rst @@ -1,4 +1,4 @@ -.Net API Usage example +.NET API Usage example ====================== Examples are from `native_client/dotnet/DeepSpeechConsole/Program.cs`. diff --git a/doc/DotNet-contrib-examples.rst b/doc/DotNet-contrib-examples.rst deleted file mode 100644 index 60d9d163..00000000 --- a/doc/DotNet-contrib-examples.rst +++ /dev/null @@ -1,14 +0,0 @@ -.Net API contributed examples -============================= - -DeepSpeechWPF -------------- - -This examples demonstrates using the .Net Framework DeepSpeech NuGet to build -a graphical Windows application using DeepSpeech - -.. literalinclude:: examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs - :language: csharp - :linenos: - -Full source code available on `https://github.com/mozilla/DeepSpeech-examples `_. diff --git a/doc/Error-Codes.rst b/doc/Error-Codes.rst index f97ae3ea..361ca025 100644 --- a/doc/Error-Codes.rst +++ b/doc/Error-Codes.rst @@ -1,3 +1,5 @@ +.. 
_error-codes: + Error codes =========== diff --git a/doc/NodeJS-contrib-Examples.rst b/doc/NodeJS-contrib-Examples.rst deleted file mode 100644 index c8cf25cc..00000000 --- a/doc/NodeJS-contrib-Examples.rst +++ /dev/null @@ -1,25 +0,0 @@ -JavaScript contributed examples -=============================== - -NodeJS WAV ----------- - -This example demonstrates a very basic usage of the NodeJS API - -.. literalinclude:: examples/nodejs_wav/index.js - :language: javascript - :linenos: - -Full source code available on `https://github.com/mozilla/DeepSpeech-examples `_. - -FFMPEG VAD Streaming --------------------- - -This example demonstrates using the Streaming API with ffmpeg to perform some -Voice-Activity-Detection. - -.. literalinclude:: examples/ffmpeg_vad_streaming/index.js - :language: javascript - :linenos: - -Full source code available on `https://github.com/mozilla/DeepSpeech-examples `_. diff --git a/doc/Python-contrib-Examples.rst b/doc/Python-contrib-Examples.rst deleted file mode 100644 index 2b4f5e33..00000000 --- a/doc/Python-contrib-Examples.rst +++ /dev/null @@ -1,26 +0,0 @@ -Python contributed examples -=========================== - -Mic VAD Streaming ------------------ - -This example demonstrates getting audio from microphone, running -Voice-Activity-Detection and then outputting text. - -.. literalinclude:: examples/mic_vad_streaming/mic_vad_streaming.py - :language: python - :linenos: - -Full source code available on `https://github.com/mozilla/DeepSpeech-examples `_. - -VAD Transcriber ---------------- - -This example demonstrates VAD-based transcription with both console and -graphical interface. - -.. literalinclude:: examples/vad_transcriber/wavTranscriber.py - :language: python - :linenos: - -Full source code available on `https://github.com/mozilla/DeepSpeech-examples `_. diff --git a/doc/Scorer.rst b/doc/Scorer.rst new file mode 100644 index 00000000..78f72101 --- /dev/null +++ b/doc/Scorer.rst @@ -0,0 +1,57 @@ +.. _scorer-scripts: + +External scorer scripts +======================= + +DeepSpeech pre-trained models include an external scorer. This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. + +The scorer is composed of two sub-components, a KenLM language model and a trie data structure containing all words in the vocabulary. In order to create the scorer package, first we must create a KenLM language model (using ``data/lm/generate_lm.py``, and then use ``data/lm/generate_package.py`` to create the final package file including the trie data structure. + +Reproducing our external scorer +------------------------------- + +Our KenLM language model was generated from the LibriSpeech normalized LM training text, available `here `_. +It is created with `KenLM `_. + +You can download the LibriSpeech corpus with the following command: + +.. code-block:: bash + + cd data/lm + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz + +Then use the ``generate_lm.py`` script to generate ``lm.binary`` and ``vocab-500000.txt``. + +As input you can use a plain text (e.g. ``file.txt``) or gzipped (e.g. ``file.txt.gz``) text file with one sentence in each line. + +If you are using a container created from the Dockerfile, you can use ``--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/``. +Else you have to build `KenLM `_ first and then pass the build directory to the script. + +.. code-block:: bash + + cd data/lm + python3 generate_lm.py --input_txt librispeech-lm-norm.txt.gz --output_dir . 
\ + --top_k 500000 --kenlm_bins path/to/kenlm/build/bin/ \ + --arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \ + --binary_a_bits 255 --binary_q_bits 8 --binary_type trie + + +Afterwards you can use ``generate_package.py`` to generate the scorer package using the ``lm.binary`` and ``vocab-500000.txt`` files: + +.. code-block:: bash + + cd data/lm + python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ + --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 + +Building your own scorer +------------------------ + +Building your own scorer can be useful if you're using models in a narrow usage context, with a more limited vocabulary, for example. Building a scorer requires text data matching your intended use case, which must be formatted in a text file with one sentence per line. + +The LibriSpeech LM training text used by our scorer is around 4GB uncompressed, which should give an idea of the size of a corpus needed for a reasonable language model for general speech recognition. For more constrained use cases with smaller vocabularies, you don't need as much data, but you should still try to gather as much as you can. + +With a text corpus in hand, you can then re-use the ``generate_lm.py`` and ``generate_package.py`` scripts to create your own scorer that is compatible with DeepSpeech clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. + +After using ``generate_lm.py`` to create a KenLM language model binary file, you can use ``generate_package.py`` to create a scorer package as described in the previous section. Note that we have a :github:`lm_optimizer.py script ` which can be used to find good default values for alpha and beta. To use it, you must first +generate a package with any value set for default alpha and beta flags. For this step, it doesn't matter what values you use, as they'll be overridden by ``lm_optimizer.py``. Then, use ``lm_optimizer.py`` with this scorer file to find good alpha and beta values. Finally, use ``generate_package.py`` again, this time with the new values. \ No newline at end of file diff --git a/doc/TRAINING.rst b/doc/TRAINING.rst index aebf9af8..99dd849a 100644 --- a/doc/TRAINING.rst +++ b/doc/TRAINING.rst @@ -258,6 +258,7 @@ You need to specify the location of the pre-trained model with ``--load_checkpoi --train_files my-new-language-train.csv \ --dev_files my-new-language-dev.csv \ --test_files my-new-language-test.csv + UTF-8 mode ^^^^^^^^^^ @@ -277,22 +278,28 @@ Augmentations that are applied before potential feature caching can be specified Each sample of the training data will get treated by every specified augmentation in their given order. However: whether an augmentation will actually get applied to a sample is decided by chance on base of the augmentation's probability value. For example a value of ``p=0.1`` would apply the according augmentation to just 10% of all samples. This also means that augmentations are not mutually exclusive on a per-sample basis. - The ``--augment`` flag uses a common syntax for all augmentation types: ``--augment augmentation_type1[param1=value1,param2=value2,...] --augment augmentation_type2[param1=value1,param2=value2,...] ...``. 
For example, for the ``overlay`` augmentation:
+The ``--augment`` flag uses a common syntax for all augmentation types:
 
-.. code-block:: bash
+.. code-block::
 
-    python3 DeepSpeech.py --augment overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0] ...
+    --augment augmentation_type1[param1=value1,param2=value2,...] --augment augmentation_type2[param1=value1,param2=value2,...] ...
+
+For example, for the ``overlay`` augmentation:
+
+.. code-block::
+
+    python3 DeepSpeech.py --augment overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0] ...
 
 In the documentation below, whenever a value is specified as ``<float-range>`` or ``<int-range>``, it supports one of the following formats:
 
- * ``<value>``: A constant (int or float) value.
+ * ``<value>``: A constant (int or float) value.
 
- * ``<value>~<radius>``: A center value with a randomization radius around it. E.g. ``1.2~0.4`` will result in picking of a uniformly random value between 0.8 and 1.6 on each sample augmentation.
+ * ``<value>~<radius>``: A center value with a randomization radius around it. E.g. ``1.2~0.4`` will result in picking of a uniformly random value between 0.8 and 1.6 on each sample augmentation.
 
- * ``<start>:<end>``: The value will range from ``<start>`` at the beginning of an epoch to ``<end>`` at the end of an epoch. E.g. ``-0.2:1.2`` (float) or ``2000:4000`` (int)
+ * ``<start>:<end>``: The value will range from ``<start>`` at the beginning of an epoch to ``<end>`` at the end of an epoch. E.g. ``-0.2:1.2`` (float) or ``2000:4000`` (int)
 
- * ``<start>:<end>~<radius>``: Combination of the two previous cases with a ranging center value. E.g. ``4-6~2`` would at the beginning of an epoch pick values between 2 and 6 and at the end of an epoch between 4 and 8.
+ * ``<start>:<end>~<radius>``: Combination of the two previous cases with a ranging center value. E.g. ``4:6~2`` would at the beginning of an epoch pick values between 2 and 6 and at the end of an epoch between 4 and 8.
 
 Ranges specified with integer limits will only assume integer (rounded) values.
 
@@ -300,59 +307,59 @@ If feature caching is enabled, these augmentations will only be performed on the
 **Overlay augmentation** ``--augment overlay[p=<float>,source=<str>,snr=<float-range>,layers=<int-range>]``
- Layers another audio source (multiple times) onto augmented samples.
+ Layers another audio source (multiple times) onto augmented samples.
 
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+ * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
 
- * **source**: path to the sample collection to use for augmenting (*.sdb or *.csv file). It will be repeated if there are not enough samples left.
+ * **source**: path to the sample collection to use for augmenting (\*.sdb or \*.csv file). It will be repeated if there are not enough samples left.
 
- * **snr**: signal to noise ratio in dB - positive values for lowering volume of the overlay in relation to the sample
+ * **snr**: signal to noise ratio in dB - positive values for lowering volume of the overlay in relation to the sample
 
- * **layers**: number of layers added onto the sample (e.g. 10 layers of speech to get "cocktail-party effect"). A layer is just a sample of the same duration as the sample to augment. It gets stitched together from as many source samples as required.
+ * **layers**: number of layers added onto the sample (e.g. 10 layers of speech to get "cocktail-party effect"). A layer is just a sample of the same duration as the sample to augment. It gets stitched together from as many source samples as required.
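+
+ For instance, to overlay noise from a hypothetical ``noise.sdb`` collection onto 30% of all training samples, with one noise layer per sample and a signal-to-noise ratio whose center eases from 50 dB down to 20 dB (with a ±9 dB radius) over the course of an epoch, one could run something along these lines (``noise.sdb`` stands in for your own noise collection):
+
+ .. code-block:: bash
+
+    # noise.sdb is a hypothetical noise sample collection (.sdb or .csv)
+    python3 DeepSpeech.py --augment overlay[p=0.3,source=noise.sdb,layers=1,snr=50:20~9] ...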
 
 **Reverb augmentation** ``--augment reverb[p=<float>,delay=<float-range>,decay=<float-range>]``
- Adds simplified (no all-pass filters) `Schroeder reverberation `_ to the augmented samples.
+ Adds simplified (no all-pass filters) `Schroeder reverberation `_ to the augmented samples.
 
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+ * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
 
- * **delay**: time delay in ms for the first signal reflection - higher values are widening the perceived "room"
+ * **delay**: time delay in ms for the first signal reflection - higher values widen the perceived "room"
 
- * **decay**: sound decay in dB per reflection - higher values will result in a less reflective perceived "room"
+ * **decay**: sound decay in dB per reflection - higher values will result in a less reflective perceived "room"
 
 **Gaps augmentation** ``--augment gaps[p=<float>,n=<int-range>,size=<float-range>]``
- Sets time-intervals within the augmented samples to zero (silence) at random positions.
+ Sets time-intervals within the augmented samples to zero (silence) at random positions.
 
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+ * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
 
- * **n**: number of intervals to set to zero
+ * **n**: number of intervals to set to zero
 
- * **size**: duration of intervals in ms
+ * **size**: duration of intervals in ms
 
 **Resample augmentation** ``--augment resample[p=<float>,rate=<int-range>]``
- Resamples augmented samples to another sample rate and then resamples back to the original sample rate.
+ Resamples augmented samples to another sample rate and then resamples back to the original sample rate.
 
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+ * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
 
- * **rate**: sample-rate to re-sample to
+ * **rate**: sample-rate to re-sample to
 
 **Codec augmentation** ``--augment codec[p=<float>,bitrate=<int-range>]``
- Compresses and then decompresses augmented samples using the lossy Opus audio codec.
+ Compresses and then decompresses augmented samples using the lossy Opus audio codec.
 
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
+ * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method
 
- * **bitrate**: bitrate used during compression
+ * **bitrate**: bitrate used during compression
 
 **Volume augmentation** ``--augment volume[p=<float>,dbfs=<float-range>]``
- Measures and levels augmented samples to a target dBFS value.
+ Measures and levels augmented samples to a target dBFS value.
- * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method + * **p**: probability value between 0.0 (never) and 1.0 (always) if a given sample gets augmented by this method - * **dbfs** : target volume in dBFS (default value of 3.0103 will normalize min and max amplitudes to -1.0/1.0) + * **dbfs** : target volume in dBFS (default value of 3.0103 will normalize min and max amplitudes to -1.0/1.0) Example training with all augmentations: diff --git a/doc/conf.py b/doc/conf.py index f84fc478..27dc16e1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -126,7 +126,9 @@ html_theme = 'sphinx_rtd_theme' # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'collapse_navigation': False, +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/doc/index.rst b/doc/index.rst index fbf1a620..6eca9ca3 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -58,11 +58,17 @@ See the output of ``deepspeech -h`` for more information on the use of ``deepspe TRAINING +.. toctree:: + :maxdepth: 2 + :caption: Decoder and scorer + Decoder + Scorer + .. toctree:: :maxdepth: 2 - :caption: DeepSpeech Model + :caption: Architecture and training DeepSpeech @@ -71,17 +77,11 @@ See the output of ``deepspeech -h`` for more information on the use of ``deepspe ParallelOptimization .. toctree:: - :maxdepth: 2 - :caption: Enums and structs + :maxdepth: 3 + :caption: API Reference Error-Codes - Structs - -.. toctree:: - :maxdepth: 2 - :caption: API Reference - C-API DotNet-API @@ -106,15 +106,7 @@ See the output of ``deepspeech -h`` for more information on the use of ``deepspe Python-Examples -.. toctree:: - :maxdepth: 2 - :caption: Contributed examples - - DotNet-contrib-examples - - NodeJS-contrib-Examples - - Python-contrib-Examples + Contributed-Examples Indices and tables ================== diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index a30bd4de..08a3808b 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -9,7 +9,7 @@ using DeepSpeechClient.Models; namespace DeepSpeechClient { /// - /// Client of the Mozilla's deepspeech implementation. + /// Concrete implementation of . /// public class DeepSpeech : IDeepSpeech { diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 37d6ce59..e1ed9cad 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -5,7 +5,7 @@ using System.IO; namespace DeepSpeechClient.Interfaces { /// - /// Client interface of the Mozilla's DeepSpeech implementation. + /// Client interface of Mozilla's DeepSpeech implementation. /// public interface IDeepSpeech : IDisposable {