Merge pull request #2004 from coqui-ai/flashlight-docs

Improve decoder docs and include in RTD
2021-10-31 21:40:04 +01:00 · 2021-10-31 21:40:04 +01:00 · 2b955fc70f
commit 2b955fc70f
parent 3d1e3ed3ba 90feb63894
8 changed files with 247 additions and 97 deletions
--- a/doc/DECODER.rst
+++ b/doc/DECODER.rst
@ -1,14 +1,14 @@
 .. _decoder-docs:

-CTC beam search decoder
-=======================
+Beam search decoder
+===================

 Introduction
 ------------

 🐸STT uses the `Connectionist Temporal Classification <http://www.cs.toronto.edu/~graves/icml_2006.pdf>`_ loss function. For an excellent explanation of CTC and its usage, see this Distill article: `Sequence Modeling with CTC <https://distill.pub/2017/ctc/>`_. This document assumes the reader is familiar with the concepts described in that article, and describes 🐸STT specific behaviors that developers building systems with 🐸STT should know to avoid problems.

-Note: Documentation for the tooling for creating custom scorer packages is available in :ref:`language-model`.
+Note: Documentation for the tooling for creating custom scorer packages is available in :ref:`language-model`. Documentation for the coqui_stt_ctcdecoder Python package used by the training code for decoding is available in :ref:`decoder-api`.

 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED",  "MAY", and "OPTIONAL" in this document are to be interpreted as described in `BCP 14 <https://tools.ietf.org/html/bcp14>`_ when, and only when, they appear in all capitals, as shown here.

--- a/doc/Decoder-API.rst
+++ b/doc/Decoder-API.rst
@ -0,0 +1,7 @@
+.. _decoder-api:
+
+Decoder API reference
+=====================
+
+.. automodule:: native_client.ctcdecode
+   :members:
--- a/doc/conf.py
+++ b/doc/conf.py
@ -24,7 +24,8 @@ import sys

 sys.path.insert(0, os.path.abspath("../"))

-autodoc_mock_imports = ["stt"]
+autodoc_mock_imports = ["stt", "native_client.ctcdecode.swigwrapper"]
+autodoc_member_order = "bysource"

 # This is in fact only relevant on ReadTheDocs, but we want to run the same way
 # on our CI as in RTD to avoid regressions on RTD that we would not catch on CI
@ -128,7 +129,6 @@ todo_include_todos = False

 add_module_names = False

-
 # -- Options for HTML output ----------------------------------------------

 # The theme to use for HTML and HTML Help pages.  See the documentation for
--- a/doc/index.rst
+++ b/doc/index.rst
@ -89,6 +89,17 @@ The fastest way to use a pre-trained 🐸STT model is with the 🐸STT model man

   playbook/README

+.. toctree::
+   :maxdepth: 1
+   :caption: Advanced topics
+
+   DECODER
+
+   Decoder-API
+
+   PARALLEL_OPTIMIZATION
+
+
 Indices and tables
 ==================

--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@ -4,6 +4,7 @@ sphinx==3.5.2
 sphinx-js==3.1
 furo==2021.2.28b28
 pygments==2.7.4
+docutils>=0.12,<=0.17.1
 #FIXME: switch to stable after C# changes have been merged: https://github.com/djungelorm/sphinx-csharp/pull/8
 git+https://github.com/reuben/sphinx-csharp.git@9dc6202f558e3d3fa14ec7f5f1e36a8e66e6d622
 recommonmark==0.7.1
--- a/native_client/ctcdecode/init.py
+++ b/native_client/ctcdecode/init.py
@ -1,4 +1,4 @@
-from __future__ import absolute_import, division, print_function
+import enum

 from . import swigwrapper  # pylint: disable=import-self

@ -13,41 +13,12 @@ for symbol in dir(swigwrapper):
        globals()[symbol] = getattr(swigwrapper, symbol)


-class FlashlightDecoderState(swigwrapper.FlashlightDecoderState):
-    pass
-
-
-class Scorer(swigwrapper.Scorer):
-    """Wrapper for Scorer.
-
-    :param alpha: Language model weight.
-    :type alpha: float
-    :param beta: Word insertion bonus.
-    :type beta: float
-    :scorer_path: Path to load scorer from.
-    :alphabet: Alphabet
-    :type scorer_path: basestring
-    """
-
-    def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
-        super(Scorer, self).__init__()
-        # Allow bare initialization
-        if alphabet:
-            assert alpha is not None, "alpha parameter is required"
-            assert beta is not None, "beta parameter is required"
-            assert scorer_path, "scorer_path parameter is required"
-
-            err = self.init(scorer_path.encode("utf-8"), alphabet)
-            if err != 0:
-                raise ValueError(
-                    "Scorer initialization failed with error code 0x{:X}".format(err)
-                )
-
-            self.reset_params(alpha, beta)
-
-
 class Alphabet(swigwrapper.Alphabet):
-    """Convenience wrapper for Alphabet which calls init in the constructor"""
+    """An Alphabet is a bidirectional map from tokens (eg. characters) to
+    internal integer representations used by the underlying acoustic models
+    and external scorers. It can be created from alphabet configuration file
+    via the constructor, or from a list of tokens via :py:meth:`Alphabet.InitFromLabels`.
+    """

    def __init__(self, config_path=None):
        super(Alphabet, self).__init__()
@ -59,6 +30,10 @@ class Alphabet(swigwrapper.Alphabet):
                )

    def InitFromLabels(self, data):
+        """
+        Initialize Alphabet from a list of labels ``data``. Each label gets
+        associated with an integer value corresponding to its position in the list.
+        """
        return super(Alphabet, self).InitFromLabels([c.encode("utf-8") for c in data])

    def CanEncodeSingle(self, input):
@ -87,7 +62,7 @@ class Alphabet(swigwrapper.Alphabet):
        Encode a sequence of character/output classes into a sequence of labels.
        Characters are assumed to always take a single Unicode codepoint.
        Characters must be in the alphabet, this method will assert that. Use
-        `CanEncode` and `CanEncodeSingle` to test.
+        ``CanEncode`` and ``CanEncodeSingle`` to test.
        """
        # Convert SWIG's UnsignedIntVec to a Python list
        res = super(Alphabet, self).Encode(input.encode("utf-8"))
@ -103,57 +78,39 @@ class Alphabet(swigwrapper.Alphabet):
        return res.decode("utf-8")


-class UTF8Alphabet(swigwrapper.UTF8Alphabet):
-    """Convenience wrapper for Alphabet which calls init in the constructor"""
+class Scorer(swigwrapper.Scorer):
+    """An external scorer is a data structure composed of a language model built
+        from text data, as well as the vocabulary used in the construction of this
+        language model and additional parameters related to how the decoding
+        process uses the external scorer, such as the language model weight
+        ``alpha`` and the word insertion score ``beta``.

-    def __init__(self):
-        super(UTF8Alphabet, self).__init__()
-        err = self.init(b"")
-        if err != 0:
-            raise ValueError(
-                "UTF8Alphabet initialization failed with error code 0x{:X}".format(err)
-            )
+    :param alpha: Language model weight.
+    :type alpha: float
+    :param beta: Word insertion score.
+    :type beta: float
+    :param scorer_path: Path to load scorer from.
+    :type scorer_path: str
+    :param alphabet: Alphabet object matching the tokens used when creating the
+                     external scorer.
+    :type alphabet: Alphabet
+    """

-    def CanEncodeSingle(self, input):
-        """
-        Returns true if the single character/output class has a corresponding label
-        in the alphabet.
-        """
-        return super(UTF8Alphabet, self).CanEncodeSingle(input.encode("utf-8"))
+    def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
+        super(Scorer, self).__init__()
+        # Allow bare initialization
+        if alphabet:
+            assert alpha is not None, "alpha parameter is required"
+            assert beta is not None, "beta parameter is required"
+            assert scorer_path, "scorer_path parameter is required"

-    def CanEncode(self, input):
-        """
-        Returns true if the entire string can be encoded into labels in this
-        alphabet.
-        """
-        return super(UTF8Alphabet, self).CanEncode(input.encode("utf-8"))
+            err = self.init(scorer_path.encode("utf-8"), alphabet)
+            if err != 0:
+                raise ValueError(
+                    "Scorer initialization failed with error code 0x{:X}".format(err)
+                )

-    def EncodeSingle(self, input):
-        """
-        Encode a single character/output class into a label. Character must be in
-        the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
-        """
-        return super(UTF8Alphabet, self).EncodeSingle(input.encode("utf-8"))
-
-    def Encode(self, input):
-        """
-        Encode a sequence of character/output classes into a sequence of labels.
-        Characters are assumed to always take a single Unicode codepoint.
-        Characters must be in the alphabet, this method will assert that. Use
-        `CanEncode` and `CanEncodeSingle` to test.
-        """
-        # Convert SWIG's UnsignedIntVec to a Python list
-        res = super(UTF8Alphabet, self).Encode(input.encode("utf-8"))
-        return [el for el in res]
-
-    def DecodeSingle(self, input):
-        res = super(UTF8Alphabet, self).DecodeSingle(input)
-        return res.decode("utf-8")
-
-    def Decode(self, input):
-        """Decode a sequence of labels into a string."""
-        res = super(UTF8Alphabet, self).Decode(input)
-        return res.decode("utf-8")
+            self.reset_params(alpha, beta)


 def ctc_beam_search_decoder(
@ -186,7 +143,7 @@ def ctc_beam_search_decoder(
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
-    :type hot_words: map{string:float}
+    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List of tuples of confidence and sentence as decoding
@ -245,7 +202,7 @@ def ctc_beam_search_decoder_batch(
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
-    :type hot_words: map{string:float}
+    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List of tuples of confidence and sentence as decoding
@ -271,8 +228,63 @@ def ctc_beam_search_decoder_batch(
    return batch_beam_results


+class FlashlightDecoderState(swigwrapper.FlashlightDecoderState):
+    """
+    This class contains constants used to specify the desired behavior for the
+    :py:func:`flashlight_beam_search_decoder` and :py:func:`flashlight_beam_search_decoder_batch`
+    functions.
+    """
+
+    class CriterionType(enum.IntEnum):
+        """Constants used to specify which loss criterion was used by the
+        acoustic model. This class is a Python :py:class:`enum.IntEnum`.
+        """
+
+        #: Decoder mode for handling acoustic models trained with CTC loss
+        CTC = swigwrapper.FlashlightDecoderState.CTC
+
+        #: Decoder mode for handling acoustic models trained with ASG loss
+        ASG = swigwrapper.FlashlightDecoderState.ASG
+
+        #: Decoder mode for handling acoustic models trained with Seq2seq loss
+        #: Note: this criterion type is currently not supported.
+        S2S = swigwrapper.FlashlightDecoderState.S2S
+
+    class DecoderType(enum.IntEnum):
+        """Constants used to specify if decoder should operate in lexicon mode,
+        only predicting words present in a fixed vocabulary, or in lexicon-free
+        mode, without such restriction. This class is a Python :py:class:`enum.IntEnum`.
+        """
+
+        #: Lexicon mode, only predict words in specified vocabulary.
+        LexiconBased = swigwrapper.FlashlightDecoderState.LexiconBased
+
+        #: Lexicon-free mode, allow prediction of any word.
+        LexiconFree = swigwrapper.FlashlightDecoderState.LexiconFree
+
+    class TokenType(enum.IntEnum):
+        """Constants used to specify the granularity of text units used when training
+        the external scorer in relation to the text units used when training the
+        acoustic model. For example, you can have an acoustic model predicting
+        characters and an external scorer trained on words, or an acoustic model
+        and an external scorer both trained with sub-word units. If the acoustic
+        model and the scorer were both trained on the same text unit granularity,
+        use ``TokenType.Single``. Otherwise, if the external scorer was trained
+        on a sequence of acoustic model text units, use ``TokenType.Aggregate``.
+        This class is a Python :py:class:`enum.IntEnum`.
+        """
+
+        #: Token type for external scorers trained on the same textual units as
+        #: the acoustic model.
+        Single = swigwrapper.FlashlightDecoderState.Single
+
+        #: Token type for external scorers trained on a sequence of acoustic model
+        #: textual units.
+        Aggregate = swigwrapper.FlashlightDecoderState.Aggregate
+
+
 def flashlight_beam_search_decoder(
-    probs_seq,
+    logits_seq,
    alphabet,
    beam_size,
    decoder_type,
@ -283,12 +295,67 @@ def flashlight_beam_search_decoder(
    cutoff_top_n=40,
    silence_score=0.0,
    merge_with_log_add=False,
-    criterion_type=swigwrapper.FlashlightDecoderState.CTC,
+    criterion_type=FlashlightDecoderState.CriterionType.CTC,
    transitions=[],
    num_results=1,
 ):
+    """Decode acoustic model emissions for a single sample. Note that unlike
+        :py:func:`ctc_beam_search_decoder`, this function expects raw outputs
+        from CTC and ASG acoustic models, without softmaxing them over
+        timesteps.
+
+    :param logits_seq: 2-D list of acoustic model emissions, dimensions are
+                       time steps x number of output units.
+    :type logits_seq: 2-D list of floats or numpy array
+    :param alphabet: Alphabet object matching the tokens used when creating the
+                     acoustic model and external scorer if specified.
+    :type alphabet: Alphabet
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param decoder_type: Decoding mode, lexicon-constrained or lexicon-free.
+    :type decoder_type: FlashlightDecoderState.DecoderType
+    :param token_type: Type of token in the external scorer.
+    :type token_type: FlashlightDecoderState.TokenType
+    :param lm_tokens: List of tokens to constrain decoding to when in lexicon-constrained
+                      mode. Must match the token type used in the scorer, ie.
+                      must be a list of characters if scorer is character-based,
+                      or a list of words if scorer is word-based.
+    :param lm_tokens: list[str]
+    :param scorer: External scorer.
+    :type scorer: Scorer
+    :param beam_threshold: Maximum threshold in beam score from leading beam. Any
+                           newly created candidate beams which lag behind the best
+                           beam so far by more than this value will get pruned.
+                           This is a performance optimization parameter and an
+                           appropriate value should be found empirically using a
+                           validation set.
+    :type beam_threshold: float
+    :param cutoff_top_n: Maximum number of tokens to expand per time step during
+                         decoding. Only the highest probability cutoff_top_n
+                         candidates (characters, sub-word units, words) in a given
+                         timestep will be expanded. This is a performance
+                         optimization parameter and an appropriate value should
+                         be found empirically using a validation set.
+    :type cutoff_top_n: int
+    :param silence_score: Score to add to beam when encountering a predicted
+                          silence token (eg. the space symbol).
+    :type silence_score: float
+    :param merge_with_log_add: Whether to use log-add when merging scores of
+                               new candidate beams equivalent to existing ones
+                               (leading to the same transcription). When disabled,
+                               the maximum score is used.
+    :type merge_with_log_add: bool
+    :param criterion_type: Criterion used for training the acoustic model.
+    :type criterion_type: FlashlightDecoderState.CriterionType
+    :param transitions: Transition score matrix for ASG acoustic models.
+    :type transitions: list[float]
+    :param num_results: Number of beams to return.
+    :type num_results: int
+    :return: List of FlashlightOutput structures.
+    :rtype: list[FlashlightOutput]
+    """
    return swigwrapper.flashlight_beam_search_decoder(
-        probs_seq,
+        logits_seq,
        alphabet,
        beam_size,
        beam_threshold,
@ -319,10 +386,17 @@ def flashlight_beam_search_decoder_batch(
    cutoff_top_n=40,
    silence_score=0.0,
    merge_with_log_add=False,
-    criterion_type=swigwrapper.FlashlightDecoderState.CTC,
+    criterion_type=FlashlightDecoderState.CriterionType.CTC,
    transitions=[],
    num_results=1,
 ):
+    """Decode batch acoustic model emissions in parallel. ``num_processes``
+    controls how many samples from the batch will be decoded simultaneously.
+    All the other parameters are forwarded to :py:func:`flashlight_beam_search_decoder`.
+
+    Returns a list of lists of FlashlightOutput structures.
+    """
+
    return swigwrapper.flashlight_beam_search_decoder_batch(
        probs_seq,
        seq_lengths,
@ -341,3 +415,58 @@ def flashlight_beam_search_decoder_batch(
        num_results,
        num_processes,
    )
+
+
+class UTF8Alphabet(swigwrapper.UTF8Alphabet):
+    """Alphabet class representing 255 possible byte values for Bytes Output Mode.
+    For internal use only.
+    """
+
+    def __init__(self):
+        super(UTF8Alphabet, self).__init__()
+        err = self.init(b"")
+        if err != 0:
+            raise ValueError(
+                "UTF8Alphabet initialization failed with error code 0x{:X}".format(err)
+            )
+
+    def CanEncodeSingle(self, input):
+        """
+        Returns true if the single character/output class has a corresponding label
+        in the alphabet.
+        """
+        return super(UTF8Alphabet, self).CanEncodeSingle(input.encode("utf-8"))
+
+    def CanEncode(self, input):
+        """
+        Returns true if the entire string can be encoded into labels in this
+        alphabet.
+        """
+        return super(UTF8Alphabet, self).CanEncode(input.encode("utf-8"))
+
+    def EncodeSingle(self, input):
+        """
+        Encode a single character/output class into a label. Character must be in
+        the alphabet, this method will assert that. Use ``CanEncodeSingle`` to test.
+        """
+        return super(UTF8Alphabet, self).EncodeSingle(input.encode("utf-8"))
+
+    def Encode(self, input):
+        """
+        Encode a sequence of character/output classes into a sequence of labels.
+        Characters are assumed to always take a single Unicode codepoint.
+        Characters must be in the alphabet, this method will assert that. Use
+        ``CanEncode`` and ``CanEncodeSingle`` to test.
+        """
+        # Convert SWIG's UnsignedIntVec to a Python list
+        res = super(UTF8Alphabet, self).Encode(input.encode("utf-8"))
+        return [el for el in res]
+
+    def DecodeSingle(self, input):
+        res = super(UTF8Alphabet, self).DecodeSingle(input)
+        return res.decode("utf-8")
+
+    def Decode(self, input):
+        """Decode a sequence of labels into a string."""
+        res = super(UTF8Alphabet, self).Decode(input)
+        return res.decode("utf-8")
--- a/native_client/ctcdecode/setup.py
+++ b/native_client/ctcdecode/setup.py
@ -96,7 +96,9 @@ class BuildExtFirst(build):
 setup(
    name="coqui_stt_ctcdecoder",
    version=project_version,
-    description="""DS CTC decoder""",
+    description="Coqui STT Python decoder package.",
+    long_description="Documentation available at `stt.readthedocs.io <https://stt.readthedocs.io/en/latest/Decoder-API.html>`_",
+    long_description_content_type="text/x-rst; charset=UTF-8",
    cmdclass={"build": BuildExtFirst},
    ext_modules=[decoder_module],
    package_dir={"coqui_stt_ctcdecoder": "."},
--- a/training/coqui_stt_training/evaluate_flashlight.py
+++ b/training/coqui_stt_training/evaluate_flashlight.py
@ -142,8 +142,8 @@ def evaluate(test_csvs, create_model):
                    batch_lengths,
                    Config.alphabet,
                    beam_size=Config.export_beam_width,
-                    decoder_type=FlashlightDecoderState.LexiconBased,
-                    token_type=FlashlightDecoderState.Aggregate,
+                    decoder_type=FlashlightDecoderState.DecoderType.LexiconBased,
+                    token_type=FlashlightDecoderState.TokenType.Aggregate,
                    lm_tokens=vocab,
                    num_processes=num_processes,
                    scorer=scorer,