Update docs

This commit is contained in:
Reuben Morais 2020-02-25 15:43:36 +01:00
parent c52f3b32fa
commit e9ae38bf47
13 changed files with 314 additions and 120 deletions

View File

@ -34,6 +34,9 @@ C
.. doxygenfunction:: DS_IntermediateDecode
:project: deepspeech-c
.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
:project: deepspeech-c
.. doxygenfunction:: DS_FinishStream
:project: deepspeech-c

View File

@ -31,13 +31,20 @@ ErrorCodes
Metadata
--------
.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
:members: items, num_items, confidence
:members: Transcripts
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
:members: character, timestep, start_time
:members: Tokens, Confidence
TokenMetadata
-------------
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet
:members: Text, Timestep, StartTime

View File

@ -30,8 +30,14 @@ Metadata
.. js:autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. js:autoclass:: MetadataItem
.. js:autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. js:autoclass:: TokenMetadata
:members:

View File

@ -21,8 +21,14 @@ Metadata
.. autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. autoclass:: MetadataItem
.. autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. autoclass:: TokenMetadata
:members:

View File

@ -42,20 +42,20 @@ typedef struct CandidateTranscript {
TokenMetadata* tokens;
/** Size of the tokens array */
int num_tokens;
/** Approximated confidence value for this transcription. This is roughly the
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcription.
* contributed to the creation of this transcript.
*/
double confidence;
} CandidateTranscript;
/**
* @brief An array of CandidateTranscript objects computed by the model
* @brief An array of CandidateTranscript objects computed by the model.
*/
typedef struct Metadata {
/** Array of CandidateTranscript objects */
CandidateTranscript* transcripts;
/** Size of the transcriptions array */
/** Size of the transcripts array */
int num_transcripts;
} Metadata;
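
Taken together, TokenMetadata, CandidateTranscript, and Metadata form a simple ownership tree: a Metadata owns num_transcripts candidate transcripts, and each transcript owns num_tokens tokens. A minimal traversal sketch through the Python bindings documented later in this commit (the model path and raw-audio loading are illustrative placeholders):

import numpy as np
import deepspeech

# Minimal traversal sketch; attribute names mirror the C struct fields above
# (transcripts, tokens, confidence, text). Model path and raw-audio file are
# illustrative placeholders.
model = deepspeech.Model('deepspeech-model.pbmm')
audio = np.frombuffer(open('audio.raw', 'rb').read(), dtype=np.int16)

metadata = model.sttWithMetadata(audio, num_results=3)
for transcript in metadata.transcripts:          # CandidateTranscript objects
    text = ''.join(token.text for token in transcript.tokens)
    print('%.2f  %s' % (transcript.confidence, text))
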
@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
* including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The number of candidate transcripts to return.
* @param aNumResults The maximum number of candidate transcripts to return. The returned Metadata may contain fewer transcripts than this.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx);
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference,
* returns per-letter metadata.
* returning results including metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT
char* DS_FinishStream(StreamingState* aSctx);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx);
* @note This method will free the state pointer (@p aSctx).
*/
DEEPSPEECH_EXPORT
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults);
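
The streaming variants follow the same pattern: aNumResults is an upper bound, and the finishing call frees the stream state. A sketch of that flow via the Python bindings, assuming createStream and feedAudioContent from the rest of the API:

# Streaming sketch; `model` as constructed in the earlier sketch, and
# `audio_chunks` a placeholder iterable of np.int16 buffers at the model
# sample rate.
stream = model.createStream()
for chunk in audio_chunks:
    stream.feedAudioContent(chunk)

metadata = stream.finishStreamWithMetadata(num_results=5)
# num_results is an upper bound: fewer transcripts may come back.
print(len(metadata.transcripts))
# The underlying state is now freed; `stream` must not be used again.
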
/**

View File

@ -199,10 +199,10 @@ namespace DeepSpeechClient
}
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@ -220,10 +220,10 @@ namespace DeepSpeechClient
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference.
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The STT intermediate result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@ -273,11 +273,11 @@ namespace DeepSpeechClient
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, returning results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{

View File

@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces
uint aBufferSize);
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, returning results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string IntermediateDecode(DeepSpeechStream stream);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference.
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string FinishStream(DeepSpeechStream stream);
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
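
Since aNumResults is consistently documented as a maximum across these interfaces, callers should guard against a shorter or empty result instead of indexing blindly. A hedged Python equivalent (`model` and `audio` as in the earlier sketch):

# num_results is an upper bound everywhere in this API; guard against a
# shorter (or empty) result rather than indexing blindly.
metadata = model.sttWithMetadata(audio, num_results=5)
if not metadata.transcripts:
    raise RuntimeError('no candidate transcripts returned')
best = metadata.transcripts[0]
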

View File

@ -117,9 +117,10 @@ public class DeepSpeechModel {
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
@ -165,7 +166,7 @@ public class DeepSpeechModel {
* @brief Compute the intermediate decoding of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return The STT intermediate result.
*/
@ -174,8 +175,8 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
*
@ -188,13 +189,15 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*
* @note This method will free the state pointer (@p ctx).
*/

View File

@ -0,0 +1,96 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/
public class CandidateTranscript {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(CandidateTranscript obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* Array of TokenMetadata objects
*/
public void setTokens(TokenMetadata value) {
implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value);
}
/**
* Array of TokenMetadata objects
*/
public TokenMetadata getTokens() {
long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
}
/**
* Size of the tokens array
*/
public void setNum_tokens(int value) {
implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value);
}
/**
* Size of the tokens array
*/
public int getNum_tokens() {
return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
public void setConfidence(double value) {
implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
public double getConfidence() {
return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
}
/**
* Retrieve one TokenMetadata element
*
* @param i Array index of the TokenMetadata to get
*
* @return The TokenMetadata requested or null
*/
public TokenMetadata getToken(int i) {
return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true);
}
}
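
Because confidence is an unnormalized sum of logit values, it only supports ranking candidates within a single result; it should not be read as a probability. For example, in Python (a sketch, not part of the generated bindings):

# Confidences are unnormalized log-domain scores: compare them within one
# Metadata result, but do not interpret them as probabilities.
best = max(metadata.transcripts, key=lambda t: t.confidence)
ranked = sorted(metadata.transcripts, key=lambda t: t.confidence, reverse=True)
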

View File

@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores the entire CTC output as an array of character metadata objects
* An array of CandidateTranscript objects computed by the model.
*/
public class Metadata {
private transient long swigCPtr;
@ -40,61 +40,43 @@ public class Metadata {
}
/**
* List of items
* Array of CandidateTranscript objects
*/
public void setItems(MetadataItem value) {
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
public void setTranscripts(CandidateTranscript value) {
implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value);
}
/**
* List of items
* Array of CandidateTranscript objects
*/
public MetadataItem getItems() {
long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
public CandidateTranscript getTranscripts() {
long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
}
/**
* Size of the list of items
* Size of the transcripts array
*/
public void setNum_items(int value) {
implJNI.Metadata_num_items_set(swigCPtr, this, value);
public void setNum_transcripts(int value) {
implJNI.Metadata_num_transcripts_set(swigCPtr, this, value);
}
/**
* Size of the list of items
* Size of the transcripts array
*/
public int getNum_items() {
return implJNI.Metadata_num_items_get(swigCPtr, this);
public int getNum_transcripts() {
return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
* Retrieve one CandidateTranscript element
*
* @param i Array index of the CandidateTranscript to get
*
* @return The CandidateTranscript requested or null
*/
public void setConfidence(double value) {
implJNI.Metadata_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public double getConfidence() {
return implJNI.Metadata_confidence_get(swigCPtr, this);
}
/**
* Retrieve one MetadataItem element<br>
* <br>
* @param i Array index of the MetadataItem to get<br>
* <br>
* @return The MetadataItem requested or null
*/
public MetadataItem getItem(int i) {
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
public CandidateTranscript getTranscript(int i) {
return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true);
}
}

View File

@ -0,0 +1,79 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores text of an individual token, along with its timing information
*/
public class TokenMetadata {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(TokenMetadata obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* The text corresponding to this token
*/
public void setText(String value) {
implJNI.TokenMetadata_text_set(swigCPtr, this, value);
}
/**
* The text corresponding to this token
*/
public String getText() {
return implJNI.TokenMetadata_text_get(swigCPtr, this);
}
/**
* Position of the token in units of 20ms
*/
public void setTimestep(int value) {
implJNI.TokenMetadata_timestep_set(swigCPtr, this, value);
}
/**
* Position of the token in units of 20ms
*/
public int getTimestep() {
return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
}
/**
* Position of the token in seconds
*/
public void setStart_time(float value) {
implJNI.TokenMetadata_start_time_set(swigCPtr, this, value);
}
/**
* Position of the token in seconds
*/
public float getStart_time() {
return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
}
}
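
Note that timestep counts 20 ms frames while start_time is already in seconds, so word-level timings fall out of grouping tokens on the separator character. A Python sketch, assuming the model's alphabet uses ' ' as the word separator (true for the default English models, but model-dependent):

def word_timings(transcript):
    """Group TokenMetadata into (word, start_seconds) pairs."""
    words, current, start = [], [], 0.0
    for token in transcript.tokens:
        if token.text == ' ':
            if current:
                words.append((''.join(current), start))
            current = []
        else:
            if not current:
                start = token.start_time   # seconds; roughly timestep * 0.02
            current.append(token.text)
    if current:
        words.append((''.join(current), start))
    return words
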

View File

@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) {
}
/**
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
aNumResults = aNumResults || 1;
@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
}
/**
* Compute the intermediate decoding of an ongoing streaming inference.
* Compute the intermediate decoding of an ongoing streaming inference, returning results including metadata.
*
* @return {string} The STT intermediate result.
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
* Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
*
* @return {string} The STT result.
*
@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
* Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
@ -253,48 +257,49 @@ function Version() {
/**
* @class
*
* Stores each individual character, along with its timing information
* Stores text of an individual token, along with its timing information
*/
function TokenMetadata() {}
/**
* The character generated for transcription
* The text corresponding to this token
*
* @return {string} The character generated
* @return {string} The text generated
*/
TokenMetadata.prototype.text = function() {}
/**
* Position of the character in units of 20ms
* Position of the token in units of 20ms
*
* @return {int} The position of the character
* @return {int} The position of the token
*/
TokenMetadata.prototype.timestep = function() {};
/**
* Position of the character in seconds
* Position of the token in seconds
*
* @return {float} The position of the character
* @return {float} The position of the token
*/
TokenMetadata.prototype.start_time = function() {};
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
* A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/
function CandidateTranscript () {}
/**
* List of items
* Array of tokens
*
* @return {array} List of :js:func:`TokenMetadata`
* @return {array} Array of :js:func:`TokenMetadata`
*/
CandidateTranscript.prototype.items = function() {}
CandidateTranscript.prototype.tokens = function() {}
/**
* Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcription.
*
* @return {float} Confidence value
@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {}
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
* An array of CandidateTranscript objects computed by the model.
*/
function Metadata () {}
/**
* List of items
* Array of transcripts
*
* @return {array} List of :js:func:`CandidateTranscript` objects
* @return {array} Array of :js:func:`CandidateTranscript` objects
*/
Metadata.prototype.transcripts = function() {}
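
intermediateDecodeWithMetadata can be polled while audio is still being fed, which is what enables live partial captions. A Python sketch, where mic_chunks is a hypothetical generator of np.int16 buffers:

stream = model.createStream()
for chunk in mic_chunks():                       # hypothetical audio source
    stream.feedAudioContent(chunk)
    partial = stream.intermediateDecodeWithMetadata(num_results=1)
    if partial.transcripts:
        best = partial.transcripts[0]
        print(''.join(t.text for t in best.tokens), end='\r')
final_text = stream.finishStream()               # plain string, no metadata
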

View File

@ -123,15 +123,15 @@ class Model(object):
def sttWithMetadata(self, audio_buffer, num_results=1):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:param num_results: Number of candidate transcripts to return.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Outputs a struct of individual letters along with their timing information.
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
@ -192,10 +192,13 @@ class Stream(object):
def intermediateDecodeWithMetadata(self, num_results=1):
"""
Compute the intermediate decoding of an ongoing streaming inference.
Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
:return: The STT intermediate result.
:type: str
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
@ -205,8 +208,9 @@ class Stream(object):
def finishStream(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns the STT result over the whole audio signal.
Compute the final decoding of an ongoing streaming inference and return
the result. Signals the end of an ongoing streaming inference. The underlying
stream object must not be used after this method is called.
:return: The STT result.
:type: str
@ -221,13 +225,15 @@ class Stream(object):
def finishStreamWithMetadata(self, num_results=1):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns per-letter metadata.
Compute the final decoding of an ongoing streaming inference and return
results including metadata. Signals the end of an ongoing streaming
inference. The underlying stream object must not be used after this
method is called.
:param num_results: Number of candidate transcripts to return.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Outputs a struct of individual letters along with their timing information.
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
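
Because finishing consumes the stream, the docstrings above specify that any further call on the stream raises RuntimeError. A short Python sketch of that contract:

stream = model.createStream()
stream.feedAudioContent(audio)
metadata = stream.finishStreamWithMetadata(num_results=2)
print(metadata.transcripts[0].confidence)
try:
    stream.intermediateDecode()                  # stream already consumed
except RuntimeError:
    pass                                         # documented: invalid stream raises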