Update docs

This commit is contained in:
Reuben Morais 2020-02-25 15:43:36 +01:00
parent c52f3b32fa
commit e9ae38bf47
13 changed files with 314 additions and 120 deletions

View File

@ -34,6 +34,9 @@ C
.. doxygenfunction:: DS_IntermediateDecode
:project: deepspeech-c
.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
:project: deepspeech-c
.. doxygenfunction:: DS_FinishStream
:project: deepspeech-c

View File

@ -31,13 +31,20 @@ ErrorCodes
Metadata
--------
.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
:members: items, num_items, confidence
:members: Transcripts
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
:members: character, timestep, start_time
:members: Tokens, Confidence
TokenMetadata
-------------
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet
:members: Text, Timestep, StartTime

View File

@ -30,8 +30,14 @@ Metadata
.. js:autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. js:autoclass:: MetadataItem
.. js:autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. js:autoclass:: TokenMetadata
:members:

View File

@ -21,8 +21,14 @@ Metadata
.. autoclass:: Metadata
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. autoclass:: MetadataItem
.. autoclass:: CandidateTranscript
:members:
TokenMetadata
-------------
.. autoclass:: TokenMetadata
:members:

View File

@ -42,20 +42,20 @@ typedef struct CandidateTranscript {
TokenMetadata* tokens;
/** Size of the tokens array */
int num_tokens;
/** Approximated confidence value for this transcription. This is roughly the
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcription.
* contributed to the creation of this transcript.
*/
double confidence;
} CandidateTranscript;
/**
* @brief An array of CandidateTranscript objects computed by the model
* @brief An array of CandidateTranscript objects computed by the model.
*/
typedef struct Metadata {
/** Array of CandidateTranscript objects */
CandidateTranscript* transcripts;
/** Size of the transcriptions array */
/** Size of the transcripts array */
int num_transcripts;
} Metadata;
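
Taken together, TokenMetadata, CandidateTranscript, and Metadata form a simple ownership tree: a Metadata owns num_transcripts candidate transcripts, and each transcript owns num_tokens tokens. A minimal traversal sketch through the Python bindings documented later in this commit (the model path and raw-audio loading are illustrative placeholders):

import numpy as np
import deepspeech

# Minimal traversal sketch; attribute names mirror the C struct fields above
# (transcripts, tokens, confidence, text). Model path and raw-audio file are
# illustrative placeholders.
model = deepspeech.Model('deepspeech-model.pbmm')
audio = np.frombuffer(open('audio.raw', 'rb').read(), dtype=np.int16)

metadata = model.sttWithMetadata(audio, num_results=3)
for transcript in metadata.transcripts:          # CandidateTranscript objects
    text = ''.join(token.text for token in transcript.tokens)
    print('%.2f  %s' % (transcript.confidence, text))
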
@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
* including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The number of candidate transcripts to return.
* @param aNumResults The maximum number of candidate transcripts to return. The returned Metadata may contain fewer transcripts than this.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx);
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference,
* returns per-letter metadata.
* returning results including metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT
char* DS_FinishStream(StreamingState* aSctx);
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return.
@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx);
* @note This method will free the state pointer (@p aSctx).
*/
DEEPSPEECH_EXPORT
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults);
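
The streaming variants follow the same pattern: aNumResults is an upper bound, and the finishing call frees the stream state. A sketch of that flow via the Python bindings, assuming createStream and feedAudioContent from the rest of the API:

# Streaming sketch; `model` as constructed in the earlier sketch, and
# `audio_chunks` a placeholder iterable of np.int16 buffers at the model
# sample rate.
stream = model.createStream()
for chunk in audio_chunks:
    stream.feedAudioContent(chunk)

metadata = stream.finishStreamWithMetadata(num_results=5)
# num_results is an upper bound: fewer transcripts may come back.
print(len(metadata.transcripts))
# The underlying state is now freed; `stream` must not be used again.
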
/**

View File

@ -199,10 +199,10 @@ namespace DeepSpeechClient
}
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@ -220,10 +220,10 @@ namespace DeepSpeechClient
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference.
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The STT intermediate result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
@ -273,11 +273,11 @@ namespace DeepSpeechClient
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, returning results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{

View File

@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces
uint aBufferSize);
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// Use the DeepSpeech model to perform Speech-To-Text, returning results including metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string IntermediateDecode(DeepSpeechStream stream);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference.
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces
unsafe string FinishStream(DeepSpeechStream stream);
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
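
Since aNumResults is consistently documented as a maximum across these interfaces, callers should guard against a shorter or empty result instead of indexing blindly. A hedged Python equivalent (`model` and `audio` as in the earlier sketch):

# num_results is an upper bound everywhere in this API; guard against a
# shorter (or empty) result rather than indexing blindly.
metadata = model.sttWithMetadata(audio, num_results=5)
if not metadata.transcripts:
    raise RuntimeError('no candidate transcripts returned')
best = metadata.transcripts[0]
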

View File

@ -117,9 +117,10 @@ public class DeepSpeechModel {
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
@ -165,7 +166,7 @@ public class DeepSpeechModel {
* @brief Compute the intermediate decoding of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return The STT intermediate result.
*/
@ -174,8 +175,8 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the result. Signals the end of an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
*
@ -188,13 +189,15 @@ public class DeepSpeechModel {
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
* @brief Compute the final decoding of an ongoing streaming inference and return
* the results including metadata. Signals the end of an ongoing streaming
* inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param num_results Number of candidate transcripts to return.
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information.
*
* @note This method will free the state pointer (@p ctx).
*/

View File

@ -0,0 +1,96 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/
public class CandidateTranscript {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(CandidateTranscript obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* Array of TokenMetadata objects
*/
public void setTokens(TokenMetadata value) {
implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value);
}
/**
* Array of TokenMetadata objects
*/
public TokenMetadata getTokens() {
long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
}
/**
* Size of the tokens array
*/
public void setNum_tokens(int value) {
implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value);
}
/**
* Size of the tokens array
*/
public int getNum_tokens() {
return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
public void setConfidence(double value) {
implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
public double getConfidence() {
return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
}
/**
* Retrieve one TokenMetadata element
*
* @param i Array index of the TokenMetadata to get
*
* @return The TokenMetadata requested or null
*/
public TokenMetadata getToken(int i) {
return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true);
}
}
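
Because confidence is an unnormalized sum of logit values, it only supports ranking candidates within a single result; it should not be read as a probability. For example, in Python (a sketch, not part of the generated bindings):

# Confidences are unnormalized log-domain scores: compare them within one
# Metadata result, but do not interpret them as probabilities.
best = max(metadata.transcripts, key=lambda t: t.confidence)
ranked = sorted(metadata.transcripts, key=lambda t: t.confidence, reverse=True)
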

View File

@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores the entire CTC output as an array of character metadata objects
* An array of CandidateTranscript objects computed by the model.
*/
public class Metadata {
private transient long swigCPtr;
@ -40,61 +40,43 @@ public class Metadata {
}
/**
* List of items
* Array of CandidateTranscript objects
*/
public void setItems(MetadataItem value) {
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
public void setTranscripts(CandidateTranscript value) {
implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value);
}
/**
* List of items
* Array of CandidateTranscript objects
*/
public MetadataItem getItems() {
long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
public CandidateTranscript getTranscripts() {
long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
}
/**
* Size of the list of items
* Size of the transcripts array
*/
public void setNum_items(int value) {
implJNI.Metadata_num_items_set(swigCPtr, this, value);
public void setNum_transcripts(int value) {
implJNI.Metadata_num_transcripts_set(swigCPtr, this, value);
}
/**
* Size of the list of items
* Size of the transcripts array
*/
public int getNum_items() {
return implJNI.Metadata_num_items_get(swigCPtr, this);
public int getNum_transcripts() {
return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
* Retrieve one CandidateTranscript element
*
* @param i Array index of the CandidateTranscript to get
*
* @return The CandidateTranscript requested or null
*/
public void setConfidence(double value) {
implJNI.Metadata_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public double getConfidence() {
return implJNI.Metadata_confidence_get(swigCPtr, this);
}
/**
* Retrieve one MetadataItem element<br>
* <br>
* @param i Array index of the MetadataItem to get<br>
* <br>
* @return The MetadataItem requested or null
*/
public MetadataItem getItem(int i) {
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
public CandidateTranscript getTranscript(int i) {
return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true);
}
}

View File

@ -0,0 +1,79 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores text of an individual token, along with its timing information
*/
public class TokenMetadata {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(TokenMetadata obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* The text corresponding to this token
*/
public void setText(String value) {
implJNI.TokenMetadata_text_set(swigCPtr, this, value);
}
/**
* The text corresponding to this token
*/
public String getText() {
return implJNI.TokenMetadata_text_get(swigCPtr, this);
}
/**
* Position of the token in units of 20ms
*/
public void setTimestep(int value) {
implJNI.TokenMetadata_timestep_set(swigCPtr, this, value);
}
/**
* Position of the token in units of 20ms
*/
public int getTimestep() {
return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
}
/**
* Position of the token in seconds
*/
public void setStart_time(float value) {
implJNI.TokenMetadata_start_time_set(swigCPtr, this, value);
}
/**
* Position of the token in seconds
*/
public float getStart_time() {
return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
}
}
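
Note that timestep counts 20 ms frames while start_time is already in seconds, so word-level timings fall out of grouping tokens on the separator character. A Python sketch, assuming the model's alphabet uses ' ' as the word separator (true for the default English models, but model-dependent):

def word_timings(transcript):
    """Group TokenMetadata into (word, start_seconds) pairs."""
    words, current, start = [], [], 0.0
    for token in transcript.tokens:
        if token.text == ' ':
            if current:
                words.append((''.join(current), start))
            current = []
        else:
            if not current:
                start = token.start_time   # seconds; roughly timestep * 0.02
            current.append(token.text)
    if current:
        words.append((''.join(current), start))
    return words
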

View File

@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) {
}
/**
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
* Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
aNumResults = aNumResults || 1;
@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
}
/**
* Compute the intermediate decoding of an ongoing streaming inference.
* Compute the intermediate decoding of an ongoing streaming inference, returning results including metadata.
*
* @return {string} The STT intermediate result.
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
* Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
*
* @return {string} The STT result.
*
@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
* Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
@ -253,48 +257,49 @@ function Version() {
/**
* @class
*
* Stores each individual character, along with its timing information
* Stores text of an individual token, along with its timing information
*/
function TokenMetadata() {}
/**
* The character generated for transcription
* The text corresponding to this token
*
* @return {string} The character generated
* @return {string} The text generated
*/
TokenMetadata.prototype.text = function() {}
/**
* Position of the character in units of 20ms
* Position of the token in units of 20ms
*
* @return {int} The position of the character
* @return {int} The position of the token
*/
TokenMetadata.prototype.timestep = function() {};
/**
* Position of the character in seconds
* Position of the token in seconds
*
* @return {float} The position of the character
* @return {float} The position of the token
*/
TokenMetadata.prototype.start_time = function() {};
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
* A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/
function CandidateTranscript () {}
/**
* List of items
* Array of tokens
*
* @return {array} List of :js:func:`TokenMetadata`
* @return {array} Array of :js:func:`TokenMetadata`
*/
CandidateTranscript.prototype.items = function() {}
CandidateTranscript.prototype.tokens = function() {}
/**
* Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcription.
*
* @return {float} Confidence value
@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {}
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
* An array of CandidateTranscript objects computed by the model.
*/
function Metadata () {}
/**
* List of items
* Array of transcripts
*
* @return {array} List of :js:func:`CandidateTranscript` objects
* @return {array} Array of :js:func:`CandidateTranscript` objects
*/
Metadata.prototype.transcripts = function() {}
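
intermediateDecodeWithMetadata can be polled while audio is still being fed, which is what enables live partial captions. A Python sketch, where mic_chunks is a hypothetical generator of np.int16 buffers:

stream = model.createStream()
for chunk in mic_chunks():                       # hypothetical audio source
    stream.feedAudioContent(chunk)
    partial = stream.intermediateDecodeWithMetadata(num_results=1)
    if partial.transcripts:
        best = partial.transcripts[0]
        print(''.join(t.text for t in best.tokens), end='\r')
final_text = stream.finishStream()               # plain string, no metadata
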

View File

@ -123,15 +123,15 @@ class Model(object):
def sttWithMetadata(self, audio_buffer, num_results=1):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:param num_results: Number of candidate transcripts to return.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Outputs a struct of individual letters along with their timing information.
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
@ -192,10 +192,13 @@ class Stream(object):
def intermediateDecodeWithMetadata(self, num_results=1):
"""
Compute the intermediate decoding of an ongoing streaming inference.
Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
:return: The STT intermediate result.
:type: str
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
@ -205,8 +208,9 @@ class Stream(object):
def finishStream(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns the STT result over the whole audio signal.
Compute the final decoding of an ongoing streaming inference and return
the result. Signals the end of an ongoing streaming inference. The underlying
stream object must not be used after this method is called.
:return: The STT result.
:type: str
@ -221,13 +225,15 @@ class Stream(object):
def finishStreamWithMetadata(self, num_results=1):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns per-letter metadata.
Compute the final decoding of an ongoing streaming inference and return
results including metadata. Signals the end of an ongoing streaming
inference. The underlying stream object must not be used after this
method is called.
:param num_results: Number of candidate transcripts to return.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Outputs a struct of individual letters along with their timing information.
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
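
Because finishing consumes the stream, the docstrings above specify that any further call on the stream raises RuntimeError. A short Python sketch of that contract:

stream = model.createStream()
stream.feedAudioContent(audio)
metadata = stream.finishStreamWithMetadata(num_results=2)
print(metadata.transcripts[0].confidence)
try:
    stream.intermediateDecode()                  # stream already consumed
except RuntimeError:
    pass                                         # documented: invalid stream raises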