Update docs
This commit is contained in:
parent
c52f3b32fa
commit
e9ae38bf47
|
@ -34,6 +34,9 @@ C
|
|||
.. doxygenfunction:: DS_IntermediateDecode
|
||||
:project: deepspeech-c
|
||||
|
||||
.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
|
||||
:project: deepspeech-c
|
||||
|
||||
.. doxygenfunction:: DS_FinishStream
|
||||
:project: deepspeech-c
|
||||
|
||||
|
|
|
@ -31,13 +31,20 @@ ErrorCodes
|
|||
Metadata
|
||||
--------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
|
||||
:project: deepspeech-dotnet
|
||||
:members: items, num_items, confidence
|
||||
:members: Transcripts
|
||||
|
||||
MetadataItem
|
||||
------------
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
|
||||
:project: deepspeech-dotnet
|
||||
:members: character, timestep, start_time
|
||||
:members: Tokens, Confidence
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
|
||||
:project: deepspeech-dotnet
|
||||
:members: Text, Timestep, StartTime
|
||||
|
|
|
@ -30,8 +30,14 @@ Metadata
|
|||
.. js:autoclass:: Metadata
|
||||
:members:
|
||||
|
||||
MetadataItem
|
||||
------------
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. js:autoclass:: MetadataItem
|
||||
.. js:autoclass:: CandidateTranscript
|
||||
:members:
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
|
||||
.. js:autoclass:: TokenMetadata
|
||||
:members:
|
||||
|
|
|
@ -21,8 +21,14 @@ Metadata
|
|||
.. autoclass:: Metadata
|
||||
:members:
|
||||
|
||||
MetadataItem
|
||||
------------
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. autoclass:: MetadataItem
|
||||
.. autoclass:: CandidateTranscript
|
||||
:members:
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
|
||||
.. autoclass:: TokenMetadata
|
||||
:members:
|
||||
|
|
|
@ -42,20 +42,20 @@ typedef struct CandidateTranscript {
|
|||
TokenMetadata* tokens;
|
||||
/** Size of the tokens array */
|
||||
int num_tokens;
|
||||
/** Approximated confidence value for this transcription. This is roughly the
|
||||
/** Approximated confidence value for this transcript. This is roughly the
|
||||
* sum of the acoustic model logit values for each timestep/character that
|
||||
* contributed to the creation of this transcription.
|
||||
* contributed to the creation of this transcript.
|
||||
*/
|
||||
double confidence;
|
||||
} CandidateTranscript;
|
||||
|
||||
/**
|
||||
* @brief An array of CandidateTranscript objects computed by the model
|
||||
* @brief An array of CandidateTranscript objects computed by the model.
|
||||
*/
|
||||
typedef struct Metadata {
|
||||
/** Array of CandidateTranscript objects */
|
||||
CandidateTranscript* transcripts;
|
||||
/** Size of the transcriptions array */
|
||||
/** Size of the transcripts array */
|
||||
int num_transcripts;
|
||||
} Metadata;
|
||||
|
||||
|
@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx,
|
|||
unsigned int aBufferSize);
|
||||
|
||||
/**
|
||||
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
|
||||
* about the results.
|
||||
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
|
||||
* including metadata.
|
||||
*
|
||||
* @param aCtx The ModelState pointer for the model to use.
|
||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
||||
* sample rate (matching what the model was trained on).
|
||||
* @param aBufferSize The number of samples in the audio signal.
|
||||
* @param aNumResults The number of candidate transcripts to return.
|
||||
* @param aNumResults The maximum number of candidate transcripts to return. The number of transcripts actually returned might be smaller than this.
|
||||
*
|
||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript
|
||||
* has per-token metadata including timing information. The user is
|
||||
|
@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx);
|
|||
|
||||
/**
|
||||
* @brief Compute the intermediate decoding of an ongoing streaming inference,
|
||||
* returns per-letter metadata.
|
||||
* return results including metadata.
|
||||
*
|
||||
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
|
||||
* @param aNumResults The maximum number of candidate transcripts to return. The number of transcripts actually returned might be smaller than this.
|
||||
|
@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
|
|||
unsigned int aNumResults);
|
||||
|
||||
/**
|
||||
* @brief Signal the end of an audio signal to an ongoing streaming
|
||||
* inference, returns the STT result over the whole audio signal.
|
||||
* @brief Compute the final decoding of an ongoing streaming inference and return
|
||||
* the result. Signals the end of an ongoing streaming inference.
|
||||
*
|
||||
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
|
||||
*
|
||||
|
@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT
|
|||
char* DS_FinishStream(StreamingState* aSctx);
|
||||
|
||||
/**
|
||||
* @brief Signal the end of an audio signal to an ongoing streaming
|
||||
* inference, returns per-letter metadata.
|
||||
* @brief Compute the final decoding of an ongoing streaming inference and return
|
||||
* results including metadata. Signals the end of an ongoing streaming
|
||||
* inference.
|
||||
*
|
||||
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
|
||||
* @param aNumResults The maximum number of candidate transcripts to return. The number of transcripts actually returned might be smaller than this.
|
||||
|
@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx);
|
|||
* @note This method will free the state pointer (@p aSctx).
|
||||
*/
|
||||
DEEPSPEECH_EXPORT
|
||||
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
|
||||
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
|
||||
unsigned int aNumResults);
|
||||
|
||||
/**
|
||||
|
|
|
@ -199,10 +199,10 @@ namespace DeepSpeechClient
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to finish.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
|
||||
{
|
||||
|
@ -220,10 +220,10 @@ namespace DeepSpeechClient
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the intermediate decoding of an ongoing streaming inference.
|
||||
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to decode.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
|
||||
{
|
||||
|
@ -273,11 +273,11 @@ namespace DeepSpeechClient
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Use the DeepSpeech model to perform Speech-To-Text.
|
||||
/// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
|
||||
/// </summary>
|
||||
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
|
||||
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata. Returns NULL on error.</returns>
|
||||
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
|
||||
{
|
||||
|
|
|
@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces
|
|||
uint aBufferSize);
|
||||
|
||||
/// <summary>
|
||||
/// Use the DeepSpeech model to perform Speech-To-Text.
|
||||
/// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
|
||||
/// </summary>
|
||||
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
|
||||
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata. Returns NULL on error.</returns>
|
||||
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
|
||||
uint aBufferSize,
|
||||
|
@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces
|
|||
unsafe string IntermediateDecode(DeepSpeechStream stream);
|
||||
|
||||
/// <summary>
|
||||
/// Computes the intermediate decoding of an ongoing streaming inference.
|
||||
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to decode.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
|
||||
|
||||
|
@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces
|
|||
unsafe string FinishStream(DeepSpeechStream stream);
|
||||
|
||||
/// <summary>
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to finish.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
|
||||
}
|
||||
|
|
|
@ -117,9 +117,10 @@ public class DeepSpeechModel {
|
|||
* @param buffer A 16-bit, mono raw audio signal at the appropriate
|
||||
* sample rate (matching what the model was trained on).
|
||||
* @param buffer_size The number of samples in the audio signal.
|
||||
* @param num_results Number of candidate transcripts to return.
|
||||
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
*
|
||||
* @return Outputs a Metadata object of individual letters along with their timing information.
|
||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript
|
||||
* has per-token metadata including timing information.
|
||||
*/
|
||||
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
|
||||
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
|
||||
|
@ -165,7 +166,7 @@ public class DeepSpeechModel {
|
|||
* @brief Compute the intermediate decoding of an ongoing streaming inference.
|
||||
*
|
||||
* @param ctx A streaming state pointer returned by createStream().
|
||||
* @param num_results Number of candidate transcripts to return.
|
||||
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
*
|
||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
|
||||
*/
|
||||
|
@ -174,8 +175,8 @@ public class DeepSpeechModel {
|
|||
}
|
||||
|
||||
/**
|
||||
* @brief Signal the end of an audio signal to an ongoing streaming
|
||||
* inference, returns the STT result over the whole audio signal.
|
||||
* @brief Compute the final decoding of an ongoing streaming inference and return
|
||||
* the result. Signals the end of an ongoing streaming inference.
|
||||
*
|
||||
* @param ctx A streaming state pointer returned by createStream().
|
||||
*
|
||||
|
@ -188,13 +189,15 @@ public class DeepSpeechModel {
|
|||
}
|
||||
|
||||
/**
|
||||
* @brief Signal the end of an audio signal to an ongoing streaming
|
||||
* inference, returns per-letter metadata.
|
||||
* @brief Compute the final decoding of an ongoing streaming inference and return
|
||||
* the results including metadata. Signals the end of an ongoing streaming
|
||||
* inference.
|
||||
*
|
||||
* @param ctx A streaming state pointer returned by createStream().
|
||||
* @param num_results Number of candidate transcripts to return.
|
||||
* @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
*
|
||||
* @return Outputs a Metadata object of individual letters along with their timing information.
|
||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript
|
||||
* has per-token metadata including timing information.
|
||||
*
|
||||
* @note This method will free the state pointer (@p ctx).
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/* ----------------------------------------------------------------------------
|
||||
* This file was automatically generated by SWIG (http://www.swig.org).
|
||||
* Version 4.0.1
|
||||
*
|
||||
* Do not make changes to this file unless you know what you are doing--modify
|
||||
* the SWIG interface file instead.
|
||||
* ----------------------------------------------------------------------------- */
|
||||
|
||||
package org.mozilla.deepspeech.libdeepspeech;
|
||||
|
||||
/**
|
||||
* A single transcript computed by the model, including a confidence value and
|
||||
* the metadata for its constituent tokens.
|
||||
*/
|
||||
public class CandidateTranscript {
|
||||
private transient long swigCPtr;
|
||||
protected transient boolean swigCMemOwn;
|
||||
|
||||
protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
|
||||
swigCMemOwn = cMemoryOwn;
|
||||
swigCPtr = cPtr;
|
||||
}
|
||||
|
||||
protected static long getCPtr(CandidateTranscript obj) {
|
||||
return (obj == null) ? 0 : obj.swigCPtr;
|
||||
}
|
||||
|
||||
public synchronized void delete() {
|
||||
if (swigCPtr != 0) {
|
||||
if (swigCMemOwn) {
|
||||
swigCMemOwn = false;
|
||||
throw new UnsupportedOperationException("C++ destructor does not have public access");
|
||||
}
|
||||
swigCPtr = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Array of TokenMetadata objects
|
||||
*/
|
||||
public void setTokens(TokenMetadata value) {
|
||||
implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Array of TokenMetadata objects
|
||||
*/
|
||||
public TokenMetadata getTokens() {
|
||||
long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
|
||||
return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Size of the tokens array
|
||||
*/
|
||||
public void setNum_tokens(int value) {
|
||||
implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Size of the tokens array
|
||||
*/
|
||||
public int getNum_tokens() {
|
||||
return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Approximated confidence value for this transcript. This is roughly the
|
||||
* sum of the acoustic model logit values for each timestep/character that
|
||||
* contributed to the creation of this transcript.
|
||||
*/
|
||||
public void setConfidence(double value) {
|
||||
implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Approximated confidence value for this transcript. This is roughly the
|
||||
* sum of the acoustic model logit values for each timestep/character that
|
||||
* contributed to the creation of this transcript.
|
||||
*/
|
||||
public double getConfidence() {
|
||||
return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve one TokenMetadata element
|
||||
*
|
||||
* @param i Array index of the TokenMetadata to get
|
||||
*
|
||||
* @return The TokenMetadata requested or null
|
||||
*/
|
||||
public TokenMetadata getToken(int i) {
|
||||
return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/* ----------------------------------------------------------------------------
|
||||
* This file was automatically generated by SWIG (http://www.swig.org).
|
||||
* Version 4.0.2
|
||||
* Version 4.0.1
|
||||
*
|
||||
* Do not make changes to this file unless you know what you are doing--modify
|
||||
* the SWIG interface file instead.
|
||||
|
@ -9,7 +9,7 @@
|
|||
package org.mozilla.deepspeech.libdeepspeech;
|
||||
|
||||
/**
|
||||
* Stores the entire CTC output as an array of character metadata objects
|
||||
* An array of CandidateTranscript objects computed by the model.
|
||||
*/
|
||||
public class Metadata {
|
||||
private transient long swigCPtr;
|
||||
|
@ -40,61 +40,43 @@ public class Metadata {
|
|||
}
|
||||
|
||||
/**
|
||||
* List of items
|
||||
* Array of CandidateTranscript objects
|
||||
*/
|
||||
public void setItems(MetadataItem value) {
|
||||
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
|
||||
public void setTranscripts(CandidateTranscript value) {
|
||||
implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value);
|
||||
}
|
||||
|
||||
/**
|
||||
* List of items
|
||||
* Array of CandidateTranscript objects
|
||||
*/
|
||||
public MetadataItem getItems() {
|
||||
long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
|
||||
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
|
||||
public CandidateTranscript getTranscripts() {
|
||||
long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
|
||||
return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Size of the list of items
|
||||
* Size of the transcripts array
|
||||
*/
|
||||
public void setNum_items(int value) {
|
||||
implJNI.Metadata_num_items_set(swigCPtr, this, value);
|
||||
public void setNum_transcripts(int value) {
|
||||
implJNI.Metadata_num_transcripts_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Size of the list of items
|
||||
* Size of the transcripts array
|
||||
*/
|
||||
public int getNum_items() {
|
||||
return implJNI.Metadata_num_items_get(swigCPtr, this);
|
||||
public int getNum_transcripts() {
|
||||
return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Approximated confidence value for this transcription. This is roughly the<br>
|
||||
* sum of the acoustic model logit values for each timestep/character that<br>
|
||||
* contributed to the creation of this transcription.
|
||||
* Retrieve one CandidateTranscript element
|
||||
*
|
||||
* @param i Array index of the CandidateTranscript to get
|
||||
*
|
||||
* @return The CandidateTranscript requested or null
|
||||
*/
|
||||
public void setConfidence(double value) {
|
||||
implJNI.Metadata_confidence_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Approximated confidence value for this transcription. This is roughly the<br>
|
||||
* sum of the acoustic model logit values for each timestep/character that<br>
|
||||
* contributed to the creation of this transcription.
|
||||
*/
|
||||
public double getConfidence() {
|
||||
return implJNI.Metadata_confidence_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve one MetadataItem element<br>
|
||||
* <br>
|
||||
* @param i Array index of the MetadataItem to get<br>
|
||||
* <br>
|
||||
* @return The MetadataItem requested or null
|
||||
*/
|
||||
public MetadataItem getItem(int i) {
|
||||
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
|
||||
public CandidateTranscript getTranscript(int i) {
|
||||
return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
/* ----------------------------------------------------------------------------
|
||||
* This file was automatically generated by SWIG (http://www.swig.org).
|
||||
* Version 4.0.1
|
||||
*
|
||||
* Do not make changes to this file unless you know what you are doing--modify
|
||||
* the SWIG interface file instead.
|
||||
* ----------------------------------------------------------------------------- */
|
||||
|
||||
package org.mozilla.deepspeech.libdeepspeech;
|
||||
|
||||
/**
|
||||
* Stores text of an individual token, along with its timing information
|
||||
*/
|
||||
public class TokenMetadata {
|
||||
private transient long swigCPtr;
|
||||
protected transient boolean swigCMemOwn;
|
||||
|
||||
protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
|
||||
swigCMemOwn = cMemoryOwn;
|
||||
swigCPtr = cPtr;
|
||||
}
|
||||
|
||||
protected static long getCPtr(TokenMetadata obj) {
|
||||
return (obj == null) ? 0 : obj.swigCPtr;
|
||||
}
|
||||
|
||||
public synchronized void delete() {
|
||||
if (swigCPtr != 0) {
|
||||
if (swigCMemOwn) {
|
||||
swigCMemOwn = false;
|
||||
throw new UnsupportedOperationException("C++ destructor does not have public access");
|
||||
}
|
||||
swigCPtr = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The text corresponding to this token
|
||||
*/
|
||||
public void setText(String value) {
|
||||
implJNI.TokenMetadata_text_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* The text corresponding to this token
|
||||
*/
|
||||
public String getText() {
|
||||
return implJNI.TokenMetadata_text_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in units of 20ms
|
||||
*/
|
||||
public void setTimestep(int value) {
|
||||
implJNI.TokenMetadata_timestep_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in units of 20ms
|
||||
*/
|
||||
public int getTimestep() {
|
||||
return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in seconds
|
||||
*/
|
||||
public void setStart_time(float value) {
|
||||
implJNI.TokenMetadata_start_time_set(swigCPtr, this, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in seconds
|
||||
*/
|
||||
public float getStart_time() {
|
||||
return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
}
|
|
@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
|
||||
* about the results.
|
||||
* Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
|
||||
*
|
||||
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
|
||||
*
|
||||
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
|
||||
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
|
||||
*/
|
||||
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
|
||||
aNumResults = aNumResults || 1;
|
||||
|
@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute the intermediate decoding of an ongoing streaming inference.
|
||||
* Compute the intermediate decoding of an ongoing streaming inference, return results including metadata.
|
||||
*
|
||||
* @return {string} The STT intermediate result.
|
||||
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
|
||||
*
|
||||
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
|
||||
*/
|
||||
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
|
||||
aNumResults = aNumResults || 1;
|
||||
|
@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
* Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
|
||||
*
|
||||
* @return {string} The STT result.
|
||||
*
|
||||
|
@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
|
|||
}
|
||||
|
||||
/**
|
||||
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
|
||||
* Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
|
||||
*
|
||||
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
|
||||
*
|
||||
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
|
||||
*
|
||||
|
@ -253,48 +257,49 @@ function Version() {
|
|||
/**
|
||||
* @class
|
||||
*
|
||||
* Stores each individual character, along with its timing information
|
||||
* Stores text of an individual token, along with its timing information
|
||||
*/
|
||||
function TokenMetadata() {}
|
||||
|
||||
/**
|
||||
* The character generated for transcription
|
||||
* The text corresponding to this token
|
||||
*
|
||||
* @return {string} The character generated
|
||||
* @return {string} The text generated
|
||||
*/
|
||||
TokenMetadata.prototype.text = function() {}
|
||||
|
||||
/**
|
||||
* Position of the character in units of 20ms
|
||||
* Position of the token in units of 20ms
|
||||
*
|
||||
* @return {int} The position of the character
|
||||
* @return {int} The position of the token
|
||||
*/
|
||||
TokenMetadata.prototype.timestep = function() {};
|
||||
|
||||
/**
|
||||
* Position of the character in seconds
|
||||
* Position of the token in seconds
|
||||
*
|
||||
* @return {float} The position of the character
|
||||
* @return {float} The position of the token
|
||||
*/
|
||||
TokenMetadata.prototype.start_time = function() {};
|
||||
|
||||
/**
|
||||
* @class
|
||||
*
|
||||
* Stores the entire CTC output as an array of character metadata objects
|
||||
* A single transcript computed by the model, including a confidence value and
|
||||
* the metadata for its constituent tokens.
|
||||
*/
|
||||
function CandidateTranscript () {}
|
||||
|
||||
/**
|
||||
* List of items
|
||||
* Array of tokens
|
||||
*
|
||||
* @return {array} List of :js:func:`TokenMetadata`
|
||||
* @return {array} Array of :js:func:`TokenMetadata`
|
||||
*/
|
||||
CandidateTranscript.prototype.items = function() {}
|
||||
CandidateTranscript.prototype.tokens = function() {}
|
||||
|
||||
/**
|
||||
* Approximated confidence value for this transcript. This is roughly the
|
||||
* sum of the acoustic model logit values for each timestep/character that
|
||||
* sum of the acoustic model logit values for each timestep/token that
|
||||
* contributed to the creation of this transcript.
|
||||
*
|
||||
* @return {float} Confidence value
|
||||
|
@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {}
|
|||
/**
|
||||
* @class
|
||||
*
|
||||
* Stores the entire CTC output as an array of character metadata objects
|
||||
* An array of CandidateTranscript objects computed by the model.
|
||||
*/
|
||||
function Metadata () {}
|
||||
|
||||
/**
|
||||
* List of items
|
||||
* Array of transcripts
|
||||
*
|
||||
* @return {array} List of :js:func:`CandidateTranscript` objects
|
||||
* @return {array} Array of :js:func:`CandidateTranscript` objects
|
||||
*/
|
||||
Metadata.prototype.transcripts = function() {}
|
||||
|
||||
|
|
|
@ -123,15 +123,15 @@ class Model(object):
|
|||
|
||||
def sttWithMetadata(self, audio_buffer, num_results=1):
|
||||
"""
|
||||
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
|
||||
Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
|
||||
|
||||
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type audio_buffer: numpy.int16 array
|
||||
|
||||
:param num_results: Number of candidate transcripts to return.
|
||||
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
:type num_results: int
|
||||
|
||||
:return: Outputs a struct of individual letters along with their timing information.
|
||||
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
|
||||
:type: :func:`Metadata`
|
||||
"""
|
||||
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
|
||||
|
@ -192,10 +192,13 @@ class Stream(object):
|
|||
|
||||
def intermediateDecodeWithMetadata(self, num_results=1):
|
||||
"""
|
||||
Compute the intermediate decoding of an ongoing streaming inference.
|
||||
Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
|
||||
|
||||
:return: The STT intermediate result.
|
||||
:type: str
|
||||
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
:type num_results: int
|
||||
|
||||
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
|
||||
:type: :func:`Metadata`
|
||||
|
||||
:throws: RuntimeError if the stream object is not valid
|
||||
"""
|
||||
|
@ -205,8 +208,9 @@ class Stream(object):
|
|||
|
||||
def finishStream(self):
|
||||
"""
|
||||
Signal the end of an audio signal to an ongoing streaming inference,
|
||||
returns the STT result over the whole audio signal.
|
||||
Compute the final decoding of an ongoing streaming inference and return
|
||||
the result. Signals the end of an ongoing streaming inference. The underlying
|
||||
stream object must not be used after this method is called.
|
||||
|
||||
:return: The STT result.
|
||||
:type: str
|
||||
|
@ -221,13 +225,15 @@ class Stream(object):
|
|||
|
||||
def finishStreamWithMetadata(self, num_results=1):
|
||||
"""
|
||||
Signal the end of an audio signal to an ongoing streaming inference,
|
||||
returns per-letter metadata.
|
||||
Compute the final decoding of an ongoing streaming inference and return
|
||||
results including metadata. Signals the end of an ongoing streaming
|
||||
inference. The underlying stream object must not be used after this
|
||||
method is called.
|
||||
|
||||
:param num_results: Number of candidate transcripts to return.
|
||||
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
|
||||
:type num_results: int
|
||||
|
||||
:return: Outputs a struct of individual letters along with their timing information.
|
||||
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
|
||||
:type: :func:`Metadata`
|
||||
|
||||
:throws: RuntimeError if the stream object is not valid
|
||||
|
|
Loading…
Reference in New Issue