Address review comments
This commit is contained in:
parent
e9ae38bf47
commit
2ec34d5a06
|
@ -31,20 +31,20 @@ ErrorCodes
|
||||||
Metadata
|
Metadata
|
||||||
--------
|
--------
|
||||||
|
|
||||||
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
|
.. doxygenclass:: DeepSpeechClient::Models::Metadata
|
||||||
:project: deepspeech-dotnet
|
:project: deepspeech-dotnet
|
||||||
:members: Transcripts
|
:members: Transcripts
|
||||||
|
|
||||||
CandidateTranscript
|
CandidateTranscript
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
|
.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
|
||||||
:project: deepspeech-dotnet
|
:project: deepspeech-dotnet
|
||||||
:members: Tokens, Confidence
|
:members: Tokens, Confidence
|
||||||
|
|
||||||
TokenMetadata
|
TokenMetadata
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
|
.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
|
||||||
:project: deepspeech-dotnet
|
:project: deepspeech-dotnet
|
||||||
:members: Text, Timestep, StartTime
|
:members: Text, Timestep, StartTime
|
||||||
|
|
|
@ -13,11 +13,17 @@ Metadata
|
||||||
|
|
||||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
|
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
|
||||||
:project: deepspeech-java
|
:project: deepspeech-java
|
||||||
:members: getItems, getNum_items, getProbability, getItem
|
:members: getTranscripts, getNum_transcripts, getTranscript
|
||||||
|
|
||||||
MetadataItem
|
CandidateTranscript
|
||||||
------------
|
-------------------
|
||||||
|
|
||||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
|
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
|
||||||
:project: deepspeech-java
|
:project: deepspeech-java
|
||||||
:members: getCharacter, getTimestep, getStart_time
|
:members: getTokens, getNum_tokens, getConfidence, getToken
|
||||||
|
|
||||||
|
TokenMetadata
|
||||||
|
-------------
|
||||||
|
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
|
||||||
|
:project: deepspeech-java
|
||||||
|
:members: getText, getTimestep, getStart_time
|
||||||
|
|
|
@ -8,9 +8,16 @@ Metadata
|
||||||
:project: deepspeech-c
|
:project: deepspeech-c
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
MetadataItem
|
CandidateTranscript
|
||||||
------------
|
-------------------
|
||||||
|
|
||||||
.. doxygenstruct:: MetadataItem
|
.. doxygenstruct:: CandidateTranscript
|
||||||
|
:project: deepspeech-c
|
||||||
|
:members:
|
||||||
|
|
||||||
|
TokenMetadata
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. doxygenstruct:: TokenMetadata
|
||||||
:project: deepspeech-c
|
:project: deepspeech-c
|
||||||
:members:
|
:members:
|
||||||
|
|
|
@ -790,7 +790,7 @@ WARN_LOGFILE =
|
||||||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||||
# Note: If this tag is empty the current directory is searched.
|
# Note: If this tag is empty the current directory is searched.
|
||||||
|
|
||||||
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
|
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
|
||||||
|
|
||||||
# This tag can be used to specify the character encoding of the source files
|
# This tag can be used to specify the character encoding of the source files
|
||||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||||
|
|
|
@ -34,6 +34,8 @@ bool extended_metadata = false;
|
||||||
|
|
||||||
bool json_output = false;
|
bool json_output = false;
|
||||||
|
|
||||||
|
int json_candidate_transcripts = 3;
|
||||||
|
|
||||||
int stream_size = 0;
|
int stream_size = 0;
|
||||||
|
|
||||||
void PrintHelp(const char* bin)
|
void PrintHelp(const char* bin)
|
||||||
|
@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
|
||||||
"\n"
|
"\n"
|
||||||
"Running DeepSpeech inference.\n"
|
"Running DeepSpeech inference.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
|
"\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
|
||||||
"\t--scorer SCORER\t\tPath to the external scorer file\n"
|
"\t--scorer SCORER\t\t\tPath to the external scorer file\n"
|
||||||
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
|
"\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
|
||||||
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
|
"\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
|
||||||
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
|
"\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
|
||||||
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
|
"\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
|
||||||
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
|
"\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
|
||||||
"\t--extended\t\tOutput string from extended metadata\n"
|
"\t--extended\t\t\tOutput string from extended metadata\n"
|
||||||
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
|
"\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
|
||||||
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
|
"\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
|
||||||
"\t--help\t\t\tShow help\n"
|
"\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
|
||||||
"\t--version\t\tPrint version and exits\n";
|
"\t--help\t\t\t\tShow help\n"
|
||||||
|
"\t--version\t\t\tPrint version and exits\n";
|
||||||
char* version = DS_Version();
|
char* version = DS_Version();
|
||||||
std::cerr << "DeepSpeech " << version << "\n";
|
std::cerr << "DeepSpeech " << version << "\n";
|
||||||
DS_FreeString(version);
|
DS_FreeString(version);
|
||||||
|
@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
|
||||||
{"t", no_argument, nullptr, 't'},
|
{"t", no_argument, nullptr, 't'},
|
||||||
{"extended", no_argument, nullptr, 'e'},
|
{"extended", no_argument, nullptr, 'e'},
|
||||||
{"json", no_argument, nullptr, 'j'},
|
{"json", no_argument, nullptr, 'j'},
|
||||||
|
{"candidate_transcripts", required_argument, nullptr, 150},
|
||||||
{"stream", required_argument, nullptr, 's'},
|
{"stream", required_argument, nullptr, 's'},
|
||||||
{"version", no_argument, nullptr, 'v'},
|
{"version", no_argument, nullptr, 'v'},
|
||||||
{"help", no_argument, nullptr, 'h'},
|
{"help", no_argument, nullptr, 'h'},
|
||||||
|
@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
|
||||||
json_output = true;
|
json_output = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 150:
|
||||||
|
json_candidate_transcripts = atoi(optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
case 's':
|
case 's':
|
||||||
stream_size = atoi(optarg);
|
stream_size = atoi(optarg);
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
|
||||||
{
|
{
|
||||||
std::string retval = "";
|
std::string retval = "";
|
||||||
for (int i = 0; i < transcript->num_tokens; i++) {
|
for (int i = 0; i < transcript->num_tokens; i++) {
|
||||||
TokenMetadata token = transcript->tokens[i];
|
const TokenMetadata& token = transcript->tokens[i];
|
||||||
retval += token.text;
|
retval += token.text;
|
||||||
}
|
}
|
||||||
return strdup(retval.c_str());
|
return strdup(retval.c_str());
|
||||||
|
@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
|
||||||
|
|
||||||
// Loop through each token
|
// Loop through each token
|
||||||
for (int i = 0; i < transcript->num_tokens; i++) {
|
for (int i = 0; i < transcript->num_tokens; i++) {
|
||||||
TokenMetadata token = transcript->tokens[i];
|
const TokenMetadata& token = transcript->tokens[i];
|
||||||
|
|
||||||
// Append token to word if it's not a space
|
// Append token to word if it's not a space
|
||||||
if (strcmp(token.text, u8" ") != 0) {
|
if (strcmp(token.text, u8" ") != 0) {
|
||||||
|
@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
|
||||||
res.string = CandidateTranscriptToString(&result->transcripts[0]);
|
res.string = CandidateTranscriptToString(&result->transcripts[0]);
|
||||||
DS_FreeMetadata(result);
|
DS_FreeMetadata(result);
|
||||||
} else if (json_output) {
|
} else if (json_output) {
|
||||||
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
|
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
|
||||||
res.string = MetadataToJSON(result);
|
res.string = MetadataToJSON(result);
|
||||||
DS_FreeMetadata(result);
|
DS_FreeMetadata(result);
|
||||||
} else if (stream_size > 0) {
|
} else if (stream_size > 0) {
|
||||||
|
|
|
@ -60,7 +60,7 @@ public:
|
||||||
int time_dim,
|
int time_dim,
|
||||||
int class_dim);
|
int class_dim);
|
||||||
|
|
||||||
/* Get transcription from current decoder state
|
/* Get up to num_results transcriptions from current decoder state.
|
||||||
*
|
*
|
||||||
* Parameters:
|
* Parameters:
|
||||||
* num_results: Number of beams to return.
|
* num_results: Number of beams to return.
|
||||||
|
|
|
@ -27,7 +27,7 @@ typedef struct TokenMetadata {
|
||||||
char* text;
|
char* text;
|
||||||
|
|
||||||
/** Position of the token in units of 20ms */
|
/** Position of the token in units of 20ms */
|
||||||
int timestep;
|
unsigned int timestep;
|
||||||
|
|
||||||
/** Position of the token in seconds */
|
/** Position of the token in seconds */
|
||||||
float start_time;
|
float start_time;
|
||||||
|
@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
|
||||||
/** Array of TokenMetadata objects */
|
/** Array of TokenMetadata objects */
|
||||||
TokenMetadata* tokens;
|
TokenMetadata* tokens;
|
||||||
/** Size of the tokens array */
|
/** Size of the tokens array */
|
||||||
int num_tokens;
|
unsigned int num_tokens;
|
||||||
/** Approximated confidence value for this transcript. This is roughly the
|
/** Approximated confidence value for this transcript. This is roughly the
|
||||||
* sum of the acoustic model logit values for each timestep/character that
|
* sum of the acoustic model logit values for each timestep/character that
|
||||||
* contributed to the creation of this transcript.
|
* contributed to the creation of this transcript.
|
||||||
|
@ -56,7 +56,7 @@ typedef struct Metadata {
|
||||||
/** Array of CandidateTranscript objects */
|
/** Array of CandidateTranscript objects */
|
||||||
CandidateTranscript* transcripts;
|
CandidateTranscript* transcripts;
|
||||||
/** Size of the transcripts array */
|
/** Size of the transcripts array */
|
||||||
int num_transcripts;
|
unsigned int num_transcripts;
|
||||||
} Metadata;
|
} Metadata;
|
||||||
|
|
||||||
enum DeepSpeech_Error_Codes
|
enum DeepSpeech_Error_Codes
|
||||||
|
@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
|
||||||
float aBeta);
|
float aBeta);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Use the DeepSpeech model to perform Speech-To-Text.
|
* @brief Use the DeepSpeech model to convert speech to text.
|
||||||
*
|
*
|
||||||
* @param aCtx The ModelState pointer for the model to use.
|
* @param aCtx The ModelState pointer for the model to use.
|
||||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
||||||
|
@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
|
||||||
unsigned int aBufferSize);
|
unsigned int aBufferSize);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
|
* @brief Use the DeepSpeech model to convert speech to text and output results
|
||||||
* including metadata.
|
* including metadata.
|
||||||
*
|
*
|
||||||
* @param aCtx The ModelState pointer for the model to use.
|
* @param aCtx The ModelState pointer for the model to use.
|
||||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
||||||
* sample rate (matching what the model was trained on).
|
* sample rate (matching what the model was trained on).
|
||||||
* @param aBufferSize The number of samples in the audio signal.
|
* @param aBufferSize The number of samples in the audio signal.
|
||||||
* @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
|
* @param aNumResults The maximum number of CandidateTranscript structs to return. The number of structs actually returned might be smaller than this.
|
||||||
*
|
*
|
||||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript
|
* @return Metadata struct containing multiple CandidateTranscript structs. Each
|
||||||
* has per-token metadata including timing information. The user is
|
* transcript has per-token metadata including timing information. The
|
||||||
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
|
* user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
|
||||||
* Returns NULL on error.
|
* Returns NULL on error.
|
||||||
*/
|
*/
|
||||||
DEEPSPEECH_EXPORT
|
DEEPSPEECH_EXPORT
|
||||||
|
|
|
@ -66,7 +66,7 @@ struct ModelState {
|
||||||
* @brief Return character-level metadata including letter timings.
|
* @brief Return character-level metadata including letter timings.
|
||||||
*
|
*
|
||||||
* @param state Decoder state to use when decoding.
|
* @param state Decoder state to use when decoding.
|
||||||
* @param num_results Number of candidate results to return.
|
* @param num_results Maximum number of candidate results to return.
|
||||||
*
|
*
|
||||||
* @return A Metadata struct containing CandidateTranscript structs.
|
* @return A Metadata struct containing CandidateTranscript structs.
|
||||||
* Each represents a candidate transcript, with the first ranked most probable.
|
* Each represents a candidate transcript, with the first ranked most probable.
|
||||||
|
|
Loading…
Reference in New Issue