Address review comments
This commit is contained in:
parent
e9ae38bf47
commit
2ec34d5a06
|
@ -31,20 +31,20 @@ ErrorCodes
|
|||
Metadata
|
||||
--------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
|
||||
.. doxygenclass:: DeepSpeechClient::Models::Metadata
|
||||
:project: deepspeech-dotnet
|
||||
:members: Transcripts
|
||||
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
|
||||
.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
|
||||
:project: deepspeech-dotnet
|
||||
:members: Tokens, Confidence
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
|
||||
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
|
||||
.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
|
||||
:project: deepspeech-dotnet
|
||||
:members: Text, Timestep, StartTime
|
||||
|
|
|
@ -13,11 +13,17 @@ Metadata
|
|||
|
||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
|
||||
:project: deepspeech-java
|
||||
:members: getItems, getNum_items, getProbability, getItem
|
||||
:members: getTranscripts, getNum_transcripts, getTranscript
|
||||
|
||||
MetadataItem
|
||||
------------
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
|
||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
|
||||
:project: deepspeech-java
|
||||
:members: getCharacter, getTimestep, getStart_time
|
||||
:members: getTokens, getNum_tokens, getConfidence, getToken
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
|
||||
:project: deepspeech-java
|
||||
:members: getText, getTimestep, getStart_time
|
||||
|
|
|
@ -8,9 +8,16 @@ Metadata
|
|||
:project: deepspeech-c
|
||||
:members:
|
||||
|
||||
MetadataItem
|
||||
------------
|
||||
CandidateTranscript
|
||||
-------------------
|
||||
|
||||
.. doxygenstruct:: MetadataItem
|
||||
.. doxygenstruct:: CandidateTranscript
|
||||
:project: deepspeech-c
|
||||
:members:
|
||||
|
||||
TokenMetadata
|
||||
-------------
|
||||
|
||||
.. doxygenstruct:: TokenMetadata
|
||||
:project: deepspeech-c
|
||||
:members:
|
||||
|
|
|
@ -790,7 +790,7 @@ WARN_LOGFILE =
|
|||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||
# Note: If this tag is empty the current directory is searched.
|
||||
|
||||
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
|
||||
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||
|
|
|
@ -34,6 +34,8 @@ bool extended_metadata = false;
|
|||
|
||||
bool json_output = false;
|
||||
|
||||
int json_candidate_transcripts = 3;
|
||||
|
||||
int stream_size = 0;
|
||||
|
||||
void PrintHelp(const char* bin)
|
||||
|
@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
|
|||
"\n"
|
||||
"Running DeepSpeech inference.\n"
|
||||
"\n"
|
||||
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
|
||||
"\t--scorer SCORER\t\tPath to the external scorer file\n"
|
||||
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
|
||||
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
|
||||
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
|
||||
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
|
||||
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
|
||||
"\t--extended\t\tOutput string from extended metadata\n"
|
||||
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
|
||||
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
|
||||
"\t--help\t\t\tShow help\n"
|
||||
"\t--version\t\tPrint version and exits\n";
|
||||
"\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
|
||||
"\t--scorer SCORER\t\t\tPath to the external scorer file\n"
|
||||
"\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
|
||||
"\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
|
||||
"\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
|
||||
"\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
|
||||
"\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
|
||||
"\t--extended\t\t\tOutput string from extended metadata\n"
|
||||
"\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
|
||||
"\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
|
||||
"\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
|
||||
"\t--help\t\t\t\tShow help\n"
|
||||
"\t--version\t\t\tPrint version and exits\n";
|
||||
char* version = DS_Version();
|
||||
std::cerr << "DeepSpeech " << version << "\n";
|
||||
DS_FreeString(version);
|
||||
|
@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
|
|||
{"t", no_argument, nullptr, 't'},
|
||||
{"extended", no_argument, nullptr, 'e'},
|
||||
{"json", no_argument, nullptr, 'j'},
|
||||
{"candidate_transcripts", required_argument, nullptr, 150},
|
||||
{"stream", required_argument, nullptr, 's'},
|
||||
{"version", no_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, 'h'},
|
||||
|
@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
|
|||
json_output = true;
|
||||
break;
|
||||
|
||||
case 150:
|
||||
json_candidate_transcripts = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 's':
|
||||
stream_size = atoi(optarg);
|
||||
break;
|
||||
|
|
|
@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
|
|||
{
|
||||
std::string retval = "";
|
||||
for (int i = 0; i < transcript->num_tokens; i++) {
|
||||
TokenMetadata token = transcript->tokens[i];
|
||||
const TokenMetadata& token = transcript->tokens[i];
|
||||
retval += token.text;
|
||||
}
|
||||
return strdup(retval.c_str());
|
||||
|
@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
|
|||
|
||||
// Loop through each token
|
||||
for (int i = 0; i < transcript->num_tokens; i++) {
|
||||
TokenMetadata token = transcript->tokens[i];
|
||||
const TokenMetadata& token = transcript->tokens[i];
|
||||
|
||||
// Append token to word if it's not a space
|
||||
if (strcmp(token.text, u8" ") != 0) {
|
||||
|
@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
|
|||
res.string = CandidateTranscriptToString(&result->transcripts[0]);
|
||||
DS_FreeMetadata(result);
|
||||
} else if (json_output) {
|
||||
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
|
||||
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
|
||||
res.string = MetadataToJSON(result);
|
||||
DS_FreeMetadata(result);
|
||||
} else if (stream_size > 0) {
|
||||
|
|
|
@ -60,7 +60,7 @@ public:
|
|||
int time_dim,
|
||||
int class_dim);
|
||||
|
||||
/* Get transcription from current decoder state
|
||||
/* Get up to num_results transcriptions from current decoder state.
|
||||
*
|
||||
* Parameters:
|
||||
* num_results: Number of beams to return.
|
||||
|
|
|
@ -27,7 +27,7 @@ typedef struct TokenMetadata {
|
|||
char* text;
|
||||
|
||||
/** Position of the token in units of 20ms */
|
||||
int timestep;
|
||||
unsigned int timestep;
|
||||
|
||||
/** Position of the token in seconds */
|
||||
float start_time;
|
||||
|
@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
|
|||
/** Array of TokenMetadata objects */
|
||||
TokenMetadata* tokens;
|
||||
/** Size of the tokens array */
|
||||
int num_tokens;
|
||||
unsigned int num_tokens;
|
||||
/** Approximated confidence value for this transcript. This is roughly the
|
||||
* sum of the acoustic model logit values for each timestep/character that
|
||||
* contributed to the creation of this transcript.
|
||||
|
@ -56,7 +56,7 @@ typedef struct Metadata {
|
|||
/** Array of CandidateTranscript objects */
|
||||
CandidateTranscript* transcripts;
|
||||
/** Size of the transcripts array */
|
||||
int num_transcripts;
|
||||
unsigned int num_transcripts;
|
||||
} Metadata;
|
||||
|
||||
enum DeepSpeech_Error_Codes
|
||||
|
@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
|
|||
float aBeta);
|
||||
|
||||
/**
|
||||
* @brief Use the DeepSpeech model to perform Speech-To-Text.
|
||||
* @brief Use the DeepSpeech model to convert speech to text.
|
||||
*
|
||||
* @param aCtx The ModelState pointer for the model to use.
|
||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
||||
|
@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
|
|||
unsigned int aBufferSize);
|
||||
|
||||
/**
|
||||
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
|
||||
* @brief Use the DeepSpeech model to convert speech to text and output results
|
||||
* including metadata.
|
||||
*
|
||||
* @param aCtx The ModelState pointer for the model to use.
|
||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
|
||||
* sample rate (matching what the model was trained on).
|
||||
* @param aBufferSize The number of samples in the audio signal.
|
||||
* @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
|
||||
* @param aNumResults The maximum number of CandidateTranscript structs to return. The number actually returned might be smaller than this.
|
||||
*
|
||||
* @return Metadata struct containing multiple candidate transcripts. Each transcript
|
||||
* has per-token metadata including timing information. The user is
|
||||
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
|
||||
* @return Metadata struct containing multiple CandidateTranscript structs. Each
|
||||
* transcript has per-token metadata including timing information. The
|
||||
* user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
|
||||
* Returns NULL on error.
|
||||
*/
|
||||
DEEPSPEECH_EXPORT
|
||||
|
|
|
@ -66,7 +66,7 @@ struct ModelState {
|
|||
* @brief Return character-level metadata including letter timings.
|
||||
*
|
||||
* @param state Decoder state to use when decoding.
|
||||
* @param num_results Number of candidate results to return.
|
||||
* @param num_results Maximum number of candidate results to return.
|
||||
*
|
||||
* @return A Metadata struct containing CandidateTranscript structs.
|
||||
* Each represents a candidate transcript, with the first ranked most probable.
|
||||
|
|
Loading…
Reference in New Issue