Address review comments

Reuben Morais 2020-03-17 14:47:18 +01:00
parent e9ae38bf47
commit 2ec34d5a06
9 changed files with 59 additions and 38 deletions

View File

@@ -31,20 +31,20 @@ ErrorCodes
Metadata
--------
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
.. doxygenclass:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
:members: Transcripts
CandidateTranscript
-------------------
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
:members: Tokens, Confidence
TokenMetadata
-------------
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet
:members: Text, Timestep, StartTime

View File

@@ -13,11 +13,17 @@ Metadata
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
:project: deepspeech-java
:members: getItems, getNum_items, getProbability, getItem
:members: getTranscripts, getNum_transcripts, getTranscript
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
:project: deepspeech-java
:members: getCharacter, getTimestep, getStart_time
:members: getTokens, getNum_tokens, getConfidence, getToken
TokenMetadata
-------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
:project: deepspeech-java
:members: getText, getTimestep, getStart_time

View File

@@ -8,9 +8,16 @@ Metadata
:project: deepspeech-c
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: MetadataItem
.. doxygenstruct:: CandidateTranscript
:project: deepspeech-c
:members:
TokenMetadata
-------------
.. doxygenstruct:: TokenMetadata
:project: deepspeech-c
:members:

View File

@@ -790,7 +790,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@@ -34,6 +34,8 @@ bool extended_metadata = false;
bool json_output = false;
int json_candidate_transcripts = 3;
int stream_size = 0;
void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
"\n"
"Running DeepSpeech inference.\n"
"\n"
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\tOutput string from extended metadata\n"
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\tShow help\n"
"\t--version\t\tPrint version and exits\n";
"\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
"\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\t\tOutput string from extended metadata\n"
"\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
"\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
"\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\t\tShow help\n"
"\t--version\t\t\tPrint version and exits\n";
char* version = DS_Version();
std::cerr << "DeepSpeech " << version << "\n";
DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
{"t", no_argument, nullptr, 't'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"candidate_transcripts", required_argument, nullptr, 150},
{"stream", required_argument, nullptr, 's'},
{"version", no_argument, nullptr, 'v'},
{"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
json_output = true;
break;
case 150:
json_candidate_transcripts = atoi(optarg);
break;
case 's':
stream_size = atoi(optarg);
break;
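
The new --candidate_transcripts option is wired in with the getopt_long pattern already used by this client: since it has no single-character alias, it is registered with required_argument and an arbitrary sentinel value (150, chosen so it cannot collide with any short option character) and is then recognized by that value in the switch. A minimal self-contained sketch of that pattern, assuming only <getopt.h>; ParseArgs and long_opts are illustrative names, not the client's real ones:

#include <getopt.h>
#include <cstdlib>

// Default of 3 candidate transcripts, matching the value introduced above.
static int json_candidate_transcripts = 3;

static bool ParseArgs(int argc, char** argv)
{
  static const struct option long_opts[] = {
    {"candidate_transcripts", required_argument, nullptr, 150},
    {nullptr, 0, nullptr, 0}  // terminator required by getopt_long
  };

  int opt;
  while ((opt = getopt_long(argc, argv, "", long_opts, nullptr)) != -1) {
    switch (opt) {
      case 150:
        // getopt_long places the option's argument in the global optarg
        json_candidate_transcripts = atoi(optarg);
        break;
      default:
        return false;
    }
  }
  return true;
}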

View File

@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
{
std::string retval = "";
for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i];
const TokenMetadata& token = transcript->tokens[i];
retval += token.text;
}
return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
// Loop through each token
for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i];
const TokenMetadata& token = transcript->tokens[i];
// Append token to word if it's not a space
if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
res.string = CandidateTranscriptToString(&result->transcripts[0]);
DS_FreeMetadata(result);
} else if (json_output) {
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
res.string = MetadataToJSON(result);
DS_FreeMetadata(result);
} else if (stream_size > 0) {
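
Two review fixes land in this file: the token loops now bind a const TokenMetadata& instead of copying the struct on every iteration, and the previously hard-coded 3 passed to DS_SpeechToTextWithMetadata is replaced by the new json_candidate_transcripts setting. A short sketch of the reference-binding idiom, assuming only the structs declared in deepspeech.h; the helper name ConcatTokens is hypothetical:

#include <string>
#include "deepspeech.h"

// Concatenate the text of every token in a transcript without copying
// each TokenMetadata struct along the way.
static std::string ConcatTokens(const CandidateTranscript* transcript)
{
  std::string text;
  for (unsigned int i = 0; i < transcript->num_tokens; ++i) {
    const TokenMetadata& token = transcript->tokens[i];  // reference, no copy
    text += token.text;
  }
  return text;
}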

View File

@@ -60,7 +60,7 @@ public:
int time_dim,
int class_dim);
/* Get transcription from current decoder state
/* Get up to num_results transcriptions from current decoder state.
*
* Parameters:
* num_results: Number of beams to return.

View File

@@ -27,7 +27,7 @@ typedef struct TokenMetadata {
char* text;
/** Position of the token in units of 20ms */
int timestep;
unsigned int timestep;
/** Position of the token in seconds */
float start_time;
@@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
/** Array of TokenMetadata objects */
TokenMetadata* tokens;
/** Size of the tokens array */
int num_tokens;
unsigned int num_tokens;
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
@@ -56,7 +56,7 @@ typedef struct Metadata {
/** Array of CandidateTranscript objects */
CandidateTranscript* transcripts;
/** Size of the transcripts array */
int num_transcripts;
unsigned int num_transcripts;
} Metadata;
enum DeepSpeech_Error_Codes
@@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
float aBeta);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
* @brief Use the DeepSpeech model to convert speech to text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
* @brief Use the DeepSpeech model to convert speech to text and output results
* including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
* @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* @return Metadata struct containing multiple CandidateTranscript structs. Each
* transcript has per-token metadata including timing information. The
* user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error.
*/
DEEPSPEECH_EXPORT
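
For reference, a hedged end-to-end sketch of the metadata API as documented above: request up to N CandidateTranscript structs, iterate with the now-unsigned counters, and release the result with DS_FreeMetadata. The function name PrintCandidates and the assumption that ctx, buffer and buffer_size are already prepared are illustrative, not taken from the repository:

#include <cstdio>
#include "deepspeech.h"

static void PrintCandidates(ModelState* ctx, const short* buffer, unsigned int buffer_size)
{
  // Ask for at most 3 candidate transcripts; fewer may come back.
  Metadata* metadata = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
  if (metadata == nullptr) {
    return;  // documented to return NULL on error
  }
  for (unsigned int i = 0; i < metadata->num_transcripts; ++i) {
    const CandidateTranscript& transcript = metadata->transcripts[i];
    printf("candidate %u (confidence %.3f): ", i, transcript.confidence);
    for (unsigned int j = 0; j < transcript.num_tokens; ++j) {
      printf("%s", transcript.tokens[j].text);  // space characters arrive as their own tokens
    }
    printf("\n");
  }
  DS_FreeMetadata(metadata);  // the caller owns the Metadata struct
}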

View File

@@ -66,7 +66,7 @@ struct ModelState {
* @brief Return character-level metadata including letter timings.
*
* @param state Decoder state to use when decoding.
* @param num_results Number of candidate results to return.
* @param num_results Maximum number of candidate results to return.
*
* @return A Metadata struct containing CandidateTranscript structs.
* Each represents a candidate transcript, with the first ranked most probable.
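
A small hedged sketch of the contract described here, reusing the public Metadata struct: candidates are ordered by rank, so index 0 is the most probable, and num_transcripts may be smaller than the maximum that was requested. BestCandidate is a hypothetical helper, not part of the codebase:

#include "deepspeech.h"

// Return the highest-ranked candidate, or nullptr if the struct is empty.
static const CandidateTranscript* BestCandidate(const Metadata* metadata)
{
  if (metadata == nullptr || metadata->num_transcripts == 0) {
    return nullptr;
  }
  return &metadata->transcripts[0];  // transcripts are ranked, best first
}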