Address review comments

This commit is contained in:
Reuben Morais 2020-03-17 14:47:18 +01:00
parent e9ae38bf47
commit 2ec34d5a06
9 changed files with 59 additions and 38 deletions

View File

@ -31,20 +31,20 @@ ErrorCodes
Metadata Metadata
-------- --------
.. doxygenstruct:: DeepSpeechClient::Models::Metadata .. doxygenclass:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet :project: deepspeech-dotnet
:members: Transcripts :members: Transcripts
CandidateTranscript CandidateTranscript
------------------- -------------------
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript .. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet :project: deepspeech-dotnet
:members: Tokens, Confidence :members: Tokens, Confidence
TokenMetadata TokenMetadata
------------- -------------
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata .. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet :project: deepspeech-dotnet
:members: Text, Timestep, StartTime :members: Text, Timestep, StartTime

View File

@ -13,11 +13,17 @@ Metadata
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
:project: deepspeech-java :project: deepspeech-java
:members: getItems, getNum_items, getProbability, getItem :members: getTranscripts, getNum_transcripts, getTranscript
MetadataItem CandidateTranscript
------------ -------------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
:project: deepspeech-java :project: deepspeech-java
:members: getCharacter, getTimestep, getStart_time :members: getTokens, getNum_tokens, getConfidence, getToken
TokenMetadata
-------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
:project: deepspeech-java
:members: getText, getTimestep, getStart_time

View File

@ -8,9 +8,16 @@ Metadata
:project: deepspeech-c :project: deepspeech-c
:members: :members:
MetadataItem CandidateTranscript
------------ -------------------
.. doxygenstruct:: MetadataItem .. doxygenstruct:: CandidateTranscript
:project: deepspeech-c
:members:
TokenMetadata
-------------
.. doxygenstruct:: TokenMetadata
:project: deepspeech-c :project: deepspeech-c
:members: :members:

View File

@ -790,7 +790,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched. # Note: If this tag is empty the current directory is searched.
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/ INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
# This tag can be used to specify the character encoding of the source files # This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@ -34,6 +34,8 @@ bool extended_metadata = false;
bool json_output = false; bool json_output = false;
int json_candidate_transcripts = 3;
int stream_size = 0; int stream_size = 0;
void PrintHelp(const char* bin) void PrintHelp(const char* bin)
@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
"\n" "\n"
"Running DeepSpeech inference.\n" "Running DeepSpeech inference.\n"
"\n" "\n"
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n" "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\tPath to the external scorer file\n" "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n" "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n" "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n" "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n" "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n" "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\tOutput string from extended metadata\n" "\t--extended\t\t\tOutput string from extended metadata\n"
"\t--json\t\t\tExtended output, shows word timings as JSON\n" "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
"\t--stream size\t\tRun in stream mode, output intermediate results\n" "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
"\t--help\t\t\tShow help\n" "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
"\t--version\t\tPrint version and exits\n"; "\t--help\t\t\t\tShow help\n"
"\t--version\t\t\tPrint version and exits\n";
char* version = DS_Version(); char* version = DS_Version();
std::cerr << "DeepSpeech " << version << "\n"; std::cerr << "DeepSpeech " << version << "\n";
DS_FreeString(version); DS_FreeString(version);
@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
{"t", no_argument, nullptr, 't'}, {"t", no_argument, nullptr, 't'},
{"extended", no_argument, nullptr, 'e'}, {"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'}, {"json", no_argument, nullptr, 'j'},
{"candidate_transcripts", required_argument, nullptr, 150},
{"stream", required_argument, nullptr, 's'}, {"stream", required_argument, nullptr, 's'},
{"version", no_argument, nullptr, 'v'}, {"version", no_argument, nullptr, 'v'},
{"help", no_argument, nullptr, 'h'}, {"help", no_argument, nullptr, 'h'},
@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
json_output = true; json_output = true;
break; break;
case 150:
json_candidate_transcripts = atoi(optarg);
break;
case 's': case 's':
stream_size = atoi(optarg); stream_size = atoi(optarg);
break; break;

View File

@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
{ {
std::string retval = ""; std::string retval = "";
for (int i = 0; i < transcript->num_tokens; i++) { for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i]; const TokenMetadata& token = transcript->tokens[i];
retval += token.text; retval += token.text;
} }
return strdup(retval.c_str()); return strdup(retval.c_str());
@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
// Loop through each token // Loop through each token
for (int i = 0; i < transcript->num_tokens; i++) { for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i]; const TokenMetadata& token = transcript->tokens[i];
// Append token to word if it's not a space // Append token to word if it's not a space
if (strcmp(token.text, u8" ") != 0) { if (strcmp(token.text, u8" ") != 0) {
@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
res.string = CandidateTranscriptToString(&result->transcripts[0]); res.string = CandidateTranscriptToString(&result->transcripts[0]);
DS_FreeMetadata(result); DS_FreeMetadata(result);
} else if (json_output) { } else if (json_output) {
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
res.string = MetadataToJSON(result); res.string = MetadataToJSON(result);
DS_FreeMetadata(result); DS_FreeMetadata(result);
} else if (stream_size > 0) { } else if (stream_size > 0) {

View File

@ -60,7 +60,7 @@ public:
int time_dim, int time_dim,
int class_dim); int class_dim);
/* Get transcription from current decoder state /* Get up to num_results transcriptions from current decoder state.
* *
* Parameters: * Parameters:
* num_results: Number of beams to return. * num_results: Number of beams to return.

View File

@ -27,7 +27,7 @@ typedef struct TokenMetadata {
char* text; char* text;
/** Position of the token in units of 20ms */ /** Position of the token in units of 20ms */
int timestep; unsigned int timestep;
/** Position of the token in seconds */ /** Position of the token in seconds */
float start_time; float start_time;
@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
/** Array of TokenMetadata objects */ /** Array of TokenMetadata objects */
TokenMetadata* tokens; TokenMetadata* tokens;
/** Size of the tokens array */ /** Size of the tokens array */
int num_tokens; unsigned int num_tokens;
/** Approximated confidence value for this transcript. This is roughly the /** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that * sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript. * contributed to the creation of this transcript.
@ -56,7 +56,7 @@ typedef struct Metadata {
/** Array of CandidateTranscript objects */ /** Array of CandidateTranscript objects */
CandidateTranscript* transcripts; CandidateTranscript* transcripts;
/** Size of the transcripts array */ /** Size of the transcripts array */
int num_transcripts; unsigned int num_transcripts;
} Metadata; } Metadata;
enum DeepSpeech_Error_Codes enum DeepSpeech_Error_Codes
@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
float aBeta); float aBeta);
/** /**
* @brief Use the DeepSpeech model to perform Speech-To-Text. * @brief Use the DeepSpeech model to convert speech to text.
* *
* @param aCtx The ModelState pointer for the model to use. * @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize); unsigned int aBufferSize);
/** /**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results * @brief Use the DeepSpeech model to convert speech to text and output results
* including metadata. * including metadata.
* *
* @param aCtx The ModelState pointer for the model to use. * @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on). * sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal. * @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
* *
* @return Metadata struct containing multiple candidate transcripts. Each transcript * @return Metadata struct containing multiple CandidateTranscript structs. Each
* has per-token metadata including timing information. The user is * transcript has per-token metadata including timing information. The
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error. * Returns NULL on error.
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT

View File

@ -66,7 +66,7 @@ struct ModelState {
* @brief Return character-level metadata including letter timings. * @brief Return character-level metadata including letter timings.
* *
* @param state Decoder state to use when decoding. * @param state Decoder state to use when decoding.
* @param num_results Number of candidate results to return. * @param num_results Maximum number of candidate results to return.
* *
* @return A Metadata struct containing CandidateTranscript structs. * @return A Metadata struct containing CandidateTranscript structs.
* Each represents a candidate transcript, with the first ranked most probable. * Each represents a candidate transcript, with the first ranked most probable.