From 2ec34d5a067334a84b323328c149bd9752008059 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 17 Mar 2020 14:47:18 +0100
Subject: [PATCH] Address review comments

---
 doc/DotNet-API.rst                      |  6 ++--
 doc/Java-API.rst                        | 16 +++++++---
 doc/Structs.rst                         | 13 ++++++--
 doc/doxygen-dotnet.conf                 |  2 +-
 native_client/args.h                    | 32 ++++++++++++-------
 native_client/client.cc                 |  6 ++--
 .../ctcdecode/ctc_beam_search_decoder.h |  2 +-
 native_client/deepspeech.h              | 18 +++++------
 native_client/modelstate.h              |  2 +-
 9 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst
index d43c7afb..b4f85dfc 100644
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@@ -31,20 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Models::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
    :members: Transcripts
 
 CandidateTranscript
 -------------------
 
-.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
    :members: Tokens, Confidence
 
 TokenMetadata
 -------------
 
-.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
    :project: deepspeech-dotnet
    :members: Text, Timestep, StartTime
diff --git a/doc/Java-API.rst b/doc/Java-API.rst
index a485dc02..2986ca97 100644
--- a/doc/Java-API.rst
+++ b/doc/Java-API.rst
@@ -13,11 +13,17 @@ Metadata
 
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time
diff --git a/doc/Structs.rst b/doc/Structs.rst
index 713e52e0..5d532277 100644
--- a/doc/Structs.rst
+++ b/doc/Structs.rst
@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
+   :project: deepspeech-c
+   :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
    :project: deepspeech-c
    :members:
diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf
index ad64cfcb..74c2c5bb 100644
--- a/doc/doxygen-dotnet.conf
+++ b/doc/doxygen-dotnet.conf
@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/native_client/args.h b/native_client/args.h
index 33b9b8fe..ca28bfb7 100644
--- a/native_client/args.h
+++ b/native_client/args.h
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
     "\n"
     "Running DeepSpeech inference.\n"
     "\n"
-    "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-    "\t--scorer SCORER\t\tPath to the external scorer file\n"
-    "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-    "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-    "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-    "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-    "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-    "\t--extended\t\tOutput string from extended metadata\n"
-    "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-    "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-    "\t--help\t\t\tShow help\n"
-    "\t--version\t\tPrint version and exits\n";
+    "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+    "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+    "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+    "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+    "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+    "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+    "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+    "\t--extended\t\t\tOutput string from extended metadata\n"
+    "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+    "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+    "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+    "\t--help\t\t\t\tShow help\n"
+    "\t--version\t\t\tPrint version and exits\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
diff --git a/native_client/client.cc b/native_client/client.cc
index 9ab47f27..f108419b 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
     retval += token.text;
   }
   return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 
   // Loop through each token
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
 
     // Append token to word if it's not a space
     if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
     res.string = CandidateTranscriptToString(&result->transcripts[0]);
     DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
     res.string = MetadataToJSON(result);
     DS_FreeMetadata(result);
   } else if (stream_size > 0) {
diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h
index 78871b2a..b785e097 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.h
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.h
@@ -60,7 +60,7 @@ public:
             int time_dim,
             int class_dim);
 
-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
    *
    * Parameters:
    *     num_results: Number of beams to return.
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index bf4c0f00..6fb9645c 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -27,7 +27,7 @@ typedef struct TokenMetadata {
   char* text;
 
   /** Position of the token in units of 20ms */
-  int timestep;
+  unsigned int timestep;
 
   /** Position of the token in seconds */
   float start_time;
@@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
   /** Array of TokenMetadata objects */
   TokenMetadata* tokens;
   /** Size of the tokens array */
-  int num_tokens;
+  unsigned int num_tokens;
   /** Approximated confidence value for this transcript. This is roughly the
    * sum of the acoustic model logit values for each timestep/character that
    * contributed to the creation of this transcript.
@@ -56,7 +56,7 @@ typedef struct Metadata {
   /** Array of CandidateTranscript objects */
   CandidateTranscript* transcripts;
   /** Size of the transcripts array */
-  int num_transcripts;
+  unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
@@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output results
+ * @brief Use the DeepSpeech model to convert speech to text and output results
  *        including metadata.
  *
  * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
 *
- * @return Metadata struct containing multiple candidate transcripts. Each transcript
- *         has per-token metadata including timing information. The user is
- *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
 *         Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 43eef970..0dbe108a 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,7 +66,7 @@ struct ModelState {
    * @brief Return character-level metadata including letter timings.
    *
    * @param state Decoder state to use when decoding.
-   * @param num_results Number of candidate results to return.
+   * @param num_results Maximum number of candidate results to return.
    *
    * @return A Metadata struct containing CandidateTranscript structs.
    * Each represents an candidate transcript, with the first ranked most probable.
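
Illustrative note (not part of the patch): the sketch below shows how a caller might walk the renamed Metadata -> CandidateTranscript -> TokenMetadata hierarchy returned by DS_SpeechToTextWithMetadata() with an explicit aNumResults, mirroring what client.cc does after this change. The struct fields, DS_SpeechToTextWithMetadata and DS_FreeMetadata come straight from this diff; the DS_CreateModel(path, &state) and DS_FreeModel calls, the model path and the silent dummy buffer are assumptions made only to keep the example self-contained.

// usage_sketch.cc -- illustrative only, not part of the patch.
#include <cstdio>
#include <vector>

#include "deepspeech.h"

int main()
{
  // Assumed model-loading signature for this era of the API;
  // "output_graph.pbmm" is a placeholder path.
  ModelState* ctx = nullptr;
  if (DS_CreateModel("output_graph.pbmm", &ctx) != 0) {
    std::fprintf(stderr, "Could not create model\n");
    return 1;
  }

  // Real callers pass 16-bit mono audio at the model sample rate; one second
  // of silence keeps the sketch self-contained.
  std::vector<short> audio(16000, 0);

  // Ask for up to 3 candidates, matching the new json_candidate_transcripts
  // default introduced in args.h.
  Metadata* result = DS_SpeechToTextWithMetadata(ctx, audio.data(),
                                                 static_cast<unsigned int>(audio.size()), 3);
  if (result != nullptr) {
    for (unsigned int i = 0; i < result->num_transcripts; ++i) {
      const CandidateTranscript& transcript = result->transcripts[i];
      std::printf("candidate %u (confidence %f): ", i, transcript.confidence);
      for (unsigned int j = 0; j < transcript.num_tokens; ++j) {
        // transcript.tokens[j].timestep and .start_time carry the timing info.
        std::printf("%s", transcript.tokens[j].text);
      }
      std::printf("\n");
    }
    DS_FreeMetadata(result);
  }

  DS_FreeModel(ctx);
  return 0;
}

On the command-line side, the same knob would be exercised through the new flag, along the lines of ./deepspeech --model output_graph.pbmm --audio audio.wav --json --candidate_transcripts 5 (binary and file names are placeholders). Without --json the flag has no effect, since json_candidate_transcripts is only read on the DS_SpeechToTextWithMetadata path shown in client.cc.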