From 2ec34d5a067334a84b323328c149bd9752008059 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 17 Mar 2020 14:47:18 +0100
Subject: [PATCH] Address review comments

---
 doc/DotNet-API.rst                      |  6 ++--
 doc/Java-API.rst                        | 16 +++++++---
 doc/Structs.rst                         | 13 ++++++--
 doc/doxygen-dotnet.conf                 |  2 +-
 native_client/args.h                    | 32 ++++++++++++-------
 native_client/client.cc                 |  6 ++--
 .../ctcdecode/ctc_beam_search_decoder.h |  2 +-
 native_client/deepspeech.h              | 18 +++++------
 native_client/modelstate.h              |  2 +-
 9 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst
index d43c7afb..b4f85dfc 100644
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@@ -31,20 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Models::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
    :members: Transcripts
 
 CandidateTranscript
 -------------------
 
-.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
    :members: Tokens, Confidence
 
 TokenMetadata
 -------------
 
-.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
    :project: deepspeech-dotnet
    :members: Text, Timestep, StartTime
diff --git a/doc/Java-API.rst b/doc/Java-API.rst
index a485dc02..2986ca97 100644
--- a/doc/Java-API.rst
+++ b/doc/Java-API.rst
@@ -13,11 +13,17 @@ Metadata
 
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time
diff --git a/doc/Structs.rst b/doc/Structs.rst
index 713e52e0..5d532277 100644
--- a/doc/Structs.rst
+++ b/doc/Structs.rst
@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
+   :project: deepspeech-c
+   :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
    :project: deepspeech-c
    :members:
diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf
index ad64cfcb..74c2c5bb 100644
--- a/doc/doxygen-dotnet.conf
+++ b/doc/doxygen-dotnet.conf
@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/native_client/args.h b/native_client/args.h
index 33b9b8fe..ca28bfb7 100644
--- a/native_client/args.h
+++ b/native_client/args.h
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
     "\n"
     "Running DeepSpeech inference.\n"
     "\n"
-    "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-    "\t--scorer SCORER\t\tPath to the external scorer file\n"
-    "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-    "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-    "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-    "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-    "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-    "\t--extended\t\tOutput string from extended metadata\n"
-    "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-    "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-    "\t--help\t\t\tShow help\n"
-    "\t--version\t\tPrint version and exits\n";
+    "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+    "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+    "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+    "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+    "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+    "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+    "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+    "\t--extended\t\t\tOutput string from extended metadata\n"
+    "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+    "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+    "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+    "\t--help\t\t\t\tShow help\n"
+    "\t--version\t\t\tPrint version and exits\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
diff --git a/native_client/client.cc b/native_client/client.cc
index 9ab47f27..f108419b 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
     retval += token.text;
   }
   return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 
   // Loop through each token
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
 
     // Append token to word if it's not a space
     if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
     res.string = CandidateTranscriptToString(&result->transcripts[0]);
     DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
     res.string = MetadataToJSON(result);
     DS_FreeMetadata(result);
   } else if (stream_size > 0) {
diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h
index 78871b2a..b785e097 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.h
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.h
@@ -60,7 +60,7 @@ public:
             int time_dim,
             int class_dim);
 
-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
    *
    * Parameters:
    *     num_results: Number of beams to return.
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index bf4c0f00..6fb9645c 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -27,7 +27,7 @@ typedef struct TokenMetadata {
   char* text;
 
   /** Position of the token in units of 20ms */
-  int timestep;
+  unsigned int timestep;
 
   /** Position of the token in seconds */
   float start_time;
@@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
   /** Array of TokenMetadata objects */
   TokenMetadata* tokens;
   /** Size of the tokens array */
-  int num_tokens;
+  unsigned int num_tokens;
   /** Approximated confidence value for this transcript. This is roughly the
    * sum of the acoustic model logit values for each timestep/character that
    * contributed to the creation of this transcript.
@@ -56,7 +56,7 @@ typedef struct Metadata {
   /** Array of CandidateTranscript objects */
   CandidateTranscript* transcripts;
   /** Size of the transcripts array */
-  int num_transcripts;
+  unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
@@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output results
+ * @brief Use the DeepSpeech model to convert speech to text and output results
  *        including metadata.
  *
  * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
 *
- * @return Metadata struct containing multiple candidate transcripts. Each transcript
- *         has per-token metadata including timing information. The user is
- *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
 *         Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 43eef970..0dbe108a 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,7 +66,7 @@ struct ModelState {
    * @brief Return character-level metadata including letter timings.
    *
    * @param state Decoder state to use when decoding.
-   * @param num_results Number of candidate results to return.
+   * @param num_results Maximum number of candidate results to return.
    *
    * @return A Metadata struct containing CandidateTranscript structs.
    * Each represents an candidate transcript, with the first ranked most probable.
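
Illustrative note (not part of the patch): the sketch below shows how a caller might walk the renamed Metadata -> CandidateTranscript -> TokenMetadata hierarchy returned by DS_SpeechToTextWithMetadata() with an explicit aNumResults, mirroring what client.cc does after this change. The struct fields, DS_SpeechToTextWithMetadata and DS_FreeMetadata come straight from this diff; the DS_CreateModel(path, &state) and DS_FreeModel calls, the model path and the silent dummy buffer are assumptions made only to keep the example self-contained.

// usage_sketch.cc -- illustrative only, not part of the patch.
#include <cstdio>
#include <vector>

#include "deepspeech.h"

int main()
{
  // Assumed model-loading signature for this era of the API;
  // "output_graph.pbmm" is a placeholder path.
  ModelState* ctx = nullptr;
  if (DS_CreateModel("output_graph.pbmm", &ctx) != 0) {
    std::fprintf(stderr, "Could not create model\n");
    return 1;
  }

  // Real callers pass 16-bit mono audio at the model sample rate; one second
  // of silence keeps the sketch self-contained.
  std::vector<short> audio(16000, 0);

  // Ask for up to 3 candidates, matching the new json_candidate_transcripts
  // default introduced in args.h.
  Metadata* result = DS_SpeechToTextWithMetadata(ctx, audio.data(),
                                                 static_cast<unsigned int>(audio.size()), 3);
  if (result != nullptr) {
    for (unsigned int i = 0; i < result->num_transcripts; ++i) {
      const CandidateTranscript& transcript = result->transcripts[i];
      std::printf("candidate %u (confidence %f): ", i, transcript.confidence);
      for (unsigned int j = 0; j < transcript.num_tokens; ++j) {
        // transcript.tokens[j].timestep and .start_time carry the timing info.
        std::printf("%s", transcript.tokens[j].text);
      }
      std::printf("\n");
    }
    DS_FreeMetadata(result);
  }

  DS_FreeModel(ctx);
  return 0;
}

On the command-line side, the same knob would be exercised through the new flag, along the lines of ./deepspeech --model output_graph.pbmm --audio audio.wav --json --candidate_transcripts 5 (binary and file names are placeholders). Without --json the flag has no effect, since json_candidate_transcripts is only read on the DS_SpeechToTextWithMetadata path shown in client.cc.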