Address review comments

Reuben Morais 2020-03-17 14:47:18 +01:00
parent e9ae38bf47
commit 2ec34d5a06
9 changed files with 59 additions and 38 deletions

View File

@@ -31,20 +31,20 @@ ErrorCodes
Metadata
--------
.. doxygenstruct:: DeepSpeechClient::Models::Metadata
.. doxygenclass:: DeepSpeechClient::Models::Metadata
:project: deepspeech-dotnet
:members: Transcripts
CandidateTranscript
-------------------
.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript
.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
:project: deepspeech-dotnet
:members: Tokens, Confidence
TokenMetadata
-------------
.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata
.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
:project: deepspeech-dotnet
:members: Text, Timestep, StartTime

View File

@@ -13,11 +13,17 @@ Metadata
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
:project: deepspeech-java
:members: getItems, getNum_items, getProbability, getItem
:members: getTranscripts, getNum_transcripts, getTranscript
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
:project: deepspeech-java
:members: getCharacter, getTimestep, getStart_time
:members: getTokens, getNum_tokens, getConfidence, getToken
TokenMetadata
-------------
.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
:project: deepspeech-java
:members: getText, getTimestep, getStart_time

View File

@@ -8,9 +8,16 @@ Metadata
:project: deepspeech-c
:members:
MetadataItem
------------
CandidateTranscript
-------------------
.. doxygenstruct:: MetadataItem
.. doxygenstruct:: CandidateTranscript
:project: deepspeech-c
:members:
TokenMetadata
-------------
.. doxygenstruct:: TokenMetadata
:project: deepspeech-c
:members:

View File

@@ -790,7 +790,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@@ -34,6 +34,8 @@ bool extended_metadata = false;
bool json_output = false;
int json_candidate_transcripts = 3;
int stream_size = 0;
void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
"\n"
"Running DeepSpeech inference.\n"
"\n"
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\tOutput string from extended metadata\n"
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\tShow help\n"
"\t--version\t\tPrint version and exits\n";
"\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
"\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\t\tOutput string from extended metadata\n"
"\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
"\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
"\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\t\tShow help\n"
"\t--version\t\t\tPrint version and exits\n";
char* version = DS_Version();
std::cerr << "DeepSpeech " << version << "\n";
DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
{"t", no_argument, nullptr, 't'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"candidate_transcripts", required_argument, nullptr, 150},
{"stream", required_argument, nullptr, 's'},
{"version", no_argument, nullptr, 'v'},
{"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
json_output = true;
break;
case 150:
json_candidate_transcripts = atoi(optarg);
break;
case 's':
stream_size = atoi(optarg);
break;
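
The new --candidate_transcripts option is wired in with the getopt_long pattern already used by this client: since it has no single-character alias, it is registered with required_argument and an arbitrary sentinel value (150, chosen so it cannot collide with any short option character) and is then recognized by that value in the switch. A minimal self-contained sketch of that pattern, assuming only <getopt.h>; ParseArgs and long_opts are illustrative names, not the client's real ones:

#include <getopt.h>
#include <cstdlib>

// Default of 3 candidate transcripts, matching the value introduced above.
static int json_candidate_transcripts = 3;

static bool ParseArgs(int argc, char** argv)
{
  static const struct option long_opts[] = {
    {"candidate_transcripts", required_argument, nullptr, 150},
    {nullptr, 0, nullptr, 0}  // terminator required by getopt_long
  };

  int opt;
  while ((opt = getopt_long(argc, argv, "", long_opts, nullptr)) != -1) {
    switch (opt) {
      case 150:
        // getopt_long places the option's argument in the global optarg
        json_candidate_transcripts = atoi(optarg);
        break;
      default:
        return false;
    }
  }
  return true;
}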

View File

@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
{
std::string retval = "";
for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i];
const TokenMetadata& token = transcript->tokens[i];
retval += token.text;
}
return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
// Loop through each token
for (int i = 0; i < transcript->num_tokens; i++) {
TokenMetadata token = transcript->tokens[i];
const TokenMetadata& token = transcript->tokens[i];
// Append token to word if it's not a space
if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
res.string = CandidateTranscriptToString(&result->transcripts[0]);
DS_FreeMetadata(result);
} else if (json_output) {
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
res.string = MetadataToJSON(result);
DS_FreeMetadata(result);
} else if (stream_size > 0) {
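
Two review fixes land in this file: the token loops now bind a const TokenMetadata& instead of copying the struct on every iteration, and the previously hard-coded 3 passed to DS_SpeechToTextWithMetadata is replaced by the new json_candidate_transcripts setting. A short sketch of the reference-binding idiom, assuming only the structs declared in deepspeech.h; the helper name ConcatTokens is hypothetical:

#include <string>
#include "deepspeech.h"

// Concatenate the text of every token in a transcript without copying
// each TokenMetadata struct along the way.
static std::string ConcatTokens(const CandidateTranscript* transcript)
{
  std::string text;
  for (unsigned int i = 0; i < transcript->num_tokens; ++i) {
    const TokenMetadata& token = transcript->tokens[i];  // reference, no copy
    text += token.text;
  }
  return text;
}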

View File

@@ -60,7 +60,7 @@ public:
int time_dim,
int class_dim);
/* Get transcription from current decoder state
/* Get up to num_results transcriptions from current decoder state.
*
* Parameters:
* num_results: Number of beams to return.

View File

@@ -27,7 +27,7 @@ typedef struct TokenMetadata {
char* text;
/** Position of the token in units of 20ms */
int timestep;
unsigned int timestep;
/** Position of the token in seconds */
float start_time;
@@ -41,7 +41,7 @@ typedef struct CandidateTranscript {
/** Array of TokenMetadata objects */
TokenMetadata* tokens;
/** Size of the tokens array */
int num_tokens;
unsigned int num_tokens;
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
@@ -56,7 +56,7 @@ typedef struct Metadata {
/** Array of CandidateTranscript objects */
CandidateTranscript* transcripts;
/** Size of the transcripts array */
int num_transcripts;
unsigned int num_transcripts;
} Metadata;
enum DeepSpeech_Error_Codes
@@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
float aBeta);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
* @brief Use the DeepSpeech model to convert speech to text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output results
* @brief Use the DeepSpeech model to convert speech to text and output results
* including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this.
* @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
* has per-token metadata including timing information. The user is
* responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* @return Metadata struct containing multiple CandidateTranscript structs. Each
* transcript has per-token metadata including timing information. The
* user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
* Returns NULL on error.
*/
DEEPSPEECH_EXPORT
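
For reference, a hedged end-to-end sketch of the metadata API as documented above: request up to N CandidateTranscript structs, iterate with the now-unsigned counters, and release the result with DS_FreeMetadata. The function name PrintCandidates and the assumption that ctx, buffer and buffer_size are already prepared are illustrative, not taken from the repository:

#include <cstdio>
#include "deepspeech.h"

static void PrintCandidates(ModelState* ctx, const short* buffer, unsigned int buffer_size)
{
  // Ask for at most 3 candidate transcripts; fewer may come back.
  Metadata* metadata = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
  if (metadata == nullptr) {
    return;  // documented to return NULL on error
  }
  for (unsigned int i = 0; i < metadata->num_transcripts; ++i) {
    const CandidateTranscript& transcript = metadata->transcripts[i];
    printf("candidate %u (confidence %.3f): ", i, transcript.confidence);
    for (unsigned int j = 0; j < transcript.num_tokens; ++j) {
      printf("%s", transcript.tokens[j].text);  // space characters arrive as their own tokens
    }
    printf("\n");
  }
  DS_FreeMetadata(metadata);  // the caller owns the Metadata struct
}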

View File

@@ -66,7 +66,7 @@ struct ModelState {
* @brief Return character-level metadata including letter timings.
*
* @param state Decoder state to use when decoding.
* @param num_results Number of candidate results to return.
* @param num_results Maximum number of candidate results to return.
*
* @return A Metadata struct containing CandidateTranscript structs.
* Each represents a candidate transcript, with the first ranked most probable.
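
A small hedged sketch of the contract described here, reusing the public Metadata struct: candidates are ordered by rank, so index 0 is the most probable, and num_transcripts may be smaller than the maximum that was requested. BestCandidate is a hypothetical helper, not part of the codebase:

#include "deepspeech.h"

// Return the highest-ranked candidate, or nullptr if the struct is empty.
static const CandidateTranscript* BestCandidate(const Metadata* metadata)
{
  if (metadata == nullptr || metadata->num_transcripts == 0) {
    return nullptr;
  }
  return &metadata->transcripts[0];  // transcripts are ranked, best first
}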