Merge pull request #2792 from reuben/multiple_transcriptions

Expose multiple transcriptions in "WithMetadata" API

commit 903d0b8fe4
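For context, the updated batch entry point can be exercised like this — a minimal sketch against the C API as changed below; model loading, audio decoding and error handling are elided and the buffer setup is illustrative:

    #include <cstdio>
    #include "deepspeech.h"

    // Sketch: request up to 3 candidate transcripts for one buffer of audio.
    // `ctx`, `buffer` and `buffer_size` are assumed to be set up as usual.
    void print_candidates(ModelState* ctx, const short* buffer, unsigned int buffer_size)
    {
      Metadata* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
      if (!result) return;
      for (unsigned int i = 0; i < result->num_transcripts; ++i) {
        const CandidateTranscript* t = &result->transcripts[i];
        printf("candidate %u (confidence %f): ", i, t->confidence);
        for (unsigned int j = 0; j < t->num_tokens; ++j) {
          printf("%s", t->tokens[j].text);
        }
        printf("\n");
      }
      DS_FreeMetadata(result);  // caller frees the whole tree in one call
    }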
@@ -34,6 +34,9 @@ C
 .. doxygenfunction:: DS_IntermediateDecode
    :project: deepspeech-c
 
+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+   :project: deepspeech-c
+
 .. doxygenfunction:: DS_FinishStream
    :project: deepspeech-c
 
@@ -31,13 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
-   :members: items, num_items, confidence
+   :members: Transcripts
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
-   :members: character, timestep, start_time
+   :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
+   :project: deepspeech-dotnet
+   :members: Text, Timestep, StartTime
@@ -13,11 +13,17 @@ Metadata
 
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time
@@ -30,8 +30,14 @@ Metadata
 .. js:autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
    :members:
@@ -21,8 +21,14 @@ Metadata
 .. autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
    :members:
@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
+   :project: deepspeech-c
+   :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
    :project: deepspeech-c
    :members:
@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
    "\n"
    "Running DeepSpeech inference.\n"
    "\n"
-   "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-   "\t--scorer SCORER\t\tPath to the external scorer file\n"
-   "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-   "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-   "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-   "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-   "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-   "\t--extended\t\tOutput string from extended metadata\n"
-   "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-   "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-   "\t--help\t\t\tShow help\n"
-   "\t--version\t\tPrint version and exits\n";
+   "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+   "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+   "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+   "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+   "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+   "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+   "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+   "\t--extended\t\t\tOutput string from extended metadata\n"
+   "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+   "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+   "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+   "\t--help\t\t\t\tShow help\n"
+   "\t--version\t\t\tPrint version and exits\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
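The new long option reuses getopt_long's convention of returning the option's val field: 150 is simply an id above the short-option ASCII range, which is why it reappears as a case label above. A self-contained sketch of that pattern (the option table and default value here are illustrative):

    #include <cstdio>
    #include <cstdlib>
    #include <getopt.h>

    int main(int argc, char** argv)
    {
      int candidates = 3;  // default, mirroring json_candidate_transcripts above
      struct option opts[] = {
        // val = 150: no short-option letter exists, so pick an id outside ASCII
        {"candidate_transcripts", required_argument, nullptr, 150},
        {nullptr, 0, nullptr, 0},
      };
      int c;
      while ((c = getopt_long(argc, argv, "", opts, nullptr)) != -1) {
        if (c == 150) {
          candidates = atoi(optarg);  // same conversion the client uses
        }
      }
      printf("candidate_transcripts = %d\n", candidates);
      return 0;
    }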
@@ -44,9 +44,115 @@ struct meta_word {
   float duration;
 };
 
-char* metadataToString(Metadata* metadata);
-std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char*
+CandidateTranscriptToString(const CandidateTranscript* transcript)
+{
+  std::string retval = "";
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+    retval += token.text;
+  }
+  return strdup(retval.c_str());
+}
+
+std::vector<meta_word>
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
+{
+  std::vector<meta_word> word_list;
+
+  std::string word = "";
+  float word_start_time = 0;
+
+  // Loop through each token
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+
+    // Append token to word if it's not a space
+    if (strcmp(token.text, u8" ") != 0) {
+      // Log the start time of the new word
+      if (word.length() == 0) {
+        word_start_time = token.start_time;
+      }
+      word.append(token.text);
+    }
+
+    // Word boundary is either a space or the last token in the array
+    if (strcmp(token.text, u8" ") == 0 || i == transcript->num_tokens-1) {
+      float word_duration = token.start_time - word_start_time;
+
+      if (word_duration < 0) {
+        word_duration = 0;
+      }
+
+      meta_word w;
+      w.word = word;
+      w.start_time = word_start_time;
+      w.duration = word_duration;
+
+      word_list.push_back(w);
+
+      // Reset
+      word = "";
+      word_start_time = 0;
+    }
+  }
+
+  return word_list;
+}
+
+std::string
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = CandidateTranscriptToWords(transcript);
+
+  out_string << R"("metadata":{"confidence":)" << transcript->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
+char*
+MetadataToJSON(Metadata* result)
+{
+  std::ostringstream out_string;
+  out_string << "{\n";
+
+  for (int j=0; j < result->num_transcripts; ++j) {
+    const CandidateTranscript *transcript = &result->transcripts[j];
+
+    if (j == 0) {
+      out_string << CandidateTranscriptToJSON(transcript);
+
+      if (result->num_transcripts > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
+      }
+    } else {
+      out_string << "{" << CandidateTranscriptToJSON(transcript) << "}";
+
+      if (j < result->num_transcripts - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
+    }
+  }
+
+  out_string << "\n}\n";
+
+  return strdup(out_string.str().c_str());
+}
+
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
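The word-boundary walk in CandidateTranscriptToWords above can be illustrated in isolation. This sketch uses a simplified stand-in token type rather than the library's TokenMetadata, but applies the same boundary rule (a space token or the final token closes the current word):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Tok { std::string text; float start_time; };  // stand-in for TokenMetadata

    int main()
    {
      // "hi yo" as per-character tokens, 20ms apart
      std::vector<Tok> toks = {{"h",0.00f},{"i",0.02f},{" ",0.04f},{"y",0.06f},{"o",0.08f}};
      std::string word;
      float word_start = 0;
      for (size_t i = 0; i < toks.size(); i++) {
        if (toks[i].text != " ") {
          if (word.empty()) word_start = toks[i].start_time;
          word += toks[i].text;
        }
        // a space or the final token closes the current word
        if (toks[i].text == " " || i == toks.size() - 1) {
          printf("word=%s start=%.2f duration=%.2f\n",
                 word.c_str(), word_start, toks[i].start_time - word_start);
          word.clear();
          word_start = 0;
        }
      }
      return 0;
    }
    // prints: word=hi start=0.00 duration=0.04
    //         word=yo start=0.06 duration=0.02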
@@ -57,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = CandidateTranscriptToString(&result->transcripts[0]);
+    DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
+    res.string = MetadataToJSON(result);
+    DS_FreeMetadata(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -278,87 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   }
 }
 
-char*
-metadataToString(Metadata* metadata)
-{
-  std::string retval = "";
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-    retval += item.character;
-  }
-  return strdup(retval.c_str());
-}
-
-std::vector<meta_word>
-WordsFromMetadata(Metadata* metadata)
-{
-  std::vector<meta_word> word_list;
-
-  std::string word = "";
-  float word_start_time = 0;
-
-  // Loop through each character
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-
-    // Append character to word if it's not a space
-    if (strcmp(item.character, u8" ") != 0) {
-      // Log the start time of the new word
-      if (word.length() == 0) {
-        word_start_time = item.start_time;
-      }
-      word.append(item.character);
-    }
-
-    // Word boundary is either a space or the last character in the array
-    if (strcmp(item.character, " ") == 0
-        || strcmp(item.character, u8" ") == 0
-        || i == metadata->num_items-1) {
-
-      float word_duration = item.start_time - word_start_time;
-
-      if (word_duration < 0) {
-        word_duration = 0;
-      }
-
-      meta_word w;
-      w.word = word;
-      w.start_time = word_start_time;
-      w.duration = word_duration;
-
-      word_list.push_back(w);
-
-      // Reset
-      word = "";
-      word_start_time = 0;
-    }
-  }
-
-  return word_list;
-}
-
-char*
-JSONOutput(Metadata* metadata)
-{
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
-  std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
-
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-    if (i < words.size() - 1) {
-      out_string << ",";
-    }
-  }
-
-  out_string << "]}\n";
-
-  return strdup(out_string.str().c_str());
-}
-
 int
 main(int argc, char **argv)
 {
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,16 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
-  size_t num_returned = std::min(num_prefixes, top_paths);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);
 
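The change above drops the hardcoded top_paths of 1 in favor of the caller-supplied num_results, so std::partial_sort now only orders as many prefixes as will actually be returned. The pattern in isolation (scores here are made up):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<float> scores = {0.1f, 0.9f, 0.4f, 0.7f, 0.2f};
      size_t num_results = 3;
      size_t num_returned = std::min(scores.size(), num_results);
      // Order only the first num_returned elements; the rest stay unspecified.
      std::partial_sort(scores.begin(), scores.begin() + num_returned, scores.end(),
                        [](float a, float b) { return a > b; });  // best first
      for (size_t i = 0; i < num_returned; ++i) {
        printf("beam %zu: %.1f\n", i, scores[i]);  // prints 0.9, 0.7, 0.4
      }
      return 0;
    }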
@@ -60,13 +60,16 @@ public:
                int time_dim,
                int class_dim);
 
-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
+   *
+   * Parameters:
+   *     num_results: Number of beams to return.
    *
    * Return:
    *     A vector where each element is a pair of score and decoding result,
    *     in descending order.
    */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };
 
@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.
 
-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -78,9 +78,10 @@ struct StreamingState {
 
   void feedAudioContent(const short* buffer, unsigned int buffer_size);
   char* intermediateDecode() const;
+  Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Metadata* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const
   return model_->decode(decoder_state_);
 }
 
+Metadata*
+StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const
+{
+  return model_->decode_metadata(decoder_state_, num_results);
+}
+
 char*
 StreamingState::finishStream()
 {
@@ -144,10 +151,10 @@ StreamingState::finishStream()
 }
 
 Metadata*
-StreamingState::finishStreamWithMetadata()
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+  return model_->decode_metadata(decoder_state_, num_results);
 }
 
 void
@@ -402,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx)
   return aSctx->intermediateDecode();
 }
 
+Metadata*
+DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                  unsigned int aNumResults)
+{
+  return aSctx->intermediateDecodeWithMetadata(aNumResults);
+}
+
 char*
 DS_FinishStream(StreamingState* aSctx)
 {
@@ -411,11 +425,12 @@ DS_FinishStream(StreamingState* aSctx)
 }
 
 Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int aNumResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }
 
 StreamingState*
@@ -444,10 +459,11 @@ DS_SpeechToText(ModelState* aCtx,
 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }
 
 void
@@ -460,11 +476,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free((void*)m->transcripts[i].tokens[j].text);
+      }
+
+      free((void*)m->transcripts[i].tokens);
     }
-    delete[] m->items;
-    delete m;
+
+    free((void*)m->transcripts);
+    free(m);
   }
 }
 
@@ -20,32 +20,43 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;
 
 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
  */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  const char* const text;
 
-  /** Position of the character in units of 20ms */
-  int timestep;
+  /** Position of the token in units of 20ms */
+  const unsigned int timestep;
 
-  /** Position of the character in seconds */
-  float start_time;
-} MetadataItem;
+  /** Position of the token in seconds */
+  const float start_time;
+} TokenMetadata;
 
 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
+ */
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  const TokenMetadata* const tokens;
+  /** Size of the tokens array */
+  const unsigned int num_tokens;
+  /** Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  const double confidence;
+} CandidateTranscript;
+
+/**
+ * @brief An array of CandidateTranscript objects computed by the model.
  */
 typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
-  /** Approximated confidence value for this transcription. This is roughly the
-   * sum of the acoustic model logit values for each timestep/character that
-   * contributed to the creation of this transcription.
-   */
-  double confidence;
+  /** Array of CandidateTranscript objects */
+  const CandidateTranscript* const transcripts;
+  /** Size of the transcripts array */
+  const unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
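With the new layout, a Metadata owns an array of CandidateTranscript structs, each of which owns an array of TokenMetadata; the whole tree is released by a single DS_FreeMetadata call. A small read-only traversal sketch (assuming a result obtained from one of the WithMetadata calls):

    #include "deepspeech.h"

    // Sketch: pick the most confident candidate. The decoder already returns
    // transcripts in descending score order, so this should normally be index 0;
    // the loop just makes the invariant explicit.
    const CandidateTranscript* best_transcript(const Metadata* m)
    {
      const CandidateTranscript* best = nullptr;
      for (unsigned int i = 0; i < m->num_transcripts; ++i) {
        if (!best || m->transcripts[i].confidence > best->confidence) {
          best = &m->transcripts[i];
        }
      }
      return best;  // borrowed pointer: freed together with m via DS_FreeMetadata
    }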
@@ -164,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -180,21 +191,25 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- *        about the results.
+ * @brief Use the DeepSpeech model to convert speech to text and output results
+ *        including metadata.
  *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
  *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
  */
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize);
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -236,8 +251,24 @@ DEEPSPEECH_EXPORT
 char* DS_IntermediateDecode(const StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns the STT result over the whole audio signal.
+ * @brief Compute the intermediate decoding of an ongoing streaming inference,
+ *        return results including metadata.
+ *
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
+ *
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
+/**
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        the result. Signals the end of an ongoing streaming inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
@@ -250,18 +281,23 @@ DEEPSPEECH_EXPORT
 char* DS_FinishStream(StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        results including metadata. Signals the end of an ongoing streaming
+ *        inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
  *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
  *
  * @note This method will free the state pointer (@p aSctx).
  */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
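Putting the new streaming signatures together, a minimal sketch — chunked feeding and status checks are elided, and the result counts are arbitrary:

    #include <cstdio>
    #include "deepspeech.h"

    void stream_with_candidates(ModelState* model, const short* audio, unsigned int len)
    {
      StreamingState* sctx;
      if (DS_CreateStream(model, &sctx) != 0) return;

      DS_FeedAudioContent(sctx, audio, len);  // in practice, fed chunk by chunk

      // Peek at the current best hypotheses without ending the stream.
      Metadata* partial = DS_IntermediateDecodeWithMetadata(sctx, 2);
      DS_FreeMetadata(partial);

      // Finish: also frees sctx, per the @note on DS_FinishStreamWithMetadata.
      Metadata* final_result = DS_FinishStreamWithMetadata(sctx, 5);
      DS_FreeMetadata(final_result);
    }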
@@ -199,13 +199,14 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
         {
-            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
         }
 
         /// <summary>
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
             return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
         }
 
+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+        /// <returns>The STT intermediate result.</returns>
+        public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+        {
+            return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+        }
+
         /// <summary>
         /// Return version of this library. The returned version is a semantic version
         /// (SemVer 2.0.0).
@@ -261,14 +273,15 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata. Returns NULL on error.</returns>
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
         }
 
         #endregion
@@ -50,11 +50,13 @@
     <Compile Include="Extensions\NativeExtensions.cs" />
     <Compile Include="Models\DeepSpeechStream.cs" />
     <Compile Include="Models\Metadata.cs" />
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
     <Compile Include="NativeImp.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="Structs\Metadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
   </ItemGroup>
   <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
         }
 
         /// <summary>
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>TokenMetadata managed object.</returns>
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed CandidateTranscript object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>CandidateTranscript managed object.</returns>
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed Metadata object.
         /// </summary>
         /// <param name="intPtr">Native pointer.</param>
         /// <returns>Metadata managed object.</returns>
         internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
         {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
+
+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
 
             //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
             {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
             }
 
             NativeImp.DS_FreeMetadata(intPtr);
-            return managedMetaObject;
+            return managedMetadata;
         }
     }
 }
@@ -68,13 +68,15 @@ namespace DeepSpeechClient.Interfaces
                                uint aBufferSize);
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                                                 uint aBufferSize);
+                                                 uint aBufferSize,
+                                                 uint aNumResults);
 
         /// <summary>
         /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
         /// <returns>The STT intermediate result.</returns>
         unsafe string IntermediateDecode(DeepSpeechStream stream);
 
+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+        /// <returns>The extended metadata result.</returns>
+        unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
         /// <summary>
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         /// </summary>
@@ -110,10 +120,11 @@ namespace DeepSpeechClient.Interfaces
         unsafe string FinishStream(DeepSpeechStream stream);
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
 }
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    /// <summary>
+    /// Stores the entire CTC output as an array of character metadata objects.
+    /// </summary>
+    public class CandidateTranscript
+    {
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        public double Confidence { get; set; }
+        /// <summary>
+        /// List of metadata tokens containing text, timestep, and time offset.
+        /// </summary>
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
@@ -6,12 +6,8 @@
     public class Metadata
     {
         /// <summary>
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
         /// </summary>
-        public double Confidence { get; set; }
-        /// <summary>
-        /// List of metada items containing char, timespet, and time offset.
-        /// </summary>
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
     }
 }
@@ -3,12 +3,12 @@
     /// <summary>
     /// Stores each individual character, along with its timing information.
     /// </summary>
-    public class MetadataItem
+    public class TokenMetadata
     {
         /// <summary>
         /// Char of the current timestep.
         /// </summary>
-        public string Character;
+        public string Text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>
@@ -55,7 +55,8 @@ namespace DeepSpeechClient
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
         internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
                                                                          short[] aBuffer,
-                                                                         uint aBufferSize);
+                                                                         uint aBufferSize,
+                                                                         uint aNumResults);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
         internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
|
||||||
|
|
||||||
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
|
internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
|
||||||
|
uint aNumResults);
|
||||||
|
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
||||||
CharSet = CharSet.Ansi, SetLastError = true)]
|
CharSet = CharSet.Ansi, SetLastError = true)]
|
||||||
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
|
||||||
|
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
|
||||||
|
uint aNumResults);
|
||||||
#endregion
|
#endregion
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+    [StructLayout(LayoutKind.Sequential)]
+    internal unsafe struct CandidateTranscript
+    {
+        /// <summary>
+        /// Native list of tokens.
+        /// </summary>
+        internal unsafe IntPtr tokens;
+        /// <summary>
+        /// Count of tokens from the native side.
+        /// </summary>
+        internal unsafe int num_tokens;
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        internal unsafe double confidence;
+    }
+}
@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
     internal unsafe struct Metadata
     {
         /// <summary>
-        /// Native list of items.
+        /// Native list of candidate transcripts.
        /// </summary>
-        internal unsafe IntPtr items;
+        internal unsafe IntPtr transcripts;
         /// <summary>
-        /// Count of items from the native side.
+        /// Count of transcripts from the native side.
        /// </summary>
-        internal unsafe int num_items;
-        /// <summary>
-        /// Approximated confidence value for this transcription.
-        /// </summary>
-        internal unsafe double confidence;
+        internal unsafe int num_transcripts;
     }
 }
@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
 namespace DeepSpeechClient.Structs
 {
     [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct MetadataItem
+    internal unsafe struct TokenMetadata
     {
         /// <summary>
-        /// Native character.
+        /// Native text.
         /// </summary>
-        internal unsafe IntPtr character;
+        internal unsafe IntPtr text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>
@@ -21,14 +21,14 @@ namespace CSharpExamples
         static string GetArgument(IEnumerable<string> args, string option)
             => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
 
-        static string MetadataToString(Metadata meta)
+        static string MetadataToString(CandidateTranscript transcript)
         {
             var nl = Environment.NewLine;
             string retval =
-                Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
-                + $"Confidence: {meta?.Confidence} {nl}"
-                + $"Item count: {meta?.Items?.Length} {nl}"
-                + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+                Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+                + $"Confidence: {transcript?.Confidence} {nl}"
+                + $"Item count: {transcript?.Tokens?.Length} {nl}"
+                + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
             return retval;
         }
 
@@ -75,8 +75,8 @@ namespace CSharpExamples
             if (extended)
             {
                 Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
-                    Convert.ToUInt32(waveBuffer.MaxSize / 2));
-                speechResult = MetadataToString(metaResult);
+                    Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+                speechResult = MetadataToString(metaResult.Transcripts[0]);
             }
             else
             {
@@ -6,6 +6,8 @@
 %}
 
 %include "typemaps.i"
+%include "enums.swg"
+%javaconst(1);
 
 %include "arrays_java.i"
 // apply to DS_FeedAudioContent and DS_SpeechToText
@@ -15,21 +17,29 @@
 %pointer_functions(ModelState*, modelstatep);
 %pointer_functions(StreamingState*, streamingstatep);
 
-%typemap(newfree) char* "DS_FreeString($1);";
-
-%include "carrays.i"
-%array_functions(struct MetadataItem, metadataItem_array);
+%extend struct CandidateTranscript {
+  /**
+   * Retrieve one TokenMetadata element
+   *
+   * @param i Array index of the TokenMetadata to get
+   *
+   * @return The TokenMetadata requested or null
+   */
+  const TokenMetadata& getToken(int i) {
+    return self->tokens[i];
+  }
+}
 
 %extend struct Metadata {
   /**
-   * Retrieve one MetadataItem element
+   * Retrieve one CandidateTranscript element
    *
-   * @param i Array index of the MetadataItem to get
+   * @param i Array index of the CandidateTranscript to get
    *
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  MetadataItem getItem(int i) {
-    return metadataItem_array_getitem(self->items, i);
+  const CandidateTranscript& getTranscript(int i) {
+    return self->transcripts[i];
   }
 
   ~Metadata() {
@@ -37,14 +47,18 @@
   }
 }
 
-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
+%typemap(newfree) char* "DS_FreeString($1);";
 %newobject DS_SpeechToText;
 %newobject DS_IntermediateDecode;
 %newobject DS_FinishStream;
+%newobject DS_ErrorCodeToErrorMessage;
 
 %rename ("%(strip:[DS_])s") "";
@@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters;
 import static org.junit.Assert.*;

 import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
-import org.mozilla.deepspeech.libdeepspeech.Metadata;
+import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript;

 import java.io.RandomAccessFile;
 import java.io.FileNotFoundException;

@@ -61,10 +61,10 @@ public class BasicTest {
     m.freeModel();
 }

-private String metadataToString(Metadata m) {
+private String candidateTranscriptToString(CandidateTranscript t) {
     String retval = "";
-    for (int i = 0; i < m.getNum_items(); ++i) {
-        retval += m.getItem(i).getCharacter();
+    for (int i = 0; i < t.getNum_tokens(); ++i) {
+        retval += t.getToken(i).getText();
     }
     return retval;
 }

@@ -97,7 +97,7 @@ public class BasicTest {
     ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

     if (extendedMetadata) {
-        return metadataToString(m.sttWithMetadata(shorts, shorts.length));
+        return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0));
     } else {
         return m.stt(shorts, shorts.length);
     }
@@ -11,8 +11,15 @@ public class DeepSpeechModel {
     }

     // FIXME: We should have something better than those SWIGTYPE_*
-    SWIGTYPE_p_p_ModelState _mspp;
-    SWIGTYPE_p_ModelState _msp;
+    private SWIGTYPE_p_p_ModelState _mspp;
+    private SWIGTYPE_p_ModelState _msp;
+
+    private void evaluateErrorCode(int errorCode) {
+        DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode);
+        if (code != DeepSpeech_Error_Codes.ERR_OK) {
+            throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ").");
+        }
+    }

     /**
      * @brief An object providing an interface to a trained DeepSpeech model.
@@ -20,10 +27,12 @@ public class DeepSpeechModel {
      * @constructor
      *
      * @param modelPath The path to the frozen model graph.
+     *
+     * @throws RuntimeException on failure.
      */
     public DeepSpeechModel(String modelPath) {
         this._mspp = impl.new_modelstatep();
-        impl.CreateModel(modelPath, this._mspp);
+        evaluateErrorCode(impl.CreateModel(modelPath, this._mspp));
         this._msp = impl.modelstatep_value(this._mspp);
     }

@@ -43,10 +52,10 @@ public class DeepSpeechModel {
      * @param aBeamWidth The beam width used by the model. A larger beam width value
      * generates better results at the cost of decoding time.
      *
-     * @return Zero on success, non-zero on failure.
+     * @throws RuntimeException on failure.
      */
-    public int setBeamWidth(long beamWidth) {
-        return impl.SetModelBeamWidth(this._msp, beamWidth);
+    public void setBeamWidth(long beamWidth) {
+        evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth));
     }

     /**
@@ -70,19 +79,19 @@ public class DeepSpeechModel {
      *
      * @param scorer The path to the external scorer file.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void enableExternalScorer(String scorer) {
-        impl.EnableExternalScorer(this._msp, scorer);
+        evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer));
     }

     /**
      * @brief Disable decoding using an external scorer.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void disableExternalScorer() {
-        impl.DisableExternalScorer(this._msp);
+        evaluateErrorCode(impl.DisableExternalScorer(this._msp));
     }

     /**
@@ -91,10 +100,10 @@ public class DeepSpeechModel {
      * @param alpha The alpha hyperparameter of the decoder. Language model weight.
      * @param beta The beta hyperparameter of the decoder. Word insertion weight.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void setScorerAlphaBeta(float alpha, float beta) {
-        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
+        evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta));
     }

     /*
@@ -117,11 +126,13 @@ public class DeepSpeechModel {
      * @param buffer A 16-bit, mono raw audio signal at the appropriate
      * sample rate (matching what the model was trained on).
      * @param buffer_size The number of samples in the audio signal.
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
      *
-     * @return Outputs a Metadata object of individual letters along with their timing information.
+     * @return Metadata struct containing multiple candidate transcripts. Each transcript
+     * has per-token metadata including timing information.
      */
-    public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
-        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
+    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
+        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
     }

     /**
@@ -130,10 +141,12 @@ public class DeepSpeechModel {
      * and finishStream().
      *
      * @return An opaque object that represents the streaming state.
+     *
+     * @throws RuntimeException on failure.
      */
     public DeepSpeechStreamingState createStream() {
         SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-        impl.CreateStream(this._msp, ssp);
+        evaluateErrorCode(impl.CreateStream(this._msp, ssp));
         return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
     }

@@ -161,8 +174,20 @@ public class DeepSpeechModel {
     }

     /**
-     * @brief Signal the end of an audio signal to an ongoing streaming
-     * inference, returns the STT result over the whole audio signal.
+     * @brief Compute the intermediate decoding of an ongoing streaming inference.
+     *
+     * @param ctx A streaming state pointer returned by createStream().
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+     *
+     * @return The STT intermediate result.
+     */
+    public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results);
+    }
+
+    /**
+     * @brief Compute the final decoding of an ongoing streaming inference and return
+     * the result. Signals the end of an ongoing streaming inference.
      *
      * @param ctx A streaming state pointer returned by createStream().
      *
@@ -175,16 +200,19 @@ public class DeepSpeechModel {
     }

     /**
-     * @brief Signal the end of an audio signal to an ongoing streaming
-     * inference, returns per-letter metadata.
+     * @brief Compute the final decoding of an ongoing streaming inference and return
+     * the results including metadata. Signals the end of an ongoing streaming
+     * inference.
      *
      * @param ctx A streaming state pointer returned by createStream().
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
      *
-     * @return Outputs a Metadata object of individual letters along with their timing information.
+     * @return Metadata struct containing multiple candidate transcripts. Each transcript
+     * has per-token metadata including timing information.
      *
      * @note This method will free the state pointer (@p ctx).
      */
-    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
-        return impl.FinishStreamWithMetadata(ctx.get());
+    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.FinishStreamWithMetadata(ctx.get(), num_results);
     }
 }
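The num_results argument added above is a cap, not a promise: the decoder may return fewer candidates, and they arrive ranked most probable first. A hedged Python sketch of walking all candidates, reusing the ds model and audio buffer from the earlier sketch:

.. code-block:: python

    metadata = ds.sttWithMetadata(audio, 5)   # ask for up to 5 candidates
    # metadata.transcripts may hold fewer than 5 entries.
    for rank, transcript in enumerate(metadata.transcripts):
        text = ''.join(t.text for t in transcript.tokens)
        print('#{} (confidence {:.2f}): {}'.format(rank, transcript.confidence, text))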
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * A single transcript computed by the model, including a confidence<br>
+ * value and the metadata for its constituent tokens.
+ */
+public class CandidateTranscript {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(CandidateTranscript obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * Array of TokenMetadata objects
+   */
+  public TokenMetadata getTokens() {
+    long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
+  }
+
+  /**
+   * Size of the tokens array
+   */
+  public long getNum_tokens() {
+    return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
+  }
+
+  /**
+   * Approximated confidence value for this transcript. This is roughly the<br>
+   * sum of the acoustic model logit values for each timestep/character that<br>
+   * contributed to the creation of this transcript.
+   */
+  public double getConfidence() {
+    return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
+  }
+
+  /**
+   * Retrieve one TokenMetadata element<br>
+   * <br>
+   * @param i Array index of the TokenMetadata to get<br>
+   * <br>
+   * @return The TokenMetadata requested or null
+   */
+  public TokenMetadata getToken(int i) {
+    return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false);
+  }
+
+}
@@ -0,0 +1,65 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+public enum DeepSpeech_Error_Codes {
+  ERR_OK(0x0000),
+  ERR_NO_MODEL(0x1000),
+  ERR_INVALID_ALPHABET(0x2000),
+  ERR_INVALID_SHAPE(0x2001),
+  ERR_INVALID_SCORER(0x2002),
+  ERR_MODEL_INCOMPATIBLE(0x2003),
+  ERR_SCORER_NOT_ENABLED(0x2004),
+  ERR_FAIL_INIT_MMAP(0x3000),
+  ERR_FAIL_INIT_SESS(0x3001),
+  ERR_FAIL_INTERPRETER(0x3002),
+  ERR_FAIL_RUN_SESS(0x3003),
+  ERR_FAIL_CREATE_STREAM(0x3004),
+  ERR_FAIL_READ_PROTOBUF(0x3005),
+  ERR_FAIL_CREATE_SESS(0x3006),
+  ERR_FAIL_CREATE_MODEL(0x3007);
+
+  public final int swigValue() {
+    return swigValue;
+  }
+
+  public static DeepSpeech_Error_Codes swigToEnum(int swigValue) {
+    DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants();
+    if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue)
+      return swigValues[swigValue];
+    for (DeepSpeech_Error_Codes swigEnum : swigValues)
+      if (swigEnum.swigValue == swigValue)
+        return swigEnum;
+    throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue);
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes() {
+    this.swigValue = SwigNext.next++;
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes(int swigValue) {
+    this.swigValue = swigValue;
+    SwigNext.next = swigValue+1;
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) {
+    this.swigValue = swigEnum.swigValue;
+    SwigNext.next = this.swigValue+1;
+  }
+
+  private final int swigValue;
+
+  private static class SwigNext {
+    private static int next = 0;
+  }
+}
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------------
  * This file was automatically generated by SWIG (http://www.swig.org).
- * Version 4.0.2
+ * Version 4.0.1
  *
  * Do not make changes to this file unless you know what you are doing--modify
  * the SWIG interface file instead.
@@ -9,7 +9,7 @@
 package org.mozilla.deepspeech.libdeepspeech;

 /**
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
  */
 public class Metadata {
   private transient long swigCPtr;
@@ -40,61 +40,29 @@ public class Metadata {
   }

   /**
-   * List of items
+   * Array of CandidateTranscript objects
    */
-  public void setItems(MetadataItem value) {
-    implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
+  public CandidateTranscript getTranscripts() {
+    long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
   }

   /**
-   * List of items
+   * Size of the transcripts array
    */
-  public MetadataItem getItems() {
-    long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
-    return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
+  public long getNum_transcripts() {
+    return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
   }

   /**
-   * Size of the list of items
-   */
-  public void setNum_items(int value) {
-    implJNI.Metadata_num_items_set(swigCPtr, this, value);
-  }
-
-  /**
-   * Size of the list of items
-   */
-  public int getNum_items() {
-    return implJNI.Metadata_num_items_get(swigCPtr, this);
-  }
-
-  /**
-   * Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
-   */
-  public void setConfidence(double value) {
-    implJNI.Metadata_confidence_set(swigCPtr, this, value);
-  }
-
-  /**
-   * Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
-   */
-  public double getConfidence() {
-    return implJNI.Metadata_confidence_get(swigCPtr, this);
-  }
-
-  /**
-   * Retrieve one MetadataItem element<br>
+   * Retrieve one CandidateTranscript element<br>
    * <br>
-   * @param i Array index of the MetadataItem to get<br>
+   * @param i Array index of the CandidateTranscript to get<br>
    * <br>
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  public MetadataItem getItem(int i) {
-    return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
+  public CandidateTranscript getTranscript(int i) {
+    return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false);
   }

 }
@@ -4,7 +4,7 @@ Javadoc for Sphinx

 This code is only here for reference for documentation generation.

-To update, please build SWIG (4.0 at least) and then run from native_client/java:
+To update, please install SWIG (4.0 at least) and then run from native_client/java:

 .. code-block::
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * Stores text of an individual token, along with its timing information
+ */
+public class TokenMetadata {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(TokenMetadata obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * The text corresponding to this token
+   */
+  public String getText() {
+    return implJNI.TokenMetadata_text_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in units of 20ms
+   */
+  public long getTimestep() {
+    return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in seconds
+   */
+  public float getStart_time() {
+    return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
+  }
+
+}
@@ -42,12 +42,11 @@ function totalTime(hrtimeValue) {
     return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
 }

-function metadataToString(metadata) {
+function candidateTranscriptToString(transcript) {
     var retval = ""
-    for (var i = 0; i < metadata.num_items; ++i) {
-        retval += metadata.items[i].character;
+    for (var i = 0; i < transcript.tokens.length; ++i) {
+        retval += transcript.tokens[i].text;
     }
-    Ds.FreeMetadata(metadata);
     return retval;
 }

@@ -117,7 +116,9 @@ audioStream.on('finish', () => {
     const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);

     if (args['extended']) {
-        console.log(metadataToString(model.sttWithMetadata(audioBuffer)));
+        let metadata = model.sttWithMetadata(audioBuffer, 1);
+        console.log(candidateTranscriptToString(metadata.transcripts[0]));
+        Ds.FreeMetadata(metadata);
     } else {
         console.log(model.stt(audioBuffer));
     }
@@ -47,8 +47,8 @@ using namespace node;
 %typemap(argout) ModelState **retval {
   $result = SWIGV8_ARRAY_NEW();
   SWIGV8_AppendOutput($result, SWIG_From_int(result));
-  // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
-  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
+  // owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves.
+  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
 }

@@ -68,27 +68,29 @@ using namespace node;
 %nodefaultctor ModelState;
 %nodefaultdtor ModelState;

-%typemap(out) MetadataItem* %{
+%typemap(out) TokenMetadata* %{
   $result = SWIGV8_ARRAY_NEW();
-  for (int i = 0; i < arg1->num_items; ++i) {
-    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
   }
 %}

-%nodefaultdtor Metadata;
-%nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
-
-%extend struct Metadata {
-  ~Metadata() {
-    DS_FreeMetadata($self);
-  }
-}
-
-%extend struct MetadataItem {
-  ~MetadataItem() { }
-}
+%typemap(out) CandidateTranscript* %{
+  $result = SWIGV8_ARRAY_NEW();
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
+  }
+%}
+
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
+
+%nodefaultctor Metadata;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;

 %rename ("%(strip:[DS_])s") "";
@@ -115,15 +115,16 @@ Model.prototype.stt = function(aBuffer) {
 }

 /**
- * Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
 *
 * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
-Model.prototype.sttWithMetadata = function(aBuffer) {
-    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
+Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
 }

 /**
@@ -172,7 +173,19 @@ Stream.prototype.intermediateDecode = function() {
 }

 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the intermediate decoding of an ongoing streaming inference, return results including metadata.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ */
+Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.IntermediateDecode(this._impl, aNumResults);
+}
+
+/**
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
 *
 * @return {string} The STT result.
 *
@@ -185,14 +198,17 @@ Stream.prototype.finishStream = function() {
 }

 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
 * This method will free the stream, it must not be used after this method is called.
 */
-Stream.prototype.finishStreamWithMetadata = function() {
-    result = binding.FinishStreamWithMetadata(this._impl);
+Stream.prototype.finishStreamWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
     this._impl = null;
     return result;
 }

@@ -236,70 +252,80 @@ function Version() {
 }


-//// Metadata and MetadataItem are here only for documentation purposes
+//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes

 /**
 * @class
 *
- * Stores each individual character, along with its timing information
+ * Stores text of an individual token, along with its timing information
 */
-function MetadataItem() {}
+function TokenMetadata() {}

 /**
- * The character generated for transcription
+ * The text corresponding to this token
 *
- * @return {string} The character generated
+ * @return {string} The text generated
 */
-MetadataItem.prototype.character = function() {}
+TokenMetadata.prototype.text = function() {}

 /**
- * Position of the character in units of 20ms
+ * Position of the token in units of 20ms
 *
- * @return {int} The position of the character
+ * @return {int} The position of the token
 */
-MetadataItem.prototype.timestep = function() {};
+TokenMetadata.prototype.timestep = function() {};

 /**
- * Position of the character in seconds
+ * Position of the token in seconds
 *
- * @return {float} The position of the character
+ * @return {float} The position of the token
 */
-MetadataItem.prototype.start_time = function() {};
+TokenMetadata.prototype.start_time = function() {};

 /**
 * @class
 *
- * Stores the entire CTC output as an array of character metadata objects
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
 */
-function Metadata () {}
+function CandidateTranscript () {}

 /**
- * List of items
+ * Array of tokens
 *
- * @return {array} List of :js:func:`MetadataItem`
+ * @return {array} Array of :js:func:`TokenMetadata`
 */
-Metadata.prototype.items = function() {}
+CandidateTranscript.prototype.tokens = function() {}

-/**
- * Size of the list of items
- *
- * @return {int} Number of items
- */
-Metadata.prototype.num_items = function() {}
-
 /**
 * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
+ * sum of the acoustic model logit values for each timestep/token that
 * contributed to the creation of this transcription.
 *
 * @return {float} Confidence value
 */
-Metadata.prototype.confidence = function() {}
+CandidateTranscript.prototype.confidence = function() {}
+
+/**
+ * @class
+ *
+ * An array of CandidateTranscript objects computed by the model.
+ */
+function Metadata () {}
+
+/**
+ * Array of transcripts
+ *
+ * @return {array} Array of :js:func:`CandidateTranscript` objects
+ */
+Metadata.prototype.transcripts = function() {}


 module.exports = {
     Model: Model,
     Metadata: Metadata,
-    MetadataItem: MetadataItem,
+    CandidateTranscript: CandidateTranscript,
+    TokenMetadata: TokenMetadata,
     Version: Version,
     FreeModel: FreeModel,
     FreeStream: FreeStream,
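The streaming additions have the same shape in the Python binding; a sketch of polling intermediate candidates while feeding audio, reusing the ds model and 16-bit audio buffer from the earlier sketch (the one-second chunking is an arbitrary choice for illustration):

.. code-block:: python

    stream = ds.createStream()
    for offset in range(0, len(audio), 16000):      # feed ~1 s chunks at 16 kHz
        stream.feedAudioContent(audio[offset:offset + 16000])
        partial = stream.intermediateDecodeWithMetadata(1)
        print('partial:', ''.join(t.text for t in partial.transcripts[0].tokens))

    final = stream.finishStreamWithMetadata(1)      # frees the underlying stream state
    print('final:', ''.join(t.text for t in final.transcripts[0].tokens))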
@@ -37,27 +37,39 @@ ModelState::decode(const DecoderState& state) const
 }

 Metadata*
-ModelState::decode_metadata(const DecoderState& state)
+ModelState::decode_metadata(const DecoderState& state,
+                            size_t num_results)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(num_results);
+  unsigned int num_returned = out.size();

-  std::unique_ptr<Metadata> metadata(new Metadata());
-  metadata->num_items = out[0].tokens.size();
-  metadata->confidence = out[0].confidence;
+  CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);

-  std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+  for (int i = 0; i < num_returned; ++i) {
+    TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());

-  // Loop through each character
-  for (int i = 0; i < out[0].tokens.size(); ++i) {
-    items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
-    items[i].timestep = out[0].timesteps[i];
-    items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
-    if (items[i].start_time < 0) {
-      items[i].start_time = 0;
-    }
+    for (int j = 0; j < out[i].tokens.size(); ++j) {
+      TokenMetadata token {
+        strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
+        static_cast<unsigned int>(out[i].timesteps[j]), // timestep
+        out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
+      };
+      memcpy(&tokens[j], &token, sizeof(TokenMetadata));
+    }
+
+    CandidateTranscript transcript {
+      tokens, // tokens
+      static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
+      out[i].confidence, // confidence
+    };
+    memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
   }

-  metadata->items = items.release();
-  return metadata.release();
+  Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
+  Metadata metadata {
+    transcripts, // transcripts
+    num_returned, // num_transcripts
+  };
+  memcpy(ret, &metadata, sizeof(Metadata));
+  return ret;
 }
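Note how each token's start_time is derived above: the CTC timestep index scaled by the feature window step over the sample rate. The same arithmetic in a small sketch (the 16 kHz rate and 320-sample step are common defaults assumed for illustration, not read from this diff):

.. code-block:: python

    SAMPLE_RATE = 16000      # assumed default
    AUDIO_WIN_STEP = 320     # assumed default; 320 / 16000 = 20 ms per timestep

    def token_start_time(timestep):
        # Mirrors: out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_)
        return timestep * (AUDIO_WIN_STEP / SAMPLE_RATE)

    print(token_start_time(50))  # -> 1.0, i.e. one second into the audio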
@@ -66,11 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
+  * @param num_results Maximum number of candidate results to return.
   *
-  * @return Metadata struct containing MetadataItem structs for each character.
-  * The user is responsible for freeing Metadata by calling DS_FreeMetadata().
+  * @return A Metadata struct containing CandidateTranscript structs.
+  * Each represents a candidate transcript, with the first ranked most probable.
+  * The user is responsible for freeing the result by calling DS_FreeMetadata().
   */
-  virtual Metadata* decode_metadata(const DecoderState& state);
+  virtual Metadata* decode_metadata(const DecoderState& state,
+                                    size_t num_results);
 };

 #endif // MODELSTATE_H
@@ -121,17 +121,20 @@ class Model(object):
         """
         return deepspeech.impl.SpeechToText(self._impl, audio_buffer)

-    def sttWithMetadata(self, audio_buffer):
+    def sttWithMetadata(self, audio_buffer, num_results=1):
         """
-        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
+        Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.

         :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type audio_buffer: numpy.int16 array

-        :return: Outputs a struct of individual letters along with their timing information.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`
         """
-        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
+        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)

     def createStream(self):
         """
@@ -187,10 +190,27 @@ class Stream(object):
             raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
         return deepspeech.impl.IntermediateDecode(self._impl)

+    def intermediateDecodeWithMetadata(self, num_results=1):
+        """
+        Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
+
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
+        :type: :func:`Metadata`
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
+        return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
+
     def finishStream(self):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns the STT result over the whole audio signal.
+        Compute the final decoding of an ongoing streaming inference and return
+        the result. Signals the end of an ongoing streaming inference. The underlying
+        stream object must not be used after this method is called.

         :return: The STT result.
         :type: str
@@ -203,19 +223,24 @@ class Stream(object):
         self._impl = None
         return result

-    def finishStreamWithMetadata(self):
+    def finishStreamWithMetadata(self, num_results=1):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns per-letter metadata.
+        Compute the final decoding of an ongoing streaming inference and return
+        results including metadata. Signals the end of an ongoing streaming
+        inference. The underlying stream object must not be used after this
+        method is called.

-        :return: Outputs a struct of individual letters along with their timing information.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`

         :throws: RuntimeError if the stream object is not valid
         """
         if not self._impl:
             raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
-        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
+        result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
         self._impl = None
         return result
|
||||||
|
|
||||||
|
|
||||||
# This is only for documentation purpose
|
# This is only for documentation purpose
|
||||||
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
|
# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
|
||||||
class MetadataItem(object):
|
class TokenMetadata(object):
|
||||||
"""
|
"""
|
||||||
Stores each individual character, along with its timing information
|
Stores each individual character, along with its timing information
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def character(self):
|
def text(self):
|
||||||
"""
|
"""
|
||||||
The character generated for transcription
|
The text for this token
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def timestep(self):
|
def timestep(self):
|
||||||
"""
|
"""
|
||||||
Position of the character in units of 20ms
|
Position of the token in units of 20ms
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def start_time(self):
|
def start_time(self):
|
||||||
"""
|
"""
|
||||||
Position of the character in seconds
|
Position of the token in seconds
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class Metadata(object):
|
class CandidateTranscript(object):
|
||||||
"""
|
"""
|
||||||
Stores the entire CTC output as an array of character metadata objects
|
Stores the entire CTC output as an array of character metadata objects
|
||||||
"""
|
"""
|
||||||
def items(self):
|
def tokens(self):
|
||||||
"""
|
"""
|
||||||
List of items
|
List of tokens
|
||||||
|
|
||||||
:return: A list of :func:`MetadataItem` elements
|
:return: A list of :func:`TokenMetadata` elements
|
||||||
:type: list
|
:type: list
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def num_items(self):
|
|
||||||
"""
|
|
||||||
Size of the list of items
|
|
||||||
|
|
||||||
:return: Size of the list of items
|
|
||||||
:type: int
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def confidence(self):
|
def confidence(self):
|
||||||
"""
|
"""
|
||||||
Approximated confidence value for this transcription. This is roughly the
|
Approximated confidence value for this transcription. This is roughly the
|
||||||
|
@ -286,3 +302,12 @@ class Metadata(object):
|
||||||
contributed to the creation of this transcription.
|
contributed to the creation of this transcription.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Metadata(object):
|
||||||
|
def transcripts(self):
|
||||||
|
"""
|
||||||
|
List of candidate transcripts
|
||||||
|
|
||||||
|
:return: A list of :func:`CandidateTranscript` objects
|
||||||
|
:type: list
|
||||||
|
"""
|
||||||
|
|
|
@@ -18,6 +18,7 @@ try:
 except ImportError:
     from pipes import quote

+
 def convert_samplerate(audio_path, desired_sample_rate):
     sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
     try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):


 def metadata_to_string(metadata):
-    return ''.join(item.character for item in metadata.items)
+    return ''.join(token.text for token in metadata.tokens)

-def words_from_metadata(metadata):
+
+def words_from_candidate_transcript(metadata):
     word = ""
     word_list = []
     word_start_time = 0
     # Loop through each character
-    for i in range(0, metadata.num_items):
-        item = metadata.items[i]
+    for i, token in enumerate(metadata.tokens):
         # Append character to word if it's not a space
-        if item.character != " ":
+        if token.text != " ":
             if len(word) == 0:
                 # Log the start time of the new word
-                word_start_time = item.start_time
+                word_start_time = token.start_time

-            word = word + item.character
+            word = word + token.text
         # Word boundary is either a space or the last character in the array
-        if item.character == " " or i == metadata.num_items - 1:
-            word_duration = item.start_time - word_start_time
+        if token.text == " " or i == len(metadata.tokens) - 1:
+            word_duration = token.start_time - word_start_time

             if word_duration < 0:
                 word_duration = 0
@@ -69,9 +70,11 @@ def words_from_candidate_transcript(metadata):

 def metadata_json_output(metadata):
     json_result = dict()
-    json_result["words"] = words_from_metadata(metadata)
-    json_result["confidence"] = metadata.confidence
-    return json.dumps(json_result)
+    json_result["transcripts"] = [{
+        "confidence": transcript.confidence,
+        "words": words_from_candidate_transcript(transcript),
+    } for transcript in metadata.transcripts]
+    return json.dumps(json_result, indent=2)


@@ -141,9 +144,9 @@ def main():
     print('Running inference.', file=sys.stderr)
     inference_start = timer()
     if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio)))
+        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
     elif args.json:
-        print(metadata_json_output(ds.sttWithMetadata(audio)))
+        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
     else:
         print(ds.stt(audio))
     inference_end = timer() - inference_start
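With --json the client now emits one entry per candidate transcript rather than one flat result. A sketch of the expected shape (the per-word field names here are illustrative; the exact keys come from words_from_candidate_transcript above):

.. code-block:: python

    import json

    # Hypothetical output for a 2-candidate request.
    expected = {
        "transcripts": [
            {"confidence": -12.3, "words": [{"word": "hello", "start_time": 0.5, "duration": 0.3}]},
            {"confidence": -14.8, "words": [{"word": "yellow", "start_time": 0.5, "duration": 0.3}]},
        ]
    }
    print(json.dumps(expected, indent=2))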
@@ -38,30 +38,69 @@ import_array();
   %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
 }

-%typemap(out) MetadataItem* %{
-  $result = PyList_New(arg1->num_items);
-  for (int i = 0; i < arg1->num_items; ++i) {
-    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
+%fragment("parent_reference_init", "init") {
+  // Thread-safe initialization - initialize during Python module initialization
+  parent_reference();
+}
+
+%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
+
+static PyObject *parent_reference() {
+  static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
+  return parent_reference_string;
+}
+
+}
+
+%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
+  $result = PyList_New(arg1->num_transcripts);
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
+    // Add a reference to Metadata in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
     PyList_SetItem($result, i, o);
   }
 %}

-%extend struct MetadataItem {
+%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
+  $result = PyList_New(arg1->num_tokens);
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
+    // Add a reference to CandidateTranscript in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
+    PyList_SetItem($result, i, o);
+  }
+%}
+
+%extend struct TokenMetadata {
   %pythoncode %{
     def __repr__(self):
-        return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
+        return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
+  %}
+}
+
+%extend struct CandidateTranscript {
+  %pythoncode %{
+    def __repr__(self):
+        tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
+        tokens_repr = '\n'.join('  ' + l for l in tokens_repr.split('\n'))
+        return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
   %}
 }

 %extend struct Metadata {
   %pythoncode %{
     def __repr__(self):
-        items_repr = ', \n'.join('  ' + repr(i) for i in self.items)
-        return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
+        transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
+        transcripts_repr = '\n'.join('  ' + l for l in transcripts_repr.split('\n'))
+        return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
   %}
 }

-%ignore Metadata::num_items;
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;

 %extend struct Metadata {
   ~Metadata() {
@@ -69,10 +108,12 @@ import_array();
   }
 }

-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;

 %typemap(newfree) char* "DS_FreeString($1);";
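The parent_reference typemaps attach each returned element to its owner through a hidden __parent_reference attribute, so Python cannot garbage-collect a Metadata while one of its transcripts or tokens is still reachable. A sketch of the pattern this enables, reusing ds and audio from the earlier sketches:

.. code-block:: python

    metadata = ds.sttWithMetadata(audio, 1)
    best = metadata.transcripts[0]   # element holds a __parent_reference to metadata
    del metadata                     # safe: 'best' keeps the parent Metadata alive
    print(''.join(token.text for token in best.tokens))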