From 32c969c1846453474d786306dac4c1de3a2b2b54 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Wed, 5 Feb 2020 07:55:15 +0000
Subject: [PATCH 01/16] Expose multiple transcriptions through the API

---
 .../ctcdecode/ctc_beam_search_decoder.cpp | 13 +++--
 native_client/deepspeech.cc               | 58 +++++++++++++++----
 native_client/deepspeech.h                | 26 +++++++--
 native_client/modelstate.cc               | 43 ++++++------
 native_client/modelstate.h                |  9 ++-
 5 files changed, 108 insertions(+), 41 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 5dadd57f..9b3da8cf 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t top_paths) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -167,7 +167,7 @@ DecoderState::decode() const
 
   // score the last word of each prefix that doesn't end with space
   if (ext_scorer_) {
-    for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
+    for (size_t i = 0; i < top_paths && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
         float score = 0.0;
@@ -181,14 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_prefixes = std::min(prefixes_copy.size(), top_paths);
   std::partial_sort(prefixes_copy.begin(),
                     prefixes_copy.begin() + num_prefixes,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
   size_t num_returned = std::min(num_prefixes, top_paths);
 
   std::vector<Output> outputs;
@@ -220,6 +218,7 @@ std::vector<Output> ctc_beam_search_decoder(
     int class_dim,
     const Alphabet &alphabet,
     size_t beam_size,
+    size_t top_paths,
     double cutoff_prob,
     size_t cutoff_top_n,
     std::shared_ptr<Scorer> ext_scorer)
@@ -227,7 +226,7 @@ std::vector<Output> ctc_beam_search_decoder(
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
   state.next(probs, time_dim, class_dim);
-  return state.decode();
+  return state.decode(top_paths);
 }
 
 std::vector<std::vector<Output>>
@@ -240,6 +239,7 @@ ctc_beam_search_decoder_batch(
     int seq_lengths_size,
     const Alphabet &alphabet,
     size_t beam_size,
+    size_t top_paths,
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
@@ -259,6 +259,7 @@ ctc_beam_search_decoder_batch(
         class_dim,
         alphabet,
         beam_size,
+        top_paths,
         cutoff_prob,
         cutoff_top_n,
         ext_scorer));

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index dd2a95ea..839a0122 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Result* finishStreamWithMetadata(unsigned int numResults);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -143,11 +143,26 @@ StreamingState::finishStream()
   return model_->decode(decoder_state_);
 }
 
-Metadata*
-StreamingState::finishStreamWithMetadata()
+Result*
+StreamingState::finishStreamWithMetadata(unsigned int numResults)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+
+  vector<Metadata*> metadata = model_->decode_metadata(decoder_state_, numResults);
+
+  std::unique_ptr<Result> result(new Result());
+  result->num_transcriptions = metadata.size();
+
+  std::unique_ptr<Metadata[]> items(new Metadata[result->num_transcriptions]);
+
+  for (int i = 0; i < result->num_transcriptions; ++i) {
+    std::unique_ptr<Metadata> pointer(new Metadata(*metadata[i]));
+    items[i] = *pointer.release();
+  }
+
+  result->transcriptions = items.release();
+
+  return result.release();
 }
 
 void
@@ -410,12 +425,13 @@ DS_FinishStream(StreamingState* aSctx)
   return str;
 }
 
-Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+Result*
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int numResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Result* result = aSctx->finishStreamWithMetadata(numResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }
 
 StreamingState*
@@ -441,13 +457,14 @@ DS_SpeechToText(ModelState* aCtx,
   return DS_FinishStream(ctx);
 }
 
-Metadata*
+Result*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int numResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, numResults);
 }
 
 void
@@ -468,6 +485,25 @@ DS_FreeMetadata(Metadata* m)
   }
 }
 
+void
+DS_FreeResult(Result* r)
+{
+  if (r) {
+    for (int i = 0; i < r->num_transcriptions; ++i) {
+      Metadata* m = &r->transcriptions[i];
+
+      for (int j = 0; j < m->num_items; ++j) {
+        free(m->items[j].character);
+      }
+
+      delete[] m->items;
+    }
+
+    delete[] r->transcriptions;
+    delete r;
+  }
+}
+
 void
 DS_FreeString(char* str)
 {

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 6dad59db..41d133ae 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -48,6 +48,16 @@ typedef struct Metadata {
   double confidence;
 } Metadata;
 
+/**
+ * @brief Stores Metadata structs for each alternative transcription
+ */
+typedef struct Result {
+  /** List of transcriptions */
+  Metadata* transcriptions;
+  /** Size of the list of transcriptions */
+  int num_transcriptions;
+} Result;
+
 enum DeepSpeech_Error_Codes
 {
   // OK
@@ -192,9 +202,10 @@ char* DS_SpeechToText(ModelState* aCtx,
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                      const short* aBuffer,
-                                      unsigned int aBufferSize);
+Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                    const short* aBuffer,
+                                    unsigned int aBufferSize,
+                                    unsigned int numResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -261,7 +272,8 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                    unsigned int numResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
@@ -281,6 +293,12 @@ void DS_FreeStream(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);
 
+/**
+ * @brief Free memory allocated for result information.
+ */
+DEEPSPEECH_EXPORT
+void DS_FreeResult(Result* r);
+
 /**
  * @brief Free a char* string returned by the DeepSpeech API.
 */

diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index ea8928bd..88c2c857 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -32,32 +32,41 @@ ModelState::init(const char* model_path)
 char*
 ModelState::decode(const DecoderState& state) const
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(1);
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-Metadata*
-ModelState::decode_metadata(const DecoderState& state)
+vector<Metadata*>
+ModelState::decode_metadata(const DecoderState& state,
+                            size_t top_paths)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(top_paths);
 
-  std::unique_ptr<Metadata> metadata(new Metadata());
-  metadata->num_items = out[0].tokens.size();
-  metadata->confidence = out[0].confidence;
+  vector<Metadata*> meta_out;
 
-  std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+  size_t max_results = std::min(top_paths, out.size());
 
-  // Loop through each character
-  for (int i = 0; i < out[0].tokens.size(); ++i) {
-    items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
-    items[i].timestep = out[0].timesteps[i];
-    items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
+  for (int j = 0; j < max_results; ++j) {
+    std::unique_ptr<Metadata> metadata(new Metadata());
+    metadata->num_items = out[j].tokens.size();
+    metadata->confidence = out[j].confidence;
 
-    if (items[i].start_time < 0) {
-      items[i].start_time = 0;
+    std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+
+    // Loop through each character
+    for (int i = 0; i < out[j].tokens.size(); ++i) {
+      items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
+      items[i].timestep = out[j].timesteps[i];
+      items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
+
+      if (items[i].start_time < 0) {
+        items[i].start_time = 0;
+      }
     }
+
+    metadata->items = items.release();
+    meta_out.push_back(metadata.release());
   }
 
-  metadata->items = items.release();
-  return metadata.release();
+  return meta_out;
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 25251e15..30d1e101 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,11 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
+  * @param top_paths Number of alternate results to return.
   *
-  * @return Metadata struct containing MetadataItem structs for each character.
-  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata().
+  * @return Vector of Metadata structs containing MetadataItem structs for each character.
+  *         Each represents an alternate transcription, with the first ranked most probable.
+  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata() on each item.
   */
-  virtual Metadata* decode_metadata(const DecoderState& state);
+  virtual std::vector<Metadata*> decode_metadata(const DecoderState& state,
+                                                 size_t top_paths);
 };
 
 #endif // MODELSTATE_H
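
[Usage sketch, not part of this series: how a caller might exercise the patch-01 API end to end. ctx, buffer and buffer_size are placeholders for a loaded model and captured audio.]

    /* Request up to 3 transcriptions; the Result owns them all and is
       released with a single DS_FreeResult() call. */
    Result* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    if (result) {
      for (int i = 0; i < result->num_transcriptions; ++i) {
        Metadata* m = &result->transcriptions[i];
        printf("candidate %d (confidence %f): ", i, m->confidence);
        for (int j = 0; j < m->num_items; ++j) {
          printf("%s", m->items[j].character);  // one token per MetadataItem
        }
        printf("\n");
      }
      DS_FreeResult(result);
    }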
From 004d66d224853d19e69db8ebafc67e9b762b453b Mon Sep 17 00:00:00 2001
From: dabinat
Date: Wed, 5 Feb 2020 07:55:55 +0000
Subject: [PATCH 02/16] Client changes to show multiple transcriptions in JSON
 output

---
 native_client/client.cc | 45 +++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index abcadd8d..ffe3b518 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -46,7 +46,7 @@ struct meta_word {
 
 char* metadataToString(Metadata* metadata);
 std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char* JSONOutput(Result* result);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
@@ -57,13 +57,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = metadataToString(&result->transcriptions[0]);
+    DS_FreeResult(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
+    res.string = JSONOutput(result);
+    DS_FreeResult(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -338,23 +338,34 @@ WordsFromMetadata(Metadata* metadata)
 }
 
 char*
-JSONOutput(Metadata* metadata)
+JSONOutput(Result* result)
 {
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
   std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+  out_string << "[\n";
 
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+  for (int j=0; j < result->num_transcriptions; ++j) {
+    Metadata *metadata = &result->transcriptions[j];
+    std::vector<meta_word> words = WordsFromMetadata(metadata);
 
-    if (i < words.size() - 1) {
-      out_string << ",";
+    out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+
+    for (int i = 0; i < words.size(); i++) {
+      meta_word w = words[i];
+      out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+      if (i < words.size() - 1) {
+        out_string << ",";
+      }
+    }
+
+    out_string << "]}";
+
+    if (j < result->num_transcriptions - 1) {
+      out_string << ",\n";
     }
   }
 
-  out_string << "]}\n";
+  out_string << "\n]\n";
 
   return strdup(out_string.str().c_str());
 }
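
[With three candidates requested, the JSON this version of the client emits is a top-level array, one element per transcription — illustrated here with invented values:]

    [
    {"metadata":{"confidence":-20.1},"words":[{"word":"hello","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-20.9},"words":[{"word":"hallo","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-21.4},"words":[{"word":"hull","time":0.62,"duration":0.28}]}
    ]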
From 969b2ac4ba45aaf940a6371cfa73a34e38cab24f Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:14:08 -0800
Subject: [PATCH 03/16] Changed variable names to match coding style

---
 native_client/deepspeech.cc | 12 ++++++------
 native_client/deepspeech.h  |  6 ++++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index 839a0122..c44e130b 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Result* finishStreamWithMetadata(unsigned int numResults);
+  Result* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -144,7 +144,7 @@ StreamingState::finishStream()
 }
 
 Result*
-StreamingState::finishStreamWithMetadata(unsigned int numResults)
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
 
@@ -427,9 +427,9 @@ DS_FinishStream(StreamingState* aSctx)
 
 Result*
 DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                            unsigned int numResults)
+                            unsigned int aNumResults)
 {
-  Result* result = aSctx->finishStreamWithMetadata(numResults);
+  Result* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
   return result;
 }
@@ -461,10 +461,10 @@ Result*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
                             unsigned int aBufferSize,
-                            unsigned int numResults)
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx, numResults);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }
 
 void

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 41d133ae..53f1954f 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -197,6 +197,7 @@ char* DS_SpeechToText(ModelState* aCtx,
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The number of alternative transcriptions to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -205,7 +206,7 @@ DEEPSPEECH_EXPORT
 Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                     const short* aBuffer,
                                     unsigned int aBufferSize,
-                                    unsigned int numResults);
+                                    unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -265,6 +266,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 * inference, returns per-letter metadata.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of alternative transcriptions to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -273,7 +275,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 */
 DEEPSPEECH_EXPORT
 Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                                    unsigned int numResults);
+                                    unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
From e0c42f01a441692fd4133899586a9cfd7b685641 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:17:52 -0800
Subject: [PATCH 04/16] Moved result limiting to ModelState instead of CTC
 decoder

---
 .../ctcdecode/ctc_beam_search_decoder.cpp | 13 +++++------
 native_client/deepspeech.cc               | 16 +-------------
 native_client/modelstate.cc               | 22 +++++++++++--------
 native_client/modelstate.h                | 10 ++++-----
 4 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 9b3da8cf..3039d47c 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode(size_t top_paths) const
+DecoderState::decode() const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -167,7 +167,7 @@ DecoderState::decode(size_t top_paths) const
 
   // score the last word of each prefix that doesn't end with space
   if (ext_scorer_) {
-    for (size_t i = 0; i < top_paths && i < prefixes_copy.size(); ++i) {
+    for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
         float score = 0.0;
@@ -181,13 +181,13 @@ DecoderState::decode(size_t top_paths) const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), top_paths);
+  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
   std::partial_sort(prefixes_copy.begin(),
                     prefixes_copy.begin() + num_prefixes,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  size_t num_returned = std::min(num_prefixes, top_paths);
+  size_t num_returned = std::min(num_prefixes, beam_size_);
 
   std::vector<Output> outputs;
   outputs.reserve(num_returned);
@@ -218,7 +218,6 @@ std::vector<Output> ctc_beam_search_decoder(
     int class_dim,
     const Alphabet &alphabet,
     size_t beam_size,
-    size_t top_paths,
     double cutoff_prob,
     size_t cutoff_top_n,
     std::shared_ptr<Scorer> ext_scorer)
@@ -226,7 +225,7 @@ std::vector<Output> ctc_beam_search_decoder(
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
   state.next(probs, time_dim, class_dim);
-  return state.decode(top_paths);
+  return state.decode();
 }
 
 std::vector<std::vector<Output>>
@@ -238,7 +237,6 @@ ctc_beam_search_decoder_batch(
     int seq_lengths_size,
     const Alphabet &alphabet,
     size_t beam_size,
-    size_t top_paths,
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
@@ -257,7 +256,6 @@ ctc_beam_search_decoder_batch(
         class_dim,
         alphabet,
         beam_size,
-        top_paths,
         cutoff_prob,
         cutoff_top_n,
         ext_scorer));

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index c44e130b..ffc10a13 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -148,21 +148,7 @@ StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
 
-  vector<Metadata*> metadata = model_->decode_metadata(decoder_state_, numResults);
-
-  std::unique_ptr<Result> result(new Result());
-  result->num_transcriptions = metadata.size();
-
-  std::unique_ptr<Metadata[]> items(new Metadata[result->num_transcriptions]);
-
-  for (int i = 0; i < result->num_transcriptions; ++i) {
-    std::unique_ptr<Metadata> pointer(new Metadata(*metadata[i]));
-    items[i] = *pointer.release();
-  }
-
-  result->transcriptions = items.release();
-
-  return result.release();
+  return model_->decode_metadata(decoder_state_, num_results);
 }
 
 void
diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index 88c2c857..5a8afae3 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -32,22 +32,25 @@ ModelState::init(const char* model_path)
 char*
 ModelState::decode(const DecoderState& state) const
 {
-  vector<Output> out = state.decode(1);
+  vector<Output> out = state.decode();
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-vector<Metadata*>
+Result*
 ModelState::decode_metadata(const DecoderState& state,
-                            size_t top_paths)
+                            size_t num_results)
 {
-  vector<Output> out = state.decode(top_paths);
+  vector<Output> out = state.decode();
 
-  vector<Metadata*> meta_out;
+  size_t max_results = std::min(num_results, out.size());
 
-  size_t max_results = std::min(top_paths, out.size());
+  std::unique_ptr<Result> result(new Result());
+  result->num_transcriptions = max_results;
+
+  std::unique_ptr<Metadata[]> transcripts(new Metadata[max_results]());
 
   for (int j = 0; j < max_results; ++j) {
-    std::unique_ptr<Metadata> metadata(new Metadata());
+    Metadata* metadata = &transcripts[j];
     metadata->num_items = out[j].tokens.size();
     metadata->confidence = out[j].confidence;
 
     std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
 
     // Loop through each character
     for (int i = 0; i < out[j].tokens.size(); ++i) {
       items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
       items[i].timestep = out[j].timesteps[i];
       items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
 
       if (items[i].start_time < 0) {
         items[i].start_time = 0;
       }
     }
 
     metadata->items = items.release();
-    meta_out.push_back(metadata.release());
   }
 
-  return meta_out;
+  result->transcriptions = transcripts.release();
+
+  return result.release();
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 30d1e101..8ea7ad99 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,14 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
-  * @param top_paths Number of alternate results to return.
+  * @param num_results Number of alternate results to return.
   *
-  * @return Vector of Metadata structs containing MetadataItem structs for each character.
+  * @return A Result struct containing Metadata structs.
   *         Each represents an alternate transcription, with the first ranked most probable.
-  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata() on each item.
+  *         The user is responsible for freeing Result by calling DS_FreeResult().
   */
-  virtual std::vector<Metadata*> decode_metadata(const DecoderState& state,
-                                                 size_t top_paths);
+  virtual Result* decode_metadata(const DecoderState& state,
+                                  size_t num_results);
 };
 
 #endif // MODELSTATE_H
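
[Sketch of the ownership model this patch settles on, not part of the series: transcriptions now live in one flat Metadata array owned by the Result, so a caller traverses in place and frees only the top level. ctx, buffer and buffer_size are placeholders.]

    Result* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    for (int i = 0; result && i < result->num_transcriptions; ++i) {
      const Metadata* m = &result->transcriptions[i];  // no per-entry allocation to track
      printf("candidate %d: confidence=%f, items=%d\n", i, m->confidence, m->num_items);
    }
    DS_FreeResult(result);  // one call frees every nested items array too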
From e1fec4e8183a3cd451330e7e5619cb2e6ded4868 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:19:14 -0800
Subject: [PATCH 05/16] Client - Change JSON output to return alternative
 transcripts in an "alternatives" array

---
 native_client/client.cc | 52 +++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index ffe3b518..413be288 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -47,6 +47,7 @@ struct meta_word {
 char* metadataToString(Metadata* metadata);
 std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
 char* JSONOutput(Result* result);
+std::string MetadataOutput(Metadata* metadata);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
@@ -341,34 +342,55 @@ char*
 JSONOutput(Result* result)
 {
   std::ostringstream out_string;
-  out_string << "[\n";
+  out_string << "{\n";
 
   for (int j=0; j < result->num_transcriptions; ++j) {
     Metadata *metadata = &result->transcriptions[j];
-    std::vector<meta_word> words = WordsFromMetadata(metadata);
 
-    out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+    if (j == 0) {
+      out_string << MetadataOutput(metadata);
 
-    for (int i = 0; i < words.size(); i++) {
-      meta_word w = words[i];
-      out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-      if (i < words.size() - 1) {
-        out_string << ",";
+      if (result->num_transcriptions > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
       }
-    }
+    } else {
+      out_string << "{" << MetadataOutput(metadata) << "}";
 
-    out_string << "]}";
-
-    if (j < result->num_transcriptions - 1) {
-      out_string << ",\n";
+      if (j < result->num_transcriptions - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
     }
   }
 
-  out_string << "\n]\n";
+  out_string << "\n}\n";
 
   return strdup(out_string.str().c_str());
 }
 
+std::string
+MetadataOutput(Metadata *metadata)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = WordsFromMetadata(metadata);
+
+  out_string << R"("metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
 int
 main(int argc, char **argv)
 {
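
[For comparison with the patch-02 array format, the object this version emits nests the runners-up under "alternatives" — again with invented values:]

    {
    "metadata":{"confidence":-20.1},"words":[{"word":"hello","time":0.62,"duration":0.3}],
    "alternatives":[
    {"metadata":{"confidence":-20.9},"words":[{"word":"hallo","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-21.4},"words":[{"word":"hull","time":0.62,"duration":0.28}]}
    ]
    }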
From 69bd0326052717ad7c7a47bb336cd0234c45bb7e Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 25 Feb 2020 12:29:18 +0100
Subject: [PATCH 06/16] Improve API naming around Metadata objects

---
 .../ctcdecode/ctc_beam_search_decoder.cpp |  8 +--
 .../ctcdecode/ctc_beam_search_decoder.h   |  5 +-
 native_client/deepspeech.cc               | 38 ++++------
 native_client/deepspeech.h                | 65 +++++++++----------
 native_client/modelstate.cc               | 43 ++++++------
 native_client/modelstate.h                | 12 ++--
 6 files changed, 75 insertions(+), 96 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 3039d47c..8a072c53 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,14 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  size_t num_returned = std::min(num_prefixes, beam_size_);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h
index a3d5c480..78871b2a 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.h
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.h
@@ -61,12 +61,15 @@ public:
             int class_dim);
 
   /* Get transcription from current decoder state
+   *
+   * Parameters:
+   *     num_results: Number of beams to return.
   *
   * Return:
   *     A vector where each element is a pair of score and decoding result,
   *     in descending order.
   */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index ffc10a13..adaa0445 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.
 
-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Result* finishStreamWithMetadata(unsigned int num_results);
+  Metadata* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -143,7 +143,7 @@ StreamingState::finishStream()
   return model_->decode(decoder_state_);
 }
 
-Result*
+Metadata*
 StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
@@ -411,11 +411,11 @@ DS_FinishStream(StreamingState* aSctx)
   return str;
 }
 
-Result*
+Metadata*
 DS_FinishStreamWithMetadata(StreamingState* aSctx,
                             unsigned int aNumResults)
 {
-  Result* result = aSctx->finishStreamWithMetadata(aNumResults);
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
   return result;
 }
@@ -443,7 +443,7 @@ DS_SpeechToText(ModelState* aCtx,
   return DS_FinishStream(ctx);
 }
 
-Result*
+Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
                             unsigned int aBufferSize,
@@ -463,30 +463,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
-    }
-    delete[] m->items;
-    delete m;
-  }
-}
-
-void
-DS_FreeResult(Result* r)
-{
-  if (r) {
-    for (int i = 0; i < r->num_transcriptions; ++i) {
-      Metadata* m = &r->transcriptions[i];
-
-      for (int j = 0; j < m->num_items; ++j) {
-        free(m->items[j].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free(m->transcripts[i].tokens[j].text);
       }
 
-      delete[] m->items;
+      delete[] m->transcripts[i].tokens;
     }
 
-    delete[] r->transcriptions;
-    delete r;
+    delete[] m->transcripts;
+    delete m;
   }
 }

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 53f1954f..7aee1048 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -20,43 +20,44 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;
 
 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
 */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  char* text;
 
-  /** Position of the character in units of 20ms */
+  /** Position of the token in units of 20ms */
   int timestep;
 
-  /** Position of the character in seconds */
+  /** Position of the token in seconds */
   float start_time;
-} MetadataItem;
+} TokenMetadata;
 
 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
 */
-typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  TokenMetadata* tokens;
+  /** Size of the tokens array */
+  int num_tokens;
   /** Approximated confidence value for this transcription. This is roughly the
   * sum of the acoustic model logit values for each timestep/character that
   * contributed to the creation of this transcription.
  */
   double confidence;
-} Metadata;
+} CandidateTranscript;
 
 /**
- * @brief Stores Metadata structs for each alternative transcription
+ * @brief An array of CandidateTranscript objects computed by the model
 */
-typedef struct Result {
-  /** List of transcriptions */
-  Metadata* transcriptions;
-  /** Size of the list of transcriptions */
-  int num_transcriptions;
-} Result;
+typedef struct Metadata {
+  /** Array of CandidateTranscript objects */
+  CandidateTranscript* transcripts;
+  /** Size of the transcripts array */
+  int num_transcripts;
+} Metadata;
 
 enum DeepSpeech_Error_Codes
 {
@@ -197,16 +198,16 @@ char* DS_SpeechToText(ModelState* aCtx,
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The number of alternative transcriptions to return.
+ * @param aNumResults The number of candidate transcripts to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
-Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                    const short* aBuffer,
-                                    unsigned int aBufferSize,
-                                    unsigned int aNumResults);
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                      const short* aBuffer,
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -266,7 +267,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 * inference, returns per-letter metadata.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
- * @param aNumResults The number of alternative transcriptions to return.
+ * @param aNumResults The number of candidate transcripts to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -274,8 +275,8 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                                    unsigned int aNumResults);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
@@ -295,12 +296,6 @@ void DS_FreeStream(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);
 
-/**
- * @brief Free memory allocated for result information.
- */
-DEEPSPEECH_EXPORT
-void DS_FreeResult(Result* r);
-
 /**
  * @brief Free a char* string returned by the DeepSpeech API.
 */

diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index 5a8afae3..d4f16636 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -36,41 +36,38 @@ ModelState::decode(const DecoderState& state) const
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-Result*
+Metadata*
 ModelState::decode_metadata(const DecoderState& state,
                             size_t num_results)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(num_results);
+  size_t num_returned = out.size();
 
-  size_t max_results = std::min(num_results, out.size());
+  std::unique_ptr<Metadata> metadata(new Metadata);
+  metadata->num_transcripts = num_returned;
 
-  std::unique_ptr<Result> result(new Result());
-  result->num_transcriptions = max_results;
+  std::unique_ptr<CandidateTranscript[]> transcripts(new CandidateTranscript[num_returned]);
 
-  std::unique_ptr<Metadata[]> transcripts(new Metadata[max_results]());
+  for (int i = 0; i < num_returned; ++i) {
+    transcripts[i].num_tokens = out[i].tokens.size();
+    transcripts[i].confidence = out[i].confidence;
 
-  for (int j = 0; j < max_results; ++j) {
-    Metadata* metadata = &transcripts[j];
-    metadata->num_items = out[j].tokens.size();
-    metadata->confidence = out[j].confidence;
+    std::unique_ptr<TokenMetadata[]> tokens(new TokenMetadata[transcripts[i].num_tokens]);
 
-    std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+    // Loop through each token
+    for (int j = 0; j < out[i].tokens.size(); ++j) {
+      tokens[j].text = strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str());
+      tokens[j].timestep = out[i].timesteps[j];
+      tokens[j].start_time = out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_);
 
-    // Loop through each character
-    for (int i = 0; i < out[j].tokens.size(); ++i) {
-      items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
-      items[i].timestep = out[j].timesteps[i];
-      items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
-
-      if (items[i].start_time < 0) {
-        items[i].start_time = 0;
+      if (tokens[j].start_time < 0) {
+        tokens[j].start_time = 0;
       }
     }
 
-    metadata->items = items.release();
+    transcripts[i].tokens = tokens.release();
   }
 
-  result->transcriptions = transcripts.release();
-
-  return result.release();
+  metadata->transcripts = transcripts.release();
+  return metadata.release();
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 8ea7ad99..43eef970 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,14 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
-  * @param num_results Number of alternate results to return.
+  * @param num_results Number of candidate results to return.
   *
-  * @return A Result struct containing Metadata structs.
-  *         Each represents an alternate transcription, with the first ranked most probable.
-  *         The user is responsible for freeing Result by calling DS_FreeResult().
+  * @return A Metadata struct containing CandidateTranscript structs.
+  *         Each represents a candidate transcript, with the first ranked most probable.
+  *         The user is responsible for freeing the Metadata by calling DS_FreeMetadata().
   */
-  virtual Result* decode_metadata(const DecoderState& state,
-                                  size_t num_results);
+  virtual Metadata* decode_metadata(const DecoderState& state,
+                                    size_t num_results);
 };
 
 #endif // MODELSTATE_H
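
[Sketch of the renamed object graph after this patch, not part of the series; ctx, buffer and buffer_size are placeholders. A single Metadata now owns the whole tree.]

    Metadata* m = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    for (int i = 0; m && i < m->num_transcripts; ++i) {
      const CandidateTranscript* t = &m->transcripts[i];
      printf("confidence %f: ", t->confidence);
      for (int j = 0; j < t->num_tokens; ++j) {
        printf("%s", t->tokens[j].text);  // tokens carry text, timestep, start_time
      }
      printf("\n");
    }
    DS_FreeMetadata(m);  /* frees transcripts, tokens, and token text in one call */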
*/ - virtual Result* decode_metadata(const DecoderState& state, - size_t num_results); + virtual Metadata* decode_metadata(const DecoderState& state, + size_t num_results); }; #endif // MODELSTATE_H From ea8c7d2957d93cd7686751ba0860a10f7c5c330d Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 13:38:25 +0100 Subject: [PATCH 07/16] Add DS_IntermediateDecodeWithMetadata --- native_client/deepspeech.cc | 15 ++++++++++++++- native_client/deepspeech.h | 28 ++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index adaa0445..d284a319 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -78,6 +78,7 @@ struct StreamingState { void feedAudioContent(const short* buffer, unsigned int buffer_size); char* intermediateDecode() const; + Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const; void finalizeStream(); char* finishStream(); Metadata* finishStreamWithMetadata(unsigned int num_results); @@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const return model_->decode(decoder_state_); } +Metadata* +StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const +{ + return model_->decode_metadata(decoder_state_, num_results); +} + char* StreamingState::finishStream() { @@ -147,7 +154,6 @@ Metadata* StreamingState::finishStreamWithMetadata(unsigned int num_results) { finalizeStream(); - return model_->decode_metadata(decoder_state_, num_results); } @@ -403,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx) return aSctx->intermediateDecode(); } +Metadata* +DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, + unsigned int aNumResults) +{ + return aSctx->intermediateDecodeWithMetadata(aNumResults); +} + char* DS_FinishStream(StreamingState* aSctx) { diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 7aee1048..8bfee073 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -200,8 +200,10 @@ char* DS_SpeechToText(ModelState* aCtx, * @param aBufferSize The number of samples in the audio signal. * @param aNumResults The number of candidate transcripts to return. * - * @return Outputs a struct of individual letters along with their timing information. - * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. The user is + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * Returns NULL on error. */ DEEPSPEECH_EXPORT Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, @@ -248,6 +250,22 @@ void DS_FeedAudioContent(StreamingState* aSctx, DEEPSPEECH_EXPORT char* DS_IntermediateDecode(const StreamingState* aSctx); +/** + * @brief Compute the intermediate decoding of an ongoing streaming inference, + * returns per-letter metadata. + * + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aNumResults The number of candidate transcripts to return. + * + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. The user is + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * Returns NULL on error. 
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
 /**
  * @brief Signal the end of an audio signal to an ongoing streaming
  *        inference, returns the STT result over the whole audio signal.
@@ -269,8 +287,10 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 * @param aNumResults The number of candidate transcripts to return.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
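
[A hedged sketch of the new intermediate call inside a capture loop, not part of the series. sctx is assumed to come from DS_CreateStream(); chunk, chunk_size and have_more_audio() are hypothetical stand-ins for an audio source.]

    while (have_more_audio()) {
      DS_FeedAudioContent(sctx, chunk, chunk_size);
      Metadata* partial = DS_IntermediateDecodeWithMetadata(sctx, 1);
      if (partial) {
        // inspect partial->transcripts[0] here, e.g. for live captions
        DS_FreeMetadata(partial);  // every intermediate result must be freed
      }
    }
    Metadata* final_result = DS_FinishStreamWithMetadata(sctx, 3);  // also frees sctx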
out_string.str(); +} + +char* +MetadataToJSON(Metadata* result) +{ + std::ostringstream out_string; + out_string << "{\n"; + + for (int j=0; j < result->num_transcripts; ++j) { + CandidateTranscript *transcript = &result->transcripts[j]; + + if (j == 0) { + out_string << CandidateTranscriptToJSON(transcript); + + if (result->num_transcripts > 1) { + out_string << ",\n" << R"("alternatives")" << ":[\n"; + } + } else { + out_string << "{" << CandidateTranscriptToJSON(transcript) << "}"; + + if (j < result->num_transcripts - 1) { + out_string << ",\n"; + } else { + out_string << "\n]"; + } + } + } + + out_string << "\n}\n"; + + return strdup(out_string.str().c_str()); +} ds_result LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, @@ -58,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, clock_t ds_start_time = clock(); if (extended_output) { - Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); - res.string = metadataToString(&result->transcriptions[0]); - DS_FreeResult(result); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); + res.string = CandidateTranscriptToString(&result->transcripts[0]); + DS_FreeMetadata(result); } else if (json_output) { - Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); - res.string = JSONOutput(result); - DS_FreeResult(result); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); + res.string = MetadataToJSON(result); + DS_FreeMetadata(result); } else if (stream_size > 0) { StreamingState* ctx; int status = DS_CreateStream(aCtx, &ctx); @@ -279,119 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times) } } -char* -metadataToString(Metadata* metadata) -{ - std::string retval = ""; - for (int i = 0; i < metadata->num_items; i++) { - MetadataItem item = metadata->items[i]; - retval += item.character; - } - return strdup(retval.c_str()); -} - -std::vector -WordsFromMetadata(Metadata* metadata) -{ - std::vector word_list; - - std::string word = ""; - float word_start_time = 0; - - // Loop through each character - for (int i = 0; i < metadata->num_items; i++) { - MetadataItem item = metadata->items[i]; - - // Append character to word if it's not a space - if (strcmp(item.character, u8" ") != 0) { - // Log the start time of the new word - if (word.length() == 0) { - word_start_time = item.start_time; - } - word.append(item.character); - } - - // Word boundary is either a space or the last character in the array - if (strcmp(item.character, " ") == 0 - || strcmp(item.character, u8" ") == 0 - || i == metadata->num_items-1) { - - float word_duration = item.start_time - word_start_time; - - if (word_duration < 0) { - word_duration = 0; - } - - meta_word w; - w.word = word; - w.start_time = word_start_time; - w.duration = word_duration; - - word_list.push_back(w); - - // Reset - word = ""; - word_start_time = 0; - } - } - - return word_list; -} - -char* -JSONOutput(Result* result) -{ - std::ostringstream out_string; - out_string << "{\n"; - - for (int j=0; j < result->num_transcriptions; ++j) { - Metadata *metadata = &result->transcriptions[j]; - - if (j == 0) { - out_string << MetadataOutput(metadata); - - if (result->num_transcriptions > 1) { - out_string << ",\n" << R"("alternatives")" << ":[\n"; - } - } else { - out_string << "{" << MetadataOutput(metadata) << "}"; - - if (j < result->num_transcriptions - 1) { - out_string << ",\n"; - } else { - out_string << "\n]"; - } - } - } - - 
out_string << "\n}\n"; - - return strdup(out_string.str().c_str()); -} - -std::string -MetadataOutput(Metadata *metadata) -{ - std::ostringstream out_string; - - std::vector words = WordsFromMetadata(metadata); - - out_string << R"("metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)"; - - for (int i = 0; i < words.size(); i++) { - meta_word w = words[i]; - out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}"; - - if (i < words.size() - 1) { - out_string << ","; - } - } - - out_string << "]"; - - return out_string.str(); -} - int main(int argc, char **argv) { From 6e88a37ad4367f1481e29472bf0a299881e96e63 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 12:50:06 +0100 Subject: [PATCH 09/16] Adapt Python bindings to new API --- native_client/python/__init__.py | 65 +++++++++++++++++++----------- native_client/python/client.py | 31 +++++++------- native_client/python/impl.i | 69 ++++++++++++++++++++++++++------ 3 files changed, 116 insertions(+), 49 deletions(-) diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index a6511efe..5d9072ec 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -121,17 +121,20 @@ class Model(object): """ return deepspeech.impl.SpeechToText(self._impl, audio_buffer) - def sttWithMetadata(self, audio_buffer): + def sttWithMetadata(self, audio_buffer, num_results=1): """ Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :type audio_buffer: numpy.int16 array + :param num_results: Number of candidate transcripts to return. + :type num_results: int + :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` """ - return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer) + return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results) def createStream(self): """ @@ -187,6 +190,19 @@ class Stream(object): raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") return deepspeech.impl.IntermediateDecode(self._impl) + def intermediateDecodeWithMetadata(self, num_results=1): + """ + Compute the intermediate decoding of an ongoing streaming inference. + + :return: The STT intermediate result. + :type: str + + :throws: RuntimeError if the stream object is not valid + """ + if not self._impl: + raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") + return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results) + def finishStream(self): """ Signal the end of an audio signal to an ongoing streaming inference, @@ -203,11 +219,14 @@ class Stream(object): self._impl = None return result - def finishStreamWithMetadata(self): + def finishStreamWithMetadata(self, num_results=1): """ Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata. + :param num_results: Number of candidate transcripts to return. + :type num_results: int + :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` @@ -215,7 +234,7 @@ class Stream(object): """ if not self._impl: raise RuntimeError("Stream object is not valid. 
From 6e88a37ad4367f1481e29472bf0a299881e96e63 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 25 Feb 2020 12:50:06 +0100
Subject: [PATCH 09/16] Adapt Python bindings to new API

---
 native_client/python/__init__.py | 65 +++++++++++++++++++-----------
 native_client/python/client.py   | 31 +++++++-------
 native_client/python/impl.i      | 69 ++++++++++++++++++++++++++------
 3 files changed, 116 insertions(+), 49 deletions(-)

diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index a6511efe..5d9072ec 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -121,17 +121,20 @@ class Model(object):
         """
         return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
 
-    def sttWithMetadata(self, audio_buffer):
+    def sttWithMetadata(self, audio_buffer, num_results=1):
         """
         Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
 
         :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type audio_buffer: numpy.int16 array
 
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
         """
-        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
+        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
 
     def createStream(self):
         """
@@ -187,6 +190,19 @@ class Stream(object):
             raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
         return deepspeech.impl.IntermediateDecode(self._impl)
 
+    def intermediateDecodeWithMetadata(self, num_results=1):
+        """
+        Compute the intermediate decoding of an ongoing streaming inference and output metadata about the results.
+
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
+        :return: Metadata object containing the intermediate candidate transcripts.
+        :type: :func:`Metadata`
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
+        return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
+
     def finishStream(self):
         """
         Signal the end of an audio signal to an ongoing streaming inference,
@@ -203,11 +219,14 @@ class Stream(object):
         self._impl = None
         return result
 
-    def finishStreamWithMetadata(self):
+    def finishStreamWithMetadata(self, num_results=1):
         """
         Signal the end of an audio signal to an ongoing streaming inference,
         returns per-letter metadata.
 
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
 
         :throws: RuntimeError if the stream object is not valid
         """
         if not self._impl:
             raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
-        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
+        result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
         self._impl = None
         return result
 
@@ -233,52 +252,43 @@
 
 # This is only for documentation purpose
-# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
-class MetadataItem(object):
+# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
+class TokenMetadata(object):
     """
     Stores each individual character, along with its timing information
     """
 
-    def character(self):
+    def text(self):
         """
-        The character generated for transcription
+        The text for this token
         """
 
     def timestep(self):
         """
-        Position of the character in units of 20ms
+        Position of the token in units of 20ms
         """
 
     def start_time(self):
         """
-        Position of the character in seconds
+        Position of the token in seconds
         """
 
-class Metadata(object):
+class CandidateTranscript(object):
     """
     Stores the entire CTC output as an array of character metadata objects
     """
-    def items(self):
+    def tokens(self):
         """
-        List of items
+        List of tokens
 
-        :return: A list of :func:`MetadataItem` elements
+        :return: A list of :func:`TokenMetadata` elements
         :type: list
         """
 
-    def num_items(self):
-        """
-        Size of the list of items
-
-        :return: Size of the list of items
-        :type: int
-        """
-
     def confidence(self):
         """
         Approximated confidence value for this transcription. This is roughly the
         sum of the acoustic model logit values for each timestep/character that
         contributed to the creation of this transcription.
         """
 
+
+class Metadata(object):
+    def transcripts(self):
+        """
+        List of candidate transcripts
+
+        :return: A list of :func:`CandidateTranscript` objects
+        :type: list
+        """

diff --git a/native_client/python/client.py b/native_client/python/client.py
index 671968b9..00fa2ff6 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -18,6 +18,7 @@ try:
 except ImportError:
     from pipes import quote
 
+
 def convert_samplerate(audio_path, desired_sample_rate):
     sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
     try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):
 
 
 def metadata_to_string(metadata):
-    return ''.join(item.character for item in metadata.items)
+    return ''.join(token.text for token in metadata.tokens)
 
-def words_from_metadata(metadata):
+
+def words_from_candidate_transcript(metadata):
     word = ""
     word_list = []
     word_start_time = 0
     # Loop through each character
-    for i in range(0, metadata.num_items):
-        item = metadata.items[i]
+    for i, token in enumerate(metadata.tokens):
         # Append character to word if it's not a space
-        if item.character != " ":
+        if token.text != " ":
             if len(word) == 0:
                 # Log the start time of the new word
-                word_start_time = item.start_time
+                word_start_time = token.start_time
 
-            word = word + item.character
+            word = word + token.text
         # Word boundary is either a space or the last character in the array
-        if item.character == " " or i == metadata.num_items - 1:
-            word_duration = item.start_time - word_start_time
+        if token.text == " " or i == len(metadata.tokens) - 1:
+            word_duration = token.start_time - word_start_time
 
             if word_duration < 0:
                 word_duration = 0
@@ -69,9 +70,11 @@ def words_from_candidate_transcript(metadata):
 
 def metadata_json_output(metadata):
     json_result = dict()
-    json_result["words"] = words_from_metadata(metadata)
-    json_result["confidence"] = metadata.confidence
-    return json.dumps(json_result)
+    json_result["transcripts"] = [{
+        "confidence": transcript.confidence,
+        "words": words_from_candidate_transcript(transcript),
+    } for transcript in metadata.transcripts]
+    return json.dumps(json_result, indent=2)
 
@@ -141,9 +144,9 @@ def main():
     print('Running inference.', file=sys.stderr)
     inference_start = timer()
     if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio)))
+        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
     elif args.json:
-        print(metadata_json_output(ds.sttWithMetadata(audio)))
+        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
     else:
         print(ds.stt(audio))
     inference_end = timer() - inference_start

diff --git a/native_client/python/impl.i b/native_client/python/impl.i
index d6c7ba19..001a6165 100644
--- a/native_client/python/impl.i
+++ b/native_client/python/impl.i
@@ -38,30 +38,69 @@ import_array();
   %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
 }
 
-%typemap(out) MetadataItem* %{
-  $result = PyList_New(arg1->num_items);
-  for (int i = 0; i < arg1->num_items; ++i) {
-    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
+%fragment("parent_reference_init", "init") {
+  // Thread-safe initialization - initialize during Python module initialization
+  parent_reference();
+}
+
+%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
+
+static PyObject *parent_reference() {
+  static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
+  return parent_reference_string;
+}
+
+}
+
+%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
+  $result = PyList_New(arg1->num_transcripts);
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
+    // Add a reference to Metadata in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
+    PyList_SetItem($result, i, o);
+  }
+%}
+
+%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
+  $result = PyList_New(arg1->num_tokens);
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
+    // Add a reference to CandidateTranscript in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
     PyList_SetItem($result, i, o);
   }
 %}
 
-%extend struct MetadataItem {
+%extend struct TokenMetadata {
 %pythoncode %{
     def __repr__(self):
-        return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
+        return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
+%}
+}
+
+%extend struct CandidateTranscript {
+%pythoncode %{
+    def __repr__(self):
+        tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
+        tokens_repr = '\n'.join('    ' + l for l in tokens_repr.split('\n'))
+        return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
 %}
 }
 
 %extend struct Metadata {
 %pythoncode %{
     def __repr__(self):
-        items_repr = ', \n'.join('    ' + repr(i) for i in self.items)
-        return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
+        transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
+        transcripts_repr = '\n'.join('    ' + l for l in transcripts_repr.split('\n'))
+        return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
 %}
 }
 
-%ignore Metadata::num_items;
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
 
 %extend struct Metadata {
   ~Metadata() {
     DS_FreeMetadata($self);
   }
 }
 
-%nodefaultdtor Metadata;
+%immutable Metadata::transcripts;
+%immutable CandidateTranscript::tokens;
+%immutable TokenMetadata::text;
+
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
 %typemap(newfree) char* "DS_FreeString($1);";
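
[A quick smoke test for the reworked Python API, not part of the series. The model and WAV paths are placeholders, and Model's constructor arguments are elided and may differ by release:]

    import wave
    import numpy as np
    from deepspeech import Model

    ds = Model('deepspeech.pbmm')              # placeholder model path
    with wave.open('audio.wav', 'rb') as fin:  # placeholder 16 kHz mono WAV
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    metadata = ds.sttWithMetadata(audio, num_results=3)
    for transcript in metadata.transcripts:
        text = ''.join(token.text for token in transcript.tokens)
        print(transcript.confidence, text)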
for i in self.transcripts) + transcripts_repr = '\n'.join(' ' + l for l in transcripts_repr.split('\n')) + return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr) %} } -%ignore Metadata::num_items; +%ignore Metadata::num_transcripts; +%ignore CandidateTranscript::num_tokens; %extend struct Metadata { ~Metadata() { @@ -69,10 +108,16 @@ import_array(); } } -%nodefaultdtor Metadata; +%immutable Metadata::transcripts; +%immutable CandidateTranscript::tokens; +%immutable TokenMetadata::text; + %nodefaultctor Metadata; -%nodefaultctor MetadataItem; -%nodefaultdtor MetadataItem; +%nodefaultdtor Metadata; +%nodefaultctor CandidateTranscript; +%nodefaultdtor CandidateTranscript; +%nodefaultctor TokenMetadata; +%nodefaultdtor TokenMetadata; %typemap(newfree) char* "DS_FreeString($1);"; From 09048e2ea23c3e3f3d2f3d6d28c71d8283aca633 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 13:58:29 +0100 Subject: [PATCH 10/16] Adapt JavaScript bindings to new API --- native_client/javascript/client.js | 11 ++-- native_client/javascript/deepspeech.i | 40 ++++++++------ native_client/javascript/index.js | 75 +++++++++++++++++---------- 3 files changed, 77 insertions(+), 49 deletions(-) diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js index abbfe59e..16dd19e8 100644 --- a/native_client/javascript/client.js +++ b/native_client/javascript/client.js @@ -42,12 +42,11 @@ function totalTime(hrtimeValue) { return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4); } -function metadataToString(metadata) { +function candidateTranscriptToString(transcript) { var retval = "" - for (var i = 0; i < metadata.num_items; ++i) { - retval += metadata.items[i].character; + for (var i = 0; i < transcript.tokens.length; ++i) { + retval += transcript.tokens[i].text; } - Ds.FreeMetadata(metadata); return retval; } @@ -117,7 +116,9 @@ audioStream.on('finish', () => { const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate); if (args['extended']) { - console.log(metadataToString(model.sttWithMetadata(audioBuffer))); + let metadata = model.sttWithMetadata(audioBuffer, 1); + console.log(candidateTranscriptToString(metadata.transcripts[0])); + Ds.FreeMetadata(metadata); } else { console.log(model.stt(audioBuffer)); } diff --git a/native_client/javascript/deepspeech.i b/native_client/javascript/deepspeech.i index efbaa360..6b0151a4 100644 --- a/native_client/javascript/deepspeech.i +++ b/native_client/javascript/deepspeech.i @@ -47,8 +47,8 @@ using namespace node; %typemap(argout) ModelState **retval { $result = SWIGV8_ARRAY_NEW(); SWIGV8_AppendOutput($result, SWIG_From_int(result)); - // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below) - %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN)); + // owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves. 
+  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
 }
@@ -68,27 +68,33 @@ using namespace node;
 %nodefaultctor ModelState;
 %nodefaultdtor ModelState;
 
-%typemap(out) MetadataItem* %{
+%typemap(out) TokenMetadata* %{
   $result = SWIGV8_ARRAY_NEW();
-  for (int i = 0; i < arg1->num_items; ++i) {
-    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
   }
 %}
 
-%nodefaultdtor Metadata;
-%nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
-
-%extend struct Metadata {
-  ~Metadata() {
-    DS_FreeMetadata($self);
+%typemap(out) CandidateTranscript* %{
+  $result = SWIGV8_ARRAY_NEW();
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
   }
-}
+%}
 
-%extend struct MetadataItem {
-  ~MetadataItem() { }
-}
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
+
+%immutable Metadata::transcripts;
+%immutable CandidateTranscript::tokens;
+%immutable TokenMetadata::text;
+
+%nodefaultctor Metadata;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
 %rename ("%(strip:[DS_])s") "";
 
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index cca483f1..7a027bde 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -122,8 +122,9 @@ Model.prototype.stt = function(aBuffer) {
  *
  * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
-Model.prototype.sttWithMetadata = function(aBuffer) {
-    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
+Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
 }
 
 /**
@@ -171,6 +172,16 @@ Stream.prototype.intermediateDecode = function() {
     return binding.IntermediateDecode(this._impl);
 }
 
+/**
+ * Compute the intermediate decoding of an ongoing streaming inference.
+ *
+ * @return {string} The STT intermediate result.
+ */
+Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
+}
+
 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
  *
@@ -191,8 +202,9 @@ Stream.prototype.finishStream = function() {
  *
 * This method will free the stream, it must not be used after this method is called.
*/ -Stream.prototype.finishStreamWithMetadata = function() { - result = binding.FinishStreamWithMetadata(this._impl); +Stream.prototype.finishStreamWithMetadata = function(aNumResults) { + aNumResults = aNumResults || 1; + result = binding.FinishStreamWithMetadata(this._impl, aNumResults); this._impl = null; return result; } @@ -236,35 +248,58 @@ function Version() { } -//// Metadata and MetadataItem are here only for documentation purposes +//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes /** * @class * * Stores each individual character, along with its timing information */ -function MetadataItem() {} +function TokenMetadata() {} /** * The character generated for transcription * * @return {string} The character generated */ -MetadataItem.prototype.character = function() {} +TokenMetadata.prototype.text = function() {} /** * Position of the character in units of 20ms * * @return {int} The position of the character */ -MetadataItem.prototype.timestep = function() {}; +TokenMetadata.prototype.timestep = function() {}; /** * Position of the character in seconds * * @return {float} The position of the character */ -MetadataItem.prototype.start_time = function() {}; +TokenMetadata.prototype.start_time = function() {}; + +/** + * @class + * + * Stores the entire CTC output as an array of character metadata objects + */ +function CandidateTranscript () {} + +/** + * List of items + * + * @return {array} List of :js:func:`TokenMetadata` + */ +CandidateTranscript.prototype.items = function() {} + +/** + * Approximated confidence value for this transcription. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcription. + * + * @return {float} Confidence value + */ +CandidateTranscript.prototype.confidence = function() {} /** * @class @@ -276,30 +311,16 @@ function Metadata () {} /** * List of items * - * @return {array} List of :js:func:`MetadataItem` + * @return {array} List of :js:func:`CandidateTranscript` objects */ -Metadata.prototype.items = function() {} +Metadata.prototype.transcripts = function() {} -/** - * Size of the list of items - * - * @return {int} Number of items - */ -Metadata.prototype.num_items = function() {} - -/** - * Approximated confidence value for this transcription. This is roughly the - * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcription. 
- * - * @return {float} Confidence value - */ -Metadata.prototype.confidence = function() {} module.exports = { Model: Model, Metadata: Metadata, - MetadataItem: MetadataItem, + CandidateTranscript: CandidateTranscript, + TokenMetadata: TokenMetadata, Version: Version, FreeModel: FreeModel, FreeStream: FreeStream, From bb709ff9553f513afa20bde601fe03b7539a6759 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 14:18:23 +0100 Subject: [PATCH 11/16] Adapt .NET bindings to new API --- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 21 ++++-- .../DeepSpeechClient/DeepSpeechClient.csproj | 6 +- .../Extensions/NativeExtensions.cs | 69 ++++++++++++++----- .../Interfaces/IDeepSpeech.cs | 15 +++- .../Models/CandidateTranscript.cs | 17 +++++ .../DeepSpeechClient/Models/Metadata.cs | 8 +-- .../{MetadataItem.cs => TokenMetadata.cs} | 4 +- .../dotnet/DeepSpeechClient/NativeImp.cs | 34 +++++---- .../Structs/CandidateTranscript.cs | 22 ++++++ .../DeepSpeechClient/Structs/Metadata.cs | 12 ++-- .../{MetadataItem.cs => TokenMetadata.cs} | 6 +- .../dotnet/DeepSpeechConsole/Program.cs | 14 ++-- 12 files changed, 162 insertions(+), 66 deletions(-) create mode 100644 native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Models/{MetadataItem.cs => TokenMetadata.cs} (89%) create mode 100644 native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Structs/{MetadataItem.cs => TokenMetadata.cs} (80%) diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 576ed308..ce184cf4 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -202,10 +202,11 @@ namespace DeepSpeechClient /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. + /// Number of candidate transcripts to return. /// The extended metadata result. - public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream) + public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { - return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -218,6 +219,17 @@ namespace DeepSpeechClient return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString(); } + /// + /// Computes the intermediate decoding of an ongoing streaming inference. + /// + /// Instance of the stream to decode. + /// Number of candidate transcripts to return. + /// The STT intermediate result. + public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) + { + return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + } + /// /// Return version of this library. The returned version is a semantic version /// (SemVer 2.0.0). @@ -265,10 +277,11 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. + /// Number of candidate transcripts to return. /// The extended metadata. Returns NULL on error. 
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
         }
 
         #endregion
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
index b9077361..0139b3e8 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@@ -50,11 +50,13 @@
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
index 6b7f4c6a..9325f4b8 100644
--- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
+++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
         }
 
         ///
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        ///
+        /// Native pointer.
+        /// TokenMetadata managed object.
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        ///
+        /// Converts a pointer into managed CandidateTranscript object.
+        ///
+        /// Native pointer.
+        /// CandidateTranscript managed object.
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        ///
+        /// Converts a pointer into managed Metadata object.
+        ///
+        /// Native pointer.
+        /// Metadata managed object.
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
         {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
 
             //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
             {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
             }
+            NativeImp.DS_FreeMetadata(intPtr);
 
-            return managedMetaObject;
+            return managedMetadata;
         }
     }
 }
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index 18677abc..ae3e72cf 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
         ///
         /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
+        /// Number of candidate transcripts to return.
         /// The extended metadata. Returns NULL on error.
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-            uint aBufferSize);
+            uint aBufferSize,
+            uint aNumResults);
 
         ///
         /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
         /// The STT intermediate result.
         unsafe string IntermediateDecode(DeepSpeechStream stream);
 
+        ///
+        /// Computes the intermediate decoding of an ongoing streaming inference.
+        ///
+        /// Instance of the stream to decode.
+        /// Number of candidate transcripts to return.
+        /// The extended metadata result.
+        unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
         ///
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         ///
@@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         ///
         /// Instance of the stream to finish.
+        /// Number of candidate transcripts to return.
 /// The extended metadata result.
-        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
 }
diff --git a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
new file mode 100644
index 00000000..cc6b5d28
--- /dev/null
+++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    ///
+    /// Stores the entire CTC output as an array of character metadata objects.
+    ///
+    public class CandidateTranscript
+    {
+        ///
+        /// Approximated confidence value for this transcription.
+        ///
+        public double Confidence { get; set; }
+        ///
+        /// List of metadata tokens containing text, timestep, and time offset.
+        ///
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
index 870eb162..fb6c613d 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
@@ -6,12 +6,8 @@
     public class Metadata
     {
         ///
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
         ///
-        public double Confidence { get; set; }
-        ///
-        /// List of metada items containing char, timespet, and time offset.
-        ///
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
     }
 }
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
similarity index 89%
rename from native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
rename to native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
index e329c6cb..5f2dea56 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
@@ -3,12 +3,12 @@
     ///
     /// Stores each individual character, along with its timing information.
     ///
-    public class MetadataItem
+    public class TokenMetadata
     {
         ///
         /// Char of the current timestep.
         ///
-        public string Character;
+        public string Text;
         ///
 /// Position of the character in units of 20ms.
/// diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 6c3494b6..eabbfe48 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -17,45 +17,46 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - ref IntPtr** pint); + ref IntPtr** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx, - uint aBeamWidth); + uint aBeamWidth); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - uint aBeamWidth, - ref IntPtr** pint); + uint aBeamWidth, + ref IntPtr** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, - string aScorerPath); + string aScorerPath); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, - float aAlpha, - float aBeta); + float aAlpha, + float aBeta); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize, + uint aNumResults); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeModel(IntPtr** aCtx); @@ -76,18 +77,23 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx, + uint aNumResults); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx); + internal static unsafe extern 
IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx, + uint aNumResults); #endregion } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs new file mode 100644 index 00000000..54581f6f --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs @@ -0,0 +1,22 @@ +using System; +using System.Runtime.InteropServices; + +namespace DeepSpeechClient.Structs +{ + [StructLayout(LayoutKind.Sequential)] + internal unsafe struct CandidateTranscript + { + /// + /// Native list of tokens. + /// + internal unsafe IntPtr tokens; + /// + /// Count of tokens from the native side. + /// + internal unsafe int num_tokens; + /// + /// Approximated confidence value for this transcription. + /// + internal unsafe double confidence; + } +} diff --git a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs index 411da9f2..0a9beddc 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs @@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs internal unsafe struct Metadata { /// - /// Native list of items. + /// Native list of candidate transcripts. /// - internal unsafe IntPtr items; + internal unsafe IntPtr transcripts; /// - /// Count of items from the native side. + /// Count of transcripts from the native side. /// - internal unsafe int num_items; - /// - /// Approximated confidence value for this transcription. - /// - internal unsafe double confidence; + internal unsafe int num_transcripts; } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs similarity index 80% rename from native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs rename to native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs index 10092742..1c660c71 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs @@ -4,12 +4,12 @@ using System.Runtime.InteropServices; namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] - internal unsafe struct MetadataItem + internal unsafe struct TokenMetadata { /// - /// Native character. + /// Native text. /// - internal unsafe IntPtr character; + internal unsafe IntPtr text; /// /// Position of the character in units of 20ms. 
/// diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index b35c7046..a08e44b6 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -21,14 +21,14 @@ namespace CSharpExamples static string GetArgument(IEnumerable args, string option) => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault(); - static string MetadataToString(Metadata meta) + static string MetadataToString(CandidateTranscript transcript) { var nl = Environment.NewLine; string retval = - Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}" - + $"Confidence: {meta?.Confidence} {nl}" - + $"Item count: {meta?.Items?.Length} {nl}" - + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}")); + Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}" + + $"Confidence: {transcript?.Confidence} {nl}" + + $"Item count: {transcript?.Tokens?.Length} {nl}" + + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}")); return retval; } @@ -75,8 +75,8 @@ namespace CSharpExamples if (extended) { Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, - Convert.ToUInt32(waveBuffer.MaxSize / 2)); - speechResult = MetadataToString(metaResult); + Convert.ToUInt32(waveBuffer.MaxSize / 2), 1); + speechResult = MetadataToString(metaResult.Transcripts[0]); } else { From c52f3b32fa3c7001151beedc2ac77a40294c3c41 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 14:29:49 +0100 Subject: [PATCH 12/16] Adapt Java bindings to new API --- native_client/java/jni/deepspeech.i | 34 ++++++++++++++----- .../libdeepspeech/test/BasicTest.java | 10 +++--- .../libdeepspeech/DeepSpeechModel.java | 22 +++++++++--- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/native_client/java/jni/deepspeech.i b/native_client/java/jni/deepspeech.i index ded18439..4bbdc776 100644 --- a/native_client/java/jni/deepspeech.i +++ b/native_client/java/jni/deepspeech.i @@ -18,18 +18,32 @@ %typemap(newfree) char* "DS_FreeString($1);"; %include "carrays.i" -%array_functions(struct MetadataItem, metadataItem_array); +%array_functions(struct TokenMetadata, TokenMetadata_array); +%array_functions(struct CandidateTranscript, CandidateTranscript_array); + +%extend struct CandidateTranscript { + /** + * Retrieve one TokenMetadata element + * + * @param i Array index of the TokenMetadata to get + * + * @return The TokenMetadata requested or null + */ + TokenMetadata getToken(int i) { + return TokenMetadata_array_getitem(self->tokens, i); + } +} %extend struct Metadata { /** - * Retrieve one MetadataItem element + * Retrieve one CandidateTranscript element * - * @param i Array index of the MetadataItem to get + * @param i Array index of the CandidateTranscript to get * - * @return The MetadataItem requested or null + * @return The CandidateTranscript requested or null */ - MetadataItem getItem(int i) { - return metadataItem_array_getitem(self->items, i); + CandidateTranscript getTranscript(int i) { + return CandidateTranscript_array_getitem(self->transcripts, i); } ~Metadata() { @@ -37,10 +51,12 @@ } } -%nodefaultdtor Metadata; %nodefaultctor Metadata; -%nodefaultctor MetadataItem; -%nodefaultdtor MetadataItem; +%nodefaultdtor Metadata; +%nodefaultctor CandidateTranscript; 
+%nodefaultdtor CandidateTranscript; +%nodefaultctor TokenMetadata; +%nodefaultdtor TokenMetadata; %newobject DS_SpeechToText; %newobject DS_IntermediateDecode; diff --git a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java index 2957b2e7..f7eccf00 100644 --- a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java +++ b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java @@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters; import static org.junit.Assert.*; import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel; -import org.mozilla.deepspeech.libdeepspeech.Metadata; +import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript; import java.io.RandomAccessFile; import java.io.FileNotFoundException; @@ -61,10 +61,10 @@ public class BasicTest { m.freeModel(); } - private String metadataToString(Metadata m) { + private String candidateTranscriptToString(CandidateTranscript t) { String retval = ""; - for (int i = 0; i < m.getNum_items(); ++i) { - retval += m.getItem(i).getCharacter(); + for (int i = 0; i < t.getNum_tokens(); ++i) { + retval += t.getToken(i).getText(); } return retval; } @@ -97,7 +97,7 @@ public class BasicTest { ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts); if (extendedMetadata) { - return metadataToString(m.sttWithMetadata(shorts, shorts.length)); + return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0)); } else { return m.stt(shorts, shorts.length); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index 6d0a316b..b506b1d3 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -117,11 +117,12 @@ public class DeepSpeechModel { * @param buffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param buffer_size The number of samples in the audio signal. + * @param num_results Number of candidate transcripts to return. * * @return Outputs a Metadata object of individual letters along with their timing information. */ - public Metadata sttWithMetadata(short[] buffer, int buffer_size) { - return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size); + public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) { + return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results); } /** @@ -160,6 +161,18 @@ public class DeepSpeechModel { return impl.IntermediateDecode(ctx.get()); } + /** + * @brief Compute the intermediate decoding of an ongoing streaming inference. + * + * @param ctx A streaming state pointer returned by createStream(). + * @param num_results Number of candidate transcripts to return. + * + * @return The STT intermediate result. 
+ */ + public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) { + return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results); + } + /** * @brief Signal the end of an audio signal to an ongoing streaming * inference, returns the STT result over the whole audio signal. @@ -179,12 +192,13 @@ public class DeepSpeechModel { * inference, returns per-letter metadata. * * @param ctx A streaming state pointer returned by createStream(). + * @param num_results Number of candidate transcripts to return. * * @return Outputs a Metadata object of individual letters along with their timing information. * * @note This method will free the state pointer (@p ctx). */ - public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) { - return impl.FinishStreamWithMetadata(ctx.get()); + public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) { + return impl.FinishStreamWithMetadata(ctx.get(), num_results); } } From e9ae38bf4789b9a2f62520c622c1eba1af656a9c Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 15:43:36 +0100 Subject: [PATCH 13/16] Update docs --- doc/C-API.rst | 3 + doc/DotNet-API.rst | 19 ++-- doc/NodeJS-API.rst | 12 ++- doc/Python-API.rst | 12 ++- native_client/deepspeech.h | 27 +++--- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 12 +-- .../Interfaces/IDeepSpeech.cs | 12 +-- .../libdeepspeech/DeepSpeechModel.java | 21 ++-- .../CandidateTranscript.java | 96 +++++++++++++++++++ .../libdeepspeech_doc/Metadata.java | 62 +++++------- .../libdeepspeech_doc/TokenMetadata.java | 79 +++++++++++++++ native_client/javascript/index.js | 49 +++++----- native_client/python/__init__.py | 30 +++--- 13 files changed, 314 insertions(+), 120 deletions(-) create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java diff --git a/doc/C-API.rst b/doc/C-API.rst index 2506d9b2..2b0e7e05 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -34,6 +34,9 @@ C .. doxygenfunction:: DS_IntermediateDecode :project: deepspeech-c +.. doxygenfunction:: DS_IntermediateDecodeWithMetadata + :project: deepspeech-c + .. doxygenfunction:: DS_FinishStream :project: deepspeech-c diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 2ba3415f..d43c7afb 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -31,13 +31,20 @@ ErrorCodes Metadata -------- -.. doxygenstruct:: DeepSpeechClient::Structs::Metadata +.. doxygenstruct:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet - :members: items, num_items, confidence + :members: Transcripts -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem +.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet - :members: character, timestep, start_time + :members: Tokens, Confidence + +TokenMetadata +------------- + +.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata + :project: deepspeech-dotnet + :members: Text, Timestep, StartTime diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst index aaba718c..b6170b5b 100644 --- a/doc/NodeJS-API.rst +++ b/doc/NodeJS-API.rst @@ -30,8 +30,14 @@ Metadata .. js:autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. js:autoclass:: MetadataItem +.. 
js:autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. js:autoclass:: TokenMetadata :members: diff --git a/doc/Python-API.rst b/doc/Python-API.rst index b2b3567f..9aec57f0 100644 --- a/doc/Python-API.rst +++ b/doc/Python-API.rst @@ -21,8 +21,14 @@ Metadata .. autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. autoclass:: MetadataItem +.. autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. autoclass:: TokenMetadata :members: diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 8bfee073..bf4c0f00 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -42,20 +42,20 @@ typedef struct CandidateTranscript { TokenMetadata* tokens; /** Size of the tokens array */ int num_tokens; - /** Approximated confidence value for this transcription. This is roughly the + /** Approximated confidence value for this transcript. This is roughly the * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcription. + * contributed to the creation of this transcript. */ double confidence; } CandidateTranscript; /** - * @brief An array of CandidateTranscript objects computed by the model + * @brief An array of CandidateTranscript objects computed by the model. */ typedef struct Metadata { /** Array of CandidateTranscript objects */ CandidateTranscript* transcripts; - /** Size of the transcriptions array */ + /** Size of the transcripts array */ int num_transcripts; } Metadata; @@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * @brief Use the DeepSpeech model to perform Speech-To-Text and output results + * including metadata. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The number of candidate transcripts to return. + * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is @@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx); /** * @brief Compute the intermediate decoding of an ongoing streaming inference, - * returns per-letter metadata. + * return results including metadata. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, unsigned int aNumResults); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. 
* @@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT char* DS_FinishStream(StreamingState* aSctx); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * results including metadata. Signals the end of an ongoing streaming + * inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx); * @note This method will free the state pointer (@p aSctx). */ DEEPSPEECH_EXPORT -Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, +Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, unsigned int aNumResults); /** diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index ce184cf4..3340c9b3 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -199,10 +199,10 @@ namespace DeepSpeechClient } /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -220,10 +220,10 @@ namespace DeepSpeechClient } /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The STT intermediate result. public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -273,11 +273,11 @@ namespace DeepSpeechClient } /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index ae3e72cf..37d6ce59 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces uint aBufferSize); /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. 
/// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, @@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces unsafe string IntermediateDecode(DeepSpeechStream stream); /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); @@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces unsafe string FinishStream(DeepSpeechStream stream); /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index b506b1d3..a5b339b3 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -117,9 +117,10 @@ public class DeepSpeechModel { * @param buffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param buffer_size The number of samples in the audio signal. - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. */ public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) { return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results); @@ -165,7 +166,7 @@ public class DeepSpeechModel { * @brief Compute the intermediate decoding of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * * @return The STT intermediate result. 
*/ @@ -174,8 +175,8 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). * @@ -188,13 +189,15 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the results including metadata. Signals the end of an ongoing streaming + * inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. * * @note This method will free the state pointer (@p ctx). */ diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java new file mode 100644 index 00000000..c02b39ad --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. + */ +public class CandidateTranscript { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected CandidateTranscript(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(CandidateTranscript obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * Array of TokenMetadata objects + */ + public void setTokens(TokenMetadata value) { + implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value); + } + + /** + * Array of TokenMetadata objects + */ + public TokenMetadata getTokens() { + long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this); + return (cPtr == 0) ? 
null : new TokenMetadata(cPtr, false); + } + + /** + * Size of the tokens array + */ + public void setNum_tokens(int value) { + implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value); + } + + /** + * Size of the tokens array + */ + public int getNum_tokens() { + return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public void setConfidence(double value) { + implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public double getConfidence() { + return implJNI.CandidateTranscript_confidence_get(swigCPtr, this); + } + + /** + * Retrieve one TokenMetadata element + * + * @param i Array index of the TokenMetadata to get + * + * @return The TokenMetadata requested or null + */ + public TokenMetadata getToken(int i) { + return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true); + } + +} diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java index 482b7c58..bb9b0773 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 4.0.2 + * Version 4.0.1 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. @@ -9,7 +9,7 @@ package org.mozilla.deepspeech.libdeepspeech; /** - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ public class Metadata { private transient long swigCPtr; @@ -40,61 +40,43 @@ public class Metadata { } /** - * List of items + * Array of CandidateTranscript objects */ - public void setItems(MetadataItem value) { - implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value); + public void setTranscripts(CandidateTranscript value) { + implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value); } /** - * List of items + * Array of CandidateTranscript objects */ - public MetadataItem getItems() { - long cPtr = implJNI.Metadata_items_get(swigCPtr, this); - return (cPtr == 0) ? null : new MetadataItem(cPtr, false); + public CandidateTranscript getTranscripts() { + long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this); + return (cPtr == 0) ? 
null : new CandidateTranscript(cPtr, false); } /** - * Size of the list of items + * Size of the transcripts array */ - public void setNum_items(int value) { - implJNI.Metadata_num_items_set(swigCPtr, this, value); + public void setNum_transcripts(int value) { + implJNI.Metadata_num_transcripts_set(swigCPtr, this, value); } /** - * Size of the list of items + * Size of the transcripts array */ - public int getNum_items() { - return implJNI.Metadata_num_items_get(swigCPtr, this); + public int getNum_transcripts() { + return implJNI.Metadata_num_transcripts_get(swigCPtr, this); } /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. + * Retrieve one CandidateTranscript element + * + * @param i Array index of the CandidateTranscript to get + * + * @return The CandidateTranscript requested or null */ - public void setConfidence(double value) { - implJNI.Metadata_confidence_set(swigCPtr, this, value); - } - - /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. - */ - public double getConfidence() { - return implJNI.Metadata_confidence_get(swigCPtr, this); - } - - /** - * Retrieve one MetadataItem element
- *
- * @param i Array index of the MetadataItem to get
- *
- * @return The MetadataItem requested or null - */ - public MetadataItem getItem(int i) { - return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true); + public CandidateTranscript getTranscript(int i) { + return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java new file mode 100644 index 00000000..32246f1a --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * Stores text of an individual token, along with its timing information + */ +public class TokenMetadata { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected TokenMetadata(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(TokenMetadata obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * The text corresponding to this token + */ + public void setText(String value) { + implJNI.TokenMetadata_text_set(swigCPtr, this, value); + } + + /** + * The text corresponding to this token + */ + public String getText() { + return implJNI.TokenMetadata_text_get(swigCPtr, this); + } + + /** + * Position of the token in units of 20ms + */ + public void setTimestep(int value) { + implJNI.TokenMetadata_timestep_set(swigCPtr, this, value); + } + + /** + * Position of the token in units of 20ms + */ + public int getTimestep() { + return implJNI.TokenMetadata_timestep_get(swigCPtr, this); + } + + /** + * Position of the token in seconds + */ + public void setStart_time(float value) { + implJNI.TokenMetadata_start_time_set(swigCPtr, this, value); + } + + /** + * Position of the token in seconds + */ + public float getStart_time() { + return implJNI.TokenMetadata_start_time_get(swigCPtr, this); + } + +} diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js index 7a027bde..6ce06c0d 100644 --- a/native_client/javascript/index.js +++ b/native_client/javascript/index.js @@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) { } /** - * Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata. * * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified. 
 *
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
     aNumResults = aNumResults || 1;
@@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
 }
 
 /**
- * Compute the intermediate decoding of an ongoing streaming inference.
+ * Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
  *
- * @return {string} The STT intermediate result.
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
     aNumResults = aNumResults || 1;
@@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
  *
  * @return {string} The STT result.
  *
@@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
  *
  * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
* @@ -253,48 +257,49 @@ function Version() { /** * @class * - * Stores each individual character, along with its timing information + * Stores text of an individual token, along with its timing information */ function TokenMetadata() {} /** - * The character generated for transcription + * The text corresponding to this token * - * @return {string} The character generated + * @return {string} The text generated */ TokenMetadata.prototype.text = function() {} /** - * Position of the character in units of 20ms + * Position of the token in units of 20ms * - * @return {int} The position of the character + * @return {int} The position of the token */ TokenMetadata.prototype.timestep = function() {}; /** - * Position of the character in seconds + * Position of the token in seconds * - * @return {float} The position of the character + * @return {float} The position of the token */ TokenMetadata.prototype.start_time = function() {}; /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. */ function CandidateTranscript () {} /** - * List of items + * Array of tokens * - * @return {array} List of :js:func:`TokenMetadata` + * @return {array} Array of :js:func:`TokenMetadata` */ -CandidateTranscript.prototype.items = function() {} +CandidateTranscript.prototype.tokens = function() {} /** * Approximated confidence value for this transcription. This is roughly the - * sum of the acoustic model logit values for each timestep/character that + * sum of the acoustic model logit values for each timestep/token that * contributed to the creation of this transcription. * * @return {float} Confidence value @@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {} /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ function Metadata () {} /** - * List of items + * Array of transcripts * - * @return {array} List of :js:func:`CandidateTranscript` objects + * @return {array} Array of :js:func:`CandidateTranscript` objects */ Metadata.prototype.transcripts = function() {} diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index 5d9072ec..a44cf05f 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -123,15 +123,15 @@ class Model(object): def sttWithMetadata(self, audio_buffer, num_results=1): """ - Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. + Use the DeepSpeech model to perform Speech-To-Text and return results including metadata. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :type audio_buffer: numpy.int16 array - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. 
:type: :func:`Metadata` """ return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results) @@ -192,10 +192,13 @@ class Stream(object): def intermediateDecodeWithMetadata(self, num_results=1): """ - Compute the intermediate decoding of an ongoing streaming inference. + Compute the intermediate decoding of an ongoing streaming inference and return results including metadata. - :return: The STT intermediate result. - :type: str + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. + :type num_results: int + + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. + :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid """ @@ -205,8 +208,9 @@ class Stream(object): def finishStream(self): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns the STT result over the whole audio signal. + Compute the final decoding of an ongoing streaming inference and return + the result. Signals the end of an ongoing streaming inference. The underlying + stream object must not be used after this method is called. :return: The STT result. :type: str @@ -221,13 +225,15 @@ class Stream(object): def finishStreamWithMetadata(self, num_results=1): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns per-letter metadata. + Compute the final decoding of an ongoing streaming inference and return + results including metadata. Signals the end of an ongoing streaming + inference. The underlying stream object must not be used after this + method is called. - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid From 2ec34d5a067334a84b323328c149bd9752008059 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 17 Mar 2020 14:47:18 +0100 Subject: [PATCH 14/16] Address review comments --- doc/DotNet-API.rst | 6 ++-- doc/Java-API.rst | 16 +++++++--- doc/Structs.rst | 13 ++++++-- doc/doxygen-dotnet.conf | 2 +- native_client/args.h | 32 ++++++++++++------- native_client/client.cc | 6 ++-- .../ctcdecode/ctc_beam_search_decoder.h | 2 +- native_client/deepspeech.h | 18 +++++------ native_client/modelstate.h | 2 +- 9 files changed, 59 insertions(+), 38 deletions(-) diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index d43c7afb..b4f85dfc 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -31,20 +31,20 @@ ErrorCodes Metadata -------- -.. doxygenstruct:: DeepSpeechClient::Models::Metadata +.. doxygenclass:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet :members: Transcripts CandidateTranscript ------------------- -.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript +.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet :members: Tokens, Confidence TokenMetadata ------------- -.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata +.. 
doxygenclass:: DeepSpeechClient::Models::TokenMetadata :project: deepspeech-dotnet :members: Text, Timestep, StartTime diff --git a/doc/Java-API.rst b/doc/Java-API.rst index a485dc02..2986ca97 100644 --- a/doc/Java-API.rst +++ b/doc/Java-API.rst @@ -13,11 +13,17 @@ Metadata .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata :project: deepspeech-java - :members: getItems, getNum_items, getProbability, getItem + :members: getTranscripts, getNum_transcripts, getTranscript -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript :project: deepspeech-java - :members: getCharacter, getTimestep, getStart_time + :members: getTokens, getNum_tokens, getConfidence, getToken + +TokenMetadata +------------- +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata + :project: deepspeech-java + :members: getText, getTimestep, getStart_time diff --git a/doc/Structs.rst b/doc/Structs.rst index 713e52e0..5d532277 100644 --- a/doc/Structs.rst +++ b/doc/Structs.rst @@ -8,9 +8,16 @@ Metadata :project: deepspeech-c :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenstruct:: MetadataItem +.. doxygenstruct:: CandidateTranscript + :project: deepspeech-c + :members: + +TokenMetadata +------------- + +.. doxygenstruct:: TokenMetadata :project: deepspeech-c :members: diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf index ad64cfcb..74c2c5bb 100644 --- a/doc/doxygen-dotnet.conf +++ b/doc/doxygen-dotnet.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/ +INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses
diff --git a/native_client/args.h b/native_client/args.h
index 33b9b8fe..ca28bfb7 100644
--- a/native_client/args.h
+++ b/native_client/args.h
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
     "\n"
     "Running DeepSpeech inference.\n"
     "\n"
-    "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-    "\t--scorer SCORER\t\tPath to the external scorer file\n"
-    "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-    "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-    "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-    "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-    "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-    "\t--extended\t\tOutput string from extended metadata\n"
-    "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-    "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-    "\t--help\t\t\tShow help\n"
-    "\t--version\t\tPrint version and exits\n";
+    "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+    "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+    "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+    "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+    "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+    "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+    "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+    "\t--extended\t\t\tOutput string from extended metadata\n"
+    "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+    "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+    "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+    "\t--help\t\t\t\tShow help\n"
+    "\t--version\t\t\tPrint version and exit\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
diff --git a/native_client/client.cc b/native_client/client.cc
index 9ab47f27..f108419b 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
     retval += token.text;
   }
   return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 {
   std::vector<meta_word> word_list;
 
   // Loop through each token
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
 
     // Append token to word if it's not a space
     if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState*
aCtx, const short* aBuffer, size_t aBufferSize, res.string = CandidateTranscriptToString(&result->transcripts[0]); DS_FreeMetadata(result); } else if (json_output) { - Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); res.string = MetadataToJSON(result); DS_FreeMetadata(result); } else if (stream_size > 0) { diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h index 78871b2a..b785e097 100644 --- a/native_client/ctcdecode/ctc_beam_search_decoder.h +++ b/native_client/ctcdecode/ctc_beam_search_decoder.h @@ -60,7 +60,7 @@ public: int time_dim, int class_dim); - /* Get transcription from current decoder state + /* Get up to num_results transcriptions from current decoder state. * * Parameters: * num_results: Number of beams to return. diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index bf4c0f00..6fb9645c 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -27,7 +27,7 @@ typedef struct TokenMetadata { char* text; /** Position of the token in units of 20ms */ - int timestep; + unsigned int timestep; /** Position of the token in seconds */ float start_time; @@ -41,7 +41,7 @@ typedef struct CandidateTranscript { /** Array of TokenMetadata objects */ TokenMetadata* tokens; /** Size of the tokens array */ - int num_tokens; + unsigned int num_tokens; /** Approximated confidence value for this transcript. This is roughly the * sum of the acoustic model logit values for each timestep/character that * contributed to the creation of this transcript. @@ -56,7 +56,7 @@ typedef struct Metadata { /** Array of CandidateTranscript objects */ CandidateTranscript* transcripts; /** Size of the transcripts array */ - int num_transcripts; + unsigned int num_transcripts; } Metadata; enum DeepSpeech_Error_Codes @@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx, float aBeta); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text. + * @brief Use the DeepSpeech model to convert speech to text. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate @@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output results + * @brief Use the DeepSpeech model to convert speech to text and output results * including metadata. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. + * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this. * - * @return Metadata struct containing multiple candidate transcripts. Each transcript - * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * @return Metadata struct containing multiple CandidateTranscript structs. Each + * transcript has per-token metadata including timing information. The + * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. 
 * Returns NULL on error.
 */
DEEPSPEECH_EXPORT
diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 43eef970..0dbe108a 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,7 +66,7 @@ struct ModelState {
    * @brief Return character-level metadata including letter timings.
    *
    * @param state Decoder state to use when decoding.
-   * @param num_results Number of candidate results to return.
+   * @param num_results Maximum number of candidate results to return.
    *
    * @return A Metadata struct containing CandidateTranscript structs.
    *         Each represents a candidate transcript, with the first ranked most probable.

From 1547498e82c3ad1a0c648a93c62a4b2091074c45 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Wed, 18 Mar 2020 19:11:58 +0100
Subject: [PATCH 15/16] Const members in structs

---
 native_client/client.cc               |  8 ++---
 native_client/deepspeech.cc           |  8 ++---
 native_client/deepspeech.h            | 16 +++++-----
 native_client/javascript/deepspeech.i |  4 ---
 native_client/modelstate.cc           | 42 ++++++++++++++-------------
 native_client/python/impl.i           |  4 ---
 6 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index f108419b..1f7f78eb 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -45,7 +45,7 @@ struct meta_word {
 };
 
 char*
-CandidateTranscriptToString(CandidateTranscript* transcript)
+CandidateTranscriptToString(const CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
@@ -56,7 +56,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 }
 
 std::vector<meta_word>
-CandidateTranscriptToWords(CandidateTranscript* transcript)
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
 {
   std::vector<meta_word> word_list;
 
@@ -101,7 +101,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 }
 
 std::string
-CandidateTranscriptToJSON(CandidateTranscript *transcript)
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
 {
   std::ostringstream out_string;
 
@@ -130,7 +130,7 @@ MetadataToJSON(Metadata* result)
   out_string << "{\n";
 
   for (int j=0; j < result->num_transcripts; ++j) {
-    CandidateTranscript *transcript = &result->transcripts[j];
+    const CandidateTranscript *transcript = &result->transcripts[j];
 
     if (j == 0) {
       out_string << CandidateTranscriptToJSON(transcript);
diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index d284a319..96989e04 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -478,14 +478,14 @@ DS_FreeMetadata(Metadata* m)
   if (m) {
     for (int i = 0; i < m->num_transcripts; ++i) {
       for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
-        free(m->transcripts[i].tokens[j].text);
+        free((void*)m->transcripts[i].tokens[j].text);
       }
 
-      delete[] m->transcripts[i].tokens;
+      free((void*)m->transcripts[i].tokens);
     }
 
-    delete[] m->transcripts;
-    delete m;
+    free((void*)m->transcripts);
+    free(m);
   }
 }
 
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 6fb9645c..a8c29c93 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -24,13 +24,13 @@ typedef struct StreamingState StreamingState;
  */
 typedef struct TokenMetadata {
   /** The text corresponding to this token */
-  char* text;
+  const char* const text;
 
   /** Position of the token in units of 20ms */
-  unsigned int timestep;
+  const unsigned int timestep;
 
   /** Position of the token in seconds */
-  float start_time;
+  const float start_time;
 } TokenMetadata;
 
 /**
@@ -39,14 +39,14 @@ typedef struct TokenMetadata {
  */
 typedef struct CandidateTranscript {
   /** Array of TokenMetadata objects */
-  TokenMetadata* tokens;
+  const TokenMetadata* const tokens;
   /** Size of the tokens array */
-  unsigned int num_tokens;
+  const unsigned int num_tokens;
   /** Approximated confidence value for this transcript. This is roughly the
    * sum of the acoustic model logit values for each timestep/character that
    * contributed to the creation of this transcript.
    */
-  double confidence;
+  const double confidence;
 } CandidateTranscript;
 
 /**
@@ -54,9 +54,9 @@ typedef struct CandidateTranscript {
  */
 typedef struct Metadata {
   /** Array of CandidateTranscript objects */
-  CandidateTranscript* transcripts;
+  const CandidateTranscript* const transcripts;
   /** Size of the transcripts array */
-  unsigned int num_transcripts;
+  const unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
diff --git a/native_client/javascript/deepspeech.i b/native_client/javascript/deepspeech.i
index 6b0151a4..cb3968c2 100644
--- a/native_client/javascript/deepspeech.i
+++ b/native_client/javascript/deepspeech.i
@@ -85,10 +85,6 @@ using namespace node;
 %ignore Metadata::num_transcripts;
 %ignore CandidateTranscript::num_tokens;
 
-%immutable Metadata::transcripts;
-%immutable CandidateTranscripts::tokens;
-%immutable TokenMetadata::text;
-
 %nodefaultctor Metadata;
 %nodefaultdtor Metadata;
 %nodefaultctor CandidateTranscript;
diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index d4f16636..3cb06ac2 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -41,33 +41,35 @@ ModelState::decode_metadata(const DecoderState& state,
                             size_t num_results)
 {
   vector<Output> out = state.decode(num_results);
-  size_t num_returned = out.size();
+  unsigned int num_returned = out.size();
 
-  std::unique_ptr<Metadata> metadata(new Metadata);
-  metadata->num_transcripts = num_returned;
-
-  std::unique_ptr<CandidateTranscript[]> transcripts(new CandidateTranscript[num_returned]);
+  CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);
 
   for (int i = 0; i < num_returned; ++i) {
-    transcripts[i].num_tokens = out[i].tokens.size();
-    transcripts[i].confidence = out[i].confidence;
+    TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());
 
-    std::unique_ptr<TokenMetadata[]> tokens(new TokenMetadata[transcripts[i].num_tokens]);
-
-    // Loop through each token
     for (int j = 0; j < out[i].tokens.size(); ++j) {
-      tokens[j].text = strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str());
-      tokens[j].timestep = out[i].timesteps[j];
-      tokens[j].start_time = out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_);
-
-      if (tokens[j].start_time < 0) {
-        tokens[j].start_time = 0;
-      }
+      TokenMetadata token {
+        strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
+        static_cast<unsigned int>(out[i].timesteps[j]), // timestep
+        out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
+      };
+      memcpy(&tokens[j], &token, sizeof(TokenMetadata));
     }
 
-    transcripts[i].tokens = tokens.release();
+    CandidateTranscript transcript {
+      tokens, // tokens
+      static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
+      out[i].confidence, // confidence
+    };
+    memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
   }
 
-  metadata->transcripts = transcripts.release();
-  return metadata.release();
+  Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
+  Metadata metadata {
+    transcripts, // transcripts
+    num_returned, // num_transcripts
+  };
+  memcpy(ret, &metadata, sizeof(Metadata));
+
return ret; } diff --git a/native_client/python/impl.i b/native_client/python/impl.i index 001a6165..259a5b5d 100644 --- a/native_client/python/impl.i +++ b/native_client/python/impl.i @@ -108,10 +108,6 @@ static PyObject *parent_reference() { } } -%immutable Metadata::transcripts; -%immutable CandidateTranscript::tokens; -%immutable TokenMetadata::text; - %nodefaultctor Metadata; %nodefaultdtor Metadata; %nodefaultctor CandidateTranscript; From ee30a1c9dead1b7cbd86ab51a2039f5e1859740b Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 18 Mar 2020 19:49:14 +0100 Subject: [PATCH 16/16] Adapt Java bindings to const structs --- native_client/java/jni/deepspeech.i | 18 +++-- .../libdeepspeech/DeepSpeechModel.java | 37 +++++++---- .../CandidateTranscript.java | 47 ++++---------- .../DeepSpeech_Error_Codes.java | 65 +++++++++++++++++++ .../libdeepspeech_doc/Metadata.java | 30 +++------ .../deepspeech/libdeepspeech_doc/README.rst | 2 +- .../libdeepspeech_doc/TokenMetadata.java | 29 ++------- 7 files changed, 122 insertions(+), 106 deletions(-) create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java diff --git a/native_client/java/jni/deepspeech.i b/native_client/java/jni/deepspeech.i index 4bbdc776..c028714c 100644 --- a/native_client/java/jni/deepspeech.i +++ b/native_client/java/jni/deepspeech.i @@ -6,6 +6,8 @@ %} %include "typemaps.i" +%include "enums.swg" +%javaconst(1); %include "arrays_java.i" // apply to DS_FeedAudioContent and DS_SpeechToText @@ -15,12 +17,6 @@ %pointer_functions(ModelState*, modelstatep); %pointer_functions(StreamingState*, streamingstatep); -%typemap(newfree) char* "DS_FreeString($1);"; - -%include "carrays.i" -%array_functions(struct TokenMetadata, TokenMetadata_array); -%array_functions(struct CandidateTranscript, CandidateTranscript_array); - %extend struct CandidateTranscript { /** * Retrieve one TokenMetadata element @@ -29,8 +25,8 @@ * * @return The TokenMetadata requested or null */ - TokenMetadata getToken(int i) { - return TokenMetadata_array_getitem(self->tokens, i); + const TokenMetadata& getToken(int i) { + return self->tokens[i]; } } @@ -42,8 +38,8 @@ * * @return The CandidateTranscript requested or null */ - CandidateTranscript getTranscript(int i) { - return CandidateTranscript_array_getitem(self->transcripts, i); + const CandidateTranscript& getTranscript(int i) { + return self->transcripts[i]; } ~Metadata() { @@ -58,9 +54,11 @@ %nodefaultctor TokenMetadata; %nodefaultdtor TokenMetadata; +%typemap(newfree) char* "DS_FreeString($1);"; %newobject DS_SpeechToText; %newobject DS_IntermediateDecode; %newobject DS_FinishStream; +%newobject DS_ErrorCodeToErrorMessage; %rename ("%(strip:[DS_])s") ""; diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index a5b339b3..eafa11e2 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -11,8 +11,15 @@ public class DeepSpeechModel { } // FIXME: We should have something better than those SWIGTYPE_* - SWIGTYPE_p_p_ModelState _mspp; - SWIGTYPE_p_ModelState _msp; + private SWIGTYPE_p_p_ModelState _mspp; + private SWIGTYPE_p_ModelState _msp; + + private void evaluateErrorCode(int errorCode) { + 
DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode); + if (code != DeepSpeech_Error_Codes.ERR_OK) { + throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ")."); + } + } /** * @brief An object providing an interface to a trained DeepSpeech model. @@ -20,10 +27,12 @@ public class DeepSpeechModel { * @constructor * * @param modelPath The path to the frozen model graph. + * + * @throws RuntimeException on failure. */ public DeepSpeechModel(String modelPath) { this._mspp = impl.new_modelstatep(); - impl.CreateModel(modelPath, this._mspp); + evaluateErrorCode(impl.CreateModel(modelPath, this._mspp)); this._msp = impl.modelstatep_value(this._mspp); } @@ -43,10 +52,10 @@ public class DeepSpeechModel { * @param aBeamWidth The beam width used by the model. A larger beam width value * generates better results at the cost of decoding time. * - * @return Zero on success, non-zero on failure. + * @throws RuntimeException on failure. */ - public int setBeamWidth(long beamWidth) { - return impl.SetModelBeamWidth(this._msp, beamWidth); + public void setBeamWidth(long beamWidth) { + evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth)); } /** @@ -70,19 +79,19 @@ public class DeepSpeechModel { * * @param scorer The path to the external scorer file. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void enableExternalScorer(String scorer) { - impl.EnableExternalScorer(this._msp, scorer); + evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer)); } /** * @brief Disable decoding using an external scorer. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void disableExternalScorer() { - impl.DisableExternalScorer(this._msp); + evaluateErrorCode(impl.DisableExternalScorer(this._msp)); } /** @@ -91,10 +100,10 @@ public class DeepSpeechModel { * @param alpha The alpha hyperparameter of the decoder. Language model weight. * @param beta The beta hyperparameter of the decoder. Word insertion weight. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void setScorerAlphaBeta(float alpha, float beta) { - impl.SetScorerAlphaBeta(this._msp, alpha, beta); + evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta)); } /* @@ -132,10 +141,12 @@ public class DeepSpeechModel { * and finishStream(). * * @return An opaque object that represents the streaming state. + * + * @throws RuntimeException on failure. 
*/ public DeepSpeechStreamingState createStream() { SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep(); - impl.CreateStream(this._msp, ssp); + evaluateErrorCode(impl.CreateStream(this._msp, ssp)); return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp)); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java index c02b39ad..fa13c474 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java @@ -9,8 +9,8 @@ package org.mozilla.deepspeech.libdeepspeech; /** - * A single transcript computed by the model, including a confidence value and - * the metadata for its constituent tokens. + * A single transcript computed by the model, including a confidence
+ * value and the metadata for its constituent tokens. */ public class CandidateTranscript { private transient long swigCPtr; @@ -36,14 +36,7 @@ public class CandidateTranscript { } /** - * Array of TokenMetadata objects - */ - public void setTokens(TokenMetadata value) { - implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value); - } - - /** - * Array of TokenMetadata objects + * Array of TokenMetadata objects */ public TokenMetadata getTokens() { long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this); @@ -51,31 +44,15 @@ public class CandidateTranscript { } /** - * Size of the tokens array + * Size of the tokens array */ - public void setNum_tokens(int value) { - implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value); - } - - /** - * Size of the tokens array - */ - public int getNum_tokens() { + public long getNum_tokens() { return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this); } /** - * Approximated confidence value for this transcript. This is roughly the - * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcript. - */ - public void setConfidence(double value) { - implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value); - } - - /** - * Approximated confidence value for this transcript. This is roughly the - * sum of the acoustic model logit values for each timestep/character that + * Approximated confidence value for this transcript. This is roughly the
+ * sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcript. */ public double getConfidence() { @@ -83,14 +60,14 @@ public class CandidateTranscript { } /** - * Retrieve one TokenMetadata element - * - * @param i Array index of the TokenMetadata to get - * + * Retrieve one TokenMetadata element
+ *
+ * @param i Array index of the TokenMetadata to get
+ *
* @return The TokenMetadata requested or null */ public TokenMetadata getToken(int i) { - return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true); + return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java new file mode 100644 index 00000000..ed47183e --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java @@ -0,0 +1,65 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +public enum DeepSpeech_Error_Codes { + ERR_OK(0x0000), + ERR_NO_MODEL(0x1000), + ERR_INVALID_ALPHABET(0x2000), + ERR_INVALID_SHAPE(0x2001), + ERR_INVALID_SCORER(0x2002), + ERR_MODEL_INCOMPATIBLE(0x2003), + ERR_SCORER_NOT_ENABLED(0x2004), + ERR_FAIL_INIT_MMAP(0x3000), + ERR_FAIL_INIT_SESS(0x3001), + ERR_FAIL_INTERPRETER(0x3002), + ERR_FAIL_RUN_SESS(0x3003), + ERR_FAIL_CREATE_STREAM(0x3004), + ERR_FAIL_READ_PROTOBUF(0x3005), + ERR_FAIL_CREATE_SESS(0x3006), + ERR_FAIL_CREATE_MODEL(0x3007); + + public final int swigValue() { + return swigValue; + } + + public static DeepSpeech_Error_Codes swigToEnum(int swigValue) { + DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants(); + if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue) + return swigValues[swigValue]; + for (DeepSpeech_Error_Codes swigEnum : swigValues) + if (swigEnum.swigValue == swigValue) + return swigEnum; + throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue); + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes() { + this.swigValue = SwigNext.next++; + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes(int swigValue) { + this.swigValue = swigValue; + SwigNext.next = swigValue+1; + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) { + this.swigValue = swigEnum.swigValue; + SwigNext.next = this.swigValue+1; + } + + private final int swigValue; + + private static class SwigNext { + private static int next = 0; + } +} + diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java index bb9b0773..d2831bc4 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java @@ -40,14 +40,7 @@ public class Metadata { } /** - * Array of CandidateTranscript objects - */ - public void setTranscripts(CandidateTranscript value) { - implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value); - } - - /** - * Array of CandidateTranscript objects + * Array of 
CandidateTranscript objects */ public CandidateTranscript getTranscripts() { long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this); @@ -55,28 +48,21 @@ public class Metadata { } /** - * Size of the transcripts array + * Size of the transcripts array */ - public void setNum_transcripts(int value) { - implJNI.Metadata_num_transcripts_set(swigCPtr, this, value); - } - - /** - * Size of the transcripts array - */ - public int getNum_transcripts() { + public long getNum_transcripts() { return implJNI.Metadata_num_transcripts_get(swigCPtr, this); } /** - * Retrieve one CandidateTranscript element - * - * @param i Array index of the CandidateTranscript to get - * + * Retrieve one CandidateTranscript element
+ *
+ * @param i Array index of the CandidateTranscript to get
+ *
* @return The CandidateTranscript requested or null */ public CandidateTranscript getTranscript(int i) { - return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true); + return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst index 1279d717..bd89f9b8 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst @@ -4,7 +4,7 @@ Javadoc for Sphinx This code is only here for reference for documentation generation. -To update, please build SWIG (4.0 at least) and then run from native_client/java: +To update, please install SWIG (4.0 at least) and then run from native_client/java: .. code-block:: diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java index 32246f1a..d14fc161 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java @@ -35,42 +35,21 @@ public class TokenMetadata { } /** - * The text corresponding to this token - */ - public void setText(String value) { - implJNI.TokenMetadata_text_set(swigCPtr, this, value); - } - - /** - * The text corresponding to this token + * The text corresponding to this token */ public String getText() { return implJNI.TokenMetadata_text_get(swigCPtr, this); } /** - * Position of the token in units of 20ms + * Position of the token in units of 20ms */ - public void setTimestep(int value) { - implJNI.TokenMetadata_timestep_set(swigCPtr, this, value); - } - - /** - * Position of the token in units of 20ms - */ - public int getTimestep() { + public long getTimestep() { return implJNI.TokenMetadata_timestep_get(swigCPtr, this); } /** - * Position of the token in seconds - */ - public void setStart_time(float value) { - implJNI.TokenMetadata_start_time_set(swigCPtr, this, value); - } - - /** - * Position of the token in seconds + * Position of the token in seconds */ public float getStart_time() { return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
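
Taken together, the series makes DS_SpeechToTextWithMetadata() and DS_FinishStreamWithMetadata() return a Metadata struct holding up to aNumResults CandidateTranscript entries, read-only after the const-members patch. The following is a minimal sketch of a C caller consuming the reworked API; the "model.pbmm" path and the way the audio buffer gets filled are assumptions for illustration, not part of the patches.

    #include <stdio.h>
    #include "deepspeech.h"

    /* Print up to three ranked candidate transcripts with per-token timing.
     * Assumes `buffer` holds 16-bit mono PCM at the model's sample rate and
     * that "model.pbmm" is a valid model path (both placeholders here). */
    int
    PrintCandidates(const short* buffer, unsigned int buffer_size)
    {
      ModelState* ctx = NULL;
      if (DS_CreateModel("model.pbmm", &ctx) != 0) {
        return 1;
      }

      /* Ask for up to 3 candidates; the decoder may return fewer. */
      Metadata* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
      if (result == NULL) {
        DS_FreeModel(ctx);
        return 1;
      }

      for (unsigned int i = 0; i < result->num_transcripts; ++i) {
        const CandidateTranscript* transcript = &result->transcripts[i];
        printf("candidate %u (confidence %.3f):\n", i, transcript->confidence);
        for (unsigned int j = 0; j < transcript->num_tokens; ++j) {
          const TokenMetadata* token = &transcript->tokens[j];
          printf("  \"%s\" at %.2fs (timestep %u)\n",
                 token->text, token->start_time, token->timestep);
        }
      }

      /* One call releases the transcript array, each token array and every
       * token's text, matching the allocation layout in modelstate.cc. */
      DS_FreeMetadata(result);
      DS_FreeModel(ctx);
      return 0;
    }

Because the struct members are const, client code treats the returned tree as immutable; the only supported cleanup path is DS_FreeMetadata(), since freeing individual members by hand would not match the malloc/strdup layout used by decode_metadata().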