From 32c969c1846453474d786306dac4c1de3a2b2b54 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Wed, 5 Feb 2020 07:55:15 +0000
Subject: [PATCH 01/16] Expose multiple transcriptions through the API

---
 .../ctcdecode/ctc_beam_search_decoder.cpp | 13 +++--
 native_client/deepspeech.cc               | 58 +++++++++++++++----
 native_client/deepspeech.h                | 26 +++++++--
 native_client/modelstate.cc               | 43 ++++++------
 native_client/modelstate.h                |  9 ++-
 5 files changed, 108 insertions(+), 41 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 5dadd57f..9b3da8cf 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t top_paths) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -167,7 +167,7 @@ DecoderState::decode() const
 
   // score the last word of each prefix that doesn't end with space
   if (ext_scorer_) {
-    for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
+    for (size_t i = 0; i < top_paths && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
         float score = 0.0;
@@ -181,14 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_prefixes = std::min(prefixes_copy.size(), top_paths);
   std::partial_sort(prefixes_copy.begin(),
                     prefixes_copy.begin() + num_prefixes,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
   size_t num_returned = std::min(num_prefixes, top_paths);
 
   std::vector<Output> outputs;
@@ -220,6 +218,7 @@ std::vector<Output> ctc_beam_search_decoder(
     int class_dim,
     const Alphabet &alphabet,
     size_t beam_size,
+    size_t top_paths,
     double cutoff_prob,
     size_t cutoff_top_n,
     std::shared_ptr<Scorer> ext_scorer)
@@ -227,7 +226,7 @@ std::vector<Output> ctc_beam_search_decoder(
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
   state.next(probs, time_dim, class_dim);
-  return state.decode();
+  return state.decode(top_paths);
 }
 
 std::vector<std::vector<Output>>
@@ -240,6 +239,7 @@ ctc_beam_search_decoder_batch(
     int seq_lengths_size,
     const Alphabet &alphabet,
     size_t beam_size,
+    size_t top_paths,
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
@@ -259,6 +259,7 @@ ctc_beam_search_decoder_batch(
         class_dim,
         alphabet,
         beam_size,
+        top_paths,
         cutoff_prob,
         cutoff_top_n,
         ext_scorer));

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index dd2a95ea..839a0122 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Result* finishStreamWithMetadata(unsigned int numResults);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -143,11 +143,26 @@ StreamingState::finishStream()
   return model_->decode(decoder_state_);
 }
 
-Metadata*
-StreamingState::finishStreamWithMetadata()
+Result*
+StreamingState::finishStreamWithMetadata(unsigned int numResults)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+
+  vector<Metadata*> metadata = model_->decode_metadata(decoder_state_, numResults);
+
+  std::unique_ptr<Result> result(new Result());
+  result->num_transcriptions = metadata.size();
+
+  std::unique_ptr<Metadata[]> items(new Metadata[result->num_transcriptions]);
+
+  for (int i = 0; i < result->num_transcriptions; ++i) {
+    std::unique_ptr<Metadata> pointer(new Metadata(*metadata[i]));
+    items[i] = *pointer.release();
+  }
+
+  result->transcriptions = items.release();
+
+  return result.release();
 }
 
 void
@@ -410,12 +425,13 @@ DS_FinishStream(StreamingState* aSctx)
   return str;
 }
 
-Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+Result*
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int numResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Result* result = aSctx->finishStreamWithMetadata(numResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }
 
 StreamingState*
@@ -441,13 +457,14 @@ DS_SpeechToText(ModelState* aCtx,
   return DS_FinishStream(ctx);
 }
 
-Metadata*
+Result*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int numResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, numResults);
 }
 
 void
@@ -468,6 +485,25 @@ DS_FreeMetadata(Metadata* m)
   }
 }
 
+void
+DS_FreeResult(Result* r)
+{
+  if (r) {
+    for (int i = 0; i < r->num_transcriptions; ++i) {
+      Metadata* m = &r->transcriptions[i];
+
+      for (int j = 0; j < m->num_items; ++j) {
+        free(m->items[j].character);
+      }
+
+      delete[] m->items;
+    }
+
+    delete[] r->transcriptions;
+    delete r;
+  }
+}
+
 void
 DS_FreeString(char* str)
 {

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 6dad59db..41d133ae 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -48,6 +48,16 @@ typedef struct Metadata {
   double confidence;
 } Metadata;
 
+/**
+ * @brief Stores Metadata structs for each alternative transcription
+ */
+typedef struct Result {
+  /** List of transcriptions */
+  Metadata* transcriptions;
+  /** Size of the list of transcriptions */
+  int num_transcriptions;
+} Result;
+
 enum DeepSpeech_Error_Codes
 {
   // OK
@@ -192,9 +202,10 @@ char* DS_SpeechToText(ModelState* aCtx,
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                      const short* aBuffer,
-                                      unsigned int aBufferSize);
+Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                    const short* aBuffer,
+                                    unsigned int aBufferSize,
+                                    unsigned int numResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -261,7 +272,8 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                    unsigned int numResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
@@ -281,6 +293,12 @@ void DS_FreeStream(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);
 
+/**
+ * @brief Free memory allocated for result information.
+ */
+DEEPSPEECH_EXPORT
+void DS_FreeResult(Result* r);
+
 /**
  * @brief Free a char* string returned by the DeepSpeech API.
 */

diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index ea8928bd..88c2c857 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -32,32 +32,41 @@ ModelState::init(const char* model_path)
 char*
 ModelState::decode(const DecoderState& state) const
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(1);
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-Metadata*
-ModelState::decode_metadata(const DecoderState& state)
+vector<Metadata*>
+ModelState::decode_metadata(const DecoderState& state,
+                            size_t top_paths)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(top_paths);
 
-  std::unique_ptr<Metadata> metadata(new Metadata());
-  metadata->num_items = out[0].tokens.size();
-  metadata->confidence = out[0].confidence;
+  vector<Metadata*> meta_out;
 
-  std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+  size_t max_results = std::min(top_paths, out.size());
 
-  // Loop through each character
-  for (int i = 0; i < out[0].tokens.size(); ++i) {
-    items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
-    items[i].timestep = out[0].timesteps[i];
-    items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
+  for (int j = 0; j < max_results; ++j) {
+    std::unique_ptr<Metadata> metadata(new Metadata());
+    metadata->num_items = out[j].tokens.size();
+    metadata->confidence = out[j].confidence;
 
-    if (items[i].start_time < 0) {
-      items[i].start_time = 0;
+    std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+
+    // Loop through each character
+    for (int i = 0; i < out[j].tokens.size(); ++i) {
+      items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
+      items[i].timestep = out[j].timesteps[i];
+      items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
+
+      if (items[i].start_time < 0) {
+        items[i].start_time = 0;
+      }
     }
+
+    metadata->items = items.release();
+    meta_out.push_back(metadata.release());
   }
 
-  metadata->items = items.release();
-  return metadata.release();
+  return meta_out;
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 25251e15..30d1e101 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,11 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
+  * @param top_paths Number of alternate results to return.
   *
-  * @return Metadata struct containing MetadataItem structs for each character.
-  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata().
+  * @return Vector of Metadata structs containing MetadataItem structs for each character.
+  *         Each represents an alternate transcription, with the first ranked most probable.
+  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata() on each item.
   */
-  virtual Metadata* decode_metadata(const DecoderState& state);
+  virtual std::vector<Metadata*> decode_metadata(const DecoderState& state,
+                                                 size_t top_paths);
 };
 
 #endif // MODELSTATE_H
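
[Usage sketch, not part of this series: how a caller might exercise the patch-01 API end to end. ctx, buffer and buffer_size are placeholders for a loaded model and captured audio.]

    /* Request up to 3 transcriptions; the Result owns them all and is
       released with a single DS_FreeResult() call. */
    Result* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    if (result) {
      for (int i = 0; i < result->num_transcriptions; ++i) {
        Metadata* m = &result->transcriptions[i];
        printf("candidate %d (confidence %f): ", i, m->confidence);
        for (int j = 0; j < m->num_items; ++j) {
          printf("%s", m->items[j].character);  // one token per MetadataItem
        }
        printf("\n");
      }
      DS_FreeResult(result);
    }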
From 004d66d224853d19e69db8ebafc67e9b762b453b Mon Sep 17 00:00:00 2001
From: dabinat
Date: Wed, 5 Feb 2020 07:55:55 +0000
Subject: [PATCH 02/16] Client changes to show multiple transcriptions in JSON
 output

---
 native_client/client.cc | 45 +++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index abcadd8d..ffe3b518 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -46,7 +46,7 @@ struct meta_word {
 
 char* metadataToString(Metadata* metadata);
 std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char* JSONOutput(Result* result);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
@@ -57,13 +57,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = metadataToString(&result->transcriptions[0]);
+    DS_FreeResult(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3);
+    res.string = JSONOutput(result);
+    DS_FreeResult(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -338,23 +338,34 @@ WordsFromMetadata(Metadata* metadata)
 }
 
 char*
-JSONOutput(Metadata* metadata)
+JSONOutput(Result* result)
 {
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
   std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+  out_string << "[\n";
 
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+  for (int j=0; j < result->num_transcriptions; ++j) {
+    Metadata *metadata = &result->transcriptions[j];
+    std::vector<meta_word> words = WordsFromMetadata(metadata);
 
-    if (i < words.size() - 1) {
-      out_string << ",";
+    out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+
+    for (int i = 0; i < words.size(); i++) {
+      meta_word w = words[i];
+      out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+      if (i < words.size() - 1) {
+        out_string << ",";
+      }
+    }
+
+    out_string << "]}";
+
+    if (j < result->num_transcriptions - 1) {
+      out_string << ",\n";
     }
   }
 
-  out_string << "]}\n";
+  out_string << "\n]\n";
 
   return strdup(out_string.str().c_str());
 }
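
[With three candidates requested, the JSON this version of the client emits is a top-level array, one element per transcription — illustrated here with invented values:]

    [
    {"metadata":{"confidence":-20.1},"words":[{"word":"hello","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-20.9},"words":[{"word":"hallo","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-21.4},"words":[{"word":"hull","time":0.62,"duration":0.28}]}
    ]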
From 969b2ac4ba45aaf940a6371cfa73a34e38cab24f Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:14:08 -0800
Subject: [PATCH 03/16] Changed variable names to match coding style

---
 native_client/deepspeech.cc | 12 ++++++------
 native_client/deepspeech.h  |  6 ++++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index 839a0122..c44e130b 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Result* finishStreamWithMetadata(unsigned int numResults);
+  Result* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -144,7 +144,7 @@ StreamingState::finishStream()
 }
 
 Result*
-StreamingState::finishStreamWithMetadata(unsigned int numResults)
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
 
@@ -427,9 +427,9 @@ DS_FinishStream(StreamingState* aSctx)
 
 Result*
 DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                            unsigned int numResults)
+                            unsigned int aNumResults)
 {
-  Result* result = aSctx->finishStreamWithMetadata(numResults);
+  Result* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
   return result;
 }
@@ -461,10 +461,10 @@ Result*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
                             unsigned int aBufferSize,
-                            unsigned int numResults)
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx, numResults);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }
 
 void

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 41d133ae..53f1954f 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -197,6 +197,7 @@ char* DS_SpeechToText(ModelState* aCtx,
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The number of alternative transcriptions to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -205,7 +206,7 @@ DEEPSPEECH_EXPORT
 Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                     const short* aBuffer,
                                     unsigned int aBufferSize,
-                                    unsigned int numResults);
+                                    unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -265,6 +266,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 * inference, returns per-letter metadata.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of alternative transcriptions to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -273,7 +275,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 */
 DEEPSPEECH_EXPORT
 Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                                    unsigned int numResults);
+                                    unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
From e0c42f01a441692fd4133899586a9cfd7b685641 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:17:52 -0800
Subject: [PATCH 04/16] Moved result limiting to ModelState instead of CTC
 decoder

---
 .../ctcdecode/ctc_beam_search_decoder.cpp | 13 +++++------
 native_client/deepspeech.cc               | 16 +-------------
 native_client/modelstate.cc               | 22 +++++++++++--------
 native_client/modelstate.h                | 10 ++++-----
 4 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 9b3da8cf..3039d47c 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode(size_t top_paths) const
+DecoderState::decode() const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -167,7 +167,7 @@ DecoderState::decode(size_t top_paths) const
 
   // score the last word of each prefix that doesn't end with space
   if (ext_scorer_) {
-    for (size_t i = 0; i < top_paths && i < prefixes_copy.size(); ++i) {
+    for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
         float score = 0.0;
@@ -181,13 +181,13 @@ DecoderState::decode(size_t top_paths) const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), top_paths);
+  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
   std::partial_sort(prefixes_copy.begin(),
                     prefixes_copy.begin() + num_prefixes,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  size_t num_returned = std::min(num_prefixes, top_paths);
+  size_t num_returned = std::min(num_prefixes, beam_size_);
 
   std::vector<Output> outputs;
   outputs.reserve(num_returned);
@@ -218,7 +218,6 @@ std::vector<Output> ctc_beam_search_decoder(
     int class_dim,
     const Alphabet &alphabet,
     size_t beam_size,
-    size_t top_paths,
     double cutoff_prob,
     size_t cutoff_top_n,
     std::shared_ptr<Scorer> ext_scorer)
@@ -226,7 +225,7 @@ std::vector<Output> ctc_beam_search_decoder(
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
   state.next(probs, time_dim, class_dim);
-  return state.decode(top_paths);
+  return state.decode();
 }
 
 std::vector<std::vector<Output>>
@@ -238,7 +237,6 @@ ctc_beam_search_decoder_batch(
     int seq_lengths_size,
     const Alphabet &alphabet,
     size_t beam_size,
-    size_t top_paths,
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
@@ -257,7 +256,6 @@ ctc_beam_search_decoder_batch(
         class_dim,
         alphabet,
         beam_size,
-        top_paths,
         cutoff_prob,
         cutoff_top_n,
         ext_scorer));

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index c44e130b..ffc10a13 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -148,21 +148,7 @@ StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
 
-  vector<Metadata*> metadata = model_->decode_metadata(decoder_state_, numResults);
-
-  std::unique_ptr<Result> result(new Result());
-  result->num_transcriptions = metadata.size();
-
-  std::unique_ptr<Metadata[]> items(new Metadata[result->num_transcriptions]);
-
-  for (int i = 0; i < result->num_transcriptions; ++i) {
-    std::unique_ptr<Metadata> pointer(new Metadata(*metadata[i]));
-    items[i] = *pointer.release();
-  }
-
-  result->transcriptions = items.release();
-
-  return result.release();
+  return model_->decode_metadata(decoder_state_, num_results);
 }
 
 void
diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index 88c2c857..5a8afae3 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -32,22 +32,25 @@ ModelState::init(const char* model_path)
 char*
 ModelState::decode(const DecoderState& state) const
 {
-  vector<Output> out = state.decode(1);
+  vector<Output> out = state.decode();
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-vector<Metadata*>
+Result*
 ModelState::decode_metadata(const DecoderState& state,
-                            size_t top_paths)
+                            size_t num_results)
 {
-  vector<Output> out = state.decode(top_paths);
+  vector<Output> out = state.decode();
 
-  vector<Metadata*> meta_out;
+  size_t max_results = std::min(num_results, out.size());
 
-  size_t max_results = std::min(top_paths, out.size());
+  std::unique_ptr<Result> result(new Result());
+  result->num_transcriptions = max_results;
+
+  std::unique_ptr<Metadata[]> transcripts(new Metadata[max_results]());
 
   for (int j = 0; j < max_results; ++j) {
-    std::unique_ptr<Metadata> metadata(new Metadata());
+    Metadata* metadata = &transcripts[j];
     metadata->num_items = out[j].tokens.size();
     metadata->confidence = out[j].confidence;
 
     std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
 
     // Loop through each character
     for (int i = 0; i < out[j].tokens.size(); ++i) {
       items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
       items[i].timestep = out[j].timesteps[i];
       items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
 
       if (items[i].start_time < 0) {
         items[i].start_time = 0;
       }
     }
 
     metadata->items = items.release();
-    meta_out.push_back(metadata.release());
   }
 
-  return meta_out;
+  result->transcriptions = transcripts.release();
+
+  return result.release();
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 30d1e101..8ea7ad99 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,14 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
-  * @param top_paths Number of alternate results to return.
+  * @param num_results Number of alternate results to return.
   *
-  * @return Vector of Metadata structs containing MetadataItem structs for each character.
+  * @return A Result struct containing Metadata structs.
   *         Each represents an alternate transcription, with the first ranked most probable.
-  *         The user is responsible for freeing Metadata by calling DS_FreeMetadata() on each item.
+  *         The user is responsible for freeing Result by calling DS_FreeResult().
   */
-  virtual std::vector<Metadata*> decode_metadata(const DecoderState& state,
-                                                 size_t top_paths);
+  virtual Result* decode_metadata(const DecoderState& state,
+                                  size_t num_results);
 };
 
 #endif // MODELSTATE_H
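
[Sketch of the ownership model this patch settles on, not part of the series: transcriptions now live in one flat Metadata array owned by the Result, so a caller traverses in place and frees only the top level. ctx, buffer and buffer_size are placeholders.]

    Result* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    for (int i = 0; result && i < result->num_transcriptions; ++i) {
      const Metadata* m = &result->transcriptions[i];  // no per-entry allocation to track
      printf("candidate %d: confidence=%f, items=%d\n", i, m->confidence, m->num_items);
    }
    DS_FreeResult(result);  // one call frees every nested items array too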
From e1fec4e8183a3cd451330e7e5619cb2e6ded4868 Mon Sep 17 00:00:00 2001
From: dabinat
Date: Fri, 14 Feb 2020 19:19:14 -0800
Subject: [PATCH 05/16] Client - Change JSON output to return alternative
 transcripts in an "alternatives" array

---
 native_client/client.cc | 52 +++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index ffe3b518..413be288 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -47,6 +47,7 @@ struct meta_word {
 char* metadataToString(Metadata* metadata);
 std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
 char* JSONOutput(Result* result);
+std::string MetadataOutput(Metadata* metadata);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
@@ -341,34 +342,55 @@ char*
 JSONOutput(Result* result)
 {
   std::ostringstream out_string;
-  out_string << "[\n";
+  out_string << "{\n";
 
   for (int j=0; j < result->num_transcriptions; ++j) {
     Metadata *metadata = &result->transcriptions[j];
-    std::vector<meta_word> words = WordsFromMetadata(metadata);
 
-    out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+    if (j == 0) {
+      out_string << MetadataOutput(metadata);
 
-    for (int i = 0; i < words.size(); i++) {
-      meta_word w = words[i];
-      out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-      if (i < words.size() - 1) {
-        out_string << ",";
+      if (result->num_transcriptions > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
       }
-    }
+    } else {
+      out_string << "{" << MetadataOutput(metadata) << "}";
 
-    out_string << "]}";
-
-    if (j < result->num_transcriptions - 1) {
-      out_string << ",\n";
+      if (j < result->num_transcriptions - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
     }
   }
 
-  out_string << "\n]\n";
+  out_string << "\n}\n";
 
   return strdup(out_string.str().c_str());
 }
 
+std::string
+MetadataOutput(Metadata *metadata)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = WordsFromMetadata(metadata);
+
+  out_string << R"("metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
 int
 main(int argc, char **argv)
 {
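
[For comparison with the patch-02 array format, the object this version emits nests the runners-up under "alternatives" — again with invented values:]

    {
    "metadata":{"confidence":-20.1},"words":[{"word":"hello","time":0.62,"duration":0.3}],
    "alternatives":[
    {"metadata":{"confidence":-20.9},"words":[{"word":"hallo","time":0.62,"duration":0.3}]},
    {"metadata":{"confidence":-21.4},"words":[{"word":"hull","time":0.62,"duration":0.28}]}
    ]
    }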
From 69bd0326052717ad7c7a47bb336cd0234c45bb7e Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 25 Feb 2020 12:29:18 +0100
Subject: [PATCH 06/16] Improve API naming around Metadata objects

---
 .../ctcdecode/ctc_beam_search_decoder.cpp |  8 +--
 .../ctcdecode/ctc_beam_search_decoder.h   |  5 +-
 native_client/deepspeech.cc               | 38 ++++------
 native_client/deepspeech.h                | 65 +++++++++----------
 native_client/modelstate.cc               | 43 ++++++------
 native_client/modelstate.h                | 12 ++--
 6 files changed, 75 insertions(+), 96 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 3039d47c..8a072c53 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,14 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  size_t num_returned = std::min(num_prefixes, beam_size_);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h
index a3d5c480..78871b2a 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.h
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.h
@@ -61,12 +61,15 @@ public:
             int class_dim);
 
   /* Get transcription from current decoder state
+   *
+   * Parameters:
+   *     num_results: Number of beams to return.
   *
   * Return:
   *     A vector where each element is a pair of score and decoding result,
   *     in descending order.
   */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index ffc10a13..adaa0445 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.
 
-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -80,7 +80,7 @@ struct StreamingState {
   char* intermediateDecode() const;
   void finalizeStream();
   char* finishStream();
-  Result* finishStreamWithMetadata(unsigned int num_results);
+  Metadata* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -143,7 +143,7 @@ StreamingState::finishStream()
   return model_->decode(decoder_state_);
 }
 
-Result*
+Metadata*
 StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
@@ -411,11 +411,11 @@ DS_FinishStream(StreamingState* aSctx)
   return str;
 }
 
-Result*
+Metadata*
 DS_FinishStreamWithMetadata(StreamingState* aSctx,
                             unsigned int aNumResults)
 {
-  Result* result = aSctx->finishStreamWithMetadata(aNumResults);
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
   return result;
 }
@@ -443,7 +443,7 @@ DS_SpeechToText(ModelState* aCtx,
   return DS_FinishStream(ctx);
 }
 
-Result*
+Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
                             unsigned int aBufferSize,
@@ -463,30 +463,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
-    }
-    delete[] m->items;
-    delete m;
-  }
-}
-
-void
-DS_FreeResult(Result* r)
-{
-  if (r) {
-    for (int i = 0; i < r->num_transcriptions; ++i) {
-      Metadata* m = &r->transcriptions[i];
-
-      for (int j = 0; j < m->num_items; ++j) {
-        free(m->items[j].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free(m->transcripts[i].tokens[j].text);
       }
 
-      delete[] m->items;
+      delete[] m->transcripts[i].tokens;
     }
 
-    delete[] r->transcriptions;
-    delete r;
+    delete[] m->transcripts;
+    delete m;
   }
 }

diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 53f1954f..7aee1048 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -20,43 +20,44 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;
 
 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
 */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  char* text;
 
-  /** Position of the character in units of 20ms */
+  /** Position of the token in units of 20ms */
   int timestep;
 
-  /** Position of the character in seconds */
+  /** Position of the token in seconds */
   float start_time;
-} MetadataItem;
+} TokenMetadata;
 
 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
 */
-typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  TokenMetadata* tokens;
+  /** Size of the tokens array */
+  int num_tokens;
   /** Approximated confidence value for this transcription. This is roughly the
   * sum of the acoustic model logit values for each timestep/character that
   * contributed to the creation of this transcription.
  */
   double confidence;
-} Metadata;
+} CandidateTranscript;
 
 /**
- * @brief Stores Metadata structs for each alternative transcription
+ * @brief An array of CandidateTranscript objects computed by the model
 */
-typedef struct Result {
-  /** List of transcriptions */
-  Metadata* transcriptions;
-  /** Size of the list of transcriptions */
-  int num_transcriptions;
-} Result;
+typedef struct Metadata {
+  /** Array of CandidateTranscript objects */
+  CandidateTranscript* transcripts;
+  /** Size of the transcripts array */
+  int num_transcripts;
+} Metadata;
 
 enum DeepSpeech_Error_Codes
 {
@@ -197,16 +198,16 @@ char* DS_SpeechToText(ModelState* aCtx,
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
- * @param aNumResults The number of alternative transcriptions to return.
+ * @param aNumResults The number of candidate transcripts to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
-Result* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                    const short* aBuffer,
-                                    unsigned int aBufferSize,
-                                    unsigned int aNumResults);
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                      const short* aBuffer,
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -266,7 +267,7 @@ char* DS_FinishStream(StreamingState* aSctx);
 * inference, returns per-letter metadata.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
- * @param aNumResults The number of alternative transcriptions to return.
+ * @param aNumResults The number of candidate transcripts to return.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -274,8 +275,8 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Result* DS_FinishStreamWithMetadata(StreamingState* aSctx,
-                                    unsigned int aNumResults);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
@@ -295,12 +296,6 @@ void DS_FreeStream(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);
 
-/**
- * @brief Free memory allocated for result information.
- */
-DEEPSPEECH_EXPORT
-void DS_FreeResult(Result* r);
-
 /**
  * @brief Free a char* string returned by the DeepSpeech API.
 */

diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index 5a8afae3..d4f16636 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -36,41 +36,38 @@ ModelState::decode(const DecoderState& state) const
   return strdup(alphabet_.LabelsToString(out[0].tokens).c_str());
 }
 
-Result*
+Metadata*
 ModelState::decode_metadata(const DecoderState& state,
                             size_t num_results)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(num_results);
+  size_t num_returned = out.size();
 
-  size_t max_results = std::min(num_results, out.size());
+  std::unique_ptr<Metadata> metadata(new Metadata);
+  metadata->num_transcripts = num_returned;
 
-  std::unique_ptr<Result> result(new Result());
-  result->num_transcriptions = max_results;
+  std::unique_ptr<CandidateTranscript[]> transcripts(new CandidateTranscript[num_returned]);
 
-  std::unique_ptr<Metadata[]> transcripts(new Metadata[max_results]());
+  for (int i = 0; i < num_returned; ++i) {
+    transcripts[i].num_tokens = out[i].tokens.size();
+    transcripts[i].confidence = out[i].confidence;
 
-  for (int j = 0; j < max_results; ++j) {
-    Metadata* metadata = &transcripts[j];
-    metadata->num_items = out[j].tokens.size();
-    metadata->confidence = out[j].confidence;
+    std::unique_ptr<TokenMetadata[]> tokens(new TokenMetadata[transcripts[i].num_tokens]);
 
-    std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+    // Loop through each token
+    for (int j = 0; j < out[i].tokens.size(); ++j) {
+      tokens[j].text = strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str());
+      tokens[j].timestep = out[i].timesteps[j];
+      tokens[j].start_time = out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_);
 
-    // Loop through each character
-    for (int i = 0; i < out[j].tokens.size(); ++i) {
-      items[i].character = strdup(alphabet_.StringFromLabel(out[j].tokens[i]).c_str());
-      items[i].timestep = out[j].timesteps[i];
-      items[i].start_time = out[j].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
-
-      if (items[i].start_time < 0) {
-        items[i].start_time = 0;
+      if (tokens[j].start_time < 0) {
+        tokens[j].start_time = 0;
       }
     }
 
-    metadata->items = items.release();
+    transcripts[i].tokens = tokens.release();
   }
 
-  result->transcriptions = transcripts.release();
-
-  return result.release();
+  metadata->transcripts = transcripts.release();
+  return metadata.release();
 }

diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 8ea7ad99..43eef970 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,14 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
-  * @param num_results Number of alternate results to return.
+  * @param num_results Number of candidate results to return.
   *
-  * @return A Result struct containing Metadata structs.
-  *         Each represents an alternate transcription, with the first ranked most probable.
-  *         The user is responsible for freeing Result by calling DS_FreeResult().
+  * @return A Metadata struct containing CandidateTranscript structs.
+  *         Each represents a candidate transcript, with the first ranked most probable.
+  *         The user is responsible for freeing the Metadata by calling DS_FreeMetadata().
   */
-  virtual Result* decode_metadata(const DecoderState& state,
-                                  size_t num_results);
+  virtual Metadata* decode_metadata(const DecoderState& state,
+                                    size_t num_results);
 };
 
 #endif // MODELSTATE_H
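
[Sketch of the renamed object graph after this patch, not part of the series; ctx, buffer and buffer_size are placeholders. A single Metadata now owns the whole tree.]

    Metadata* m = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    for (int i = 0; m && i < m->num_transcripts; ++i) {
      const CandidateTranscript* t = &m->transcripts[i];
      printf("confidence %f: ", t->confidence);
      for (int j = 0; j < t->num_tokens; ++j) {
        printf("%s", t->tokens[j].text);  // tokens carry text, timestep, start_time
      }
      printf("\n");
    }
    DS_FreeMetadata(m);  /* frees transcripts, tokens, and token text in one call */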
*/ - virtual Result* decode_metadata(const DecoderState& state, - size_t num_results); + virtual Metadata* decode_metadata(const DecoderState& state, + size_t num_results); }; #endif // MODELSTATE_H From ea8c7d2957d93cd7686751ba0860a10f7c5c330d Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 13:38:25 +0100 Subject: [PATCH 07/16] Add DS_IntermediateDecodeWithMetadata --- native_client/deepspeech.cc | 15 ++++++++++++++- native_client/deepspeech.h | 28 ++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index adaa0445..d284a319 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -78,6 +78,7 @@ struct StreamingState { void feedAudioContent(const short* buffer, unsigned int buffer_size); char* intermediateDecode() const; + Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const; void finalizeStream(); char* finishStream(); Metadata* finishStreamWithMetadata(unsigned int num_results); @@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const return model_->decode(decoder_state_); } +Metadata* +StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const +{ + return model_->decode_metadata(decoder_state_, num_results); +} + char* StreamingState::finishStream() { @@ -147,7 +154,6 @@ Metadata* StreamingState::finishStreamWithMetadata(unsigned int num_results) { finalizeStream(); - return model_->decode_metadata(decoder_state_, num_results); } @@ -403,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx) return aSctx->intermediateDecode(); } +Metadata* +DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, + unsigned int aNumResults) +{ + return aSctx->intermediateDecodeWithMetadata(aNumResults); +} + char* DS_FinishStream(StreamingState* aSctx) { diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 7aee1048..8bfee073 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -200,8 +200,10 @@ char* DS_SpeechToText(ModelState* aCtx, * @param aBufferSize The number of samples in the audio signal. * @param aNumResults The number of candidate transcripts to return. * - * @return Outputs a struct of individual letters along with their timing information. - * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. The user is + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * Returns NULL on error. */ DEEPSPEECH_EXPORT Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, @@ -248,6 +250,22 @@ void DS_FeedAudioContent(StreamingState* aSctx, DEEPSPEECH_EXPORT char* DS_IntermediateDecode(const StreamingState* aSctx); +/** + * @brief Compute the intermediate decoding of an ongoing streaming inference, + * returns per-letter metadata. + * + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aNumResults The number of candidate transcripts to return. + * + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. The user is + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * Returns NULL on error. 
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
 /**
  * @brief Signal the end of an audio signal to an ongoing streaming
  *        inference, returns the STT result over the whole audio signal.
@@ -269,8 +287,10 @@ char* DS_FinishStream(StreamingState* aSctx);
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 * @param aNumResults The number of candidate transcripts to return.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
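
[A hedged sketch of the new intermediate call inside a capture loop, not part of the series. sctx is assumed to come from DS_CreateStream(); chunk, chunk_size and have_more_audio() are hypothetical stand-ins for an audio source.]

    while (have_more_audio()) {
      DS_FeedAudioContent(sctx, chunk, chunk_size);
      Metadata* partial = DS_IntermediateDecodeWithMetadata(sctx, 1);
      if (partial) {
        // inspect partial->transcripts[0] here, e.g. for live captions
        DS_FreeMetadata(partial);  // every intermediate result must be freed
      }
    }
    Metadata* final_result = DS_FinishStreamWithMetadata(sctx, 3);  // also frees sctx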
out_string.str(); +} + +char* +MetadataToJSON(Metadata* result) +{ + std::ostringstream out_string; + out_string << "{\n"; + + for (int j=0; j < result->num_transcripts; ++j) { + CandidateTranscript *transcript = &result->transcripts[j]; + + if (j == 0) { + out_string << CandidateTranscriptToJSON(transcript); + + if (result->num_transcripts > 1) { + out_string << ",\n" << R"("alternatives")" << ":[\n"; + } + } else { + out_string << "{" << CandidateTranscriptToJSON(transcript) << "}"; + + if (j < result->num_transcripts - 1) { + out_string << ",\n"; + } else { + out_string << "\n]"; + } + } + } + + out_string << "\n}\n"; + + return strdup(out_string.str().c_str()); +} ds_result LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, @@ -58,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, clock_t ds_start_time = clock(); if (extended_output) { - Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); - res.string = metadataToString(&result->transcriptions[0]); - DS_FreeResult(result); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); + res.string = CandidateTranscriptToString(&result->transcripts[0]); + DS_FreeMetadata(result); } else if (json_output) { - Result *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); - res.string = JSONOutput(result); - DS_FreeResult(result); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); + res.string = MetadataToJSON(result); + DS_FreeMetadata(result); } else if (stream_size > 0) { StreamingState* ctx; int status = DS_CreateStream(aCtx, &ctx); @@ -279,119 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times) } } -char* -metadataToString(Metadata* metadata) -{ - std::string retval = ""; - for (int i = 0; i < metadata->num_items; i++) { - MetadataItem item = metadata->items[i]; - retval += item.character; - } - return strdup(retval.c_str()); -} - -std::vector -WordsFromMetadata(Metadata* metadata) -{ - std::vector word_list; - - std::string word = ""; - float word_start_time = 0; - - // Loop through each character - for (int i = 0; i < metadata->num_items; i++) { - MetadataItem item = metadata->items[i]; - - // Append character to word if it's not a space - if (strcmp(item.character, u8" ") != 0) { - // Log the start time of the new word - if (word.length() == 0) { - word_start_time = item.start_time; - } - word.append(item.character); - } - - // Word boundary is either a space or the last character in the array - if (strcmp(item.character, " ") == 0 - || strcmp(item.character, u8" ") == 0 - || i == metadata->num_items-1) { - - float word_duration = item.start_time - word_start_time; - - if (word_duration < 0) { - word_duration = 0; - } - - meta_word w; - w.word = word; - w.start_time = word_start_time; - w.duration = word_duration; - - word_list.push_back(w); - - // Reset - word = ""; - word_start_time = 0; - } - } - - return word_list; -} - -char* -JSONOutput(Result* result) -{ - std::ostringstream out_string; - out_string << "{\n"; - - for (int j=0; j < result->num_transcriptions; ++j) { - Metadata *metadata = &result->transcriptions[j]; - - if (j == 0) { - out_string << MetadataOutput(metadata); - - if (result->num_transcriptions > 1) { - out_string << ",\n" << R"("alternatives")" << ":[\n"; - } - } else { - out_string << "{" << MetadataOutput(metadata) << "}"; - - if (j < result->num_transcriptions - 1) { - out_string << ",\n"; - } else { - out_string << "\n]"; - } - } - } - - 
out_string << "\n}\n"; - - return strdup(out_string.str().c_str()); -} - -std::string -MetadataOutput(Metadata *metadata) -{ - std::ostringstream out_string; - - std::vector words = WordsFromMetadata(metadata); - - out_string << R"("metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)"; - - for (int i = 0; i < words.size(); i++) { - meta_word w = words[i]; - out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}"; - - if (i < words.size() - 1) { - out_string << ","; - } - } - - out_string << "]"; - - return out_string.str(); -} - int main(int argc, char **argv) { From 6e88a37ad4367f1481e29472bf0a299881e96e63 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 12:50:06 +0100 Subject: [PATCH 09/16] Adapt Python bindings to new API --- native_client/python/__init__.py | 65 +++++++++++++++++++----------- native_client/python/client.py | 31 +++++++------- native_client/python/impl.i | 69 ++++++++++++++++++++++++++------ 3 files changed, 116 insertions(+), 49 deletions(-) diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index a6511efe..5d9072ec 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -121,17 +121,20 @@ class Model(object): """ return deepspeech.impl.SpeechToText(self._impl, audio_buffer) - def sttWithMetadata(self, audio_buffer): + def sttWithMetadata(self, audio_buffer, num_results=1): """ Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :type audio_buffer: numpy.int16 array + :param num_results: Number of candidate transcripts to return. + :type num_results: int + :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` """ - return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer) + return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results) def createStream(self): """ @@ -187,6 +190,19 @@ class Stream(object): raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") return deepspeech.impl.IntermediateDecode(self._impl) + def intermediateDecodeWithMetadata(self, num_results=1): + """ + Compute the intermediate decoding of an ongoing streaming inference. + + :return: The STT intermediate result. + :type: str + + :throws: RuntimeError if the stream object is not valid + """ + if not self._impl: + raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") + return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results) + def finishStream(self): """ Signal the end of an audio signal to an ongoing streaming inference, @@ -203,11 +219,14 @@ class Stream(object): self._impl = None return result - def finishStreamWithMetadata(self): + def finishStreamWithMetadata(self, num_results=1): """ Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata. + :param num_results: Number of candidate transcripts to return. + :type num_results: int + :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` @@ -215,7 +234,7 @@ class Stream(object): """ if not self._impl: raise RuntimeError("Stream object is not valid. 
From 6e88a37ad4367f1481e29472bf0a299881e96e63 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 25 Feb 2020 12:50:06 +0100
Subject: [PATCH 09/16] Adapt Python bindings to new API

---
 native_client/python/__init__.py | 65 +++++++++++++++++++-----------
 native_client/python/client.py   | 31 +++++++-------
 native_client/python/impl.i      | 69 ++++++++++++++++++++++++++------
 3 files changed, 116 insertions(+), 49 deletions(-)

diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index a6511efe..5d9072ec 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -121,17 +121,20 @@ class Model(object):
         """
         return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
 
-    def sttWithMetadata(self, audio_buffer):
+    def sttWithMetadata(self, audio_buffer, num_results=1):
         """
         Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
 
         :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type audio_buffer: numpy.int16 array
 
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
         """
-        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
+        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
 
     def createStream(self):
         """
@@ -187,6 +190,19 @@ class Stream(object):
             raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
         return deepspeech.impl.IntermediateDecode(self._impl)
 
+    def intermediateDecodeWithMetadata(self, num_results=1):
+        """
+        Compute the intermediate decoding of an ongoing streaming inference and output metadata about the results.
+
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
+        :return: Metadata object containing the intermediate candidate transcripts.
+        :type: :func:`Metadata`
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
+        return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
+
     def finishStream(self):
         """
         Signal the end of an audio signal to an ongoing streaming inference,
@@ -203,11 +219,14 @@ class Stream(object):
         self._impl = None
         return result
 
-    def finishStreamWithMetadata(self):
+    def finishStreamWithMetadata(self, num_results=1):
         """
         Signal the end of an audio signal to an ongoing streaming inference,
         returns per-letter metadata.
 
+        :param num_results: Number of candidate transcripts to return.
+        :type num_results: int
+
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
 
         :throws: RuntimeError if the stream object is not valid
         """
         if not self._impl:
             raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
-        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
+        result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
         self._impl = None
         return result
 
@@ -233,52 +252,43 @@
 
 # This is only for documentation purpose
-# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
-class MetadataItem(object):
+# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
+class TokenMetadata(object):
     """
     Stores each individual character, along with its timing information
     """
 
-    def character(self):
+    def text(self):
         """
-        The character generated for transcription
+        The text for this token
         """
 
     def timestep(self):
         """
-        Position of the character in units of 20ms
+        Position of the token in units of 20ms
         """
 
     def start_time(self):
         """
-        Position of the character in seconds
+        Position of the token in seconds
         """
 
-class Metadata(object):
+class CandidateTranscript(object):
     """
     Stores the entire CTC output as an array of character metadata objects
     """
-    def items(self):
+    def tokens(self):
         """
-        List of items
+        List of tokens
 
-        :return: A list of :func:`MetadataItem` elements
+        :return: A list of :func:`TokenMetadata` elements
         :type: list
         """
 
-    def num_items(self):
-        """
-        Size of the list of items
-
-        :return: Size of the list of items
-        :type: int
-        """
-
     def confidence(self):
         """
         Approximated confidence value for this transcription. This is roughly the
         sum of the acoustic model logit values for each timestep/character that
         contributed to the creation of this transcription.
         """
 
+
+class Metadata(object):
+    def transcripts(self):
+        """
+        List of candidate transcripts
+
+        :return: A list of :func:`CandidateTranscript` objects
+        :type: list
+        """

diff --git a/native_client/python/client.py b/native_client/python/client.py
index 671968b9..00fa2ff6 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -18,6 +18,7 @@ try:
 except ImportError:
     from pipes import quote
 
+
 def convert_samplerate(audio_path, desired_sample_rate):
     sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
     try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):
 
 
 def metadata_to_string(metadata):
-    return ''.join(item.character for item in metadata.items)
+    return ''.join(token.text for token in metadata.tokens)
 
-def words_from_metadata(metadata):
+
+def words_from_candidate_transcript(metadata):
     word = ""
     word_list = []
     word_start_time = 0
     # Loop through each character
-    for i in range(0, metadata.num_items):
-        item = metadata.items[i]
+    for i, token in enumerate(metadata.tokens):
         # Append character to word if it's not a space
-        if item.character != " ":
+        if token.text != " ":
             if len(word) == 0:
                 # Log the start time of the new word
-                word_start_time = item.start_time
+                word_start_time = token.start_time
 
-            word = word + item.character
+            word = word + token.text
         # Word boundary is either a space or the last character in the array
-        if item.character == " " or i == metadata.num_items - 1:
-            word_duration = item.start_time - word_start_time
+        if token.text == " " or i == len(metadata.tokens) - 1:
+            word_duration = token.start_time - word_start_time
 
             if word_duration < 0:
                 word_duration = 0
@@ -69,9 +70,11 @@ def words_from_candidate_transcript(metadata):
 
 def metadata_json_output(metadata):
     json_result = dict()
-    json_result["words"] = words_from_metadata(metadata)
-    json_result["confidence"] = metadata.confidence
-    return json.dumps(json_result)
+    json_result["transcripts"] = [{
+        "confidence": transcript.confidence,
+        "words": words_from_candidate_transcript(transcript),
+    } for transcript in metadata.transcripts]
+    return json.dumps(json_result, indent=2)
 
@@ -141,9 +144,9 @@ def main():
     print('Running inference.', file=sys.stderr)
     inference_start = timer()
     if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio)))
+        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
     elif args.json:
-        print(metadata_json_output(ds.sttWithMetadata(audio)))
+        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
     else:
         print(ds.stt(audio))
     inference_end = timer() - inference_start

diff --git a/native_client/python/impl.i b/native_client/python/impl.i
index d6c7ba19..001a6165 100644
--- a/native_client/python/impl.i
+++ b/native_client/python/impl.i
@@ -38,30 +38,69 @@ import_array();
   %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
 }
 
-%typemap(out) MetadataItem* %{
-  $result = PyList_New(arg1->num_items);
-  for (int i = 0; i < arg1->num_items; ++i) {
-    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
+%fragment("parent_reference_init", "init") {
+  // Thread-safe initialization - initialize during Python module initialization
+  parent_reference();
+}
+
+%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
+
+static PyObject *parent_reference() {
+  static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
+  return parent_reference_string;
+}
+
+}
+
+%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
+  $result = PyList_New(arg1->num_transcripts);
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
+    // Add a reference to Metadata in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
+    PyList_SetItem($result, i, o);
+  }
+%}
+
+%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
+  $result = PyList_New(arg1->num_tokens);
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
+    // Add a reference to CandidateTranscript in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
     PyList_SetItem($result, i, o);
   }
 %}
 
-%extend struct MetadataItem {
+%extend struct TokenMetadata {
 %pythoncode %{
     def __repr__(self):
-        return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
+        return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
+%}
+}
+
+%extend struct CandidateTranscript {
+%pythoncode %{
+    def __repr__(self):
+        tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
+        tokens_repr = '\n'.join('    ' + l for l in tokens_repr.split('\n'))
+        return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
 %}
 }
 
 %extend struct Metadata {
 %pythoncode %{
     def __repr__(self):
-        items_repr = ', \n'.join('    ' + repr(i) for i in self.items)
-        return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
+        transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
+        transcripts_repr = '\n'.join('    ' + l for l in transcripts_repr.split('\n'))
+        return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
 %}
 }
 
-%ignore Metadata::num_items;
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
 
 %extend struct Metadata {
   ~Metadata() {
     DS_FreeMetadata($self);
   }
 }
 
-%nodefaultdtor Metadata;
+%immutable Metadata::transcripts;
+%immutable CandidateTranscript::tokens;
+%immutable TokenMetadata::text;
+
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
 %typemap(newfree) char* "DS_FreeString($1);";
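
[A quick smoke test for the reworked Python API, not part of the series. The model and WAV paths are placeholders, and Model's constructor arguments are elided and may differ by release:]

    import wave
    import numpy as np
    from deepspeech import Model

    ds = Model('deepspeech.pbmm')              # placeholder model path
    with wave.open('audio.wav', 'rb') as fin:  # placeholder 16 kHz mono WAV
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    metadata = ds.sttWithMetadata(audio, num_results=3)
    for transcript in metadata.transcripts:
        text = ''.join(token.text for token in transcript.tokens)
        print(transcript.confidence, text)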
for i in self.transcripts) + transcripts_repr = '\n'.join(' ' + l for l in transcripts_repr.split('\n')) + return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr) %} } -%ignore Metadata::num_items; +%ignore Metadata::num_transcripts; +%ignore CandidateTranscript::num_tokens; %extend struct Metadata { ~Metadata() { @@ -69,10 +108,16 @@ import_array(); } } -%nodefaultdtor Metadata; +%immutable Metadata::transcripts; +%immutable CandidateTranscript::tokens; +%immutable TokenMetadata::text; + %nodefaultctor Metadata; -%nodefaultctor MetadataItem; -%nodefaultdtor MetadataItem; +%nodefaultdtor Metadata; +%nodefaultctor CandidateTranscript; +%nodefaultdtor CandidateTranscript; +%nodefaultctor TokenMetadata; +%nodefaultdtor TokenMetadata; %typemap(newfree) char* "DS_FreeString($1);"; From 09048e2ea23c3e3f3d2f3d6d28c71d8283aca633 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 13:58:29 +0100 Subject: [PATCH 10/16] Adapt JavaScript bindings to new API --- native_client/javascript/client.js | 11 ++-- native_client/javascript/deepspeech.i | 40 ++++++++------ native_client/javascript/index.js | 75 +++++++++++++++++---------- 3 files changed, 77 insertions(+), 49 deletions(-) diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js index abbfe59e..16dd19e8 100644 --- a/native_client/javascript/client.js +++ b/native_client/javascript/client.js @@ -42,12 +42,11 @@ function totalTime(hrtimeValue) { return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4); } -function metadataToString(metadata) { +function candidateTranscriptToString(transcript) { var retval = "" - for (var i = 0; i < metadata.num_items; ++i) { - retval += metadata.items[i].character; + for (var i = 0; i < transcript.tokens.length; ++i) { + retval += transcript.tokens[i].text; } - Ds.FreeMetadata(metadata); return retval; } @@ -117,7 +116,9 @@ audioStream.on('finish', () => { const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate); if (args['extended']) { - console.log(metadataToString(model.sttWithMetadata(audioBuffer))); + let metadata = model.sttWithMetadata(audioBuffer, 1); + console.log(candidateTranscriptToString(metadata.transcripts[0])); + Ds.FreeMetadata(metadata); } else { console.log(model.stt(audioBuffer)); } diff --git a/native_client/javascript/deepspeech.i b/native_client/javascript/deepspeech.i index efbaa360..6b0151a4 100644 --- a/native_client/javascript/deepspeech.i +++ b/native_client/javascript/deepspeech.i @@ -47,8 +47,8 @@ using namespace node; %typemap(argout) ModelState **retval { $result = SWIGV8_ARRAY_NEW(); SWIGV8_AppendOutput($result, SWIG_From_int(result)); - // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below) - %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN)); + // owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves. 
+  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
 }
@@ -68,27 +68,33 @@ using namespace node;
 %nodefaultctor ModelState;
 %nodefaultdtor ModelState;
 
-%typemap(out) MetadataItem* %{
+%typemap(out) TokenMetadata* %{
   $result = SWIGV8_ARRAY_NEW();
-  for (int i = 0; i < arg1->num_items; ++i) {
-    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
   }
 %}
 
-%nodefaultdtor Metadata;
-%nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
-
-%extend struct Metadata {
-  ~Metadata() {
-    DS_FreeMetadata($self);
+%typemap(out) CandidateTranscript* %{
+  $result = SWIGV8_ARRAY_NEW();
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
   }
-}
+%}
 
-%extend struct MetadataItem {
-  ~MetadataItem() { }
-}
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
+
+%immutable Metadata::transcripts;
+%immutable CandidateTranscript::tokens;
+%immutable TokenMetadata::text;
+
+%nodefaultctor Metadata;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
 %rename ("%(strip:[DS_])s") "";
 
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index cca483f1..7a027bde 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -122,8 +122,9 @@ Model.prototype.stt = function(aBuffer) {
  *
  * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
-Model.prototype.sttWithMetadata = function(aBuffer) {
-    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
+Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
 }
 
 /**
@@ -171,6 +172,16 @@ Stream.prototype.intermediateDecode = function() {
     return binding.IntermediateDecode(this._impl);
 }
 
+/**
+ * Compute the intermediate decoding of an ongoing streaming inference.
+ *
+ * @return {string} The STT intermediate result.
+ */
+Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
+}
+
 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
  *
@@ -191,8 +202,9 @@ Stream.prototype.finishStream = function() {
  *
 * This method will free the stream, it must not be used after this method is called.
*/ -Stream.prototype.finishStreamWithMetadata = function() { - result = binding.FinishStreamWithMetadata(this._impl); +Stream.prototype.finishStreamWithMetadata = function(aNumResults) { + aNumResults = aNumResults || 1; + result = binding.FinishStreamWithMetadata(this._impl, aNumResults); this._impl = null; return result; } @@ -236,35 +248,58 @@ function Version() { } -//// Metadata and MetadataItem are here only for documentation purposes +//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes /** * @class * * Stores each individual character, along with its timing information */ -function MetadataItem() {} +function TokenMetadata() {} /** * The character generated for transcription * * @return {string} The character generated */ -MetadataItem.prototype.character = function() {} +TokenMetadata.prototype.text = function() {} /** * Position of the character in units of 20ms * * @return {int} The position of the character */ -MetadataItem.prototype.timestep = function() {}; +TokenMetadata.prototype.timestep = function() {}; /** * Position of the character in seconds * * @return {float} The position of the character */ -MetadataItem.prototype.start_time = function() {}; +TokenMetadata.prototype.start_time = function() {}; + +/** + * @class + * + * Stores the entire CTC output as an array of character metadata objects + */ +function CandidateTranscript () {} + +/** + * List of items + * + * @return {array} List of :js:func:`TokenMetadata` + */ +CandidateTranscript.prototype.items = function() {} + +/** + * Approximated confidence value for this transcription. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcription. + * + * @return {float} Confidence value + */ +CandidateTranscript.prototype.confidence = function() {} /** * @class @@ -276,30 +311,16 @@ function Metadata () {} /** * List of items * - * @return {array} List of :js:func:`MetadataItem` + * @return {array} List of :js:func:`CandidateTranscript` objects */ -Metadata.prototype.items = function() {} +Metadata.prototype.transcripts = function() {} -/** - * Size of the list of items - * - * @return {int} Number of items - */ -Metadata.prototype.num_items = function() {} - -/** - * Approximated confidence value for this transcription. This is roughly the - * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcription. 
- * - * @return {float} Confidence value - */ -Metadata.prototype.confidence = function() {} module.exports = { Model: Model, Metadata: Metadata, - MetadataItem: MetadataItem, + CandidateTranscript: CandidateTranscript, + TokenMetadata: TokenMetadata, Version: Version, FreeModel: FreeModel, FreeStream: FreeStream, From bb709ff9553f513afa20bde601fe03b7539a6759 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 14:18:23 +0100 Subject: [PATCH 11/16] Adapt .NET bindings to new API --- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 21 ++++-- .../DeepSpeechClient/DeepSpeechClient.csproj | 6 +- .../Extensions/NativeExtensions.cs | 69 ++++++++++++++----- .../Interfaces/IDeepSpeech.cs | 15 +++- .../Models/CandidateTranscript.cs | 17 +++++ .../DeepSpeechClient/Models/Metadata.cs | 8 +-- .../{MetadataItem.cs => TokenMetadata.cs} | 4 +- .../dotnet/DeepSpeechClient/NativeImp.cs | 34 +++++---- .../Structs/CandidateTranscript.cs | 22 ++++++ .../DeepSpeechClient/Structs/Metadata.cs | 12 ++-- .../{MetadataItem.cs => TokenMetadata.cs} | 6 +- .../dotnet/DeepSpeechConsole/Program.cs | 14 ++-- 12 files changed, 162 insertions(+), 66 deletions(-) create mode 100644 native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Models/{MetadataItem.cs => TokenMetadata.cs} (89%) create mode 100644 native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Structs/{MetadataItem.cs => TokenMetadata.cs} (80%) diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 576ed308..ce184cf4 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -202,10 +202,11 @@ namespace DeepSpeechClient /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. + /// Number of candidate transcripts to return. /// The extended metadata result. - public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream) + public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { - return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -218,6 +219,17 @@ namespace DeepSpeechClient return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString(); } + /// + /// Computes the intermediate decoding of an ongoing streaming inference. + /// + /// Instance of the stream to decode. + /// Number of candidate transcripts to return. + /// The STT intermediate result. + public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) + { + return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + } + /// /// Return version of this library. The returned version is a semantic version /// (SemVer 2.0.0). @@ -265,10 +277,11 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. + /// Number of candidate transcripts to return. /// The extended metadata. Returns NULL on error. 
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
         }
 
         #endregion
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
index b9077361..0139b3e8 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@@ -50,11 +50,13 @@
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
index 6b7f4c6a..9325f4b8 100644
--- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
+++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
         }
 
         ///
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        ///
+        /// Native pointer.
+        /// TokenMetadata managed object.
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        ///
+        /// Converts a pointer into managed CandidateTranscript object.
+        ///
+        /// Native pointer.
+        /// CandidateTranscript managed object.
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        ///
+        /// Converts a pointer into managed Metadata object.
+        ///
+        /// Native pointer.
+        /// Metadata managed object.
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
         {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
 
             //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
             {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
             }
+            NativeImp.DS_FreeMetadata(intPtr);
 
-            return managedMetaObject;
+            return managedMetadata;
         }
     }
 }
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index 18677abc..ae3e72cf 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
         ///
         /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
+        /// Number of candidate transcripts to return.
         /// The extended metadata. Returns NULL on error.
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-            uint aBufferSize);
+            uint aBufferSize,
+            uint aNumResults);
 
         ///
         /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
         /// The STT intermediate result.
         unsafe string IntermediateDecode(DeepSpeechStream stream);
 
+        ///
+        /// Computes the intermediate decoding of an ongoing streaming inference.
+        ///
+        /// Instance of the stream to decode.
+        /// Number of candidate transcripts to return.
+        /// The extended metadata result.
+        unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
         ///
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         ///
@@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         ///
         /// Instance of the stream to finish.
+        /// Number of candidate transcripts to return.
 /// The extended metadata result.
-        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
 }
diff --git a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
new file mode 100644
index 00000000..cc6b5d28
--- /dev/null
+++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    ///
+    /// Stores the entire CTC output as an array of character metadata objects.
+    ///
+    public class CandidateTranscript
+    {
+        ///
+        /// Approximated confidence value for this transcription.
+        ///
+        public double Confidence { get; set; }
+        ///
+        /// List of metadata tokens containing text, timestep, and time offset.
+        ///
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
index 870eb162..fb6c613d 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
@@ -6,12 +6,8 @@
     public class Metadata
     {
         ///
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
         ///
-        public double Confidence { get; set; }
-        ///
-        /// List of metada items containing char, timespet, and time offset.
-        ///
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
     }
 }
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
similarity index 89%
rename from native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
rename to native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
index e329c6cb..5f2dea56 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
@@ -3,12 +3,12 @@
     ///
     /// Stores each individual character, along with its timing information.
     ///
-    public class MetadataItem
+    public class TokenMetadata
     {
         ///
         /// Char of the current timestep.
         ///
-        public string Character;
+        public string Text;
         ///
 /// Position of the character in units of 20ms.
/// diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 6c3494b6..eabbfe48 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -17,45 +17,46 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - ref IntPtr** pint); + ref IntPtr** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx, - uint aBeamWidth); + uint aBeamWidth); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - uint aBeamWidth, - ref IntPtr** pint); + uint aBeamWidth, + ref IntPtr** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, - string aScorerPath); + string aScorerPath); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, - float aAlpha, - float aBeta); + float aAlpha, + float aBeta); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize, + uint aNumResults); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeModel(IntPtr** aCtx); @@ -76,18 +77,23 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx, + uint aNumResults); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx); + internal static unsafe extern 
IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx, + uint aNumResults); #endregion } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs new file mode 100644 index 00000000..54581f6f --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs @@ -0,0 +1,22 @@ +using System; +using System.Runtime.InteropServices; + +namespace DeepSpeechClient.Structs +{ + [StructLayout(LayoutKind.Sequential)] + internal unsafe struct CandidateTranscript + { + /// + /// Native list of tokens. + /// + internal unsafe IntPtr tokens; + /// + /// Count of tokens from the native side. + /// + internal unsafe int num_tokens; + /// + /// Approximated confidence value for this transcription. + /// + internal unsafe double confidence; + } +} diff --git a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs index 411da9f2..0a9beddc 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs @@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs internal unsafe struct Metadata { /// - /// Native list of items. + /// Native list of candidate transcripts. /// - internal unsafe IntPtr items; + internal unsafe IntPtr transcripts; /// - /// Count of items from the native side. + /// Count of transcripts from the native side. /// - internal unsafe int num_items; - /// - /// Approximated confidence value for this transcription. - /// - internal unsafe double confidence; + internal unsafe int num_transcripts; } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs similarity index 80% rename from native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs rename to native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs index 10092742..1c660c71 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs @@ -4,12 +4,12 @@ using System.Runtime.InteropServices; namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] - internal unsafe struct MetadataItem + internal unsafe struct TokenMetadata { /// - /// Native character. + /// Native text. /// - internal unsafe IntPtr character; + internal unsafe IntPtr text; /// /// Position of the character in units of 20ms. 
/// diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index b35c7046..a08e44b6 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -21,14 +21,14 @@ namespace CSharpExamples static string GetArgument(IEnumerable args, string option) => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault(); - static string MetadataToString(Metadata meta) + static string MetadataToString(CandidateTranscript transcript) { var nl = Environment.NewLine; string retval = - Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}" - + $"Confidence: {meta?.Confidence} {nl}" - + $"Item count: {meta?.Items?.Length} {nl}" - + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}")); + Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}" + + $"Confidence: {transcript?.Confidence} {nl}" + + $"Item count: {transcript?.Tokens?.Length} {nl}" + + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}")); return retval; } @@ -75,8 +75,8 @@ namespace CSharpExamples if (extended) { Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, - Convert.ToUInt32(waveBuffer.MaxSize / 2)); - speechResult = MetadataToString(metaResult); + Convert.ToUInt32(waveBuffer.MaxSize / 2), 1); + speechResult = MetadataToString(metaResult.Transcripts[0]); } else { From c52f3b32fa3c7001151beedc2ac77a40294c3c41 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 14:29:49 +0100 Subject: [PATCH 12/16] Adapt Java bindings to new API --- native_client/java/jni/deepspeech.i | 34 ++++++++++++++----- .../libdeepspeech/test/BasicTest.java | 10 +++--- .../libdeepspeech/DeepSpeechModel.java | 22 +++++++++--- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/native_client/java/jni/deepspeech.i b/native_client/java/jni/deepspeech.i index ded18439..4bbdc776 100644 --- a/native_client/java/jni/deepspeech.i +++ b/native_client/java/jni/deepspeech.i @@ -18,18 +18,32 @@ %typemap(newfree) char* "DS_FreeString($1);"; %include "carrays.i" -%array_functions(struct MetadataItem, metadataItem_array); +%array_functions(struct TokenMetadata, TokenMetadata_array); +%array_functions(struct CandidateTranscript, CandidateTranscript_array); + +%extend struct CandidateTranscript { + /** + * Retrieve one TokenMetadata element + * + * @param i Array index of the TokenMetadata to get + * + * @return The TokenMetadata requested or null + */ + TokenMetadata getToken(int i) { + return TokenMetadata_array_getitem(self->tokens, i); + } +} %extend struct Metadata { /** - * Retrieve one MetadataItem element + * Retrieve one CandidateTranscript element * - * @param i Array index of the MetadataItem to get + * @param i Array index of the CandidateTranscript to get * - * @return The MetadataItem requested or null + * @return The CandidateTranscript requested or null */ - MetadataItem getItem(int i) { - return metadataItem_array_getitem(self->items, i); + CandidateTranscript getTranscript(int i) { + return CandidateTranscript_array_getitem(self->transcripts, i); } ~Metadata() { @@ -37,10 +51,12 @@ } } -%nodefaultdtor Metadata; %nodefaultctor Metadata; -%nodefaultctor MetadataItem; -%nodefaultdtor MetadataItem; +%nodefaultdtor Metadata; +%nodefaultctor CandidateTranscript; 
+%nodefaultdtor CandidateTranscript; +%nodefaultctor TokenMetadata; +%nodefaultdtor TokenMetadata; %newobject DS_SpeechToText; %newobject DS_IntermediateDecode; diff --git a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java index 2957b2e7..f7eccf00 100644 --- a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java +++ b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java @@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters; import static org.junit.Assert.*; import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel; -import org.mozilla.deepspeech.libdeepspeech.Metadata; +import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript; import java.io.RandomAccessFile; import java.io.FileNotFoundException; @@ -61,10 +61,10 @@ public class BasicTest { m.freeModel(); } - private String metadataToString(Metadata m) { + private String candidateTranscriptToString(CandidateTranscript t) { String retval = ""; - for (int i = 0; i < m.getNum_items(); ++i) { - retval += m.getItem(i).getCharacter(); + for (int i = 0; i < t.getNum_tokens(); ++i) { + retval += t.getToken(i).getText(); } return retval; } @@ -97,7 +97,7 @@ public class BasicTest { ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts); if (extendedMetadata) { - return metadataToString(m.sttWithMetadata(shorts, shorts.length)); + return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0)); } else { return m.stt(shorts, shorts.length); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index 6d0a316b..b506b1d3 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -117,11 +117,12 @@ public class DeepSpeechModel { * @param buffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param buffer_size The number of samples in the audio signal. + * @param num_results Number of candidate transcripts to return. * * @return Outputs a Metadata object of individual letters along with their timing information. */ - public Metadata sttWithMetadata(short[] buffer, int buffer_size) { - return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size); + public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) { + return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results); } /** @@ -160,6 +161,18 @@ public class DeepSpeechModel { return impl.IntermediateDecode(ctx.get()); } + /** + * @brief Compute the intermediate decoding of an ongoing streaming inference. + * + * @param ctx A streaming state pointer returned by createStream(). + * @param num_results Number of candidate transcripts to return. + * + * @return The STT intermediate result. 
+ */ + public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) { + return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results); + } + /** * @brief Signal the end of an audio signal to an ongoing streaming * inference, returns the STT result over the whole audio signal. @@ -179,12 +192,13 @@ public class DeepSpeechModel { * inference, returns per-letter metadata. * * @param ctx A streaming state pointer returned by createStream(). + * @param num_results Number of candidate transcripts to return. * * @return Outputs a Metadata object of individual letters along with their timing information. * * @note This method will free the state pointer (@p ctx). */ - public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) { - return impl.FinishStreamWithMetadata(ctx.get()); + public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) { + return impl.FinishStreamWithMetadata(ctx.get(), num_results); } } From e9ae38bf4789b9a2f62520c622c1eba1af656a9c Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 15:43:36 +0100 Subject: [PATCH 13/16] Update docs --- doc/C-API.rst | 3 + doc/DotNet-API.rst | 19 ++-- doc/NodeJS-API.rst | 12 ++- doc/Python-API.rst | 12 ++- native_client/deepspeech.h | 27 +++--- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 12 +-- .../Interfaces/IDeepSpeech.cs | 12 +-- .../libdeepspeech/DeepSpeechModel.java | 21 ++-- .../CandidateTranscript.java | 96 +++++++++++++++++++ .../libdeepspeech_doc/Metadata.java | 62 +++++------- .../libdeepspeech_doc/TokenMetadata.java | 79 +++++++++++++++ native_client/javascript/index.js | 49 +++++----- native_client/python/__init__.py | 30 +++--- 13 files changed, 314 insertions(+), 120 deletions(-) create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java diff --git a/doc/C-API.rst b/doc/C-API.rst index 2506d9b2..2b0e7e05 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -34,6 +34,9 @@ C .. doxygenfunction:: DS_IntermediateDecode :project: deepspeech-c +.. doxygenfunction:: DS_IntermediateDecodeWithMetadata + :project: deepspeech-c + .. doxygenfunction:: DS_FinishStream :project: deepspeech-c diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 2ba3415f..d43c7afb 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -31,13 +31,20 @@ ErrorCodes Metadata -------- -.. doxygenstruct:: DeepSpeechClient::Structs::Metadata +.. doxygenstruct:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet - :members: items, num_items, confidence + :members: Transcripts -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem +.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet - :members: character, timestep, start_time + :members: Tokens, Confidence + +TokenMetadata +------------- + +.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata + :project: deepspeech-dotnet + :members: Text, Timestep, StartTime diff --git a/doc/NodeJS-API.rst b/doc/NodeJS-API.rst index aaba718c..b6170b5b 100644 --- a/doc/NodeJS-API.rst +++ b/doc/NodeJS-API.rst @@ -30,8 +30,14 @@ Metadata .. js:autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. js:autoclass:: MetadataItem +.. 
js:autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. js:autoclass:: TokenMetadata :members: diff --git a/doc/Python-API.rst b/doc/Python-API.rst index b2b3567f..9aec57f0 100644 --- a/doc/Python-API.rst +++ b/doc/Python-API.rst @@ -21,8 +21,14 @@ Metadata .. autoclass:: Metadata :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. autoclass:: MetadataItem +.. autoclass:: CandidateTranscript + :members: + +TokenMetadata +------------- + +.. autoclass:: TokenMetadata :members: diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 8bfee073..bf4c0f00 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -42,20 +42,20 @@ typedef struct CandidateTranscript { TokenMetadata* tokens; /** Size of the tokens array */ int num_tokens; - /** Approximated confidence value for this transcription. This is roughly the + /** Approximated confidence value for this transcript. This is roughly the * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcription. + * contributed to the creation of this transcript. */ double confidence; } CandidateTranscript; /** - * @brief An array of CandidateTranscript objects computed by the model + * @brief An array of CandidateTranscript objects computed by the model. */ typedef struct Metadata { /** Array of CandidateTranscript objects */ CandidateTranscript* transcripts; - /** Size of the transcriptions array */ + /** Size of the transcripts array */ int num_transcripts; } Metadata; @@ -191,14 +191,14 @@ char* DS_SpeechToText(ModelState* aCtx, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * @brief Use the DeepSpeech model to perform Speech-To-Text and output results + * including metadata. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The number of candidate transcripts to return. + * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is @@ -252,7 +252,7 @@ char* DS_IntermediateDecode(const StreamingState* aSctx); /** * @brief Compute the intermediate decoding of an ongoing streaming inference, - * returns per-letter metadata. + * return results including metadata. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -267,8 +267,8 @@ Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, unsigned int aNumResults); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. 
* @@ -281,8 +281,9 @@ DEEPSPEECH_EXPORT char* DS_FinishStream(StreamingState* aSctx); /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * results including metadata. Signals the end of an ongoing streaming + * inference. * * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. @@ -295,7 +296,7 @@ char* DS_FinishStream(StreamingState* aSctx); * @note This method will free the state pointer (@p aSctx). */ DEEPSPEECH_EXPORT -Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, +Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, unsigned int aNumResults); /** diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index ce184cf4..3340c9b3 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -199,10 +199,10 @@ namespace DeepSpeechClient } /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -220,10 +220,10 @@ namespace DeepSpeechClient } /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The STT intermediate result. public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) { @@ -273,11 +273,11 @@ namespace DeepSpeechClient } /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index ae3e72cf..37d6ce59 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -68,11 +68,11 @@ namespace DeepSpeechClient.Interfaces uint aBufferSize); /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. 
/// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, @@ -105,10 +105,10 @@ namespace DeepSpeechClient.Interfaces unsafe string IntermediateDecode(DeepSpeechStream stream); /// - /// Computes the intermediate decoding of an ongoing streaming inference. + /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. /// /// Instance of the stream to decode. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); @@ -120,10 +120,10 @@ namespace DeepSpeechClient.Interfaces unsafe string FinishStream(DeepSpeechStream stream); /// - /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. + /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. /// /// Instance of the stream to finish. - /// Number of candidate transcripts to return. + /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index b506b1d3..a5b339b3 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -117,9 +117,10 @@ public class DeepSpeechModel { * @param buffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param buffer_size The number of samples in the audio signal. - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. */ public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) { return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results); @@ -165,7 +166,7 @@ public class DeepSpeechModel { * @brief Compute the intermediate decoding of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * * @return The STT intermediate result. 
*/ @@ -174,8 +175,8 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns the STT result over the whole audio signal. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the result. Signals the end of an ongoing streaming inference. * * @param ctx A streaming state pointer returned by createStream(). * @@ -188,13 +189,15 @@ public class DeepSpeechModel { } /** - * @brief Signal the end of an audio signal to an ongoing streaming - * inference, returns per-letter metadata. + * @brief Compute the final decoding of an ongoing streaming inference and return + * the results including metadata. Signals the end of an ongoing streaming + * inference. * * @param ctx A streaming state pointer returned by createStream(). - * @param num_results Number of candidate transcripts to return. + * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this. * - * @return Outputs a Metadata object of individual letters along with their timing information. + * @return Metadata struct containing multiple candidate transcripts. Each transcript + * has per-token metadata including timing information. * * @note This method will free the state pointer (@p ctx). */ diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java new file mode 100644 index 00000000..c02b39ad --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. + */ +public class CandidateTranscript { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected CandidateTranscript(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(CandidateTranscript obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * Array of TokenMetadata objects + */ + public void setTokens(TokenMetadata value) { + implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value); + } + + /** + * Array of TokenMetadata objects + */ + public TokenMetadata getTokens() { + long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this); + return (cPtr == 0) ? 
null : new TokenMetadata(cPtr, false); + } + + /** + * Size of the tokens array + */ + public void setNum_tokens(int value) { + implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value); + } + + /** + * Size of the tokens array + */ + public int getNum_tokens() { + return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public void setConfidence(double value) { + implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value); + } + + /** + * Approximated confidence value for this transcript. This is roughly the + * sum of the acoustic model logit values for each timestep/character that + * contributed to the creation of this transcript. + */ + public double getConfidence() { + return implJNI.CandidateTranscript_confidence_get(swigCPtr, this); + } + + /** + * Retrieve one TokenMetadata element + * + * @param i Array index of the TokenMetadata to get + * + * @return The TokenMetadata requested or null + */ + public TokenMetadata getToken(int i) { + return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true); + } + +} diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java index 482b7c58..bb9b0773 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 4.0.2 + * Version 4.0.1 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. @@ -9,7 +9,7 @@ package org.mozilla.deepspeech.libdeepspeech; /** - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ public class Metadata { private transient long swigCPtr; @@ -40,61 +40,43 @@ public class Metadata { } /** - * List of items + * Array of CandidateTranscript objects */ - public void setItems(MetadataItem value) { - implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value); + public void setTranscripts(CandidateTranscript value) { + implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value); } /** - * List of items + * Array of CandidateTranscript objects */ - public MetadataItem getItems() { - long cPtr = implJNI.Metadata_items_get(swigCPtr, this); - return (cPtr == 0) ? null : new MetadataItem(cPtr, false); + public CandidateTranscript getTranscripts() { + long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this); + return (cPtr == 0) ? 
null : new CandidateTranscript(cPtr, false); } /** - * Size of the list of items + * Size of the transcripts array */ - public void setNum_items(int value) { - implJNI.Metadata_num_items_set(swigCPtr, this, value); + public void setNum_transcripts(int value) { + implJNI.Metadata_num_transcripts_set(swigCPtr, this, value); } /** - * Size of the list of items + * Size of the transcripts array */ - public int getNum_items() { - return implJNI.Metadata_num_items_get(swigCPtr, this); + public int getNum_transcripts() { + return implJNI.Metadata_num_transcripts_get(swigCPtr, this); } /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. + * Retrieve one CandidateTranscript element + * + * @param i Array index of the CandidateTranscript to get + * + * @return The CandidateTranscript requested or null */ - public void setConfidence(double value) { - implJNI.Metadata_confidence_set(swigCPtr, this, value); - } - - /** - * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
- * contributed to the creation of this transcription. - */ - public double getConfidence() { - return implJNI.Metadata_confidence_get(swigCPtr, this); - } - - /** - * Retrieve one MetadataItem element
- *
- * @param i Array index of the MetadataItem to get
- *
- * @return The MetadataItem requested or null - */ - public MetadataItem getItem(int i) { - return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true); + public CandidateTranscript getTranscript(int i) { + return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java new file mode 100644 index 00000000..32246f1a --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +/** + * Stores text of an individual token, along with its timing information + */ +public class TokenMetadata { + private transient long swigCPtr; + protected transient boolean swigCMemOwn; + + protected TokenMetadata(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + } + + protected static long getCPtr(TokenMetadata obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } + + public synchronized void delete() { + if (swigCPtr != 0) { + if (swigCMemOwn) { + swigCMemOwn = false; + throw new UnsupportedOperationException("C++ destructor does not have public access"); + } + swigCPtr = 0; + } + } + + /** + * The text corresponding to this token + */ + public void setText(String value) { + implJNI.TokenMetadata_text_set(swigCPtr, this, value); + } + + /** + * The text corresponding to this token + */ + public String getText() { + return implJNI.TokenMetadata_text_get(swigCPtr, this); + } + + /** + * Position of the token in units of 20ms + */ + public void setTimestep(int value) { + implJNI.TokenMetadata_timestep_set(swigCPtr, this, value); + } + + /** + * Position of the token in units of 20ms + */ + public int getTimestep() { + return implJNI.TokenMetadata_timestep_get(swigCPtr, this); + } + + /** + * Position of the token in seconds + */ + public void setStart_time(float value) { + implJNI.TokenMetadata_start_time_set(swigCPtr, this, value); + } + + /** + * Position of the token in seconds + */ + public float getStart_time() { + return implJNI.TokenMetadata_start_time_get(swigCPtr, this); + } + +} diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js index 7a027bde..6ce06c0d 100644 --- a/native_client/javascript/index.js +++ b/native_client/javascript/index.js @@ -115,12 +115,12 @@ Model.prototype.stt = function(aBuffer) { } /** - * Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. + * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata. * * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified. 
 *
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
     aNumResults = aNumResults || 1;
@@ -173,9 +173,11 @@ Stream.prototype.intermediateDecode = function() {
 }
 
 /**
- * Compute the intermediate decoding of an ongoing streaming inference.
+ * Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
  *
- * @return {string} The STT intermediate result.
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
  */
 Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
     aNumResults = aNumResults || 1;
@@ -183,7 +185,7 @@ Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
  *
  * @return {string} The STT result.
  *
@@ -196,7 +198,9 @@ Stream.prototype.finishStream = function() {
 }
 
 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
  *
  * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
* @@ -253,48 +257,49 @@ function Version() { /** * @class * - * Stores each individual character, along with its timing information + * Stores text of an individual token, along with its timing information */ function TokenMetadata() {} /** - * The character generated for transcription + * The text corresponding to this token * - * @return {string} The character generated + * @return {string} The text generated */ TokenMetadata.prototype.text = function() {} /** - * Position of the character in units of 20ms + * Position of the token in units of 20ms * - * @return {int} The position of the character + * @return {int} The position of the token */ TokenMetadata.prototype.timestep = function() {}; /** - * Position of the character in seconds + * Position of the token in seconds * - * @return {float} The position of the character + * @return {float} The position of the token */ TokenMetadata.prototype.start_time = function() {}; /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * A single transcript computed by the model, including a confidence value and + * the metadata for its constituent tokens. */ function CandidateTranscript () {} /** - * List of items + * Array of tokens * - * @return {array} List of :js:func:`TokenMetadata` + * @return {array} Array of :js:func:`TokenMetadata` */ -CandidateTranscript.prototype.items = function() {} +CandidateTranscript.prototype.tokens = function() {} /** * Approximated confidence value for this transcription. This is roughly the - * sum of the acoustic model logit values for each timestep/character that + * sum of the acoustic model logit values for each timestep/token that * contributed to the creation of this transcription. * * @return {float} Confidence value @@ -304,14 +309,14 @@ CandidateTranscript.prototype.confidence = function() {} /** * @class * - * Stores the entire CTC output as an array of character metadata objects + * An array of CandidateTranscript objects computed by the model. */ function Metadata () {} /** - * List of items + * Array of transcripts * - * @return {array} List of :js:func:`CandidateTranscript` objects + * @return {array} Array of :js:func:`CandidateTranscript` objects */ Metadata.prototype.transcripts = function() {} diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index 5d9072ec..a44cf05f 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -123,15 +123,15 @@ class Model(object): def sttWithMetadata(self, audio_buffer, num_results=1): """ - Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. + Use the DeepSpeech model to perform Speech-To-Text and return results including metadata. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :type audio_buffer: numpy.int16 array - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. 
:type: :func:`Metadata` """ return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results) @@ -192,10 +192,13 @@ class Stream(object): def intermediateDecodeWithMetadata(self, num_results=1): """ - Compute the intermediate decoding of an ongoing streaming inference. + Compute the intermediate decoding of an ongoing streaming inference and return results including metadata. - :return: The STT intermediate result. - :type: str + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. + :type num_results: int + + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. + :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid """ @@ -205,8 +208,9 @@ class Stream(object): def finishStream(self): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns the STT result over the whole audio signal. + Compute the final decoding of an ongoing streaming inference and return + the result. Signals the end of an ongoing streaming inference. The underlying + stream object must not be used after this method is called. :return: The STT result. :type: str @@ -221,13 +225,15 @@ class Stream(object): def finishStreamWithMetadata(self, num_results=1): """ - Signal the end of an audio signal to an ongoing streaming inference, - returns per-letter metadata. + Compute the final decoding of an ongoing streaming inference and return + results including metadata. Signals the end of an ongoing streaming + inference. The underlying stream object must not be used after this + method is called. - :param num_results: Number of candidate transcripts to return. + :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this. :type num_results: int - :return: Outputs a struct of individual letters along with their timing information. + :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. :type: :func:`Metadata` :throws: RuntimeError if the stream object is not valid From 2ec34d5a067334a84b323328c149bd9752008059 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 17 Mar 2020 14:47:18 +0100 Subject: [PATCH 14/16] Address review comments --- doc/DotNet-API.rst | 6 ++-- doc/Java-API.rst | 16 +++++++--- doc/Structs.rst | 13 ++++++-- doc/doxygen-dotnet.conf | 2 +- native_client/args.h | 32 ++++++++++++------- native_client/client.cc | 6 ++-- .../ctcdecode/ctc_beam_search_decoder.h | 2 +- native_client/deepspeech.h | 18 +++++------ native_client/modelstate.h | 2 +- 9 files changed, 59 insertions(+), 38 deletions(-) diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index d43c7afb..b4f85dfc 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -31,20 +31,20 @@ ErrorCodes Metadata -------- -.. doxygenstruct:: DeepSpeechClient::Models::Metadata +.. doxygenclass:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet :members: Transcripts CandidateTranscript ------------------- -.. doxygenstruct:: DeepSpeechClient::Models::CandidateTranscript +.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet :members: Tokens, Confidence TokenMetadata ------------- -.. doxygenstruct:: DeepSpeechClient::Models::TokenMetadata +.. 
doxygenclass:: DeepSpeechClient::Models::TokenMetadata :project: deepspeech-dotnet :members: Text, Timestep, StartTime diff --git a/doc/Java-API.rst b/doc/Java-API.rst index a485dc02..2986ca97 100644 --- a/doc/Java-API.rst +++ b/doc/Java-API.rst @@ -13,11 +13,17 @@ Metadata .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata :project: deepspeech-java - :members: getItems, getNum_items, getProbability, getItem + :members: getTranscripts, getNum_transcripts, getTranscript -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript :project: deepspeech-java - :members: getCharacter, getTimestep, getStart_time + :members: getTokens, getNum_tokens, getConfidence, getToken + +TokenMetadata +------------- +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata + :project: deepspeech-java + :members: getText, getTimestep, getStart_time diff --git a/doc/Structs.rst b/doc/Structs.rst index 713e52e0..5d532277 100644 --- a/doc/Structs.rst +++ b/doc/Structs.rst @@ -8,9 +8,16 @@ Metadata :project: deepspeech-c :members: -MetadataItem ------------- +CandidateTranscript +------------------- -.. doxygenstruct:: MetadataItem +.. doxygenstruct:: CandidateTranscript + :project: deepspeech-c + :members: + +TokenMetadata +------------- + +.. doxygenstruct:: TokenMetadata :project: deepspeech-c :members: diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf index ad64cfcb..74c2c5bb 100644 --- a/doc/doxygen-dotnet.conf +++ b/doc/doxygen-dotnet.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/ +INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses
diff --git a/native_client/args.h b/native_client/args.h
index 33b9b8fe..ca28bfb7 100644
--- a/native_client/args.h
+++ b/native_client/args.h
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
     "\n"
     "Running DeepSpeech inference.\n"
     "\n"
-    "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-    "\t--scorer SCORER\t\tPath to the external scorer file\n"
-    "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-    "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-    "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-    "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-    "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-    "\t--extended\t\tOutput string from extended metadata\n"
-    "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-    "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-    "\t--help\t\t\tShow help\n"
-    "\t--version\t\tPrint version and exits\n";
+    "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+    "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+    "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+    "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+    "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+    "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+    "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+    "\t--extended\t\t\tOutput string from extended metadata\n"
+    "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+    "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+    "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+    "\t--help\t\t\t\tShow help\n"
+    "\t--version\t\t\tPrint version and exit\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
diff --git a/native_client/client.cc b/native_client/client.cc
index 9ab47f27..f108419b 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -49,7 +49,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
     retval += token.text;
   }
   return strdup(retval.c_str());
@@ -65,7 +65,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 {
   std::vector<meta_word> word_list;
 
   // Loop through each token
   for (int i = 0; i < transcript->num_tokens; i++) {
-    TokenMetadata token = transcript->tokens[i];
+    const TokenMetadata& token = transcript->tokens[i];
 
     // Append token to word if it's not a space
     if (strcmp(token.text, u8" ") != 0) {
@@ -167,7 +167,7 @@ LocalDsSTT(ModelState*
aCtx, const short* aBuffer, size_t aBufferSize, res.string = CandidateTranscriptToString(&result->transcripts[0]); DS_FreeMetadata(result); } else if (json_output) { - Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 3); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); res.string = MetadataToJSON(result); DS_FreeMetadata(result); } else if (stream_size > 0) { diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.h b/native_client/ctcdecode/ctc_beam_search_decoder.h index 78871b2a..b785e097 100644 --- a/native_client/ctcdecode/ctc_beam_search_decoder.h +++ b/native_client/ctcdecode/ctc_beam_search_decoder.h @@ -60,7 +60,7 @@ public: int time_dim, int class_dim); - /* Get transcription from current decoder state + /* Get up to num_results transcriptions from current decoder state. * * Parameters: * num_results: Number of beams to return. diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index bf4c0f00..6fb9645c 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -27,7 +27,7 @@ typedef struct TokenMetadata { char* text; /** Position of the token in units of 20ms */ - int timestep; + unsigned int timestep; /** Position of the token in seconds */ float start_time; @@ -41,7 +41,7 @@ typedef struct CandidateTranscript { /** Array of TokenMetadata objects */ TokenMetadata* tokens; /** Size of the tokens array */ - int num_tokens; + unsigned int num_tokens; /** Approximated confidence value for this transcript. This is roughly the * sum of the acoustic model logit values for each timestep/character that * contributed to the creation of this transcript. @@ -56,7 +56,7 @@ typedef struct Metadata { /** Array of CandidateTranscript objects */ CandidateTranscript* transcripts; /** Size of the transcripts array */ - int num_transcripts; + unsigned int num_transcripts; } Metadata; enum DeepSpeech_Error_Codes @@ -175,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx, float aBeta); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text. + * @brief Use the DeepSpeech model to convert speech to text. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate @@ -191,18 +191,18 @@ char* DS_SpeechToText(ModelState* aCtx, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output results + * @brief Use the DeepSpeech model to convert speech to text and output results * including metadata. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate * sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The maximum number of candidate transcripts to return. Returned value might be smaller than this. + * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this. * - * @return Metadata struct containing multiple candidate transcripts. Each transcript - * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * @return Metadata struct containing multiple CandidateTranscript structs. Each + * transcript has per-token metadata including timing information. The + * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. 
 * Returns NULL on error.
 */
DEEPSPEECH_EXPORT
diff --git a/native_client/modelstate.h b/native_client/modelstate.h
index 43eef970..0dbe108a 100644
--- a/native_client/modelstate.h
+++ b/native_client/modelstate.h
@@ -66,7 +66,7 @@ struct ModelState {
    * @brief Return character-level metadata including letter timings.
    *
    * @param state Decoder state to use when decoding.
-   * @param num_results Number of candidate results to return.
+   * @param num_results Maximum number of candidate results to return.
    *
    * @return A Metadata struct containing CandidateTranscript structs.
    *         Each represents a candidate transcript, with the first ranked most probable.

From 1547498e82c3ad1a0c648a93c62a4b2091074c45 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Wed, 18 Mar 2020 19:11:58 +0100
Subject: [PATCH 15/16] Const members in structs

---
 native_client/client.cc               |  8 ++---
 native_client/deepspeech.cc           |  8 ++---
 native_client/deepspeech.h            | 16 +++++-----
 native_client/javascript/deepspeech.i |  4 ---
 native_client/modelstate.cc           | 42 ++++++++++++++-------------
 native_client/python/impl.i           |  4 ---
 6 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/native_client/client.cc b/native_client/client.cc
index f108419b..1f7f78eb 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -45,7 +45,7 @@ struct meta_word {
 };
 
 char*
-CandidateTranscriptToString(CandidateTranscript* transcript)
+CandidateTranscriptToString(const CandidateTranscript* transcript)
 {
   std::string retval = "";
   for (int i = 0; i < transcript->num_tokens; i++) {
@@ -56,7 +56,7 @@ CandidateTranscriptToString(CandidateTranscript* transcript)
 }
 
 std::vector<meta_word>
-CandidateTranscriptToWords(CandidateTranscript* transcript)
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
 {
   std::vector<meta_word> word_list;
 
@@ -101,7 +101,7 @@ CandidateTranscriptToWords(CandidateTranscript* transcript)
 }
 
 std::string
-CandidateTranscriptToJSON(CandidateTranscript *transcript)
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
 {
   std::ostringstream out_string;
 
@@ -130,7 +130,7 @@ MetadataToJSON(Metadata* result)
   out_string << "{\n";
 
   for (int j=0; j < result->num_transcripts; ++j) {
-    CandidateTranscript *transcript = &result->transcripts[j];
+    const CandidateTranscript *transcript = &result->transcripts[j];
 
     if (j == 0) {
       out_string << CandidateTranscriptToJSON(transcript);
diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index d284a319..96989e04 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -478,14 +478,14 @@ DS_FreeMetadata(Metadata* m)
   if (m) {
     for (int i = 0; i < m->num_transcripts; ++i) {
       for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
-        free(m->transcripts[i].tokens[j].text);
+        free((void*)m->transcripts[i].tokens[j].text);
       }
 
-      delete[] m->transcripts[i].tokens;
+      free((void*)m->transcripts[i].tokens);
     }
 
-    delete[] m->transcripts;
-    delete m;
+    free((void*)m->transcripts);
+    free(m);
   }
 }
 
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 6fb9645c..a8c29c93 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -24,13 +24,13 @@ typedef struct StreamingState StreamingState;
  */
 typedef struct TokenMetadata {
   /** The text corresponding to this token */
-  char* text;
+  const char* const text;
 
   /** Position of the token in units of 20ms */
-  unsigned int timestep;
+  const unsigned int timestep;
 
   /** Position of the token in seconds */
-  float start_time;
+  const float start_time;
 } TokenMetadata;
 
 /**
@@ -39,14 +39,14 @@ typedef struct TokenMetadata {
  */
 typedef struct CandidateTranscript {
   /** Array of TokenMetadata objects */
-  TokenMetadata* tokens;
+  const TokenMetadata* const tokens;
   /** Size of the tokens array */
-  unsigned int num_tokens;
+  const unsigned int num_tokens;
   /** Approximated confidence value for this transcript. This is roughly the
    * sum of the acoustic model logit values for each timestep/character that
    * contributed to the creation of this transcript.
    */
-  double confidence;
+  const double confidence;
 } CandidateTranscript;
 
 /**
@@ -54,9 +54,9 @@ typedef struct CandidateTranscript {
  */
 typedef struct Metadata {
   /** Array of CandidateTranscript objects */
-  CandidateTranscript* transcripts;
+  const CandidateTranscript* const transcripts;
   /** Size of the transcripts array */
-  unsigned int num_transcripts;
+  const unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
diff --git a/native_client/javascript/deepspeech.i b/native_client/javascript/deepspeech.i
index 6b0151a4..cb3968c2 100644
--- a/native_client/javascript/deepspeech.i
+++ b/native_client/javascript/deepspeech.i
@@ -85,10 +85,6 @@ using namespace node;
 %ignore Metadata::num_transcripts;
 %ignore CandidateTranscript::num_tokens;
 
-%immutable Metadata::transcripts;
-%immutable CandidateTranscripts::tokens;
-%immutable TokenMetadata::text;
-
 %nodefaultctor Metadata;
 %nodefaultdtor Metadata;
 %nodefaultctor CandidateTranscript;
diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc
index d4f16636..3cb06ac2 100644
--- a/native_client/modelstate.cc
+++ b/native_client/modelstate.cc
@@ -41,33 +41,35 @@ ModelState::decode_metadata(const DecoderState& state,
                             size_t num_results)
 {
   vector<Output> out = state.decode(num_results);
-  size_t num_returned = out.size();
+  unsigned int num_returned = out.size();
 
-  std::unique_ptr<Metadata> metadata(new Metadata);
-  metadata->num_transcripts = num_returned;
-
-  std::unique_ptr<CandidateTranscript[]> transcripts(new CandidateTranscript[num_returned]);
+  CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);
 
   for (int i = 0; i < num_returned; ++i) {
-    transcripts[i].num_tokens = out[i].tokens.size();
-    transcripts[i].confidence = out[i].confidence;
+    TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());
 
-    std::unique_ptr<TokenMetadata[]> tokens(new TokenMetadata[transcripts[i].num_tokens]);
-
-    // Loop through each token
     for (int j = 0; j < out[i].tokens.size(); ++j) {
-      tokens[j].text = strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str());
-      tokens[j].timestep = out[i].timesteps[j];
-      tokens[j].start_time = out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_);
-
-      if (tokens[j].start_time < 0) {
-        tokens[j].start_time = 0;
-      }
+      TokenMetadata token {
+        strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
+        static_cast<unsigned int>(out[i].timesteps[j]), // timestep
+        out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
+      };
+      memcpy(&tokens[j], &token, sizeof(TokenMetadata));
     }
 
-    transcripts[i].tokens = tokens.release();
+    CandidateTranscript transcript {
+      tokens, // tokens
+      static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
+      out[i].confidence, // confidence
+    };
+    memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
   }
 
-  metadata->transcripts = transcripts.release();
-  return metadata.release();
+  Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
+  Metadata metadata {
+    transcripts, // transcripts
+    num_returned, // num_transcripts
+  };
+  memcpy(ret, &metadata, sizeof(Metadata));
+
return ret; } diff --git a/native_client/python/impl.i b/native_client/python/impl.i index 001a6165..259a5b5d 100644 --- a/native_client/python/impl.i +++ b/native_client/python/impl.i @@ -108,10 +108,6 @@ static PyObject *parent_reference() { } } -%immutable Metadata::transcripts; -%immutable CandidateTranscript::tokens; -%immutable TokenMetadata::text; - %nodefaultctor Metadata; %nodefaultdtor Metadata; %nodefaultctor CandidateTranscript; From ee30a1c9dead1b7cbd86ab51a2039f5e1859740b Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 18 Mar 2020 19:49:14 +0100 Subject: [PATCH 16/16] Adapt Java bindings to const structs --- native_client/java/jni/deepspeech.i | 18 +++-- .../libdeepspeech/DeepSpeechModel.java | 37 +++++++---- .../CandidateTranscript.java | 47 ++++---------- .../DeepSpeech_Error_Codes.java | 65 +++++++++++++++++++ .../libdeepspeech_doc/Metadata.java | 30 +++------ .../deepspeech/libdeepspeech_doc/README.rst | 2 +- .../libdeepspeech_doc/TokenMetadata.java | 29 ++------- 7 files changed, 122 insertions(+), 106 deletions(-) create mode 100644 native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java diff --git a/native_client/java/jni/deepspeech.i b/native_client/java/jni/deepspeech.i index 4bbdc776..c028714c 100644 --- a/native_client/java/jni/deepspeech.i +++ b/native_client/java/jni/deepspeech.i @@ -6,6 +6,8 @@ %} %include "typemaps.i" +%include "enums.swg" +%javaconst(1); %include "arrays_java.i" // apply to DS_FeedAudioContent and DS_SpeechToText @@ -15,12 +17,6 @@ %pointer_functions(ModelState*, modelstatep); %pointer_functions(StreamingState*, streamingstatep); -%typemap(newfree) char* "DS_FreeString($1);"; - -%include "carrays.i" -%array_functions(struct TokenMetadata, TokenMetadata_array); -%array_functions(struct CandidateTranscript, CandidateTranscript_array); - %extend struct CandidateTranscript { /** * Retrieve one TokenMetadata element @@ -29,8 +25,8 @@ * * @return The TokenMetadata requested or null */ - TokenMetadata getToken(int i) { - return TokenMetadata_array_getitem(self->tokens, i); + const TokenMetadata& getToken(int i) { + return self->tokens[i]; } } @@ -42,8 +38,8 @@ * * @return The CandidateTranscript requested or null */ - CandidateTranscript getTranscript(int i) { - return CandidateTranscript_array_getitem(self->transcripts, i); + const CandidateTranscript& getTranscript(int i) { + return self->transcripts[i]; } ~Metadata() { @@ -58,9 +54,11 @@ %nodefaultctor TokenMetadata; %nodefaultdtor TokenMetadata; +%typemap(newfree) char* "DS_FreeString($1);"; %newobject DS_SpeechToText; %newobject DS_IntermediateDecode; %newobject DS_FinishStream; +%newobject DS_ErrorCodeToErrorMessage; %rename ("%(strip:[DS_])s") ""; diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index a5b339b3..eafa11e2 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -11,8 +11,15 @@ public class DeepSpeechModel { } // FIXME: We should have something better than those SWIGTYPE_* - SWIGTYPE_p_p_ModelState _mspp; - SWIGTYPE_p_ModelState _msp; + private SWIGTYPE_p_p_ModelState _mspp; + private SWIGTYPE_p_ModelState _msp; + + private void evaluateErrorCode(int errorCode) { + 
DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode); + if (code != DeepSpeech_Error_Codes.ERR_OK) { + throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ")."); + } + } /** * @brief An object providing an interface to a trained DeepSpeech model. @@ -20,10 +27,12 @@ public class DeepSpeechModel { * @constructor * * @param modelPath The path to the frozen model graph. + * + * @throws RuntimeException on failure. */ public DeepSpeechModel(String modelPath) { this._mspp = impl.new_modelstatep(); - impl.CreateModel(modelPath, this._mspp); + evaluateErrorCode(impl.CreateModel(modelPath, this._mspp)); this._msp = impl.modelstatep_value(this._mspp); } @@ -43,10 +52,10 @@ public class DeepSpeechModel { * @param aBeamWidth The beam width used by the model. A larger beam width value * generates better results at the cost of decoding time. * - * @return Zero on success, non-zero on failure. + * @throws RuntimeException on failure. */ - public int setBeamWidth(long beamWidth) { - return impl.SetModelBeamWidth(this._msp, beamWidth); + public void setBeamWidth(long beamWidth) { + evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth)); } /** @@ -70,19 +79,19 @@ public class DeepSpeechModel { * * @param scorer The path to the external scorer file. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void enableExternalScorer(String scorer) { - impl.EnableExternalScorer(this._msp, scorer); + evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer)); } /** * @brief Disable decoding using an external scorer. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void disableExternalScorer() { - impl.DisableExternalScorer(this._msp); + evaluateErrorCode(impl.DisableExternalScorer(this._msp)); } /** @@ -91,10 +100,10 @@ public class DeepSpeechModel { * @param alpha The alpha hyperparameter of the decoder. Language model weight. * @param beta The beta hyperparameter of the decoder. Word insertion weight. * - * @return Zero on success, non-zero on failure (invalid arguments). + * @throws RuntimeException on failure. */ public void setScorerAlphaBeta(float alpha, float beta) { - impl.SetScorerAlphaBeta(this._msp, alpha, beta); + evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta)); } /* @@ -132,10 +141,12 @@ public class DeepSpeechModel { * and finishStream(). * * @return An opaque object that represents the streaming state. + * + * @throws RuntimeException on failure. 
*/ public DeepSpeechStreamingState createStream() { SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep(); - impl.CreateStream(this._msp, ssp); + evaluateErrorCode(impl.CreateStream(this._msp, ssp)); return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp)); } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java index c02b39ad..fa13c474 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/CandidateTranscript.java @@ -9,8 +9,8 @@ package org.mozilla.deepspeech.libdeepspeech; /** - * A single transcript computed by the model, including a confidence value and - * the metadata for its constituent tokens. + * A single transcript computed by the model, including a confidence
+ * value and the metadata for its constituent tokens. */ public class CandidateTranscript { private transient long swigCPtr; @@ -36,14 +36,7 @@ public class CandidateTranscript { } /** - * Array of TokenMetadata objects - */ - public void setTokens(TokenMetadata value) { - implJNI.CandidateTranscript_tokens_set(swigCPtr, this, TokenMetadata.getCPtr(value), value); - } - - /** - * Array of TokenMetadata objects + * Array of TokenMetadata objects */ public TokenMetadata getTokens() { long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this); @@ -51,31 +44,15 @@ public class CandidateTranscript { } /** - * Size of the tokens array + * Size of the tokens array */ - public void setNum_tokens(int value) { - implJNI.CandidateTranscript_num_tokens_set(swigCPtr, this, value); - } - - /** - * Size of the tokens array - */ - public int getNum_tokens() { + public long getNum_tokens() { return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this); } /** - * Approximated confidence value for this transcript. This is roughly the - * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcript. - */ - public void setConfidence(double value) { - implJNI.CandidateTranscript_confidence_set(swigCPtr, this, value); - } - - /** - * Approximated confidence value for this transcript. This is roughly the - * sum of the acoustic model logit values for each timestep/character that + * Approximated confidence value for this transcript. This is roughly the
+ * sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcript. */ public double getConfidence() { @@ -83,14 +60,14 @@ public class CandidateTranscript { } /** - * Retrieve one TokenMetadata element - * - * @param i Array index of the TokenMetadata to get - * + * Retrieve one TokenMetadata element
+ *
+ * @param i Array index of the TokenMetadata to get
+ *
* @return The TokenMetadata requested or null */ public TokenMetadata getToken(int i) { - return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), true); + return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java new file mode 100644 index 00000000..ed47183e --- /dev/null +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/DeepSpeech_Error_Codes.java @@ -0,0 +1,65 @@ +/* ---------------------------------------------------------------------------- + * This file was automatically generated by SWIG (http://www.swig.org). + * Version 4.0.1 + * + * Do not make changes to this file unless you know what you are doing--modify + * the SWIG interface file instead. + * ----------------------------------------------------------------------------- */ + +package org.mozilla.deepspeech.libdeepspeech; + +public enum DeepSpeech_Error_Codes { + ERR_OK(0x0000), + ERR_NO_MODEL(0x1000), + ERR_INVALID_ALPHABET(0x2000), + ERR_INVALID_SHAPE(0x2001), + ERR_INVALID_SCORER(0x2002), + ERR_MODEL_INCOMPATIBLE(0x2003), + ERR_SCORER_NOT_ENABLED(0x2004), + ERR_FAIL_INIT_MMAP(0x3000), + ERR_FAIL_INIT_SESS(0x3001), + ERR_FAIL_INTERPRETER(0x3002), + ERR_FAIL_RUN_SESS(0x3003), + ERR_FAIL_CREATE_STREAM(0x3004), + ERR_FAIL_READ_PROTOBUF(0x3005), + ERR_FAIL_CREATE_SESS(0x3006), + ERR_FAIL_CREATE_MODEL(0x3007); + + public final int swigValue() { + return swigValue; + } + + public static DeepSpeech_Error_Codes swigToEnum(int swigValue) { + DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants(); + if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue) + return swigValues[swigValue]; + for (DeepSpeech_Error_Codes swigEnum : swigValues) + if (swigEnum.swigValue == swigValue) + return swigEnum; + throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue); + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes() { + this.swigValue = SwigNext.next++; + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes(int swigValue) { + this.swigValue = swigValue; + SwigNext.next = swigValue+1; + } + + @SuppressWarnings("unused") + private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) { + this.swigValue = swigEnum.swigValue; + SwigNext.next = this.swigValue+1; + } + + private final int swigValue; + + private static class SwigNext { + private static int next = 0; + } +} + diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java index bb9b0773..d2831bc4 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/Metadata.java @@ -40,14 +40,7 @@ public class Metadata { } /** - * Array of CandidateTranscript objects - */ - public void setTranscripts(CandidateTranscript value) { - implJNI.Metadata_transcripts_set(swigCPtr, this, CandidateTranscript.getCPtr(value), value); - } - - /** - * Array of CandidateTranscript objects + * Array of 
CandidateTranscript objects */ public CandidateTranscript getTranscripts() { long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this); @@ -55,28 +48,21 @@ public class Metadata { } /** - * Size of the transcripts array + * Size of the transcripts array */ - public void setNum_transcripts(int value) { - implJNI.Metadata_num_transcripts_set(swigCPtr, this, value); - } - - /** - * Size of the transcripts array - */ - public int getNum_transcripts() { + public long getNum_transcripts() { return implJNI.Metadata_num_transcripts_get(swigCPtr, this); } /** - * Retrieve one CandidateTranscript element - * - * @param i Array index of the CandidateTranscript to get - * + * Retrieve one CandidateTranscript element
+ *
+ * @param i Array index of the CandidateTranscript to get
+ *
* @return The CandidateTranscript requested or null */ public CandidateTranscript getTranscript(int i) { - return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), true); + return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false); } } diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst index 1279d717..bd89f9b8 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/README.rst @@ -4,7 +4,7 @@ Javadoc for Sphinx This code is only here for reference for documentation generation. -To update, please build SWIG (4.0 at least) and then run from native_client/java: +To update, please install SWIG (4.0 at least) and then run from native_client/java: .. code-block:: diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java index 32246f1a..d14fc161 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/TokenMetadata.java @@ -35,42 +35,21 @@ public class TokenMetadata { } /** - * The text corresponding to this token - */ - public void setText(String value) { - implJNI.TokenMetadata_text_set(swigCPtr, this, value); - } - - /** - * The text corresponding to this token + * The text corresponding to this token */ public String getText() { return implJNI.TokenMetadata_text_get(swigCPtr, this); } /** - * Position of the token in units of 20ms + * Position of the token in units of 20ms */ - public void setTimestep(int value) { - implJNI.TokenMetadata_timestep_set(swigCPtr, this, value); - } - - /** - * Position of the token in units of 20ms - */ - public int getTimestep() { + public long getTimestep() { return implJNI.TokenMetadata_timestep_get(swigCPtr, this); } /** - * Position of the token in seconds - */ - public void setStart_time(float value) { - implJNI.TokenMetadata_start_time_set(swigCPtr, this, value); - } - - /** - * Position of the token in seconds + * Position of the token in seconds */ public float getStart_time() { return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
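
Taken together, the series makes DS_SpeechToTextWithMetadata() and DS_FinishStreamWithMetadata() return a Metadata struct holding up to aNumResults CandidateTranscript entries, read-only after the const-members patch. The following is a minimal sketch of a C caller consuming the reworked API; the "model.pbmm" path and the way the audio buffer gets filled are assumptions for illustration, not part of the patches.

    #include <stdio.h>
    #include "deepspeech.h"

    /* Print up to three ranked candidate transcripts with per-token timing.
     * Assumes `buffer` holds 16-bit mono PCM at the model's sample rate and
     * that "model.pbmm" is a valid model path (both placeholders here). */
    int
    PrintCandidates(const short* buffer, unsigned int buffer_size)
    {
      ModelState* ctx = NULL;
      if (DS_CreateModel("model.pbmm", &ctx) != 0) {
        return 1;
      }

      /* Ask for up to 3 candidates; the decoder may return fewer. */
      Metadata* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
      if (result == NULL) {
        DS_FreeModel(ctx);
        return 1;
      }

      for (unsigned int i = 0; i < result->num_transcripts; ++i) {
        const CandidateTranscript* transcript = &result->transcripts[i];
        printf("candidate %u (confidence %.3f):\n", i, transcript->confidence);
        for (unsigned int j = 0; j < transcript->num_tokens; ++j) {
          const TokenMetadata* token = &transcript->tokens[j];
          printf("  \"%s\" at %.2fs (timestep %u)\n",
                 token->text, token->start_time, token->timestep);
        }
      }

      /* One call releases the transcript array, each token array and every
       * token's text, matching the allocation layout in modelstate.cc. */
      DS_FreeMetadata(result);
      DS_FreeModel(ctx);
      return 0;
    }

Because the struct members are const, client code treats the returned tree as immutable; the only supported cleanup path is DS_FreeMetadata(), since freeing individual members by hand would not match the malloc/strdup layout used by decode_metadata().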