Merge pull request #2792 from reuben/multiple_transcriptions

Expose multiple transcriptions in "WithMetadata" API
Reuben Morais, 2020-03-20 16:58:32 +01:00, committed via GitHub
commit 903d0b8fe4
41 changed files with 1006 additions and 467 deletions

View File

@@ -34,6 +34,9 @@ C
 .. doxygenfunction:: DS_IntermediateDecode
    :project: deepspeech-c

+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+   :project: deepspeech-c
+
 .. doxygenfunction:: DS_FinishStream
    :project: deepspeech-c

View File

@@ -31,13 +31,20 @@ ErrorCodes
 Metadata
 --------

-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
-   :members: items, num_items, confidence
+   :members: Transcripts

-MetadataItem
-------------
+CandidateTranscript
+-------------------

-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
-   :members: character, timestep, start_time
+   :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
+   :project: deepspeech-dotnet
+   :members: Text, Timestep, StartTime

View File

@@ -13,11 +13,17 @@ Metadata
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript

-MetadataItem
-------------
+CandidateTranscript
+-------------------

-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time

View File

@@ -30,8 +30,14 @@ Metadata
 .. js:autoclass:: Metadata
    :members:

-MetadataItem
-------------
+CandidateTranscript
+-------------------

-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
    :members:

View File

@@ -21,8 +21,14 @@ Metadata
 .. autoclass:: Metadata
    :members:

-MetadataItem
-------------
+CandidateTranscript
+-------------------

-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
    :members:

View File

@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:

-MetadataItem
-------------
+CandidateTranscript
+-------------------

-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
+   :project: deepspeech-c
+   :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
    :project: deepspeech-c
    :members:

View File

@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.

-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@@ -34,6 +34,8 @@ bool extended_metadata = false;

 bool json_output = false;

+int json_candidate_transcripts = 3;
+
 int stream_size = 0;

 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
         "\n"
         "Running DeepSpeech inference.\n"
         "\n"
-        "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-        "\t--scorer SCORER\t\tPath to the external scorer file\n"
-        "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-        "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-        "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-        "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-        "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-        "\t--extended\t\tOutput string from extended metadata\n"
-        "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-        "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-        "\t--help\t\t\tShow help\n"
-        "\t--version\t\tPrint version and exits\n";
+        "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+        "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+        "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+        "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+        "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+        "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+        "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+        "\t--extended\t\t\tOutput string from extended metadata\n"
+        "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+        "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+        "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+        "\t--help\t\t\t\tShow help\n"
+        "\t--version\t\t\tPrint version and exits\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;

+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
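With these options in place, the client can request several ranked transcripts in one run. A hypothetical invocation (the model, scorer, and audio file names are placeholders, not files shipped with this change):

    ./deepspeech --model deepspeech.pbmm --scorer kenlm.scorer --audio audio.wav --json --candidate_transcripts 4

Since json_candidate_transcripts defaults to 3, --json alone already emits up to three candidates; the flag only matters when a different count is wanted.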

View File

@@ -44,9 +44,115 @@ struct meta_word {
   float duration;
 };

-char* metadataToString(Metadata* metadata);
-std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char*
+CandidateTranscriptToString(const CandidateTranscript* transcript)
+{
+  std::string retval = "";
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+    retval += token.text;
+  }
+  return strdup(retval.c_str());
+}
+
+std::vector<meta_word>
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
+{
+  std::vector<meta_word> word_list;
+
+  std::string word = "";
+  float word_start_time = 0;
+
+  // Loop through each token
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+
+    // Append token to word if it's not a space
+    if (strcmp(token.text, u8" ") != 0) {
+      // Log the start time of the new word
+      if (word.length() == 0) {
+        word_start_time = token.start_time;
+      }
+      word.append(token.text);
+    }
+
+    // Word boundary is either a space or the last token in the array
+    if (strcmp(token.text, u8" ") == 0 || i == transcript->num_tokens-1) {
+      float word_duration = token.start_time - word_start_time;
+      if (word_duration < 0) {
+        word_duration = 0;
+      }
+
+      meta_word w;
+      w.word = word;
+      w.start_time = word_start_time;
+      w.duration = word_duration;
+      word_list.push_back(w);
+
+      // Reset
+      word = "";
+      word_start_time = 0;
+    }
+  }
+
+  return word_list;
+}
+
+std::string
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = CandidateTranscriptToWords(transcript);
+
+  out_string << R"("metadata":{"confidence":)" << transcript->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
+char*
+MetadataToJSON(Metadata* result)
+{
+  std::ostringstream out_string;
+  out_string << "{\n";
+
+  for (int j=0; j < result->num_transcripts; ++j) {
+    const CandidateTranscript *transcript = &result->transcripts[j];
+
+    if (j == 0) {
+      out_string << CandidateTranscriptToJSON(transcript);
+
+      if (result->num_transcripts > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
+      }
+    } else {
+      out_string << "{" << CandidateTranscriptToJSON(transcript) << "}";
+
+      if (j < result->num_transcripts - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
+    }
+  }
+
+  out_string << "\n}\n";
+
+  return strdup(out_string.str().c_str());
+}
+
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
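For orientation, MetadataToJSON above emits the best transcript at the top level and nests the remaining candidates under an "alternatives" key. A sketch of the resulting shape, with made-up words, confidences, and timings:

    {
    "metadata":{"confidence":-5.1},"words":[{"word":"hello","time":0.32,"duration":0.21}],
    "alternatives":[
    {"metadata":{"confidence":-6.7},"words":[{"word":"yellow","time":0.32,"duration":0.21}]}
    ]
    }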
@@ -57,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();

   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = CandidateTranscriptToString(&result->transcripts[0]);
+    DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
+    res.string = MetadataToJSON(result);
+    DS_FreeMetadata(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -278,87 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   }
 }

-char*
-metadataToString(Metadata* metadata)
-{
-  std::string retval = "";
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-    retval += item.character;
-  }
-  return strdup(retval.c_str());
-}
-
-std::vector<meta_word>
-WordsFromMetadata(Metadata* metadata)
-{
-  std::vector<meta_word> word_list;
-
-  std::string word = "";
-  float word_start_time = 0;
-
-  // Loop through each character
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-
-    // Append character to word if it's not a space
-    if (strcmp(item.character, u8" ") != 0) {
-      // Log the start time of the new word
-      if (word.length() == 0) {
-        word_start_time = item.start_time;
-      }
-      word.append(item.character);
-    }
-
-    // Word boundary is either a space or the last character in the array
-    if (strcmp(item.character, " ") == 0
-        || strcmp(item.character, u8" ") == 0
-        || i == metadata->num_items-1) {
-      float word_duration = item.start_time - word_start_time;
-      if (word_duration < 0) {
-        word_duration = 0;
-      }
-
-      meta_word w;
-      w.word = word;
-      w.start_time = word_start_time;
-      w.duration = word_duration;
-      word_list.push_back(w);
-
-      // Reset
-      word = "";
-      word_start_time = 0;
-    }
-  }
-
-  return word_list;
-}
-
-char*
-JSONOutput(Metadata* metadata)
-{
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
-  std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
-
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-    if (i < words.size() - 1) {
-      out_string << ",";
-    }
-  }
-
-  out_string << "]}\n";
-
-  return strdup(out_string.str().c_str());
-}
-
 int
 main(int argc, char **argv)
 {

View File

@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }

 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,16 +181,12 @@ DecoderState::decode() const
   }

   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));

-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
-  size_t num_returned = std::min(num_prefixes, top_paths);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);

View File

@@ -60,13 +60,16 @@
            int time_dim,
            int class_dim);

-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
+   *
+   * Parameters:
+   *     num_results: Number of beams to return.
    *
    * Return:
    *     A vector where each element is a pair of score and decoding result,
    *     in descending order.
    */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };
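To make the new parameter concrete, here is a minimal sketch of how a caller inside the library might request several beams at once. The variable names and the consuming function are illustrative only, not part of this codebase:

    // Ask for up to five beams instead of the previously hard-coded single path.
    std::vector<Output> beams = decoder_state.decode(5);
    // decode() returns at most num_results entries, best-scoring first.
    for (const Output& beam : beams) {
      consume_beam(beam);  // hypothetical consumer
    }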

View File

@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.

-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -78,9 +78,10 @@ struct StreamingState {

   void feedAudioContent(const short* buffer, unsigned int buffer_size);
   char* intermediateDecode() const;
+  Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Metadata* finishStreamWithMetadata(unsigned int num_results);

   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const
   return model_->decode(decoder_state_);
 }

+Metadata*
+StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const
+{
+  return model_->decode_metadata(decoder_state_, num_results);
+}
+
 char*
 StreamingState::finishStream()
 {
@@ -144,10 +151,10 @@ StreamingState::finishStream()
 }

 Metadata*
-StreamingState::finishStreamWithMetadata()
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+  return model_->decode_metadata(decoder_state_, num_results);
 }

 void
@@ -402,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx)
   return aSctx->intermediateDecode();
 }

+Metadata*
+DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                  unsigned int aNumResults)
+{
+  return aSctx->intermediateDecodeWithMetadata(aNumResults);
+}
+
 char*
 DS_FinishStream(StreamingState* aSctx)
 {
@@ -411,11 +425,12 @@ DS_FinishStream(StreamingState* aSctx)
 }

 Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int aNumResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }

 StreamingState*
@@ -444,10 +459,11 @@ DS_SpeechToText(ModelState* aCtx,
 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }

 void
@@ -460,11 +476,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free((void*)m->transcripts[i].tokens[j].text);
+      }
+      free((void*)m->transcripts[i].tokens);
     }
-    delete[] m->items;
-    delete m;
+    free((void*)m->transcripts);
+    free(m);
   }
 }

View File

@@ -20,32 +20,43 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;

 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
 */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  const char* const text;

-  /** Position of the character in units of 20ms */
-  int timestep;
+  /** Position of the token in units of 20ms */
+  const unsigned int timestep;

-  /** Position of the character in seconds */
-  float start_time;
-} MetadataItem;
+  /** Position of the token in seconds */
+  const float start_time;
+} TokenMetadata;

 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
+ */
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  const TokenMetadata* const tokens;
+  /** Size of the tokens array */
+  const unsigned int num_tokens;
+  /** Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  const double confidence;
+} CandidateTranscript;
+
+/**
+ * @brief An array of CandidateTranscript objects computed by the model.
 */
 typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
-  /** Approximated confidence value for this transcription. This is roughly the
-   * sum of the acoustic model logit values for each timestep/character that
-   * contributed to the creation of this transcription.
-   */
-  double confidence;
+  /** Array of CandidateTranscript objects */
+  const CandidateTranscript* const transcripts;
+  /** Size of the transcripts array */
+  const unsigned int num_transcripts;
 } Metadata;

 enum DeepSpeech_Error_Codes
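Given these definitions, consuming a Metadata result from C or C++ is a pair of nested loops. A minimal sketch, assuming ctx, buffer, and buffer_size were set up as for DS_SpeechToText and that stdio is available:

    Metadata* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
    for (unsigned int i = 0; i < result->num_transcripts; ++i) {
      const CandidateTranscript& t = result->transcripts[i];
      printf("candidate %u (confidence %f): ", i, t.confidence);
      for (unsigned int j = 0; j < t.num_tokens; ++j) {
        printf("%s", t.tokens[j].text);  // tokens concatenate into the transcript text
      }
      printf("\n");
    }
    DS_FreeMetadata(result);  // frees the tokens, the transcripts, and the struct itself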
@@ -164,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);

 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -180,21 +191,25 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);

 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- *        about the results.
+ * @brief Use the DeepSpeech model to convert speech to text and output results
+ *        including metadata.
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize);
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);

 /**
 * @brief Create a new streaming inference state. The streaming state returned
@@ -236,8 +251,24 @@ DEEPSPEECH_EXPORT
 char* DS_IntermediateDecode(const StreamingState* aSctx);

 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns the STT result over the whole audio signal.
+ * @brief Compute the intermediate decoding of an ongoing streaming inference,
+ *        return results including metadata.
+ *
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
+ *
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
+/**
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        the result. Signals the end of an ongoing streaming inference.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 *
@@ -250,18 +281,23 @@ DEEPSPEECH_EXPORT
 char* DS_FinishStream(StreamingState* aSctx);

 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        results including metadata. Signals the end of an ongoing streaming
+ *        inference.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);

 /**
 * @brief Destroy a streaming state without decoding the computed logits. This
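Putting the streaming additions together, a caller can now poll for ranked hypotheses mid-stream and collect several at the end. A minimal sketch with error handling and the chunking loop elided; DS_FeedAudioContent is assumed to take (stream, buffer, sample count) as elsewhere in this API:

    StreamingState* stream;
    if (DS_CreateStream(model, &stream) != 0) { /* handle error */ }

    DS_FeedAudioContent(stream, chunk, chunk_size);  // repeated per audio chunk

    // Peek at the two best hypotheses so far; the stream stays usable.
    Metadata* partial = DS_IntermediateDecodeWithMetadata(stream, 2);
    /* ... inspect partial->transcripts ... */
    DS_FreeMetadata(partial);

    // Final decode; per the note above, this also frees the stream itself.
    Metadata* final_result = DS_FinishStreamWithMetadata(stream, 4);
    /* ... inspect final_result ... */
    DS_FreeMetadata(final_result);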

View File

@@ -199,13 +199,14 @@ namespace DeepSpeechClient
         }

         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata result.</returns>
-        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
         {
-            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
         }

         /// <summary>
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
             return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
         }

+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+        /// <returns>The STT intermediate result.</returns>
+        public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+        {
+            return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+        }
+
         /// <summary>
         /// Return version of this library. The returned version is a semantic version
         /// (SemVer 2.0.0).
@@ -261,14 +273,15 @@ namespace DeepSpeechClient
         }

         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata. Returns NULL on error.</returns>
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
         }
         #endregion

View File

@@ -50,11 +50,13 @@
     <Compile Include="Extensions\NativeExtensions.cs" />
     <Compile Include="Models\DeepSpeechStream.cs" />
     <Compile Include="Models\Metadata.cs" />
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
     <Compile Include="NativeImp.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="Structs\Metadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
   </ItemGroup>
   <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

View File

@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
         }

         /// <summary>
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>TokenMetadata managed object.</returns>
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed CandidateTranscript object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>CandidateTranscript managed object.</returns>
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed Metadata object.
         /// </summary>
         /// <param name="intPtr">Native pointer.</param>
         /// <returns>Metadata managed object.</returns>
         internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
         {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);

+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];

             //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
             {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
             }

             NativeImp.DS_FreeMetadata(intPtr);

-            return managedMetaObject;
+            return managedMetadata;
         }
     }
 }

View File

@@ -68,13 +68,15 @@ namespace DeepSpeechClient.Interfaces
                                      uint aBufferSize);

        /// <summary>
-       /// Use the DeepSpeech model to perform Speech-To-Text.
+       /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                uint aBufferSize);
+                uint aBufferSize,
+                uint aNumResults);

        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
        /// <returns>The STT intermediate result.</returns>
        unsafe string IntermediateDecode(DeepSpeechStream stream);

+       /// <summary>
+       /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+       /// </summary>
+       /// <param name="stream">Instance of the stream to decode.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+       /// <returns>The extended metadata result.</returns>
+       unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
@@ -110,10 +120,11 @@ namespace DeepSpeechClient.Interfaces
        unsafe string FinishStream(DeepSpeechStream stream);

        /// <summary>
-       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
        /// </summary>
        /// <param name="stream">Instance of the stream to finish.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-       unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+       unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
 }

View File

@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    /// <summary>
+    /// Stores the entire CTC output as an array of character metadata objects.
+    /// </summary>
+    public class CandidateTranscript
+    {
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        public double Confidence { get; set; }
+        /// <summary>
+        /// List of metadata tokens containing text, timestep, and time offset.
+        /// </summary>
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}

View File

@@ -6,12 +6,8 @@
     public class Metadata
     {
         /// <summary>
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
         /// </summary>
-        public double Confidence { get; set; }
-        /// <summary>
-        /// List of metada items containing char, timespet, and time offset.
-        /// </summary>
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
     }
 }

View File

@@ -3,12 +3,12 @@
     /// <summary>
     /// Stores each individual character, along with its timing information.
     /// </summary>
-    public class MetadataItem
+    public class TokenMetadata
     {
         /// <summary>
         /// Char of the current timestep.
         /// </summary>
-        public string Character;
+        public string Text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>

View File

@@ -55,7 +55,8 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
                                                                         short[] aBuffer,
-                                                                        uint aBufferSize);
+                                                                        uint aBufferSize,
+                                                                        uint aNumResults);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@@ -82,12 +83,17 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);

+       [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+       internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
+                                                                              uint aNumResults);
+
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
            CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-       internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
+       internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
+                                                                        uint aNumResults);
        #endregion
     }
 }

View File

@@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+    [StructLayout(LayoutKind.Sequential)]
+    internal unsafe struct CandidateTranscript
+    {
+        /// <summary>
+        /// Native list of tokens.
+        /// </summary>
+        internal unsafe IntPtr tokens;
+        /// <summary>
+        /// Count of tokens from the native side.
+        /// </summary>
+        internal unsafe int num_tokens;
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        internal unsafe double confidence;
+    }
+}

View File

@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
     internal unsafe struct Metadata
     {
         /// <summary>
-        /// Native list of items.
+        /// Native list of candidate transcripts.
         /// </summary>
-        internal unsafe IntPtr items;
+        internal unsafe IntPtr transcripts;
         /// <summary>
-        /// Count of items from the native side.
+        /// Count of transcripts from the native side.
         /// </summary>
-        internal unsafe int num_items;
-        /// <summary>
-        /// Approximated confidence value for this transcription.
-        /// </summary>
-        internal unsafe double confidence;
+        internal unsafe int num_transcripts;
     }
 }

View File

@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
 namespace DeepSpeechClient.Structs
 {
     [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct MetadataItem
+    internal unsafe struct TokenMetadata
     {
         /// <summary>
-        /// Native character.
+        /// Native text.
         /// </summary>
-        internal unsafe IntPtr character;
+        internal unsafe IntPtr text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>

View File

@@ -21,14 +21,14 @@ namespace CSharpExamples
         static string GetArgument(IEnumerable<string> args, string option)
             => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();

-        static string MetadataToString(Metadata meta)
+        static string MetadataToString(CandidateTranscript transcript)
         {
             var nl = Environment.NewLine;
             string retval =
-                Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
-                + $"Confidence: {meta?.Confidence} {nl}"
-                + $"Item count: {meta?.Items?.Length} {nl}"
-                + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+                Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+                + $"Confidence: {transcript?.Confidence} {nl}"
+                + $"Item count: {transcript?.Tokens?.Length} {nl}"
+                + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
             return retval;
         }

@@ -75,8 +75,8 @@ namespace CSharpExamples
             if (extended)
             {
                 Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
-                    Convert.ToUInt32(waveBuffer.MaxSize / 2));
-                speechResult = MetadataToString(metaResult);
+                    Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+                speechResult = MetadataToString(metaResult.Transcripts[0]);
             }
             else
             {

View File

@@ -6,6 +6,8 @@
 %}

 %include "typemaps.i"
+%include "enums.swg"
+%javaconst(1);
 %include "arrays_java.i"

 // apply to DS_FeedAudioContent and DS_SpeechToText
@@ -15,21 +17,29 @@
 %pointer_functions(ModelState*, modelstatep);
 %pointer_functions(StreamingState*, streamingstatep);

-%typemap(newfree) char* "DS_FreeString($1);";
-
-%include "carrays.i"
-%array_functions(struct MetadataItem, metadataItem_array);
+%extend struct CandidateTranscript {
+  /**
+   * Retrieve one TokenMetadata element
+   *
+   * @param i Array index of the TokenMetadata to get
+   *
+   * @return The TokenMetadata requested or null
+   */
+  const TokenMetadata& getToken(int i) {
+    return self->tokens[i];
+  }
+}

 %extend struct Metadata {
   /**
-   * Retrieve one MetadataItem element
+   * Retrieve one CandidateTranscript element
    *
-   * @param i Array index of the MetadataItem to get
+   * @param i Array index of the CandidateTranscript to get
    *
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  MetadataItem getItem(int i) {
-    return metadataItem_array_getitem(self->items, i);
+  const CandidateTranscript& getTranscript(int i) {
+    return self->transcripts[i];
   }

   ~Metadata() {
@@ -37,14 +47,18 @@
   }
 }

-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
+
+%typemap(newfree) char* "DS_FreeString($1);";

 %newobject DS_SpeechToText;
 %newobject DS_IntermediateDecode;
 %newobject DS_FinishStream;
+%newobject DS_ErrorCodeToErrorMessage;

 %rename ("%(strip:[DS_])s") "";
View File

@@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters;
 import static org.junit.Assert.*;

 import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
-import org.mozilla.deepspeech.libdeepspeech.Metadata;
+import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript;

 import java.io.RandomAccessFile;
 import java.io.FileNotFoundException;
@@ -61,10 +61,10 @@ public class BasicTest {
         m.freeModel();
     }

-    private String metadataToString(Metadata m) {
+    private String candidateTranscriptToString(CandidateTranscript t) {
         String retval = "";
-        for (int i = 0; i < m.getNum_items(); ++i) {
-            retval += m.getItem(i).getCharacter();
+        for (int i = 0; i < t.getNum_tokens(); ++i) {
+            retval += t.getToken(i).getText();
         }
         return retval;
     }
@@ -97,7 +97,7 @@ public class BasicTest {
         ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

         if (extendedMetadata) {
-            return metadataToString(m.sttWithMetadata(shorts, shorts.length));
+            return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0));
         } else {
             return m.stt(shorts, shorts.length);
         }

View File

@@ -11,8 +11,15 @@ public class DeepSpeechModel {
     }

     // FIXME: We should have something better than those SWIGTYPE_*
-    SWIGTYPE_p_p_ModelState _mspp;
-    SWIGTYPE_p_ModelState   _msp;
+    private SWIGTYPE_p_p_ModelState _mspp;
+    private SWIGTYPE_p_ModelState   _msp;
+
+    private void evaluateErrorCode(int errorCode) {
+        DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode);
+        if (code != DeepSpeech_Error_Codes.ERR_OK) {
+            throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ").");
+        }
+    }

    /**
     * @brief An object providing an interface to a trained DeepSpeech model.
@@ -20,10 +27,12 @@ public class DeepSpeechModel {
     * @constructor
     *
     * @param modelPath The path to the frozen model graph.
+    *
+    * @throws RuntimeException on failure.
     */
     public DeepSpeechModel(String modelPath) {
         this._mspp = impl.new_modelstatep();
-        impl.CreateModel(modelPath, this._mspp);
+        evaluateErrorCode(impl.CreateModel(modelPath, this._mspp));
         this._msp  = impl.modelstatep_value(this._mspp);
     }

@@ -43,10 +52,10 @@ public class DeepSpeechModel {
     * @param aBeamWidth The beam width used by the model. A larger beam width value
     *                   generates better results at the cost of decoding time.
     *
-    * @return Zero on success, non-zero on failure.
+    * @throws RuntimeException on failure.
     */
-    public int setBeamWidth(long beamWidth) {
-        return impl.SetModelBeamWidth(this._msp, beamWidth);
+    public void setBeamWidth(long beamWidth) {
+        evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth));
     }

    /**
@@ -70,19 +79,19 @@ public class DeepSpeechModel {
     *
     * @param scorer The path to the external scorer file.
     *
-    * @return Zero on success, non-zero on failure (invalid arguments).
+    * @throws RuntimeException on failure.
     */
     public void enableExternalScorer(String scorer) {
-        impl.EnableExternalScorer(this._msp, scorer);
+        evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer));
     }

    /**
     * @brief Disable decoding using an external scorer.
     *
-    * @return Zero on success, non-zero on failure (invalid arguments).
+    * @throws RuntimeException on failure.
     */
     public void disableExternalScorer() {
-        impl.DisableExternalScorer(this._msp);
+        evaluateErrorCode(impl.DisableExternalScorer(this._msp));
     }

    /**
@@ -91,10 +100,10 @@ public class DeepSpeechModel {
     * @param alpha The alpha hyperparameter of the decoder. Language model weight.
     * @param beta The beta hyperparameter of the decoder. Word insertion weight.
     *
-    * @return Zero on success, non-zero on failure (invalid arguments).
+    * @throws RuntimeException on failure.
     */
     public void setScorerAlphaBeta(float alpha, float beta) {
-        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
+        evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta));
     }

    /*
@@ -117,11 +126,13 @@ public class DeepSpeechModel {
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
     *               sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
     *
-    * @return Outputs a Metadata object of individual letters along with their timing information.
+    * @return Metadata struct containing multiple candidate transcripts. Each transcript
+    *         has per-token metadata including timing information.
     */
-    public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
-        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
+    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
+        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
     }

    /**
@@ -130,10 +141,12 @@ public class DeepSpeechModel {
     *        and finishStream().
     *
     * @return An opaque object that represents the streaming state.
+    *
+    * @throws RuntimeException on failure.
     */
     public DeepSpeechStreamingState createStream() {
         SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-        impl.CreateStream(this._msp, ssp);
+        evaluateErrorCode(impl.CreateStream(this._msp, ssp));
         return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
     }

@@ -161,8 +174,20 @@ public class DeepSpeechModel {
     }

    /**
-    * @brief Signal the end of an audio signal to an ongoing streaming
-    *        inference, returns the STT result over the whole audio signal.
+    * @brief Compute the intermediate decoding of an ongoing streaming inference.
+    *
+    * @param ctx A streaming state pointer returned by createStream().
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+    *
+    * @return The STT intermediate result.
+    */
+    public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results);
+    }
+
+   /**
+    * @brief Compute the final decoding of an ongoing streaming inference and return
+    *        the result. Signals the end of an ongoing streaming inference.
     *
     * @param ctx A streaming state pointer returned by createStream().
     *
@@ -175,16 +200,19 @@ public class DeepSpeechModel {
     }

    /**
-    * @brief Signal the end of an audio signal to an ongoing streaming
-    *        inference, returns per-letter metadata.
+    * @brief Compute the final decoding of an ongoing streaming inference and return
+    *        the results including metadata. Signals the end of an ongoing streaming
+    *        inference.
     *
     * @param ctx A streaming state pointer returned by createStream().
+    * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
     *
-    * @return Outputs a Metadata object of individual letters along with their timing information.
+    * @return Metadata struct containing multiple candidate transcripts. Each transcript
+    *         has per-token metadata including timing information.
     *
     * @note This method will free the state pointer (@p ctx).
     */
-    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
-        return impl.FinishStreamWithMetadata(ctx.get());
+    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.FinishStreamWithMetadata(ctx.get(), num_results);
     }
 }

View File

@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* A single transcript computed by the model, including a confidence<br>
* value and the metadata for its constituent tokens.
*/
public class CandidateTranscript {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(CandidateTranscript obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* Array of TokenMetadata objects
*/
public TokenMetadata getTokens() {
long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
}
/**
* Size of the tokens array
*/
public long getNum_tokens() {
return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcript. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcript.
*/
public double getConfidence() {
return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
}
/**
* Retrieve one TokenMetadata element<br>
* <br>
* @param i Array index of the TokenMetadata to get<br>
* <br>
* @return The TokenMetadata requested or null
*/
public TokenMetadata getToken(int i) {
return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false);
}
}
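The Python binding exposes the same structure as plain attributes. A minimal sketch of walking one transcript's tokens, assuming `metadata` was returned by one of the WithMetadata calls:

.. code-block:: python

   transcript = metadata.transcripts[0]   # candidates are ranked, best first
   print('confidence:', transcript.confidence)
   for token in transcript.tokens:
       # text, timestep (20ms units) and start_time (seconds) per token
       print('{!r}  step {}  {:.2f}s'.format(token.text, token.timestep, token.start_time))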

View File

@ -0,0 +1,65 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
public enum DeepSpeech_Error_Codes {
ERR_OK(0x0000),
ERR_NO_MODEL(0x1000),
ERR_INVALID_ALPHABET(0x2000),
ERR_INVALID_SHAPE(0x2001),
ERR_INVALID_SCORER(0x2002),
ERR_MODEL_INCOMPATIBLE(0x2003),
ERR_SCORER_NOT_ENABLED(0x2004),
ERR_FAIL_INIT_MMAP(0x3000),
ERR_FAIL_INIT_SESS(0x3001),
ERR_FAIL_INTERPRETER(0x3002),
ERR_FAIL_RUN_SESS(0x3003),
ERR_FAIL_CREATE_STREAM(0x3004),
ERR_FAIL_READ_PROTOBUF(0x3005),
ERR_FAIL_CREATE_SESS(0x3006),
ERR_FAIL_CREATE_MODEL(0x3007);
public final int swigValue() {
return swigValue;
}
public static DeepSpeech_Error_Codes swigToEnum(int swigValue) {
DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants();
if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue)
return swigValues[swigValue];
for (DeepSpeech_Error_Codes swigEnum : swigValues)
if (swigEnum.swigValue == swigValue)
return swigEnum;
throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue);
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes() {
this.swigValue = SwigNext.next++;
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes(int swigValue) {
this.swigValue = swigValue;
SwigNext.next = swigValue+1;
}
@SuppressWarnings("unused")
private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) {
this.swigValue = swigEnum.swigValue;
SwigNext.next = this.swigValue+1;
}
private final int swigValue;
private static class SwigNext {
private static int next = 0;
}
}
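swigToEnum tries a direct index first, which only pays off when enum values are dense and zero-based, and falls back to a linear scan otherwise; the sparse DeepSpeech codes (0x1000, 0x2000, ...) always take the scan. A small Python sketch of the same two-step lookup, using a shortened, hypothetical value table:

.. code-block:: python

   CODES = [('ERR_OK', 0x0000), ('ERR_NO_MODEL', 0x1000), ('ERR_INVALID_ALPHABET', 0x2000)]

   def swig_to_enum(v):
       # Fast path: valid when an enum's value equals its ordinal position.
       if 0 <= v < len(CODES) and CODES[v][1] == v:
           return CODES[v][0]
       # Fallback: linear scan over all constants.
       for name, value in CODES:
           if value == v:
               return name
       raise ValueError('no enum with value {:#x}'.format(v))

   print(swig_to_enum(0x0000))   # fast path -> ERR_OK
   print(swig_to_enum(0x1000))   # scan      -> ERR_NO_MODEL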

View File

@ -1,6 +1,6 @@
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org). * This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2 * Version 4.0.1
* *
* Do not make changes to this file unless you know what you are doing--modify * Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead. * the SWIG interface file instead.
@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech; package org.mozilla.deepspeech.libdeepspeech;
/** /**
* Stores the entire CTC output as an array of character metadata objects * An array of CandidateTranscript objects computed by the model.
*/ */
public class Metadata { public class Metadata {
private transient long swigCPtr; private transient long swigCPtr;
@ -40,61 +40,29 @@ public class Metadata {
} }
/** /**
* List of items * Array of CandidateTranscript objects
*/ */
public void setItems(MetadataItem value) { public CandidateTranscript getTranscripts() {
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value); long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
} }
/** /**
* List of items * Size of the transcripts array
*/ */
public MetadataItem getItems() { public long getNum_transcripts() {
long cPtr = implJNI.Metadata_items_get(swigCPtr, this); return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
} }
/** /**
* Size of the list of items * Retrieve one CandidateTranscript element<br>
*/
public void setNum_items(int value) {
implJNI.Metadata_num_items_set(swigCPtr, this, value);
}
/**
* Size of the list of items
*/
public int getNum_items() {
return implJNI.Metadata_num_items_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public void setConfidence(double value) {
implJNI.Metadata_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public double getConfidence() {
return implJNI.Metadata_confidence_get(swigCPtr, this);
}
/**
* Retrieve one MetadataItem element<br>
* <br> * <br>
* @param i Array index of the MetadataItem to get<br> * @param i Array index of the CandidateTranscript to get<br>
* <br> * <br>
* @return The MetadataItem requested or null * @return The CandidateTranscript requested or null
*/ */
public MetadataItem getItem(int i) { public CandidateTranscript getTranscript(int i) {
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true); return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false);
} }
} }
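The Java accessors mirror the underlying C struct: an array pointer, an explicit count, and an indexed getter. The Python binding materializes the array directly, so comparing candidates is a short loop (a `metadata` result is assumed):

.. code-block:: python

   for rank, transcript in enumerate(metadata.transcripts):
       text = ''.join(token.text for token in transcript.tokens)
       print('#{}  confidence={:.2f}  {}'.format(rank, transcript.confidence, text))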

View File

@ -4,7 +4,7 @@ Javadoc for Sphinx
This code is only here for reference for documentation generation. This code is only here for reference for documentation generation.
To update, please build SWIG (4.0 at least) and then run from native_client/java: To update, please install SWIG (4.0 at least) and then run from native_client/java:
.. code-block:: .. code-block::

View File

@ -0,0 +1,58 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.1
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores text of an individual token, along with its timing information
*/
public class TokenMetadata {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(TokenMetadata obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* The text corresponding to this token
*/
public String getText() {
return implJNI.TokenMetadata_text_get(swigCPtr, this);
}
/**
* Position of the token in units of 20ms
*/
public long getTimestep() {
return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
}
/**
* Position of the token in seconds
*/
public float getStart_time() {
return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
}
}
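getTimestep and getStart_time encode the same position in different units. A sketch of the conversion; the window step and sample rate are assumptions chosen to match the documented 20ms step:

.. code-block:: python

   AUDIO_WIN_STEP = 320   # samples per feature frame step (assumption)
   SAMPLE_RATE = 16000    # Hz (assumption)

   def timestep_to_seconds(timestep):
       # 320 / 16000 = 0.02 s per step, i.e. the documented 20ms units.
       return timestep * (AUDIO_WIN_STEP / SAMPLE_RATE)

   print(timestep_to_seconds(50))   # 1.0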

View File

@ -42,12 +42,11 @@ function totalTime(hrtimeValue) {
return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4); return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
} }
function metadataToString(metadata) { function candidateTranscriptToString(transcript) {
var retval = "" var retval = ""
for (var i = 0; i < metadata.num_items; ++i) { for (var i = 0; i < transcript.tokens.length; ++i) {
retval += metadata.items[i].character; retval += transcript.tokens[i].text;
} }
Ds.FreeMetadata(metadata);
return retval; return retval;
} }
@ -117,7 +116,9 @@ audioStream.on('finish', () => {
const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate); const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);
if (args['extended']) { if (args['extended']) {
console.log(metadataToString(model.sttWithMetadata(audioBuffer))); let metadata = model.sttWithMetadata(audioBuffer, 1);
console.log(candidateTranscriptToString(metadata.transcripts[0]));
Ds.FreeMetadata(metadata);
} else { } else {
console.log(model.stt(audioBuffer)); console.log(model.stt(audioBuffer));
} }
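The client requests a single candidate here; asking for several and printing the alternatives is a one-line change. A hedged sketch in Python, where `model` and `audio` are assumed to be set up as in the surrounding client code:

.. code-block:: python

   metadata = model.sttWithMetadata(audio, 3)   # up to 3 candidates, may return fewer
   for alt in metadata.transcripts:             # already ordered, most probable first
       print('{:9.2f}  {}'.format(alt.confidence, ''.join(t.text for t in alt.tokens)))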

View File

@ -47,8 +47,8 @@ using namespace node;
%typemap(argout) ModelState **retval { %typemap(argout) ModelState **retval {
$result = SWIGV8_ARRAY_NEW(); $result = SWIGV8_ARRAY_NEW();
SWIGV8_AppendOutput($result, SWIG_From_int(result)); SWIGV8_AppendOutput($result, SWIG_From_int(result));
// owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below) // owned by the application. Node.js does not guarantee the finalizer will be called, so applications must call FreeMetadata themselves.
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN)); %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
} }
@ -68,27 +68,29 @@ using namespace node;
%nodefaultctor ModelState; %nodefaultctor ModelState;
%nodefaultdtor ModelState; %nodefaultdtor ModelState;
%typemap(out) MetadataItem* %{ %typemap(out) TokenMetadata* %{
$result = SWIGV8_ARRAY_NEW(); $result = SWIGV8_ARRAY_NEW();
for (int i = 0; i < arg1->num_items; ++i) { for (int i = 0; i < arg1->num_tokens; ++i) {
SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN)); SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
} }
%} %}
%nodefaultdtor Metadata; %typemap(out) CandidateTranscript* %{
%nodefaultctor Metadata; $result = SWIGV8_ARRAY_NEW();
%nodefaultctor MetadataItem; for (int i = 0; i < arg1->num_transcripts; ++i) {
%nodefaultdtor MetadataItem; SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
%extend struct Metadata {
~Metadata() {
DS_FreeMetadata($self);
} }
} %}
%extend struct MetadataItem { %ignore Metadata::num_transcripts;
~MetadataItem() { } %ignore CandidateTranscript::num_tokens;
}
%nodefaultctor Metadata;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;
%rename ("%(strip:[DS_])s") ""; %rename ("%(strip:[DS_])s") "";
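Because the wrappers no longer own the C memory, the robust pattern in any binding is to copy whatever you need into plain data while the Metadata is still alive. A Python sketch of that pattern (the Python binding releases the struct through the wrapper's destructor instead of an explicit FreeMetadata call):

.. code-block:: python

   metadata = model.sttWithMetadata(audio, 1)
   # Copy into plain Python objects so nothing outlives the C allocation.
   plain = [{'confidence': t.confidence,
             'tokens': [(tok.text, tok.start_time) for tok in t.tokens]}
            for t in metadata.transcripts]
   del metadata   # the wrapper, and the C memory behind it, may now be reclaimed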

View File

@ -115,15 +115,16 @@ Model.prototype.stt = function(aBuffer) {
} }
/** /**
* Use the DeepSpeech model to perform Speech-To-Text and output metadata * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
* about the results.
* *
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
* *
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error. * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/ */
Model.prototype.sttWithMetadata = function(aBuffer) { Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
return binding.SpeechToTextWithMetadata(this._impl, aBuffer); aNumResults = aNumResults || 1;
return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
} }
/** /**
@ -172,7 +173,19 @@ Stream.prototype.intermediateDecode = function() {
} }
/** /**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal. * Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
*
* @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
aNumResults = aNumResults || 1;
return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
}
/**
* Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
* *
* @return {string} The STT result. * @return {string} The STT result.
* *
@ -185,14 +198,17 @@ Stream.prototype.finishStream = function() {
} }
/** /**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata. * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
*
* @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
* *
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
* *
* This method will free the stream, it must not be used after this method is called. * This method will free the stream, it must not be used after this method is called.
*/ */
Stream.prototype.finishStreamWithMetadata = function() { Stream.prototype.finishStreamWithMetadata = function(aNumResults) {
result = binding.FinishStreamWithMetadata(this._impl); aNumResults = aNumResults || 1;
result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
this._impl = null; this._impl = null;
return result; return result;
} }
@ -236,70 +252,80 @@ function Version() {
} }
//// Metadata and MetadataItem are here only for documentation purposes //// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes
/** /**
* @class * @class
* *
* Stores each individual character, along with its timing information * Stores text of an individual token, along with its timing information
*/ */
function MetadataItem() {} function TokenMetadata() {}
/** /**
* The character generated for transcription * The text corresponding to this token
* *
* @return {string} The character generated * @return {string} The text generated
*/ */
MetadataItem.prototype.character = function() {} TokenMetadata.prototype.text = function() {}
/** /**
* Position of the character in units of 20ms * Position of the token in units of 20ms
* *
* @return {int} The position of the character * @return {int} The position of the token
*/ */
MetadataItem.prototype.timestep = function() {}; TokenMetadata.prototype.timestep = function() {};
/** /**
* Position of the character in seconds * Position of the token in seconds
* *
* @return {float} The position of the character * @return {float} The position of the token
*/ */
MetadataItem.prototype.start_time = function() {}; TokenMetadata.prototype.start_time = function() {};
/** /**
* @class * @class
* *
* Stores the entire CTC output as an array of character metadata objects * A single transcript computed by the model, including a confidence value and
* the metadata for its constituent tokens.
*/ */
function Metadata () {} function CandidateTranscript () {}
/** /**
* List of items * Array of tokens
* *
* @return {array} List of :js:func:`MetadataItem` * @return {array} Array of :js:func:`TokenMetadata`
*/ */
Metadata.prototype.items = function() {} CandidateTranscript.prototype.tokens = function() {}
/**
* Size of the list of items
*
* @return {int} Number of items
*/
Metadata.prototype.num_items = function() {}
/** /**
* Approximated confidence value for this transcription. This is roughly the * Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that * sum of the acoustic model logit values for each timestep/token that
* contributed to the creation of this transcription. * contributed to the creation of this transcription.
* *
* @return {float} Confidence value * @return {float} Confidence value
*/ */
Metadata.prototype.confidence = function() {} CandidateTranscript.prototype.confidence = function() {}
/**
* @class
*
* An array of CandidateTranscript objects computed by the model.
*/
function Metadata () {}
/**
* Array of transcripts
*
* @return {array} Array of :js:func:`CandidateTranscript` objects
*/
Metadata.prototype.transcripts = function() {}
module.exports = { module.exports = {
Model: Model, Model: Model,
Metadata: Metadata, Metadata: Metadata,
MetadataItem: MetadataItem, CandidateTranscript: CandidateTranscript,
TokenMetadata: TokenMetadata,
Version: Version, Version: Version,
FreeModel: FreeModel, FreeModel: FreeModel,
FreeStream: FreeStream, FreeStream: FreeStream,

View File

@ -37,27 +37,39 @@ ModelState::decode(const DecoderState& state) const
} }
Metadata* Metadata*
ModelState::decode_metadata(const DecoderState& state) ModelState::decode_metadata(const DecoderState& state,
size_t num_results)
{ {
vector<Output> out = state.decode(); vector<Output> out = state.decode(num_results);
unsigned int num_returned = out.size();
std::unique_ptr<Metadata> metadata(new Metadata()); CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);
metadata->num_items = out[0].tokens.size();
metadata->confidence = out[0].confidence;
std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]()); for (int i = 0; i < num_returned; ++i) {
TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());
// Loop through each character for (int j = 0; j < out[i].tokens.size(); ++j) {
for (int i = 0; i < out[0].tokens.size(); ++i) { TokenMetadata token {
items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str()); strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
items[i].timestep = out[0].timesteps[i]; static_cast<unsigned int>(out[i].timesteps[j]), // timestep
items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_); out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
};
if (items[i].start_time < 0) { memcpy(&tokens[j], &token, sizeof(TokenMetadata));
items[i].start_time = 0;
}
} }
metadata->items = items.release(); CandidateTranscript transcript {
return metadata.release(); tokens, // tokens
static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
out[i].confidence, // confidence
};
memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
}
Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
Metadata metadata {
transcripts, // transcripts
num_returned, // num_transcripts
};
memcpy(ret, &metadata, sizeof(Metadata));
return ret;
} }
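decode_metadata flattens the decoder output into plain C arrays: one CandidateTranscript per returned result, each owning a malloc'd TokenMetadata array with strdup'd text. A rough Python model of the resulting layout, for orientation only:

.. code-block:: python

   from dataclasses import dataclass
   from typing import List

   @dataclass
   class TokenMetadata:
       text: str                 # strdup'd per token in the C code
       timestep: int
       start_time: float

   @dataclass
   class CandidateTranscript:
       tokens: List[TokenMetadata]   # one malloc'd array per candidate
       confidence: float

   @dataclass
   class Metadata:
       transcripts: List[CandidateTranscript]   # num_returned entries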

View File

@ -66,11 +66,14 @@ struct ModelState {
* @brief Return character-level metadata including letter timings. * @brief Return per-token metadata for multiple candidate transcripts, including timing information.
* *
* @param state Decoder state to use when decoding. * @param state Decoder state to use when decoding.
* @param num_results Maximum number of candidate results to return.
* *
* @return Metadata struct containing MetadataItem structs for each character. * @return A Metadata struct containing CandidateTranscript structs.
* The user is responsible for freeing Metadata by calling DS_FreeMetadata(). * Each represents a candidate transcript, with the first ranked most probable.
* The user is responsible for freeing the returned Metadata by calling DS_FreeMetadata().
*/ */
virtual Metadata* decode_metadata(const DecoderState& state); virtual Metadata* decode_metadata(const DecoderState& state,
size_t num_results);
}; };
#endif // MODELSTATE_H #endif // MODELSTATE_H

View File

@ -121,17 +121,20 @@ class Model(object):
""" """
return deepspeech.impl.SpeechToText(self._impl, audio_buffer) return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
def sttWithMetadata(self, audio_buffer): def sttWithMetadata(self, audio_buffer, num_results=1):
""" """
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array :type audio_buffer: numpy.int16 array
:return: Outputs a struct of individual letters along with their timing information. :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata` :type: :func:`Metadata`
""" """
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer) return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)
def createStream(self): def createStream(self):
""" """
@ -187,10 +190,27 @@ class Stream(object):
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecode(self._impl) return deepspeech.impl.IntermediateDecode(self._impl)
def intermediateDecodeWithMetadata(self, num_results=1):
"""
Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
:param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
def finishStream(self): def finishStream(self):
""" """
Signal the end of an audio signal to an ongoing streaming inference, Compute the final decoding of an ongoing streaming inference and return
returns the STT result over the whole audio signal. the result. Signals the end of an ongoing streaming inference. The underlying
stream object must not be used after this method is called.
:return: The STT result. :return: The STT result.
:type: str :type: str
@ -203,19 +223,24 @@ class Stream(object):
self._impl = None self._impl = None
return result return result
def finishStreamWithMetadata(self): def finishStreamWithMetadata(self, num_results=1):
""" """
Signal the end of an audio signal to an ongoing streaming inference, Compute the final decoding of an ongoing streaming inference and return
returns per-letter metadata. results including metadata. Signals the end of an ongoing streaming
inference. The underlying stream object must not be used after this
method is called.
:return: Outputs a struct of individual letters along with their timing information. :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
:type num_results: int
:return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
:type: :func:`Metadata` :type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid :throws: RuntimeError if the stream object is not valid
""" """
if not self._impl: if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?") raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStreamWithMetadata(self._impl) result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
self._impl = None self._impl = None
return result return result
@ -233,52 +258,43 @@ class Stream(object):
# This is only for documentation purpose # This is only for documentation purpose
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h # Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
class MetadataItem(object): class TokenMetadata(object):
""" """
Stores each individual character, along with its timing information Stores text of an individual token, along with its timing information
""" """
def character(self): def text(self):
""" """
The character generated for transcription The text for this token
""" """
def timestep(self): def timestep(self):
""" """
Position of the character in units of 20ms Position of the token in units of 20ms
""" """
def start_time(self): def start_time(self):
""" """
Position of the character in seconds Position of the token in seconds
""" """
class Metadata(object): class CandidateTranscript(object):
""" """
Stores the entire CTC output as an array of character metadata objects A single transcript computed by the model, including a confidence value and the metadata for its constituent tokens
""" """
def items(self): def tokens(self):
""" """
List of items List of tokens
:return: A list of :func:`MetadataItem` elements :return: A list of :func:`TokenMetadata` elements
:type: list :type: list
""" """
def num_items(self):
"""
Size of the list of items
:return: Size of the list of items
:type: int
"""
def confidence(self): def confidence(self):
""" """
Approximated confidence value for this transcription. This is roughly the Approximated confidence value for this transcription. This is roughly the
@ -286,3 +302,12 @@ class Metadata(object):
contributed to the creation of this transcription. contributed to the creation of this transcription.
""" """
class Metadata(object):
"""
An array of CandidateTranscript objects computed by the model.
"""
def transcripts(self):
"""
List of candidate transcripts
:return: A list of :func:`CandidateTranscript` objects
:type: list
"""

View File

@ -18,6 +18,7 @@ try:
except ImportError: except ImportError:
from pipes import quote from pipes import quote
def convert_samplerate(audio_path, desired_sample_rate): def convert_samplerate(audio_path, desired_sample_rate):
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate) sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
try: try:
@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):
def metadata_to_string(metadata): def metadata_to_string(metadata):
return ''.join(item.character for item in metadata.items) return ''.join(token.text for token in metadata.tokens)
def words_from_metadata(metadata):
def words_from_candidate_transcript(metadata):
word = "" word = ""
word_list = [] word_list = []
word_start_time = 0 word_start_time = 0
# Loop through each character # Loop through each token
for i in range(0, metadata.num_items): for i, token in enumerate(metadata.tokens):
item = metadata.items[i]
# Append character to word if it's not a space # Append token text to word if it's not a space
if item.character != " ": if token.text != " ":
if len(word) == 0: if len(word) == 0:
# Log the start time of the new word # Log the start time of the new word
word_start_time = item.start_time word_start_time = token.start_time
word = word + item.character word = word + token.text
# Word boundary is either a space or the last character in the array # Word boundary is either a space or the last character in the array
if item.character == " " or i == metadata.num_items - 1: if token.text == " " or i == len(metadata.tokens) - 1:
word_duration = item.start_time - word_start_time word_duration = token.start_time - word_start_time
if word_duration < 0: if word_duration < 0:
word_duration = 0 word_duration = 0
@ -69,9 +70,11 @@ def words_from_metadata(metadata):
def metadata_json_output(metadata): def metadata_json_output(metadata):
json_result = dict() json_result = dict()
json_result["words"] = words_from_metadata(metadata) json_result["transcripts"] = [{
json_result["confidence"] = metadata.confidence "confidence": transcript.confidence,
return json.dumps(json_result) "words": words_from_candidate_transcript(transcript),
} for transcript in metadata.transcripts]
return json.dumps(json_result, indent=2)
@ -141,9 +144,9 @@ def main():
print('Running inference.', file=sys.stderr) print('Running inference.', file=sys.stderr)
inference_start = timer() inference_start = timer()
if args.extended: if args.extended:
print(metadata_to_string(ds.sttWithMetadata(audio))) print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
elif args.json: elif args.json:
print(metadata_json_output(ds.sttWithMetadata(audio))) print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
else: else:
print(ds.stt(audio)) print(ds.stt(audio))
inference_end = timer() - inference_start inference_end = timer() - inference_start
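With --json the client now emits one entry per candidate transcript. The output shape is roughly the following; the confidence values are invented and the word entries, produced by words_from_candidate_transcript, are elided:

.. code-block:: python

   # Illustrative shape of metadata_json_output(ds.sttWithMetadata(audio, 3)):
   {
       "transcripts": [
           {"confidence": -13.7, "words": ["..."]},   # best candidate first
           {"confidence": -15.2, "words": ["..."]}    # up to num_results entries
       ]
   }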

View File

@ -38,30 +38,69 @@ import_array();
%append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN)); %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
} }
%typemap(out) MetadataItem* %{ %fragment("parent_reference_init", "init") {
$result = PyList_New(arg1->num_items); // Thread-safe initialization - initialize during Python module initialization
for (int i = 0; i < arg1->num_items; ++i) { parent_reference();
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0); }
%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
static PyObject *parent_reference() {
static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
return parent_reference_string;
}
}
%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
$result = PyList_New(arg1->num_transcripts);
for (int i = 0; i < arg1->num_transcripts; ++i) {
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
// Add a reference to Metadata in the returned elements to avoid premature
// garbage collection
PyObject_SetAttr(o, parent_reference(), $self);
PyList_SetItem($result, i, o); PyList_SetItem($result, i, o);
} }
%} %}
%extend struct MetadataItem { %typemap(out, fragment="parent_reference_function") TokenMetadata* %{
$result = PyList_New(arg1->num_tokens);
for (int i = 0; i < arg1->num_tokens; ++i) {
PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
// Add a reference to CandidateTranscript in the returned elements to avoid premature
// garbage collection
PyObject_SetAttr(o, parent_reference(), $self);
PyList_SetItem($result, i, o);
}
%}
%extend struct TokenMetadata {
%pythoncode %{ %pythoncode %{
def __repr__(self): def __repr__(self):
return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time) return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
%}
}
%extend struct CandidateTranscript {
%pythoncode %{
def __repr__(self):
tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
tokens_repr = '\n'.join(' ' + l for l in tokens_repr.split('\n'))
return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
%} %}
} }
%extend struct Metadata { %extend struct Metadata {
%pythoncode %{ %pythoncode %{
def __repr__(self): def __repr__(self):
items_repr = ', \n'.join(' ' + repr(i) for i in self.items) transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr) transcripts_repr = '\n'.join(' ' + l for l in transcripts_repr.split('\n'))
return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
%} %}
} }
%ignore Metadata::num_items; %ignore Metadata::num_transcripts;
%ignore CandidateTranscript::num_tokens;
%extend struct Metadata { %extend struct Metadata {
~Metadata() { ~Metadata() {
@ -69,10 +108,12 @@ import_array();
} }
} }
%nodefaultdtor Metadata;
%nodefaultctor Metadata; %nodefaultctor Metadata;
%nodefaultctor MetadataItem; %nodefaultdtor Metadata;
%nodefaultdtor MetadataItem; %nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;
%typemap(newfree) char* "DS_FreeString($1);"; %typemap(newfree) char* "DS_FreeString($1);";
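The __parent_reference attribute lets a transcript or token outlive the expression that produced it: each child wrapper pins the Python object that owns the C allocation. A sketch of the pattern this enables (`model` and `audio` assumed):

.. code-block:: python

   # Safe even though the Metadata wrapper itself is never bound to a name:
   tokens = model.sttWithMetadata(audio, 1).transcripts[0].tokens
   # Each element carries __parent_reference back to the owning Metadata,
   # so the C allocation stays alive for as long as `tokens` does.
   print(''.join(t.text for t in tokens))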