Merge pull request #2792 from reuben/multiple_transcriptions

Expose multiple transcriptions in "WithMetadata" API

commit 903d0b8fe4
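For context, the updated batch entry point can be exercised like this — a minimal sketch against the C API as changed below; model loading, audio decoding and error handling are elided and the buffer setup is illustrative:

    #include <cstdio>
    #include "deepspeech.h"

    // Sketch: request up to 3 candidate transcripts for one buffer of audio.
    // `ctx`, `buffer` and `buffer_size` are assumed to be set up as usual.
    void print_candidates(ModelState* ctx, const short* buffer, unsigned int buffer_size)
    {
      Metadata* result = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, 3);
      if (!result) return;
      for (unsigned int i = 0; i < result->num_transcripts; ++i) {
        const CandidateTranscript* t = &result->transcripts[i];
        printf("candidate %u (confidence %f): ", i, t->confidence);
        for (unsigned int j = 0; j < t->num_tokens; ++j) {
          printf("%s", t->tokens[j].text);
        }
        printf("\n");
      }
      DS_FreeMetadata(result);  // caller frees the whole tree in one call
    }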
@@ -34,6 +34,9 @@ C
 .. doxygenfunction:: DS_IntermediateDecode
    :project: deepspeech-c
 
+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+   :project: deepspeech-c
+
 .. doxygenfunction:: DS_FinishStream
    :project: deepspeech-c
 
@@ -31,13 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
-   :members: items, num_items, confidence
+   :members: Transcripts
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
-   :members: character, timestep, start_time
+   :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
+   :project: deepspeech-dotnet
+   :members: Text, Timestep, StartTime
@@ -13,11 +13,17 @@ Metadata
 
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time
@@ -30,8 +30,14 @@ Metadata
 .. js:autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
    :members:
@@ -21,8 +21,14 @@ Metadata
 .. autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
+   :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
    :members:
@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
+   :project: deepspeech-c
+   :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
    :project: deepspeech-c
    :members:
@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
    "\n"
    "Running DeepSpeech inference.\n"
    "\n"
-   "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-   "\t--scorer SCORER\t\tPath to the external scorer file\n"
-   "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-   "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-   "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-   "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-   "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-   "\t--extended\t\tOutput string from extended metadata\n"
-   "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-   "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-   "\t--help\t\t\tShow help\n"
-   "\t--version\t\tPrint version and exits\n";
+   "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+   "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+   "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+   "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+   "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+   "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+   "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+   "\t--extended\t\t\tOutput string from extended metadata\n"
+   "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+   "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+   "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+   "\t--help\t\t\t\tShow help\n"
+   "\t--version\t\t\tPrint version and exits\n";
   char* version = DS_Version();
   std::cerr << "DeepSpeech " << version << "\n";
   DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},
+    {"candidate_transcripts", required_argument, nullptr, 150},
     {"stream", required_argument, nullptr, 's'},
     {"version", no_argument, nullptr, 'v'},
     {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
       json_output = true;
       break;
 
+    case 150:
+      json_candidate_transcripts = atoi(optarg);
+      break;
+
     case 's':
       stream_size = atoi(optarg);
       break;
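The new long option reuses getopt_long's convention of returning the option's val field: 150 is simply an id above the short-option ASCII range, which is why it reappears as a case label above. A self-contained sketch of that pattern (the option table and default value here are illustrative):

    #include <cstdio>
    #include <cstdlib>
    #include <getopt.h>

    int main(int argc, char** argv)
    {
      int candidates = 3;  // default, mirroring json_candidate_transcripts above
      struct option opts[] = {
        // val = 150: no short-option letter exists, so pick an id outside ASCII
        {"candidate_transcripts", required_argument, nullptr, 150},
        {nullptr, 0, nullptr, 0},
      };
      int c;
      while ((c = getopt_long(argc, argv, "", opts, nullptr)) != -1) {
        if (c == 150) {
          candidates = atoi(optarg);  // same conversion the client uses
        }
      }
      printf("candidate_transcripts = %d\n", candidates);
      return 0;
    }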
@@ -44,9 +44,115 @@ struct meta_word {
   float duration;
 };
 
-char* metadataToString(Metadata* metadata);
-std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char*
+CandidateTranscriptToString(const CandidateTranscript* transcript)
+{
+  std::string retval = "";
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+    retval += token.text;
+  }
+  return strdup(retval.c_str());
+}
+
+std::vector<meta_word>
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
+{
+  std::vector<meta_word> word_list;
+
+  std::string word = "";
+  float word_start_time = 0;
+
+  // Loop through each token
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+
+    // Append token to word if it's not a space
+    if (strcmp(token.text, u8" ") != 0) {
+      // Log the start time of the new word
+      if (word.length() == 0) {
+        word_start_time = token.start_time;
+      }
+      word.append(token.text);
+    }
+
+    // Word boundary is either a space or the last token in the array
+    if (strcmp(token.text, u8" ") == 0 || i == transcript->num_tokens-1) {
+      float word_duration = token.start_time - word_start_time;
+
+      if (word_duration < 0) {
+        word_duration = 0;
+      }
+
+      meta_word w;
+      w.word = word;
+      w.start_time = word_start_time;
+      w.duration = word_duration;
+
+      word_list.push_back(w);
+
+      // Reset
+      word = "";
+      word_start_time = 0;
+    }
+  }
+
+  return word_list;
+}
+
+std::string
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = CandidateTranscriptToWords(transcript);
+
+  out_string << R"("metadata":{"confidence":)" << transcript->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
+char*
+MetadataToJSON(Metadata* result)
+{
+  std::ostringstream out_string;
+  out_string << "{\n";
+
+  for (int j=0; j < result->num_transcripts; ++j) {
+    const CandidateTranscript *transcript = &result->transcripts[j];
+
+    if (j == 0) {
+      out_string << CandidateTranscriptToJSON(transcript);
+
+      if (result->num_transcripts > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
+      }
+    } else {
+      out_string << "{" << CandidateTranscriptToJSON(transcript) << "}";
+
+      if (j < result->num_transcripts - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
+    }
+  }
+
+  out_string << "\n}\n";
+
+  return strdup(out_string.str().c_str());
+}
+
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
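The word-boundary walk in CandidateTranscriptToWords above can be illustrated in isolation. This sketch uses a simplified stand-in token type rather than the library's TokenMetadata, but applies the same boundary rule (a space token or the final token closes the current word):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Tok { std::string text; float start_time; };  // stand-in for TokenMetadata

    int main()
    {
      // "hi yo" as per-character tokens, 20ms apart
      std::vector<Tok> toks = {{"h",0.00f},{"i",0.02f},{" ",0.04f},{"y",0.06f},{"o",0.08f}};
      std::string word;
      float word_start = 0;
      for (size_t i = 0; i < toks.size(); i++) {
        if (toks[i].text != " ") {
          if (word.empty()) word_start = toks[i].start_time;
          word += toks[i].text;
        }
        // a space or the final token closes the current word
        if (toks[i].text == " " || i == toks.size() - 1) {
          printf("word=%s start=%.2f duration=%.2f\n",
                 word.c_str(), word_start, toks[i].start_time - word_start);
          word.clear();
          word_start = 0;
        }
      }
      return 0;
    }
    // prints: word=hi start=0.00 duration=0.04
    //         word=yo start=0.06 duration=0.02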
@@ -57,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = CandidateTranscriptToString(&result->transcripts[0]);
+    DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
+    res.string = MetadataToJSON(result);
+    DS_FreeMetadata(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -278,87 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   }
 }
 
-char*
-metadataToString(Metadata* metadata)
-{
-  std::string retval = "";
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-    retval += item.character;
-  }
-  return strdup(retval.c_str());
-}
-
-std::vector<meta_word>
-WordsFromMetadata(Metadata* metadata)
-{
-  std::vector<meta_word> word_list;
-
-  std::string word = "";
-  float word_start_time = 0;
-
-  // Loop through each character
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-
-    // Append character to word if it's not a space
-    if (strcmp(item.character, u8" ") != 0) {
-      // Log the start time of the new word
-      if (word.length() == 0) {
-        word_start_time = item.start_time;
-      }
-      word.append(item.character);
-    }
-
-    // Word boundary is either a space or the last character in the array
-    if (strcmp(item.character, " ") == 0
-        || strcmp(item.character, u8" ") == 0
-        || i == metadata->num_items-1) {
-
-      float word_duration = item.start_time - word_start_time;
-
-      if (word_duration < 0) {
-        word_duration = 0;
-      }
-
-      meta_word w;
-      w.word = word;
-      w.start_time = word_start_time;
-      w.duration = word_duration;
-
-      word_list.push_back(w);
-
-      // Reset
-      word = "";
-      word_start_time = 0;
-    }
-  }
-
-  return word_list;
-}
-
-char*
-JSONOutput(Metadata* metadata)
-{
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
-  std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
-
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-    if (i < words.size() - 1) {
-      out_string << ",";
-    }
-  }
-
-  out_string << "]}\n";
-
-  return strdup(out_string.str().c_str());
-}
-
 int
 main(int argc, char **argv)
 {
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,16 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
-  size_t num_returned = std::min(num_prefixes, top_paths);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);
 
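The change above drops the hardcoded top_paths of 1 in favor of the caller-supplied num_results, so std::partial_sort now only orders as many prefixes as will actually be returned. The pattern in isolation (scores here are made up):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<float> scores = {0.1f, 0.9f, 0.4f, 0.7f, 0.2f};
      size_t num_results = 3;
      size_t num_returned = std::min(scores.size(), num_results);
      // Order only the first num_returned elements; the rest stay unspecified.
      std::partial_sort(scores.begin(), scores.begin() + num_returned, scores.end(),
                        [](float a, float b) { return a > b; });  // best first
      for (size_t i = 0; i < num_returned; ++i) {
        printf("beam %zu: %.1f\n", i, scores[i]);  // prints 0.9, 0.7, 0.4
      }
      return 0;
    }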
@@ -60,13 +60,16 @@ public:
                int time_dim,
                int class_dim);
 
-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
+   *
+   * Parameters:
+   *     num_results: Number of beams to return.
    *
    * Return:
    *     A vector where each element is a pair of score and decoding result,
    *     in descending order.
    */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };
 
@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.
 
-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -78,9 +78,10 @@ struct StreamingState {
 
   void feedAudioContent(const short* buffer, unsigned int buffer_size);
   char* intermediateDecode() const;
+  Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Metadata* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const
   return model_->decode(decoder_state_);
 }
 
+Metadata*
+StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const
+{
+  return model_->decode_metadata(decoder_state_, num_results);
+}
+
 char*
 StreamingState::finishStream()
 {
@@ -144,10 +151,10 @@ StreamingState::finishStream()
 }
 
 Metadata*
-StreamingState::finishStreamWithMetadata()
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+  return model_->decode_metadata(decoder_state_, num_results);
 }
 
 void
@@ -402,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx)
   return aSctx->intermediateDecode();
 }
 
+Metadata*
+DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                  unsigned int aNumResults)
+{
+  return aSctx->intermediateDecodeWithMetadata(aNumResults);
+}
+
 char*
 DS_FinishStream(StreamingState* aSctx)
 {
@@ -411,11 +425,12 @@ DS_FinishStream(StreamingState* aSctx)
 }
 
 Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int aNumResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }
 
 StreamingState*
@@ -444,10 +459,11 @@ DS_SpeechToText(ModelState* aCtx,
 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }
 
 void
@@ -460,11 +476,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free((void*)m->transcripts[i].tokens[j].text);
+      }
+
+      free((void*)m->transcripts[i].tokens);
     }
-    delete[] m->items;
-    delete m;
+
+    free((void*)m->transcripts);
+    free(m);
   }
 }
 
@@ -20,32 +20,43 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;
 
 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
  */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  const char* const text;
 
-  /** Position of the character in units of 20ms */
-  int timestep;
+  /** Position of the token in units of 20ms */
+  const unsigned int timestep;
 
-  /** Position of the character in seconds */
-  float start_time;
-} MetadataItem;
+  /** Position of the token in seconds */
+  const float start_time;
+} TokenMetadata;
 
 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
+ */
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  const TokenMetadata* const tokens;
+  /** Size of the tokens array */
+  const unsigned int num_tokens;
+  /** Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  const double confidence;
+} CandidateTranscript;
+
+/**
+ * @brief An array of CandidateTranscript objects computed by the model.
  */
 typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
-  /** Approximated confidence value for this transcription. This is roughly the
-   * sum of the acoustic model logit values for each timestep/character that
-   * contributed to the creation of this transcription.
-   */
-  double confidence;
+  /** Array of CandidateTranscript objects */
+  const CandidateTranscript* const transcripts;
+  /** Size of the transcripts array */
+  const unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
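With the new layout, a Metadata owns an array of CandidateTranscript structs, each of which owns an array of TokenMetadata; the whole tree is released by a single DS_FreeMetadata call. A small read-only traversal sketch (assuming a result obtained from one of the WithMetadata calls):

    #include "deepspeech.h"

    // Sketch: pick the most confident candidate. The decoder already returns
    // transcripts in descending score order, so this should normally be index 0;
    // the loop just makes the invariant explicit.
    const CandidateTranscript* best_transcript(const Metadata* m)
    {
      const CandidateTranscript* best = nullptr;
      for (unsigned int i = 0; i < m->num_transcripts; ++i) {
        if (!best || m->transcripts[i].confidence > best->confidence) {
          best = &m->transcripts[i];
        }
      }
      return best;  // borrowed pointer: freed together with m via DS_FreeMetadata
    }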
@@ -164,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -180,21 +191,25 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- *        about the results.
+ * @brief Use the DeepSpeech model to convert speech to text and output results
+ *        including metadata.
  *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *                sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
  *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
  */
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize);
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -236,8 +251,24 @@ DEEPSPEECH_EXPORT
 char* DS_IntermediateDecode(const StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns the STT result over the whole audio signal.
+ * @brief Compute the intermediate decoding of an ongoing streaming inference,
+ *        return results including metadata.
+ *
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
+ *
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
+/**
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        the result. Signals the end of an ongoing streaming inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
@@ -250,18 +281,23 @@ DEEPSPEECH_EXPORT
 char* DS_FinishStream(StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        results including metadata. Signals the end of an ongoing streaming
+ *        inference.
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
  *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
  *
  * @note This method will free the state pointer (@p aSctx).
  */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);
 
 /**
  * @brief Destroy a streaming state without decoding the computed logits. This
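Putting the new streaming signatures together, a minimal sketch — chunked feeding and status checks are elided, and the result counts are arbitrary:

    #include <cstdio>
    #include "deepspeech.h"

    void stream_with_candidates(ModelState* model, const short* audio, unsigned int len)
    {
      StreamingState* sctx;
      if (DS_CreateStream(model, &sctx) != 0) return;

      DS_FeedAudioContent(sctx, audio, len);  // in practice, fed chunk by chunk

      // Peek at the current best hypotheses without ending the stream.
      Metadata* partial = DS_IntermediateDecodeWithMetadata(sctx, 2);
      DS_FreeMetadata(partial);

      // Finish: also frees sctx, per the @note on DS_FinishStreamWithMetadata.
      Metadata* final_result = DS_FinishStreamWithMetadata(sctx, 5);
      DS_FreeMetadata(final_result);
    }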
@@ -199,13 +199,14 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
         {
-            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
         }
 
         /// <summary>
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
             return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
         }
 
+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+        /// <returns>The STT intermediate result.</returns>
+        public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+        {
+            return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+        }
+
         /// <summary>
         /// Return version of this library. The returned version is a semantic version
         /// (SemVer 2.0.0).
@@ -261,14 +273,15 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
         /// <returns>The extended metadata. Returns NULL on error.</returns>
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
         }
 
         #endregion
@@ -50,11 +50,13 @@
     <Compile Include="Extensions\NativeExtensions.cs" />
     <Compile Include="Models\DeepSpeechStream.cs" />
     <Compile Include="Models\Metadata.cs" />
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
     <Compile Include="NativeImp.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="Structs\Metadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
   </ItemGroup>
   <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
         }
 
         /// <summary>
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>TokenMetadata managed object.</returns>
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed CandidateTranscript object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>CandidateTranscript managed object.</returns>
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed Metadata object.
         /// </summary>
         /// <param name="intPtr">Native pointer.</param>
         /// <returns>Metadata managed object.</returns>
         internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
         {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
+
+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
 
             //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
             {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
             }
 
             NativeImp.DS_FreeMetadata(intPtr);
-            return managedMetaObject;
+            return managedMetadata;
         }
     }
 }
@@ -68,13 +68,15 @@ namespace DeepSpeechClient.Interfaces
                                uint aBufferSize);
 
         /// <summary>
-        /// Use the DeepSpeech model to perform Speech-To-Text.
+        /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
         /// </summary>
         /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
         /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                                                 uint aBufferSize);
+                                                 uint aBufferSize,
+                                                 uint aNumResults);
 
         /// <summary>
         /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
         /// <returns>The STT intermediate result.</returns>
         unsafe string IntermediateDecode(DeepSpeechStream stream);
 
+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+        /// <returns>The extended metadata result.</returns>
+        unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
         /// <summary>
         /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
         /// </summary>
@@ -110,10 +120,11 @@ namespace DeepSpeechClient.Interfaces
         unsafe string FinishStream(DeepSpeechStream stream);
 
         /// <summary>
-        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
         /// </summary>
         /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
     }
 }
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    /// <summary>
+    /// Stores the entire CTC output as an array of character metadata objects.
+    /// </summary>
+    public class CandidateTranscript
+    {
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        public double Confidence { get; set; }
+        /// <summary>
+        /// List of metadata tokens containing text, timestep, and time offset.
+        /// </summary>
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
@@ -6,12 +6,8 @@
     public class Metadata
     {
         /// <summary>
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
         /// </summary>
-        public double Confidence { get; set; }
-        /// <summary>
-        /// List of metada items containing char, timespet, and time offset.
-        /// </summary>
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
     }
 }
@@ -3,12 +3,12 @@
     /// <summary>
     /// Stores each individual character, along with its timing information.
     /// </summary>
-    public class MetadataItem
+    public class TokenMetadata
     {
         /// <summary>
         /// Char of the current timestep.
         /// </summary>
-        public string Character;
+        public string Text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>
@@ -55,7 +55,8 @@ namespace DeepSpeechClient
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
         internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
                                                                          short[] aBuffer,
-                                                                         uint aBufferSize);
+                                                                         uint aBufferSize,
+                                                                         uint aNumResults);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
         internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
|
||||||
|
|
||||||
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
|
internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
|
||||||
|
uint aNumResults);
|
||||||
|
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
||||||
CharSet = CharSet.Ansi, SetLastError = true)]
|
CharSet = CharSet.Ansi, SetLastError = true)]
|
||||||
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
|
||||||
|
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
|
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
|
||||||
|
uint aNumResults);
|
||||||
#endregion
|
#endregion
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+    [StructLayout(LayoutKind.Sequential)]
+    internal unsafe struct CandidateTranscript
+    {
+        /// <summary>
+        /// Native list of tokens.
+        /// </summary>
+        internal unsafe IntPtr tokens;
+        /// <summary>
+        /// Count of tokens from the native side.
+        /// </summary>
+        internal unsafe int num_tokens;
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        internal unsafe double confidence;
+    }
+}
@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
     internal unsafe struct Metadata
     {
         /// <summary>
-        /// Native list of items.
+        /// Native list of candidate transcripts.
        /// </summary>
-        internal unsafe IntPtr items;
+        internal unsafe IntPtr transcripts;
         /// <summary>
-        /// Count of items from the native side.
+        /// Count of transcripts from the native side.
        /// </summary>
-        internal unsafe int num_items;
-        /// <summary>
-        /// Approximated confidence value for this transcription.
-        /// </summary>
-        internal unsafe double confidence;
+        internal unsafe int num_transcripts;
     }
 }
@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
 namespace DeepSpeechClient.Structs
 {
     [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct MetadataItem
+    internal unsafe struct TokenMetadata
     {
         /// <summary>
-        /// Native character.
+        /// Native text.
         /// </summary>
-        internal unsafe IntPtr character;
+        internal unsafe IntPtr text;
         /// <summary>
         /// Position of the character in units of 20ms.
         /// </summary>
@@ -21,14 +21,14 @@ namespace CSharpExamples
         static string GetArgument(IEnumerable<string> args, string option)
             => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
 
-        static string MetadataToString(Metadata meta)
+        static string MetadataToString(CandidateTranscript transcript)
         {
             var nl = Environment.NewLine;
             string retval =
-                Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
-                + $"Confidence: {meta?.Confidence} {nl}"
-                + $"Item count: {meta?.Items?.Length} {nl}"
-                + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+                Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+                + $"Confidence: {transcript?.Confidence} {nl}"
+                + $"Item count: {transcript?.Tokens?.Length} {nl}"
+                + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
             return retval;
         }
 
@@ -75,8 +75,8 @@ namespace CSharpExamples
             if (extended)
             {
                 Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
-                    Convert.ToUInt32(waveBuffer.MaxSize / 2));
-                speechResult = MetadataToString(metaResult);
+                    Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+                speechResult = MetadataToString(metaResult.Transcripts[0]);
             }
             else
             {
@@ -6,6 +6,8 @@
 %}
 
 %include "typemaps.i"
+%include "enums.swg"
+%javaconst(1);
 
 %include "arrays_java.i"
 // apply to DS_FeedAudioContent and DS_SpeechToText
@@ -15,21 +17,29 @@
 %pointer_functions(ModelState*, modelstatep);
 %pointer_functions(StreamingState*, streamingstatep);
 
-%typemap(newfree) char* "DS_FreeString($1);";
-
-%include "carrays.i"
-%array_functions(struct MetadataItem, metadataItem_array);
+%extend struct CandidateTranscript {
+  /**
+   * Retrieve one TokenMetadata element
+   *
+   * @param i Array index of the TokenMetadata to get
+   *
+   * @return The TokenMetadata requested or null
+   */
+  const TokenMetadata& getToken(int i) {
+    return self->tokens[i];
+  }
+}
 
 %extend struct Metadata {
   /**
-   * Retrieve one MetadataItem element
+   * Retrieve one CandidateTranscript element
    *
-   * @param i Array index of the MetadataItem to get
+   * @param i Array index of the CandidateTranscript to get
    *
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  MetadataItem getItem(int i) {
-    return metadataItem_array_getitem(self->items, i);
+  const CandidateTranscript& getTranscript(int i) {
+    return self->transcripts[i];
   }
 
   ~Metadata() {
@@ -37,14 +47,18 @@
   }
 }
 
-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
 
+%typemap(newfree) char* "DS_FreeString($1);";
 %newobject DS_SpeechToText;
 %newobject DS_IntermediateDecode;
 %newobject DS_FinishStream;
+%newobject DS_ErrorCodeToErrorMessage;
 
 %rename ("%(strip:[DS_])s") "";
@@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters;
 import static org.junit.Assert.*;

 import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
-import org.mozilla.deepspeech.libdeepspeech.Metadata;
+import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript;

 import java.io.RandomAccessFile;
 import java.io.FileNotFoundException;

@@ -61,10 +61,10 @@ public class BasicTest {
     m.freeModel();
 }

-private String metadataToString(Metadata m) {
+private String candidateTranscriptToString(CandidateTranscript t) {
     String retval = "";
-    for (int i = 0; i < m.getNum_items(); ++i) {
-        retval += m.getItem(i).getCharacter();
+    for (int i = 0; i < t.getNum_tokens(); ++i) {
+        retval += t.getToken(i).getText();
     }
     return retval;
 }

@@ -97,7 +97,7 @@ public class BasicTest {
     ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

     if (extendedMetadata) {
-        return metadataToString(m.sttWithMetadata(shorts, shorts.length));
+        return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0));
     } else {
         return m.stt(shorts, shorts.length);
     }
@@ -11,8 +11,15 @@ public class DeepSpeechModel {
     }

     // FIXME: We should have something better than those SWIGTYPE_*
-    SWIGTYPE_p_p_ModelState _mspp;
-    SWIGTYPE_p_ModelState _msp;
+    private SWIGTYPE_p_p_ModelState _mspp;
+    private SWIGTYPE_p_ModelState _msp;
+
+    private void evaluateErrorCode(int errorCode) {
+        DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode);
+        if (code != DeepSpeech_Error_Codes.ERR_OK) {
+            throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ").");
+        }
+    }

     /**
      * @brief An object providing an interface to a trained DeepSpeech model.
@@ -20,10 +27,12 @@ public class DeepSpeechModel {
      * @constructor
      *
      * @param modelPath The path to the frozen model graph.
+     *
+     * @throws RuntimeException on failure.
      */
     public DeepSpeechModel(String modelPath) {
         this._mspp = impl.new_modelstatep();
-        impl.CreateModel(modelPath, this._mspp);
+        evaluateErrorCode(impl.CreateModel(modelPath, this._mspp));
         this._msp = impl.modelstatep_value(this._mspp);
     }

@@ -43,10 +52,10 @@ public class DeepSpeechModel {
      * @param aBeamWidth The beam width used by the model. A larger beam width value
      * generates better results at the cost of decoding time.
      *
-     * @return Zero on success, non-zero on failure.
+     * @throws RuntimeException on failure.
      */
-    public int setBeamWidth(long beamWidth) {
-        return impl.SetModelBeamWidth(this._msp, beamWidth);
+    public void setBeamWidth(long beamWidth) {
+        evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth));
     }

     /**
@@ -70,19 +79,19 @@ public class DeepSpeechModel {
      *
      * @param scorer The path to the external scorer file.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void enableExternalScorer(String scorer) {
-        impl.EnableExternalScorer(this._msp, scorer);
+        evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer));
     }

     /**
      * @brief Disable decoding using an external scorer.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void disableExternalScorer() {
-        impl.DisableExternalScorer(this._msp);
+        evaluateErrorCode(impl.DisableExternalScorer(this._msp));
     }

     /**
@@ -91,10 +100,10 @@ public class DeepSpeechModel {
      * @param alpha The alpha hyperparameter of the decoder. Language model weight.
      * @param beta The beta hyperparameter of the decoder. Word insertion weight.
      *
-     * @return Zero on success, non-zero on failure (invalid arguments).
+     * @throws RuntimeException on failure.
      */
     public void setScorerAlphaBeta(float alpha, float beta) {
-        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
+        evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta));
     }

     /*
@@ -117,11 +126,13 @@ public class DeepSpeechModel {
      * @param buffer A 16-bit, mono raw audio signal at the appropriate
      * sample rate (matching what the model was trained on).
      * @param buffer_size The number of samples in the audio signal.
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
      *
-     * @return Outputs a Metadata object of individual letters along with their timing information.
+     * @return Metadata struct containing multiple candidate transcripts. Each transcript
+     * has per-token metadata including timing information.
      */
-    public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
-        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
+    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
+        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
     }

     /**
@@ -130,10 +141,12 @@ public class DeepSpeechModel {
      * and finishStream().
      *
      * @return An opaque object that represents the streaming state.
+     *
+     * @throws RuntimeException on failure.
      */
     public DeepSpeechStreamingState createStream() {
         SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-        impl.CreateStream(this._msp, ssp);
+        evaluateErrorCode(impl.CreateStream(this._msp, ssp));
         return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
     }

@@ -161,8 +174,20 @@ public class DeepSpeechModel {
     }

     /**
-     * @brief Signal the end of an audio signal to an ongoing streaming
-     * inference, returns the STT result over the whole audio signal.
+     * @brief Compute the intermediate decoding of an ongoing streaming inference.
+     *
+     * @param ctx A streaming state pointer returned by createStream().
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+     *
+     * @return The STT intermediate result.
+     */
+    public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results);
+    }
+
+    /**
+     * @brief Compute the final decoding of an ongoing streaming inference and return
+     * the result. Signals the end of an ongoing streaming inference.
      *
      * @param ctx A streaming state pointer returned by createStream().
      *
@@ -175,16 +200,19 @@ public class DeepSpeechModel {
     }

     /**
-     * @brief Signal the end of an audio signal to an ongoing streaming
-     * inference, returns per-letter metadata.
+     * @brief Compute the final decoding of an ongoing streaming inference and return
+     * the results including metadata. Signals the end of an ongoing streaming
+     * inference.
      *
      * @param ctx A streaming state pointer returned by createStream().
+     * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
      *
-     * @return Outputs a Metadata object of individual letters along with their timing information.
+     * @return Metadata struct containing multiple candidate transcripts. Each transcript
+     * has per-token metadata including timing information.
      *
      * @note This method will free the state pointer (@p ctx).
      */
-    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
-        return impl.FinishStreamWithMetadata(ctx.get());
+    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+        return impl.FinishStreamWithMetadata(ctx.get(), num_results);
     }
 }
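The num_results argument added above is a cap, not a promise: the decoder may return fewer candidates, and they arrive ranked most probable first. A hedged Python sketch of walking all candidates, reusing the ds model and audio buffer from the earlier sketch:

.. code-block:: python

    metadata = ds.sttWithMetadata(audio, 5)   # ask for up to 5 candidates
    # metadata.transcripts may hold fewer than 5 entries.
    for rank, transcript in enumerate(metadata.transcripts):
        text = ''.join(t.text for t in transcript.tokens)
        print('#{} (confidence {:.2f}): {}'.format(rank, transcript.confidence, text))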
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * A single transcript computed by the model, including a confidence<br>
+ * value and the metadata for its constituent tokens.
+ */
+public class CandidateTranscript {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(CandidateTranscript obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * Array of TokenMetadata objects
+   */
+  public TokenMetadata getTokens() {
+    long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
+  }
+
+  /**
+   * Size of the tokens array
+   */
+  public long getNum_tokens() {
+    return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
+  }
+
+  /**
+   * Approximated confidence value for this transcript. This is roughly the<br>
+   * sum of the acoustic model logit values for each timestep/character that<br>
+   * contributed to the creation of this transcript.
+   */
+  public double getConfidence() {
+    return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
+  }
+
+  /**
+   * Retrieve one TokenMetadata element<br>
+   * <br>
+   * @param i Array index of the TokenMetadata to get<br>
+   * <br>
+   * @return The TokenMetadata requested or null
+   */
+  public TokenMetadata getToken(int i) {
+    return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false);
+  }
+
+}
@@ -0,0 +1,65 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+public enum DeepSpeech_Error_Codes {
+  ERR_OK(0x0000),
+  ERR_NO_MODEL(0x1000),
+  ERR_INVALID_ALPHABET(0x2000),
+  ERR_INVALID_SHAPE(0x2001),
+  ERR_INVALID_SCORER(0x2002),
+  ERR_MODEL_INCOMPATIBLE(0x2003),
+  ERR_SCORER_NOT_ENABLED(0x2004),
+  ERR_FAIL_INIT_MMAP(0x3000),
+  ERR_FAIL_INIT_SESS(0x3001),
+  ERR_FAIL_INTERPRETER(0x3002),
+  ERR_FAIL_RUN_SESS(0x3003),
+  ERR_FAIL_CREATE_STREAM(0x3004),
+  ERR_FAIL_READ_PROTOBUF(0x3005),
+  ERR_FAIL_CREATE_SESS(0x3006),
+  ERR_FAIL_CREATE_MODEL(0x3007);
+
+  public final int swigValue() {
+    return swigValue;
+  }
+
+  public static DeepSpeech_Error_Codes swigToEnum(int swigValue) {
+    DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants();
+    if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue)
+      return swigValues[swigValue];
+    for (DeepSpeech_Error_Codes swigEnum : swigValues)
+      if (swigEnum.swigValue == swigValue)
+        return swigEnum;
+    throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue);
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes() {
+    this.swigValue = SwigNext.next++;
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes(int swigValue) {
+    this.swigValue = swigValue;
+    SwigNext.next = swigValue+1;
+  }
+
+  @SuppressWarnings("unused")
+  private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) {
+    this.swigValue = swigEnum.swigValue;
+    SwigNext.next = this.swigValue+1;
+  }
+
+  private final int swigValue;
+
+  private static class SwigNext {
+    private static int next = 0;
+  }
+}
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------------
  * This file was automatically generated by SWIG (http://www.swig.org).
- * Version 4.0.2
+ * Version 4.0.1
  *
  * Do not make changes to this file unless you know what you are doing--modify
  * the SWIG interface file instead.
@@ -9,7 +9,7 @@
 package org.mozilla.deepspeech.libdeepspeech;

 /**
- * Stores the entire CTC output as an array of character metadata objects
+ * An array of CandidateTranscript objects computed by the model.
  */
 public class Metadata {
   private transient long swigCPtr;
@@ -40,61 +40,29 @@ public class Metadata {
   }

   /**
-   * List of items
+   * Array of CandidateTranscript objects
    */
-  public void setItems(MetadataItem value) {
-    implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
+  public CandidateTranscript getTranscripts() {
+    long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
   }

   /**
-   * List of items
+   * Size of the transcripts array
    */
-  public MetadataItem getItems() {
-    long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
-    return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
+  public long getNum_transcripts() {
+    return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
   }

   /**
-   * Size of the list of items
-   */
-  public void setNum_items(int value) {
-    implJNI.Metadata_num_items_set(swigCPtr, this, value);
-  }
-
-  /**
-   * Size of the list of items
-   */
-  public int getNum_items() {
-    return implJNI.Metadata_num_items_get(swigCPtr, this);
-  }
-
-  /**
-   * Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
-   */
-  public void setConfidence(double value) {
-    implJNI.Metadata_confidence_set(swigCPtr, this, value);
-  }
-
-  /**
-   * Approximated confidence value for this transcription. This is roughly the<br>
-   * sum of the acoustic model logit values for each timestep/character that<br>
-   * contributed to the creation of this transcription.
-   */
-  public double getConfidence() {
-    return implJNI.Metadata_confidence_get(swigCPtr, this);
-  }
-
-  /**
-   * Retrieve one MetadataItem element<br>
+   * Retrieve one CandidateTranscript element<br>
    * <br>
-   * @param i Array index of the MetadataItem to get<br>
+   * @param i Array index of the CandidateTranscript to get<br>
    * <br>
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  public MetadataItem getItem(int i) {
-    return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
+  public CandidateTranscript getTranscript(int i) {
+    return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false);
   }

 }
@@ -4,7 +4,7 @@ Javadoc for Sphinx

 This code is only here for reference for documentation generation.

-To update, please build SWIG (4.0 at least) and then run from native_client/java:
+To update, please install SWIG (4.0 at least) and then run from native_client/java:

 .. code-block::
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * Stores text of an individual token, along with its timing information
+ */
+public class TokenMetadata {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(TokenMetadata obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * The text corresponding to this token
+   */
+  public String getText() {
+    return implJNI.TokenMetadata_text_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in units of 20ms
+   */
+  public long getTimestep() {
+    return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
+  }
+
+  /**
+   * Position of the token in seconds
+   */
+  public float getStart_time() {
+    return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
+  }
+
+}
@@ -42,12 +42,11 @@ function totalTime(hrtimeValue) {
     return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
 }

-function metadataToString(metadata) {
+function candidateTranscriptToString(transcript) {
     var retval = ""
-    for (var i = 0; i < metadata.num_items; ++i) {
-        retval += metadata.items[i].character;
+    for (var i = 0; i < transcript.tokens.length; ++i) {
+        retval += transcript.tokens[i].text;
     }
-    Ds.FreeMetadata(metadata);
     return retval;
 }

@@ -117,7 +116,9 @@ audioStream.on('finish', () => {
     const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);

     if (args['extended']) {
-        console.log(metadataToString(model.sttWithMetadata(audioBuffer)));
+        let metadata = model.sttWithMetadata(audioBuffer, 1);
+        console.log(candidateTranscriptToString(metadata.transcripts[0]));
+        Ds.FreeMetadata(metadata);
     } else {
         console.log(model.stt(audioBuffer));
     }
@@ -47,8 +47,8 @@ using namespace node;
 %typemap(argout) ModelState **retval {
   $result = SWIGV8_ARRAY_NEW();
   SWIGV8_AppendOutput($result, SWIG_From_int(result));
-  // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
-  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
+  // owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves.
+  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
 }

@@ -68,27 +68,29 @@ using namespace node;
 %nodefaultctor ModelState;
 %nodefaultdtor ModelState;

-%typemap(out) MetadataItem* %{
+%typemap(out) TokenMetadata* %{
   $result = SWIGV8_ARRAY_NEW();
-  for (int i = 0; i < arg1->num_items; ++i) {
-    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
   }
 %}

-%nodefaultdtor Metadata;
-%nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
-
-%extend struct Metadata {
-  ~Metadata() {
-    DS_FreeMetadata($self);
-  }
-}
-
-%extend struct MetadataItem {
-  ~MetadataItem() { }
-}
+%typemap(out) CandidateTranscript* %{
+  $result = SWIGV8_ARRAY_NEW();
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
+  }
+%}
+
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;
+
+%nodefaultctor Metadata;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;

 %rename ("%(strip:[DS_])s") "";
@@ -115,15 +115,16 @@ Model.prototype.stt = function(aBuffer) {
 }

 /**
- * Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
 *
 * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
- * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
-Model.prototype.sttWithMetadata = function(aBuffer) {
-    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
+Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
 }

 /**
@@ -172,7 +173,19 @@ Stream.prototype.intermediateDecode = function() {
 }

 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
+ * Compute the intermediate decoding of an ongoing streaming inference, return results including metadata.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
+ *
+ * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
+ */
+Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    return binding.IntermediateDecode(this._impl, aNumResults);
+}
+
+/**
+ * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
 *
 * @return {string} The STT result.
 *
@@ -185,14 +198,17 @@ Stream.prototype.finishStream = function() {
 }

 /**
- * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
+ * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
+ *
+ * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
 * This method will free the stream, it must not be used after this method is called.
 */
-Stream.prototype.finishStreamWithMetadata = function() {
-    result = binding.FinishStreamWithMetadata(this._impl);
+Stream.prototype.finishStreamWithMetadata = function(aNumResults) {
+    aNumResults = aNumResults || 1;
+    result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
     this._impl = null;
     return result;
 }

@@ -236,70 +252,80 @@ function Version() {
 }


-//// Metadata and MetadataItem are here only for documentation purposes
+//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes

 /**
 * @class
 *
- * Stores each individual character, along with its timing information
+ * Stores text of an individual token, along with its timing information
 */
-function MetadataItem() {}
+function TokenMetadata() {}

 /**
- * The character generated for transcription
+ * The text corresponding to this token
 *
- * @return {string} The character generated
+ * @return {string} The text generated
 */
-MetadataItem.prototype.character = function() {}
+TokenMetadata.prototype.text = function() {}

 /**
- * Position of the character in units of 20ms
+ * Position of the token in units of 20ms
 *
- * @return {int} The position of the character
+ * @return {int} The position of the token
 */
-MetadataItem.prototype.timestep = function() {};
+TokenMetadata.prototype.timestep = function() {};

 /**
- * Position of the character in seconds
+ * Position of the token in seconds
 *
- * @return {float} The position of the character
+ * @return {float} The position of the token
 */
-MetadataItem.prototype.start_time = function() {};
+TokenMetadata.prototype.start_time = function() {};

 /**
 * @class
 *
- * Stores the entire CTC output as an array of character metadata objects
+ * A single transcript computed by the model, including a confidence value and
+ * the metadata for its constituent tokens.
 */
-function Metadata () {}
+function CandidateTranscript () {}

 /**
- * List of items
+ * Array of tokens
 *
- * @return {array} List of :js:func:`MetadataItem`
+ * @return {array} Array of :js:func:`TokenMetadata`
 */
-Metadata.prototype.items = function() {}
+CandidateTranscript.prototype.tokens = function() {}

-/**
- * Size of the list of items
- *
- * @return {int} Number of items
- */
-Metadata.prototype.num_items = function() {}
-
 /**
 * Approximated confidence value for this transcription. This is roughly the
- * sum of the acoustic model logit values for each timestep/character that
+ * sum of the acoustic model logit values for each timestep/token that
 * contributed to the creation of this transcription.
 *
 * @return {float} Confidence value
 */
-Metadata.prototype.confidence = function() {}
+CandidateTranscript.prototype.confidence = function() {}
+
+/**
+ * @class
+ *
+ * An array of CandidateTranscript objects computed by the model.
+ */
+function Metadata () {}
+
+/**
+ * Array of transcripts
+ *
+ * @return {array} Array of :js:func:`CandidateTranscript` objects
+ */
+Metadata.prototype.transcripts = function() {}


 module.exports = {
     Model: Model,
     Metadata: Metadata,
-    MetadataItem: MetadataItem,
+    CandidateTranscript: CandidateTranscript,
+    TokenMetadata: TokenMetadata,
     Version: Version,
     FreeModel: FreeModel,
     FreeStream: FreeStream,
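The streaming additions have the same shape in the Python binding; a sketch of polling intermediate candidates while feeding audio, reusing the ds model and 16-bit audio buffer from the earlier sketch (the one-second chunking is an arbitrary choice for illustration):

.. code-block:: python

    stream = ds.createStream()
    for offset in range(0, len(audio), 16000):      # feed ~1 s chunks at 16 kHz
        stream.feedAudioContent(audio[offset:offset + 16000])
        partial = stream.intermediateDecodeWithMetadata(1)
        print('partial:', ''.join(t.text for t in partial.transcripts[0].tokens))

    final = stream.finishStreamWithMetadata(1)      # frees the underlying stream state
    print('final:', ''.join(t.text for t in final.transcripts[0].tokens))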
@@ -37,27 +37,39 @@ ModelState::decode(const DecoderState& state) const
 }

 Metadata*
-ModelState::decode_metadata(const DecoderState& state)
+ModelState::decode_metadata(const DecoderState& state,
+                            size_t num_results)
 {
-  vector<Output> out = state.decode();
+  vector<Output> out = state.decode(num_results);
+  unsigned int num_returned = out.size();

-  std::unique_ptr<Metadata> metadata(new Metadata());
-  metadata->num_items = out[0].tokens.size();
-  metadata->confidence = out[0].confidence;
+  CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);

-  std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
+  for (int i = 0; i < num_returned; ++i) {
+    TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());

-  // Loop through each character
-  for (int i = 0; i < out[0].tokens.size(); ++i) {
-    items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
-    items[i].timestep = out[0].timesteps[i];
-    items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);
-    if (items[i].start_time < 0) {
-      items[i].start_time = 0;
-    }
+    for (int j = 0; j < out[i].tokens.size(); ++j) {
+      TokenMetadata token {
+        strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text
+        static_cast<unsigned int>(out[i].timesteps[j]), // timestep
+        out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time
+      };
+      memcpy(&tokens[j], &token, sizeof(TokenMetadata));
+    }
+
+    CandidateTranscript transcript {
+      tokens, // tokens
+      static_cast<unsigned int>(out[i].tokens.size()), // num_tokens
+      out[i].confidence, // confidence
+    };
+    memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
   }

-  metadata->items = items.release();
-  return metadata.release();
+  Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
+  Metadata metadata {
+    transcripts, // transcripts
+    num_returned, // num_transcripts
+  };
+  memcpy(ret, &metadata, sizeof(Metadata));
+  return ret;
 }
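Note how each token's start_time is derived above: the CTC timestep index scaled by the feature window step over the sample rate. The same arithmetic in a small sketch (the 16 kHz rate and 320-sample step are common defaults assumed for illustration, not read from this diff):

.. code-block:: python

    SAMPLE_RATE = 16000      # assumed default
    AUDIO_WIN_STEP = 320     # assumed default; 320 / 16000 = 20 ms per timestep

    def token_start_time(timestep):
        # Mirrors: out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_)
        return timestep * (AUDIO_WIN_STEP / SAMPLE_RATE)

    print(token_start_time(50))  # -> 1.0, i.e. one second into the audio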
@@ -66,11 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
+  * @param num_results Maximum number of candidate results to return.
   *
-  * @return Metadata struct containing MetadataItem structs for each character.
-  * The user is responsible for freeing Metadata by calling DS_FreeMetadata().
+  * @return A Metadata struct containing CandidateTranscript structs.
+  * Each represents a candidate transcript, with the first ranked most probable.
+  * The user is responsible for freeing the result by calling DS_FreeMetadata().
   */
-  virtual Metadata* decode_metadata(const DecoderState& state);
+  virtual Metadata* decode_metadata(const DecoderState& state,
+                                    size_t num_results);
 };

 #endif // MODELSTATE_H
@@ -121,17 +121,20 @@ class Model(object):
         """
         return deepspeech.impl.SpeechToText(self._impl, audio_buffer)

-    def sttWithMetadata(self, audio_buffer):
+    def sttWithMetadata(self, audio_buffer, num_results=1):
         """
-        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
+        Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.

         :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type audio_buffer: numpy.int16 array

-        :return: Outputs a struct of individual letters along with their timing information.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`
         """
-        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
+        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)

     def createStream(self):
         """
@@ -187,10 +190,27 @@ class Stream(object):
             raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
         return deepspeech.impl.IntermediateDecode(self._impl)

+    def intermediateDecodeWithMetadata(self, num_results=1):
+        """
+        Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.
+
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
+        :type: :func:`Metadata`
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
+        return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)
+
     def finishStream(self):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns the STT result over the whole audio signal.
+        Compute the final decoding of an ongoing streaming inference and return
+        the result. Signals the end of an ongoing streaming inference. The underlying
+        stream object must not be used after this method is called.

         :return: The STT result.
         :type: str
@@ -203,19 +223,24 @@ class Stream(object):
         self._impl = None
         return result

-    def finishStreamWithMetadata(self):
+    def finishStreamWithMetadata(self, num_results=1):
         """
-        Signal the end of an audio signal to an ongoing streaming inference,
-        returns per-letter metadata.
+        Compute the final decoding of an ongoing streaming inference and return
+        results including metadata. Signals the end of an ongoing streaming
+        inference. The underlying stream object must not be used after this
+        method is called.

-        :return: Outputs a struct of individual letters along with their timing information.
+        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+        :type num_results: int
+
+        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
         :type: :func:`Metadata`

         :throws: RuntimeError if the stream object is not valid
         """
         if not self._impl:
             raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
-        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
+        result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
         self._impl = None
         return result
|
||||||
|
|
||||||
|
|
||||||
# This is only for documentation purpose
|
# This is only for documentation purpose
|
||||||
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
|
# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
|
||||||
class MetadataItem(object):
|
class TokenMetadata(object):
|
||||||
"""
|
"""
|
||||||
Stores each individual character, along with its timing information
|
Stores each individual character, along with its timing information
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def character(self):
|
def text(self):
|
||||||
"""
|
"""
|
||||||
The character generated for transcription
|
The text for this token
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def timestep(self):
|
def timestep(self):
|
||||||
"""
|
"""
|
||||||
Position of the character in units of 20ms
|
Position of the token in units of 20ms
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def start_time(self):
|
def start_time(self):
|
||||||
"""
|
"""
|
||||||
Position of the character in seconds
|
Position of the token in seconds
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class Metadata(object):
|
class CandidateTranscript(object):
|
||||||
"""
|
"""
|
||||||
Stores the entire CTC output as an array of character metadata objects
|
Stores the entire CTC output as an array of character metadata objects
|
||||||
"""
|
"""
|
||||||
def items(self):
|
def tokens(self):
|
||||||
"""
|
"""
|
||||||
List of items
|
List of tokens
|
||||||
|
|
||||||
:return: A list of :func:`MetadataItem` elements
|
:return: A list of :func:`TokenMetadata` elements
|
||||||
:type: list
|
:type: list
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def num_items(self):
|
|
||||||
"""
|
|
||||||
Size of the list of items
|
|
||||||
|
|
||||||
:return: Size of the list of items
|
|
||||||
:type: int
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def confidence(self):
|
def confidence(self):
|
||||||
"""
|
"""
|
||||||
Approximated confidence value for this transcription. This is roughly the
|
Approximated confidence value for this transcription. This is roughly the
|
||||||
|
@ -286,3 +302,12 @@ class Metadata(object):
|
||||||
contributed to the creation of this transcription.
|
contributed to the creation of this transcription.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Metadata(object):
|
||||||
|
def transcripts(self):
|
||||||
|
"""
|
||||||
|
List of candidate transcripts
|
||||||
|
|
||||||
|
:return: A list of :func:`CandidateTranscript` objects
|
||||||
|
:type: list
|
||||||
|
"""
|
||||||
|
|
|
@@ -18,6 +18,7 @@ try:
 except ImportError:
     from pipes import quote

+
 def convert_samplerate(audio_path, desired_sample_rate):
     sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
     try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):


 def metadata_to_string(metadata):
-    return ''.join(item.character for item in metadata.items)
+    return ''.join(token.text for token in metadata.tokens)

-def words_from_metadata(metadata):
+
+def words_from_candidate_transcript(metadata):
     word = ""
     word_list = []
     word_start_time = 0
     # Loop through each character
-    for i in range(0, metadata.num_items):
-        item = metadata.items[i]
+    for i, token in enumerate(metadata.tokens):
         # Append character to word if it's not a space
-        if item.character != " ":
+        if token.text != " ":
             if len(word) == 0:
                 # Log the start time of the new word
-                word_start_time = item.start_time
+                word_start_time = token.start_time

-            word = word + item.character
+            word = word + token.text
         # Word boundary is either a space or the last character in the array
-        if item.character == " " or i == metadata.num_items - 1:
-            word_duration = item.start_time - word_start_time
+        if token.text == " " or i == len(metadata.tokens) - 1:
+            word_duration = token.start_time - word_start_time

             if word_duration < 0:
                 word_duration = 0
@@ -69,9 +70,11 @@ def words_from_candidate_transcript(metadata):

 def metadata_json_output(metadata):
     json_result = dict()
-    json_result["words"] = words_from_metadata(metadata)
-    json_result["confidence"] = metadata.confidence
-    return json.dumps(json_result)
+    json_result["transcripts"] = [{
+        "confidence": transcript.confidence,
+        "words": words_from_candidate_transcript(transcript),
+    } for transcript in metadata.transcripts]
+    return json.dumps(json_result, indent=2)


@@ -141,9 +144,9 @@ def main():
     print('Running inference.', file=sys.stderr)
     inference_start = timer()
     if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio)))
+        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
     elif args.json:
-        print(metadata_json_output(ds.sttWithMetadata(audio)))
+        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
     else:
         print(ds.stt(audio))
     inference_end = timer() - inference_start
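With --json the client now emits one entry per candidate transcript rather than one flat result. A sketch of the expected shape (the per-word field names here are illustrative; the exact keys come from words_from_candidate_transcript above):

.. code-block:: python

    import json

    # Hypothetical output for a 2-candidate request.
    expected = {
        "transcripts": [
            {"confidence": -12.3, "words": [{"word": "hello", "start_time": 0.5, "duration": 0.3}]},
            {"confidence": -14.8, "words": [{"word": "yellow", "start_time": 0.5, "duration": 0.3}]},
        ]
    }
    print(json.dumps(expected, indent=2))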
@@ -38,30 +38,69 @@ import_array();
   %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
 }

-%typemap(out) MetadataItem* %{
-  $result = PyList_New(arg1->num_items);
-  for (int i = 0; i < arg1->num_items; ++i) {
-    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
+%fragment("parent_reference_init", "init") {
+  // Thread-safe initialization - initialize during Python module initialization
+  parent_reference();
+}
+
+%fragment("parent_reference_function", "header", fragment="parent_reference_init") {
+
+static PyObject *parent_reference() {
+  static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
+  return parent_reference_string;
+}
+
+}
+
+%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
+  $result = PyList_New(arg1->num_transcripts);
+  for (int i = 0; i < arg1->num_transcripts; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
+    // Add a reference to Metadata in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
     PyList_SetItem($result, i, o);
   }
 %}

-%extend struct MetadataItem {
+%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
+  $result = PyList_New(arg1->num_tokens);
+  for (int i = 0; i < arg1->num_tokens; ++i) {
+    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
+    // Add a reference to CandidateTranscript in the returned elements to avoid premature
+    // garbage collection
+    PyObject_SetAttr(o, parent_reference(), $self);
+    PyList_SetItem($result, i, o);
+  }
+%}
+
+%extend struct TokenMetadata {
   %pythoncode %{
     def __repr__(self):
-        return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
+        return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
+  %}
+}
+
+%extend struct CandidateTranscript {
+  %pythoncode %{
+    def __repr__(self):
+        tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
+        tokens_repr = '\n'.join('  ' + l for l in tokens_repr.split('\n'))
+        return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
   %}
 }

 %extend struct Metadata {
   %pythoncode %{
     def __repr__(self):
-        items_repr = ', \n'.join('  ' + repr(i) for i in self.items)
-        return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
+        transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
+        transcripts_repr = '\n'.join('  ' + l for l in transcripts_repr.split('\n'))
+        return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
   %}
 }

-%ignore Metadata::num_items;
+%ignore Metadata::num_transcripts;
+%ignore CandidateTranscript::num_tokens;

 %extend struct Metadata {
   ~Metadata() {
@@ -69,10 +108,12 @@ import_array();
   }
 }

-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;

 %typemap(newfree) char* "DS_FreeString($1);";
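The parent_reference typemaps attach each returned element to its owner through a hidden __parent_reference attribute, so Python cannot garbage-collect a Metadata while one of its transcripts or tokens is still reachable. A sketch of the pattern this enables, reusing ds and audio from the earlier sketches:

.. code-block:: python

    metadata = ds.sttWithMetadata(audio, 1)
    best = metadata.transcripts[0]   # element holds a __parent_reference to metadata
    del metadata                     # safe: 'best' keeps the parent Metadata alive
    print(''.join(token.text for token in best.tokens))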