Merge pull request #2792 from reuben/multiple_transcriptions
Expose multiple transcriptions in "WithMetadata" API
commit 903d0b8fe4
@@ -34,6 +34,9 @@ C
 .. doxygenfunction:: DS_IntermediateDecode
    :project: deepspeech-c
 
+.. doxygenfunction:: DS_IntermediateDecodeWithMetadata
+   :project: deepspeech-c
+
 .. doxygenfunction:: DS_FinishStream
    :project: deepspeech-c
 
@@ -31,13 +31,20 @@ ErrorCodes
 Metadata
 --------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::Metadata
+.. doxygenclass:: DeepSpeechClient::Models::Metadata
    :project: deepspeech-dotnet
-   :members: items, num_items, confidence
+   :members: Transcripts
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: DeepSpeechClient::Structs::MetadataItem
+.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript
    :project: deepspeech-dotnet
-   :members: character, timestep, start_time
+   :members: Tokens, Confidence
+
+TokenMetadata
+-------------
+
+.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata
+   :project: deepspeech-dotnet
+   :members: Text, Timestep, StartTime
@@ -13,11 +13,17 @@ Metadata
 
 .. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
    :project: deepspeech-java
-   :members: getItems, getNum_items, getProbability, getItem
+   :members: getTranscripts, getNum_transcripts, getTranscript
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript
    :project: deepspeech-java
-   :members: getCharacter, getTimestep, getStart_time
+   :members: getTokens, getNum_tokens, getConfidence, getToken
+
+TokenMetadata
+-------------
+.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata
+   :project: deepspeech-java
+   :members: getText, getTimestep, getStart_time
@@ -30,8 +30,14 @@ Metadata
 .. js:autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. js:autoclass:: MetadataItem
+.. js:autoclass:: CandidateTranscript
    :members:
+
+TokenMetadata
+-------------
+
+.. js:autoclass:: TokenMetadata
+   :members:
@@ -21,8 +21,14 @@ Metadata
 .. autoclass:: Metadata
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. autoclass:: MetadataItem
+.. autoclass:: CandidateTranscript
    :members:
+
+TokenMetadata
+-------------
+
+.. autoclass:: TokenMetadata
+   :members:
@@ -8,9 +8,16 @@ Metadata
    :project: deepspeech-c
    :members:
 
-MetadataItem
-------------
+CandidateTranscript
+-------------------
 
-.. doxygenstruct:: MetadataItem
+.. doxygenstruct:: CandidateTranscript
    :project: deepspeech-c
    :members:
+
+TokenMetadata
+-------------
+
+.. doxygenstruct:: TokenMetadata
+   :project: deepspeech-c
+   :members:
@@ -790,7 +790,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Structs/
+INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -34,6 +34,8 @@ bool extended_metadata = false;
 
 bool json_output = false;
 
+int json_candidate_transcripts = 3;
+
 int stream_size = 0;
 
 void PrintHelp(const char* bin)
@@ -43,18 +45,19 @@ void PrintHelp(const char* bin)
   "\n"
   "Running DeepSpeech inference.\n"
   "\n"
-  "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
-  "\t--scorer SCORER\t\tPath to the external scorer file\n"
-  "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
-  "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
-  "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
-  "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
-  "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
-  "\t--extended\t\tOutput string from extended metadata\n"
-  "\t--json\t\t\tExtended output, shows word timings as JSON\n"
-  "\t--stream size\t\tRun in stream mode, output intermediate results\n"
-  "\t--help\t\t\tShow help\n"
-  "\t--version\t\tPrint version and exits\n";
+  "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n"
+  "\t--scorer SCORER\t\t\tPath to the external scorer file\n"
+  "\t--audio AUDIO\t\t\tPath to the audio file to run (WAV format)\n"
+  "\t--beam_width BEAM_WIDTH\t\tValue for decoder beam width (int)\n"
+  "\t--lm_alpha LM_ALPHA\t\tValue for language model alpha param (float)\n"
+  "\t--lm_beta LM_BETA\t\tValue for language model beta param (float)\n"
+  "\t-t\t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+  "\t--extended\t\t\tOutput string from extended metadata\n"
+  "\t--json\t\t\t\tExtended output, shows word timings as JSON\n"
+  "\t--candidate_transcripts NUMBER\tNumber of candidate transcripts to include in output\n"
+  "\t--stream size\t\t\tRun in stream mode, output intermediate results\n"
+  "\t--help\t\t\t\tShow help\n"
+  "\t--version\t\t\tPrint version and exits\n";
 char* version = DS_Version();
 std::cerr << "DeepSpeech " << version << "\n";
 DS_FreeString(version);
@@ -74,6 +77,7 @@ bool ProcessArgs(int argc, char** argv)
   {"t", no_argument, nullptr, 't'},
   {"extended", no_argument, nullptr, 'e'},
   {"json", no_argument, nullptr, 'j'},
+  {"candidate_transcripts", required_argument, nullptr, 150},
   {"stream", required_argument, nullptr, 's'},
   {"version", no_argument, nullptr, 'v'},
   {"help", no_argument, nullptr, 'h'},
@@ -128,6 +132,10 @@ bool ProcessArgs(int argc, char** argv)
   json_output = true;
   break;
 
+  case 150:
+    json_candidate_transcripts = atoi(optarg);
+    break;
+
   case 's':
     stream_size = atoi(optarg);
     break;
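A note on the flag wiring above: the new candidate_transcripts long option maps to the getopt_long value 150 (matched by the case 150: branch), presumably chosen outside the printable ASCII range so it cannot collide with the single-character option codes ('t', 'e', 'j', 's') handled in the same switch.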
@@ -44,9 +44,115 @@ struct meta_word {
   float duration;
 };
 
-char* metadataToString(Metadata* metadata);
-std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
-char* JSONOutput(Metadata* metadata);
+char*
+CandidateTranscriptToString(const CandidateTranscript* transcript)
+{
+  std::string retval = "";
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+    retval += token.text;
+  }
+  return strdup(retval.c_str());
+}
+
+std::vector<meta_word>
+CandidateTranscriptToWords(const CandidateTranscript* transcript)
+{
+  std::vector<meta_word> word_list;
+
+  std::string word = "";
+  float word_start_time = 0;
+
+  // Loop through each token
+  for (int i = 0; i < transcript->num_tokens; i++) {
+    const TokenMetadata& token = transcript->tokens[i];
+
+    // Append token to word if it's not a space
+    if (strcmp(token.text, u8" ") != 0) {
+      // Log the start time of the new word
+      if (word.length() == 0) {
+        word_start_time = token.start_time;
+      }
+      word.append(token.text);
+    }
+
+    // Word boundary is either a space or the last token in the array
+    if (strcmp(token.text, u8" ") == 0 || i == transcript->num_tokens-1) {
+      float word_duration = token.start_time - word_start_time;
+
+      if (word_duration < 0) {
+        word_duration = 0;
+      }
+
+      meta_word w;
+      w.word = word;
+      w.start_time = word_start_time;
+      w.duration = word_duration;
+
+      word_list.push_back(w);
+
+      // Reset
+      word = "";
+      word_start_time = 0;
+    }
+  }
+
+  return word_list;
+}
+
+std::string
+CandidateTranscriptToJSON(const CandidateTranscript *transcript)
+{
+  std::ostringstream out_string;
+
+  std::vector<meta_word> words = CandidateTranscriptToWords(transcript);
+
+  out_string << R"("metadata":{"confidence":)" << transcript->confidence << R"(},"words":[)";
+
+  for (int i = 0; i < words.size(); i++) {
+    meta_word w = words[i];
+    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
+
+    if (i < words.size() - 1) {
+      out_string << ",";
+    }
+  }
+
+  out_string << "]";
+
+  return out_string.str();
+}
+
+char*
+MetadataToJSON(Metadata* result)
+{
+  std::ostringstream out_string;
+  out_string << "{\n";
+
+  for (int j=0; j < result->num_transcripts; ++j) {
+    const CandidateTranscript *transcript = &result->transcripts[j];
+
+    if (j == 0) {
+      out_string << CandidateTranscriptToJSON(transcript);
+
+      if (result->num_transcripts > 1) {
+        out_string << ",\n" << R"("alternatives")" << ":[\n";
+      }
+    } else {
+      out_string << "{" << CandidateTranscriptToJSON(transcript) << "}";
+
+      if (j < result->num_transcripts - 1) {
+        out_string << ",\n";
+      } else {
+        out_string << "\n]";
+      }
+    }
+  }
+
+  out_string << "\n}\n";
+
+  return strdup(out_string.str().c_str());
+}
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
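Taken together, CandidateTranscriptToJSON and MetadataToJSON inline the top transcript and push any remaining candidates into an "alternatives" array. A sketch of the resulting shape with hypothetical values, for --candidate_transcripts 3:

{
"metadata":{"confidence":-20.1},"words":[{"word":"hello","time":0.52,"duration":0.24}],
"alternatives":[
{"metadata":{"confidence":-21.6},"words":[{"word":"fellow","time":0.52,"duration":0.26}]},
{"metadata":{"confidence":-22.9},"words":[{"word":"yellow","time":0.52,"duration":0.26}]}
]
}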
@@ -57,13 +163,13 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = metadataToString(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1);
+    res.string = CandidateTranscriptToString(&result->transcripts[0]);
+    DS_FreeMetadata(result);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-    res.string = JSONOutput(metadata);
-    DS_FreeMetadata(metadata);
+    Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts);
+    res.string = MetadataToJSON(result);
+    DS_FreeMetadata(result);
   } else if (stream_size > 0) {
     StreamingState* ctx;
     int status = DS_CreateStream(aCtx, &ctx);
@@ -278,87 +384,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   }
 }
 
-char*
-metadataToString(Metadata* metadata)
-{
-  std::string retval = "";
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-    retval += item.character;
-  }
-  return strdup(retval.c_str());
-}
-
-std::vector<meta_word>
-WordsFromMetadata(Metadata* metadata)
-{
-  std::vector<meta_word> word_list;
-
-  std::string word = "";
-  float word_start_time = 0;
-
-  // Loop through each character
-  for (int i = 0; i < metadata->num_items; i++) {
-    MetadataItem item = metadata->items[i];
-
-    // Append character to word if it's not a space
-    if (strcmp(item.character, u8" ") != 0) {
-      // Log the start time of the new word
-      if (word.length() == 0) {
-        word_start_time = item.start_time;
-      }
-      word.append(item.character);
-    }
-
-    // Word boundary is either a space or the last character in the array
-    if (strcmp(item.character, " ") == 0
-        || strcmp(item.character, u8" ") == 0
-        || i == metadata->num_items-1) {
-
-      float word_duration = item.start_time - word_start_time;
-
-      if (word_duration < 0) {
-        word_duration = 0;
-      }
-
-      meta_word w;
-      w.word = word;
-      w.start_time = word_start_time;
-      w.duration = word_duration;
-
-      word_list.push_back(w);
-
-      // Reset
-      word = "";
-      word_start_time = 0;
-    }
-  }
-
-  return word_list;
-}
-
-char*
-JSONOutput(Metadata* metadata)
-{
-  std::vector<meta_word> words = WordsFromMetadata(metadata);
-
-  std::ostringstream out_string;
-  out_string << R"({"metadata":{"confidence":)" << metadata->confidence << R"(},"words":[)";
-
-  for (int i = 0; i < words.size(); i++) {
-    meta_word w = words[i];
-    out_string << R"({"word":")" << w.word << R"(","time":)" << w.start_time << R"(,"duration":)" << w.duration << "}";
-
-    if (i < words.size() - 1) {
-      out_string << ",";
-    }
-  }
-
-  out_string << "]}\n";
-
-  return strdup(out_string.str().c_str());
-}
-
 int
 main(int argc, char **argv)
 {
@@ -157,7 +157,7 @@ DecoderState::next(const double *probs,
 }
 
 std::vector<Output>
-DecoderState::decode() const
+DecoderState::decode(size_t num_results) const
 {
   std::vector<PathTrie*> prefixes_copy = prefixes_;
   std::unordered_map<const PathTrie*, float> scores;
@@ -181,16 +181,12 @@ DecoderState::decode() const
   }
 
   using namespace std::placeholders;
-  size_t num_prefixes = std::min(prefixes_copy.size(), beam_size_);
+  size_t num_returned = std::min(prefixes_copy.size(), num_results);
   std::partial_sort(prefixes_copy.begin(),
-                    prefixes_copy.begin() + num_prefixes,
+                    prefixes_copy.begin() + num_returned,
                     prefixes_copy.end(),
                     std::bind(prefix_compare_external, _1, _2, scores));
 
-  //TODO: expose this as an API parameter
-  const size_t top_paths = 1;
-  size_t num_returned = std::min(num_prefixes, top_paths);
-
   std::vector<Output> outputs;
   outputs.reserve(num_returned);
 
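The decoder change above replaces the hard-coded top_paths = 1 (previously flagged with a TODO) by a caller-supplied num_results, keeping std::partial_sort so only the prefixes actually returned get fully ordered. A standalone C++ illustration of that pattern (toy scores, not DeepSpeech code):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  // Toy stand-ins for the scores attached to surviving beam prefixes.
  std::vector<float> scores = {0.1f, 0.9f, 0.4f, 0.7f, 0.2f};
  size_t num_results = 3;  // analogous to the new decode() parameter
  size_t num_returned = std::min(scores.size(), num_results);

  // Order only the best num_returned elements; the tail stays unsorted,
  // which is cheaper than sorting every prefix when only a few are wanted.
  std::partial_sort(scores.begin(), scores.begin() + num_returned,
                    scores.end(), std::greater<float>());

  for (size_t i = 0; i < num_returned; ++i) {
    std::printf("candidate %zu: %.1f\n", i, scores[i]);
  }
  return 0;
}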
@@ -60,13 +60,16 @@ public:
              int time_dim,
              int class_dim);
 
-  /* Get transcription from current decoder state
+  /* Get up to num_results transcriptions from current decoder state.
   *
+   * Parameters:
+   *     num_results: Number of beams to return.
+   *
   * Return:
   *     A vector where each element is a pair of score and decoding result,
   *     in descending order.
   */
-  std::vector<Output> decode() const;
+  std::vector<Output> decode(size_t num_results=1) const;
 };
 
 
@@ -60,7 +60,7 @@ using std::vector;
    When batch_buffer is full, we do a single step through the acoustic model
    and accumulate the intermediate decoding state in the DecoderState structure.
 
-   When finishStream() is called, we return the corresponding transcription from
+   When finishStream() is called, we return the corresponding transcript from
    the current decoder state.
 */
 struct StreamingState {
@@ -78,9 +78,10 @@ struct StreamingState {
 
   void feedAudioContent(const short* buffer, unsigned int buffer_size);
   char* intermediateDecode() const;
+  Metadata* intermediateDecodeWithMetadata(unsigned int num_results) const;
   void finalizeStream();
   char* finishStream();
-  Metadata* finishStreamWithMetadata();
+  Metadata* finishStreamWithMetadata(unsigned int num_results);
 
   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
@@ -136,6 +137,12 @@ StreamingState::intermediateDecode() const
   return model_->decode(decoder_state_);
 }
 
+Metadata*
+StreamingState::intermediateDecodeWithMetadata(unsigned int num_results) const
+{
+  return model_->decode_metadata(decoder_state_, num_results);
+}
+
 char*
 StreamingState::finishStream()
 {
@@ -144,10 +151,10 @@ StreamingState::finishStream()
 }
 
 Metadata*
-StreamingState::finishStreamWithMetadata()
+StreamingState::finishStreamWithMetadata(unsigned int num_results)
 {
   finalizeStream();
-  return model_->decode_metadata(decoder_state_);
+  return model_->decode_metadata(decoder_state_, num_results);
 }
 
 void
@@ -402,6 +409,13 @@ DS_IntermediateDecode(const StreamingState* aSctx)
   return aSctx->intermediateDecode();
 }
 
+Metadata*
+DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                  unsigned int aNumResults)
+{
+  return aSctx->intermediateDecodeWithMetadata(aNumResults);
+}
+
 char*
 DS_FinishStream(StreamingState* aSctx)
 {
@@ -411,11 +425,12 @@ DS_FinishStream(StreamingState* aSctx)
 }
 
 Metadata*
-DS_FinishStreamWithMetadata(StreamingState* aSctx)
+DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                            unsigned int aNumResults)
 {
-  Metadata* metadata = aSctx->finishStreamWithMetadata();
+  Metadata* result = aSctx->finishStreamWithMetadata(aNumResults);
   DS_FreeStream(aSctx);
-  return metadata;
+  return result;
 }
 
 StreamingState*
@@ -444,10 +459,11 @@ DS_SpeechToText(ModelState* aCtx,
 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize)
+                            unsigned int aBufferSize,
+                            unsigned int aNumResults)
 {
   StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
-  return DS_FinishStreamWithMetadata(ctx);
+  return DS_FinishStreamWithMetadata(ctx, aNumResults);
 }
 
 void
@@ -460,11 +476,16 @@ void
 DS_FreeMetadata(Metadata* m)
 {
   if (m) {
-    for (int i = 0; i < m->num_items; ++i) {
-      free(m->items[i].character);
+    for (int i = 0; i < m->num_transcripts; ++i) {
+      for (int j = 0; j < m->transcripts[i].num_tokens; ++j) {
+        free((void*)m->transcripts[i].tokens[j].text);
+      }
+
+      free((void*)m->transcripts[i].tokens);
     }
-    delete[] m->items;
-    delete m;
+
+    free((void*)m->transcripts);
+    free(m);
   }
 }
 
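One consequence of this rewrite is an ownership-convention change: the old code released items with delete[]/delete, while the new transcript/token tree is released with free(), which implies the allocating side (the decode_metadata path, not shown in this diff) now hands out malloc/strdup-style buffers to match.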
@@ -20,32 +20,43 @@ typedef struct ModelState ModelState;
 typedef struct StreamingState StreamingState;
 
 /**
- * @brief Stores each individual character, along with its timing information
+ * @brief Stores text of an individual token, along with its timing information
 */
-typedef struct MetadataItem {
-  /** The character generated for transcription */
-  char* character;
+typedef struct TokenMetadata {
+  /** The text corresponding to this token */
+  const char* const text;
 
-  /** Position of the character in units of 20ms */
-  int timestep;
+  /** Position of the token in units of 20ms */
+  const unsigned int timestep;
 
-  /** Position of the character in seconds */
-  float start_time;
-} MetadataItem;
+  /** Position of the token in seconds */
+  const float start_time;
+} TokenMetadata;
 
 /**
- * @brief Stores the entire CTC output as an array of character metadata objects
+ * @brief A single transcript computed by the model, including a confidence
+ *        value and the metadata for its constituent tokens.
 */
+typedef struct CandidateTranscript {
+  /** Array of TokenMetadata objects */
+  const TokenMetadata* const tokens;
+  /** Size of the tokens array */
+  const unsigned int num_tokens;
+  /** Approximated confidence value for this transcript. This is roughly the
+   * sum of the acoustic model logit values for each timestep/character that
+   * contributed to the creation of this transcript.
+   */
+  const double confidence;
+} CandidateTranscript;
+
+/**
+ * @brief An array of CandidateTranscript objects computed by the model.
+ */
 typedef struct Metadata {
-  /** List of items */
-  MetadataItem* items;
-  /** Size of the list of items */
-  int num_items;
-  /** Approximated confidence value for this transcription. This is roughly the
-   * sum of the acoustic model logit values for each timestep/character that
-   * contributed to the creation of this transcription.
-   */
-  double confidence;
+  /** Array of CandidateTranscript objects */
+  const CandidateTranscript* const transcripts;
+  /** Size of the transcripts array */
+  const unsigned int num_transcripts;
 } Metadata;
 
 enum DeepSpeech_Error_Codes
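With the nested structs above, consumers walk Metadata → CandidateTranscript → TokenMetadata. A minimal C++ sketch of that traversal (assumes a Metadata* already obtained from one of the *WithMetadata calls):

#include <cstdio>
#include <deepspeech.h>

// Print every candidate transcript carried by a Metadata result.
void PrintAllTranscripts(const Metadata* result) {
  for (unsigned int i = 0; i < result->num_transcripts; ++i) {
    const CandidateTranscript* t = &result->transcripts[i];
    std::printf("candidate %u (confidence %.3f): ", i, t->confidence);
    for (unsigned int j = 0; j < t->num_tokens; ++j) {
      // Each token carries its text plus timestep and start_time fields.
      std::printf("%s", t->tokens[j].text);
    }
    std::printf("\n");
  }
}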
@@ -164,7 +175,7 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx,
                           float aBeta);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ * @brief Use the DeepSpeech model to convert speech to text.
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
@@ -180,21 +191,25 @@ char* DS_SpeechToText(ModelState* aCtx,
                       unsigned int aBufferSize);
 
 /**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- * about the results.
+ * @brief Use the DeepSpeech model to convert speech to text and output results
+ * including metadata.
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
 *        sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
+ * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple CandidateTranscript structs. Each
+ *         transcript has per-token metadata including timing information. The
+ *         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 */
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize);
+                                      unsigned int aBufferSize,
+                                      unsigned int aNumResults);
 
 /**
 * @brief Create a new streaming inference state. The streaming state returned
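A short C++ sketch of the updated batch entry point (model creation, audio loading, and error handling are assumed elsewhere; asking for four candidates may legitimately return fewer):

#include <deepspeech.h>

// samples/sample_count: 16-bit mono audio at the model's sample rate.
Metadata* TopFourCandidates(ModelState* model,
                            const short* samples,
                            unsigned int sample_count) {
  Metadata* result = DS_SpeechToTextWithMetadata(model, samples,
                                                 sample_count, 4);
  // result->num_transcripts is at most 4; the caller must release it
  // with DS_FreeMetadata() when done.
  return result;
}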
@@ -236,8 +251,24 @@ DEEPSPEECH_EXPORT
 char* DS_IntermediateDecode(const StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns the STT result over the whole audio signal.
+ * @brief Compute the intermediate decoding of an ongoing streaming inference,
+ *        return results including metadata.
+ *
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
+ *
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
+ */
+DEEPSPEECH_EXPORT
+Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
+                                            unsigned int aNumResults);
+
+/**
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        the result. Signals the end of an ongoing streaming inference.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 *
@@ -250,18 +281,23 @@
 char* DS_FinishStream(StreamingState* aSctx);
 
 /**
- * @brief Signal the end of an audio signal to an ongoing streaming
- *        inference, returns per-letter metadata.
+ * @brief Compute the final decoding of an ongoing streaming inference and return
+ *        results including metadata. Signals the end of an ongoing streaming
+ *        inference.
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ * @param aNumResults The number of candidate transcripts to return.
 *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ * @return Metadata struct containing multiple candidate transcripts. Each transcript
+ *         has per-token metadata including timing information. The user is
+ *         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
+ *         Returns NULL on error.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
 DEEPSPEECH_EXPORT
-Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
+Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
+                                      unsigned int aNumResults);
 
 /**
 * @brief Destroy a streaming state without decoding the computed logits. This
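And the streaming counterpart under the same assumptions (stream error checks and audio capture elided):

#include <deepspeech.h>

// Feed one chunk, peek at the running candidates, then finalize.
void StreamTopTwo(ModelState* model, const short* chunk,
                  unsigned int chunk_len) {
  StreamingState* stream = nullptr;
  DS_CreateStream(model, &stream);
  DS_FeedAudioContent(stream, chunk, chunk_len);

  // Non-destructive peek at the current top-2 hypotheses.
  Metadata* partial = DS_IntermediateDecodeWithMetadata(stream, 2);
  DS_FreeMetadata(partial);

  // Final decode; per the @note above, this also frees the stream state.
  Metadata* final_result = DS_FinishStreamWithMetadata(stream, 2);
  DS_FreeMetadata(final_result);
}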
@@ -199,13 +199,14 @@ namespace DeepSpeechClient
        }
 
        /// <summary>
-       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
        /// </summary>
        /// <param name="stream">Instance of the stream to finish.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-       public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+       public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
        {
-           return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+           return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
        }
 
        /// <summary>
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
            return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
        }
 
+       /// <summary>
+       /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+       /// </summary>
+       /// <param name="stream">Instance of the stream to decode.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+       /// <returns>The STT intermediate result.</returns>
+       public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+       {
+           return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+       }
+
        /// <summary>
        /// Return version of this library. The returned version is a semantic version
        /// (SemVer 2.0.0).
@@ -261,14 +273,15 @@ namespace DeepSpeechClient
        }
 
        /// <summary>
-       /// Use the DeepSpeech model to perform Speech-To-Text.
+       /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
-       public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+       public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
        {
-           return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+           return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
        }
 
        #endregion
@@ -50,11 +50,13 @@
     <Compile Include="Extensions\NativeExtensions.cs" />
     <Compile Include="Models\DeepSpeechStream.cs" />
     <Compile Include="Models\Metadata.cs" />
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
     <Compile Include="NativeImp.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="Structs\Metadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
   </ItemGroup>
   <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
        }
 
        /// <summary>
-       /// Converts a pointer into managed metadata object.
+       /// Converts a pointer into managed TokenMetadata object.
        /// </summary>
+       /// <param name="intPtr">Native pointer.</param>
+       /// <returns>TokenMetadata managed object.</returns>
+       private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+       {
+           var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+           var managedToken = new Models.TokenMetadata
+           {
+               Timestep = token.timestep,
+               StartTime = token.start_time,
+               Text = token.text.PtrToString(releasePtr: false)
+           };
+           return managedToken;
+       }
+
+       /// <summary>
+       /// Converts a pointer into managed CandidateTranscript object.
+       /// </summary>
+       /// <param name="intPtr">Native pointer.</param>
+       /// <returns>CandidateTranscript managed object.</returns>
+       private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+       {
+           var managedTranscript = new Models.CandidateTranscript();
+           var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+           managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+           managedTranscript.Confidence = transcript.confidence;
+
+           //we need to manually read each item from the native ptr using its size
+           var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+           for (int i = 0; i < transcript.num_tokens; i++)
+           {
+               managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+               transcript.tokens += sizeOfTokenMetadata;
+           }
+
+           return managedTranscript;
+       }
+
+       /// <summary>
+       /// Converts a pointer into managed Metadata object.
+       /// </summary>
        /// <param name="intPtr">Native pointer.</param>
        /// <returns>Metadata managed object.</returns>
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
        {
-           var managedMetaObject = new Models.Metadata();
-           var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-           managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-           managedMetaObject.Confidence = metaData.confidence;
+           var managedMetadata = new Models.Metadata();
+           var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
+
+           managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
 
            //we need to manually read each item from the native ptr using its size
-           var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-           for (int i = 0; i < metaData.num_items; i++)
+           var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+           for (int i = 0; i < metadata.num_transcripts; i++)
            {
-               var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-               managedMetaObject.Items[i] = new Models.MetadataItem
-               {
-                   Timestep = tempItem.timestep,
-                   StartTime = tempItem.start_time,
-                   Character = tempItem.character.PtrToString(releasePtr: false)
-               };
-               //we keep the offset on each read
-               metaData.items += sizeOfMetaItem;
+               managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+               metadata.transcripts += sizeOfCandidateTranscript;
            }
 
            NativeImp.DS_FreeMetadata(intPtr);
-           return managedMetaObject;
+           return managedMetadata;
        }
    }
 }
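The marshalling pattern above deserves a note: the native arrays are plain C buffers, so each wrapper is read with Marshal.PtrToStructure and the struct-sized stride is added to the IntPtr to reach the next element; releasePtr: false leaves the token text untouched until the single NativeImp.DS_FreeMetadata(intPtr) call at the end releases the entire native tree in one go.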
@@ -68,13 +68,15 @@ namespace DeepSpeechClient.Interfaces
                                uint aBufferSize);
 
        /// <summary>
-       /// Use the DeepSpeech model to perform Speech-To-Text.
+       /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata.
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                                                uint aBufferSize);
+                                                uint aBufferSize,
+                                                uint aNumResults);
 
        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
        /// <returns>The STT intermediate result.</returns>
        unsafe string IntermediateDecode(DeepSpeechStream stream);
 
+       /// <summary>
+       /// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
+       /// </summary>
+       /// <param name="stream">Instance of the stream to decode.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
+       /// <returns>The extended metadata result.</returns>
+       unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
@@ -110,10 +120,11 @@ namespace DeepSpeechClient.Interfaces
        unsafe string FinishStream(DeepSpeechStream stream);
 
        /// <summary>
-       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+       /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata.
        /// </summary>
        /// <param name="stream">Instance of the stream to finish.</param>
+       /// <param name="aNumResults">Maximum number of candidate transcripts to return. Returned list might be smaller than this.</param>
        /// <returns>The extended metadata result.</returns>
-       unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+       unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
    }
 }
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    /// <summary>
+    /// A single transcript computed by the model, with its confidence and tokens.
+    /// </summary>
+    public class CandidateTranscript
+    {
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        public double Confidence { get; set; }
+        /// <summary>
+        /// List of metadata tokens containing text, timestep, and time offset.
+        /// </summary>
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
@@ -6,12 +6,8 @@
    public class Metadata
    {
        /// <summary>
-       /// Approximated confidence value for this transcription.
+       /// List of candidate transcripts.
        /// </summary>
-       public double Confidence { get; set; }
-       /// <summary>
-       /// List of metada items containing char, timespet, and time offset.
-       /// </summary>
-       public MetadataItem[] Items { get; set; }
+       public CandidateTranscript[] Transcripts { get; set; }
    }
 }
@@ -3,12 +3,12 @@
    /// <summary>
    /// Stores each individual character, along with its timing information.
    /// </summary>
-   public class MetadataItem
+   public class TokenMetadata
    {
        /// <summary>
        /// Char of the current timestep.
        /// </summary>
-       public string Character;
+       public string Text;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
@@ -55,7 +55,8 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
                                                                         short[] aBuffer,
-                                                                        uint aBufferSize);
+                                                                        uint aBufferSize,
+                                                                        uint aNumResults);
 
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@@ -82,12 +83,17 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
 
+       [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+       internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
+                                                                              uint aNumResults);
+
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
            CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
 
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-       internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
+       internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
+                                                                        uint aNumResults);
        #endregion
    }
 }
@@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+    [StructLayout(LayoutKind.Sequential)]
+    internal unsafe struct CandidateTranscript
+    {
+        /// <summary>
+        /// Native list of tokens.
+        /// </summary>
+        internal unsafe IntPtr tokens;
+        /// <summary>
+        /// Count of tokens from the native side.
+        /// </summary>
+        internal unsafe int num_tokens;
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        internal unsafe double confidence;
+    }
+}
@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
    internal unsafe struct Metadata
    {
        /// <summary>
-       /// Native list of items.
+       /// Native list of candidate transcripts.
        /// </summary>
-       internal unsafe IntPtr items;
+       internal unsafe IntPtr transcripts;
        /// <summary>
-       /// Count of items from the native side.
+       /// Count of transcripts from the native side.
        /// </summary>
-       internal unsafe int num_items;
-       /// <summary>
-       /// Approximated confidence value for this transcription.
-       /// </summary>
-       internal unsafe double confidence;
+       internal unsafe int num_transcripts;
    }
 }
@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
 namespace DeepSpeechClient.Structs
 {
    [StructLayout(LayoutKind.Sequential)]
-   internal unsafe struct MetadataItem
+   internal unsafe struct TokenMetadata
    {
        /// <summary>
-       /// Native character.
+       /// Native text.
        /// </summary>
-       internal unsafe IntPtr character;
+       internal unsafe IntPtr text;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
@@ -21,14 +21,14 @@ namespace CSharpExamples
        static string GetArgument(IEnumerable<string> args, string option)
            => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
 
-       static string MetadataToString(Metadata meta)
+       static string MetadataToString(CandidateTranscript transcript)
        {
            var nl = Environment.NewLine;
            string retval =
-               Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
-               + $"Confidence: {meta?.Confidence} {nl}"
-               + $"Item count: {meta?.Items?.Length} {nl}"
-               + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+               Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+               + $"Confidence: {transcript?.Confidence} {nl}"
+               + $"Item count: {transcript?.Tokens?.Length} {nl}"
+               + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
            return retval;
        }
 
@@ -75,8 +75,8 @@ namespace CSharpExamples
            if (extended)
            {
                Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
-                   Convert.ToUInt32(waveBuffer.MaxSize / 2));
-               speechResult = MetadataToString(metaResult);
+                   Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+               speechResult = MetadataToString(metaResult.Transcripts[0]);
            }
            else
            {
@@ -6,6 +6,8 @@
 %}
 
+%include "typemaps.i"
+%include "enums.swg"
 %javaconst(1);
 
 %include "arrays_java.i"
 // apply to DS_FeedAudioContent and DS_SpeechToText
@@ -15,21 +17,29 @@
 %pointer_functions(ModelState*, modelstatep);
 %pointer_functions(StreamingState*, streamingstatep);
 
-%typemap(newfree) char* "DS_FreeString($1);";
-
-%include "carrays.i"
-%array_functions(struct MetadataItem, metadataItem_array);
+%extend struct CandidateTranscript {
+  /**
+   * Retrieve one TokenMetadata element
+   *
+   * @param i Array index of the TokenMetadata to get
+   *
+   * @return The TokenMetadata requested or null
+   */
+  const TokenMetadata& getToken(int i) {
+    return self->tokens[i];
+  }
+}
 
 %extend struct Metadata {
   /**
-   * Retrieve one MetadataItem element
+   * Retrieve one CandidateTranscript element
    *
-   * @param i Array index of the MetadataItem to get
+   * @param i Array index of the CandidateTranscript to get
    *
-   * @return The MetadataItem requested or null
+   * @return The CandidateTranscript requested or null
    */
-  MetadataItem getItem(int i) {
-    return metadataItem_array_getitem(self->items, i);
+  const CandidateTranscript& getTranscript(int i) {
+    return self->transcripts[i];
   }
 
   ~Metadata() {
@@ -37,14 +47,18 @@
   }
 }
 
-%nodefaultdtor Metadata;
 %nodefaultctor Metadata;
-%nodefaultctor MetadataItem;
-%nodefaultdtor MetadataItem;
+%nodefaultdtor Metadata;
+%nodefaultctor CandidateTranscript;
+%nodefaultdtor CandidateTranscript;
+%nodefaultctor TokenMetadata;
+%nodefaultdtor TokenMetadata;
+
+%typemap(newfree) char* "DS_FreeString($1);";
 %newobject DS_SpeechToText;
 %newobject DS_IntermediateDecode;
 %newobject DS_FinishStream;
 %newobject DS_ErrorCodeToErrorMessage;
 
 %rename ("%(strip:[DS_])s") "";
@@ -12,7 +12,7 @@ import org.junit.runners.MethodSorters;
 import static org.junit.Assert.*;
 
 import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
-import org.mozilla.deepspeech.libdeepspeech.Metadata;
+import org.mozilla.deepspeech.libdeepspeech.CandidateTranscript;
 
 import java.io.RandomAccessFile;
 import java.io.FileNotFoundException;
@@ -61,10 +61,10 @@ public class BasicTest {
        m.freeModel();
    }
 
-   private String metadataToString(Metadata m) {
+   private String candidateTranscriptToString(CandidateTranscript t) {
        String retval = "";
-       for (int i = 0; i < m.getNum_items(); ++i) {
-           retval += m.getItem(i).getCharacter();
+       for (int i = 0; i < t.getNum_tokens(); ++i) {
+           retval += t.getToken(i).getText();
        }
        return retval;
    }
@@ -97,7 +97,7 @@ public class BasicTest {
        ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
 
        if (extendedMetadata) {
-           return metadataToString(m.sttWithMetadata(shorts, shorts.length));
+           return candidateTranscriptToString(m.sttWithMetadata(shorts, shorts.length, 1).getTranscript(0));
        } else {
            return m.stt(shorts, shorts.length);
        }
@@ -11,8 +11,15 @@ public class DeepSpeechModel {
    }
 
    // FIXME: We should have something better than those SWIGTYPE_*
-   SWIGTYPE_p_p_ModelState _mspp;
-   SWIGTYPE_p_ModelState _msp;
+   private SWIGTYPE_p_p_ModelState _mspp;
+   private SWIGTYPE_p_ModelState _msp;
+
+   private void evaluateErrorCode(int errorCode) {
+       DeepSpeech_Error_Codes code = DeepSpeech_Error_Codes.swigToEnum(errorCode);
+       if (code != DeepSpeech_Error_Codes.ERR_OK) {
+           throw new RuntimeException("Error: " + impl.ErrorCodeToErrorMessage(errorCode) + " (0x" + Integer.toHexString(errorCode) + ").");
+       }
+   }
 
    /**
    * @brief An object providing an interface to a trained DeepSpeech model.
@@ -20,10 +27,12 @@ public class DeepSpeechModel {
    * @constructor
    *
    * @param modelPath The path to the frozen model graph.
+   *
+   * @throws RuntimeException on failure.
    */
    public DeepSpeechModel(String modelPath) {
        this._mspp = impl.new_modelstatep();
-       impl.CreateModel(modelPath, this._mspp);
+       evaluateErrorCode(impl.CreateModel(modelPath, this._mspp));
        this._msp = impl.modelstatep_value(this._mspp);
    }
 
@@ -43,10 +52,10 @@ public class DeepSpeechModel {
    * @param aBeamWidth The beam width used by the model. A larger beam width value
    *                   generates better results at the cost of decoding time.
    *
-   * @return Zero on success, non-zero on failure.
+   * @throws RuntimeException on failure.
    */
-   public int setBeamWidth(long beamWidth) {
-       return impl.SetModelBeamWidth(this._msp, beamWidth);
+   public void setBeamWidth(long beamWidth) {
+       evaluateErrorCode(impl.SetModelBeamWidth(this._msp, beamWidth));
    }
 
    /**
@@ -70,19 +79,19 @@ public class DeepSpeechModel {
    *
    * @param scorer The path to the external scorer file.
    *
-   * @return Zero on success, non-zero on failure (invalid arguments).
+   * @throws RuntimeException on failure.
    */
    public void enableExternalScorer(String scorer) {
-       impl.EnableExternalScorer(this._msp, scorer);
+       evaluateErrorCode(impl.EnableExternalScorer(this._msp, scorer));
    }
 
    /**
    * @brief Disable decoding using an external scorer.
    *
-   * @return Zero on success, non-zero on failure (invalid arguments).
+   * @throws RuntimeException on failure.
    */
    public void disableExternalScorer() {
-       impl.DisableExternalScorer(this._msp);
+       evaluateErrorCode(impl.DisableExternalScorer(this._msp));
    }
 
    /**
@@ -91,10 +100,10 @@ public class DeepSpeechModel {
    * @param alpha The alpha hyperparameter of the decoder. Language model weight.
    * @param beta The beta hyperparameter of the decoder. Word insertion weight.
    *
-   * @return Zero on success, non-zero on failure (invalid arguments).
+   * @throws RuntimeException on failure.
    */
    public void setScorerAlphaBeta(float alpha, float beta) {
-       impl.SetScorerAlphaBeta(this._msp, alpha, beta);
+       evaluateErrorCode(impl.SetScorerAlphaBeta(this._msp, alpha, beta));
    }
 
    /*
@@ -117,11 +126,13 @@ public class DeepSpeechModel {
    * @param buffer A 16-bit, mono raw audio signal at the appropriate
    *               sample rate (matching what the model was trained on).
    * @param buffer_size The number of samples in the audio signal.
+   * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
    *
-   * @return Outputs a Metadata object of individual letters along with their timing information.
+   * @return Metadata struct containing multiple candidate transcripts. Each transcript
+   *         has per-token metadata including timing information.
    */
-   public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
-       return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
+   public Metadata sttWithMetadata(short[] buffer, int buffer_size, int num_results) {
+       return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, num_results);
    }
 
    /**
@@ -130,10 +141,12 @@ public class DeepSpeechModel {
    *        and finishStream().
    *
    * @return An opaque object that represents the streaming state.
+   *
+   * @throws RuntimeException on failure.
    */
    public DeepSpeechStreamingState createStream() {
        SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-       impl.CreateStream(this._msp, ssp);
+       evaluateErrorCode(impl.CreateStream(this._msp, ssp));
        return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
    }
 
@@ -161,8 +174,20 @@ public class DeepSpeechModel {
    }
 
    /**
-   * @brief Signal the end of an audio signal to an ongoing streaming
-   *        inference, returns the STT result over the whole audio signal.
+   * @brief Compute the intermediate decoding of an ongoing streaming inference.
+   *
+   * @param ctx A streaming state pointer returned by createStream().
+   * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
+   *
+   * @return The STT intermediate result.
+   */
+   public Metadata intermediateDecodeWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+       return impl.IntermediateDecodeWithMetadata(ctx.get(), num_results);
+   }
+
+   /**
+   * @brief Compute the final decoding of an ongoing streaming inference and return
+   *        the result. Signals the end of an ongoing streaming inference.
    *
    * @param ctx A streaming state pointer returned by createStream().
    *
@@ -175,16 +200,19 @@ public class DeepSpeechModel {
    }
 
    /**
-   * @brief Signal the end of an audio signal to an ongoing streaming
-   *        inference, returns per-letter metadata.
+   * @brief Compute the final decoding of an ongoing streaming inference and return
+   *        the results including metadata. Signals the end of an ongoing streaming
+   *        inference.
    *
    * @param ctx A streaming state pointer returned by createStream().
+   * @param num_results Maximum number of candidate transcripts to return. Returned list might be smaller than this.
    *
-   * @return Outputs a Metadata object of individual letters along with their timing information.
+   * @return Metadata struct containing multiple candidate transcripts. Each transcript
+   *         has per-token metadata including timing information.
    *
    * @note This method will free the state pointer (@p ctx).
    */
-   public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
-       return impl.FinishStreamWithMetadata(ctx.get());
+   public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx, int num_results) {
+       return impl.FinishStreamWithMetadata(ctx.get(), num_results);
    }
 }
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 4.0.1
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+package org.mozilla.deepspeech.libdeepspeech;
+
+/**
+ * A single transcript computed by the model, including a confidence<br>
+ * value and the metadata for its constituent tokens.
+ */
+public class CandidateTranscript {
+  private transient long swigCPtr;
+  protected transient boolean swigCMemOwn;
+
+  protected CandidateTranscript(long cPtr, boolean cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = cPtr;
+  }
+
+  protected static long getCPtr(CandidateTranscript obj) {
+    return (obj == null) ? 0 : obj.swigCPtr;
+  }
+
+  public synchronized void delete() {
+    if (swigCPtr != 0) {
+      if (swigCMemOwn) {
+        swigCMemOwn = false;
+        throw new UnsupportedOperationException("C++ destructor does not have public access");
+      }
+      swigCPtr = 0;
+    }
+  }
+
+  /**
+   * Array of TokenMetadata objects
+   */
+  public TokenMetadata getTokens() {
+    long cPtr = implJNI.CandidateTranscript_tokens_get(swigCPtr, this);
+    return (cPtr == 0) ? null : new TokenMetadata(cPtr, false);
+  }
+
+  /**
+   * Size of the tokens array
+   */
+  public long getNum_tokens() {
+    return implJNI.CandidateTranscript_num_tokens_get(swigCPtr, this);
+  }
+
+  /**
+   * Approximated confidence value for this transcript. This is roughly the<br>
+   * sum of the acoustic model logit values for each timestep/character that<br>
+   * contributed to the creation of this transcript.
+   */
+  public double getConfidence() {
+    return implJNI.CandidateTranscript_confidence_get(swigCPtr, this);
+  }
+
+  /**
+   * Retrieve one TokenMetadata element<br>
+   * <br>
+   * @param i Array index of the TokenMetadata to get<br>
+   * <br>
+   * @return The TokenMetadata requested or null
+   */
+  public TokenMetadata getToken(int i) {
+    return new TokenMetadata(implJNI.CandidateTranscript_getToken(swigCPtr, this, i), false);
+  }
+
+}
@@ -0,0 +1,65 @@
/* ----------------------------------------------------------------------------
 * This file was automatically generated by SWIG (http://www.swig.org).
 * Version 4.0.1
 *
 * Do not make changes to this file unless you know what you are doing--modify
 * the SWIG interface file instead.
 * ----------------------------------------------------------------------------- */

package org.mozilla.deepspeech.libdeepspeech;

public enum DeepSpeech_Error_Codes {
  ERR_OK(0x0000),
  ERR_NO_MODEL(0x1000),
  ERR_INVALID_ALPHABET(0x2000),
  ERR_INVALID_SHAPE(0x2001),
  ERR_INVALID_SCORER(0x2002),
  ERR_MODEL_INCOMPATIBLE(0x2003),
  ERR_SCORER_NOT_ENABLED(0x2004),
  ERR_FAIL_INIT_MMAP(0x3000),
  ERR_FAIL_INIT_SESS(0x3001),
  ERR_FAIL_INTERPRETER(0x3002),
  ERR_FAIL_RUN_SESS(0x3003),
  ERR_FAIL_CREATE_STREAM(0x3004),
  ERR_FAIL_READ_PROTOBUF(0x3005),
  ERR_FAIL_CREATE_SESS(0x3006),
  ERR_FAIL_CREATE_MODEL(0x3007);

  public final int swigValue() {
    return swigValue;
  }

  public static DeepSpeech_Error_Codes swigToEnum(int swigValue) {
    DeepSpeech_Error_Codes[] swigValues = DeepSpeech_Error_Codes.class.getEnumConstants();
    if (swigValue < swigValues.length && swigValue >= 0 && swigValues[swigValue].swigValue == swigValue)
      return swigValues[swigValue];
    for (DeepSpeech_Error_Codes swigEnum : swigValues)
      if (swigEnum.swigValue == swigValue)
        return swigEnum;
    throw new IllegalArgumentException("No enum " + DeepSpeech_Error_Codes.class + " with value " + swigValue);
  }

  @SuppressWarnings("unused")
  private DeepSpeech_Error_Codes() {
    this.swigValue = SwigNext.next++;
  }

  @SuppressWarnings("unused")
  private DeepSpeech_Error_Codes(int swigValue) {
    this.swigValue = swigValue;
    SwigNext.next = swigValue+1;
  }

  @SuppressWarnings("unused")
  private DeepSpeech_Error_Codes(DeepSpeech_Error_Codes swigEnum) {
    this.swigValue = swigEnum.swigValue;
    SwigNext.next = this.swigValue+1;
  }

  private final int swigValue;

  private static class SwigNext {
    private static int next = 0;
  }
}
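Note how `swigToEnum` first tries to index the constants directly and only falls back to a linear scan when ordinal and numeric value disagree; with these sparse hex codes (0x1000, 0x2000, ...) the fast path only ever hits for `ERR_OK`, so the scan is the lookup that actually runs. A hedged Python sketch of the same value-to-name mapping (only a subset of the codes above):

.. code-block:: python

    # Subset of the error codes above, for illustration only.
    ERROR_CODES = {
        0x0000: 'ERR_OK',
        0x1000: 'ERR_NO_MODEL',
        0x2000: 'ERR_INVALID_ALPHABET',
        0x3000: 'ERR_FAIL_INIT_MMAP',
    }

    def error_name(code):
        # Unlike swigToEnum, fall back to the raw value instead of raising.
        return ERROR_CODES.get(code, 'UNKNOWN(0x{:04x})'.format(code))

    assert error_name(0x1000) == 'ERR_NO_MODEL'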
@@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------------
 * This file was automatically generated by SWIG (http://www.swig.org).
 * Version 4.0.2
 * Version 4.0.1
 *
 * Do not make changes to this file unless you know what you are doing--modify
 * the SWIG interface file instead.
@@ -9,7 +9,7 @@
package org.mozilla.deepspeech.libdeepspeech;

/**
 * Stores the entire CTC output as an array of character metadata objects
 * An array of CandidateTranscript objects computed by the model.
 */
public class Metadata {
  private transient long swigCPtr;
@@ -40,61 +40,29 @@ public class Metadata {
  }

  /**
   * List of items
   * Array of CandidateTranscript objects
   */
  public void setItems(MetadataItem value) {
    implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
  public CandidateTranscript getTranscripts() {
    long cPtr = implJNI.Metadata_transcripts_get(swigCPtr, this);
    return (cPtr == 0) ? null : new CandidateTranscript(cPtr, false);
  }

  /**
   * List of items
   * Size of the transcripts array
   */
  public MetadataItem getItems() {
    long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
    return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
  public long getNum_transcripts() {
    return implJNI.Metadata_num_transcripts_get(swigCPtr, this);
  }

  /**
   * Size of the list of items
   */
  public void setNum_items(int value) {
    implJNI.Metadata_num_items_set(swigCPtr, this, value);
  }

  /**
   * Size of the list of items
   */
  public int getNum_items() {
    return implJNI.Metadata_num_items_get(swigCPtr, this);
  }

  /**
   * Approximated confidence value for this transcription. This is roughly the<br>
   * sum of the acoustic model logit values for each timestep/character that<br>
   * contributed to the creation of this transcription.
   */
  public void setConfidence(double value) {
    implJNI.Metadata_confidence_set(swigCPtr, this, value);
  }

  /**
   * Approximated confidence value for this transcription. This is roughly the<br>
   * sum of the acoustic model logit values for each timestep/character that<br>
   * contributed to the creation of this transcription.
   */
  public double getConfidence() {
    return implJNI.Metadata_confidence_get(swigCPtr, this);
  }

  /**
   * Retrieve one MetadataItem element<br>
   * Retrieve one CandidateTranscript element<br>
   * <br>
   * @param i Array index of the MetadataItem to get<br>
   * @param i Array index of the CandidateTranscript to get<br>
   * <br>
   * @return The MetadataItem requested or null
   * @return The CandidateTranscript requested or null
   */
  public MetadataItem getItem(int i) {
    return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
  public CandidateTranscript getTranscript(int i) {
    return new CandidateTranscript(implJNI.Metadata_getTranscript(swigCPtr, this, i), false);
  }

}
@@ -4,7 +4,7 @@ Javadoc for Sphinx

This code is only here as a reference for documentation generation.

To update, please build SWIG (4.0 at least) and then run from native_client/java:
To update, please install SWIG (4.0 at least) and then run from native_client/java:

.. code-block::
@@ -0,0 +1,58 @@
/* ----------------------------------------------------------------------------
 * This file was automatically generated by SWIG (http://www.swig.org).
 * Version 4.0.1
 *
 * Do not make changes to this file unless you know what you are doing--modify
 * the SWIG interface file instead.
 * ----------------------------------------------------------------------------- */

package org.mozilla.deepspeech.libdeepspeech;

/**
 * Stores text of an individual token, along with its timing information
 */
public class TokenMetadata {
  private transient long swigCPtr;
  protected transient boolean swigCMemOwn;

  protected TokenMetadata(long cPtr, boolean cMemoryOwn) {
    swigCMemOwn = cMemoryOwn;
    swigCPtr = cPtr;
  }

  protected static long getCPtr(TokenMetadata obj) {
    return (obj == null) ? 0 : obj.swigCPtr;
  }

  public synchronized void delete() {
    if (swigCPtr != 0) {
      if (swigCMemOwn) {
        swigCMemOwn = false;
        throw new UnsupportedOperationException("C++ destructor does not have public access");
      }
      swigCPtr = 0;
    }
  }

  /**
   * The text corresponding to this token
   */
  public String getText() {
    return implJNI.TokenMetadata_text_get(swigCPtr, this);
  }

  /**
   * Position of the token in units of 20ms
   */
  public long getTimestep() {
    return implJNI.TokenMetadata_timestep_get(swigCPtr, this);
  }

  /**
   * Position of the token in seconds
   */
  public float getStart_time() {
    return implJNI.TokenMetadata_start_time_get(swigCPtr, this);
  }

}
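The two positions are redundant by construction: `start_time` is derived from `timestep` using the model's window step over its sample rate (see `ModelState::decode_metadata` further down). A quick arithmetic check in Python, assuming the default 16 kHz model with a 320-sample step (20 ms):

.. code-block:: python

    audio_win_step = 320   # samples; assumed default
    sample_rate = 16000    # Hz; assumed default

    def start_time(timestep):
        # Mirrors timestep * (audio_win_step / sample_rate), clamped at 0.
        return max(0.0, timestep * audio_win_step / sample_rate)

    assert start_time(50) == 1.0   # a token at timestep 50 starts at 1 s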
@@ -42,12 +42,11 @@ function totalTime(hrtimeValue) {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

function metadataToString(metadata) {
function candidateTranscriptToString(transcript) {
  var retval = ""
  for (var i = 0; i < metadata.num_items; ++i) {
    retval += metadata.items[i].character;
  for (var i = 0; i < transcript.tokens.length; ++i) {
    retval += transcript.tokens[i].text;
  }
  Ds.FreeMetadata(metadata);
  return retval;
}
@@ -117,7 +116,9 @@ audioStream.on('finish', () => {
  const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);

  if (args['extended']) {
    console.log(metadataToString(model.sttWithMetadata(audioBuffer)));
    let metadata = model.sttWithMetadata(audioBuffer, 1);
    console.log(candidateTranscriptToString(metadata.transcripts[0]));
    Ds.FreeMetadata(metadata);
  } else {
    console.log(model.stt(audioBuffer));
  }
@@ -47,8 +47,8 @@ using namespace node;
%typemap(argout) ModelState **retval {
  $result = SWIGV8_ARRAY_NEW();
  SWIGV8_AppendOutput($result, SWIG_From_int(result));
  // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
  // owned by the application. NodeJS does not guarantee the finalizer will be called so applications must call FreeMetadata themselves.
  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}
@@ -68,27 +68,29 @@ using namespace node;
%nodefaultctor ModelState;
%nodefaultdtor ModelState;

%typemap(out) MetadataItem* %{
%typemap(out) TokenMetadata* %{
  $result = SWIGV8_ARRAY_NEW();
  for (int i = 0; i < arg1->num_items; ++i) {
    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
  for (int i = 0; i < arg1->num_tokens; ++i) {
    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_TokenMetadata, 0));
  }
%}

%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;

%extend struct Metadata {
  ~Metadata() {
    DS_FreeMetadata($self);
%typemap(out) CandidateTranscript* %{
  $result = SWIGV8_ARRAY_NEW();
  for (int i = 0; i < arg1->num_transcripts; ++i) {
    SWIGV8_AppendOutput($result, SWIG_NewPointerObj(SWIG_as_voidptr(&result[i]), SWIGTYPE_p_CandidateTranscript, 0));
  }
}
%}

%extend struct MetadataItem {
  ~MetadataItem() { }
}
%ignore Metadata::num_transcripts;
%ignore CandidateTranscript::num_tokens;

%nodefaultctor Metadata;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;

%rename ("%(strip:[DS_])s") "";
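Because `num_transcripts` and `num_tokens` are `%ignore`d, the scripting bindings never see the raw counters: the `transcripts` and `tokens` properties come back as native arrays that know their own length. In the Python binding (same pattern, shown later in this diff) an n-best list is just a comprehension (sketch):

.. code-block:: python

    def n_best(metadata):
        # len() replaces num_transcripts / num_tokens in the bindings.
        return [(t.confidence, ''.join(tok.text for tok in t.tokens))
                for t in metadata.transcripts]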
@@ -115,15 +115,16 @@ Model.prototype.stt = function(aBuffer) {
}

/**
 * Use the DeepSpeech model to perform Speech-To-Text and output metadata
 * about the results.
 * Use the DeepSpeech model to perform Speech-To-Text and output results including metadata.
 *
 * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
Model.prototype.sttWithMetadata = function(aBuffer) {
  return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
Model.prototype.sttWithMetadata = function(aBuffer, aNumResults) {
  aNumResults = aNumResults || 1;
  return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
}

/**
@@ -172,7 +173,19 @@ Stream.prototype.intermediateDecode = function() {
}

/**
 * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
 * Compute the intermediate decoding of an ongoing streaming inference, return results including metadata.
 *
 * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
 * @return {object} :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
Stream.prototype.intermediateDecodeWithMetadata = function(aNumResults) {
  aNumResults = aNumResults || 1;
  return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
}

/**
 * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
 *
 * @return {string} The STT result.
 *
@@ -185,14 +198,17 @@ Stream.prototype.finishStream = function() {
}

/**
 * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
 * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
 *
 * @param {number} aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
 * This method will free the stream; it must not be used after this method is called.
 */
Stream.prototype.finishStreamWithMetadata = function() {
  result = binding.FinishStreamWithMetadata(this._impl);
Stream.prototype.finishStreamWithMetadata = function(aNumResults) {
  aNumResults = aNumResults || 1;
  result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
  this._impl = null;
  return result;
}
@@ -236,70 +252,80 @@ function Version() {
}


//// Metadata and MetadataItem are here only for documentation purposes
//// Metadata, CandidateTranscript and TokenMetadata are here only for documentation purposes

/**
 * @class
 *
 * Stores each individual character, along with its timing information
 * Stores text of an individual token, along with its timing information
 */
function MetadataItem() {}
function TokenMetadata() {}

/**
 * The character generated for transcription
 * The text corresponding to this token
 *
 * @return {string} The character generated
 * @return {string} The text generated
 */
MetadataItem.prototype.character = function() {}
TokenMetadata.prototype.text = function() {}

/**
 * Position of the character in units of 20ms
 * Position of the token in units of 20ms
 *
 * @return {int} The position of the character
 * @return {int} The position of the token
 */
MetadataItem.prototype.timestep = function() {};
TokenMetadata.prototype.timestep = function() {};

/**
 * Position of the character in seconds
 * Position of the token in seconds
 *
 * @return {float} The position of the character
 * @return {float} The position of the token
 */
MetadataItem.prototype.start_time = function() {};
TokenMetadata.prototype.start_time = function() {};

/**
 * @class
 *
 * Stores the entire CTC output as an array of character metadata objects
 * A single transcript computed by the model, including a confidence value and
 * the metadata for its constituent tokens.
 */
function Metadata () {}
function CandidateTranscript () {}

/**
 * List of items
 * Array of tokens
 *
 * @return {array} List of :js:func:`MetadataItem`
 * @return {array} Array of :js:func:`TokenMetadata`
 */
Metadata.prototype.items = function() {}

/**
 * Size of the list of items
 *
 * @return {int} Number of items
 */
Metadata.prototype.num_items = function() {}
CandidateTranscript.prototype.tokens = function() {}

/**
 * Approximated confidence value for this transcription. This is roughly the
 * sum of the acoustic model logit values for each timestep/character that
 * sum of the acoustic model logit values for each timestep/token that
 * contributed to the creation of this transcription.
 *
 * @return {float} Confidence value
 */
Metadata.prototype.confidence = function() {}
CandidateTranscript.prototype.confidence = function() {}

/**
 * @class
 *
 * An array of CandidateTranscript objects computed by the model.
 */
function Metadata () {}

/**
 * Array of transcripts
 *
 * @return {array} Array of :js:func:`CandidateTranscript` objects
 */
Metadata.prototype.transcripts = function() {}


module.exports = {
  Model: Model,
  Metadata: Metadata,
  MetadataItem: MetadataItem,
  CandidateTranscript: CandidateTranscript,
  TokenMetadata: TokenMetadata,
  Version: Version,
  FreeModel: FreeModel,
  FreeStream: FreeStream,
@@ -37,27 +37,39 @@ ModelState::decode(const DecoderState& state) const
}

Metadata*
ModelState::decode_metadata(const DecoderState& state)
ModelState::decode_metadata(const DecoderState& state,
                            size_t num_results)
{
  vector<Output> out = state.decode();
  vector<Output> out = state.decode(num_results);
  unsigned int num_returned = out.size();

  std::unique_ptr<Metadata> metadata(new Metadata());
  metadata->num_items = out[0].tokens.size();
  metadata->confidence = out[0].confidence;
  CandidateTranscript* transcripts = (CandidateTranscript*)malloc(sizeof(CandidateTranscript)*num_returned);

  std::unique_ptr<MetadataItem[]> items(new MetadataItem[metadata->num_items]());
  for (int i = 0; i < num_returned; ++i) {
    TokenMetadata* tokens = (TokenMetadata*)malloc(sizeof(TokenMetadata)*out[i].tokens.size());

  // Loop through each character
  for (int i = 0; i < out[0].tokens.size(); ++i) {
    items[i].character = strdup(alphabet_.StringFromLabel(out[0].tokens[i]).c_str());
    items[i].timestep = out[0].timesteps[i];
    items[i].start_time = out[0].timesteps[i] * ((float)audio_win_step_ / sample_rate_);

    if (items[i].start_time < 0) {
      items[i].start_time = 0;
    }
    for (int j = 0; j < out[i].tokens.size(); ++j) {
      TokenMetadata token {
        strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()),    // text
        static_cast<unsigned int>(out[i].timesteps[j]),                 // timestep
        out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_),  // start_time
      };
      memcpy(&tokens[j], &token, sizeof(TokenMetadata));
    }

  metadata->items = items.release();
  return metadata.release();
    CandidateTranscript transcript {
      tokens,                                           // tokens
      static_cast<unsigned int>(out[i].tokens.size()),  // num_tokens
      out[i].confidence,                                // confidence
    };
    memcpy(&transcripts[i], &transcript, sizeof(CandidateTranscript));
  }

  Metadata* ret = (Metadata*)malloc(sizeof(Metadata));
  Metadata metadata {
    transcripts,   // transcripts
    num_returned,  // num_transcripts
  };
  memcpy(ret, &metadata, sizeof(Metadata));
  return ret;
}
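The temporary-aggregate-plus-`memcpy` dance is there presumably because the public structs declare their members `const`, so a bare `malloc` allocation cannot be assigned field by field. The tree being built is easiest to see flattened out; a hedged Python sketch with made-up values:

.. code-block:: python

    # Shape of the tree decode_metadata() allocates (values illustrative).
    # num_results is an upper bound: len(transcripts) may be smaller.
    metadata = {
        'transcripts': [
            {
                'confidence': -12.3,
                'tokens': [
                    {'text': 'h', 'timestep': 37, 'start_time': 0.74},
                    {'text': 'i', 'timestep': 41, 'start_time': 0.82},
                ],
            },
        ],
    }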
@@ -66,11 +66,14 @@ struct ModelState {
   * @brief Return character-level metadata including letter timings.
   *
   * @param state Decoder state to use when decoding.
   * @param num_results Maximum number of candidate results to return.
   *
   * @return Metadata struct containing MetadataItem structs for each character.
   *         The user is responsible for freeing Metadata by calling DS_FreeMetadata().
   * @return A Metadata struct containing CandidateTranscript structs.
   *         Each represents a candidate transcript, with the first ranked most probable.
   *         The user is responsible for freeing the result by calling DS_FreeMetadata().
   */
  virtual Metadata* decode_metadata(const DecoderState& state);
  virtual Metadata* decode_metadata(const DecoderState& state,
                                    size_t num_results);
};

#endif // MODELSTATE_H
@@ -121,17 +121,20 @@ class Model(object):
        """
        return deepspeech.impl.SpeechToText(self._impl, audio_buffer)

    def sttWithMetadata(self, audio_buffer):
    def sttWithMetadata(self, audio_buffer, num_results=1):
        """
        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
        Use the DeepSpeech model to perform Speech-To-Text and return results including metadata.

        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type audio_buffer: numpy.int16 array

        :return: Outputs a struct of individual letters along with their timing information.
        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
        :type num_results: int

        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
        :type: :func:`Metadata`
        """
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer, num_results)

    def createStream(self):
        """
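Putting the batch API together, a minimal end-to-end sketch (the model path and WAV file are placeholders; the audio must already match the model's sample rate):

.. code-block:: python

    import wave
    import numpy as np
    from deepspeech import Model

    model = Model('deepspeech-0.7.0-models.pbmm')  # hypothetical path
    with wave.open('audio.wav', 'rb') as w:        # 16-bit mono, 16 kHz
        audio = np.frombuffer(w.readframes(w.getnframes()), np.int16)

    metadata = model.sttWithMetadata(audio, num_results=3)
    for rank, transcript in enumerate(metadata.transcripts):
        text = ''.join(token.text for token in transcript.tokens)
        print(rank, transcript.confidence, text)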
@@ -187,10 +190,27 @@ class Stream(object):
            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
        return deepspeech.impl.IntermediateDecode(self._impl)

    def intermediateDecodeWithMetadata(self, num_results=1):
        """
        Compute the intermediate decoding of an ongoing streaming inference and return results including metadata.

        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
        :type num_results: int

        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
        :type: :func:`Metadata`

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
        return deepspeech.impl.IntermediateDecodeWithMetadata(self._impl, num_results)

    def finishStream(self):
        """
        Signal the end of an audio signal to an ongoing streaming inference,
        returns the STT result over the whole audio signal.
        Compute the final decoding of an ongoing streaming inference and return
        the result. Signals the end of an ongoing streaming inference. The underlying
        stream object must not be used after this method is called.

        :return: The STT result.
        :type: str
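A sketch of how a caller might poll partial hypotheses while feeding a stream (the audio chunks are stand-ins; a real client would pull them from a microphone or file):

.. code-block:: python

    import numpy as np
    from deepspeech import Model

    model = Model('deepspeech-0.7.0-models.pbmm')  # hypothetical path
    stream = model.createStream()
    for _ in range(50):
        chunk = np.zeros(320, dtype=np.int16)      # stand-in for live audio
        stream.feedAudioContent(chunk)
        partial = stream.intermediateDecodeWithMetadata(num_results=1)
        for transcript in partial.transcripts:
            print(''.join(token.text for token in transcript.tokens))
    print(stream.finishStream())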
@@ -203,19 +223,24 @@ class Stream(object):
        self._impl = None
        return result

    def finishStreamWithMetadata(self):
    def finishStreamWithMetadata(self, num_results=1):
        """
        Signal the end of an audio signal to an ongoing streaming inference,
        returns per-letter metadata.
        Compute the final decoding of an ongoing streaming inference and return
        results including metadata. Signals the end of an ongoing streaming
        inference. The underlying stream object must not be used after this
        method is called.

        :return: Outputs a struct of individual letters along with their timing information.
        :param num_results: Maximum number of candidate transcripts to return. Returned list might be smaller than this.
        :type num_results: int

        :return: Metadata object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
        :type: :func:`Metadata`

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
        result = deepspeech.impl.FinishStreamWithMetadata(self._impl, num_results)
        self._impl = None
        return result
@@ -233,52 +258,43 @@ class Stream(object):


# This is only for documentation purpose
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
class MetadataItem(object):
# Metadata, CandidateTranscript and TokenMetadata should be in sync with native_client/deepspeech.h
class TokenMetadata(object):
    """
    Stores each individual character, along with its timing information
    """

    def character(self):
    def text(self):
        """
        The character generated for transcription
        The text for this token
        """

    def timestep(self):
        """
        Position of the character in units of 20ms
        Position of the token in units of 20ms
        """

    def start_time(self):
        """
        Position of the character in seconds
        Position of the token in seconds
        """


class Metadata(object):
class CandidateTranscript(object):
    """
    Stores the entire CTC output as an array of character metadata objects
    """
    def items(self):
    def tokens(self):
        """
        List of items
        List of tokens

        :return: A list of :func:`MetadataItem` elements
        :return: A list of :func:`TokenMetadata` elements
        :type: list
        """

    def num_items(self):
        """
        Size of the list of items

        :return: Size of the list of items
        :type: int
        """

    def confidence(self):
        """
        Approximated confidence value for this transcription. This is roughly the
@@ -286,3 +302,12 @@ class Metadata(object):
        contributed to the creation of this transcription.
        """


class Metadata(object):
    def transcripts(self):
        """
        List of candidate transcripts

        :return: A list of :func:`CandidateTranscript` objects
        :type: list
        """
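These documentation-only classes mirror the C structs one-to-one. A compact dataclass sketch of the same relationship (illustrative only, not part of the package):

.. code-block:: python

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class TokenMetadata:
        text: str          # one token, e.g. a single character
        timestep: int      # position in 20 ms steps
        start_time: float  # position in seconds

    @dataclass
    class CandidateTranscript:
        tokens: List[TokenMetadata]
        confidence: float  # summed acoustic logits; higher is better

    @dataclass
    class Metadata:
        transcripts: List[CandidateTranscript]  # most probable first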
@@ -18,6 +18,7 @@ try:
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
    try:
@@ -31,25 +32,25 @@ def convert_samplerate(audio_path, desired_sample_rate):


def metadata_to_string(metadata):
    return ''.join(item.character for item in metadata.items)
    return ''.join(token.text for token in metadata.tokens)


def words_from_metadata(metadata):
def words_from_candidate_transcript(metadata):
    word = ""
    word_list = []
    word_start_time = 0
    # Loop through each character
    for i in range(0, metadata.num_items):
        item = metadata.items[i]
    for i, token in enumerate(metadata.tokens):
        # Append character to word if it's not a space
        if item.character != " ":
        if token.text != " ":
            if len(word) == 0:
                # Log the start time of the new word
                word_start_time = item.start_time
                word_start_time = token.start_time

            word = word + item.character
            word = word + token.text
        # Word boundary is either a space or the last character in the array
        if item.character == " " or i == metadata.num_items - 1:
            word_duration = item.start_time - word_start_time
        if token.text == " " or i == len(metadata.tokens) - 1:
            word_duration = token.start_time - word_start_time

            if word_duration < 0:
                word_duration = 0
|
|||
|
||||
def metadata_json_output(metadata):
|
||||
json_result = dict()
|
||||
json_result["words"] = words_from_metadata(metadata)
|
||||
json_result["confidence"] = metadata.confidence
|
||||
return json.dumps(json_result)
|
||||
json_result["transcripts"] = [{
|
||||
"confidence": transcript.confidence,
|
||||
"words": words_from_candidate_transcript(transcript),
|
||||
} for transcript in metadata.transcripts]
|
||||
return json.dumps(json_result, indent=2)
|
||||
|
||||
|
||||
|
||||
|
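With three candidates requested (as `main()` below does for `--json`), the output takes this general shape; the confidences are made-up values and the word entries abbreviated, since their exact fields come from `words_from_candidate_transcript`:

.. code-block:: python

    # Illustrative --json output shape only.
    example_output = {
        "transcripts": [
            {"confidence": -8.1, "words": ["..."]},
            {"confidence": -9.4, "words": ["..."]},
            {"confidence": -11.0, "words": ["..."]},
        ]
    }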
@@ -141,9 +144,9 @@ def main():
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
@@ -38,30 +38,69 @@ import_array();
  %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
}

%typemap(out) MetadataItem* %{
  $result = PyList_New(arg1->num_items);
  for (int i = 0; i < arg1->num_items; ++i) {
    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->items[i]), SWIGTYPE_p_MetadataItem, 0);
%fragment("parent_reference_init", "init") {
  // Thread-safe initialization - initialize during Python module initialization
  parent_reference();
}

%fragment("parent_reference_function", "header", fragment="parent_reference_init") {

static PyObject *parent_reference() {
  static PyObject *parent_reference_string = SWIG_Python_str_FromChar("__parent_reference");
  return parent_reference_string;
}

}

%typemap(out, fragment="parent_reference_function") CandidateTranscript* %{
  $result = PyList_New(arg1->num_transcripts);
  for (int i = 0; i < arg1->num_transcripts; ++i) {
    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->transcripts[i]), SWIGTYPE_p_CandidateTranscript, 0);
    // Add a reference to Metadata in the returned elements to avoid premature
    // garbage collection
    PyObject_SetAttr(o, parent_reference(), $self);
    PyList_SetItem($result, i, o);
  }
%}

%extend struct MetadataItem {
%typemap(out, fragment="parent_reference_function") TokenMetadata* %{
  $result = PyList_New(arg1->num_tokens);
  for (int i = 0; i < arg1->num_tokens; ++i) {
    PyObject* o = SWIG_NewPointerObj(SWIG_as_voidptr(&arg1->tokens[i]), SWIGTYPE_p_TokenMetadata, 0);
    // Add a reference to CandidateTranscript in the returned elements to avoid premature
    // garbage collection
    PyObject_SetAttr(o, parent_reference(), $self);
    PyList_SetItem($result, i, o);
  }
%}

%extend struct TokenMetadata {
  %pythoncode %{
    def __repr__(self):
        return 'MetadataItem(character=\'{}\', timestep={}, start_time={})'.format(self.character, self.timestep, self.start_time)
        return 'TokenMetadata(text=\'{}\', timestep={}, start_time={})'.format(self.text, self.timestep, self.start_time)
  %}
}

%extend struct CandidateTranscript {
  %pythoncode %{
    def __repr__(self):
        tokens_repr = ',\n'.join(repr(i) for i in self.tokens)
        tokens_repr = '\n'.join(' ' + l for l in tokens_repr.split('\n'))
        return 'CandidateTranscript(confidence={}, tokens=[\n{}\n])'.format(self.confidence, tokens_repr)
  %}
}

%extend struct Metadata {
  %pythoncode %{
    def __repr__(self):
        items_repr = ', \n'.join(' ' + repr(i) for i in self.items)
        return 'Metadata(confidence={}, items=[\n{}\n])'.format(self.confidence, items_repr)
        transcripts_repr = ',\n'.join(repr(i) for i in self.transcripts)
        transcripts_repr = '\n'.join(' ' + l for l in transcripts_repr.split('\n'))
        return 'Metadata(transcripts=[\n{}\n])'.format(transcripts_repr)
  %}
}

%ignore Metadata::num_items;
%ignore Metadata::num_transcripts;
%ignore CandidateTranscript::num_tokens;

%extend struct Metadata {
  ~Metadata() {
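The `__parent_reference` attribute is the binding's guard against a subtle lifetime bug: transcripts and tokens are views into memory owned by the top-level `Metadata` allocation, so each wrapper pins its parent. A hedged sketch of the access pattern this makes safe:

.. code-block:: python

    # Without the parent reference, `token` could outlive `metadata` and
    # point into freed memory. With it, the child keeps the Metadata
    # wrapper (and thus the underlying C allocation) alive.
    def first_token(model, audio):
        metadata = model.sttWithMetadata(audio, 1)
        return metadata.transcripts[0].tokens[0]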
@@ -69,10 +108,12 @@ import_array();
  }
}

%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%nodefaultdtor Metadata;
%nodefaultctor CandidateTranscript;
%nodefaultdtor CandidateTranscript;
%nodefaultctor TokenMetadata;
%nodefaultdtor TokenMetadata;

%typemap(newfree) char* "DS_FreeString($1);";