Address review comments

parent 1d3b3a31a1
commit 8dedda7759
@@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
 
-2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 
 For more information on how to build these resources from scratch, see `data/lm/README.md`
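For orientation, the layout implied by this README (paths taken only from the text above; a sketch, not part of the commit):

.. code-block:: bash

   ls data/alphabet.txt            # unique characters for the target language
   ls data/lm/generate_lm.py       # builds the binary n-gram language model
   ls data/lm/generate_package.py  # bundles the LM into the scorer package
   ls data/lm/kenlm.scorer         # the generated scorer package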
@@ -1,8 +1,8 @@
-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 
 The scorer package was then built using the `generate_package.py` script:
 
 .. code-block:: bash
 
+   python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
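The README requires lmplz, build_binary and filter on $PATH before generate_lm.py will run. A minimal sketch of the full sequence, assuming a CMake build of KenLM under ~/kenlm/build (that location is an assumption, not something this diff states):

.. code-block:: bash

   # Assumption: KenLM's tools were built into ~/kenlm/build/bin.
   export PATH="$HOME/kenlm/build/bin:$PATH"
   command -v lmplz build_binary filter  # all three must resolve
   python generate_lm.py                 # writes lm.binary and librispeech-vocab-500k.txt
   python generate_package.py --alphabet ../alphabet.txt --lm lm.binary \
       --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 \
       --package kenlm.scorer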
@@ -27,7 +27,7 @@ This module should be self-contained:
 - pip install native_client/python/dist/deepspeech*.whl
 - pip install -r requirements_eval_tflite.txt
 
-Then run with a TF Lite model, LM and a CSV test file
+Then run with a TF Lite model, a scorer and a CSV test file
 '''
 
 BEAM_WIDTH = 500
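Putting the docstring's steps together, a hedged end-to-end run; the script name and flags below are illustrative assumptions, not taken from this diff:

.. code-block:: bash

   pip install native_client/python/dist/deepspeech*.whl
   pip install -r requirements_eval_tflite.txt
   # Hypothetical invocation; check the module's argument parser for the real names.
   python evaluate_tflite.py --model output_graph.tflite --scorer kenlm.scorer --csv test.csv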
@@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
+            assert alpha is not None, 'alpha parameter is required'
+            assert beta is not None, 'beta parameter is required'
+            assert scorer_path, 'scorer_path parameter is required'
+
             serialized = alphabet.serialize()
             native_alphabet = swigwrapper.Alphabet()
             err = native_alphabet.deserialize(serialized, len(serialized))
             if err != 0:
-                raise ValueError("Error when deserializing alphabet.")
+                raise ValueError('Error when deserializing alphabet.')
 
             err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
-                raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+                raise ValueError('Scorer initialization failed with error code {}'.format(err))
 
             self.reset_params(alpha, beta)
@@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet,
                    size_t beam_size,
                    double cutoff_prob,
                    size_t cutoff_top_n,
-                   Scorer *ext_scorer)
+                   std::shared_ptr<Scorer> ext_scorer)
 {
   // assign special ids
   abs_time_step_ = 0;
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
 
-  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
+  if (ext_scorer && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
@@ -58,7 +58,7 @@ DecoderState::next(const double *probs,
 
   float min_cutoff = -NUM_FLT_INF;
   bool full_beam = false;
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     size_t num_prefixes = std::min(prefixes_.size(), beam_size_);
     std::partial_sort(prefixes_.begin(),
                       prefixes_.begin() + num_prefixes,
@@ -109,7 +109,7 @@ DecoderState::next(const double *probs,
         log_p = log_prob_c + prefix->score;
       }
 
-      if (ext_scorer_ != nullptr) {
+      if (ext_scorer_) {
         // skip scoring the space in word based LMs
         PathTrie* prefix_to_score;
         if (ext_scorer_->is_utf8_mode()) {
@@ -166,7 +166,7 @@ DecoderState::decode() const
   }
 
   // score the last word of each prefix that doesn't end with space
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
@@ -200,7 +200,7 @@ DecoderState::decode() const
     Output output;
     prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps);
     double approx_ctc = scores[prefixes_copy[i]];
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       auto words = ext_scorer_->split_labels_into_scored_units(output.tokens);
       // remove term insertion weight
      approx_ctc -= words.size() * ext_scorer_->beta;
@@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
@@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element");
@@ -1,6 +1,7 @@
 #ifndef CTC_BEAM_SEARCH_DECODER_H_
 #define CTC_BEAM_SEARCH_DECODER_H_
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -16,7 +17,7 @@ class DecoderState {
   double cutoff_prob_;
   size_t cutoff_top_n_;
 
-  Scorer* ext_scorer_; // weak
+  std::shared_ptr<Scorer> ext_scorer_;
   std::vector<PathTrie*> prefixes_;
   std::unique_ptr<PathTrie> prefix_root_;
 
@@ -45,7 +46,7 @@ public:
             size_t beam_size,
             double cutoff_prob,
             size_t cutoff_top_n,
-            Scorer *ext_scorer);
+            std::shared_ptr<Scorer> ext_scorer);
 
   /* Send data to the decoder
    *
@@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 /* CTC Beam Search Decoder for batch data
  * Parameters:
@@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 #endif // CTC_BEAM_SEARCH_DECODER_H_
@@ -71,8 +71,19 @@ void Scorer::setup_char_map()
 
 int Scorer::load_lm(const std::string& lm_path)
 {
-  // load language model
+  // Check if file is readable to avoid KenLM throwing an exception
   const char* filename = lm_path.c_str();
+  if (access(filename, R_OK) != 0) {
+    return 1;
+  }
+
+  // Check if the file format is valid to avoid KenLM throwing an exception
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(filename, model_type)) {
+    return 1;
+  }
+
+  // Load the LM
   lm::ngram::Config config;
   config.load_method = util::LoadMethod::LAZY;
   language_model_.reset(lm::ngram::LoadVirtual(filename, config));
@@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path)
   int magic;
   fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
   if (magic != MAGIC) {
-    std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
-                 "your trie file." << std::endl;
+    std::cerr << "Error: Can't parse scorer file, invalid header. Try updating "
+                 "your scorer file." << std::endl;
     return 1;
   }
 
   int version;
   fin.read(reinterpret_cast<char*>(&version), sizeof(version));
   if (version != FILE_VERSION) {
-    std::cerr << "Error: Trie file version mismatch (" << version
+    std::cerr << "Error: Scorer file version mismatch (" << version
               << " instead of expected " << FILE_VERSION
               << "). ";
     if (version < FILE_VERSION) {
-      std::cerr << "Update your trie file.";
+      std::cerr << "Update your scorer file.";
     } else {
-      std::cerr << "Downgrade your trie file or update your version of DeepSpeech.";
+      std::cerr << "Downgrade your scorer file or update your version of DeepSpeech.";
     }
     std::cerr << std::endl;
     return 1;
@@ -7,9 +7,10 @@
 #include "workspace_status.h"
 %}
 
-%include "pyabc.i"
-%include "std_string.i"
-%include "std_vector.i"
+%include <pyabc.i>
+%include <std_string.i>
+%include <std_vector.i>
+%include <std_shared_ptr.i>
 %include "numpy.i"
 
 %init %{
@@ -20,6 +21,8 @@ namespace std {
   %template(StringVector) vector<string>;
 }
 
+%shared_ptr(Scorer);
+
 // Convert NumPy arrays to pointer+lengths
 %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)};
 %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)};
@@ -319,7 +319,7 @@ int
 DS_DisableExternalScorer(ModelState* aCtx)
 {
   if (aCtx->scorer_) {
-    aCtx->scorer_.reset(nullptr);
+    aCtx->scorer_.reset();
     return DS_ERR_OK;
   }
   return DS_ERR_SCORER_NOT_ENABLED;
@@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx,
                       aCtx->beam_width_,
                       cutoff_prob,
                       cutoff_top_n,
-                      aCtx->scorer_.get());
+                      aCtx->scorer_);
 
   *retval = ctx.release();
   return DS_ERR_OK;
@@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 
 
 * ``output_graph.tflite`` which is the TF Lite model
-* ``kenlm.scorer``, if you want to use the language model ; please
-  be aware that too big language model will make the device run out of memory
+* ``kenlm.scorer``, if you want to use the scorer; please be aware that too big
+  scorer will make the device run out of memory
 
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
 
 
 * ``deepspeech``
 * ``libdeepspeech.so``
 * ``libc++_shared.so``
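A concrete way to stage those files with adb, using only the file names and device paths the docs list above:

.. code-block:: bash

   adb shell mkdir -p /sdcard/deepspeech /data/local/tmp/ds
   adb push output_graph.tflite /sdcard/deepspeech/
   adb push kenlm.scorer /sdcard/deepspeech/
   adb push deepspeech /data/local/tmp/ds/
   adb push libdeepspeech.so /data/local/tmp/ds/
   adb push libc++_shared.so /data/local/tmp/ds/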
@@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
@@ -16,7 +16,7 @@ struct ModelState {
   static constexpr unsigned int BATCH_SIZE = 1;
 
   Alphabet alphabet_;
-  std::unique_ptr<Scorer> scorer_;
+  std::shared_ptr<Scorer> scorer_;
   unsigned int beam_width_;
   unsigned int n_steps_;
   unsigned int n_context_;
|
@ -95,9 +95,9 @@ def main():
|
|||||||
parser.add_argument('--beam_width', type=int, default=500,
|
parser.add_argument('--beam_width', type=int, default=500,
|
||||||
help='Beam width for the CTC decoder')
|
help='Beam width for the CTC decoder')
|
||||||
parser.add_argument('--lm_alpha', type=float,
|
parser.add_argument('--lm_alpha', type=float,
|
||||||
help='Language model weight (lm_alpha)')
|
help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
|
||||||
parser.add_argument('--lm_beta', type=float,
|
parser.add_argument('--lm_beta', type=float,
|
||||||
help='Word insertion bonus (lm_beta)')
|
help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
|
||||||
parser.add_argument('--version', action=VersionAction,
|
parser.add_argument('--version', action=VersionAction,
|
||||||
help='Print version and exits')
|
help='Print version and exits')
|
||||||
parser.add_argument('--extended', required=False, action='store_true',
|
parser.add_argument('--extended', required=False, action='store_true',
|
||||||
|