Address review comments

Reuben Morais 2020-02-05 17:19:53 +01:00
parent 1d3b3a31a1
commit 8dedda7759
13 changed files with 56 additions and 38 deletions

View File

@@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
-2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 For more information on how to build these resources from scratch, see `data/lm/README.md`

View File

@@ -1,8 +1,8 @@
-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 The scorer package was then built using the `generate_package.py` script:
 .. code-block:: bash
+   python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer

View File

@@ -27,7 +27,7 @@ This module should be self-contained:
 - pip install native_client/python/dist/deepspeech*.whl
 - pip install -r requirements_eval_tflite.txt
-Then run with a TF Lite model, LM and a CSV test file
+Then run with a TF Lite model, a scorer and a CSV test file
 '''
 BEAM_WIDTH = 500

View File

@@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
+            assert alpha is not None, 'alpha parameter is required'
+            assert beta is not None, 'beta parameter is required'
+            assert scorer_path, 'scorer_path parameter is required'
             serialized = alphabet.serialize()
             native_alphabet = swigwrapper.Alphabet()
             err = native_alphabet.deserialize(serialized, len(serialized))
             if err != 0:
-                raise ValueError("Error when deserializing alphabet.")
+                raise ValueError('Error when deserializing alphabet.')
             err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
-                raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+                raise ValueError('Scorer initialization failed with error code {}'.format(err))
             self.reset_params(alpha, beta)

View File

@@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet,
                    size_t beam_size,
                    double cutoff_prob,
                    size_t cutoff_top_n,
-                   Scorer *ext_scorer)
+                   std::shared_ptr<Scorer> ext_scorer)
 {
   // assign special ids
   abs_time_step_ = 0;
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
-  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
+  if (ext_scorer && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
@@ -58,7 +58,7 @@ DecoderState::next(const double *probs,
   float min_cutoff = -NUM_FLT_INF;
   bool full_beam = false;
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     size_t num_prefixes = std::min(prefixes_.size(), beam_size_);
     std::partial_sort(prefixes_.begin(),
                       prefixes_.begin() + num_prefixes,
@@ -109,7 +109,7 @@ DecoderState::next(const double *probs,
       log_p = log_prob_c + prefix->score;
     }
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       // skip scoring the space in word based LMs
       PathTrie* prefix_to_score;
       if (ext_scorer_->is_utf8_mode()) {
@@ -166,7 +166,7 @@ DecoderState::decode() const
   }
   // score the last word of each prefix that doesn't end with space
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
@@ -200,7 +200,7 @@ DecoderState::decode() const
     Output output;
     prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps);
     double approx_ctc = scores[prefixes_copy[i]];
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       auto words = ext_scorer_->split_labels_into_scored_units(output.tokens);
       // remove term insertion weight
       approx_ctc -= words.size() * ext_scorer_->beta;
@@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
@@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element");
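Note: the change running through this file (and the header below) replaces the decoder's borrowed `Scorer*` with `std::shared_ptr<Scorer>`, so a live decoder state shares ownership of the scorer rather than holding a raw pointer that can dangle. A minimal sketch of the ownership semantics this buys, using stand-in types rather than the real DeepSpeech classes:

    #include <cassert>
    #include <memory>

    struct Scorer { double alpha = 0.75, beta = 1.85; };  // stand-in, not the real class

    struct DecoderState {
      std::shared_ptr<Scorer> ext_scorer_;  // shares ownership with the caller
    };

    int main() {
      auto scorer = std::make_shared<Scorer>();
      DecoderState state{scorer};
      scorer.reset();             // the original owner releases its reference...
      assert(state.ext_scorer_);  // ...but the decoder's copy keeps the Scorer alive
      return 0;
    }

This is also why the `!= nullptr` comparisons become plain truthiness checks above: `shared_ptr` converts to `bool` directly.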

View File

@@ -1,6 +1,7 @@
 #ifndef CTC_BEAM_SEARCH_DECODER_H_
 #define CTC_BEAM_SEARCH_DECODER_H_
+#include <memory>
 #include <string>
 #include <vector>
@@ -16,7 +17,7 @@ class DecoderState {
   double cutoff_prob_;
   size_t cutoff_top_n_;
-  Scorer* ext_scorer_; // weak
+  std::shared_ptr<Scorer> ext_scorer_;
   std::vector<PathTrie*> prefixes_;
   std::unique_ptr<PathTrie> prefix_root_;
@@ -45,7 +46,7 @@ public:
            size_t beam_size,
            double cutoff_prob,
            size_t cutoff_top_n,
-           Scorer *ext_scorer);
+           std::shared_ptr<Scorer> ext_scorer);
 /* Send data to the decoder
  *
@@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 /* CTC Beam Search Decoder for batch data
  * Parameters:
@@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 #endif // CTC_BEAM_SEARCH_DECODER_H_

View File

@@ -71,8 +71,19 @@ void Scorer::setup_char_map()
 int Scorer::load_lm(const std::string& lm_path)
 {
-  // load language model
+  // Check if file is readable to avoid KenLM throwing an exception
   const char* filename = lm_path.c_str();
+  if (access(filename, R_OK) != 0) {
+    return 1;
+  }
+
+  // Check if the file format is valid to avoid KenLM throwing an exception
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(filename, model_type)) {
+    return 1;
+  }
+
+  // Load the LM
   lm::ngram::Config config;
   config.load_method = util::LoadMethod::LAZY;
   language_model_.reset(lm::ngram::LoadVirtual(filename, config));
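Note: `access(2)` is the POSIX readability probe from `<unistd.h>`, and `lm::ngram::RecognizeBinary` is KenLM's own format check, so both failure modes now surface as a nonzero return code instead of an exception thrown from inside KenLM. A standalone sketch of the same fail-fast guard, with the actual loader call elided:

    #include <unistd.h>  // access(), R_OK

    // Probe the path before handing it to a loader that throws on bad input.
    int load_model(const char* path) {
      if (access(path, R_OK) != 0) {
        return 1;  // missing or unreadable file: fail fast with an error code
      }
      // ... pass the validated path to the real loader here ...
      return 0;
    }

    int main() {
      return load_model("/nonexistent/lm.binary");  // exits 1
    }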
@@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path)
   int magic;
   fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
   if (magic != MAGIC) {
-    std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
-                 "your trie file." << std::endl;
+    std::cerr << "Error: Can't parse scorer file, invalid header. Try updating "
+                 "your scorer file." << std::endl;
     return 1;
   }
   int version;
   fin.read(reinterpret_cast<char*>(&version), sizeof(version));
   if (version != FILE_VERSION) {
-    std::cerr << "Error: Trie file version mismatch (" << version
+    std::cerr << "Error: Scorer file version mismatch (" << version
               << " instead of expected " << FILE_VERSION
               << "). ";
     if (version < FILE_VERSION) {
-      std::cerr << "Update your trie file.";
+      std::cerr << "Update your scorer file.";
     } else {
-      std::cerr << "Downgrade your trie file or update your version of DeepSpeech.";
+      std::cerr << "Downgrade your scorer file or update your version of DeepSpeech.";
     }
     std::cerr << std::endl;
     return 1;
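Note: the magic/version pair is a common binary-header convention: the magic number rejects files of the wrong type, the version field rejects files of the wrong vintage and tells the user which side to upgrade. A self-contained sketch of the pattern, with hypothetical constants rather than the real scorer values:

    #include <cstdint>
    #include <iostream>
    #include <sstream>

    constexpr int32_t MAGIC = 0x5343524E;  // hypothetical magic number
    constexpr int32_t FILE_VERSION = 1;    // hypothetical format version

    int check_header(std::istream& fin) {
      int32_t magic = 0, version = 0;
      fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
      if (magic != MAGIC) {
        std::cerr << "Not a scorer file." << std::endl;  // wrong file type
        return 1;
      }
      fin.read(reinterpret_cast<char*>(&version), sizeof(version));
      if (version != FILE_VERSION) {
        std::cerr << (version < FILE_VERSION ? "Update" : "Downgrade")
                  << " your scorer file." << std::endl;  // wrong vintage
        return 1;
      }
      return 0;
    }

    int main() {
      std::istringstream empty;    // no header at all
      return check_header(empty);  // exits 1: the magic check fails
    }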

View File

@@ -7,9 +7,10 @@
 #include "workspace_status.h"
 %}
-%include "pyabc.i"
-%include "std_string.i"
-%include "std_vector.i"
+%include <pyabc.i>
+%include <std_string.i>
+%include <std_vector.i>
+%include <std_shared_ptr.i>
 %include "numpy.i"
 %init %{
@@ -20,6 +21,8 @@ namespace std {
 %template(StringVector) vector<string>;
 }
+%shared_ptr(Scorer);
+
 // Convert NumPy arrays to pointer+lengths
 %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)};
 %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)};

View File

@@ -319,7 +319,7 @@ int
 DS_DisableExternalScorer(ModelState* aCtx)
 {
   if (aCtx->scorer_) {
-    aCtx->scorer_.reset(nullptr);
+    aCtx->scorer_.reset();
     return DS_ERR_OK;
   }
   return DS_ERR_SCORER_NOT_ENABLED;
@@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx,
                 aCtx->beam_width_,
                 cutoff_prob,
                 cutoff_top_n,
-                aCtx->scorer_.get());
+                aCtx->scorer_);
   *retval = ctx.release();
   return DS_ERR_OK;
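Note: these two edits follow from `scorer_` changing type in `modelstate.h` below. `unique_ptr::reset(nullptr)` compiles because `unique_ptr::reset` takes a pointer argument, but `shared_ptr::reset(nullptr)` does not (the pointee type cannot be deduced from `nullptr`), so the no-argument `reset()` is the way to drop a `shared_ptr` reference. And since the decoder now accepts a `shared_ptr` directly, the `.get()` unwrapping goes away. A compile-checkable sketch:

    #include <memory>

    int main() {
      std::unique_ptr<int> u(new int(1));
      u.reset(nullptr);    // ok: unique_ptr::reset accepts a raw pointer

      auto s = std::make_shared<int>(2);
      // s.reset(nullptr); // would not compile for shared_ptr
      s.reset();           // drops this reference; s becomes empty
      return s ? 1 : 0;    // exits 0: s is empty after reset()
    }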

View File

@@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 * ``output_graph.tflite`` which is the TF Lite model
-* ``kenlm.scorer``, if you want to use the language model ; please
-  be aware that too big language model will make the device run out of memory
+* ``kenlm.scorer``, if you want to use the scorer; please be aware that a
+  scorer that is too big will make the device run out of memory
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
 * ``deepspeech``
 * ``libdeepspeech.so``
 * ``libc++_shared.so``

View File

@@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();

View File

@@ -16,7 +16,7 @@ struct ModelState {
   static constexpr unsigned int BATCH_SIZE = 1;
   Alphabet alphabet_;
-  std::unique_ptr<Scorer> scorer_;
+  std::shared_ptr<Scorer> scorer_;
   unsigned int beam_width_;
   unsigned int n_steps_;
   unsigned int n_context_;

View File

@@ -95,9 +95,9 @@ def main():
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
     parser.add_argument('--lm_alpha', type=float,
-                        help='Language model weight (lm_alpha)')
+                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
     parser.add_argument('--lm_beta', type=float,
-                        help='Word insertion bonus (lm_beta)')
+                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
     parser.add_argument('--extended', required=False, action='store_true',