Address review comments

Reuben Morais 2020-02-05 17:19:53 +01:00
parent 1d3b3a31a1
commit 8dedda7759
13 changed files with 56 additions and 38 deletions

View File

@@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
-2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 For more information on how to build these resources from scratch, see `data/lm/README.md`

View File

@@ -1,8 +1,8 @@
-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 The scorer package was then built using the `generate_package.py` script:
 .. code-block:: bash
+   python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
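Putting the two steps together, regenerating the scorer package from scratch looks roughly like this (a sketch that assumes KenLM's ``lmplz``, ``build_binary`` and ``filter`` are already on ``PATH``, and that the scripts are run from ``data/lm/``):

.. code-block:: bash

   cd data/lm
   # writes lm.binary and librispeech-vocab-500k.txt to the current directory
   python generate_lm.py
   # bundles the LM, the vocabulary and default hyperparameters into kenlm.scorer
   python generate_package.py --alphabet ../alphabet.txt --lm lm.binary \
       --vocab librispeech-vocab-500k.txt \
       --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer

The ``--default_alpha``/``--default_beta`` values are stored inside the package; clients fall back to them whenever ``--lm_alpha``/``--lm_beta`` are not given explicitly.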

View File

@@ -27,7 +27,7 @@ This module should be self-contained:
 - pip install native_client/python/dist/deepspeech*.whl
 - pip install -r requirements_eval_tflite.txt
-Then run with a TF Lite model, LM and a CSV test file
+Then run with a TF Lite model, a scorer and a CSV test file
 '''
 BEAM_WIDTH = 500
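A hypothetical invocation matching that docstring (the flag names here are assumptions, not taken from the script; check the module's argument parser for the real ones):

.. code-block:: bash

   python evaluate_tflite.py --model output_graph.tflite \
       --scorer kenlm.scorer --csv test.csv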

View File

@@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
+            assert alpha is not None, 'alpha parameter is required'
+            assert beta is not None, 'beta parameter is required'
+            assert scorer_path, 'scorer_path parameter is required'
             serialized = alphabet.serialize()
             native_alphabet = swigwrapper.Alphabet()
             err = native_alphabet.deserialize(serialized, len(serialized))
             if err != 0:
-                raise ValueError("Error when deserializing alphabet.")
+                raise ValueError('Error when deserializing alphabet.')
             err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
-                raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+                raise ValueError('Scorer initialization failed with error code {}'.format(err))
             self.reset_params(alpha, beta)
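For illustration, constructing the wrapper directly might look like this (a sketch; the keyword names are taken from the asserts above, and the ``Alphabet`` import path is an assumption about the training-side helpers):

.. code-block:: python

   from ds_ctcdecoder import Scorer
   # assumption: the training code's Alphabet helper; adjust the import to your tree
   from util.text import Alphabet

   alphabet = Alphabet('data/alphabet.txt')
   # keyword arguments mirror the checks above and avoid relying on positional order
   scorer = Scorer(alpha=0.75, beta=1.85,
                   scorer_path='data/lm/kenlm.scorer', alphabet=alphabet)

Constructing ``Scorer()`` with no arguments takes the bare-initialization path and loads nothing.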

View File

@@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet,
                    size_t beam_size,
                    double cutoff_prob,
                    size_t cutoff_top_n,
-                   Scorer *ext_scorer)
+                   std::shared_ptr<Scorer> ext_scorer)
 {
   // assign special ids
   abs_time_step_ = 0;

@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
-  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
+  if (ext_scorer && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);

@@ -58,7 +58,7 @@ DecoderState::next(const double *probs,
   float min_cutoff = -NUM_FLT_INF;
   bool full_beam = false;
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     size_t num_prefixes = std::min(prefixes_.size(), beam_size_);
     std::partial_sort(prefixes_.begin(),
                       prefixes_.begin() + num_prefixes,

@@ -109,7 +109,7 @@ DecoderState::next(const double *probs,
         log_p = log_prob_c + prefix->score;
       }
-      if (ext_scorer_ != nullptr) {
+      if (ext_scorer_) {
         // skip scoring the space in word based LMs
         PathTrie* prefix_to_score;
         if (ext_scorer_->is_utf8_mode()) {

@@ -166,7 +166,7 @@ DecoderState::decode() const
   }
   // score the last word of each prefix that doesn't end with space
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {

@@ -200,7 +200,7 @@ DecoderState::decode() const
     Output output;
     prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps);
     double approx_ctc = scores[prefixes_copy[i]];
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       auto words = ext_scorer_->split_labels_into_scored_units(output.tokens);
       // remove term insertion weight
       approx_ctc -= words.size() * ext_scorer_->beta;

@@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
@@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be positive!");
   VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element");
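The signature change means callers now share ownership of the scorer rather than lending a raw pointer. A minimal sketch of the new calling convention (the ``Scorer::init`` signature is inferred from the Python wrapper above; the cutoff values are illustrative):

.. code-block:: cpp

   #include <memory>
   #include "ctc_beam_search_decoder.h"
   #include "scorer.h"

   Alphabet alphabet;  // assume loaded from data/alphabet.txt elsewhere
   auto scorer = std::make_shared<Scorer>();
   if (scorer->init("kenlm.scorer", alphabet) != 0) {
     // handle scorer load failure
   }

   DecoderState state;
   // DecoderState copies the shared_ptr, keeping the Scorer alive for its lifetime
   state.init(alphabet, /*beam_size=*/500, /*cutoff_prob=*/1.0,
              /*cutoff_top_n=*/40, scorer);

Passing an empty ``std::shared_ptr<Scorer>{}`` disables external scoring, which is exactly what the ``if (ext_scorer_)`` checks above test for.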

View File

@@ -1,6 +1,7 @@
 #ifndef CTC_BEAM_SEARCH_DECODER_H_
 #define CTC_BEAM_SEARCH_DECODER_H_
+#include <memory>
 #include <string>
 #include <vector>

@@ -16,7 +17,7 @@ class DecoderState {
   double cutoff_prob_;
   size_t cutoff_top_n_;
-  Scorer* ext_scorer_; // weak
+  std::shared_ptr<Scorer> ext_scorer_;
   std::vector<PathTrie*> prefixes_;
   std::unique_ptr<PathTrie> prefix_root_;

@@ -45,7 +46,7 @@ public:
            size_t beam_size,
            double cutoff_prob,
            size_t cutoff_top_n,
-           Scorer *ext_scorer);
+           std::shared_ptr<Scorer> ext_scorer);
 /* Send data to the decoder
  *

@@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 /* CTC Beam Search Decoder for batch data
  * Parameters:

@@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 #endif // CTC_BEAM_SEARCH_DECODER_H_

View File

@@ -71,8 +71,19 @@ void Scorer::setup_char_map()
 int Scorer::load_lm(const std::string& lm_path)
 {
-  // load language model
+  // Check if file is readable to avoid KenLM throwing an exception
   const char* filename = lm_path.c_str();
+  if (access(filename, R_OK) != 0) {
+    return 1;
+  }
+
+  // Check if the file format is valid to avoid KenLM throwing an exception
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(filename, model_type)) {
+    return 1;
+  }
+
+  // Load the LM
   lm::ngram::Config config;
   config.load_method = util::LoadMethod::LAZY;
   language_model_.reset(lm::ngram::LoadVirtual(filename, config));

@@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path)
   int magic;
   fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
   if (magic != MAGIC) {
-    std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
-                 "your trie file." << std::endl;
+    std::cerr << "Error: Can't parse scorer file, invalid header. Try updating "
+                 "your scorer file." << std::endl;
     return 1;
   }
   int version;
   fin.read(reinterpret_cast<char*>(&version), sizeof(version));
   if (version != FILE_VERSION) {
-    std::cerr << "Error: Trie file version mismatch (" << version
+    std::cerr << "Error: Scorer file version mismatch (" << version
               << " instead of expected " << FILE_VERSION
               << "). ";
     if (version < FILE_VERSION) {
-      std::cerr << "Update your trie file.";
+      std::cerr << "Update your scorer file.";
     } else {
-      std::cerr << "Downgrade your trie file or update your version of DeepSpeech.";
+      std::cerr << "Downgrade your scorer file or update your version of DeepSpeech.";
     }
     std::cerr << std::endl;
     return 1;

View File

@@ -7,9 +7,10 @@
 #include "workspace_status.h"
 %}
-%include "pyabc.i"
-%include "std_string.i"
-%include "std_vector.i"
+%include <pyabc.i>
+%include <std_string.i>
+%include <std_vector.i>
+%include <std_shared_ptr.i>
 %include "numpy.i"
 %init %{

@@ -20,6 +21,8 @@ namespace std {
   %template(StringVector) vector<string>;
 }
+%shared_ptr(Scorer);
+
 // Convert NumPy arrays to pointer+lengths
 %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)};
 %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)};
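For context, ``std_shared_ptr.i`` plus the ``%shared_ptr(Scorer)`` declaration tell SWIG to marshal every ``std::shared_ptr<Scorer>`` argument as a shared handle, so the Python-side ``Scorer`` object and the C++ decoder genuinely share ownership. The pattern in isolation (a sketch, not the full interface file):

.. code-block:: swig

   %include <std_shared_ptr.i>
   %shared_ptr(Scorer);   // must appear before SWIG parses the class
   %include "scorer.h"    // declarations taking std::shared_ptr<Scorer>
                          // now accept a Python Scorer directly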

View File

@@ -319,7 +319,7 @@ int
 DS_DisableExternalScorer(ModelState* aCtx)
 {
   if (aCtx->scorer_) {
-    aCtx->scorer_.reset(nullptr);
+    aCtx->scorer_.reset();
     return DS_ERR_OK;
   }
   return DS_ERR_SCORER_NOT_ENABLED;

@@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx,
                                      aCtx->beam_width_,
                                      cutoff_prob,
                                      cutoff_top_n,
-                                     aCtx->scorer_.get());
+                                     aCtx->scorer_);
   *retval = ctx.release();
   return DS_ERR_OK;
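Seen from the API surface, toggling the scorer at runtime follows this shape (a sketch; ``DS_EnableExternalScorer`` is the enabling counterpart introduced alongside this changeset, and its exact signature here is an assumption):

.. code-block:: cpp

   // sketch: ctx is a loaded ModelState*; signature of the enable call is assumed
   if (DS_EnableExternalScorer(ctx, "kenlm.scorer") == DS_ERR_OK) {
     // streams created from now on decode with the shared scorer
   }
   DS_DisableExternalScorer(ctx);  // DS_ERR_SCORER_NOT_ENABLED if none was set

Because ``DS_CreateStream`` now hands the ``shared_ptr`` itself to the decoder, disabling the scorer mid-stream no longer leaves a running stream with a dangling raw pointer; that is the point of the ``unique_ptr`` to ``shared_ptr`` change.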

View File

@@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 * ``output_graph.tflite`` which is the TF Lite model
-* ``kenlm.scorer``, if you want to use the language model ; please
-  be aware that too big language model will make the device run out of memory
+* ``kenlm.scorer``, if you want to use the scorer; please be aware that too big
+  scorer will make the device run out of memory
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
 * ``deepspeech``
 * ``libdeepspeech.so``
 * ``libc++_shared.so``
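In practice that push sequence might look like the following (a sketch using plain ``adb``; file names and device paths are the ones listed above):

.. code-block:: bash

   adb push output_graph.tflite kenlm.scorer /sdcard/deepspeech/
   adb push deepspeech libdeepspeech.so libc++_shared.so /data/local/tmp/ds/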

View File

@@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
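A hypothetical end-to-end invocation with those flags (file names and hyperparameter values are illustrative):

.. code-block:: bash

   # omit --lm_alpha/--lm_beta to use the defaults stored in the scorer package
   node client.js --model output_graph.pbmm --scorer kenlm.scorer \
       --audio test.wav --lm_alpha 0.75 --lm_beta 1.85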

View File

@@ -16,7 +16,7 @@ struct ModelState {
   static constexpr unsigned int BATCH_SIZE = 1;
   Alphabet alphabet_;
-  std::unique_ptr<Scorer> scorer_;
+  std::shared_ptr<Scorer> scorer_;
   unsigned int beam_width_;
   unsigned int n_steps_;
   unsigned int n_context_;

View File

@@ -95,9 +95,9 @@ def main():
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
     parser.add_argument('--lm_alpha', type=float,
-                        help='Language model weight (lm_alpha)')
+                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
     parser.add_argument('--lm_beta', type=float,
-                        help='Word insertion bonus (lm_beta)')
+                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
     parser.add_argument('--extended', required=False, action='store_true',