Address review comments
commit 8dedda7759 (parent 1d3b3a31a1)
@@ -5,7 +5,7 @@ This directory contains language-specific data files. Most importantly, you will
 1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt`
 
-2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`, which includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
+2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`.
 
 For more information on how to build these resources from scratch, see `data/lm/README.md`
@@ -1,8 +1,8 @@
-The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate lm.binary in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_\ , using the `generate_lm.py` script (will generate `lm.binary` and `librispeech-vocab-500k.txt` in the folder it is run from). `KenLM <https://github.com/kpu/kenlm>`_'s built binaries must be in your PATH (lmplz, build_binary, filter).
 
 The scorer package was then built using the `generate_package.py` script:
 
 .. code-block:: bash
 
-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
+    python generate_lm.py # this will create lm.binary and librispeech-vocab-500k.txt
+    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --default_alpha 0.75 --default_beta 1.85 --package kenlm.scorer
@@ -27,7 +27,7 @@ This module should be self-contained:
   - pip install native_client/python/dist/deepspeech*.whl
   - pip install -r requirements_eval_tflite.txt
 
-Then run with a TF Lite model, LM and a CSV test file
+Then run with a TF Lite model, a scorer and a CSV test file
 '''
 
 BEAM_WIDTH = 500
@@ -20,16 +20,20 @@ class Scorer(swigwrapper.Scorer):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
             assert alpha is not None, 'alpha parameter is required'
             assert beta is not None, 'beta parameter is required'
             assert scorer_path, 'scorer_path parameter is required'
 
             serialized = alphabet.serialize()
             native_alphabet = swigwrapper.Alphabet()
             err = native_alphabet.deserialize(serialized, len(serialized))
             if err != 0:
-                raise ValueError("Error when deserializing alphabet.")
+                raise ValueError('Error when deserializing alphabet.')
 
             err = self.init(scorer_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
-                raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+                raise ValueError('Scorer initialization failed with error code {}'.format(err))
 
             self.reset_params(alpha, beta)
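The wrapper's pattern here is worth noting: the SWIG-wrapped native methods report failure through integer return codes (0 for success), and the thin Python layer converts nonzero codes into exceptions. A minimal C++ sketch of that error-code contract, with a stub class and hypothetical names rather than the real SWIG-generated API:

    #include <cstdio>
    #include <string>

    // Stub mirroring the error-code contract; not the real native Scorer.
    struct NativeScorer {
      double alpha = 0.0, beta = 0.0;
      // Returns 0 on success, a nonzero error code otherwise.
      int init(const std::string& scorer_path) {
        if (scorer_path.empty()) return 1;  // e.g. unreadable or invalid file
        return 0;
      }
      void reset_params(double a, double b) { alpha = a; beta = b; }
    };

    int main() {
      NativeScorer scorer;
      int err = scorer.init("kenlm.scorer");
      if (err != 0) {
        // The Python wrapper raises ValueError at this point instead.
        std::fprintf(stderr, "Scorer initialization failed with error code %d\n", err);
        return err;
      }
      scorer.reset_params(0.75, 1.85);  // the defaults baked in by generate_package.py
      return 0;
    }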
@@ -18,7 +18,7 @@ DecoderState::init(const Alphabet& alphabet,
                    size_t beam_size,
                    double cutoff_prob,
                    size_t cutoff_top_n,
-                   Scorer *ext_scorer)
+                   std::shared_ptr<Scorer> ext_scorer)
 {
   // assign special ids
   abs_time_step_ = 0;
@@ -36,7 +36,7 @@ DecoderState::init(const Alphabet& alphabet,
   prefix_root_.reset(root);
   prefixes_.push_back(root);
 
-  if (ext_scorer != nullptr && (bool)(ext_scorer_->dictionary)) {
+  if (ext_scorer && (bool)(ext_scorer_->dictionary)) {
     // no need for std::make_shared<>() since Copy() does 'new' behind the doors
     auto dict_ptr = std::shared_ptr<PathTrie::FstType>(ext_scorer->dictionary->Copy(true));
     root->set_dictionary(dict_ptr);
@@ -58,7 +58,7 @@ DecoderState::next(const double *probs,
 
   float min_cutoff = -NUM_FLT_INF;
   bool full_beam = false;
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     size_t num_prefixes = std::min(prefixes_.size(), beam_size_);
     std::partial_sort(prefixes_.begin(),
                       prefixes_.begin() + num_prefixes,
@@ -109,7 +109,7 @@ DecoderState::next(const double *probs,
         log_p = log_prob_c + prefix->score;
       }
 
-      if (ext_scorer_ != nullptr) {
+      if (ext_scorer_) {
         // skip scoring the space in word based LMs
         PathTrie* prefix_to_score;
         if (ext_scorer_->is_utf8_mode()) {
@@ -166,7 +166,7 @@ DecoderState::decode() const
   }
 
   // score the last word of each prefix that doesn't end with space
-  if (ext_scorer_ != nullptr) {
+  if (ext_scorer_) {
     for (size_t i = 0; i < beam_size_ && i < prefixes_copy.size(); ++i) {
       auto prefix = prefixes_copy[i];
       if (!ext_scorer_->is_scoring_boundary(prefix->parent, prefix->character)) {
@@ -200,7 +200,7 @@ DecoderState::decode() const
     Output output;
     prefixes_copy[i]->get_path_vec(output.tokens, output.timesteps);
     double approx_ctc = scores[prefixes_copy[i]];
-    if (ext_scorer_ != nullptr) {
+    if (ext_scorer_) {
       auto words = ext_scorer_->split_labels_into_scored_units(output.tokens);
       // remove term insertion weight
       approx_ctc -= words.size() * ext_scorer_->beta;
@@ -222,7 +222,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   DecoderState state;
   state.init(alphabet, beam_size, cutoff_prob, cutoff_top_n, ext_scorer);
@@ -243,7 +243,7 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer)
+    std::shared_ptr<Scorer> ext_scorer)
 {
   VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   VALID_CHECK_EQ(batch_size, seq_lengths_size, "must have one sequence length per batch element");
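The signature change from `Scorer *ext_scorer` to `std::shared_ptr<Scorer> ext_scorer` in the hunks above means a decode in flight co-owns the scorer instead of borrowing it, so releasing the scorer elsewhere can no longer leave the decoder with a dangling pointer. It also explains the `ext_scorer != nullptr` to `ext_scorer` simplifications: `shared_ptr` has an explicit boolean conversion, so the plain test is idiomatic. A minimal sketch of the lifetime guarantee, using stub types rather than the real DeepSpeech classes:

    #include <cassert>
    #include <memory>

    struct Scorer { double alpha = 0.75, beta = 1.85; };  // stub

    struct DecoderState {  // stub, not the real class
      std::shared_ptr<Scorer> ext_scorer_;
      void init(std::shared_ptr<Scorer> ext_scorer) {
        ext_scorer_ = std::move(ext_scorer);  // decoder now co-owns the scorer
      }
    };

    int main() {
      auto scorer = std::make_shared<Scorer>();
      DecoderState state;
      state.init(scorer);

      scorer.reset();  // the original owner drops its reference
      assert(state.ext_scorer_ != nullptr);  // the decode still holds a live scorer
      return 0;
    }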
@@ -1,6 +1,7 @@
 #ifndef CTC_BEAM_SEARCH_DECODER_H_
 #define CTC_BEAM_SEARCH_DECODER_H_
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -16,7 +17,7 @@ class DecoderState {
   double cutoff_prob_;
   size_t cutoff_top_n_;
 
-  Scorer* ext_scorer_; // weak
+  std::shared_ptr<Scorer> ext_scorer_;
   std::vector<PathTrie*> prefixes_;
   std::unique_ptr<PathTrie> prefix_root_;
 
@@ -45,7 +46,7 @@ public:
             size_t beam_size,
             double cutoff_prob,
             size_t cutoff_top_n,
-            Scorer *ext_scorer);
+            std::shared_ptr<Scorer> ext_scorer);
 
   /* Send data to the decoder
    *
@@ -95,7 +96,7 @@ std::vector<Output> ctc_beam_search_decoder(
     size_t beam_size,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 /* CTC Beam Search Decoder for batch data
  * Parameters:
@@ -126,6 +127,6 @@ ctc_beam_search_decoder_batch(
     size_t num_processes,
     double cutoff_prob,
     size_t cutoff_top_n,
-    Scorer *ext_scorer);
+    std::shared_ptr<Scorer> ext_scorer);
 
 #endif // CTC_BEAM_SEARCH_DECODER_H_
@@ -71,8 +71,19 @@ void Scorer::setup_char_map()
 
 int Scorer::load_lm(const std::string& lm_path)
 {
-  // load language model
+  // Check if file is readable to avoid KenLM throwing an exception
+  const char* filename = lm_path.c_str();
+  if (access(filename, R_OK) != 0) {
+    return 1;
+  }
+
+  // Check if the file format is valid to avoid KenLM throwing an exception
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(filename, model_type)) {
+    return 1;
+  }
+
+  // Load the LM
   lm::ngram::Config config;
   config.load_method = util::LoadMethod::LAZY;
   language_model_.reset(lm::ngram::LoadVirtual(filename, config));
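The two new checks exist because KenLM reports unreadable or unrecognized files by throwing C++ exceptions, while `load_lm` is expected to fail with an error code. Probing the file with `access()` and `RecognizeBinary()` first keeps those exceptions from escaping. A reduced sketch of the guard pattern, with a hypothetical `load_model_file` and the KenLM calls elided:

    #include <string>
    #include <unistd.h>  // access()

    // Hypothetical reduced version; the real code hands the vetted path to
    // lm::ngram::RecognizeBinary() and lm::ngram::LoadVirtual().
    int load_model_file(const std::string& path) {
      // Check readability up front and return an error code rather than
      // letting the parser throw across an error-code-based boundary.
      if (access(path.c_str(), R_OK) != 0) {
        return 1;
      }
      // ... format check and the actual load would follow here ...
      return 0;
    }

    int main() {
      return load_model_file("kenlm.scorer");  // nonzero exit if unreadable
    }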
@@ -100,21 +111,21 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path)
   int magic;
   fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
   if (magic != MAGIC) {
-    std::cerr << "Error: Can't parse trie file, invalid header. Try updating "
-                 "your trie file." << std::endl;
+    std::cerr << "Error: Can't parse scorer file, invalid header. Try updating "
+                 "your scorer file." << std::endl;
     return 1;
   }
 
   int version;
   fin.read(reinterpret_cast<char*>(&version), sizeof(version));
   if (version != FILE_VERSION) {
-    std::cerr << "Error: Trie file version mismatch (" << version
+    std::cerr << "Error: Scorer file version mismatch (" << version
               << " instead of expected " << FILE_VERSION
               << "). ";
     if (version < FILE_VERSION) {
-      std::cerr << "Update your trie file.";
+      std::cerr << "Update your scorer file.";
     } else {
-      std::cerr << "Downgrade your trie file or update your version of DeepSpeech.";
+      std::cerr << "Downgrade your scorer file or update your version of DeepSpeech.";
     }
     std::cerr << std::endl;
     return 1;
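The reworded messages sit on top of an unchanged file-format check: a magic number rejects files of the wrong type outright, and a version field distinguishes "file too old" from "software too old". A standalone sketch of that header-check pattern, with placeholder MAGIC and FILE_VERSION values rather than DeepSpeech's real constants:

    #include <fstream>
    #include <iostream>

    constexpr int MAGIC = 0x44534352;  // placeholder tag, not the real value
    constexpr int FILE_VERSION = 1;    // placeholder version

    // Returns true if the stream starts with the expected magic and version.
    bool check_header(std::istream& fin) {
      int magic = 0;
      fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
      if (!fin || magic != MAGIC) {
        std::cerr << "Invalid header: not a scorer file." << std::endl;
        return false;
      }
      int version = 0;
      fin.read(reinterpret_cast<char*>(&version), sizeof(version));
      if (!fin || version != FILE_VERSION) {
        // Mismatches cut both ways: an old file needs regenerating,
        // a new file needs newer software.
        std::cerr << (version < FILE_VERSION ? "Update your scorer file."
                                             : "Update your software.")
                  << std::endl;
        return false;
      }
      return true;
    }

    int main() {
      std::ifstream fin("kenlm.scorer", std::ios::binary);
      return fin && check_header(fin) ? 0 : 1;
    }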
@@ -7,9 +7,10 @@
 #include "workspace_status.h"
 %}
 
-%include "pyabc.i"
-%include "std_string.i"
-%include "std_vector.i"
+%include <pyabc.i>
+%include <std_string.i>
+%include <std_vector.i>
+%include <std_shared_ptr.i>
 %include "numpy.i"
 
 %init %{
@@ -20,6 +21,8 @@ namespace std {
   %template(StringVector) vector<string>;
 }
 
+%shared_ptr(Scorer);
+
 // Convert NumPy arrays to pointer+lengths
 %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)};
 %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)};
@@ -319,7 +319,7 @@ int
 DS_DisableExternalScorer(ModelState* aCtx)
 {
   if (aCtx->scorer_) {
-    aCtx->scorer_.reset(nullptr);
+    aCtx->scorer_.reset();
     return DS_ERR_OK;
   }
   return DS_ERR_SCORER_NOT_ENABLED;
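This one-line change is forced by the member's type change from `std::unique_ptr<Scorer>` to `std::shared_ptr<Scorer>` (see the `ModelState` hunk further down): `unique_ptr::reset` accepts a pointer argument, but `shared_ptr`'s templated `reset(Y*)` cannot deduce `Y` from a bare `nullptr`, so the no-argument overload is the way to drop ownership. A small sketch of the difference:

    #include <memory>

    struct Scorer {};  // stub

    int main() {
      std::unique_ptr<Scorer> owned(new Scorer());
      owned.reset(nullptr);  // fine: unique_ptr::reset takes its pointer type

      std::shared_ptr<Scorer> shared = std::make_shared<Scorer>();
      // shared.reset(nullptr);  // ill-formed: reset(Y*) cannot deduce Y from nullptr
      shared.reset();  // the no-argument overload releases this reference
      return (owned == nullptr && shared == nullptr) ? 0 : 1;
    }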
@@ -363,7 +363,7 @@ DS_CreateStream(ModelState* aCtx,
                  aCtx->beam_width_,
                  cutoff_prob,
                  cutoff_top_n,
-                 aCtx->scorer_.get());
+                 aCtx->scorer_);
 
   *retval = ctx.release();
   return DS_ERR_OK;
@@ -51,12 +51,11 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
 
 * ``output_graph.tflite`` which is the TF Lite model
-* ``kenlm.scorer``, if you want to use the language model ; please
-  be aware that too big language model will make the device run out of memory
+* ``kenlm.scorer``, if you want to use the scorer; please be aware that too big
+  a scorer will make the device run out of memory
 
 Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ :
 
 * ``deepspeech``
 * ``libdeepspeech.so``
 * ``libc++_shared.so``
@@ -32,8 +32,8 @@ parser.addArgument(['--model'], {required: true, help: 'Path to the model (proto
 parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
@@ -16,7 +16,7 @@ struct ModelState {
   static constexpr unsigned int BATCH_SIZE = 1;
 
   Alphabet alphabet_;
-  std::unique_ptr<Scorer> scorer_;
+  std::shared_ptr<Scorer> scorer_;
   unsigned int beam_width_;
   unsigned int n_steps_;
   unsigned int n_context_;
@@ -95,9 +95,9 @@ def main():
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
     parser.add_argument('--lm_alpha', type=float,
-                        help='Language model weight (lm_alpha)')
+                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
     parser.add_argument('--lm_beta', type=float,
-                        help='Word insertion bonus (lm_beta)')
+                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
     parser.add_argument('--extended', required=False, action='store_true',