Update all API consumers
parent 708b21a63e
commit 1e2eb96248
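This commit moves every API consumer off the old two-file decoder setup (KenLM binary + trie, enabled via DS_EnableDecoderWithLM) and onto the single external scorer package (DS_EnableExternalScorer, DS_SetScorerAlphaBeta, DS_DisableExternalScorer). The hunks below apply that rename across the training code, Dockerfile, smoke tests, docs, and the C, C#, Java, Node.js, and Python clients. As orientation, a minimal before/after sketch using the Python bindings touched below (file names here are placeholders, not part of the commit):

    import wave
    import numpy as np
    from deepspeech import Model

    model = Model('output_graph.pbmm', 500)  # model path, beam width

    # Old API, removed by this commit:
    #   model.enableDecoderWithLM('lm.binary', 'trie', 0.75, 1.85)

    # New API: a single scorer package replaces the LM binary + trie pair.
    model.enableExternalScorer('kenlm.scorer')
    model.setScorerAlphaBeta(0.75, 1.85)  # optional; the scorer carries defaults

    with wave.open('audio.wav', 'rb') as w:
        audio = np.frombuffer(w.readframes(w.getnframes()), np.int16)
    print(model.stt(audio))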
@@ -882,8 +882,7 @@ def package_zip():
      }
  }, f)

- shutil.copy(FLAGS.lm_binary_path, export_dir)
- shutil.copy(FLAGS.lm_trie_path, export_dir)
+ shutil.copy(FLAGS.scorer_path, export_dir)

  archive = shutil.make_archive(zip_filename, 'zip', export_dir)
  log_info('Exported packaged model {}'.format(archive))
@@ -926,10 +925,9 @@ def do_single_file_inference(input_file_path):

  logits = np.squeeze(logits)

- if FLAGS.lm_binary_path:
+ if FLAGS.scorer_path:
      scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                     FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                     Config.alphabet)
+                     FLAGS.scorer_path, Config.alphabet)
  else:
      scorer = None
  decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
@@ -172,7 +172,7 @@ RUN ./configure


  # Build DeepSpeech
- RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}

  ###
  ### Using TensorFlow upstream should work
@@ -187,8 +187,7 @@ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_s
  # RUN pip3 install /tmp/tensorflow_pkg/*.whl

  # Copy built libs to /DeepSpeech/native_client
- RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
-     && cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
+ RUN cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/

  # Install TensorFlow
  WORKDIR /DeepSpeech/
@@ -21,8 +21,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
  --n_hidden 100 --epochs 1 \
  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
  --learning_rate 0.001 --dropout_rate 0.05 \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log
+ --scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log

  if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then
  echo "Did not resume training from checkpoint"
@@ -25,6 +25,5 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
  --n_hidden 100 --epochs $epoch_count \
  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
  --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie' \
+ --scorer_path 'data/smoke_test/pruned_lm.scorer' \
  --audio_sample_rate ${audio_sample_rate}
@@ -21,12 +21,10 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
  --n_hidden 100 --epochs 1 \
  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
  --learning_rate 0.001 --dropout_rate 0.05 \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie'
+ --scorer_path 'data/smoke_test/pruned_lm.scorer'

  python -u DeepSpeech.py \
  --n_hidden 100 \
  --checkpoint_dir '/tmp/ckpt' \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie' \
+ --scorer_path 'data/smoke_test/pruned_lm.scorer' \
  --one_shot_infer 'data/smoke_test/LDC93S1.wav'
@@ -20,8 +20,7 @@ python -u DeepSpeech.py --noshow_progressbar \
  --n_hidden 100 \
  --checkpoint_dir '/tmp/ckpt' \
  --export_dir '/tmp/train_tflite' \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie' \
+ --scorer_path 'data/smoke_test/pruned_lm.scorer' \
  --audio_sample_rate ${audio_sample_rate} \
  --export_tflite
@@ -31,8 +30,7 @@ python -u DeepSpeech.py --noshow_progressbar \
  --n_hidden 100 \
  --checkpoint_dir '/tmp/ckpt' \
  --export_dir '/tmp/train_tflite/en-us' \
- --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
- --lm_trie_path 'data/smoke_test/vocab.trie' \
+ --scorer_path 'data/smoke_test/pruned_lm.scorer' \
  --audio_sample_rate ${audio_sample_rate} \
  --export_language 'Fake English (fk-FK)' \
  --export_zip
@@ -50,7 +50,7 @@ def create_bundle(alphabet_path, lm_path, vocab_path, package_path, force_utf8,
  scorer.set_alphabet(alphabet)
  scorer.set_utf8_mode(use_utf8)
  scorer.reset_params(default_alpha, default_beta)
- scorer.load_lm(lm_path, "")
+ scorer.load_lm(lm_path)
  scorer.fill_dictionary(list(words))
  shutil.copy(lm_path, package_path)
  scorer.save_dictionary(package_path, True)  # append, not overwrite
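The packaging flow above is worth spelling out: the scorer file is simply the KenLM binary copied verbatim with the decoder's vocabulary dictionary appended to it. A hedged sketch of just that step, using the same names as the hunk above:

    import shutil

    def package_scorer(scorer, lm_path, package_path):
        # The package starts as a copy of the KenLM binary...
        shutil.copy(lm_path, package_path)
        # ...and the dictionary is appended to the same file (True = append).
        scorer.save_dictionary(package_path, True)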
@@ -7,7 +7,13 @@ C
  .. doxygenfunction:: DS_FreeModel
     :project: deepspeech-c

- .. doxygenfunction:: DS_EnableDecoderWithLM
+ .. doxygenfunction:: DS_EnableExternalScorer
     :project: deepspeech-c

+ .. doxygenfunction:: DS_DisableExternalScorer
+    :project: deepspeech-c
+
+ .. doxygenfunction:: DS_SetScorerAlphaBeta
+    :project: deepspeech-c
+
  .. doxygenfunction:: DS_GetModelSampleRate
@@ -42,10 +42,9 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):


  def evaluate(test_csvs, create_model, try_loading):
-     if FLAGS.lm_binary_path:
+     if FLAGS.scorer_path:
          scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                         FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                         Config.alphabet)
+                         FLAGS.scorer_path, Config.alphabet)
      else:
          scorer = None

@@ -27,17 +27,18 @@ This module should be self-contained:
  - pip install native_client/python/dist/deepspeech*.whl
  - pip install -r requirements_eval_tflite.txt

- Then run with a TF Lite model, LM/trie and a CSV test file
+ Then run with a TF Lite model, LM and a CSV test file
  '''

  BEAM_WIDTH = 500
  LM_ALPHA = 0.75
  LM_BETA = 1.85

- def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
+ def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
      os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
      ds = Model(model, BEAM_WIDTH)
-     ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
+     ds.enableExternalScorer(scorer)
+     ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA)

      while True:
          try:
@@ -64,7 +65,7 @@ def main(args, _):

  processes = []
  for i in range(args.proc):
-     worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
+     worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
      worker_process.start() # Launch reader() as a separate python process
      processes.append(worker_process)

@@ -113,10 +114,8 @@ def parse_args():
  parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
  parser.add_argument('--model', required=True,
                      help='Path to the model (protocol buffer binary file)')
- parser.add_argument('--lm', required=True,
-                     help='Path to the language model binary file')
- parser.add_argument('--trie', required=True,
-                     help='Path to the language model trie file created with native_client/generate_trie')
+ parser.add_argument('--scorer', required=True,
+                     help='Path to the external scorer file')
  parser.add_argument('--csv', required=True,
                      help='Path to the CSV source file')
  parser.add_argument('--proc', required=False, default=cpu_count(), type=int,
@@ -12,19 +12,17 @@

  char* model = NULL;

- char* lm = NULL;
-
- char* trie = NULL;
+ char* scorer = NULL;

  char* audio = NULL;

  int beam_width = 500;

- float lm_alpha = 0.75f;
+ bool set_alphabeta = false;

- float lm_beta = 1.85f;
+ float lm_alpha = 0.f;

- bool load_without_trie = false;
+ float lm_beta = 0.f;

  bool show_times = false;

@@ -39,39 +37,36 @@ int stream_size = 0;
  void PrintHelp(const char* bin)
  {
      std::cout <<
-     "Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
+     "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n"
      "\n"
      "Running DeepSpeech inference.\n"
      "\n"
-     " --model MODEL Path to the model (protocol buffer binary file)\n"
-     " --lm LM Path to the language model binary file\n"
-     " --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
-     " --audio AUDIO Path to the audio file to run (WAV format)\n"
-     " --beam_width BEAM_WIDTH Value for decoder beam width (int)\n"
-     " --lm_alpha LM_ALPHA Value for language model alpha param (float)\n"
-     " --lm_beta LM_BETA Value for language model beta param (float)\n"
-     " -t Run in benchmark mode, output mfcc & inference time\n"
-     " --extended Output string from extended metadata\n"
-     " --json Extended output, shows word timings as JSON\n"
-     " --stream size Run in stream mode, output intermediate results\n"
-     " --help Show help\n"
-     " --version Print version and exits\n";
+     "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
+     "\t--scorer SCORER\t\tPath to the external scorer file\n"
+     "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
+     "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
+     "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
+     "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
+     "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+     "\t--extended\t\tOutput string from extended metadata\n"
+     "\t--json\t\t\tExtended output, shows word timings as JSON\n"
+     "\t--stream size\t\tRun in stream mode, output intermediate results\n"
+     "\t--help\t\t\tShow help\n"
+     "\t--version\t\tPrint version and exits\n";
      DS_PrintVersions();
      exit(1);
  }

  bool ProcessArgs(int argc, char** argv)
  {
-     const char* const short_opts = "m:a:l:r:w:c:d:b:tehv";
+     const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
      const option long_opts[] = {
          {"model", required_argument, nullptr, 'm'},
-         {"lm", required_argument, nullptr, 'l'},
-         {"trie", required_argument, nullptr, 'r'},
+         {"scorer", required_argument, nullptr, 'l'},
          {"audio", required_argument, nullptr, 'w'},
          {"beam_width", required_argument, nullptr, 'b'},
          {"lm_alpha", required_argument, nullptr, 'c'},
          {"lm_beta", required_argument, nullptr, 'd'},
-         {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
          {"t", no_argument, nullptr, 't'},
          {"extended", no_argument, nullptr, 'e'},
          {"json", no_argument, nullptr, 'j'},
@@ -95,31 +90,25 @@ bool ProcessArgs(int argc, char** argv)
      break;

  case 'l':
-     lm = optarg;
-     break;
-
- case 'r':
-     trie = optarg;
+     scorer = optarg;
      break;

  case 'w':
      audio = optarg;
      break;

- case 'b':
-     beam_width = atoi(optarg);
-     break;
-
- case 'c':
-     lm_alpha = atof(optarg);
-     break;
-
- case 'd':
-     lm_beta = atof(optarg);
-     break;
+ case 'b':
+     beam_width = atoi(optarg);
+     break;
+
+ case 'c':
+     set_alphabeta = true;
+     lm_alpha = atof(optarg);
+     break;

- case 999:
-     load_without_trie = true;
+ case 'd':
+     set_alphabeta = true;
+     lm_beta = atof(optarg);
      break;

  case 't':
@@ -374,16 +374,19 @@ main(int argc, char **argv)
      return 1;
  }

- if (lm && (trie || load_without_trie)) {
-     int status = DS_EnableDecoderWithLM(ctx,
-                                         lm,
-                                         trie,
-                                         lm_alpha,
-                                         lm_beta);
+ if (scorer) {
+     int status = DS_EnableExternalScorer(ctx, scorer);
      if (status != 0) {
-         fprintf(stderr, "Could not enable CTC decoder with LM.\n");
+         fprintf(stderr, "Could not enable external scorer.\n");
          return 1;
      }
+     if (set_alphabeta) {
+         status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta);
+         if (status != 0) {
+             fprintf(stderr, "Error setting scorer alpha and beta.\n");
+             return 1;
+         }
+     }
  }

  #ifndef NO_SOX
@@ -12,12 +12,11 @@ class Scorer(swigwrapper.Scorer):
      :type alpha: float
      :param beta: Word insertion bonus.
      :type beta: float
-     :model_path: Path to load language model.
-     :trie_path: Path to trie file.
+     :model_path: Path to load scorer.
      :alphabet: Alphabet
      :type model_path: basestring
      """
-     def __init__(self, alpha=None, beta=None, model_path=None, trie_path=None, alphabet=None):
+     def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
          super(Scorer, self).__init__()
          # Allow bare initialization
          if alphabet:
@@ -27,15 +26,15 @@ class Scorer(swigwrapper.Scorer):
          if err != 0:
              raise ValueError("Error when deserializing alphabet.")

-         err = self.init(alpha, beta,
-                         model_path.encode('utf-8'),
-                         trie_path.encode('utf-8'),
+         err = self.init(model_path.encode('utf-8'),
                          native_alphabet)
          if err != 0:
              raise ValueError("Scorer initialization failed with error code {}".format(err), err)
+         self.reset_params(alpha, beta)

-     def load_lm(self, lm_path, trie_path):
-         super(Scorer, self).load_lm(lm_path.encode('utf-8'), trie_path.encode('utf-8'))
+     def load_lm(self, lm_path):
+         super(Scorer, self).load_lm(lm_path.encode('utf-8'))

      def save_dictionary(self, save_path, *args, **kwargs):
          super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs)
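For training and evaluation code, the updated ds_ctcdecoder binding above takes one scorer package; note the keyword is still named model_path. A minimal sketch, assuming an alphabet object is already loaded (e.g. Config.alphabet in the training code) and a hypothetical scorer path:

    from ds_ctcdecoder import Scorer

    scorer = Scorer(alpha=0.75, beta=1.85,
                    model_path='kenlm.scorer',  # single package, no trie_path
                    alphabet=alphabet)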
@@ -6,7 +6,6 @@
  #include <unordered_map>
  #include <vector>

- #include "lm/enumerate_vocab.hh"
  #include "lm/virtual_interface.hh"
  #include "lm/word_index.hh"
  #include "util/string_piece.hh"
@@ -19,18 +18,6 @@ const std::string START_TOKEN = "<s>";
  const std::string UNK_TOKEN = "<unk>";
  const std::string END_TOKEN = "</s>";

- // Implement a callback to retrieve the dictionary of language model.
- class RetrieveStrEnumerateVocab : public lm::EnumerateVocab {
- public:
-     RetrieveStrEnumerateVocab() {}
-
-     void Add(lm::WordIndex index, const StringPiece &str) {
-         vocabulary.push_back(std::string(str.data(), str.length()));
-     }
-
-     std::vector<std::string> vocabulary;
- };
-
  /* External scorer to query score for n-gram or sentence, including language
   * model scoring and word insertion.
   *
@@ -310,7 +310,7 @@ DS_EnableExternalScorer(ModelState* aCtx,
  aCtx->scorer_.reset(new Scorer());
  int err = aCtx->scorer_->init(aScorerPath, aCtx->alphabet_);
  if (err != 0) {
-     return DS_ERR_INVALID_LM;
+     return DS_ERR_INVALID_SCORER;
  }
  return DS_ERR_OK;
  }
@@ -59,7 +59,7 @@ enum DeepSpeech_Error_Codes
  // Invalid parameters
  DS_ERR_INVALID_ALPHABET = 0x2000,
  DS_ERR_INVALID_SHAPE = 0x2001,
- DS_ERR_INVALID_LM = 0x2002,
+ DS_ERR_INVALID_SCORER = 0x2002,
  DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
  DS_ERR_SCORER_NOT_ENABLED = 0x2004,

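The renamed error code keeps the old 0x2002 value, so old and new binaries disagree only on the symbol name. Through the Python bindings, a failed scorer load surfaces as a non-zero return from enableExternalScorer; a hedged sketch of checking it (path hypothetical):

    err = model.enableExternalScorer('kenlm.scorer')
    if err != 0:  # e.g. DS_ERR_INVALID_SCORER (0x2002)
        raise RuntimeError('Enabling external scorer failed: 0x{:X}'.format(err))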
@@ -129,7 +129,7 @@ DEEPSPEECH_EXPORT
  int DS_DisableExternalScorer(ModelState* aCtx);

  /**
-  * @brief Set hyperparameters alpha and beta of a KenLM external scorer.
+  * @brief Set hyperparameters alpha and beta of the external scorer.
   *
   * @param aCtx The ModelState pointer for the model being changed.
   * @param aAlpha The alpha hyperparameter of the decoder. Language model weight.
@@ -1,141 +0,0 @@
- #ifndef DEEPSPEECH_COMPAT_H
- #define DEEPSPEECH_COMPAT_H
-
- #include "deepspeech.h"
-
- #warning This header is a convenience wrapper for compatibility with \
-     the previous API, it has deprecated function names and arguments. \
-     If possible, update your code instead of using this header.
-
- /**
-  * @brief An object providing an interface to a trained DeepSpeech model.
-  *
-  * @param aModelPath The path to the frozen model graph.
-  * @param aNCep UNUSED, DEPRECATED.
-  * @param aNContext UNUSED, DEPRECATED.
-  * @param aAlphabetConfigPath UNUSED, DEPRECATED.
-  * @param aBeamWidth The beam width used by the decoder. A larger beam
-  *                   width generates better results at the cost of decoding
-  *                   time.
-  * @param[out] retval a ModelState pointer
-  *
-  * @return Zero on success, non-zero on failure.
-  */
- int DS_CreateModel(const char* aModelPath,
-                    unsigned int /*aNCep*/,
-                    unsigned int /*aNContext*/,
-                    const char* /*aAlphabetConfigPath*/,
-                    unsigned int aBeamWidth,
-                    ModelState** retval)
- {
-     return DS_CreateModel(aModelPath, aBeamWidth, retval);
- }
-
- /**
-  * @brief Frees associated resources and destroys model object.
-  */
- void DS_DestroyModel(ModelState* ctx)
- {
-     return DS_FreeModel(ctx);
- }
-
- /**
-  * @brief Enable decoding using beam scoring with a KenLM language model.
-  *
-  * @param aCtx The ModelState pointer for the model being changed.
-  * @param aAlphabetConfigPath UNUSED, DEPRECATED.
-  * @param aLMPath The path to the language model binary file.
-  * @param aTriePath The path to the trie file build from the same vocabu-
-  *                  lary as the language model binary.
-  * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
-  *                 weight.
-  * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
-  *                weight.
-  *
-  * @return Zero on success, non-zero on failure (invalid arguments).
-  */
- int DS_EnableDecoderWithLM(ModelState* aCtx,
-                            const char* /*aAlphabetConfigPath*/,
-                            const char* aLMPath,
-                            const char* aTriePath,
-                            float aLMAlpha,
-                            float aLMBeta)
- {
-     return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
- }
-
- /**
-  * @brief Create a new streaming inference state. The streaming state returned
-  *        by this function can then be passed to {@link DS_FeedAudioContent()}
-  *        and {@link DS_FinishStream()}.
-  *
-  * @param aCtx The ModelState pointer for the model to use.
-  * @param aSampleRate UNUSED, DEPRECATED.
-  * @param[out] retval an opaque pointer that represents the streaming state. Can
-  *                    be NULL if an error occurs.
-  *
-  * @return Zero for success, non-zero on failure.
-  */
- int DS_SetupStream(ModelState* aCtx,
-                    unsigned int /*aSampleRate*/,
-                    StreamingState** retval)
- {
-     return DS_CreateStream(aCtx, retval);
- }
-
- /**
-  * @brief Destroy a streaming state without decoding the computed logits. This
-  *        can be used if you no longer need the result of an ongoing streaming
-  *        inference and don't want to perform a costly decode operation.
-  *
-  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
-  *
-  * @note This method will free the state pointer (@p aSctx).
-  */
- void DS_DiscardStream(StreamingState* aSctx)
- {
-     return DS_FreeStream(aSctx);
- }
-
- /**
-  * @brief Use the DeepSpeech model to perform Speech-To-Text.
-  *
-  * @param aCtx The ModelState pointer for the model to use.
-  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
-  *                sample rate (matching what the model was trained on).
-  * @param aBufferSize The number of samples in the audio signal.
-  * @param aSampleRate UNUSED, DEPRECATED.
-  *
-  * @return The STT result. The user is responsible for freeing the string using
-  *         {@link DS_FreeString()}. Returns NULL on error.
-  */
- char* DS_SpeechToText(ModelState* aCtx,
-                       const short* aBuffer,
-                       unsigned int aBufferSize,
-                       unsigned int /*aSampleRate*/)
- {
-     return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
- }
-
- /**
-  * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
-  *        about the results.
-  *
-  * @param aCtx The ModelState pointer for the model to use.
-  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
-  *                sample rate (matching what the model was trained on).
-  * @param aBufferSize The number of samples in the audio signal.
-  * @param aSampleRate UNUSED, DEPRECATED.
-  *
-  * @return Outputs a struct of individual letters along with their timing information.
-  *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
-  */
- Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                       const short* aBuffer,
-                                       unsigned int aBufferSize,
-                                       unsigned int /*aSampleRate*/)
- {
-     return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
- }
-
- #endif /* DEEPSPEECH_COMPAT_H */
@@ -82,8 +82,8 @@ namespace DeepSpeechClient
  throw new ArgumentException("Invalid alphabet embedded in model. (Data corruption?)");
  case ErrorCodes.DS_ERR_INVALID_SHAPE:
      throw new ArgumentException("Invalid model shape.");
- case ErrorCodes.DS_ERR_INVALID_LM:
-     throw new ArgumentException("Invalid language model file.");
+ case ErrorCodes.DS_ERR_INVALID_SCORER:
+     throw new ArgumentException("Invalid scorer file.");
  case ErrorCodes.DS_ERR_FAIL_INIT_MMAP:
      throw new ArgumentException("Failed to initialize memory mapped model.");
  case ErrorCodes.DS_ERR_FAIL_INIT_SESS:
@@ -100,6 +100,8 @@ namespace DeepSpeechClient
  throw new ArgumentException("Error failed to create session.");
  case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE:
      throw new ArgumentException("Error incompatible model.");
+ case ErrorCodes.DS_ERR_SCORER_NOT_ENABLED:
+     throw new ArgumentException("External scorer is not enabled.");
  default:
      throw new ArgumentException("Unknown error, please make sure you are using the correct native binary.");
  }
@@ -114,45 +116,48 @@ namespace DeepSpeechClient
  }

  /// <summary>
- /// Enable decoding using beam scoring with a KenLM language model.
+ /// Enable decoding using an external scorer.
  /// </summary>
- /// <param name="aLMPath">The path to the language model binary file.</param>
- /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
- /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
- /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
- /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
- /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
- public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
-     float aLMAlpha, float aLMBeta)
+ /// <param name="aScorerPath">The path to the external scorer file.</param>
+ /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+ /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+ public unsafe void EnableExternalScorer(string aScorerPath)
  {
-     string exceptionMessage = null;
-     if (string.IsNullOrWhiteSpace(aLMPath))
+     if (string.IsNullOrWhiteSpace(aScorerPath))
      {
-         exceptionMessage = "Path to the language model file cannot be empty.";
+         throw new FileNotFoundException("Path to the scorer file cannot be empty.");
      }
-     if (!File.Exists(aLMPath))
+     if (!File.Exists(aScorerPath))
      {
-         exceptionMessage = $"Cannot find the language model file: {aLMPath}";
-     }
-     if (string.IsNullOrWhiteSpace(aTriePath))
-     {
-         exceptionMessage = "Path to the trie file cannot be empty.";
-     }
-     if (!File.Exists(aTriePath))
-     {
-         exceptionMessage = $"Cannot find the trie file: {aTriePath}";
+         throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}");
      }

-     if (exceptionMessage != null)
-     {
-         throw new FileNotFoundException(exceptionMessage);
-     }
-
-     var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
-                                                       aLMPath,
-                                                       aTriePath,
-                                                       aLMAlpha,
-                                                       aLMBeta);
+     var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath);
+     EvaluateResultCode(resultCode);
+ }
+
+ /// <summary>
+ /// Disable decoding using an external scorer.
+ /// </summary>
+ /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+ public unsafe void DisableExternalScorer()
+ {
+     var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP);
+     EvaluateResultCode(resultCode);
+ }
+
+ /// <summary>
+ /// Set hyperparameters alpha and beta of the external scorer.
+ /// </summary>
+ /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+ /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+ /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+ public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta)
+ {
+     var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP,
+                                                      aAlpha,
+                                                      aBeta);
      EvaluateResultCode(resultCode);
  }

@@ -14,8 +14,9 @@
  // Invalid parameters
  DS_ERR_INVALID_ALPHABET = 0x2000,
  DS_ERR_INVALID_SHAPE = 0x2001,
- DS_ERR_INVALID_LM = 0x2002,
+ DS_ERR_INVALID_SCORER = 0x2002,
  DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
+ DS_ERR_SCORER_NOT_ENABLED = 0x2004,

  // Runtime failures
  DS_ERR_FAIL_INIT_MMAP = 0x3000,
@@ -21,18 +21,26 @@ namespace DeepSpeechClient.Interfaces
  unsafe int GetModelSampleRate();

  /// <summary>
- /// Enable decoding using beam scoring with a KenLM language model.
+ /// Enable decoding using an external scorer.
  /// </summary>
- /// <param name="aLMPath">The path to the language model binary file.</param>
- /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
- /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
- /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
- /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
- /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
- unsafe void EnableDecoderWithLM(string aLMPath,
-                                 string aTriePath,
-                                 float aLMAlpha,
-                                 float aLMBeta);
+ /// <param name="aScorerPath">The path to the external scorer file.</param>
+ /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+ /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+ unsafe void EnableExternalScorer(string aScorerPath);
+
+ /// <summary>
+ /// Disable decoding using an external scorer.
+ /// </summary>
+ /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+ unsafe void DisableExternalScorer();
+
+ /// <summary>
+ /// Set hyperparameters alpha and beta of the external scorer.
+ /// </summary>
+ /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+ /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+ /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+ unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta);

  /// <summary>
  /// Use the DeepSpeech model to perform Speech-To-Text.
@@ -23,11 +23,16 @@ namespace DeepSpeechClient
  internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);

  [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
- internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
-                                                                 string aLMPath,
-                                                                 string aTriePath,
-                                                                 float aLMAlpha,
-                                                                 float aLMBeta);
+ internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
+                                                                  string aScorerPath);
+
+ [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+ internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
+
+ [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+ internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
+                                                                float aAlpha,
+                                                                float aBeta);

  [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
      CharSet = CharSet.Ansi, SetLastError = true)]
@@ -35,22 +35,18 @@ namespace CSharpExamples
  static void Main(string[] args)
  {
      string model = null;
-     string lm = null;
-     string trie = null;
+     string scorer = null;
      string audio = null;
      bool extended = false;
      if (args.Length > 0)
      {
          model = GetArgument(args, "--model");
-         lm = GetArgument(args, "--lm");
-         trie = GetArgument(args, "--trie");
+         scorer = GetArgument(args, "--scorer");
          audio = GetArgument(args, "--audio");
          extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
      }

      const uint BEAM_WIDTH = 500;
-     const float LM_ALPHA = 0.75f;
-     const float LM_BETA = 1.85f;

      Stopwatch stopwatch = new Stopwatch();
      try
@@ -64,14 +60,10 @@ namespace CSharpExamples

  Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
  stopwatch.Reset();
- if (lm != null)
+ if (scorer != null)
  {
-     Console.WriteLine("Loadin LM...");
-     sttClient.EnableDecoderWithLM(
-         lm ?? "lm.binary",
-         trie ?? "trie",
-         LM_ALPHA, LM_BETA);
-
+     Console.WriteLine("Loading scorer...");
+     sttClient.EnableExternalScorer(scorer ?? "kenlm.scorer");
  }

  string audioFile = audio ?? "arctic_a0024.wav";
@@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
  Button _startInference;

  final int BEAM_WIDTH = 50;
- final float LM_ALPHA = 0.75f;
- final float LM_BETA = 1.85f;

  private char readLEChar(RandomAccessFile f) throws IOException {
      byte b1 = f.readByte();
@@ -30,15 +30,11 @@ import java.nio.ByteBuffer;

  public class BasicTest {

      public static final String modelFile = "/data/local/tmp/test/output_graph.tflite";
-     public static final String lmFile = "/data/local/tmp/test/lm.binary";
-     public static final String trieFile = "/data/local/tmp/test/trie";
+     public static final String scorerFile = "/data/local/tmp/test/kenlm.scorer";
      public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";

      public static final int BEAM_WIDTH = 50;

-     public static final float LM_ALPHA = 0.75f;
-     public static final float LM_BETA = 1.85f;
-
      private char readLEChar(RandomAccessFile f) throws IOException {
          byte b1 = f.readByte();
          byte b2 = f.readByte();
@@ -130,7 +126,7 @@ public class BasicTest {
  @Test
  public void loadDeepSpeech_stt_withLM() {
      DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-     m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+     m.enableExternalScorer(scorerFile);

      String decoded = doSTT(m, false);
      assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -149,7 +145,7 @@ public class BasicTest {
  @Test
  public void loadDeepSpeech_sttWithMetadata_withLM() {
      DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-     m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+     m.enableExternalScorer(scorerFile);

      String decoded = doSTT(m, true);
      assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -47,17 +47,35 @@ public class DeepSpeechModel {
  }

  /**
-  * @brief Enable decoding using beam scoring with a KenLM language model.
+  * @brief Enable decoding using an external scorer.
   *
-  * @param lm The path to the language model binary file.
-  * @param trie The path to the trie file build from the same vocabulary as the language model binary.
-  * @param lm_alpha The alpha hyperparameter of the CTC decoder. Language Model weight.
-  * @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
+  * @param scorer The path to the external scorer file.
   *
   * @return Zero on success, non-zero on failure (invalid arguments).
   */
- public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
-     impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
+ public void enableExternalScorer(String scorer) {
+     impl.EnableExternalScorer(this._msp, scorer);
  }

+ /**
+  * @brief Disable decoding using an external scorer.
+  *
+  * @return Zero on success, non-zero on failure (invalid arguments).
+  */
+ public void disableExternalScorer() {
+     impl.DisableExternalScorer(this._msp);
+ }
+
+ /**
+  * @brief Enable decoding using beam scoring with a KenLM language model.
+  *
+  * @param alpha The alpha hyperparameter of the decoder. Language model weight.
+  * @param beta The beta hyperparameter of the decoder. Word insertion weight.
+  *
+  * @return Zero on success, non-zero on failure (invalid arguments).
+  */
+ public void setScorerAlphaBeta(float alpha, float beta) {
+     impl.SetScorerAlphaBeta(this._msp, alpha, beta);
+ }

  /*
@@ -29,12 +29,11 @@ VersionAction.prototype.call = function(parser) {

  var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
  parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
- parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
- parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+ parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
  parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
  parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
- parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha)', defaultValue: 0.75, type: 'float'});
- parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta)', defaultValue: 1.85, type: 'float'});
+ parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
+ parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
  parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
  parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
  var args = parser.parseArgs();
@@ -60,12 +59,16 @@ console.error('Loaded model in %ds.', totalTime(model_load_end));

  var desired_sample_rate = model.sampleRate();

- if (args['lm'] && args['trie']) {
-     console.error('Loading language model from files %s %s', args['lm'], args['trie']);
-     const lm_load_start = process.hrtime();
-     model.enableDecoderWithLM(args['lm'], args['trie'], args['lm_alpha'], args['lm_beta']);
-     const lm_load_end = process.hrtime(lm_load_start);
-     console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+ if (args['scorer']) {
+     console.error('Loading scorer from file %s', args['scorer']);
+     const scorer_load_start = process.hrtime();
+     model.enableExternalScorer(args['scorer']);
+     const scorer_load_end = process.hrtime(scorer_load_start);
+     console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
+
+     if (args['lm_alpha'] && args['lm_beta']) {
+         model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
+     }
  }

  const buffer = Fs.readFileSync(args['audio']);
@@ -52,31 +52,46 @@ Model.prototype.sampleRate = function() {
  }

  /**
-  * Enable decoding using beam scoring with a KenLM language model.
+  * Enable decoding using an external scorer.
   *
+  * @param {string} aScorerPath The path to the external scorer file.
+  *
+  * @return {number} Zero on success, non-zero on failure (invalid arguments).
+  */
+ Model.prototype.enableExternalScorer = function(aScorerPath) {
+     return binding.EnableExternalScorer(this._impl, aScorerPath);
+ }
+
+ /**
+  * Disable decoding using an external scorer.
+  *
+  * @return {number} Zero on success, non-zero on failure (invalid arguments).
+  */
+ Model.prototype.disableExternalScorer = function() {
+     return binding.EnableExternalScorer(this._impl);
+ }
+
+ /**
+  * Set hyperparameters alpha and beta of the external scorer.
+  *
-  * @param {string} aLMPath The path to the language model binary file.
-  * @param {string} aTriePath The path to the trie file build from the same vocabulary as the language model binary.
-  * @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
-  * @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
   *
   * @return {number} Zero on success, non-zero on failure (invalid arguments).
   */
- Model.prototype.enableDecoderWithLM = function() {
-     const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-     return binding.EnableDecoderWithLM.apply(null, args);
+ Model.prototype.setScorerAlphaBeta = function(aLMAlpha, aLMBeta) {
+     return binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
  }

  /**
   * Use the DeepSpeech model to perform Speech-To-Text.
   *
   * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
-  * @param {number} aBufferSize The number of samples in the audio signal.
   *
   * @return {string} The STT result. Returns undefined on error.
   */
- Model.prototype.stt = function() {
-     const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-     return binding.SpeechToText.apply(null, args);
+ Model.prototype.stt = function(aBuffer) {
+     return binding.SpeechToText(this._impl, aBuffer);
  }

  /**
@@ -84,25 +99,22 @@ Model.prototype.stt = function() {
   * about the results.
   *
   * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
-  * @param {number} aBufferSize The number of samples in the audio signal.
   *
   * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
   */
- Model.prototype.sttWithMetadata = function() {
-     const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-     return binding.SpeechToTextWithMetadata.apply(null, args);
+ Model.prototype.sttWithMetadata = function(aBuffer) {
+     return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
  }

  /**
-  * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
+  * Create a new streaming inference state. One can then call :js:func:`Stream.feedAudioContent` and :js:func:`Stream.finishStream` on the returned stream object.
   *
-  * @return {object} an opaque object that represents the streaming state.
+  * @return {object} a :js:func:`Stream` object that represents the streaming state.
   *
   * @throws on error
   */
  Model.prototype.createStream = function() {
-     const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-     const rets = binding.CreateStream.apply(null, args);
+     const rets = binding.CreateStream(this._impl);
      const status = rets[0];
      const ctx = rets[1];
      if (status !== 0) {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
function Stream(nativeStream) {
|
||||
this._impl = nativeStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Feed audio samples to an ongoing streaming inference.
|
||||
*
|
||||
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
|
||||
* @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
|
||||
* appropriate sample rate (matching what the model was trained on).
|
||||
* @param {number} aBufferSize The number of samples in @param aBuffer.
|
||||
*/
|
||||
Model.prototype.feedAudioContent = function() {
|
||||
binding.FeedAudioContent.apply(null, arguments);
|
||||
Stream.prototype.feedAudioContent = function(aBuffer) {
|
||||
binding.FeedAudioContent(this._impl, aBuffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the intermediate decoding of an ongoing streaming inference.
|
||||
*
|
||||
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
|
||||
*
|
||||
* @return {string} The STT intermediate result.
|
||||
*/
|
||||
Model.prototype.intermediateDecode = function() {
|
||||
return binding.IntermediateDecode.apply(null, arguments);
|
||||
Stream.prototype.intermediateDecode = function() {
|
||||
return binding.IntermediateDecode(this._impl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
*
|
||||
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
|
||||
*
|
||||
* @return {string} The STT result.
|
||||
*
|
||||
* This method will free the state (@param aSctx).
|
||||
* This method will free the stream, it must not be used after this method is called.
|
||||
*/
|
||||
Model.prototype.finishStream = function() {
|
||||
return binding.FinishStream.apply(null, arguments);
|
||||
Stream.prototype.finishStream = function() {
|
||||
result = binding.FinishStream(this._impl);
|
||||
this._impl = null;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
|
||||
*
|
||||
* @param {object} aSctx A streaming state pointer returned by :js:func:`Model.setupStream`.
|
||||
*
|
||||
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
|
||||
*
|
||||
* This method will free the state pointer (@param aSctx).
|
||||
* This method will free the stream, it must not be used after this method is called.
|
||||
*/
|
||||
Model.prototype.finishStreamWithMetadata = function() {
|
||||
return binding.FinishStreamWithMetadata.apply(null, arguments);
|
||||
Stream.prototype.finishStreamWithMetadata = function() {
|
||||
result = binding.FinishStreamWithMetadata(this._impl);
|
||||
this._impl = null;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Frees associated resources and destroys model object.
|
||||
*
|
||||
@@ -184,10 +197,10 @@ function FreeMetadata(metadata) {
   * can be used if you no longer need the result of an ongoing streaming
   * inference and don't want to perform a costly decode operation.
   *
-  * @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
+  * @param {Object} stream A stream object returned by :js:func:`Model.createStream`.
   */
  function FreeStream(stream) {
-     return binding.FreeStream(stream);
+     return binding.FreeStream(stream._impl);
  }

  /**
@@ -21,7 +21,6 @@ import deepspeech

  # rename for backwards compatibility
  from deepspeech.impl import PrintVersions as printVersions
- from deepspeech.impl import FreeStream as freeStream

  class Model(object):
      """
@ -56,127 +55,159 @@ class Model(object):
|
||||
"""
|
||||
return deepspeech.impl.GetModelSampleRate(self._impl)
|
||||
|
||||
def enableDecoderWithLM(self, *args, **kwargs):
|
||||
def enableExternalScorer(self, scorer_path):
|
||||
"""
|
||||
Enable decoding using beam scoring with a KenLM language model.
|
||||
Enable decoding using an external scorer.
|
||||
|
||||
:param aLMPath: The path to the language model binary file.
|
||||
:type aLMPath: str
|
||||
:param scorer_path: The path to the external scorer file.
|
||||
:type scorer_path: str
|
||||
|
||||
:param aTriePath: The path to the trie file build from the same vocabulary as the language model binary.
|
||||
:type aTriePath: str
|
||||
|
||||
:param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
|
||||
:type aLMAlpha: float
|
||||
|
||||
:param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
|
||||
:type aLMBeta: float
|
||||
|
||||
:return: Zero on success, non-zero on failure (invalid arguments).
|
||||
:return: Zero on success, non-zero on failure.
|
||||
:type: int
|
||||
"""
|
||||
return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)
|
||||
return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)
|
||||
|
||||
def stt(self, *args, **kwargs):
|
||||
def disableExternalScorer(self):
|
||||
"""
|
||||
Disable decoding using an external scorer.
|
||||
|
||||
:return: Zero on success, non-zero on failure.
|
||||
"""
|
||||
return deepspeech.impl.DisableExternalScorer(self._impl)
|
||||
|
||||
def setScorerAlphaBeta(self, alpha, beta):
|
||||
"""
|
||||
Set hyperparameters alpha and beta of the external scorer.
|
||||
|
||||
:param alpha: The alpha hyperparameter of the decoder. Language model weight.
|
||||
:type alpha: float
|
||||
|
||||
:param beta: The beta hyperparameter of the decoder. Word insertion weight.
|
||||
:type beta: float
|
||||
|
||||
:return: Zero on success, non-zero on failure.
|
||||
:type: int
|
||||
"""
|
||||
return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta)
|
||||
|
||||
def stt(self, audio_buffer):
|
||||
"""
|
||||
Use the DeepSpeech model to perform Speech-To-Text.
|
||||
|
||||
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type aBuffer: int array
|
||||
|
||||
:param aBufferSize: The number of samples in the audio signal.
|
||||
:type aBufferSize: int
|
||||
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type audio_buffer: numpy.int16 array
|
||||
|
||||
:return: The STT result.
|
||||
:type: str
|
||||
"""
|
||||
return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)
|
||||
return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
|
||||
|
||||
def sttWithMetadata(self, *args, **kwargs):
|
||||
def sttWithMetadata(self, audio_buffer):
|
||||
"""
|
||||
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
|
||||
|
||||
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type aBuffer: int array
|
||||
|
||||
:param aBufferSize: The number of samples in the audio signal.
|
||||
:type aBufferSize: int
|
||||
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type audio_buffer: numpy.int16 array
|
||||
|
||||
:return: Outputs a struct of individual letters along with their timing information.
|
||||
:type: :func:`Metadata`
|
||||
"""
|
||||
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
|
||||
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
|
||||
|
||||
def createStream(self):
|
||||
"""
|
||||
Create a new streaming inference state. The streaming state returned
|
||||
by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
|
||||
Create a new streaming inference state. The streaming state returned by
|
||||
this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
|
||||
|
||||
:return: Object holding the stream
|
||||
:return: Stream object representing the newly created stream
|
||||
:type: :func:`Stream`
|
||||
|
||||
:throws: RuntimeError on error
|
||||
"""
|
||||
status, ctx = deepspeech.impl.CreateStream(self._impl)
|
||||
if status != 0:
|
||||
raise RuntimeError("CreateStream failed with error code {}".format(status))
|
||||
return ctx
|
||||
return Stream(ctx)
|
||||
|
||||
# pylint: disable=no-self-use
|
||||
def feedAudioContent(self, *args, **kwargs):
|
||||
|
||||
class Stream(object):
|
||||
def __init__(self, native_stream):
|
||||
self._impl = native_stream
|
||||
|
||||
def __del__(self):
|
||||
if self._impl:
|
||||
self.freeStream()
|
||||
|
||||
def feedAudioContent(self, audio_buffer):
|
||||
"""
|
||||
Feed audio samples to an ongoing streaming inference.
|
||||
|
||||
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
|
||||
:type aSctx: object
|
||||
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
|
||||
:type audio_buffer: numpy.int16 array
|
||||
|
||||
:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
|
||||
:type aBuffer: int array
|
||||
|
||||
:param aBufferSize: The number of samples in @p aBuffer.
|
||||
:type aBufferSize: int
|
||||
:throws: RuntimeError if the stream object is not valid
|
||||
"""
|
||||
deepspeech.impl.FeedAudioContent(*args, **kwargs)
|
||||
if not self._impl:
|
||||
raise RuntimeError("Stream object is not valid. Trying to feed an already finished stream?")
|
||||
deepspeech.impl.FeedAudioContent(self._impl, audio_buffer)
|
||||
|
||||
# pylint: disable=no-self-use
def intermediateDecode(self, *args, **kwargs):
def intermediateDecode(self):
"""
Compute the intermediate decoding of an ongoing streaming inference.

:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object

:return: The STT intermediate result.
:type: str
"""
return deepspeech.impl.IntermediateDecode(*args, **kwargs)

# pylint: disable=no-self-use
def finishStream(self, *args, **kwargs):
:throws: RuntimeError if the stream object is not valid
"""
Signal the end of an audio signal to an ongoing streaming
inference, returns the STT result over the whole audio signal.
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecode(self._impl)

:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
def finishStream(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns the STT result over the whole audio signal.

:return: The STT result.
:type: str
"""
return deepspeech.impl.FinishStream(*args, **kwargs)

# pylint: disable=no-self-use
def finishStreamWithMetadata(self, *args, **kwargs):
:throws: RuntimeError if the stream object is not valid
"""
Signal the end of an audio signal to an ongoing streaming
inference, returns per-letter metadata.
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStream(self._impl)
self._impl = None
return result

:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
def finishStreamWithMetadata(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns per-letter metadata.

:return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata`

:throws: RuntimeError if the stream object is not valid
"""
return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
self._impl = None
return result

def freeStream(self):
"""
Destroy a streaming state without decoding the computed logits. This can
be used if you no longer need the result of an ongoing streaming inference.

:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to free an already finished stream?")
deepspeech.impl.FreeStream(self._impl)
self._impl = None
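Taken together, the Stream object now owns its native handle and invalidates it once finishStream, finishStreamWithMetadata, or freeStream has run, so any later call raises the RuntimeError documented above. A sketch of that lifecycle, under the same placeholder-path assumptions as the previous example:

import numpy as np
from deepspeech import Model

model = Model('output_graph.pbmm', 500)                    # placeholder path and beam width
stream = model.createStream()
stream.feedAudioContent(np.zeros(16000, dtype=np.int16))   # one second of silence at 16 kHz
print(stream.intermediateDecode())                         # peek without ending the stream

text = stream.finishStream()                               # consumes the handle (_impl becomes None)
try:
    stream.finishStream()                                  # a second call must fail loudly
except RuntimeError as err:
    print(err)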
# This is only for documentation purpose
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
@ -189,22 +220,18 @@ class MetadataItem(object):
"""
The character generated for transcription
"""
# pylint: disable=unnecessary-pass
pass


def timestep(self):
"""
Position of the character in units of 20ms
"""
# pylint: disable=unnecessary-pass
pass


def start_time(self):
"""
Position of the character in seconds
"""
# pylint: disable=unnecessary-pass
pass


class Metadata(object):
@ -218,8 +245,7 @@ class Metadata(object):
:return: A list of :func:`MetadataItem` elements
:type: list
"""
# pylint: disable=unnecessary-pass
pass


def num_items(self):
"""
@ -228,8 +254,7 @@ class Metadata(object):
:return: Size of the list of items
:type: int
"""
# pylint: disable=unnecessary-pass
pass


def confidence(self):
"""
@ -237,5 +262,4 @@ class Metadata(object):
sum of the acoustic model logit values for each timestep/character that
contributed to the creation of this transcription.
"""
# pylint: disable=unnecessary-pass
pass
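Since MetadataItem now documents start_time in seconds in place of the 20 ms timestep counter, per-letter timing can be consumed directly. A hedged sketch, assuming the attribute-style access these documentation stubs describe and the same placeholder paths as above:

import numpy as np
from deepspeech import Model

model = Model('output_graph.pbmm', 500)                    # placeholder path and beam width
stream = model.createStream()
stream.feedAudioContent(np.zeros(16000, dtype=np.int16))   # placeholder audio

metadata = stream.finishStreamWithMetadata()
for item in metadata.items:                                # list of MetadataItem
    print('{!r} at {:.2f}s'.format(item.character, item.start_time))
print('confidence: {:.2f}'.format(metadata.confidence))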
@ -72,7 +72,7 @@ def metadata_json_output(metadata):
json_result["words"] = words_from_metadata(metadata)
json_result["confidence"] = metadata.confidence
return json.dumps(json_result)

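The words_from_metadata helper is not part of this diff; the following is a hypothetical reconstruction of what such a grouping function could look like, using only the MetadataItem fields documented above (character and start_time):

def words_from_metadata(metadata):
    # Hypothetical sketch: group per-letter items into words with start times.
    words = []
    word, word_start = '', 0.0
    for item in metadata.items:
        if item.character == ' ':
            if word:
                words.append({'word': word, 'start_time': word_start})
            word = ''
        else:
            if not word:
                word_start = item.start_time
            word += item.character
    if word:
        words.append({'word': word, 'start_time': word_start})
    return words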
class VersionAction(argparse.Action):
@ -88,17 +88,15 @@ def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--scorer', required=False,
help='Path to the external scorer file')
parser.add_argument('--audio', required=True,
help='Path to the audio file to run (WAV format)')
parser.add_argument('--beam_width', type=int, default=500,
help='Beam width for the CTC decoder')
parser.add_argument('--lm_alpha', type=float, default=0.75,
parser.add_argument('--lm_alpha', type=float,
help='Language model weight (lm_alpha)')
parser.add_argument('--lm_beta', type=float, default=1.85,
parser.add_argument('--lm_beta', type=float,
help='Word insertion bonus (lm_beta)')
parser.add_argument('--version', action=VersionAction,
help='Print version and exits')
@ -116,12 +114,15 @@ def main():

desired_sample_rate = ds.sampleRate()

if args.lm and args.trie:
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
lm_load_start = timer()
ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
lm_load_end = timer() - lm_load_start
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)
if args.scorer:
print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
scorer_load_start = timer()
ds.enableExternalScorer(args.scorer)
scorer_load_end = timer() - scorer_load_start
print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

if args.lm_alpha and args.lm_beta:
ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

fin = wave.open(args.audio, 'rb')
fs = fin.getframerate()
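The consumer-side migration is mechanical: one packaged scorer file replaces the lm.binary/trie pair, and the decoder weights move to a separate setter that is only called when both are given. A condensed sketch of the new call sequence (paths are placeholders):

from deepspeech import Model

ds = Model('output_graph.pbmm', 500)     # placeholder model path; beam width as in this client
ds.enableExternalScorer('kenlm.scorer')  # one file instead of lm.binary + trie
ds.setScorerAlphaBeta(0.75, 1.85)        # the old client defaults, now passed explicitly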
@ -14,21 +14,13 @@ from deepspeech import Model
# Beam width used in the CTC decoder when building candidate transcriptions
BEAM_WIDTH = 500

# The alpha hyperparameter of the CTC decoder. Language Model weight
LM_ALPHA = 0.75

# The beta hyperparameter of the CTC decoder. Word insertion bonus.
LM_BETA = 1.85

def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--scorer', nargs='?',
help='Path to the external scorer file')
parser.add_argument('--audio1', required=True,
help='First audio file to use in interleaved streams')
parser.add_argument('--audio2', required=True,
@ -37,8 +29,8 @@ def main():

ds = Model(args.model, BEAM_WIDTH)

if args.lm and args.trie:
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
if args.scorer:
ds.enableExternalScorer(args.scorer)

fin = wave.open(args.audio1, 'rb')
fs1 = fin.getframerate()
@ -57,11 +49,11 @@ def main():
splits2 = np.array_split(audio2, 10)

for part1, part2 in zip(splits1, splits2):
ds.feedAudioContent(stream1, part1)
ds.feedAudioContent(stream2, part2)
stream1.feedAudioContent(part1)
stream2.feedAudioContent(part2)

print(ds.finishStream(stream1))
print(ds.finishStream(stream2))
print(stream1.finishStream())
print(stream2.finishStream())

if __name__ == '__main__':
main()
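Because each Stream carries its own native handle, interleaving two inferences no longer routes through the Model. A minimal sketch with synthetic audio standing in for the two input files:

import numpy as np
from deepspeech import Model

ds = Model('output_graph.pbmm', 500)        # placeholder path and beam width

stream1 = ds.createStream()
stream2 = ds.createStream()

silence = np.zeros(16000, dtype=np.int16)   # stand-in for real captures
for part1, part2 in zip(np.array_split(silence, 10), np.array_split(silence, 10)):
    stream1.feedAudioContent(part1)
    stream2.feedAudioContent(part2)

print(stream1.finishStream())
print(stream2.finishStream())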
@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh

BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"

BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}"

@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh

BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"

BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}"

@ -30,11 +30,11 @@ then:
image: ${build.docker_image}

env:
DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.15/models.tar.gz"
DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.1/models.tar.gz"
DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
PIP_DEFAULT_TIMEOUT: "60"
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master"
EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"

command:
- "/bin/bash"

@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh

BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"

if [ "${runtime}" = "tflite" ]; then

@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh

BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"

BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}"
@ -49,7 +49,7 @@ deepspeech --version

pushd ${HOME}/DeepSpeech/ds/
python bin/import_ldc93s1.py data/smoke_test
python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --lm data/smoke_test/vocab.pruned.lm --trie data/smoke_test/vocab.trie --csv data/smoke_test/ldc93s1.csv
python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --scorer data/smoke_test/pruned_lm.scorer --csv data/smoke_test/ldc93s1.csv
popd

virtualenv_deactivate "${pyalias}" "${PYENV_NAME}"
@ -378,7 +378,7 @@ run_netframework_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

set +e
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
}
@ -401,7 +401,7 @@ run_electronjs_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
}
@ -427,7 +427,7 @@ run_basic_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status"
@ -444,7 +444,7 @@ run_all_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status"

set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status"
@ -457,7 +457,7 @@ run_all_inference_tests()
assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}"

set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -470,8 +470,7 @@ run_prod_concurrent_stream_tests()
set +e
output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \
--model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \
--lm ${TASKCLUSTER_TMP_DIR}/lm.binary \
--trie ${TASKCLUSTER_TMP_DIR}/trie \
--scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
--audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \
--audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
@ -489,19 +488,19 @@ run_prod_inference_tests()
local _bitrate=$1

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"

set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@ -509,7 +508,7 @@ run_prod_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -520,19 +519,19 @@ run_prodtflite_inference_tests()
local _bitrate=$1

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"

set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"

set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@ -540,7 +539,7 @@ run_prodtflite_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -555,7 +554,7 @@ run_multi_inference_tests()
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status"

set +e -o pipefail
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
status=$?
set -e +o pipefail
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status"
@ -564,7 +563,7 @@
run_cpp_only_inference_tests()
{
set +e
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status"
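All of these checks share one shape: run the deepspeech CLI against a model, the packaged kenlm.scorer, and a WAV file, then assert on the transcript. For a quick local smoke test outside TaskCluster, the same invocation can be scripted; a sketch assuming the deepspeech CLI is on PATH, with placeholder paths:

import subprocess

result = subprocess.run(
    ['deepspeech', '--model', 'output_graph.pbmm',
     '--scorer', 'kenlm.scorer', '--audio', 'audio.wav'],
    capture_output=True, text=True, check=True)
print(result.stdout.strip())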
@ -669,8 +668,7 @@ download_data()
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}"
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
}

@ -1562,7 +1560,6 @@ package_native_client()
fi;

${TAR} -cf - \
-C ${tensorflow_dir}/bazel-bin/native_client/ generate_trie${PLATFORM_EXE_SUFFIX} \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \
-C ${deepspeech_dir}/ LICENSE \
@ -1767,8 +1764,7 @@ android_setup_apk_data()
adb push \
${TASKCLUSTER_TMP_DIR}/${model_name} \
${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \
${TASKCLUSTER_TMP_DIR}/lm.binary \
${TASKCLUSTER_TMP_DIR}/trie \
${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
${ANDROID_TMP_DIR}/test/
}

@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh

BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"

if [ "${package_option}" = "--cuda" ]; then
@ -44,7 +44,7 @@ payload:
MSYS: 'winsymlinks:nativestrict'
TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master"
EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"

command:
- >-
@ -29,7 +29,7 @@ def fail(message, code=1):
def transcribe_file(audio_path, tlog_path):
from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel
initialize_globals()
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet)
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
try:
num_processes = cpu_count()
except NotImplementedError:
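On the training side the Scorer constructor loses the separate binary and trie arguments in the same way. A hedged sketch of the new construction, assuming Scorer comes from the ds_ctcdecoder package the training code imports it from:

from ds_ctcdecoder import Scorer  # assumption about the import location

def make_scorer(alphabet, scorer_path='data/lm/kenlm.scorer',
                lm_alpha=0.75, lm_beta=1.85):
    # Old signature: Scorer(alpha, beta, lm_binary_path, lm_trie_path, alphabet)
    # New signature: one packaged scorer file replaces the binary + trie pair.
    return Scorer(lm_alpha, lm_beta, scorer_path, alphabet)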
@ -143,10 +143,8 @@ def create_flags():

f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
f.DEFINE_alias('lm', 'lm_binary_path')
f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
f.DEFINE_alias('trie', 'lm_trie_path')
f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py')
f.DEFINE_alias('scorer', 'scorer_path')
f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.')
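For reference, f.DEFINE_alias keeps --scorer working as shorthand for --scorer_path. A self-contained sketch of the same alias pattern with absl.flags, which is an assumption about what the f wrapper delegates to:

from absl import flags  # assumption: the flags module wrapped as `f` above

flags.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file')
flags.DEFINE_alias('scorer', 'scorer_path')

FLAGS = flags.FLAGS
FLAGS(['prog', '--scorer=/tmp/other.scorer'])  # parsing the alias updates the original flag
print(FLAGS.scorer_path)                       # -> /tmp/other.scorer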