Update all API consumers

Reuben Morais 2020-01-21 11:54:01 +01:00
parent 708b21a63e
commit 1e2eb96248
41 changed files with 393 additions and 516 deletions

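Every consumer of the native API moves from the two-file decoder setup, an lm.binary KenLM model plus a trie, to a single packaged scorer file, and alpha/beta become optional overrides instead of required arguments. A hedged before/after sketch of the Python client API (file names are placeholders; the beam width of 500 matches the examples in the diff below):

    from deepspeech import Model

    model = Model('output_graph.pbmm', 500)  # model path, beam width

    # Old API, removed by this commit:
    #   model.enableDecoderWithLM('lm.binary', 'trie', 0.75, 1.85)

    # New API: one scorer package; alpha/beta only if you want to override
    # the defaults stored inside the package.
    model.enableExternalScorer('kenlm.scorer')
    model.setScorerAlphaBeta(0.75, 1.85)
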
@@ -882,8 +882,7 @@ def package_zip():
         }
     }, f)

-    shutil.copy(FLAGS.lm_binary_path, export_dir)
-    shutil.copy(FLAGS.lm_trie_path, export_dir)
+    shutil.copy(FLAGS.scorer_path, export_dir)

     archive = shutil.make_archive(zip_filename, 'zip', export_dir)
     log_info('Exported packaged model {}'.format(archive))

@@ -926,10 +925,9 @@ def do_single_file_inference(input_file_path):
     logits = np.squeeze(logits)

-    if FLAGS.lm_binary_path:
+    if FLAGS.scorer_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                        Config.alphabet)
+                        FLAGS.scorer_path, Config.alphabet)
     else:
         scorer = None

     decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,

@@ -172,7 +172,7 @@ RUN ./configure

 # Build DeepSpeech
-RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}

 ###
 ### Using TensorFlow upstream should work

@@ -187,8 +187,7 @@ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_s
 # RUN pip3 install /tmp/tensorflow_pkg/*.whl

 # Copy built libs to /DeepSpeech/native_client
-RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
-    && cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
+RUN cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/

 # Install TensorFlow
 WORKDIR /DeepSpeech/

@@ -21,8 +21,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs 1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log

 if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then
   echo "Did not resume training from checkpoint"

@@ -25,6 +25,5 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs $epoch_count \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate}

@@ -21,12 +21,10 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs 1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
   --learning_rate 0.001 --dropout_rate 0.05 \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie'
+  --scorer_path 'data/smoke_test/pruned_lm.scorer'

 python -u DeepSpeech.py \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --one_shot_infer 'data/smoke_test/LDC93S1.wav'

@@ -20,8 +20,7 @@ python -u DeepSpeech.py --noshow_progressbar \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
   --export_dir '/tmp/train_tflite' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate} \
   --export_tflite

@@ -31,8 +30,7 @@ python -u DeepSpeech.py --noshow_progressbar \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
   --export_dir '/tmp/train_tflite/en-us' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate} \
   --export_language 'Fake English (fk-FK)' \
   --export_zip

@@ -50,7 +50,7 @@ def create_bundle(alphabet_path, lm_path, vocab_path, package_path, force_utf8,
     scorer.set_alphabet(alphabet)
     scorer.set_utf8_mode(use_utf8)
     scorer.reset_params(default_alpha, default_beta)
-    scorer.load_lm(lm_path, "")
+    scorer.load_lm(lm_path)
     scorer.fill_dictionary(list(words))
     shutil.copy(lm_path, package_path)
     scorer.save_dictionary(package_path, True)  # append, not overwrite

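For orientation, the packaging script now hands load_lm a single path and then appends the vocabulary dictionary to a copy of the language model to form the scorer package. A rough sketch of that flow, assuming the ds_ctcdecoder module used by the training code; the paths and word list are placeholders (the real script derives them from CLI flags):

    import shutil
    from ds_ctcdecoder import Scorer, Alphabet  # assumed import

    alphabet = Alphabet('alphabet.txt')  # placeholder path
    words = ['hello', 'world']           # stand-in vocabulary

    scorer = Scorer()                    # bare init, configured step by step
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(False)
    scorer.reset_params(0.75, 1.85)      # default_alpha, default_beta
    scorer.load_lm('lm.binary')          # single argument now, no trie
    scorer.fill_dictionary(words)
    shutil.copy('lm.binary', 'kenlm.scorer')
    scorer.save_dictionary('kenlm.scorer', True)  # append, not overwrite
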
@@ -7,7 +7,13 @@ C
 .. doxygenfunction:: DS_FreeModel
    :project: deepspeech-c

-.. doxygenfunction:: DS_EnableDecoderWithLM
+.. doxygenfunction:: DS_EnableExternalScorer
+   :project: deepspeech-c
+
+.. doxygenfunction:: DS_DisableExternalScorer
+   :project: deepspeech-c
+
+.. doxygenfunction:: DS_SetScorerAlphaBeta
    :project: deepspeech-c

 .. doxygenfunction:: DS_GetModelSampleRate

@@ -42,10 +42,9 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):

 def evaluate(test_csvs, create_model, try_loading):
-    if FLAGS.lm_binary_path:
+    if FLAGS.scorer_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                        Config.alphabet)
+                        FLAGS.scorer_path, Config.alphabet)
     else:
         scorer = None

@@ -27,17 +27,18 @@ This module should be self-contained:
   - pip install native_client/python/dist/deepspeech*.whl
   - pip install -r requirements_eval_tflite.txt

-Then run with a TF Lite model, LM/trie and a CSV test file
+Then run with a TF Lite model, LM and a CSV test file
 '''

 BEAM_WIDTH = 500
 LM_ALPHA = 0.75
 LM_BETA = 1.85

-def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
+def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
     os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
     ds = Model(model, BEAM_WIDTH)
-    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
+    ds.enableExternalScorer(scorer)
+    ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA)

     while True:
         try:

@@ -64,7 +65,7 @@ def main(args, _):
     processes = []
     for i in range(args.proc):
-        worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
+        worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
         worker_process.start()  # Launch reader() as a separate python process
         processes.append(worker_process)

@@ -113,10 +114,8 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
     parser.add_argument('--model', required=True,
                         help='Path to the model (protocol buffer binary file)')
-    parser.add_argument('--lm', required=True,
-                        help='Path to the language model binary file')
-    parser.add_argument('--trie', required=True,
-                        help='Path to the language model trie file created with native_client/generate_trie')
+    parser.add_argument('--scorer', required=True,
+                        help='Path to the external scorer file')
     parser.add_argument('--csv', required=True,
                         help='Path to the CSV source file')
     parser.add_argument('--proc', required=False, default=cpu_count(), type=int,

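The evaluation harness shows the minimal per-process setup under the new API: load the model, attach the scorer, then optionally pin alpha and beta. A condensed sketch of what each worker does (paths are placeholders):

    from deepspeech import Model

    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    def make_model(model_path='output_graph.tflite', scorer_path='kenlm.scorer'):
        ds = Model(model_path, BEAM_WIDTH)
        ds.enableExternalScorer(scorer_path)
        ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
        return ds
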
@@ -12,19 +12,17 @@
 char* model = NULL;

-char* lm = NULL;
-
-char* trie = NULL;
+char* scorer = NULL;

 char* audio = NULL;

 int beam_width = 500;

-float lm_alpha = 0.75f;
-float lm_beta = 1.85f;
-
-bool load_without_trie = false;
+bool set_alphabeta = false;
+float lm_alpha = 0.f;
+float lm_beta = 0.f;

 bool show_times = false;

@@ -39,39 +37,36 @@ int stream_size = 0;
 void PrintHelp(const char* bin)
 {
   std::cout <<
-  "Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
+  "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n"
   "\n"
   "Running DeepSpeech inference.\n"
   "\n"
-  "  --model MODEL            Path to the model (protocol buffer binary file)\n"
-  "  --lm LM                  Path to the language model binary file\n"
-  "  --trie TRIE              Path to the language model trie file created with native_client/generate_trie\n"
-  "  --audio AUDIO            Path to the audio file to run (WAV format)\n"
-  "  --beam_width BEAM_WIDTH  Value for decoder beam width (int)\n"
-  "  --lm_alpha LM_ALPHA      Value for language model alpha param (float)\n"
-  "  --lm_beta LM_BETA        Value for language model beta param (float)\n"
-  "  -t                       Run in benchmark mode, output mfcc & inference time\n"
-  "  --extended               Output string from extended metadata\n"
-  "  --json                   Extended output, shows word timings as JSON\n"
-  "  --stream size            Run in stream mode, output intermediate results\n"
-  "  --help                   Show help\n"
-  "  --version                Print version and exits\n";
+  "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
+  "\t--scorer SCORER\t\tPath to the external scorer file\n"
+  "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
+  "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
+  "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
+  "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
+  "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+  "\t--extended\t\tOutput string from extended metadata\n"
+  "\t--json\t\t\tExtended output, shows word timings as JSON\n"
+  "\t--stream size\t\tRun in stream mode, output intermediate results\n"
+  "\t--help\t\t\tShow help\n"
+  "\t--version\t\tPrint version and exits\n";
   DS_PrintVersions();
   exit(1);
 }

 bool ProcessArgs(int argc, char** argv)
 {
-  const char* const short_opts = "m:a:l:r:w:c:d:b:tehv";
+  const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
   const option long_opts[] = {
     {"model", required_argument, nullptr, 'm'},
-    {"lm", required_argument, nullptr, 'l'},
-    {"trie", required_argument, nullptr, 'r'},
+    {"scorer", required_argument, nullptr, 'l'},
     {"audio", required_argument, nullptr, 'w'},
     {"beam_width", required_argument, nullptr, 'b'},
     {"lm_alpha", required_argument, nullptr, 'c'},
     {"lm_beta", required_argument, nullptr, 'd'},
-    {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
     {"t", no_argument, nullptr, 't'},
     {"extended", no_argument, nullptr, 'e'},
     {"json", no_argument, nullptr, 'j'},

@@ -95,11 +90,7 @@ bool ProcessArgs(int argc, char** argv)
       break;

     case 'l':
-      lm = optarg;
-      break;
-
-    case 'r':
-      trie = optarg;
+      scorer = optarg;
       break;

    case 'w':

@@ -111,17 +102,15 @@ bool ProcessArgs(int argc, char** argv)
       break;

     case 'c':
+      set_alphabeta = true;
       lm_alpha = atof(optarg);
       break;

     case 'd':
+      set_alphabeta = true;
       lm_beta = atof(optarg);
       break;

-    case 999:
-      load_without_trie = true;
-      break;
-
     case 't':
       show_times = true;
       break;

@@ -374,16 +374,19 @@ main(int argc, char **argv)
     return 1;
   }

-  if (lm && (trie || load_without_trie)) {
-    int status = DS_EnableDecoderWithLM(ctx,
-                                        lm,
-                                        trie,
-                                        lm_alpha,
-                                        lm_beta);
+  if (scorer) {
+    int status = DS_EnableExternalScorer(ctx, scorer);
     if (status != 0) {
-      fprintf(stderr, "Could not enable CTC decoder with LM.\n");
+      fprintf(stderr, "Could not enable external scorer.\n");
       return 1;
     }
+    if (set_alphabeta) {
+      status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta);
+      if (status != 0) {
+        fprintf(stderr, "Error setting scorer alpha and beta.\n");
+        return 1;
+      }
+    }
   }

 #ifndef NO_SOX

@@ -12,12 +12,11 @@ class Scorer(swigwrapper.Scorer):
     :type alpha: float
     :param beta: Word insertion bonus.
     :type beta: float
-    :model_path: Path to load language model.
-    :trie_path: Path to trie file.
+    :model_path: Path to load scorer.
     :alphabet: Alphabet
     :type model_path: basestring
     """
-    def __init__(self, alpha=None, beta=None, model_path=None, trie_path=None, alphabet=None):
+    def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:

@@ -27,15 +26,15 @@ class Scorer(swigwrapper.Scorer):
             if err != 0:
                 raise ValueError("Error when deserializing alphabet.")

-            err = self.init(alpha, beta,
-                            model_path.encode('utf-8'),
-                            trie_path.encode('utf-8'),
+            err = self.init(model_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
                 raise ValueError("Scorer initialization failed with error code {}".format(err), err)

-    def load_lm(self, lm_path, trie_path):
-        super(Scorer, self).load_lm(lm_path.encode('utf-8'), trie_path.encode('utf-8'))
+            self.reset_params(alpha, beta)
+
+    def load_lm(self, lm_path):
+        super(Scorer, self).load_lm(lm_path.encode('utf-8'))

     def save_dictionary(self, save_path, *args, **kwargs):
         super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs)

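On the decoder side, the wrapper's constructor drops trie_path and applies alpha and beta through reset_params after init. A hedged usage sketch; the alphabet object, logits array and paths are placeholders supplied by the surrounding training code:

    from ds_ctcdecoder import Scorer, ctc_beam_search_decoder

    # Full initialization: alpha, beta, scorer package, alphabet.
    scorer = Scorer(alpha=0.75, beta=1.85,
                    model_path='kenlm.scorer',
                    alphabet=alphabet)

    # The scorer is then passed to the beam search decoder as before.
    decoded = ctc_beam_search_decoder(logits, alphabet, 500, scorer=scorer)
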
@@ -6,7 +6,6 @@
 #include <unordered_map>
 #include <vector>

-#include "lm/enumerate_vocab.hh"
 #include "lm/virtual_interface.hh"
 #include "lm/word_index.hh"
 #include "util/string_piece.hh"

@@ -19,18 +18,6 @@ const std::string START_TOKEN = "<s>";
 const std::string UNK_TOKEN = "<unk>";
 const std::string END_TOKEN = "</s>";

-// Implement a callback to retrieve the dictionary of language model.
-class RetrieveStrEnumerateVocab : public lm::EnumerateVocab {
-public:
-  RetrieveStrEnumerateVocab() {}
-
-  void Add(lm::WordIndex index, const StringPiece &str) {
-    vocabulary.push_back(std::string(str.data(), str.length()));
-  }
-
-  std::vector<std::string> vocabulary;
-};
-
 /* External scorer to query score for n-gram or sentence, including language
  * model scoring and word insertion.
  *

@@ -310,7 +310,7 @@ DS_EnableExternalScorer(ModelState* aCtx,
   aCtx->scorer_.reset(new Scorer());
   int err = aCtx->scorer_->init(aScorerPath, aCtx->alphabet_);
   if (err != 0) {
-    return DS_ERR_INVALID_LM;
+    return DS_ERR_INVALID_SCORER;
   }
   return DS_ERR_OK;
 }

@@ -59,7 +59,7 @@ enum DeepSpeech_Error_Codes
   // Invalid parameters
   DS_ERR_INVALID_ALPHABET = 0x2000,
   DS_ERR_INVALID_SHAPE = 0x2001,
-  DS_ERR_INVALID_LM = 0x2002,
+  DS_ERR_INVALID_SCORER = 0x2002,
   DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
   DS_ERR_SCORER_NOT_ENABLED = 0x2004,

@@ -129,7 +129,7 @@ DEEPSPEECH_EXPORT
 int DS_DisableExternalScorer(ModelState* aCtx);

 /**
- * @brief Set hyperparameters alpha and beta of a KenLM external scorer.
+ * @brief Set hyperparameters alpha and beta of the external scorer.
  *
  * @param aCtx The ModelState pointer for the model being changed.
  * @param aAlpha The alpha hyperparameter of the decoder. Language model weight.

@@ -1,141 +0,0 @@
#ifndef DEEPSPEECH_COMPAT_H
#define DEEPSPEECH_COMPAT_H
#include "deepspeech.h"
#warning This header is a convenience wrapper for compatibility with \
the previous API, it has deprecated function names and arguments. \
If possible, update your code instead of using this header.
/**
* @brief An object providing an interface to a trained DeepSpeech model.
*
* @param aModelPath The path to the frozen model graph.
* @param aNCep UNUSED, DEPRECATED.
* @param aNContext UNUSED, DEPRECATED.
* @param aAlphabetConfigPath UNUSED, DEPRECATED.
* @param aBeamWidth The beam width used by the decoder. A larger beam
* width generates better results at the cost of decoding
* time.
* @param[out] retval a ModelState pointer
*
* @return Zero on success, non-zero on failure.
*/
int DS_CreateModel(const char* aModelPath,
unsigned int /*aNCep*/,
unsigned int /*aNContext*/,
const char* /*aAlphabetConfigPath*/,
unsigned int aBeamWidth,
ModelState** retval)
{
return DS_CreateModel(aModelPath, aBeamWidth, retval);
}
/**
* @brief Frees associated resources and destroys model object.
*/
void DS_DestroyModel(ModelState* ctx)
{
return DS_FreeModel(ctx);
}
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlphabetConfigPath UNUSED, DEPRECATED.
* @param aLMPath The path to the language model binary file.
* @param aTriePath The path to the trie file build from the same vocabulary as the language model binary.
* @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
* @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
int DS_EnableDecoderWithLM(ModelState* aCtx,
const char* /*aAlphabetConfigPath*/,
const char* aLMPath,
const char* aTriePath,
float aLMAlpha,
float aLMBeta)
{
return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
}
/**
* @brief Create a new streaming inference state. The streaming state returned
* by this function can then be passed to {@link DS_FeedAudioContent()}
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aSampleRate UNUSED, DEPRECATED.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*/
int DS_SetupStream(ModelState* aCtx,
unsigned int /*aSampleRate*/,
StreamingState** retval)
{
return DS_CreateStream(aCtx, retval);
}
/**
* @brief Destroy a streaming state without decoding the computed logits. This
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @note This method will free the state pointer (@p aSctx).
*/
void DS_DiscardStream(StreamingState* aSctx)
{
return DS_FreeStream(aSctx);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. Returns NULL on error.
*/
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
*/
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
}
#endif /* DEEPSPEECH_COMPAT_H */

@@ -82,8 +82,8 @@ namespace DeepSpeechClient
                     throw new ArgumentException("Invalid alphabet embedded in model. (Data corruption?)");
                 case ErrorCodes.DS_ERR_INVALID_SHAPE:
                     throw new ArgumentException("Invalid model shape.");
-                case ErrorCodes.DS_ERR_INVALID_LM:
-                    throw new ArgumentException("Invalid language model file.");
+                case ErrorCodes.DS_ERR_INVALID_SCORER:
+                    throw new ArgumentException("Invalid scorer file.");
                 case ErrorCodes.DS_ERR_FAIL_INIT_MMAP:
                     throw new ArgumentException("Failed to initialize memory mapped model.");
                 case ErrorCodes.DS_ERR_FAIL_INIT_SESS:

@@ -100,6 +100,8 @@ namespace DeepSpeechClient
                     throw new ArgumentException("Error failed to create session.");
                 case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE:
                     throw new ArgumentException("Error incompatible model.");
+                case ErrorCodes.DS_ERR_SCORER_NOT_ENABLED:
+                    throw new ArgumentException("External scorer is not enabled.");
                 default:
                     throw new ArgumentException("Unknown error, please make sure you are using the correct native binary.");
             }

@@ -114,45 +116,48 @@ namespace DeepSpeechClient
         }

         /// <summary>
-        /// Enable decoding using beam scoring with a KenLM language model.
+        /// Enable decoding using an external scorer.
         /// </summary>
-        /// <param name="aLMPath">The path to the language model binary file.</param>
-        /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
-        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
-        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
-        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
-        public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
-            float aLMAlpha, float aLMBeta)
+        /// <param name="aScorerPath">The path to the external scorer file.</param>
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+        /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+        public unsafe void EnableExternalScorer(string aScorerPath)
         {
             string exceptionMessage = null;
-            if (string.IsNullOrWhiteSpace(aLMPath))
+            if (string.IsNullOrWhiteSpace(aScorerPath))
             {
-                exceptionMessage = "Path to the language model file cannot be empty.";
+                throw new FileNotFoundException("Path to the scorer file cannot be empty.");
             }
-            if (!File.Exists(aLMPath))
+            if (!File.Exists(aScorerPath))
             {
-                exceptionMessage = $"Cannot find the language model file: {aLMPath}";
-            }
-            if (string.IsNullOrWhiteSpace(aTriePath))
-            {
-                exceptionMessage = "Path to the trie file cannot be empty.";
-            }
-            if (!File.Exists(aTriePath))
-            {
-                exceptionMessage = $"Cannot find the trie file: {aTriePath}";
+                throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}");
             }

-            if (exceptionMessage != null)
-            {
-                throw new FileNotFoundException(exceptionMessage);
-            }
-
-            var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
-                aLMPath,
-                aTriePath,
-                aLMAlpha,
-                aLMBeta);
+            var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath);
+            EvaluateResultCode(resultCode);
+        }
+
+        /// <summary>
+        /// Disable decoding using an external scorer.
+        /// </summary>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        public unsafe void DisableExternalScorer()
+        {
+            var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP);
+            EvaluateResultCode(resultCode);
+        }
+
+        /// <summary>
+        /// Set hyperparameters alpha and beta of the external scorer.
+        /// </summary>
+        /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+        /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta)
+        {
+            var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP,
+                aAlpha,
+                aBeta);
             EvaluateResultCode(resultCode);
         }

@@ -14,8 +14,9 @@
         // Invalid parameters
         DS_ERR_INVALID_ALPHABET = 0x2000,
         DS_ERR_INVALID_SHAPE = 0x2001,
-        DS_ERR_INVALID_LM = 0x2002,
+        DS_ERR_INVALID_SCORER = 0x2002,
         DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
+        DS_ERR_SCORER_NOT_ENABLED = 0x2004,

         // Runtime failures
         DS_ERR_FAIL_INIT_MMAP = 0x3000,

@@ -21,18 +21,26 @@ namespace DeepSpeechClient.Interfaces
         unsafe int GetModelSampleRate();

         /// <summary>
-        /// Enable decoding using beam scoring with a KenLM language model.
+        /// Enable decoding using an external scorer.
         /// </summary>
-        /// <param name="aLMPath">The path to the language model binary file.</param>
-        /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
-        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
-        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
-        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
-        unsafe void EnableDecoderWithLM(string aLMPath,
-            string aTriePath,
-            float aLMAlpha,
-            float aLMBeta);
+        /// <param name="aScorerPath">The path to the external scorer file.</param>
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+        /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+        unsafe void EnableExternalScorer(string aScorerPath);
+
+        /// <summary>
+        /// Disable decoding using an external scorer.
+        /// </summary>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        unsafe void DisableExternalScorer();
+
+        /// <summary>
+        /// Set hyperparameters alpha and beta of the external scorer.
+        /// </summary>
+        /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+        /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta);

         /// <summary>
         /// Use the DeepSpeech model to perform Speech-To-Text.

@@ -23,11 +23,16 @@ namespace DeepSpeechClient
         internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);

         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-        internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
-            string aLMPath,
-            string aTriePath,
-            float aLMAlpha,
-            float aLMBeta);
+        internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
+            string aScorerPath);
+
+        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+        internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
+
+        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+        internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
+            float aAlpha,
+            float aBeta);

         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
             CharSet = CharSet.Ansi, SetLastError = true)]

@@ -35,22 +35,18 @@ namespace CSharpExamples
         static void Main(string[] args)
         {
             string model = null;
-            string lm = null;
-            string trie = null;
+            string scorer = null;
             string audio = null;
             bool extended = false;
             if (args.Length > 0)
             {
                 model = GetArgument(args, "--model");
-                lm = GetArgument(args, "--lm");
-                trie = GetArgument(args, "--trie");
+                scorer = GetArgument(args, "--scorer");
                 audio = GetArgument(args, "--audio");
                 extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
             }

             const uint BEAM_WIDTH = 500;
-            const float LM_ALPHA = 0.75f;
-            const float LM_BETA = 1.85f;

             Stopwatch stopwatch = new Stopwatch();
             try

@@ -64,14 +60,10 @@ namespace CSharpExamples
                 Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
                 stopwatch.Reset();

-                if (lm != null)
+                if (scorer != null)
                 {
-                    Console.WriteLine("Loadin LM...");
-                    sttClient.EnableDecoderWithLM(
-                        lm ?? "lm.binary",
-                        trie ?? "trie",
-                        LM_ALPHA, LM_BETA);
+                    Console.WriteLine("Loading scorer...");
+                    sttClient.EnableExternalScorer(scorer ?? "kenlm.scorer");
                 }

                 string audioFile = audio ?? "arctic_a0024.wav";

@@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
     Button _startInference;

     final int BEAM_WIDTH = 50;
-    final float LM_ALPHA = 0.75f;
-    final float LM_BETA = 1.85f;

     private char readLEChar(RandomAccessFile f) throws IOException {
         byte b1 = f.readByte();

@@ -30,15 +30,11 @@ import java.nio.ByteBuffer;
 public class BasicTest {

     public static final String modelFile = "/data/local/tmp/test/output_graph.tflite";
-    public static final String lmFile = "/data/local/tmp/test/lm.binary";
-    public static final String trieFile = "/data/local/tmp/test/trie";
+    public static final String scorerFile = "/data/local/tmp/test/kenlm.scorer";
     public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";

     public static final int BEAM_WIDTH = 50;
-    public static final float LM_ALPHA = 0.75f;
-    public static final float LM_BETA = 1.85f;

     private char readLEChar(RandomAccessFile f) throws IOException {
         byte b1 = f.readByte();
         byte b2 = f.readByte();

@@ -130,7 +126,7 @@ public class BasicTest {
     @Test
     public void loadDeepSpeech_stt_withLM() {
         DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-        m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+        m.enableExternalScorer(scorerFile);

         String decoded = doSTT(m, false);
         assertEquals("she had your dark suit in greasy wash water all year", decoded);

@@ -149,7 +145,7 @@ public class BasicTest {
     @Test
     public void loadDeepSpeech_sttWithMetadata_withLM() {
         DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-        m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+        m.enableExternalScorer(scorerFile);

         String decoded = doSTT(m, true);
         assertEquals("she had your dark suit in greasy wash water all year", decoded);

@@ -47,17 +47,35 @@ public class DeepSpeechModel {
     }

     /**
-     * @brief Enable decoding using beam scoring with a KenLM language model.
+     * @brief Enable decoding using an external scorer.
      *
-     * @param lm The path to the language model binary file.
-     * @param trie The path to the trie file build from the same vocabulary as the language model binary.
-     * @param lm_alpha The alpha hyperparameter of the CTC decoder. Language Model weight.
-     * @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
+     * @param scorer The path to the external scorer file.
      *
      * @return Zero on success, non-zero on failure (invalid arguments).
      */
-    public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
-        impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
+    public void enableExternalScorer(String scorer) {
+        impl.EnableExternalScorer(this._msp, scorer);
+    }
+
+    /**
+     * @brief Disable decoding using an external scorer.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void disableExternalScorer() {
+        impl.DisableExternalScorer(this._msp);
+    }
+
+    /**
+     * @brief Enable decoding using beam scoring with a KenLM language model.
+     *
+     * @param alpha The alpha hyperparameter of the decoder. Language model weight.
+     * @param beta The beta hyperparameter of the decoder. Word insertion weight.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void setScorerAlphaBeta(float alpha, float beta) {
+        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
     }

     /*
/* /*

@@ -29,12 +29,11 @@ VersionAction.prototype.call = function(parser) {

 var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
 parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
-parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha)', defaultValue: 0.75, type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta)', defaultValue: 1.85, type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();

@@ -60,12 +59,16 @@ console.error('Loaded model in %ds.', totalTime(model_load_end));
 var desired_sample_rate = model.sampleRate();

-if (args['lm'] && args['trie']) {
-    console.error('Loading language model from files %s %s', args['lm'], args['trie']);
-    const lm_load_start = process.hrtime();
-    model.enableDecoderWithLM(args['lm'], args['trie'], args['lm_alpha'], args['lm_beta']);
-    const lm_load_end = process.hrtime(lm_load_start);
-    console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+if (args['scorer']) {
+    console.error('Loading scorer from file %s', args['scorer']);
+    const scorer_load_start = process.hrtime();
+    model.enableExternalScorer(args['scorer']);
+    const scorer_load_end = process.hrtime(scorer_load_start);
+    console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
+
+    if (args['lm_alpha'] && args['lm_beta']) {
+        model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
+    }
 }

 const buffer = Fs.readFileSync(args['audio']);

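Note the new semantics here: --lm_alpha and --lm_beta no longer carry defaults, so the values baked into the scorer package apply unless both flags are given. The same pattern in Python (flag names mirror the JS client; this is a sketch, not the shipped client):

    import argparse
    from deepspeech import Model

    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True)
    parser.add_argument('--scorer')
    parser.add_argument('--lm_alpha', type=float)  # optional override
    parser.add_argument('--lm_beta', type=float)   # optional override
    args = parser.parse_args()

    model = Model(args.model, 500)
    if args.scorer:
        model.enableExternalScorer(args.scorer)
        if args.lm_alpha is not None and args.lm_beta is not None:
            model.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
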
@@ -52,31 +52,46 @@ Model.prototype.sampleRate = function() {
 }

 /**
- * Enable decoding using beam scoring with a KenLM language model.
+ * Enable decoding using an external scorer.
+ *
+ * @param {string} aScorerPath The path to the external scorer file.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.enableExternalScorer = function(aScorerPath) {
+    return binding.EnableExternalScorer(this._impl, aScorerPath);
+}
+
+/**
+ * Disable decoding using an external scorer.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.disableExternalScorer = function() {
+    return binding.EnableExternalScorer(this._impl);
+}
+
+/**
+ * Set hyperparameters alpha and beta of the external scorer.
  *
- * @param {string} aLMPath The path to the language model binary file.
- * @param {string} aTriePath The path to the trie file build from the same vocabulary as the language model binary.
 * @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
 * @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
 *
 * @return {number} Zero on success, non-zero on failure (invalid arguments).
 */
-Model.prototype.enableDecoderWithLM = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.EnableDecoderWithLM.apply(null, args);
+Model.prototype.setScorerAlphaBeta = function(aLMAlpha, aLMBeta) {
+    return binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
 }

 /**
  * Use the DeepSpeech model to perform Speech-To-Text.
  *
  * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in the audio signal.
  *
  * @return {string} The STT result. Returns undefined on error.
 */
-Model.prototype.stt = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.SpeechToText.apply(null, args);
+Model.prototype.stt = function(aBuffer) {
+    return binding.SpeechToText(this._impl, aBuffer);
 }

 /**

@@ -84,25 +99,22 @@ Model.prototype.stt = function() {
  * about the results.
  *
  * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in the audio signal.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
-Model.prototype.sttWithMetadata = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.SpeechToTextWithMetadata.apply(null, args);
+Model.prototype.sttWithMetadata = function(aBuffer) {
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
 }

 /**
- * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
+ * Create a new streaming inference state. One can then call :js:func:`Stream.feedAudioContent` and :js:func:`Stream.finishStream` on the returned stream object.
 *
- * @return {object} an opaque object that represents the streaming state.
+ * @return {object} a :js:func:`Stream` object that represents the streaming state.
 *
 * @throws on error
 */
 Model.prototype.createStream = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    const rets = binding.CreateStream.apply(null, args);
+    const rets = binding.CreateStream(this._impl);
     const status = rets[0];
     const ctx = rets[1];
     if (status !== 0) {

@@ -111,55 +123,56 @@ Model.prototype.createStream = function() {
     return ctx;
 }

+function Stream(nativeStream) {
+    this._impl = nativeStream;
+}
+
 /**
  * Feed audio samples to an ongoing streaming inference.
  *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
 * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
 *                 appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in @param aBuffer.
 */
-Model.prototype.feedAudioContent = function() {
-    binding.FeedAudioContent.apply(null, arguments);
+Stream.prototype.feedAudioContent = function(aBuffer) {
+    binding.FeedAudioContent(this._impl, aBuffer);
 }

 /**
  * Compute the intermediate decoding of an ongoing streaming inference.
  *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
- *
 * @return {string} The STT intermediate result.
 */
-Model.prototype.intermediateDecode = function() {
-    return binding.IntermediateDecode.apply(null, arguments);
+Stream.prototype.intermediateDecode = function() {
+    return binding.IntermediateDecode(this._impl);
 }

 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
  *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
- *
 * @return {string} The STT result.
 *
- * This method will free the state (@param aSctx).
+ * This method will free the stream, it must not be used after this method is called.
 */
-Model.prototype.finishStream = function() {
-    return binding.FinishStream.apply(null, arguments);
+Stream.prototype.finishStream = function() {
+    result = binding.FinishStream(this._impl);
+    this._impl = null;
+    return result;
 }

 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
  *
- * @param {object} aSctx A streaming state pointer returned by :js:func:`Model.setupStream`.
- *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
- * This method will free the state pointer (@param aSctx).
+ * This method will free the stream, it must not be used after this method is called.
 */
-Model.prototype.finishStreamWithMetadata = function() {
-    return binding.FinishStreamWithMetadata.apply(null, arguments);
+Stream.prototype.finishStreamWithMetadata = function() {
+    result = binding.FinishStreamWithMetadata(this._impl);
+    this._impl = null;
+    return result;
 }

 /**
  * Frees associated resources and destroys model object.
 *

@@ -184,10 +197,10 @@ function FreeMetadata(metadata) {
 * can be used if you no longer need the result of an ongoing streaming
 * inference and don't want to perform a costly decode operation.
 *
- * @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
+ * @param {Object} stream A stream object returned by :js:func:`Model.createStream`.
 */
 function FreeStream(stream) {
-    return binding.FreeStream(stream);
+    return binding.FreeStream(stream._impl);
 }

 /**
@@ -21,7 +21,6 @@ import deepspeech

 # rename for backwards compatibility
 from deepspeech.impl import PrintVersions as printVersions
-from deepspeech.impl import FreeStream as freeStream

 class Model(object):
     """
@ -56,127 +55,159 @@ class Model(object):
""" """
return deepspeech.impl.GetModelSampleRate(self._impl) return deepspeech.impl.GetModelSampleRate(self._impl)
def enableDecoderWithLM(self, *args, **kwargs): def enableExternalScorer(self, scorer_path):
""" """
Enable decoding using beam scoring with a KenLM language model. Enable decoding using an external scorer.
:param aLMPath: The path to the language model binary file. :param scorer_path: The path to the external scorer file.
:type aLMPath: str :type scorer_path: str
:param aTriePath: The path to the trie file build from the same vocabulary as the language model binary. :return: Zero on success, non-zero on failure.
:type aTriePath: str
:param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
:type aLMAlpha: float
:param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
:type aLMBeta: float
:return: Zero on success, non-zero on failure (invalid arguments).
:type: int :type: int
""" """
return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs) return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)
def stt(self, *args, **kwargs): def disableExternalScorer(self):
"""
Disable decoding using an external scorer.
:return: Zero on success, non-zero on failure.
"""
return deepspeech.impl.DisableExternalScorer(self._impl)
def setScorerAlphaBeta(self, alpha, beta):
"""
Set hyperparameters alpha and beta of the external scorer.
:param alpha: The alpha hyperparameter of the decoder. Language model weight.
:type alpha: float
:param beta: The beta hyperparameter of the decoder. Word insertion weight.
:type beta: float
:return: Zero on success, non-zero on failure.
:type: int
"""
return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta)
def stt(self, audio_buffer):
""" """
Use the DeepSpeech model to perform Speech-To-Text. Use the DeepSpeech model to perform Speech-To-Text.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array :type audio_buffer: numpy.int16 array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:return: The STT result. :return: The STT result.
:type: str :type: str
""" """
return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs) return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
def sttWithMetadata(self, *args, **kwargs): def sttWithMetadata(self, audio_buffer):
""" """
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array :type audio_buffer: numpy.int16 array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:return: Outputs a struct of individual letters along with their timing information. :return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata` :type: :func:`Metadata`
""" """
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs) return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
def createStream(self): def createStream(self):
""" """
Create a new streaming inference state. The streaming state returned Create a new streaming inference state. The streaming state returned by
by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`. this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
:return: Object holding the stream :return: Stream object representing the newly created stream
:type: :func:`Stream`
:throws: RuntimeError on error :throws: RuntimeError on error
""" """
status, ctx = deepspeech.impl.CreateStream(self._impl) status, ctx = deepspeech.impl.CreateStream(self._impl)
if status != 0: if status != 0:
raise RuntimeError("CreateStream failed with error code {}".format(status)) raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx return Stream(ctx)
# pylint: disable=no-self-use
def feedAudioContent(self, *args, **kwargs): class Stream(object):
def __init__(self, native_stream):
self._impl = native_stream
def __del__(self):
if self._impl:
self.freeStream()
def feedAudioContent(self, audio_buffer):
""" """
Feed audio samples to an ongoing streaming inference. Feed audio samples to an ongoing streaming inference.
:param aSctx: A streaming state pointer returned by :func:`createStream()`. :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aSctx: object :type audio_buffer: numpy.int16 array
:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). :throws: RuntimeError if the stream object is not valid
:type aBuffer: int array
:param aBufferSize: The number of samples in @p aBuffer.
:type aBufferSize: int
""" """
deepspeech.impl.FeedAudioContent(*args, **kwargs) if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to feed an already finished stream?")
deepspeech.impl.FeedAudioContent(self._impl, audio_buffer)
# pylint: disable=no-self-use def intermediateDecode(self):
def intermediateDecode(self, *args, **kwargs):
""" """
Compute the intermediate decoding of an ongoing streaming inference. Compute the intermediate decoding of an ongoing streaming inference.
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
:return: The STT intermediate result. :return: The STT intermediate result.
:type: str :type: str
"""
return deepspeech.impl.IntermediateDecode(*args, **kwargs)
# pylint: disable=no-self-use :throws: RuntimeError if the stream object is not valid
def finishStream(self, *args, **kwargs):
""" """
Signal the end of an audio signal to an ongoing streaming if not self._impl:
inference, returns the STT result over the whole audio signal. raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecode(self._impl)
:param aSctx: A streaming state pointer returned by :func:`createStream()`. def finishStream(self):
:type aSctx: object """
Signal the end of an audio signal to an ongoing streaming inference
and return the STT result over the whole audio signal.
:return: The STT result. :return: The STT result.
:type: str :type: str
"""
return deepspeech.impl.FinishStream(*args, **kwargs)
# pylint: disable=no-self-use :throws: RuntimeError if the stream object is not valid
def finishStreamWithMetadata(self, *args, **kwargs):
""" """
Signal the end of an audio signal to an ongoing streaming if not self._impl:
inference, returns per-letter metadata. raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStream(self._impl)
self._impl = None
return result
:param aSctx: A streaming state pointer returned by :func:`createStream()`. def finishStreamWithMetadata(self):
:type aSctx: object """
Signal the end of an audio signal to an ongoing streaming inference
and return per-letter metadata.
:return: Outputs a struct of individual letters along with their timing information. :return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata` :type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
""" """
return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs) if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
self._impl = None
return result
def freeStream(self):
"""
Destroy a streaming state without decoding the computed logits. This can
be used if you no longer need the result of an ongoing streaming inference.
:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to free an already finished stream?")
deepspeech.impl.FreeStream(self._impl)
self._impl = None
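The streaming functions thus move off the Model class onto a Stream object that tracks its own validity: feeding, decoding or finishing an already-finished stream raises RuntimeError, and finishStream(), finishStreamWithMetadata() and freeStream() each invalidate the handle. A sketch of the new lifecycle, with illustrative chunking:

import numpy as np

def transcribe_stream(ds, audio, n_chunks=10):
    # ds is a deepspeech.Model; audio is a numpy.int16 buffer
    stream = ds.createStream()
    for chunk in np.array_split(audio, n_chunks):
        stream.feedAudioContent(chunk)
        print(stream.intermediateDecode())  # partial result; the stream stays open
    return stream.finishStream()            # final result; the stream is now invalid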
# This is only for documentation purposes # This is only for documentation purposes
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h # Metadata and MetadataItem should be in sync with native_client/deepspeech.h
@@ -189,22 +220,18 @@ class MetadataItem(object):
""" """
The character generated for transcription The character generated for transcription
""" """
# pylint: disable=unnecessary-pass
pass
def timestep(self): def timestep(self):
""" """
Position of the character in units of 20ms Position of the character in units of 20ms
""" """
# pylint: disable=unnecessary-pass
pass
def start_time(self): def start_time(self):
""" """
Position of the character in seconds Position of the character in seconds
""" """
# pylint: disable=unnecessary-pass
pass
class Metadata(object): class Metadata(object):
@@ -218,8 +245,7 @@ class Metadata(object):
:return: A list of :func:`MetadataItem` elements :return: A list of :func:`MetadataItem` elements
:type: list :type: list
""" """
# pylint: disable=unnecessary-pass
pass
def num_items(self): def num_items(self):
""" """
@@ -228,8 +254,7 @@ class Metadata(object):
:return: Size of the list of items :return: Size of the list of items
:type: int :type: int
""" """
# pylint: disable=unnecessary-pass
pass
def confidence(self): def confidence(self):
""" """
@@ -237,5 +262,4 @@ class Metadata(object):
sum of the acoustic model logit values for each timestep/character that sum of the acoustic model logit values for each timestep/character that
contributed to the creation of this transcription. contributed to the creation of this transcription.
""" """
# pylint: disable=unnecessary-pass
pass
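Because Metadata and MetadataItem mirror the structs in native_client/deepspeech.h, a consumer can recover per-letter timing from either sttWithMetadata() or finishStreamWithMetadata(). A hedged sketch of traversing the result, using the accessor names documented above:

def letter_timings(metadata):
    # metadata comes from sttWithMetadata() or finishStreamWithMetadata();
    # timestep() is in 20 ms units, start_time() in seconds, and
    # confidence() scores the transcription as a whole.
    return [(item.character(), item.start_time()) for item in metadata.items()]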

View File

@@ -88,17 +88,15 @@ def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.') parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True, parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)') help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?', parser.add_argument('--scorer', required=False,
help='Path to the language model binary file') help='Path to the external scorer file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--audio', required=True, parser.add_argument('--audio', required=True,
help='Path to the audio file to run (WAV format)') help='Path to the audio file to run (WAV format)')
parser.add_argument('--beam_width', type=int, default=500, parser.add_argument('--beam_width', type=int, default=500,
help='Beam width for the CTC decoder') help='Beam width for the CTC decoder')
parser.add_argument('--lm_alpha', type=float, default=0.75, parser.add_argument('--lm_alpha', type=float,
help='Language model weight (lm_alpha)') help='Language model weight (lm_alpha)')
parser.add_argument('--lm_beta', type=float, default=1.85, parser.add_argument('--lm_beta', type=float,
help='Word insertion bonus (lm_beta)') help='Word insertion bonus (lm_beta)')
parser.add_argument('--version', action=VersionAction, parser.add_argument('--version', action=VersionAction,
help='Print version and exits') help='Print version and exits')
@@ -116,12 +114,15 @@ def main():
desired_sample_rate = ds.sampleRate() desired_sample_rate = ds.sampleRate()
if args.lm and args.trie: if args.scorer:
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr) print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
lm_load_start = timer() scorer_load_start = timer()
ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta) ds.enableExternalScorer(args.scorer)
lm_load_end = timer() - lm_load_start scorer_load_end = timer() - scorer_load_start
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr) print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
if args.lm_alpha is not None and args.lm_beta is not None:
ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
fin = wave.open(args.audio, 'rb') fin = wave.open(args.audio, 'rb')
fs = fin.getframerate() fs = fin.getframerate()

View File

@@ -14,21 +14,13 @@ from deepspeech import Model
# Beam width used in the CTC decoder when building candidate transcriptions # Beam width used in the CTC decoder when building candidate transcriptions
BEAM_WIDTH = 500 BEAM_WIDTH = 500
# The alpha hyperparameter of the CTC decoder. Language Model weight
LM_ALPHA = 0.75
# The beta hyperparameter of the CTC decoder. Word insertion bonus.
LM_BETA = 1.85
def main(): def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.') parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True, parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)') help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?', parser.add_argument('--scorer', nargs='?',
help='Path to the language model binary file') help='Path to the external scorer file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--audio1', required=True, parser.add_argument('--audio1', required=True,
help='First audio file to use in interleaved streams') help='First audio file to use in interleaved streams')
parser.add_argument('--audio2', required=True, parser.add_argument('--audio2', required=True,
@@ -37,8 +29,8 @@ def main():
ds = Model(args.model, BEAM_WIDTH) ds = Model(args.model, BEAM_WIDTH)
if args.lm and args.trie: if args.scorer:
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA) ds.enableExternalScorer(args.scorer)
fin = wave.open(args.audio1, 'rb') fin = wave.open(args.audio1, 'rb')
fs1 = fin.getframerate() fs1 = fin.getframerate()
@@ -57,11 +49,11 @@ def main():
splits2 = np.array_split(audio2, 10) splits2 = np.array_split(audio2, 10)
for part1, part2 in zip(splits1, splits2): for part1, part2 in zip(splits1, splits2):
ds.feedAudioContent(stream1, part1) stream1.feedAudioContent(part1)
ds.feedAudioContent(stream2, part2) stream2.feedAudioContent(part2)
print(ds.finishStream(stream1)) print(stream1.finishStream())
print(ds.finishStream(stream2)) print(stream2.finishStream())
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS=" BAZEL_TARGETS="
//native_client:libdeepspeech.so //native_client:libdeepspeech.so
//native_client:generate_trie
" "
BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}" BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}"

View File

@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS=" BAZEL_TARGETS="
//native_client:libdeepspeech.so //native_client:libdeepspeech.so
//native_client:generate_trie
" "
BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}" BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}"

View File

@@ -30,11 +30,11 @@ then:
image: ${build.docker_image} image: ${build.docker_image}
env: env:
DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.15/models.tar.gz" DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.1/models.tar.gz"
DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz" DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
PIP_DEFAULT_TIMEOUT: "60" PIP_DEFAULT_TIMEOUT: "60"
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master" EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
command: command:
- "/bin/bash" - "/bin/bash"

View File

@@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS=" BAZEL_TARGETS="
//native_client:libdeepspeech.so //native_client:libdeepspeech.so
//native_client:generate_trie
" "
if [ "${runtime}" = "tflite" ]; then if [ "${runtime}" = "tflite" ]; then

View File

@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS=" BAZEL_TARGETS="
//native_client:libdeepspeech.so //native_client:libdeepspeech.so
//native_client:generate_trie
" "
BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}" BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}"

View File

@@ -49,7 +49,7 @@ deepspeech --version
pushd ${HOME}/DeepSpeech/ds/ pushd ${HOME}/DeepSpeech/ds/
python bin/import_ldc93s1.py data/smoke_test python bin/import_ldc93s1.py data/smoke_test
python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --lm data/smoke_test/vocab.pruned.lm --trie data/smoke_test/vocab.trie --csv data/smoke_test/ldc93s1.csv python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --scorer data/smoke_test/pruned_lm.scorer --csv data/smoke_test/ldc93s1.csv
popd popd
virtualenv_deactivate "${pyalias}" "${PYENV_NAME}" virtualenv_deactivate "${pyalias}" "${PYENV_NAME}"

View File

@@ -378,7 +378,7 @@ run_netframework_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
} }
@@ -401,7 +401,7 @@ run_electronjs_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
} }
@@ -427,7 +427,7 @@ run_basic_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status"
@@ -444,7 +444,7 @@ run_all_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status"
set +e set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status"
@@ -457,7 +457,7 @@ run_all_inference_tests()
assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}"
set +e set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi; fi;
@@ -470,8 +470,7 @@ run_prod_concurrent_stream_tests()
set +e set +e
output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \ output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \
--model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \ --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \
--lm ${TASKCLUSTER_TMP_DIR}/lm.binary \ --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
--trie ${TASKCLUSTER_TMP_DIR}/trie \
--audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \ --audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \
--audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) --audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
@@ -489,19 +488,19 @@ run_prod_inference_tests()
local _bitrate=$1 local _bitrate=$1
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@@ -509,7 +508,7 @@ run_prod_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling # Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi; fi;
@@ -520,19 +519,19 @@ run_prodtflite_inference_tests()
local _bitrate=$1 local _bitrate=$1
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}" assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@@ -540,7 +539,7 @@ run_prodtflite_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling # Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi; fi;
@@ -555,7 +554,7 @@ run_multi_inference_tests()
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status" assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status"
set +e -o pipefail set +e -o pipefail
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%') multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
status=$? status=$?
set -e +o pipefail set -e +o pipefail
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status" assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status"
@@ -564,7 +563,7 @@ run_cpp_only_inference_tests()
run_cpp_only_inference_tests() run_cpp_only_inference_tests()
{ {
set +e set +e
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1) phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status"
@@ -669,8 +668,7 @@ download_data()
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}" ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}"
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}" ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/ cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie
cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
} }
@@ -1562,7 +1560,6 @@ package_native_client()
fi; fi;
${TAR} -cf - \ ${TAR} -cf - \
-C ${tensorflow_dir}/bazel-bin/native_client/ generate_trie${PLATFORM_EXE_SUFFIX} \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \
-C ${deepspeech_dir}/ LICENSE \ -C ${deepspeech_dir}/ LICENSE \
@@ -1767,8 +1764,7 @@ android_setup_apk_data()
adb push \ adb push \
${TASKCLUSTER_TMP_DIR}/${model_name} \ ${TASKCLUSTER_TMP_DIR}/${model_name} \
${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \ ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \
${TASKCLUSTER_TMP_DIR}/lm.binary \ ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
${TASKCLUSTER_TMP_DIR}/trie \
${ANDROID_TMP_DIR}/test/ ${ANDROID_TMP_DIR}/test/
} }

View File

@@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS=" BAZEL_TARGETS="
//native_client:libdeepspeech.so //native_client:libdeepspeech.so
//native_client:generate_trie
" "
if [ "${package_option}" = "--cuda" ]; then if [ "${package_option}" = "--cuda" ]; then

View File

@@ -44,7 +44,7 @@ payload:
MSYS: 'winsymlinks:nativestrict' MSYS: 'winsymlinks:nativestrict'
TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow} TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master" EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
command: command:
- >- - >-

View File

@@ -29,7 +29,7 @@ def fail(message, code=1):
def transcribe_file(audio_path, tlog_path): def transcribe_file(audio_path, tlog_path):
from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel
initialize_globals() initialize_globals()
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet) scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
try: try:
num_processes = cpu_count() num_processes = cpu_count()
except NotImplementedError: except NotImplementedError:
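The Scorer constructor correspondingly drops the separate trie argument: one scorer package now carries what lm.binary and the trie used to. A sketch of the updated construction and its use with the decoder, reusing FLAGS, Config and logits as they appear in the surrounding training code:

from ds_ctcdecoder import Scorer, ctc_beam_search_decoder

scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
                                  scorer=scorer)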

View File

@@ -143,10 +143,8 @@ def create_flags():
f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.') f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.') f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM') f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py')
f.DEFINE_alias('lm', 'lm_binary_path') f.DEFINE_alias('scorer', 'scorer_path')
f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
f.DEFINE_alias('trie', 'lm_trie_path')
f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions') f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.') f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.') f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.')