Update all API consumers

commit 1e2eb96248 (parent 708b21a63e)
@@ -882,8 +882,7 @@ def package_zip():
             }
         }, f)
 
-    shutil.copy(FLAGS.lm_binary_path, export_dir)
-    shutil.copy(FLAGS.lm_trie_path, export_dir)
+    shutil.copy(FLAGS.scorer_path, export_dir)
 
     archive = shutil.make_archive(zip_filename, 'zip', export_dir)
     log_info('Exported packaged model {}'.format(archive))
@@ -926,10 +925,9 @@ def do_single_file_inference(input_file_path):
 
         logits = np.squeeze(logits)
 
-        if FLAGS.lm_binary_path:
+        if FLAGS.scorer_path:
             scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                            FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                            Config.alphabet)
+                            FLAGS.scorer_path, Config.alphabet)
         else:
             scorer = None
         decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
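Note: the training-side decoder now consumes a single scorer package in place of the old lm.binary/trie pair. A minimal sketch of the new call shape, assuming the ds_ctcdecoder package used above; the paths, alpha/beta values, and the alphabet/logits objects are placeholders:

    from ds_ctcdecoder import Scorer, ctc_beam_search_decoder

    # One .scorer package replaces the separate LM binary and trie files.
    scorer = Scorer(alpha=0.75, beta=1.85,
                    model_path='kenlm.scorer', alphabet=alphabet)
    decoded = ctc_beam_search_decoder(logits, alphabet, 500, scorer=scorer)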
@@ -172,7 +172,7 @@ RUN ./configure
 
 
 # Build DeepSpeech
-RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
 
 ###
 ### Using TensorFlow upstream should work
@@ -187,8 +187,7 @@ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_s
 # RUN pip3 install /tmp/tensorflow_pkg/*.whl
 
 # Copy built libs to /DeepSpeech/native_client
-RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
-    && cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
+RUN cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
 
 # Install TensorFlow
 WORKDIR /DeepSpeech/
@@ -21,8 +21,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs 1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log
 
 if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then
   echo "Did not resume training from checkpoint"
@@ -25,6 +25,5 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs $epoch_count \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate}
@@ -21,12 +21,10 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --n_hidden 100 --epochs 1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
   --learning_rate 0.001 --dropout_rate 0.05 \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie'
+  --scorer_path 'data/smoke_test/pruned_lm.scorer'
 
 python -u DeepSpeech.py \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --one_shot_infer 'data/smoke_test/LDC93S1.wav'
@@ -20,8 +20,7 @@ python -u DeepSpeech.py --noshow_progressbar \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
   --export_dir '/tmp/train_tflite' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate} \
   --export_tflite
 
@@ -31,8 +30,7 @@ python -u DeepSpeech.py --noshow_progressbar \
   --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
   --export_dir '/tmp/train_tflite/en-us' \
-  --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
   --audio_sample_rate ${audio_sample_rate} \
   --export_language 'Fake English (fk-FK)' \
   --export_zip
@@ -50,7 +50,7 @@ def create_bundle(alphabet_path, lm_path, vocab_path, package_path, force_utf8,
     scorer.set_alphabet(alphabet)
     scorer.set_utf8_mode(use_utf8)
     scorer.reset_params(default_alpha, default_beta)
-    scorer.load_lm(lm_path, "")
+    scorer.load_lm(lm_path)
     scorer.fill_dictionary(list(words))
     shutil.copy(lm_path, package_path)
     scorer.save_dictionary(package_path, True)  # append, not overwrite
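Note: for orientation, the packaging sequence above as one hedged sketch. It simply mirrors the calls in create_bundle(); the alphabet object, word list, and file names are placeholders:

    # Sketch of the packaging flow shown in the hunk above.
    scorer = Scorer()                         # bare instance from ds_ctcdecoder
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(False)
    scorer.reset_params(0.75, 1.85)           # defaults baked into the package
    scorer.load_lm('lm.binary')               # single-argument form, no trie
    scorer.fill_dictionary(list(words))
    shutil.copy('lm.binary', 'kenlm.scorer')  # LM first ...
    scorer.save_dictionary('kenlm.scorer', True)  # ... then append dictionary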
@@ -7,7 +7,13 @@ C
 .. doxygenfunction:: DS_FreeModel
    :project: deepspeech-c
 
-.. doxygenfunction:: DS_EnableDecoderWithLM
+.. doxygenfunction:: DS_EnableExternalScorer
+   :project: deepspeech-c
+
+.. doxygenfunction:: DS_DisableExternalScorer
+   :project: deepspeech-c
+
+.. doxygenfunction:: DS_SetScorerAlphaBeta
    :project: deepspeech-c
 
 .. doxygenfunction:: DS_GetModelSampleRate
@@ -42,10 +42,9 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
 
 
 def evaluate(test_csvs, create_model, try_loading):
-    if FLAGS.lm_binary_path:
+    if FLAGS.scorer_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
-                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
-                        Config.alphabet)
+                        FLAGS.scorer_path, Config.alphabet)
     else:
         scorer = None
 
@@ -27,17 +27,18 @@ This module should be self-contained:
   - pip install native_client/python/dist/deepspeech*.whl
   - pip install -r requirements_eval_tflite.txt
 
-Then run with a TF Lite model, LM/trie and a CSV test file
+Then run with a TF Lite model, LM and a CSV test file
 '''
 
 BEAM_WIDTH = 500
 LM_ALPHA = 0.75
 LM_BETA = 1.85
 
-def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
+def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
     os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
     ds = Model(model, BEAM_WIDTH)
-    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
+    ds.enableExternalScorer(scorer)
+    ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
 
     while True:
         try:
@@ -64,7 +65,7 @@ def main(args, _):
 
     processes = []
     for i in range(args.proc):
-        worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
+        worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
         worker_process.start()  # Launch reader() as a separate python process
         processes.append(worker_process)
 
@@ -113,10 +114,8 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
     parser.add_argument('--model', required=True,
                         help='Path to the model (protocol buffer binary file)')
-    parser.add_argument('--lm', required=True,
-                        help='Path to the language model binary file')
-    parser.add_argument('--trie', required=True,
-                        help='Path to the language model trie file created with native_client/generate_trie')
+    parser.add_argument('--scorer', required=True,
+                        help='Path to the external scorer file')
     parser.add_argument('--csv', required=True,
                         help='Path to the CSV source file')
    parser.add_argument('--proc', required=False, default=cpu_count(), type=int,
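Note: the per-worker setup above is the whole client-side migration in miniature. A hedged sketch using the deepspeech Python package as this script does; paths and the audio buffer are placeholders, and alpha/beta are optional because the scorer package carries its own defaults:

    from deepspeech import Model

    ds = Model('output_graph.tflite', 500)    # model path, beam width
    ds.enableExternalScorer('kenlm.scorer')   # was enableDecoderWithLM(lm, trie, a, b)
    ds.setScorerAlphaBeta(0.75, 1.85)         # optional override of packaged defaults
    text = ds.stt(audio_buffer)               # 16-bit mono samples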
@@ -12,19 +12,17 @@
 
 char* model = NULL;
 
-char* lm = NULL;
+char* scorer = NULL;
 
-char* trie = NULL;
-
 char* audio = NULL;
 
 int beam_width = 500;
 
-float lm_alpha = 0.75f;
+bool set_alphabeta = false;
 
-float lm_beta = 1.85f;
+float lm_alpha = 0.f;
 
-bool load_without_trie = false;
+float lm_beta = 0.f;
 
 bool show_times = false;
 
@@ -39,39 +37,36 @@ int stream_size = 0;
 void PrintHelp(const char* bin)
 {
     std::cout <<
-    "Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
+    "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n"
     "\n"
     "Running DeepSpeech inference.\n"
    "\n"
-    " --model MODEL Path to the model (protocol buffer binary file)\n"
-    " --lm LM Path to the language model binary file\n"
-    " --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
-    " --audio AUDIO Path to the audio file to run (WAV format)\n"
-    " --beam_width BEAM_WIDTH Value for decoder beam width (int)\n"
-    " --lm_alpha LM_ALPHA Value for language model alpha param (float)\n"
-    " --lm_beta LM_BETA Value for language model beta param (float)\n"
-    " -t Run in benchmark mode, output mfcc & inference time\n"
-    " --extended Output string from extended metadata\n"
-    " --json Extended output, shows word timings as JSON\n"
-    " --stream size Run in stream mode, output intermediate results\n"
-    " --help Show help\n"
-    " --version Print version and exits\n";
+    "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
+    "\t--scorer SCORER\t\tPath to the external scorer file\n"
+    "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
+    "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
+    "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
+    "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
+    "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
+    "\t--extended\t\tOutput string from extended metadata\n"
+    "\t--json\t\t\tExtended output, shows word timings as JSON\n"
+    "\t--stream size\t\tRun in stream mode, output intermediate results\n"
+    "\t--help\t\t\tShow help\n"
+    "\t--version\t\tPrint version and exits\n";
 
     DS_PrintVersions();
     exit(1);
 }
 
 bool ProcessArgs(int argc, char** argv)
 {
-    const char* const short_opts = "m:a:l:r:w:c:d:b:tehv";
+    const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
     const option long_opts[] = {
         {"model", required_argument, nullptr, 'm'},
-        {"lm", required_argument, nullptr, 'l'},
-        {"trie", required_argument, nullptr, 'r'},
+        {"scorer", required_argument, nullptr, 'l'},
         {"audio", required_argument, nullptr, 'w'},
         {"beam_width", required_argument, nullptr, 'b'},
         {"lm_alpha", required_argument, nullptr, 'c'},
         {"lm_beta", required_argument, nullptr, 'd'},
-        {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
         {"t", no_argument, nullptr, 't'},
         {"extended", no_argument, nullptr, 'e'},
         {"json", no_argument, nullptr, 'j'},
@@ -95,11 +90,7 @@ bool ProcessArgs(int argc, char** argv)
             break;
 
         case 'l':
-            lm = optarg;
-            break;
-
-        case 'r':
-            trie = optarg;
+            scorer = optarg;
             break;
 
         case 'w':
@@ -111,17 +102,15 @@ bool ProcessArgs(int argc, char** argv)
             break;
 
         case 'c':
+            set_alphabeta = true;
             lm_alpha = atof(optarg);
             break;
 
         case 'd':
+            set_alphabeta = true;
             lm_beta = atof(optarg);
             break;
 
-        case 999:
-            load_without_trie = true;
-            break;
-
         case 't':
             show_times = true;
             break;
@@ -374,16 +374,19 @@ main(int argc, char **argv)
     return 1;
   }
 
-  if (lm && (trie || load_without_trie)) {
-    int status = DS_EnableDecoderWithLM(ctx,
-                                        lm,
-                                        trie,
-                                        lm_alpha,
-                                        lm_beta);
+  if (scorer) {
+    int status = DS_EnableExternalScorer(ctx, scorer);
     if (status != 0) {
-      fprintf(stderr, "Could not enable CTC decoder with LM.\n");
+      fprintf(stderr, "Could not enable external scorer.\n");
       return 1;
     }
+    if (set_alphabeta) {
+      status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta);
+      if (status != 0) {
+        fprintf(stderr, "Error setting scorer alpha and beta.\n");
+        return 1;
+      }
+    }
   }
 
 #ifndef NO_SOX
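Note: in the C client the five-argument LM/trie setup collapses into one call that loads the packaged scorer, with alpha/beta split out into an optional override guarded by set_alphabeta. The same before/after through the Python bindings, as a hedged sketch (values and paths are placeholders):

    # Before (0.6-era API):
    #   model.enableDecoderWithLM(lm_path, trie_path, 0.75, 1.85)
    # After (this commit):
    model.enableExternalScorer('kenlm.scorer')
    model.setScorerAlphaBeta(0.75, 1.85)  # only when overriding packaged defaults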
@@ -12,12 +12,11 @@ class Scorer(swigwrapper.Scorer):
     :type alpha: float
     :param beta: Word insertion bonus.
     :type beta: float
-    :model_path: Path to load language model.
-    :trie_path: Path to trie file.
+    :model_path: Path to load scorer.
     :alphabet: Alphabet
     :type model_path: basestring
     """
-    def __init__(self, alpha=None, beta=None, model_path=None, trie_path=None, alphabet=None):
+    def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
         super(Scorer, self).__init__()
         # Allow bare initialization
         if alphabet:
@@ -27,15 +26,15 @@ class Scorer(swigwrapper.Scorer):
             if err != 0:
                 raise ValueError("Error when deserializing alphabet.")
 
-            err = self.init(alpha, beta,
-                            model_path.encode('utf-8'),
-                            trie_path.encode('utf-8'),
+            err = self.init(model_path.encode('utf-8'),
                             native_alphabet)
             if err != 0:
                 raise ValueError("Scorer initialization failed with error code {}".format(err), err)
 
-    def load_lm(self, lm_path, trie_path):
-        super(Scorer, self).load_lm(lm_path.encode('utf-8'), trie_path.encode('utf-8'))
+            self.reset_params(alpha, beta)
+
+    def load_lm(self, lm_path):
+        super(Scorer, self).load_lm(lm_path.encode('utf-8'))
 
     def save_dictionary(self, save_path, *args, **kwargs):
         super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs)
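Note: the wrapper now performs the full load inside init() and applies alpha/beta afterwards via reset_params(), so the two construction styles look like this hedged sketch (the alphabet object and file paths are placeholders):

    from ds_ctcdecoder import Scorer

    # Full initialization: load the package and set alpha/beta in one go.
    scorer = Scorer(alpha=0.75, beta=1.85,
                    model_path='kenlm.scorer', alphabet=alphabet)

    # Bare initialization, as the packaging tool uses: load an LM separately.
    bare = Scorer()
    bare.load_lm('lm.binary')  # single-argument form; the trie path is gone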
@@ -6,7 +6,6 @@
 #include <unordered_map>
 #include <vector>
 
-#include "lm/enumerate_vocab.hh"
 #include "lm/virtual_interface.hh"
 #include "lm/word_index.hh"
 #include "util/string_piece.hh"
@@ -19,18 +18,6 @@ const std::string START_TOKEN = "<s>";
 const std::string UNK_TOKEN = "<unk>";
 const std::string END_TOKEN = "</s>";
 
-// Implement a callback to retrieve the dictionary of language model.
-class RetrieveStrEnumerateVocab : public lm::EnumerateVocab {
-public:
-  RetrieveStrEnumerateVocab() {}
-
-  void Add(lm::WordIndex index, const StringPiece &str) {
-    vocabulary.push_back(std::string(str.data(), str.length()));
-  }
-
-  std::vector<std::string> vocabulary;
-};
-
 /* External scorer to query score for n-gram or sentence, including language
  * model scoring and word insertion.
  *
@@ -310,7 +310,7 @@ DS_EnableExternalScorer(ModelState* aCtx,
   aCtx->scorer_.reset(new Scorer());
   int err = aCtx->scorer_->init(aScorerPath, aCtx->alphabet_);
   if (err != 0) {
-    return DS_ERR_INVALID_LM;
+    return DS_ERR_INVALID_SCORER;
   }
   return DS_ERR_OK;
 }
@@ -59,7 +59,7 @@ enum DeepSpeech_Error_Codes
   // Invalid parameters
   DS_ERR_INVALID_ALPHABET = 0x2000,
   DS_ERR_INVALID_SHAPE = 0x2001,
-  DS_ERR_INVALID_LM = 0x2002,
+  DS_ERR_INVALID_SCORER = 0x2002,
   DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
   DS_ERR_SCORER_NOT_ENABLED = 0x2004,
 
@@ -129,7 +129,7 @@ DEEPSPEECH_EXPORT
 int DS_DisableExternalScorer(ModelState* aCtx);
 
 /**
- * @brief Set hyperparameters alpha and beta of a KenLM external scorer.
+ * @brief Set hyperparameters alpha and beta of the external scorer.
  *
  * @param aCtx The ModelState pointer for the model being changed.
 * @param aAlpha The alpha hyperparameter of the decoder. Language model weight.
@@ -1,141 +0,0 @@
-#ifndef DEEPSPEECH_COMPAT_H
-#define DEEPSPEECH_COMPAT_H
-
-#include "deepspeech.h"
-
-#warning This header is a convenience wrapper for compatibility with \
-the previous API, it has deprecated function names and arguments. \
-If possible, update your code instead of using this header.
-
-/**
- * @brief An object providing an interface to a trained DeepSpeech model.
- *
- * @param aModelPath The path to the frozen model graph.
- * @param aNCep UNUSED, DEPRECATED.
- * @param aNContext UNUSED, DEPRECATED.
- * @param aAlphabetConfigPath UNUSED, DEPRECATED.
- * @param aBeamWidth The beam width used by the decoder. A larger beam
- *                   width generates better results at the cost of decoding
- *                   time.
- * @param[out] retval a ModelState pointer
- *
- * @return Zero on success, non-zero on failure.
- */
-int DS_CreateModel(const char* aModelPath,
-                   unsigned int /*aNCep*/,
-                   unsigned int /*aNContext*/,
-                   const char* /*aAlphabetConfigPath*/,
-                   unsigned int aBeamWidth,
-                   ModelState** retval)
-{
-  return DS_CreateModel(aModelPath, aBeamWidth, retval);
-}
-
-/**
- * @brief Frees associated resources and destroys model object.
- */
-void DS_DestroyModel(ModelState* ctx)
-{
-  return DS_FreeModel(ctx);
-}
-
-/**
- * @brief Enable decoding using beam scoring with a KenLM language model.
- *
- * @param aCtx The ModelState pointer for the model being changed.
- * @param aAlphabetConfigPath UNUSED, DEPRECATED.
- * @param aLMPath The path to the language model binary file.
- * @param aTriePath The path to the trie file build from the same vocabu-
- *                  lary as the language model binary.
- * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
-                   weight.
- * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
-                  weight.
- *
- * @return Zero on success, non-zero on failure (invalid arguments).
- */
-int DS_EnableDecoderWithLM(ModelState* aCtx,
-                           const char* /*aAlphabetConfigPath*/,
-                           const char* aLMPath,
-                           const char* aTriePath,
-                           float aLMAlpha,
-                           float aLMBeta)
-{
-  return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
-}
-
-/**
- * @brief Create a new streaming inference state. The streaming state returned
- *        by this function can then be passed to {@link DS_FeedAudioContent()}
- *        and {@link DS_FinishStream()}.
- *
- * @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate UNUSED, DEPRECATED.
- * @param[out] retval an opaque pointer that represents the streaming state. Can
- *                    be NULL if an error occurs.
- *
- * @return Zero for success, non-zero on failure.
- */
-int DS_SetupStream(ModelState* aCtx,
-                   unsigned int /*aSampleRate*/,
-                   StreamingState** retval)
-{
-  return DS_CreateStream(aCtx, retval);
-}
-
-/**
- * @brief Destroy a streaming state without decoding the computed logits. This
- *        can be used if you no longer need the result of an ongoing streaming
- *        inference and don't want to perform a costly decode operation.
- *
- * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
- *
- * @note This method will free the state pointer (@p aSctx).
- */
-void DS_DiscardStream(StreamingState* aSctx)
-{
-  return DS_FreeStream(aSctx);
-}
-
-/**
- * @brief Use the DeepSpeech model to perform Speech-To-Text.
- *
- * @param aCtx The ModelState pointer for the model to use.
- * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate (matching what the model was trained on).
- * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate UNUSED, DEPRECATED.
- *
- * @return The STT result. The user is responsible for freeing the string using
- *         {@link DS_FreeString()}. Returns NULL on error.
- */
-char* DS_SpeechToText(ModelState* aCtx,
-                      const short* aBuffer,
-                      unsigned int aBufferSize,
-                      unsigned int /*aSampleRate*/)
-{
-  return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
-}
-
-/**
- * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
- *        about the results.
- *
- * @param aCtx The ModelState pointer for the model to use.
- * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate (matching what the model was trained on).
- * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate UNUSED, DEPRECATED.
- *
- * @return Outputs a struct of individual letters along with their timing information.
- *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
- */
-Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                      const short* aBuffer,
-                                      unsigned int aBufferSize,
-                                      unsigned int /*aSampleRate*/)
-{
-  return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
-}
-
-#endif /* DEEPSPEECH_COMPAT_H */
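Note: the deleted compatibility header doubles as a migration table: DS_CreateModel drops its three unused arguments, DS_EnableDecoderWithLM(lm, trie, alpha, beta) becomes DS_EnableExternalScorer(path) plus an optional DS_SetScorerAlphaBeta, DS_SetupStream/DS_DiscardStream become DS_CreateStream/DS_FreeStream, and the STT calls lose their deprecated sample-rate argument. A hedged Python-side equivalent of the same renames (placeholder paths):

    from deepspeech import Model

    # DS_CreateModel(path, beam_width, &state) -- no aNCep/aNContext/alphabet args
    model = Model('output_graph.pbmm', 500)
    # DS_EnableExternalScorer(state, path) -- was DS_EnableDecoderWithLM(...)
    model.enableExternalScorer('kenlm.scorer')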
@@ -82,8 +82,8 @@ namespace DeepSpeechClient
                     throw new ArgumentException("Invalid alphabet embedded in model. (Data corruption?)");
                 case ErrorCodes.DS_ERR_INVALID_SHAPE:
                     throw new ArgumentException("Invalid model shape.");
-                case ErrorCodes.DS_ERR_INVALID_LM:
-                    throw new ArgumentException("Invalid language model file.");
+                case ErrorCodes.DS_ERR_INVALID_SCORER:
+                    throw new ArgumentException("Invalid scorer file.");
                 case ErrorCodes.DS_ERR_FAIL_INIT_MMAP:
                     throw new ArgumentException("Failed to initialize memory mapped model.");
                 case ErrorCodes.DS_ERR_FAIL_INIT_SESS:
@@ -100,6 +100,8 @@ namespace DeepSpeechClient
                     throw new ArgumentException("Error failed to create session.");
                 case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE:
                     throw new ArgumentException("Error incompatible model.");
+                case ErrorCodes.DS_ERR_SCORER_NOT_ENABLED:
+                    throw new ArgumentException("External scorer is not enabled.");
                 default:
                     throw new ArgumentException("Unknown error, please make sure you are using the correct native binary.");
             }
@@ -114,45 +116,48 @@ namespace DeepSpeechClient
         }
 
         /// <summary>
-        /// Enable decoding using beam scoring with a KenLM language model.
+        /// Enable decoding using an external scorer.
         /// </summary>
-        /// <param name="aLMPath">The path to the language model binary file.</param>
-        /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
-        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
-        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
-        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
-        public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
-            float aLMAlpha, float aLMBeta)
+        /// <param name="aScorerPath">The path to the external scorer file.</param>
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+        /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+        public unsafe void EnableExternalScorer(string aScorerPath)
         {
             string exceptionMessage = null;
-            if (string.IsNullOrWhiteSpace(aLMPath))
+            if (string.IsNullOrWhiteSpace(aScorerPath))
             {
-                exceptionMessage = "Path to the language model file cannot be empty.";
+                throw new FileNotFoundException("Path to the scorer file cannot be empty.");
             }
-            if (!File.Exists(aLMPath))
+            if (!File.Exists(aScorerPath))
             {
-                exceptionMessage = $"Cannot find the language model file: {aLMPath}";
+                throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}");
             }
-            if (string.IsNullOrWhiteSpace(aTriePath))
-            {
-                exceptionMessage = "Path to the trie file cannot be empty.";
-            }
-            if (!File.Exists(aTriePath))
-            {
-                exceptionMessage = $"Cannot find the trie file: {aTriePath}";
-            }
 
-            if (exceptionMessage != null)
-            {
-                throw new FileNotFoundException(exceptionMessage);
-            }
+            var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath);
+            EvaluateResultCode(resultCode);
+        }
 
-            var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
-                            aLMPath,
-                            aTriePath,
-                            aLMAlpha,
-                            aLMBeta);
+        /// <summary>
+        /// Disable decoding using an external scorer.
+        /// </summary>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        public unsafe void DisableExternalScorer()
+        {
+            var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP);
+            EvaluateResultCode(resultCode);
+        }
+
+        /// <summary>
+        /// Set hyperparameters alpha and beta of the external scorer.
+        /// </summary>
+        /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+        /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta)
+        {
+            var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP,
+                            aAlpha,
+                            aBeta);
             EvaluateResultCode(resultCode);
         }
 
@@ -14,8 +14,9 @@
         // Invalid parameters
         DS_ERR_INVALID_ALPHABET = 0x2000,
         DS_ERR_INVALID_SHAPE = 0x2001,
-        DS_ERR_INVALID_LM = 0x2002,
+        DS_ERR_INVALID_SCORER = 0x2002,
         DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
+        DS_ERR_SCORER_NOT_ENABLED = 0x2004,
 
         // Runtime failures
         DS_ERR_FAIL_INIT_MMAP = 0x3000,
@@ -21,18 +21,26 @@ namespace DeepSpeechClient.Interfaces
         unsafe int GetModelSampleRate();
 
         /// <summary>
-        /// Enable decoding using beam scoring with a KenLM language model.
+        /// Enable decoding using an external scorer.
         /// </summary>
-        /// <param name="aLMPath">The path to the language model binary file.</param>
-        /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
-        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
-        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
-        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
-        unsafe void EnableDecoderWithLM(string aLMPath,
-            string aTriePath,
-            float aLMAlpha,
-            float aLMBeta);
+        /// <param name="aScorerPath">The path to the external scorer file.</param>
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
+        /// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
+        unsafe void EnableExternalScorer(string aScorerPath);
+
+        /// <summary>
+        /// Disable decoding using an external scorer.
+        /// </summary>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        unsafe void DisableExternalScorer();
+
+        /// <summary>
+        /// Set hyperparameters alpha and beta of the external scorer.
+        /// </summary>
+        /// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
+        /// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
+        /// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
+        unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta);
 
         /// <summary>
         /// Use the DeepSpeech model to perform Speech-To-Text.
@@ -23,11 +23,16 @@ namespace DeepSpeechClient
         internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-        internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
-            string aLMPath,
-            string aTriePath,
-            float aLMAlpha,
-            float aLMBeta);
+        internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
+            string aScorerPath);
+
+        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+        internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
+
+        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+        internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
+            float aAlpha,
+            float aBeta);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
             CharSet = CharSet.Ansi, SetLastError = true)]
@@ -35,22 +35,18 @@ namespace CSharpExamples
         static void Main(string[] args)
         {
             string model = null;
-            string lm = null;
-            string trie = null;
+            string scorer = null;
             string audio = null;
             bool extended = false;
             if (args.Length > 0)
             {
                 model = GetArgument(args, "--model");
-                lm = GetArgument(args, "--lm");
-                trie = GetArgument(args, "--trie");
+                scorer = GetArgument(args, "--scorer");
                 audio = GetArgument(args, "--audio");
                 extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
             }
 
             const uint BEAM_WIDTH = 500;
-            const float LM_ALPHA = 0.75f;
-            const float LM_BETA = 1.85f;
 
             Stopwatch stopwatch = new Stopwatch();
             try
@@ -64,14 +60,10 @@ namespace CSharpExamples
 
                 Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
                 stopwatch.Reset();
-                if (lm != null)
+                if (scorer != null)
                 {
-                    Console.WriteLine("Loadin LM...");
-                    sttClient.EnableDecoderWithLM(
-                        lm ?? "lm.binary",
-                        trie ?? "trie",
-                        LM_ALPHA, LM_BETA);
-
+                    Console.WriteLine("Loading scorer...");
+                    sttClient.EnableExternalScorer(scorer ?? "kenlm.scorer");
                 }
 
                 string audioFile = audio ?? "arctic_a0024.wav";
@@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
     Button _startInference;
 
     final int BEAM_WIDTH = 50;
-    final float LM_ALPHA = 0.75f;
-    final float LM_BETA = 1.85f;
 
     private char readLEChar(RandomAccessFile f) throws IOException {
         byte b1 = f.readByte();
@@ -30,15 +30,11 @@ import java.nio.ByteBuffer;
 public class BasicTest {
 
     public static final String modelFile = "/data/local/tmp/test/output_graph.tflite";
-    public static final String lmFile = "/data/local/tmp/test/lm.binary";
-    public static final String trieFile = "/data/local/tmp/test/trie";
+    public static final String scorerFile = "/data/local/tmp/test/kenlm.scorer";
     public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";
 
     public static final int BEAM_WIDTH = 50;
 
-    public static final float LM_ALPHA = 0.75f;
-    public static final float LM_BETA = 1.85f;
-
     private char readLEChar(RandomAccessFile f) throws IOException {
         byte b1 = f.readByte();
         byte b2 = f.readByte();
@@ -130,7 +126,7 @@ public class BasicTest {
     @Test
     public void loadDeepSpeech_stt_withLM() {
         DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-        m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+        m.enableExternalScorer(scorerFile);
 
         String decoded = doSTT(m, false);
         assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -149,7 +145,7 @@ public class BasicTest {
     @Test
     public void loadDeepSpeech_sttWithMetadata_withLM() {
         DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
-        m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
+        m.enableExternalScorer(scorerFile);
 
         String decoded = doSTT(m, true);
         assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -47,17 +47,35 @@ public class DeepSpeechModel {
     }
 
     /**
-     * @brief Enable decoding using beam scoring with a KenLM language model.
+     * @brief Enable decoding using an external scorer.
      *
-     * @param lm The path to the language model binary file.
-     * @param trie The path to the trie file build from the same vocabulary as the language model binary.
-     * @param lm_alpha The alpha hyperparameter of the CTC decoder. Language Model weight.
-     * @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
+     * @param scorer The path to the external scorer file.
      *
     * @return Zero on success, non-zero on failure (invalid arguments).
     */
-    public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
-        impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
+    public void enableExternalScorer(String scorer) {
+        impl.EnableExternalScorer(this._msp, scorer);
+    }
+
+    /**
+     * @brief Disable decoding using an external scorer.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void disableExternalScorer() {
+        impl.DisableExternalScorer(this._msp);
+    }
+
+    /**
+     * @brief Enable decoding using beam scoring with a KenLM language model.
+     *
+     * @param alpha The alpha hyperparameter of the decoder. Language model weight.
+     * @param beta The beta hyperparameter of the decoder. Word insertion weight.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void setScorerAlphaBeta(float alpha, float beta) {
+        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
     }
 
     /*
@@ -29,12 +29,11 @@ VersionAction.prototype.call = function(parser) {
 
 var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
 parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
-parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha)', defaultValue: 0.75, type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta)', defaultValue: 1.85, type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
@@ -60,12 +59,16 @@ console.error('Loaded model in %ds.', totalTime(model_load_end));
 
 var desired_sample_rate = model.sampleRate();
 
-if (args['lm'] && args['trie']) {
-  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
-  const lm_load_start = process.hrtime();
-  model.enableDecoderWithLM(args['lm'], args['trie'], args['lm_alpha'], args['lm_beta']);
-  const lm_load_end = process.hrtime(lm_load_start);
-  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+if (args['scorer']) {
+  console.error('Loading scorer from file %s', args['scorer']);
+  const scorer_load_start = process.hrtime();
+  model.enableExternalScorer(args['scorer']);
+  const scorer_load_end = process.hrtime(scorer_load_start);
+  console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
+
+  if (args['lm_alpha'] && args['lm_beta']) {
+    model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
+  }
 }
 
 const buffer = Fs.readFileSync(args['audio']);
@@ -52,31 +52,46 @@ Model.prototype.sampleRate = function() {
 }
 
 /**
- * Enable decoding using beam scoring with a KenLM language model.
+ * Enable decoding using an external scorer.
+ *
+ * @param {string} aScorerPath The path to the external scorer file.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.enableExternalScorer = function(aScorerPath) {
+    return binding.EnableExternalScorer(this._impl, aScorerPath);
+}
+
+/**
+ * Disable decoding using an external scorer.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.disableExternalScorer = function() {
+    return binding.EnableExternalScorer(this._impl);
+}
+
+/**
+ * Set hyperparameters alpha and beta of the external scorer.
  *
- * @param {string} aLMPath The path to the language model binary file.
- * @param {string} aTriePath The path to the trie file build from the same vocabulary as the language model binary.
 * @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
 * @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
 *
 * @return {number} Zero on success, non-zero on failure (invalid arguments).
 */
-Model.prototype.enableDecoderWithLM = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.EnableDecoderWithLM.apply(null, args);
+Model.prototype.setScorerAlphaBeta = function(aLMAlpha, aLMBeta) {
+    return binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
 }
 
 /**
  * Use the DeepSpeech model to perform Speech-To-Text.
  *
  * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in the audio signal.
 *
 * @return {string} The STT result. Returns undefined on error.
 */
-Model.prototype.stt = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.SpeechToText.apply(null, args);
+Model.prototype.stt = function(aBuffer) {
+    return binding.SpeechToText(this._impl, aBuffer);
 }
 
 /**
@@ -84,25 +99,22 @@ Model.prototype.stt = function() {
  * about the results.
 *
 * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in the audio signal.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
-Model.prototype.sttWithMetadata = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.SpeechToTextWithMetadata.apply(null, args);
+Model.prototype.sttWithMetadata = function(aBuffer) {
+    return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
 }
 
 /**
- * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
+ * Create a new streaming inference state. One can then call :js:func:`Stream.feedAudioContent` and :js:func:`Stream.finishStream` on the returned stream object.
 *
- * @return {object} an opaque object that represents the streaming state.
+ * @return {object} a :js:func:`Stream` object that represents the streaming state.
 *
 * @throws on error
 */
 Model.prototype.createStream = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    const rets = binding.CreateStream.apply(null, args);
+    const rets = binding.CreateStream(this._impl);
     const status = rets[0];
     const ctx = rets[1];
     if (status !== 0) {
@@ -111,55 +123,56 @@ Model.prototype.createStream = function() {
     return ctx;
 }
 
+function Stream(nativeStream) {
+    this._impl = nativeStream;
+}
+
 /**
  * Feed audio samples to an ongoing streaming inference.
  *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
 * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
 *                         appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in @param aBuffer.
 */
-Model.prototype.feedAudioContent = function() {
-    binding.FeedAudioContent.apply(null, arguments);
+Stream.prototype.feedAudioContent = function(aBuffer) {
+    binding.FeedAudioContent(this._impl, aBuffer);
 }
 
 /**
  * Compute the intermediate decoding of an ongoing streaming inference.
 *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
- *
 * @return {string} The STT intermediate result.
 */
-Model.prototype.intermediateDecode = function() {
-    return binding.IntermediateDecode.apply(null, arguments);
+Stream.prototype.intermediateDecode = function() {
+    return binding.IntermediateDecode(this._impl);
 }
 
 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
 *
- * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
- *
 * @return {string} The STT result.
 *
- * This method will free the state (@param aSctx).
+ * This method will free the stream, it must not be used after this method is called.
 */
-Model.prototype.finishStream = function() {
-    return binding.FinishStream.apply(null, arguments);
+Stream.prototype.finishStream = function() {
+    result = binding.FinishStream(this._impl);
+    this._impl = null;
+    return result;
 }
 
 /**
  * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
 *
- * @param {object} aSctx A streaming state pointer returned by :js:func:`Model.setupStream`.
- *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
- * This method will free the state pointer (@param aSctx).
+ * This method will free the stream, it must not be used after this method is called.
 */
-Model.prototype.finishStreamWithMetadata = function() {
-    return binding.FinishStreamWithMetadata.apply(null, arguments);
+Stream.prototype.finishStreamWithMetadata = function() {
+    result = binding.FinishStreamWithMetadata(this._impl);
+    this._impl = null;
+    return result;
 }
 
 
 /**
  * Frees associated resources and destroys model object.
 *
@@ -184,10 +197,10 @@ function FreeMetadata(metadata) {
  * can be used if you no longer need the result of an ongoing streaming
  * inference and don't want to perform a costly decode operation.
  *
- * @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
+ * @param {Object} stream A stream object returned by :js:func:`Model.createStream`.
  */
 function FreeStream(stream) {
-    return binding.FreeStream(stream);
+    return binding.FreeStream(stream._impl);
 }
 
 /**
@@ -21,7 +21,6 @@ import deepspeech
 
 # rename for backwards compatibility
 from deepspeech.impl import PrintVersions as printVersions
-from deepspeech.impl import FreeStream as freeStream
 
 class Model(object):
     """
@@ -56,127 +55,159 @@ class Model(object):
         """
         return deepspeech.impl.GetModelSampleRate(self._impl)
 
-    def enableDecoderWithLM(self, *args, **kwargs):
+    def enableExternalScorer(self, scorer_path):
         """
-        Enable decoding using beam scoring with a KenLM language model.
+        Enable decoding using an external scorer.
 
-        :param aLMPath: The path to the language model binary file.
-        :type aLMPath: str
-
-        :param aTriePath: The path to the trie file build from the same vocabulary as the language model binary.
-        :type aTriePath: str
-
-        :param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
-        :type aLMAlpha: float
-
-        :param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
-        :type aLMBeta: float
-
-        :return: Zero on success, non-zero on failure (invalid arguments).
+        :param scorer_path: The path to the external scorer file.
+        :type scorer_path: str
+
+        :return: Zero on success, non-zero on failure.
         :type: int
         """
-        return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)
+        return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)
+
+    def disableExternalScorer(self):
+        """
+        Disable decoding using an external scorer.
+
+        :return: Zero on success, non-zero on failure.
+        """
+        return deepspeech.impl.DisableExternalScorer(self._impl)
+
+    def setScorerAlphaBeta(self, alpha, beta):
+        """
+        Set hyperparameters alpha and beta of the external scorer.
+
+        :param alpha: The alpha hyperparameter of the decoder. Language model weight.
+        :type alpha: float
+
+        :param beta: The beta hyperparameter of the decoder. Word insertion weight.
+        :type beta: float
+
+        :return: Zero on success, non-zero on failure.
+        :type: int
+        """
+        return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta)
 
-    def stt(self, *args, **kwargs):
+    def stt(self, audio_buffer):
         """
         Use the DeepSpeech model to perform Speech-To-Text.
 
-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
-        :type aBuffer: int array
-
-        :param aBufferSize: The number of samples in the audio signal.
-        :type aBufferSize: int
+        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+        :type audio_buffer: numpy.int16 array
 
         :return: The STT result.
         :type: str
         """
-        return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)
+        return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
 
-    def sttWithMetadata(self, *args, **kwargs):
+    def sttWithMetadata(self, audio_buffer):
         """
         Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
 
-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
-        :type aBuffer: int array
-
-        :param aBufferSize: The number of samples in the audio signal.
-        :type aBufferSize: int
+        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+        :type audio_buffer: numpy.int16 array
 
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
         """
-        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
+        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
 
     def createStream(self):
         """
-        Create a new streaming inference state. The streaming state returned
-        by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
+        Create a new streaming inference state. The streaming state returned by
+        this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
 
-        :return: Object holding the stream
+        :return: Stream object representing the newly created stream
+        :type: :func:`Stream`
 
         :throws: RuntimeError on error
         """
         status, ctx = deepspeech.impl.CreateStream(self._impl)
         if status != 0:
             raise RuntimeError("CreateStream failed with error code {}".format(status))
-        return ctx
-
-    # pylint: disable=no-self-use
-    def feedAudioContent(self, *args, **kwargs):
-        """
-        Feed audio samples to an ongoing streaming inference.
-
-        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
-        :type aSctx: object
-
-        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
-        :type aBuffer: int array
-
-        :param aBufferSize: The number of samples in @p aBuffer.
-        :type aBufferSize: int
-        """
-        deepspeech.impl.FeedAudioContent(*args, **kwargs)
-
-    # pylint: disable=no-self-use
-    def intermediateDecode(self, *args, **kwargs):
-        """
-        Compute the intermediate decoding of an ongoing streaming inference.
-
-        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
-        :type aSctx: object
-
-        :return: The STT intermediate result.
-        :type: str
-        """
-        return deepspeech.impl.IntermediateDecode(*args, **kwargs)
-
-    # pylint: disable=no-self-use
-    def finishStream(self, *args, **kwargs):
-        """
-        Signal the end of an audio signal to an ongoing streaming
-        inference, returns the STT result over the whole audio signal.
-
-        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
-        :type aSctx: object
-
-        :return: The STT result.
-        :type: str
-        """
-        return deepspeech.impl.FinishStream(*args, **kwargs)
-
-    # pylint: disable=no-self-use
-    def finishStreamWithMetadata(self, *args, **kwargs):
-        """
-        Signal the end of an audio signal to an ongoing streaming
-        inference, returns per-letter metadata.
-
-        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
-        :type aSctx: object
-
-        :return: Outputs a struct of individual letters along with their timing information.
-        :type: :func:`Metadata`
-        """
-        return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
+        return Stream(ctx)
+
+
+class Stream(object):
+    def __init__(self, native_stream):
+        self._impl = native_stream
+
+    def __del__(self):
+        if self._impl:
+            self.freeStream()
+
+    def feedAudioContent(self, audio_buffer):
+        """
+        Feed audio samples to an ongoing streaming inference.
+
+        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
+        :type audio_buffer: numpy.int16 array
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to feed an already finished stream?")
+        deepspeech.impl.FeedAudioContent(self._impl, audio_buffer)
+
+    def intermediateDecode(self):
+        """
+        Compute the intermediate decoding of an ongoing streaming inference.
+
+        :return: The STT intermediate result.
+        :type: str
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
+        return deepspeech.impl.IntermediateDecode(self._impl)
+
+    def finishStream(self):
+        """
+        Signal the end of an audio signal to an ongoing streaming inference,
+        returns the STT result over the whole audio signal.
+
+        :return: The STT result.
+        :type: str
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
+        result = deepspeech.impl.FinishStream(self._impl)
+        self._impl = None
+        return result
+
+    def finishStreamWithMetadata(self):
+        """
+        Signal the end of an audio signal to an ongoing streaming inference,
+        returns per-letter metadata.
+
+        :return: Outputs a struct of individual letters along with their timing information.
+        :type: :func:`Metadata`
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
+        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
+        self._impl = None
+        return result
+
+    def freeStream(self):
+        """
+        Destroy a streaming state without decoding the computed logits. This can
+        be used if you no longer need the result of an ongoing streaming inference.
+
+        :throws: RuntimeError if the stream object is not valid
+        """
+        if not self._impl:
+            raise RuntimeError("Stream object is not valid. Trying to free an already finished stream?")
+        deepspeech.impl.FreeStream(self._impl)
+        self._impl = None
 
 
 # This is only for documentation purpose
 # Metadata and MetadataItem should be in sync with native_client/deepspeech.h
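Taken together, the consumer-facing surface of this change is small. A minimal usage sketch of the API above, assuming a 16-bit mono WAV already at the model's sample rate; the file paths and beam width are placeholders, not part of this commit:

    import wave
    import numpy as np
    from deepspeech import Model

    ds = Model('output_graph.pbmm', 500)      # placeholder model path, beam width
    ds.enableExternalScorer('kenlm.scorer')   # replaces enableDecoderWithLM(lm, trie, alpha, beta)
    ds.setScorerAlphaBeta(0.75, 1.85)         # hyperparameters are now set separately

    with wave.open('audio.wav', 'rb') as fin:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    print(ds.stt(audio))                      # buffer only; the size argument is gone

    stream = ds.createStream()                # streaming goes through a Stream object
    for chunk in np.array_split(audio, 10):
        stream.feedAudioContent(chunk)
    print(stream.intermediateDecode())
    print(stream.finishStream())              # frees the stream; it must not be reused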
@@ -189,22 +220,18 @@ class MetadataItem(object):
         """
         The character generated for transcription
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
     def timestep(self):
         """
         Position of the character in units of 20ms
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
     def start_time(self):
         """
         Position of the character in seconds
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
 
 class Metadata(object):
@@ -218,8 +245,7 @@ class Metadata(object):
         :return: A list of :func:`MetadataItem` elements
         :type: list
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
     def num_items(self):
         """
@@ -228,8 +254,7 @@ class Metadata(object):
         :return: Size of the list of items
         :type: int
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
     def confidence(self):
         """
@@ -237,5 +262,4 @@ class Metadata(object):
         sum of the acoustic model logit values for each timestep/character that
         contributed to the creation of this transcription.
         """
-        # pylint: disable=unnecessary-pass
-        pass
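One lifecycle note on the Stream wrapper defined above: it owns the native handle, and __del__ falls back to freeStream(), so a stream that is dropped unfinished no longer leaks. Discarding a stream explicitly, without paying for a decode, looks like this sketch (ds and audio as in the earlier example):

    stream = ds.createStream()
    stream.feedAudioContent(audio)

    # The result is not needed after all: destroy the state without decoding.
    stream.freeStream()   # equivalent to letting the object be garbage-collected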
@@ -88,17 +88,15 @@ def main():
     parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
     parser.add_argument('--model', required=True,
                         help='Path to the model (protocol buffer binary file)')
-    parser.add_argument('--lm', nargs='?',
-                        help='Path to the language model binary file')
-    parser.add_argument('--trie', nargs='?',
-                        help='Path to the language model trie file created with native_client/generate_trie')
+    parser.add_argument('--scorer', required=False,
+                        help='Path to the external scorer file')
     parser.add_argument('--audio', required=True,
                         help='Path to the audio file to run (WAV format)')
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
-    parser.add_argument('--lm_alpha', type=float, default=0.75,
+    parser.add_argument('--lm_alpha', type=float,
                         help='Language model weight (lm_alpha)')
-    parser.add_argument('--lm_beta', type=float, default=1.85,
+    parser.add_argument('--lm_beta', type=float,
                         help='Word insertion bonus (lm_beta)')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
@@ -116,12 +114,15 @@ def main():
 
     desired_sample_rate = ds.sampleRate()
 
-    if args.lm and args.trie:
-        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
-        lm_load_start = timer()
-        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
-        lm_load_end = timer() - lm_load_start
-        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)
+    if args.scorer:
+        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
+        scorer_load_start = timer()
+        ds.enableExternalScorer(args.scorer)
+        scorer_load_end = timer() - scorer_load_start
+        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
+
+        if args.lm_alpha and args.lm_beta:
+            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
 
     fin = wave.open(args.audio, 'rb')
     fs = fin.getframerate()
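For existing callers, the client change above is essentially the whole migration recipe. A hedged before/after sketch; the variable names are illustrative, not from this commit:

    # Before: one call carried both model files and both hyperparameters.
    # ds.enableDecoderWithLM(lm_path, trie_path, 0.75, 1.85)

    # After: a single scorer package; alpha/beta are optional and set separately.
    ds.enableExternalScorer(scorer_path)
    if lm_alpha and lm_beta:
        ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # The scorer can also be dropped again at runtime.
    ds.disableExternalScorer()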
@@ -14,21 +14,13 @@ from deepspeech import Model
 # Beam width used in the CTC decoder when building candidate transcriptions
 BEAM_WIDTH = 500
 
-# The alpha hyperparameter of the CTC decoder. Language Model weight
-LM_ALPHA = 0.75
-
-# The beta hyperparameter of the CTC decoder. Word insertion bonus.
-LM_BETA = 1.85
-
 
 def main():
     parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
     parser.add_argument('--model', required=True,
                         help='Path to the model (protocol buffer binary file)')
-    parser.add_argument('--lm', nargs='?',
-                        help='Path to the language model binary file')
-    parser.add_argument('--trie', nargs='?',
-                        help='Path to the language model trie file created with native_client/generate_trie')
+    parser.add_argument('--scorer', nargs='?',
+                        help='Path to the external scorer file')
     parser.add_argument('--audio1', required=True,
                         help='First audio file to use in interleaved streams')
     parser.add_argument('--audio2', required=True,
@@ -37,8 +29,8 @@ def main():
 
     ds = Model(args.model, BEAM_WIDTH)
 
-    if args.lm and args.trie:
-        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
+    if args.scorer:
+        ds.enableExternalScorer(args.scorer)
 
     fin = wave.open(args.audio1, 'rb')
     fs1 = fin.getframerate()
@@ -57,11 +49,11 @@ def main():
     splits2 = np.array_split(audio2, 10)
 
     for part1, part2 in zip(splits1, splits2):
-        ds.feedAudioContent(stream1, part1)
-        ds.feedAudioContent(stream2, part2)
+        stream1.feedAudioContent(part1)
+        stream2.feedAudioContent(part2)
 
-    print(ds.finishStream(stream1))
-    print(ds.finishStream(stream2))
+    print(stream1.finishStream())
+    print(stream2.finishStream())
 
 if __name__ == '__main__':
     main()
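Because each stream now guards its own handle, reusing a finished stream fails loudly in Python instead of crashing in native code. A small sketch of the failure mode (ds and audio as in the earlier examples):

    stream = ds.createStream()
    stream.feedAudioContent(audio)
    print(stream.finishStream())     # consumes and frees the stream

    try:
        stream.feedAudioContent(audio)
    except RuntimeError as err:      # "Stream object is not valid. ..."
        print(err)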
@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
 
 BAZEL_TARGETS="
 //native_client:libdeepspeech.so
-//native_client:generate_trie
 "
 
 BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}"
@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
 
 BAZEL_TARGETS="
 //native_client:libdeepspeech.so
-//native_client:generate_trie
 "
 
 BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}"
@@ -30,11 +30,11 @@ then:
       image: ${build.docker_image}
 
       env:
-        DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.15/models.tar.gz"
+        DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.1/models.tar.gz"
         DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
         PIP_DEFAULT_TIMEOUT: "60"
         EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-        EXAMPLES_CHECKOUT_TARGET: "master"
+        EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
 
       command:
         - "/bin/bash"
@@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
 
 BAZEL_TARGETS="
 //native_client:libdeepspeech.so
-//native_client:generate_trie
 "
 
 if [ "${runtime}" = "tflite" ]; then
@@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
 
 BAZEL_TARGETS="
 //native_client:libdeepspeech.so
-//native_client:generate_trie
 "
 
 BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}"
@@ -49,7 +49,7 @@ deepspeech --version
 
 pushd ${HOME}/DeepSpeech/ds/
 python bin/import_ldc93s1.py data/smoke_test
-python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --lm data/smoke_test/vocab.pruned.lm --trie data/smoke_test/vocab.trie --csv data/smoke_test/ldc93s1.csv
+python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --scorer data/smoke_test/pruned_lm.scorer --csv data/smoke_test/ldc93s1.csv
 popd
 
 virtualenv_deactivate "${pyalias}" "${PYENV_NAME}"
@@ -378,7 +378,7 @@ run_netframework_inference_tests()
     assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
 
     set +e
-    phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     set -e
     assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
 }
@@ -401,7 +401,7 @@ run_electronjs_inference_tests()
     assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     set -e
     assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
 }
@@ -427,7 +427,7 @@ run_basic_inference_tests()
    assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status"
@@ -444,7 +444,7 @@ run_all_inference_tests()
     assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status"
 
     set +e
-    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status"
@@ -457,7 +457,7 @@ run_all_inference_tests()
         assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}"
 
         set +e
-        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
+        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
         set -e
         assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
     fi;
@@ -470,8 +470,7 @@ run_prod_concurrent_stream_tests()
     set +e
     output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \
                 --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \
-                --lm ${TASKCLUSTER_TMP_DIR}/lm.binary \
-                --trie ${TASKCLUSTER_TMP_DIR}/trie \
+                --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
                 --audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \
                 --audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
@@ -489,19 +488,19 @@ run_prod_inference_tests()
     local _bitrate=$1
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
 
     set +e
-    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@@ -509,7 +508,7 @@ run_prod_inference_tests()
     # Run down-sampling warning test only when we actually perform downsampling
     if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
         set +e
-        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
+        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
         set -e
         assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
     fi;
@@ -520,19 +519,19 @@ run_prodtflite_inference_tests()
     local _bitrate=$1
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
 
     set +e
-    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
 
     set +e
-    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
+    phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
     status=$?
     set -e
     assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@@ -540,7 +539,7 @@ run_prodtflite_inference_tests()
     # Run down-sampling warning test only when we actually perform downsampling
     if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
         set +e
-        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
+        phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
         set -e
         assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
     fi;
@@ -555,7 +554,7 @@ run_multi_inference_tests()
     assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status"
 
     set +e -o pipefail
-    multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
+    multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
     status=$?
     set -e +o pipefail
     assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status"
@@ -564,7 +563,7 @@ run_multi_inference_tests()
 run_cpp_only_inference_tests()
 {
     set +e
-    phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
+    phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
     status=$?
     set -e
     assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status"
@@ -669,8 +668,7 @@ download_data()
     ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}"
     ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
     cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
-    cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary
-    cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie
+    cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
     cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
 }
 
@@ -1562,7 +1560,6 @@ package_native_client()
     fi;
 
     ${TAR} -cf - \
-        -C ${tensorflow_dir}/bazel-bin/native_client/ generate_trie${PLATFORM_EXE_SUFFIX} \
         -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \
         -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \
         -C ${deepspeech_dir}/ LICENSE \
@@ -1767,8 +1764,7 @@ android_setup_apk_data()
     adb push \
         ${TASKCLUSTER_TMP_DIR}/${model_name} \
        ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \
-        ${TASKCLUSTER_TMP_DIR}/lm.binary \
-        ${TASKCLUSTER_TMP_DIR}/trie \
+        ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
         ${ANDROID_TMP_DIR}/test/
 }
 
@@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
 
 BAZEL_TARGETS="
 //native_client:libdeepspeech.so
-//native_client:generate_trie
 "
 
 if [ "${package_option}" = "--cuda" ]; then
@@ -44,7 +44,7 @@ payload:
       MSYS: 'winsymlinks:nativestrict'
       TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
       EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
-      EXAMPLES_CHECKOUT_TARGET: "master"
+      EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
 
       command:
         - >-
@@ -29,7 +29,7 @@ def fail(message, code=1):
 def transcribe_file(audio_path, tlog_path):
     from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel
     initialize_globals()
-    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet)
+    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
     try:
         num_processes = cpu_count()
     except NotImplementedError:
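On the training side the decoder mirrors the packaging change: the Scorer constructor takes a single scorer path where it previously took the lm.binary and trie paths. A minimal construction sketch; the literal values mirror the flag defaults in the hunk below, and Config.alphabet is the training configuration object used in the hunk above:

    from ds_ctcdecoder import Scorer

    # alpha, beta, single scorer package, alphabet -- the trie argument is gone
    scorer = Scorer(0.75, 1.85, 'data/lm/kenlm.scorer', Config.alphabet)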
@@ -143,10 +143,8 @@ def create_flags():
 
     f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
     f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
-    f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
-    f.DEFINE_alias('lm', 'lm_binary_path')
-    f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
-    f.DEFINE_alias('trie', 'lm_trie_path')
+    f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py')
+    f.DEFINE_alias('scorer', 'scorer_path')
     f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
     f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
     f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.')