From 1e2eb962485555d8fea364d7ffa6cb1dbc14f317 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 21 Jan 2020 11:54:01 +0100 Subject: [PATCH] Update all API consumers --- DeepSpeech.py | 8 +- Dockerfile | 5 +- bin/run-tc-ldc93s1_checkpoint.sh | 3 +- bin/run-tc-ldc93s1_new.sh | 3 +- bin/run-tc-ldc93s1_singleshotinference.sh | 6 +- bin/run-tc-ldc93s1_tflite.sh | 6 +- data/lm/generate_package.py | 2 +- doc/C-API.rst | 8 +- evaluate.py | 5 +- evaluate_tflite.py | 15 +- native_client/args.h | 73 +++---- native_client/client.cc | 17 +- native_client/ctcdecode/__init__.py | 15 +- native_client/ctcdecode/scorer.h | 13 -- native_client/deepspeech.cc | 2 +- native_client/deepspeech.h | 4 +- native_client/deepspeech_compat.h | 141 -------------- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 69 +++---- .../DeepSpeechClient/Enums/ErrorCodes.cs | 3 +- .../Interfaces/IDeepSpeech.cs | 30 +-- .../dotnet/DeepSpeechClient/NativeImp.cs | 15 +- .../dotnet/DeepSpeechConsole/Program.cs | 18 +- .../deepspeech/DeepSpeechActivity.java | 2 - .../libdeepspeech/test/BasicTest.java | 10 +- .../libdeepspeech/DeepSpeechModel.java | 32 +++- native_client/javascript/client.js | 23 ++- native_client/javascript/index.js | 89 +++++---- native_client/python/__init__.py | 180 ++++++++++-------- native_client/python/client.py | 27 +-- native_client/test/concurrent_streams.py | 24 +-- taskcluster/arm64-build.sh | 1 - taskcluster/cuda-build.sh | 1 - taskcluster/examples-base.tyml | 4 +- taskcluster/host-build.sh | 1 - taskcluster/rpi3-build.sh | 1 - taskcluster/tc-evaluate_tflite.sh | 2 +- taskcluster/tc-tests-utils.sh | 40 ++-- taskcluster/win-build.sh | 1 - taskcluster/win-opt-base.tyml | 2 +- transcribe.py | 2 +- util/flags.py | 6 +- 41 files changed, 393 insertions(+), 516 deletions(-) delete mode 100644 native_client/deepspeech_compat.h diff --git a/DeepSpeech.py b/DeepSpeech.py index 67971b48..e6d3a929 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -882,8 +882,7 @@ def package_zip(): } }, f) - shutil.copy(FLAGS.lm_binary_path, export_dir) - shutil.copy(FLAGS.lm_trie_path, export_dir) + shutil.copy(FLAGS.scorer_path, export_dir) archive = shutil.make_archive(zip_filename, 'zip', export_dir) log_info('Exported packaged model {}'.format(archive)) @@ -926,10 +925,9 @@ def do_single_file_inference(input_file_path): logits = np.squeeze(logits) - if FLAGS.lm_binary_path: + if FLAGS.scorer_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, - FLAGS.lm_binary_path, FLAGS.lm_trie_path, - Config.alphabet) + FLAGS.scorer_path, Config.alphabet) else: scorer = None decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width, diff --git a/Dockerfile b/Dockerfile index 58b27891..56afdbfc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -172,7 +172,7 @@ RUN ./configure # Build DeepSpeech -RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} +RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 
--copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} ### ### Using TensorFlow upstream should work @@ -187,8 +187,7 @@ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_s # RUN pip3 install /tmp/tensorflow_pkg/*.whl # Copy built libs to /DeepSpeech/native_client -RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \ - && cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ +RUN cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ # Install TensorFlow WORKDIR /DeepSpeech/ diff --git a/bin/run-tc-ldc93s1_checkpoint.sh b/bin/run-tc-ldc93s1_checkpoint.sh index ae0836a1..0602dada 100755 --- a/bin/run-tc-ldc93s1_checkpoint.sh +++ b/bin/run-tc-ldc93s1_checkpoint.sh @@ -21,8 +21,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ --n_hidden 100 --epochs 1 \ --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \ --learning_rate 0.001 --dropout_rate 0.05 \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log + --scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then echo "Did not resume training from checkpoint" diff --git a/bin/run-tc-ldc93s1_new.sh b/bin/run-tc-ldc93s1_new.sh index ff8751ed..8e9cf4d4 100755 --- a/bin/run-tc-ldc93s1_new.sh +++ b/bin/run-tc-ldc93s1_new.sh @@ -25,6 +25,5 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ --n_hidden 100 --epochs $epoch_count \ --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \ --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' \ + --scorer_path 'data/smoke_test/pruned_lm.scorer' \ --audio_sample_rate ${audio_sample_rate} diff --git a/bin/run-tc-ldc93s1_singleshotinference.sh b/bin/run-tc-ldc93s1_singleshotinference.sh index fc30c48f..997bf08f 100755 --- a/bin/run-tc-ldc93s1_singleshotinference.sh +++ b/bin/run-tc-ldc93s1_singleshotinference.sh @@ -21,12 +21,10 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ --n_hidden 100 --epochs 1 \ --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \ --learning_rate 0.001 --dropout_rate 0.05 \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' + --scorer_path 'data/smoke_test/pruned_lm.scorer' python -u DeepSpeech.py \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' \ + --scorer_path 'data/smoke_test/pruned_lm.scorer' \ --one_shot_infer 'data/smoke_test/LDC93S1.wav' diff --git a/bin/run-tc-ldc93s1_tflite.sh b/bin/run-tc-ldc93s1_tflite.sh index b5a7772c..f7daca21 100755 --- a/bin/run-tc-ldc93s1_tflite.sh +++ b/bin/run-tc-ldc93s1_tflite.sh @@ -20,8 +20,7 @@ python -u DeepSpeech.py --noshow_progressbar \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ --export_dir '/tmp/train_tflite' \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' \ + --scorer_path 'data/smoke_test/pruned_lm.scorer' \ --audio_sample_rate ${audio_sample_rate} \ --export_tflite @@ -31,8 +30,7 @@ python -u DeepSpeech.py --noshow_progressbar \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ --export_dir 
'/tmp/train_tflite/en-us' \ - --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \ - --lm_trie_path 'data/smoke_test/vocab.trie' \ + --scorer_path 'data/smoke_test/pruned_lm.scorer' \ --audio_sample_rate ${audio_sample_rate} \ --export_language 'Fake English (fk-FK)' \ --export_zip diff --git a/data/lm/generate_package.py b/data/lm/generate_package.py index 4d064fdd..d8f39c4e 100644 --- a/data/lm/generate_package.py +++ b/data/lm/generate_package.py @@ -50,7 +50,7 @@ def create_bundle(alphabet_path, lm_path, vocab_path, package_path, force_utf8, scorer.set_alphabet(alphabet) scorer.set_utf8_mode(use_utf8) scorer.reset_params(default_alpha, default_beta) - scorer.load_lm(lm_path, "") + scorer.load_lm(lm_path) scorer.fill_dictionary(list(words)) shutil.copy(lm_path, package_path) scorer.save_dictionary(package_path, True) # append, not overwrite diff --git a/doc/C-API.rst b/doc/C-API.rst index 0541247e..6556d4bb 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -7,7 +7,13 @@ C .. doxygenfunction:: DS_FreeModel :project: deepspeech-c -.. doxygenfunction:: DS_EnableDecoderWithLM +.. doxygenfunction:: DS_EnableExternalScorer + :project: deepspeech-c + +.. doxygenfunction:: DS_DisableExternalScorer + :project: deepspeech-c + +.. doxygenfunction:: DS_SetScorerAlphaBeta :project: deepspeech-c .. doxygenfunction:: DS_GetModelSampleRate diff --git a/evaluate.py b/evaluate.py index 8df73966..cdb13e31 100755 --- a/evaluate.py +++ b/evaluate.py @@ -42,10 +42,9 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): def evaluate(test_csvs, create_model, try_loading): - if FLAGS.lm_binary_path: + if FLAGS.scorer_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, - FLAGS.lm_binary_path, FLAGS.lm_trie_path, - Config.alphabet) + FLAGS.scorer_path, Config.alphabet) else: scorer = None diff --git a/evaluate_tflite.py b/evaluate_tflite.py index 8b2ba453..bdc5f231 100644 --- a/evaluate_tflite.py +++ b/evaluate_tflite.py @@ -27,17 +27,18 @@ This module should be self-contained: - pip install native_client/python/dist/deepspeech*.whl - pip install -r requirements_eval_tflite.txt -Then run with a TF Lite model, LM/trie and a CSV test file +Then run with a TF Lite model, LM and a CSV test file ''' BEAM_WIDTH = 500 LM_ALPHA = 0.75 LM_BETA = 1.85 -def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask): +def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask): os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask) ds = Model(model, BEAM_WIDTH) - ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA) + ds.enableExternalScorer(scorer) + ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA) while True: try: @@ -64,7 +65,7 @@ def main(args, _): processes = [] for i in range(args.proc): - worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i)) + worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i)) worker_process.start() # Launch reader() as a separate python process processes.append(worker_process) @@ -113,10 +114,8 @@ def parse_args(): parser = argparse.ArgumentParser(description='Computing TFLite accuracy') parser.add_argument('--model', required=True, help='Path to the model (protocol buffer binary file)') - parser.add_argument('--lm', required=True, - help='Path to the language model binary file') - parser.add_argument('--trie', required=True, - help='Path to the language model trie file created with 
native_client/generate_trie') + parser.add_argument('--scorer', required=True, + help='Path to the external scorer file') parser.add_argument('--csv', required=True, help='Path to the CSV source file') parser.add_argument('--proc', required=False, default=cpu_count(), type=int, diff --git a/native_client/args.h b/native_client/args.h index 6342763f..a158fb18 100644 --- a/native_client/args.h +++ b/native_client/args.h @@ -12,19 +12,17 @@ char* model = NULL; -char* lm = NULL; - -char* trie = NULL; +char* scorer = NULL; char* audio = NULL; int beam_width = 500; -float lm_alpha = 0.75f; +bool set_alphabeta = false; -float lm_beta = 1.85f; +float lm_alpha = 0.f; -bool load_without_trie = false; +float lm_beta = 0.f; bool show_times = false; @@ -39,39 +37,36 @@ int stream_size = 0; void PrintHelp(const char* bin) { std::cout << - "Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n" + "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n" "\n" "Running DeepSpeech inference.\n" "\n" - " --model MODEL Path to the model (protocol buffer binary file)\n" - " --lm LM Path to the language model binary file\n" - " --trie TRIE Path to the language model trie file created with native_client/generate_trie\n" - " --audio AUDIO Path to the audio file to run (WAV format)\n" - " --beam_width BEAM_WIDTH Value for decoder beam width (int)\n" - " --lm_alpha LM_ALPHA Value for language model alpha param (float)\n" - " --lm_beta LM_BETA Value for language model beta param (float)\n" - " -t Run in benchmark mode, output mfcc & inference time\n" - " --extended Output string from extended metadata\n" - " --json Extended output, shows word timings as JSON\n" - " --stream size Run in stream mode, output intermediate results\n" - " --help Show help\n" - " --version Print version and exits\n"; + "\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n" + "\t--scorer SCORER\t\tPath to the external scorer file\n" + "\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n" + "\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n" + "\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n" + "\t--lm_beta LM_BETA\tValue for language model beta param (float)\n" + "\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n" + "\t--extended\t\tOutput string from extended metadata\n" + "\t--json\t\t\tExtended output, shows word timings as JSON\n" + "\t--stream size\t\tRun in stream mode, output intermediate results\n" + "\t--help\t\t\tShow help\n" + "\t--version\t\tPrint version and exits\n"; DS_PrintVersions(); exit(1); } bool ProcessArgs(int argc, char** argv) { - const char* const short_opts = "m:a:l:r:w:c:d:b:tehv"; + const char* const short_opts = "m:a:s:r:w:c:d:b:tehv"; const option long_opts[] = { {"model", required_argument, nullptr, 'm'}, - {"lm", required_argument, nullptr, 'l'}, - {"trie", required_argument, nullptr, 'r'}, + {"scorer", required_argument, nullptr, 'l'}, {"audio", required_argument, nullptr, 'w'}, {"beam_width", required_argument, nullptr, 'b'}, {"lm_alpha", required_argument, nullptr, 'c'}, {"lm_beta", required_argument, nullptr, 'd'}, - {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999}, {"t", no_argument, nullptr, 't'}, {"extended", no_argument, nullptr, 'e'}, {"json", no_argument, nullptr, 'j'}, @@ -95,31 +90,25 @@ bool ProcessArgs(int argc, char** argv) break; case 'l': - lm = optarg; - break; - - case 'r': - trie = optarg; + scorer = optarg; break; case 
'w': audio = optarg; break; - case 'b': - beam_width = atoi(optarg); - break; - - case 'c': - lm_alpha = atof(optarg); - break; - - case 'd': - lm_beta = atof(optarg); - break; + case 'b': + beam_width = atoi(optarg); + break; + + case 'c': + set_alphabeta = true; + lm_alpha = atof(optarg); + break; - case 999: - load_without_trie = true; + case 'd': + set_alphabeta = true; + lm_beta = atof(optarg); break; case 't': diff --git a/native_client/client.cc b/native_client/client.cc index 99af904e..718fba75 100644 --- a/native_client/client.cc +++ b/native_client/client.cc @@ -374,16 +374,19 @@ main(int argc, char **argv) return 1; } - if (lm && (trie || load_without_trie)) { - int status = DS_EnableDecoderWithLM(ctx, - lm, - trie, - lm_alpha, - lm_beta); + if (scorer) { + int status = DS_EnableExternalScorer(ctx, scorer); if (status != 0) { - fprintf(stderr, "Could not enable CTC decoder with LM.\n"); + fprintf(stderr, "Could not enable external scorer.\n"); return 1; } + if (set_alphabeta) { + status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta); + if (status != 0) { + fprintf(stderr, "Error setting scorer alpha and beta.\n"); + return 1; + } + } } #ifndef NO_SOX diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index 3fab4eb7..8ba2e9b2 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -12,12 +12,11 @@ class Scorer(swigwrapper.Scorer): :type alpha: float :param beta: Word insertion bonus. :type beta: float - :model_path: Path to load language model. - :trie_path: Path to trie file. + :model_path: Path to load scorer. :alphabet: Alphabet :type model_path: basestring """ - def __init__(self, alpha=None, beta=None, model_path=None, trie_path=None, alphabet=None): + def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None): super(Scorer, self).__init__() # Allow bare initialization if alphabet: @@ -27,15 +26,15 @@ class Scorer(swigwrapper.Scorer): if err != 0: raise ValueError("Error when deserializing alphabet.") - err = self.init(alpha, beta, - model_path.encode('utf-8'), - trie_path.encode('utf-8'), + err = self.init(model_path.encode('utf-8'), native_alphabet) if err != 0: raise ValueError("Scorer initialization failed with error code {}".format(err), err) - def load_lm(self, lm_path, trie_path): - super(Scorer, self).load_lm(lm_path.encode('utf-8'), trie_path.encode('utf-8')) + self.reset_params(alpha, beta) + + def load_lm(self, lm_path): + super(Scorer, self).load_lm(lm_path.encode('utf-8')) def save_dictionary(self, save_path, *args, **kwargs): super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs) diff --git a/native_client/ctcdecode/scorer.h b/native_client/ctcdecode/scorer.h index db58d581..b2e5c817 100644 --- a/native_client/ctcdecode/scorer.h +++ b/native_client/ctcdecode/scorer.h @@ -6,7 +6,6 @@ #include #include -#include "lm/enumerate_vocab.hh" #include "lm/virtual_interface.hh" #include "lm/word_index.hh" #include "util/string_piece.hh" @@ -19,18 +18,6 @@ const std::string START_TOKEN = ""; const std::string UNK_TOKEN = ""; const std::string END_TOKEN = ""; -// Implement a callback to retrieve the dictionary of language model. 
-class RetrieveStrEnumerateVocab : public lm::EnumerateVocab { -public: - RetrieveStrEnumerateVocab() {} - - void Add(lm::WordIndex index, const StringPiece &str) { - vocabulary.push_back(std::string(str.data(), str.length())); - } - - std::vector vocabulary; -}; - /* External scorer to query score for n-gram or sentence, including language * model scoring and word insertion. * diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index e8b3dc02..0a61f3de 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -310,7 +310,7 @@ DS_EnableExternalScorer(ModelState* aCtx, aCtx->scorer_.reset(new Scorer()); int err = aCtx->scorer_->init(aScorerPath, aCtx->alphabet_); if (err != 0) { - return DS_ERR_INVALID_LM; + return DS_ERR_INVALID_SCORER; } return DS_ERR_OK; } diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index 94f6664e..4e017653 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -59,7 +59,7 @@ enum DeepSpeech_Error_Codes // Invalid parameters DS_ERR_INVALID_ALPHABET = 0x2000, DS_ERR_INVALID_SHAPE = 0x2001, - DS_ERR_INVALID_LM = 0x2002, + DS_ERR_INVALID_SCORER = 0x2002, DS_ERR_MODEL_INCOMPATIBLE = 0x2003, DS_ERR_SCORER_NOT_ENABLED = 0x2004, @@ -129,7 +129,7 @@ DEEPSPEECH_EXPORT int DS_DisableExternalScorer(ModelState* aCtx); /** - * @brief Set hyperparameters alpha and beta of a KenLM external scorer. + * @brief Set hyperparameters alpha and beta of the external scorer. * * @param aCtx The ModelState pointer for the model being changed. * @param aAlpha The alpha hyperparameter of the decoder. Language model weight. diff --git a/native_client/deepspeech_compat.h b/native_client/deepspeech_compat.h deleted file mode 100644 index c83bcbc8..00000000 --- a/native_client/deepspeech_compat.h +++ /dev/null @@ -1,141 +0,0 @@ -#ifndef DEEPSPEECH_COMPAT_H -#define DEEPSPEECH_COMPAT_H - -#include "deepspeech.h" - -#warning This header is a convenience wrapper for compatibility with \ - the previous API, it has deprecated function names and arguments. \ - If possible, update your code instead of using this header. - -/** - * @brief An object providing an interface to a trained DeepSpeech model. - * - * @param aModelPath The path to the frozen model graph. - * @param aNCep UNUSED, DEPRECATED. - * @param aNContext UNUSED, DEPRECATED. - * @param aAlphabetConfigPath UNUSED, DEPRECATED. - * @param aBeamWidth The beam width used by the decoder. A larger beam - * width generates better results at the cost of decoding - * time. - * @param[out] retval a ModelState pointer - * - * @return Zero on success, non-zero on failure. - */ -int DS_CreateModel(const char* aModelPath, - unsigned int /*aNCep*/, - unsigned int /*aNContext*/, - const char* /*aAlphabetConfigPath*/, - unsigned int aBeamWidth, - ModelState** retval) -{ - return DS_CreateModel(aModelPath, aBeamWidth, retval); -} - -/** - * @brief Frees associated resources and destroys model object. - */ -void DS_DestroyModel(ModelState* ctx) -{ - return DS_FreeModel(ctx); -} - -/** - * @brief Enable decoding using beam scoring with a KenLM language model. - * - * @param aCtx The ModelState pointer for the model being changed. - * @param aAlphabetConfigPath UNUSED, DEPRECATED. - * @param aLMPath The path to the language model binary file. - * @param aTriePath The path to the trie file build from the same vocabu- - * lary as the language model binary. - * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model - weight. 
- * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion - weight. - * - * @return Zero on success, non-zero on failure (invalid arguments). - */ -int DS_EnableDecoderWithLM(ModelState* aCtx, - const char* /*aAlphabetConfigPath*/, - const char* aLMPath, - const char* aTriePath, - float aLMAlpha, - float aLMBeta) -{ - return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta); -} - -/** - * @brief Create a new streaming inference state. The streaming state returned - * by this function can then be passed to {@link DS_FeedAudioContent()} - * and {@link DS_FinishStream()}. - * - * @param aCtx The ModelState pointer for the model to use. - * @param aSampleRate UNUSED, DEPRECATED. - * @param[out] retval an opaque pointer that represents the streaming state. Can - * be NULL if an error occurs. - * - * @return Zero for success, non-zero on failure. - */ -int DS_SetupStream(ModelState* aCtx, - unsigned int /*aSampleRate*/, - StreamingState** retval) -{ - return DS_CreateStream(aCtx, retval); -} - -/** - * @brief Destroy a streaming state without decoding the computed logits. This - * can be used if you no longer need the result of an ongoing streaming - * inference and don't want to perform a costly decode operation. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * - * @note This method will free the state pointer (@p aSctx). - */ -void DS_DiscardStream(StreamingState* aSctx) -{ - return DS_FreeStream(aSctx); -} - -/** - * @brief Use the DeepSpeech model to perform Speech-To-Text. - * - * @param aCtx The ModelState pointer for the model to use. - * @param aBuffer A 16-bit, mono raw audio signal at the appropriate - * sample rate (matching what the model was trained on). - * @param aBufferSize The number of samples in the audio signal. - * @param aSampleRate UNUSED, DEPRECATED. - * - * @return The STT result. The user is responsible for freeing the string using - * {@link DS_FreeString()}. Returns NULL on error. - */ -char* DS_SpeechToText(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize, - unsigned int /*aSampleRate*/) -{ - return DS_SpeechToText(aCtx, aBuffer, aBufferSize); -} - -/** - * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata - * about the results. - * - * @param aCtx The ModelState pointer for the model to use. - * @param aBuffer A 16-bit, mono raw audio signal at the appropriate - * sample rate (matching what the model was trained on). - * @param aBufferSize The number of samples in the audio signal. - * @param aSampleRate UNUSED, DEPRECATED. - * - * @return Outputs a struct of individual letters along with their timing information. - * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error. - */ -Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize, - unsigned int /*aSampleRate*/) -{ - return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize); -} - -#endif /* DEEPSPEECH_COMPAT_H */ diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 754be8ae..e5e33370 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -82,8 +82,8 @@ namespace DeepSpeechClient throw new ArgumentException("Invalid alphabet embedded in model. 
(Data corruption?)"); case ErrorCodes.DS_ERR_INVALID_SHAPE: throw new ArgumentException("Invalid model shape."); - case ErrorCodes.DS_ERR_INVALID_LM: - throw new ArgumentException("Invalid language model file."); + case ErrorCodes.DS_ERR_INVALID_SCORER: + throw new ArgumentException("Invalid scorer file."); case ErrorCodes.DS_ERR_FAIL_INIT_MMAP: throw new ArgumentException("Failed to initialize memory mapped model."); case ErrorCodes.DS_ERR_FAIL_INIT_SESS: @@ -100,6 +100,8 @@ namespace DeepSpeechClient throw new ArgumentException("Error failed to create session."); case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE: throw new ArgumentException("Error incompatible model."); + case ErrorCodes.DS_ERR_SCORER_NOT_ENABLED: + throw new ArgumentException("External scorer is not enabled."); default: throw new ArgumentException("Unknown error, please make sure you are using the correct native binary."); } @@ -114,45 +116,48 @@ namespace DeepSpeechClient } /// - /// Enable decoding using beam scoring with a KenLM language model. + /// Enable decoding using an external scorer. /// - /// The path to the language model binary file. - /// The path to the trie file build from the same vocabulary as the language model binary. - /// The alpha hyperparameter of the CTC decoder. Language Model weight. - /// The beta hyperparameter of the CTC decoder. Word insertion weight. - /// Thrown when the native binary failed to enable decoding with a language model. - /// Thrown when cannot find the language model or trie file. - public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, - float aLMAlpha, float aLMBeta) + /// The path to the external scorer file. + /// Thrown when the native binary failed to enable decoding with an external scorer. + /// Thrown when cannot find the scorer file. + public unsafe void EnableExternalScorer(string aScorerPath) { string exceptionMessage = null; - if (string.IsNullOrWhiteSpace(aLMPath)) + if (string.IsNullOrWhiteSpace(aScorerPath)) { - exceptionMessage = "Path to the language model file cannot be empty."; + throw new FileNotFoundException("Path to the scorer file cannot be empty."); } - if (!File.Exists(aLMPath)) + if (!File.Exists(aScorerPath)) { - exceptionMessage = $"Cannot find the language model file: {aLMPath}"; - } - if (string.IsNullOrWhiteSpace(aTriePath)) - { - exceptionMessage = "Path to the trie file cannot be empty."; - } - if (!File.Exists(aTriePath)) - { - exceptionMessage = $"Cannot find the trie file: {aTriePath}"; + throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}"); } - if (exceptionMessage != null) - { - throw new FileNotFoundException(exceptionMessage); - } + var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath); + EvaluateResultCode(resultCode); + } - var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP, - aLMPath, - aTriePath, - aLMAlpha, - aLMBeta); + /// + /// Disable decoding using an external scorer. + /// + /// Thrown when an external scorer is not enabled. + public unsafe void DisableExternalScorer() + { + var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP); + EvaluateResultCode(resultCode); + } + + /// + /// Set hyperparameters alpha and beta of the external scorer. + /// + /// The alpha hyperparameter of the decoder. Language model weight. + /// The beta hyperparameter of the decoder. Word insertion weight. + /// Thrown when an external scorer is not enabled. 
+ public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta) + { + var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP, + aAlpha, + aBeta); EvaluateResultCode(resultCode); } diff --git a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs index 019564c2..30660add 100644 --- a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs +++ b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs @@ -14,8 +14,9 @@ // Invalid parameters DS_ERR_INVALID_ALPHABET = 0x2000, DS_ERR_INVALID_SHAPE = 0x2001, - DS_ERR_INVALID_LM = 0x2002, + DS_ERR_INVALID_SCORER = 0x2002, DS_ERR_MODEL_INCOMPATIBLE = 0x2003, + DS_ERR_SCORER_NOT_ENABLED = 0x2004, // Runtime failures DS_ERR_FAIL_INIT_MMAP = 0x3000, diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 734f4240..ecbfb7e9 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -21,18 +21,26 @@ namespace DeepSpeechClient.Interfaces unsafe int GetModelSampleRate(); /// - /// Enable decoding using beam scoring with a KenLM language model. + /// Enable decoding using an external scorer. /// - /// The path to the language model binary file. - /// The path to the trie file build from the same vocabulary as the language model binary. - /// The alpha hyperparameter of the CTC decoder. Language Model weight. - /// The beta hyperparameter of the CTC decoder. Word insertion weight. - /// Thrown when the native binary failed to enable decoding with a language model. - /// Thrown when cannot find the language model or trie file. - unsafe void EnableDecoderWithLM(string aLMPath, - string aTriePath, - float aLMAlpha, - float aLMBeta); + /// The path to the external scorer file. + /// Thrown when the native binary failed to enable decoding with an external scorer. + /// Thrown when cannot find the scorer file. + unsafe void EnableExternalScorer(string aScorerPath); + + /// + /// Disable decoding using an external scorer. + /// + /// Thrown when an external scorer is not enabled. + unsafe void DisableExternalScorer(); + + /// + /// Set hyperparameters alpha and beta of the external scorer. + /// + /// The alpha hyperparameter of the decoder. Language model weight. + /// The beta hyperparameter of the decoder. Word insertion weight. + /// Thrown when an external scorer is not enabled. + unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta); /// /// Use the DeepSpeech model to perform Speech-To-Text. 
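Every binding this patch touches follows the interface above: one call to enable the scorer, an optional alpha/beta override, and an explicit disable to fall back to plain CTC decoding. For cross-reference while reading the per-language diffs, a minimal sketch of that workflow against the updated Python package (the file names and the 0.75/1.85 values are illustrative; 0.75 and 1.85 are simply the old defaults this patch removes):

    import wave
    import numpy as np
    from deepspeech import Model

    BEAM_WIDTH = 500

    ds = Model('output_graph.pbmm', BEAM_WIDTH)  # hypothetical model path
    ds.enableExternalScorer('kenlm.scorer')      # replaces enableDecoderWithLM(lm, trie, alpha, beta)
    ds.setScorerAlphaBeta(0.75, 1.85)            # optional; omit to use the values packaged in the scorer

    with wave.open('audio.wav', 'rb') as fin:    # hypothetical audio path
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    print(ds.stt(audio))
    ds.disableExternalScorer()                   # subsequent decodes run without the scorer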
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 3b79282b..1c49feec 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -23,11 +23,16 @@ namespace DeepSpeechClient internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx, - string aLMPath, - string aTriePath, - float aLMAlpha, - float aLMBeta); + internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, + string aScorerPath); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, + float aAlpha, + float aBeta); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index 8c75a481..1f6e299b 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -35,22 +35,18 @@ namespace CSharpExamples static void Main(string[] args) { string model = null; - string lm = null; - string trie = null; + string scorer = null; string audio = null; bool extended = false; if (args.Length > 0) { model = GetArgument(args, "--model"); - lm = GetArgument(args, "--lm"); - trie = GetArgument(args, "--trie"); + scorer = GetArgument(args, "--scorer"); audio = GetArgument(args, "--audio"); extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended")); } const uint BEAM_WIDTH = 500; - const float LM_ALPHA = 0.75f; - const float LM_BETA = 1.85f; Stopwatch stopwatch = new Stopwatch(); try @@ -64,14 +60,10 @@ namespace CSharpExamples Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms"); stopwatch.Reset(); - if (lm != null) + if (scorer != null) { - Console.WriteLine("Loadin LM..."); - sttClient.EnableDecoderWithLM( - lm ?? "lm.binary", - trie ?? "trie", - LM_ALPHA, LM_BETA); - + Console.WriteLine("Loading scorer..."); + sttClient.EnableExternalScorer(scorer ?? "kenlm.scorer"); } string audioFile = audio ?? 
"arctic_a0024.wav"; diff --git a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java index a1065d4e..12e758df 100644 --- a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java +++ b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java @@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity { Button _startInference; final int BEAM_WIDTH = 50; - final float LM_ALPHA = 0.75f; - final float LM_BETA = 1.85f; private char readLEChar(RandomAccessFile f) throws IOException { byte b1 = f.readByte(); diff --git a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java index 50ad71f2..bb6bbe42 100644 --- a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java +++ b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java @@ -30,15 +30,11 @@ import java.nio.ByteBuffer; public class BasicTest { public static final String modelFile = "/data/local/tmp/test/output_graph.tflite"; - public static final String lmFile = "/data/local/tmp/test/lm.binary"; - public static final String trieFile = "/data/local/tmp/test/trie"; + public static final String scorerFile = "/data/local/tmp/test/kenlm.scorer"; public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav"; public static final int BEAM_WIDTH = 50; - public static final float LM_ALPHA = 0.75f; - public static final float LM_BETA = 1.85f; - private char readLEChar(RandomAccessFile f) throws IOException { byte b1 = f.readByte(); byte b2 = f.readByte(); @@ -130,7 +126,7 @@ public class BasicTest { @Test public void loadDeepSpeech_stt_withLM() { DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH); - m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA); + m.enableExternalScorer(scorerFile); String decoded = doSTT(m, false); assertEquals("she had your dark suit in greasy wash water all year", decoded); @@ -149,7 +145,7 @@ public class BasicTest { @Test public void loadDeepSpeech_sttWithMetadata_withLM() { DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH); - m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA); + m.enableExternalScorer(scorerFile); String decoded = doSTT(m, true); assertEquals("she had your dark suit in greasy wash water all year", decoded); diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index e063f86b..0438ac10 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -47,17 +47,35 @@ public class DeepSpeechModel { } /** - * @brief Enable decoding using beam scoring with a KenLM language model. + * @brief Enable decoding using an external scorer. * - * @param lm The path to the language model binary file. - * @param trie The path to the trie file build from the same vocabulary as the language model binary. - * @param lm_alpha The alpha hyperparameter of the CTC decoder. 
Language Model weight.
-     * @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
+     * @param scorer The path to the external scorer file.
      *
      * @return Zero on success, non-zero on failure (invalid arguments).
      */
-    public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
-        impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
+    public void enableExternalScorer(String scorer) {
+        impl.EnableExternalScorer(this._msp, scorer);
+    }
+
+    /**
+     * @brief Disable decoding using an external scorer.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void disableExternalScorer() {
+        impl.DisableExternalScorer(this._msp);
+    }
+
+    /**
+     * @brief Set hyperparameters alpha and beta of the external scorer.
+     *
+     * @param alpha The alpha hyperparameter of the decoder. Language model weight.
+     * @param beta The beta hyperparameter of the decoder. Word insertion weight.
+     *
+     * @return Zero on success, non-zero on failure (invalid arguments).
+     */
+    public void setScorerAlphaBeta(float alpha, float beta) {
+        impl.SetScorerAlphaBeta(this._msp, alpha, beta);
     }
 
     /*
diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js
index b504650f..79561a97 100644
--- a/native_client/javascript/client.js
+++ b/native_client/javascript/client.js
@@ -29,12 +29,11 @@ VersionAction.prototype.call = function(parser) {
 var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
 parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
-parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
 parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
 parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
-parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha)', defaultValue: 0.75, type: 'float'});
-parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta)', defaultValue: 1.85, type: 'float'});
+parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
+parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
 parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
 parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
 var args = parser.parseArgs();
@@ -60,12 +59,16 @@ console.error('Loaded model in %ds.', totalTime(model_load_end));
 
 var desired_sample_rate = model.sampleRate();
 
-if (args['lm'] && args['trie']) {
-    console.error('Loading language model from files %s %s', args['lm'], args['trie']);
-    const lm_load_start = process.hrtime();
-    model.enableDecoderWithLM(args['lm'], args['trie'], args['lm_alpha'], args['lm_beta']);
-    const lm_load_end = process.hrtime(lm_load_start);
-    console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+if (args['scorer']) {
+    console.error('Loading scorer from file %s', args['scorer']);
+    const scorer_load_start = process.hrtime();
+    model.enableExternalScorer(args['scorer']);
+    const scorer_load_end = process.hrtime(scorer_load_start);
+    console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
+
+    if (args['lm_alpha'] && args['lm_beta']) {
+        model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
+    }
 }
 
 const buffer = Fs.readFileSync(args['audio']);
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 1d4137c7..2ce039bf 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -52,31 +52,46 @@ Model.prototype.sampleRate = function() {
 }
 
 /**
- * Enable decoding using beam scoring with a KenLM language model.
+ * Enable decoding using an external scorer.
+ *
+ * @param {string} aScorerPath The path to the external scorer file.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.enableExternalScorer = function(aScorerPath) {
+    return binding.EnableExternalScorer(this._impl, aScorerPath);
+}
+
+/**
+ * Disable decoding using an external scorer.
+ *
+ * @return {number} Zero on success, non-zero on failure (invalid arguments).
+ */
+Model.prototype.disableExternalScorer = function() {
+    return binding.DisableExternalScorer(this._impl);
+}
+
+/**
+ * Set hyperparameters alpha and beta of the external scorer.
  *
- * @param {string} aLMPath The path to the language model binary file.
- * @param {string} aTriePath The path to the trie file build from the same vocabulary as the language model binary.
 * @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
 * @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
 *
 * @return {number} Zero on success, non-zero on failure (invalid arguments).
 */
-Model.prototype.enableDecoderWithLM = function() {
-    const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    return binding.EnableDecoderWithLM.apply(null, args);
+Model.prototype.setScorerAlphaBeta = function(aLMAlpha, aLMBeta) {
+    return binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
 }
 
 /**
 * Use the DeepSpeech model to perform Speech-To-Text.
 *
 * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
- * @param {number} aBufferSize The number of samples in the audio signal.
 *
 * @return {string} The STT result. Returns undefined on error.
*/ -Model.prototype.stt = function() { - const args = [this._impl].concat(Array.prototype.slice.call(arguments)); - return binding.SpeechToText.apply(null, args); +Model.prototype.stt = function(aBuffer) { + return binding.SpeechToText(this._impl, aBuffer); } /** @@ -84,25 +99,22 @@ Model.prototype.stt = function() { * about the results. * * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). - * @param {number} aBufferSize The number of samples in the audio signal. * * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error. */ -Model.prototype.sttWithMetadata = function() { - const args = [this._impl].concat(Array.prototype.slice.call(arguments)); - return binding.SpeechToTextWithMetadata.apply(null, args); +Model.prototype.sttWithMetadata = function(aBuffer) { + return binding.SpeechToTextWithMetadata(this._impl, aBuffer); } /** - * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`. + * Create a new streaming inference state. One can then call :js:func:`Stream.feedAudioContent` and :js:func:`Stream.finishStream` on the returned stream object. * - * @return {object} an opaque object that represents the streaming state. + * @return {object} a :js:func:`Stream` object that represents the streaming state. * * @throws on error */ Model.prototype.createStream = function() { - const args = [this._impl].concat(Array.prototype.slice.call(arguments)); - const rets = binding.CreateStream.apply(null, args); + const rets = binding.CreateStream(this._impl); const status = rets[0]; const ctx = rets[1]; if (status !== 0) { @@ -111,55 +123,56 @@ Model.prototype.createStream = function() { return ctx; } +function Stream(nativeStream) { + this._impl = nativeStream; +} + /** * Feed audio samples to an ongoing streaming inference. * - * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`. * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the * appropriate sample rate (matching what the model was trained on). - * @param {number} aBufferSize The number of samples in @param aBuffer. */ -Model.prototype.feedAudioContent = function() { - binding.FeedAudioContent.apply(null, arguments); +Stream.prototype.feedAudioContent = function(aBuffer) { + binding.FeedAudioContent(this._impl, aBuffer); } /** * Compute the intermediate decoding of an ongoing streaming inference. * - * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`. - * * @return {string} The STT intermediate result. */ -Model.prototype.intermediateDecode = function() { - return binding.IntermediateDecode.apply(null, arguments); +Stream.prototype.intermediateDecode = function() { + return binding.IntermediateDecode(this._impl); } /** * Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal. * - * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`. - * * @return {string} The STT result. * - * This method will free the state (@param aSctx). + * This method will free the stream, it must not be used after this method is called. 
 */
-Model.prototype.finishStream = function() {
-    return binding.FinishStream.apply(null, arguments);
+Stream.prototype.finishStream = function() {
+    const result = binding.FinishStream(this._impl);
+    this._impl = null;
+    return result;
 }
 
 /**
 * Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
 *
- * @param {object} aSctx A streaming state pointer returned by :js:func:`Model.setupStream`.
- *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
 *
- * This method will free the state pointer (@param aSctx).
+ * This method will free the stream, it must not be used after this method is called.
 */
-Model.prototype.finishStreamWithMetadata = function() {
-    return binding.FinishStreamWithMetadata.apply(null, arguments);
+Stream.prototype.finishStreamWithMetadata = function() {
+    const result = binding.FinishStreamWithMetadata(this._impl);
+    this._impl = null;
+    return result;
 }
 
+
 /**
 * Frees associated resources and destroys model object.
 *
@@ -184,10 +197,10 @@ function FreeMetadata(metadata) {
 * can be used if you no longer need the result of an ongoing streaming
 * inference and don't want to perform a costly decode operation.
 *
- * @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
+ * @param {Object} stream A stream object returned by :js:func:`Model.createStream`.
 */
 function FreeStream(stream) {
-    return binding.FreeStream(stream);
+    return binding.FreeStream(stream._impl);
 }
 
 /**
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index 0cd220e8..ee38287f 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -21,7 +21,6 @@ import deepspeech
 
 # rename for backwards compatibility
 from deepspeech.impl import PrintVersions as printVersions
-from deepspeech.impl import FreeStream as freeStream
 
 class Model(object):
     """
@@ -56,127 +55,159 @@ class Model(object):
         """
         return deepspeech.impl.GetModelSampleRate(self._impl)
 
-    def enableDecoderWithLM(self, *args, **kwargs):
+    def enableExternalScorer(self, scorer_path):
         """
-        Enable decoding using beam scoring with a KenLM language model.
+        Enable decoding using an external scorer.
 
-        :param aLMPath: The path to the language model binary file.
-        :type aLMPath: str
+        :param scorer_path: The path to the external scorer file.
+        :type scorer_path: str
 
-        :param aTriePath: The path to the trie file build from the same vocabulary as the language model binary.
-        :type aTriePath: str
-
-        :param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
-        :type aLMAlpha: float
-
-        :param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
-        :type aLMBeta: float
-
-        :return: Zero on success, non-zero on failure (invalid arguments).
+        :return: Zero on success, non-zero on failure.
        :type: int
        """
-        return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)
+        return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)
 
-    def stt(self, *args, **kwargs):
+    def disableExternalScorer(self):
+        """
+        Disable decoding using an external scorer.
+
+        :return: Zero on success, non-zero on failure.
+        """
+        return deepspeech.impl.DisableExternalScorer(self._impl)
+
+    def setScorerAlphaBeta(self, alpha, beta):
+        """
+        Set hyperparameters alpha and beta of the external scorer.
+
+        :param alpha: The alpha hyperparameter of the decoder.
Language model weight. + :type alpha: float + + :param beta: The beta hyperparameter of the decoder. Word insertion weight. + :type beta: float + + :return: Zero on success, non-zero on failure. + :type: int + """ + return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta) + + def stt(self, audio_buffer): """ Use the DeepSpeech model to perform Speech-To-Text. - :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). - :type aBuffer: int array - - :param aBufferSize: The number of samples in the audio signal. - :type aBufferSize: int + :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + :type audio_buffer: numpy.int16 array :return: The STT result. :type: str """ - return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs) + return deepspeech.impl.SpeechToText(self._impl, audio_buffer) - def sttWithMetadata(self, *args, **kwargs): + def sttWithMetadata(self, audio_buffer): """ Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results. - :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). - :type aBuffer: int array - - :param aBufferSize: The number of samples in the audio signal. - :type aBufferSize: int + :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + :type audio_buffer: numpy.int16 array :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` """ - return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs) + return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer) def createStream(self): """ - Create a new streaming inference state. The streaming state returned - by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`. + Create a new streaming inference state. The streaming state returned by + this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`. - :return: Object holding the stream + :return: Stream object representing the newly created stream + :type: :func:`Stream` :throws: RuntimeError on error """ status, ctx = deepspeech.impl.CreateStream(self._impl) if status != 0: raise RuntimeError("CreateStream failed with error code {}".format(status)) - return ctx + return Stream(ctx) - # pylint: disable=no-self-use - def feedAudioContent(self, *args, **kwargs): + +class Stream(object): + def __init__(self, native_stream): + self._impl = native_stream + + def __del__(self): + if self._impl: + self.freeStream() + + def feedAudioContent(self, audio_buffer): """ Feed audio samples to an ongoing streaming inference. - :param aSctx: A streaming state pointer returned by :func:`createStream()`. - :type aSctx: object + :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). + :type audio_buffer: numpy.int16 array - :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - :type aBuffer: int array - - :param aBufferSize: The number of samples in @p aBuffer. - :type aBufferSize: int + :throws: RuntimeError if the stream object is not valid """ - deepspeech.impl.FeedAudioContent(*args, **kwargs) + if not self._impl: + raise RuntimeError("Stream object is not valid. 
Trying to feed an already finished stream?") + deepspeech.impl.FeedAudioContent(self._impl, audio_buffer) - # pylint: disable=no-self-use - def intermediateDecode(self, *args, **kwargs): + def intermediateDecode(self): """ Compute the intermediate decoding of an ongoing streaming inference. - :param aSctx: A streaming state pointer returned by :func:`createStream()`. - :type aSctx: object - :return: The STT intermediate result. :type: str - """ - return deepspeech.impl.IntermediateDecode(*args, **kwargs) - # pylint: disable=no-self-use - def finishStream(self, *args, **kwargs): + :throws: RuntimeError if the stream object is not valid """ - Signal the end of an audio signal to an ongoing streaming - inference, returns the STT result over the whole audio signal. + if not self._impl: + raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?") + return deepspeech.impl.IntermediateDecode(self._impl) - :param aSctx: A streaming state pointer returned by :func:`createStream()`. - :type aSctx: object + def finishStream(self): + """ + Signal the end of an audio signal to an ongoing streaming inference, + returns the STT result over the whole audio signal. :return: The STT result. :type: str - """ - return deepspeech.impl.FinishStream(*args, **kwargs) - # pylint: disable=no-self-use - def finishStreamWithMetadata(self, *args, **kwargs): + :throws: RuntimeError if the stream object is not valid """ - Signal the end of an audio signal to an ongoing streaming - inference, returns per-letter metadata. + if not self._impl: + raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?") + result = deepspeech.impl.FinishStream(self._impl) + self._impl = None + return result - :param aSctx: A streaming state pointer returned by :func:`createStream()`. - :type aSctx: object + def finishStreamWithMetadata(self): + """ + Signal the end of an audio signal to an ongoing streaming inference, + returns per-letter metadata. :return: Outputs a struct of individual letters along with their timing information. :type: :func:`Metadata` + + :throws: RuntimeError if the stream object is not valid """ - return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs) + if not self._impl: + raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?") + result = deepspeech.impl.FinishStreamWithMetadata(self._impl) + self._impl = None + return result + + def freeStream(self): + """ + Destroy a streaming state without decoding the computed logits. This can + be used if you no longer need the result of an ongoing streaming inference. + + :throws: RuntimeError if the stream object is not valid + """ + if not self._impl: + raise RuntimeError("Stream object is not valid. 
Trying to free an already finished stream?")
+        deepspeech.impl.FreeStream(self._impl)
+        self._impl = None
+
 
 # This is only for documentation purpose
 # Metadata and MetadataItem should be in sync with native_client/deepspeech.h
@@ -189,22 +220,18 @@ class MetadataItem(object):
         """
         The character generated for transcription
         """
-        # pylint: disable=unnecessary-pass
-        pass
+
 
     def timestep(self):
         """
         Position of the character in units of 20ms
         """
-        # pylint: disable=unnecessary-pass
-        pass
+
 
     def start_time(self):
         """
         Position of the character in seconds
         """
-        # pylint: disable=unnecessary-pass
-        pass
 
 
 class Metadata(object):
@@ -218,8 +245,7 @@
         :return: A list of :func:`MetadataItem` elements
         :type: list
         """
-        # pylint: disable=unnecessary-pass
-        pass
+
 
     def num_items(self):
         """
         :return: Size of the list of items
         :type: int
         """
-        # pylint: disable=unnecessary-pass
-        pass
+
 
     def confidence(self):
         """
         sum of the acoustic model logit values for each timestep/character
         that contributed to the creation of this transcription.
         """
-        # pylint: disable=unnecessary-pass
-        pass
+
diff --git a/native_client/python/client.py b/native_client/python/client.py
index 91a63491..ba5d70b2 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -72,7 +72,7 @@ def metadata_json_output(metadata):
     json_result["words"] = words_from_metadata(metadata)
     json_result["confidence"] = metadata.confidence
     return json.dumps(json_result)
-    
+
 
 
 class VersionAction(argparse.Action):
@@ -88,17 +88,15 @@ def main():
     parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
     parser.add_argument('--model', required=True,
                         help='Path to the model (protocol buffer binary file)')
-    parser.add_argument('--lm', nargs='?',
-                        help='Path to the language model binary file')
-    parser.add_argument('--trie', nargs='?',
-                        help='Path to the language model trie file created with native_client/generate_trie')
+    parser.add_argument('--scorer', required=False,
+                        help='Path to the external scorer file')
     parser.add_argument('--audio', required=True,
                         help='Path to the audio file to run (WAV format)')
     parser.add_argument('--beam_width', type=int, default=500,
                         help='Beam width for the CTC decoder')
-    parser.add_argument('--lm_alpha', type=float, default=0.75,
+    parser.add_argument('--lm_alpha', type=float,
                         help='Language model weight (lm_alpha)')
-    parser.add_argument('--lm_beta', type=float, default=1.85,
+    parser.add_argument('--lm_beta', type=float,
                         help='Word insertion bonus (lm_beta)')
     parser.add_argument('--version', action=VersionAction,
                         help='Print version and exits')
@@ -116,12 +114,15 @@
 
     desired_sample_rate = ds.sampleRate()
 
-    if args.lm and args.trie:
-        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
-        lm_load_start = timer()
-        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
-        lm_load_end = timer() - lm_load_start
-        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)
+    if args.scorer:
+        print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
+        scorer_load_start = timer()
+        ds.enableExternalScorer(args.scorer)
+        scorer_load_end = timer() - scorer_load_start
+        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
+
+        if args.lm_alpha and args.lm_beta:
+            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
 
     fin = wave.open(args.audio, 'rb')
     fs =
fin.getframerate() diff --git a/native_client/test/concurrent_streams.py b/native_client/test/concurrent_streams.py index 2b2b4ed0..d799de36 100644 --- a/native_client/test/concurrent_streams.py +++ b/native_client/test/concurrent_streams.py @@ -14,21 +14,13 @@ from deepspeech import Model # Beam width used in the CTC decoder when building candidate transcriptions BEAM_WIDTH = 500 -# The alpha hyperparameter of the CTC decoder. Language Model weight -LM_ALPHA = 0.75 - -# The beta hyperparameter of the CTC decoder. Word insertion bonus. -LM_BETA = 1.85 - def main(): parser = argparse.ArgumentParser(description='Running DeepSpeech inference.') parser.add_argument('--model', required=True, help='Path to the model (protocol buffer binary file)') - parser.add_argument('--lm', nargs='?', - help='Path to the language model binary file') - parser.add_argument('--trie', nargs='?', - help='Path to the language model trie file created with native_client/generate_trie') + parser.add_argument('--scorer', nargs='?', + help='Path to the external scorer file') parser.add_argument('--audio1', required=True, help='First audio file to use in interleaved streams') parser.add_argument('--audio2', required=True, @@ -37,8 +29,8 @@ def main(): ds = Model(args.model, BEAM_WIDTH) - if args.lm and args.trie: - ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA) + if args.scorer: + ds.enableExternalScorer(args.scorer) fin = wave.open(args.audio1, 'rb') fs1 = fin.getframerate() @@ -57,11 +49,11 @@ def main(): splits2 = np.array_split(audio2, 10) for part1, part2 in zip(splits1, splits2): - ds.feedAudioContent(stream1, part1) - ds.feedAudioContent(stream2, part2) + stream1.feedAudioContent(part1) + stream2.feedAudioContent(part2) - print(ds.finishStream(stream1)) - print(ds.finishStream(stream2)) + print(stream1.finishStream()) + print(stream2.finishStream()) if __name__ == '__main__': main() diff --git a/taskcluster/arm64-build.sh b/taskcluster/arm64-build.sh index 178b9b35..26518d2d 100644 --- a/taskcluster/arm64-build.sh +++ b/taskcluster/arm64-build.sh @@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so -//native_client:generate_trie " BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}" diff --git a/taskcluster/cuda-build.sh b/taskcluster/cuda-build.sh index cfc77824..df3e049f 100755 --- a/taskcluster/cuda-build.sh +++ b/taskcluster/cuda-build.sh @@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so -//native_client:generate_trie " BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}" diff --git a/taskcluster/examples-base.tyml b/taskcluster/examples-base.tyml index 5f3a1bdb..9739f36a 100644 --- a/taskcluster/examples-base.tyml +++ b/taskcluster/examples-base.tyml @@ -30,11 +30,11 @@ then: image: ${build.docker_image} env: - DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.15/models.tar.gz" + DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.1/models.tar.gz" DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz" PIP_DEFAULT_TIMEOUT: "60" EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" - EXAMPLES_CHECKOUT_TARGET: "master" + EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05" command: - "/bin/bash" diff --git a/taskcluster/host-build.sh b/taskcluster/host-build.sh index ac01f2f5..1575832c 100755 --- a/taskcluster/host-build.sh +++ 
b/taskcluster/host-build.sh @@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so -//native_client:generate_trie " if [ "${runtime}" = "tflite" ]; then diff --git a/taskcluster/rpi3-build.sh b/taskcluster/rpi3-build.sh index 2fbaf8b1..3b17d7ef 100755 --- a/taskcluster/rpi3-build.sh +++ b/taskcluster/rpi3-build.sh @@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so -//native_client:generate_trie " BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}" diff --git a/taskcluster/tc-evaluate_tflite.sh b/taskcluster/tc-evaluate_tflite.sh index 6b4f6d32..dce4b63f 100755 --- a/taskcluster/tc-evaluate_tflite.sh +++ b/taskcluster/tc-evaluate_tflite.sh @@ -49,7 +49,7 @@ deepspeech --version pushd ${HOME}/DeepSpeech/ds/ python bin/import_ldc93s1.py data/smoke_test - python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --lm data/smoke_test/vocab.pruned.lm --trie data/smoke_test/vocab.trie --csv data/smoke_test/ldc93s1.csv + python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --scorer data/smoke_test/pruned_lm.scorer --csv data/smoke_test/ldc93s1.csv popd virtualenv_deactivate "${pyalias}" "${PYENV_NAME}" diff --git a/taskcluster/tc-tests-utils.sh b/taskcluster/tc-tests-utils.sh index 4841afaf..877aa4ad 100755 --- a/taskcluster/tc-tests-utils.sh +++ b/taskcluster/tc-tests-utils.sh @@ -378,7 +378,7 @@ run_netframework_inference_tests() assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" set +e - phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) set -e assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" } @@ -401,7 +401,7 @@ run_electronjs_inference_tests() assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) set -e assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" } @@ -427,7 +427,7 @@ run_basic_inference_tests() assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status" set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? 
set -e assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status" @@ -444,7 +444,7 @@ run_all_inference_tests() assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status" set +e - phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? set -e assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status" @@ -457,7 +457,7 @@ run_all_inference_tests() assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}" set +e - phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) + phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) set -e assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" fi; @@ -470,8 +470,7 @@ run_prod_concurrent_stream_tests() set +e output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \ --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \ - --lm ${TASKCLUSTER_TMP_DIR}/lm.binary \ - --trie ${TASKCLUSTER_TMP_DIR}/trie \ + --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \ --audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \ --audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? @@ -489,19 +488,19 @@ run_prod_inference_tests() local _bitrate=$1 set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? set -e assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? 
set -e assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" set +e - phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? set -e assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}" @@ -509,7 +508,7 @@ run_prod_inference_tests() # Run down-sampling warning test only when we actually perform downsampling if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then set +e - phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) + phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) set -e assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" fi; @@ -520,19 +519,19 @@ run_prodtflite_inference_tests() local _bitrate=$1 set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? set -e assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" set +e - phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? set -e assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}" set +e - phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) + phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) status=$? 
set -e assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}" @@ -540,7 +539,7 @@ run_prodtflite_inference_tests() # Run down-sampling warning test only when we actually perform downsampling if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then set +e - phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) + phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) set -e assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" fi; @@ -555,7 +554,7 @@ run_multi_inference_tests() assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status" set +e -o pipefail - multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%') + multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%') status=$? set -e +o pipefail assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status" @@ -564,7 +563,7 @@ run_multi_inference_tests() run_cpp_only_inference_tests() { set +e - phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1) + phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1) status=$? 
set -e assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status" @@ -669,8 +668,7 @@ download_data() ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}" ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}" cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/ - cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary - cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie + cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources } @@ -1562,7 +1560,6 @@ package_native_client() fi; ${TAR} -cf - \ - -C ${tensorflow_dir}/bazel-bin/native_client/ generate_trie${PLATFORM_EXE_SUFFIX} \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \ -C ${deepspeech_dir}/ LICENSE \ @@ -1767,8 +1764,7 @@ android_setup_apk_data() adb push \ ${TASKCLUSTER_TMP_DIR}/${model_name} \ ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \ - ${TASKCLUSTER_TMP_DIR}/lm.binary \ - ${TASKCLUSTER_TMP_DIR}/trie \ + ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \ ${ANDROID_TMP_DIR}/test/ } diff --git a/taskcluster/win-build.sh b/taskcluster/win-build.sh index e3a4133d..39c3f261 100755 --- a/taskcluster/win-build.sh +++ b/taskcluster/win-build.sh @@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so -//native_client:generate_trie " if [ "${package_option}" = "--cuda" ]; then diff --git a/taskcluster/win-opt-base.tyml b/taskcluster/win-opt-base.tyml index f1a3c680..e0c12162 100644 --- a/taskcluster/win-opt-base.tyml +++ b/taskcluster/win-opt-base.tyml @@ -44,7 +44,7 @@ payload: MSYS: 'winsymlinks:nativestrict' TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow} EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples" - EXAMPLES_CHECKOUT_TARGET: "master" + EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05" command: - >- diff --git a/transcribe.py b/transcribe.py index 8c761a9a..c66bbe61 100755 --- a/transcribe.py +++ b/transcribe.py @@ -29,7 +29,7 @@ def fail(message, code=1): def transcribe_file(audio_path, tlog_path): from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel initialize_globals() - scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet) + scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet) try: num_processes = cpu_count() except NotImplementedError: diff --git a/util/flags.py b/util/flags.py index 49d54fd0..c3ed2af8 100644 --- a/util/flags.py +++ b/util/flags.py @@ -143,10 +143,8 @@ def create_flags(): f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.') f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. 
See the comment in data/alphabet.txt for a description of the format.') - f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM') - f.DEFINE_alias('lm', 'lm_binary_path') - f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie') - f.DEFINE_alias('trie', 'lm_trie_path') + f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py') + f.DEFINE_alias('scorer', 'scorer_path') f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions') f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.') f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.')
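
Taken together, the consumer-side migration is mechanical: the paired --lm/--trie inputs become a single --scorer package, enableDecoderWithLM(lm, trie, alpha, beta) becomes enableExternalScorer(scorer) plus an optional setScorerAlphaBeta(alpha, beta), and streaming calls move from the Model to the stream object. Below is a minimal sketch of the resulting Python usage; the model/scorer/audio paths are placeholders, and createStream() is assumed to be the Model method returning the stream object whose feedAudioContent()/intermediateDecode()/finishStream() methods this patch implements.

# Sketch of the post-patch Python API (placeholder paths; createStream() assumed).
import wave

import numpy as np
from deepspeech import Model

BEAM_WIDTH = 500

ds = Model('output_graph.pbmm', BEAM_WIDTH)   # placeholder model path
ds.enableExternalScorer('kenlm.scorer')       # replaces enableDecoderWithLM(lm, trie, alpha, beta)
ds.setScorerAlphaBeta(0.75, 1.85)             # optional override; the CLI flags no longer carry defaults

fin = wave.open('audio.wav', 'rb')            # placeholder 16 kHz mono WAV
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()

# Streaming methods now live on the stream object, not on the model.
stream = ds.createStream()
for chunk in np.array_split(audio, 10):
    stream.feedAudioContent(chunk)
print(stream.intermediateDecode())            # partial transcript; stream stays valid
print(stream.finishStream())                  # final transcript; stream is invalidated

Note that after finishStream(), finishStreamWithMetadata(), or freeStream(), the wrapper sets its internal handle to None, so any further call on that stream raises RuntimeError instead of touching freed native state.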