Update all API consumers

Reuben Morais 2020-01-21 11:54:01 +01:00
parent 708b21a63e
commit 1e2eb96248
41 changed files with 393 additions and 516 deletions

View File

@ -882,8 +882,7 @@ def package_zip():
}
}, f)
shutil.copy(FLAGS.lm_binary_path, export_dir)
shutil.copy(FLAGS.lm_trie_path, export_dir)
shutil.copy(FLAGS.scorer_path, export_dir)
archive = shutil.make_archive(zip_filename, 'zip', export_dir)
log_info('Exported packaged model {}'.format(archive))
@ -926,10 +925,9 @@ def do_single_file_inference(input_file_path):
logits = np.squeeze(logits)
if FLAGS.lm_binary_path:
if FLAGS.scorer_path:
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
FLAGS.lm_binary_path, FLAGS.lm_trie_path,
Config.alphabet)
FLAGS.scorer_path, Config.alphabet)
else:
scorer = None
decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
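Net effect of these first two hunks: training-side consumers now construct a Scorer from a single packaged file instead of an LM binary plus trie. A minimal sketch of the new call shape, assuming the ds_ctcdecoder package and the project's usual default hyperparameters (the helper function below is illustrative, not part of the commit):

    # Illustrative helper, not part of the commit: decode acoustic-model
    # logits with the new single-file scorer.
    from ds_ctcdecoder import Scorer, ctc_beam_search_decoder

    def decode_with_scorer(logits, alphabet, scorer_path):
        # Old: Scorer(alpha, beta, lm_binary_path, lm_trie_path, alphabet)
        # New: one .scorer file replaces the LM binary + trie pair
        scorer = Scorer(0.75, 1.85, scorer_path, alphabet)
        return ctc_beam_search_decoder(logits, alphabet, 500, scorer=scorer)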

View File

@ -172,7 +172,7 @@ RUN ./configure
# Build DeepSpeech
RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
###
### Using TensorFlow upstream should work
@ -187,8 +187,7 @@ RUN bazel build --workspace_status_command="bash native_client/bazel_workspace_s
# RUN pip3 install /tmp/tensorflow_pkg/*.whl
# Copy built libs to /DeepSpeech/native_client
RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
&& cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
RUN cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
# Install TensorFlow
WORKDIR /DeepSpeech/

View File

@ -21,8 +21,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--n_hidden 100 --epochs 1 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
--learning_rate 0.001 --dropout_rate 0.05 \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log
--scorer_path 'data/smoke_test/pruned_lm.scorer' | tee /tmp/resume.log
if ! grep "Restored variables from most recent checkpoint" /tmp/resume.log; then
echo "Did not resume training from checkpoint"

View File

@ -25,6 +25,5 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--n_hidden 100 --epochs $epoch_count \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
--audio_sample_rate ${audio_sample_rate}

View File

@ -21,12 +21,10 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
--n_hidden 100 --epochs 1 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
--learning_rate 0.001 --dropout_rate 0.05 \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie'
--scorer_path 'data/smoke_test/pruned_lm.scorer'
python -u DeepSpeech.py \
--n_hidden 100 \
--checkpoint_dir '/tmp/ckpt' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
--one_shot_infer 'data/smoke_test/LDC93S1.wav'

View File

@ -20,8 +20,7 @@ python -u DeepSpeech.py --noshow_progressbar \
--n_hidden 100 \
--checkpoint_dir '/tmp/ckpt' \
--export_dir '/tmp/train_tflite' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
--audio_sample_rate ${audio_sample_rate} \
--export_tflite
@ -31,8 +30,7 @@ python -u DeepSpeech.py --noshow_progressbar \
--n_hidden 100 \
--checkpoint_dir '/tmp/ckpt' \
--export_dir '/tmp/train_tflite/en-us' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
--audio_sample_rate ${audio_sample_rate} \
--export_language 'Fake English (fk-FK)' \
--export_zip

View File

@ -50,7 +50,7 @@ def create_bundle(alphabet_path, lm_path, vocab_path, package_path, force_utf8,
scorer.set_alphabet(alphabet)
scorer.set_utf8_mode(use_utf8)
scorer.reset_params(default_alpha, default_beta)
scorer.load_lm(lm_path, "")
scorer.load_lm(lm_path)
scorer.fill_dictionary(list(words))
shutil.copy(lm_path, package_path)
scorer.save_dictionary(package_path, True) # append, not overwrite
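
This is the producer side of the change: the .scorer package that every consumer below loads is the LM binary with the decoder dictionary appended to it. A condensed sketch of the sequence, using only the calls visible in the hunk above; the alphabet/words inputs and the default alpha/beta values are assumptions:

    # Condensed sketch of the packaging flow; everything outside the
    # scorer calls is assumed.
    import shutil
    from ds_ctcdecoder import Scorer

    def build_scorer_package(alphabet, words, lm_path, package_path,
                             default_alpha=0.75, default_beta=1.85,
                             use_utf8=False):
        scorer = Scorer()                    # bare initialization
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(default_alpha, default_beta)
        scorer.load_lm(lm_path)              # no trie argument anymore
        scorer.fill_dictionary(list(words))
        shutil.copy(lm_path, package_path)   # LM binary is the package base
        scorer.save_dictionary(package_path, True)  # append, not overwrite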

View File

@ -7,7 +7,13 @@ C
.. doxygenfunction:: DS_FreeModel
:project: deepspeech-c
.. doxygenfunction:: DS_EnableDecoderWithLM
.. doxygenfunction:: DS_EnableExternalScorer
:project: deepspeech-c
.. doxygenfunction:: DS_DisableExternalScorer
:project: deepspeech-c
.. doxygenfunction:: DS_SetScorerAlphaBeta
:project: deepspeech-c
.. doxygenfunction:: DS_GetModelSampleRate

View File

@ -42,10 +42,9 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
def evaluate(test_csvs, create_model, try_loading):
if FLAGS.lm_binary_path:
if FLAGS.scorer_path:
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
FLAGS.lm_binary_path, FLAGS.lm_trie_path,
Config.alphabet)
FLAGS.scorer_path, Config.alphabet)
else:
scorer = None

View File

@ -27,17 +27,18 @@ This module should be self-contained:
- pip install native_client/python/dist/deepspeech*.whl
- pip install -r requirements_eval_tflite.txt
Then run with a TF Lite model, LM/trie and a CSV test file
Then run with a TF Lite model, LM and a CSV test file
'''
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
ds = Model(model, BEAM_WIDTH)
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
ds.enableExternalScorer(scorer)
ds.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
while True:
try:
@ -64,7 +65,7 @@ def main(args, _):
processes = []
for i in range(args.proc):
worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
worker_process.start() # Launch reader() as a separate python process
processes.append(worker_process)
@ -113,10 +114,8 @@ def parse_args():
parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', required=True,
help='Path to the language model binary file')
parser.add_argument('--trie', required=True,
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--scorer', required=True,
help='Path to the external scorer file')
parser.add_argument('--csv', required=True,
help='Path to the CSV source file')
parser.add_argument('--proc', required=False, default=cpu_count(), type=int,
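
This is the consumer-side pattern repeated across every binding in this commit: one call to load the scorer, plus an optional call to override alpha and beta. A sketch with the deepspeech Python package, mirroring tflite_worker above (the model path is a stand-in; 0.75 and 1.85 are the constants defined in this file):

    from deepspeech import Model

    ds = Model('output_graph.tflite', 500)   # (model path, beam width)
    # Old: ds.enableDecoderWithLM(lm_path, trie_path, LM_ALPHA, LM_BETA)
    ds.enableExternalScorer('kenlm.scorer')  # one file instead of LM + trie
    ds.setScorerAlphaBeta(0.75, 1.85)        # optional: the scorer ships defaults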

View File

@ -12,19 +12,17 @@
char* model = NULL;
char* lm = NULL;
char* trie = NULL;
char* scorer = NULL;
char* audio = NULL;
int beam_width = 500;
float lm_alpha = 0.75f;
bool set_alphabeta = false;
float lm_beta = 1.85f;
float lm_alpha = 0.f;
bool load_without_trie = false;
float lm_beta = 0.f;
bool show_times = false;
@ -39,39 +37,36 @@ int stream_size = 0;
void PrintHelp(const char* bin)
{
std::cout <<
"Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
"Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n"
"\n"
"Running DeepSpeech inference.\n"
"\n"
" --model MODEL Path to the model (protocol buffer binary file)\n"
" --lm LM Path to the language model binary file\n"
" --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
" --audio AUDIO Path to the audio file to run (WAV format)\n"
" --beam_width BEAM_WIDTH Value for decoder beam width (int)\n"
" --lm_alpha LM_ALPHA Value for language model alpha param (float)\n"
" --lm_beta LM_BETA Value for language model beta param (float)\n"
" -t Run in benchmark mode, output mfcc & inference time\n"
" --extended Output string from extended metadata\n"
" --json Extended output, shows word timings as JSON\n"
" --stream size Run in stream mode, output intermediate results\n"
" --help Show help\n"
" --version Print version and exits\n";
"\t--model MODEL\t\tPath to the model (protocol buffer binary file)\n"
"\t--scorer SCORER\t\tPath to the external scorer file\n"
"\t--audio AUDIO\t\tPath to the audio file to run (WAV format)\n"
"\t--beam_width BEAM_WIDTH\tValue for decoder beam width (int)\n"
"\t--lm_alpha LM_ALPHA\tValue for language model alpha param (float)\n"
"\t--lm_beta LM_BETA\tValue for language model beta param (float)\n"
"\t-t\t\t\tRun in benchmark mode, output mfcc & inference time\n"
"\t--extended\t\tOutput string from extended metadata\n"
"\t--json\t\t\tExtended output, shows word timings as JSON\n"
"\t--stream size\t\tRun in stream mode, output intermediate results\n"
"\t--help\t\t\tShow help\n"
"\t--version\t\tPrint version and exits\n";
DS_PrintVersions();
exit(1);
}
bool ProcessArgs(int argc, char** argv)
{
const char* const short_opts = "m:a:l:r:w:c:d:b:tehv";
const char* const short_opts = "m:a:s:r:w:c:d:b:tehv";
const option long_opts[] = {
{"model", required_argument, nullptr, 'm'},
{"lm", required_argument, nullptr, 'l'},
{"trie", required_argument, nullptr, 'r'},
{"scorer", required_argument, nullptr, 'l'},
{"audio", required_argument, nullptr, 'w'},
{"beam_width", required_argument, nullptr, 'b'},
{"lm_alpha", required_argument, nullptr, 'c'},
{"lm_beta", required_argument, nullptr, 'd'},
{"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
{"t", no_argument, nullptr, 't'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
@ -95,31 +90,25 @@ bool ProcessArgs(int argc, char** argv)
break;
case 'l':
lm = optarg;
break;
case 'r':
trie = optarg;
scorer = optarg;
break;
case 'w':
audio = optarg;
break;
case 'b':
beam_width = atoi(optarg);
break;
case 'c':
lm_alpha = atof(optarg);
break;
case 'd':
lm_beta = atof(optarg);
break;
case 'b':
beam_width = atoi(optarg);
break;
case 'c':
set_alphabeta = true;
lm_alpha = atof(optarg);
break;
case 999:
load_without_trie = true;
case 'd':
set_alphabeta = true;
lm_beta = atof(optarg);
break;
case 't':

View File

@ -374,16 +374,19 @@ main(int argc, char **argv)
return 1;
}
if (lm && (trie || load_without_trie)) {
int status = DS_EnableDecoderWithLM(ctx,
lm,
trie,
lm_alpha,
lm_beta);
if (scorer) {
int status = DS_EnableExternalScorer(ctx, scorer);
if (status != 0) {
fprintf(stderr, "Could not enable CTC decoder with LM.\n");
fprintf(stderr, "Could not enable external scorer.\n");
return 1;
}
if (set_alphabeta) {
status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta);
if (status != 0) {
fprintf(stderr, "Error setting scorer alpha and beta.\n");
return 1;
}
}
}
#ifndef NO_SOX

View File

@ -12,12 +12,11 @@ class Scorer(swigwrapper.Scorer):
:type alpha: float
:param beta: Word insertion bonus.
:type beta: float
:model_path: Path to load language model.
:trie_path: Path to trie file.
:model_path: Path to load scorer.
:alphabet: Alphabet
:type model_path: basestring
"""
def __init__(self, alpha=None, beta=None, model_path=None, trie_path=None, alphabet=None):
def __init__(self, alpha=None, beta=None, model_path=None, alphabet=None):
super(Scorer, self).__init__()
# Allow bare initialization
if alphabet:
@ -27,15 +26,15 @@ class Scorer(swigwrapper.Scorer):
if err != 0:
raise ValueError("Error when deserializing alphabet.")
err = self.init(alpha, beta,
model_path.encode('utf-8'),
trie_path.encode('utf-8'),
err = self.init(model_path.encode('utf-8'),
native_alphabet)
if err != 0:
raise ValueError("Scorer initialization failed with error code {}".format(err), err)
def load_lm(self, lm_path, trie_path):
super(Scorer, self).load_lm(lm_path.encode('utf-8'), trie_path.encode('utf-8'))
self.reset_params(alpha, beta)
def load_lm(self, lm_path):
super(Scorer, self).load_lm(lm_path.encode('utf-8'))
def save_dictionary(self, save_path, *args, **kwargs):
super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs)

View File

@ -6,7 +6,6 @@
#include <unordered_map>
#include <vector>
#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"
@ -19,18 +18,6 @@ const std::string START_TOKEN = "<s>";
const std::string UNK_TOKEN = "<unk>";
const std::string END_TOKEN = "</s>";
// Implement a callback to retrieve the dictionary of language model.
class RetrieveStrEnumerateVocab : public lm::EnumerateVocab {
public:
RetrieveStrEnumerateVocab() {}
void Add(lm::WordIndex index, const StringPiece &str) {
vocabulary.push_back(std::string(str.data(), str.length()));
}
std::vector<std::string> vocabulary;
};
/* External scorer to query score for n-gram or sentence, including language
* model scoring and word insertion.
*

View File

@ -310,7 +310,7 @@ DS_EnableExternalScorer(ModelState* aCtx,
aCtx->scorer_.reset(new Scorer());
int err = aCtx->scorer_->init(aScorerPath, aCtx->alphabet_);
if (err != 0) {
return DS_ERR_INVALID_LM;
return DS_ERR_INVALID_SCORER;
}
return DS_ERR_OK;
}

View File

@ -59,7 +59,7 @@ enum DeepSpeech_Error_Codes
// Invalid parameters
DS_ERR_INVALID_ALPHABET = 0x2000,
DS_ERR_INVALID_SHAPE = 0x2001,
DS_ERR_INVALID_LM = 0x2002,
DS_ERR_INVALID_SCORER = 0x2002,
DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
DS_ERR_SCORER_NOT_ENABLED = 0x2004,
@ -129,7 +129,7 @@ DEEPSPEECH_EXPORT
int DS_DisableExternalScorer(ModelState* aCtx);
/**
* @brief Set hyperparameters alpha and beta of a KenLM external scorer.
* @brief Set hyperparameters alpha and beta of the external scorer.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlpha The alpha hyperparameter of the decoder. Language model weight.

View File

@ -1,141 +0,0 @@
#ifndef DEEPSPEECH_COMPAT_H
#define DEEPSPEECH_COMPAT_H
#include "deepspeech.h"
#warning This header is a convenience wrapper for compatibility with \
the previous API, it has deprecated function names and arguments. \
If possible, update your code instead of using this header.
/**
* @brief An object providing an interface to a trained DeepSpeech model.
*
* @param aModelPath The path to the frozen model graph.
* @param aNCep UNUSED, DEPRECATED.
* @param aNContext UNUSED, DEPRECATED.
* @param aAlphabetConfigPath UNUSED, DEPRECATED.
* @param aBeamWidth The beam width used by the decoder. A larger beam
* width generates better results at the cost of decoding
* time.
* @param[out] retval a ModelState pointer
*
* @return Zero on success, non-zero on failure.
*/
int DS_CreateModel(const char* aModelPath,
unsigned int /*aNCep*/,
unsigned int /*aNContext*/,
const char* /*aAlphabetConfigPath*/,
unsigned int aBeamWidth,
ModelState** retval)
{
return DS_CreateModel(aModelPath, aBeamWidth, retval);
}
/**
* @brief Frees associated resources and destroys model object.
*/
void DS_DestroyModel(ModelState* ctx)
{
return DS_FreeModel(ctx);
}
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlphabetConfigPath UNUSED, DEPRECATED.
* @param aLMPath The path to the language model binary file.
* @param aTriePath The path to the trie file build from the same vocabu-
* lary as the language model binary.
* @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
weight.
* @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
weight.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
int DS_EnableDecoderWithLM(ModelState* aCtx,
const char* /*aAlphabetConfigPath*/,
const char* aLMPath,
const char* aTriePath,
float aLMAlpha,
float aLMBeta)
{
return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
}
/**
* @brief Create a new streaming inference state. The streaming state returned
* by this function can then be passed to {@link DS_FeedAudioContent()}
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aSampleRate UNUSED, DEPRECATED.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*/
int DS_SetupStream(ModelState* aCtx,
unsigned int /*aSampleRate*/,
StreamingState** retval)
{
return DS_CreateStream(aCtx, retval);
}
/**
* @brief Destroy a streaming state without decoding the computed logits. This
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @note This method will free the state pointer (@p aSctx).
*/
void DS_DiscardStream(StreamingState* aSctx)
{
return DS_FreeStream(aSctx);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. Returns NULL on error.
*/
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
*/
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
}
#endif /* DEEPSPEECH_COMPAT_H */

View File

@ -82,8 +82,8 @@ namespace DeepSpeechClient
throw new ArgumentException("Invalid alphabet embedded in model. (Data corruption?)");
case ErrorCodes.DS_ERR_INVALID_SHAPE:
throw new ArgumentException("Invalid model shape.");
case ErrorCodes.DS_ERR_INVALID_LM:
throw new ArgumentException("Invalid language model file.");
case ErrorCodes.DS_ERR_INVALID_SCORER:
throw new ArgumentException("Invalid scorer file.");
case ErrorCodes.DS_ERR_FAIL_INIT_MMAP:
throw new ArgumentException("Failed to initialize memory mapped model.");
case ErrorCodes.DS_ERR_FAIL_INIT_SESS:
@ -100,6 +100,8 @@ namespace DeepSpeechClient
throw new ArgumentException("Error failed to create session.");
case ErrorCodes.DS_ERR_MODEL_INCOMPATIBLE:
throw new ArgumentException("Error incompatible model.");
case ErrorCodes.DS_ERR_SCORER_NOT_ENABLED:
throw new ArgumentException("External scorer is not enabled.");
default:
throw new ArgumentException("Unknown error, please make sure you are using the correct native binary.");
}
@ -114,45 +116,48 @@ namespace DeepSpeechClient
}
/// <summary>
/// Enable decoding using beam scoring with a KenLM language model.
/// Enable decoding using an external scorer.
/// </summary>
/// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
/// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
float aLMAlpha, float aLMBeta)
/// <param name="aScorerPath">The path to the external scorer file.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
/// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
public unsafe void EnableExternalScorer(string aScorerPath)
{
string exceptionMessage = null;
if (string.IsNullOrWhiteSpace(aLMPath))
if (string.IsNullOrWhiteSpace(aScorerPath))
{
exceptionMessage = "Path to the language model file cannot be empty.";
throw new FileNotFoundException("Path to the scorer file cannot be empty.");
}
if (!File.Exists(aLMPath))
if (!File.Exists(aScorerPath))
{
exceptionMessage = $"Cannot find the language model file: {aLMPath}";
}
if (string.IsNullOrWhiteSpace(aTriePath))
{
exceptionMessage = "Path to the trie file cannot be empty.";
}
if (!File.Exists(aTriePath))
{
exceptionMessage = $"Cannot find the trie file: {aTriePath}";
throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}");
}
if (exceptionMessage != null)
{
throw new FileNotFoundException(exceptionMessage);
}
var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath);
EvaluateResultCode(resultCode);
}
var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
aLMPath,
aTriePath,
aLMAlpha,
aLMBeta);
/// <summary>
/// Disable decoding using an external scorer.
/// </summary>
/// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
public unsafe void DisableExternalScorer()
{
var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP);
EvaluateResultCode(resultCode);
}
/// <summary>
/// Set hyperparameters alpha and beta of the external scorer.
/// </summary>
/// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
/// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta)
{
var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP,
aAlpha,
aBeta);
EvaluateResultCode(resultCode);
}

View File

@ -14,8 +14,9 @@
// Invalid parameters
DS_ERR_INVALID_ALPHABET = 0x2000,
DS_ERR_INVALID_SHAPE = 0x2001,
DS_ERR_INVALID_LM = 0x2002,
DS_ERR_INVALID_SCORER = 0x2002,
DS_ERR_MODEL_INCOMPATIBLE = 0x2003,
DS_ERR_SCORER_NOT_ENABLED = 0x2004,
// Runtime failures
DS_ERR_FAIL_INIT_MMAP = 0x3000,

View File

@ -21,18 +21,26 @@ namespace DeepSpeechClient.Interfaces
unsafe int GetModelSampleRate();
/// <summary>
/// Enable decoding using beam scoring with a KenLM language model.
/// Enable decoding using an external scorer.
/// </summary>
/// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
/// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
unsafe void EnableDecoderWithLM(string aLMPath,
string aTriePath,
float aLMAlpha,
float aLMBeta);
/// <param name="aScorerPath">The path to the external scorer file.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with an external scorer.</exception>
/// <exception cref="FileNotFoundException">Thrown when cannot find the scorer file.</exception>
unsafe void EnableExternalScorer(string aScorerPath);
/// <summary>
/// Disable decoding using an external scorer.
/// </summary>
/// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
unsafe void DisableExternalScorer();
/// <summary>
/// Set hyperparameters alpha and beta of the external scorer.
/// </summary>
/// <param name="aAlpha">The alpha hyperparameter of the decoder. Language model weight.</param>
/// <param name="aBeta">The beta hyperparameter of the decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when an external scorer is not enabled.</exception>
unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta);
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.

View File

@ -23,11 +23,16 @@ namespace DeepSpeechClient
internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
string aLMPath,
string aTriePath,
float aLMAlpha,
float aLMBeta);
internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
string aScorerPath);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
float aAlpha,
float aBeta);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]

View File

@ -35,22 +35,18 @@ namespace CSharpExamples
static void Main(string[] args)
{
string model = null;
string lm = null;
string trie = null;
string scorer = null;
string audio = null;
bool extended = false;
if (args.Length > 0)
{
model = GetArgument(args, "--model");
lm = GetArgument(args, "--lm");
trie = GetArgument(args, "--trie");
scorer = GetArgument(args, "--scorer");
audio = GetArgument(args, "--audio");
extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
}
const uint BEAM_WIDTH = 500;
const float LM_ALPHA = 0.75f;
const float LM_BETA = 1.85f;
Stopwatch stopwatch = new Stopwatch();
try
@ -64,14 +60,10 @@ namespace CSharpExamples
Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
stopwatch.Reset();
if (lm != null)
if (scorer != null)
{
Console.WriteLine("Loadin LM...");
sttClient.EnableDecoderWithLM(
lm ?? "lm.binary",
trie ?? "trie",
LM_ALPHA, LM_BETA);
Console.WriteLine("Loading scorer...");
sttClient.EnableExternalScorer(scorer ?? "kenlm.scorer");
}
string audioFile = audio ?? "arctic_a0024.wav";

View File

@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
Button _startInference;
final int BEAM_WIDTH = 50;
final float LM_ALPHA = 0.75f;
final float LM_BETA = 1.85f;
private char readLEChar(RandomAccessFile f) throws IOException {
byte b1 = f.readByte();

View File

@ -30,15 +30,11 @@ import java.nio.ByteBuffer;
public class BasicTest {
public static final String modelFile = "/data/local/tmp/test/output_graph.tflite";
public static final String lmFile = "/data/local/tmp/test/lm.binary";
public static final String trieFile = "/data/local/tmp/test/trie";
public static final String scorerFile = "/data/local/tmp/test/kenlm.scorer";
public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";
public static final int BEAM_WIDTH = 50;
public static final float LM_ALPHA = 0.75f;
public static final float LM_BETA = 1.85f;
private char readLEChar(RandomAccessFile f) throws IOException {
byte b1 = f.readByte();
byte b2 = f.readByte();
@ -130,7 +126,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_stt_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
m.enableExternalScorer(scorerFile);
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
@ -149,7 +145,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_sttWithMetadata_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
m.enableDecoderWithLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
m.enableExternalScorer(scorerFile);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);

View File

@ -47,17 +47,35 @@ public class DeepSpeechModel {
}
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
* @brief Enable decoding using an external scorer.
*
* @param lm The path to the language model binary file.
* @param trie The path to the trie file build from the same vocabulary as the language model binary.
* @param lm_alpha The alpha hyperparameter of the CTC decoder. Language Model weight.
* @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
* @param scorer The path to the external scorer file.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
public void enableExternalScorer(String scorer) {
impl.EnableExternalScorer(this._msp, scorer);
}
/**
* @brief Disable decoding using an external scorer.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
public void disableExternalScorer() {
impl.DisableExternalScorer(this._msp);
}
/**
* @brief Set hyperparameters alpha and beta of the external scorer.
*
* @param alpha The alpha hyperparameter of the decoder. Language model weight.
* @param beta The beta hyperparameter of the decoder. Word insertion weight.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
public void setScorerAlphaBeta(float alpha, float beta) {
impl.SetScorerAlphaBeta(this._msp, alpha, beta);
}
/*

View File

@ -29,12 +29,11 @@ VersionAction.prototype.call = function(parser) {
var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
parser.addArgument(['--beam_width'], {help: 'Beam width for the CTC decoder', defaultValue: 500, type: 'int'});
parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha)', defaultValue: 0.75, type: 'float'});
parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta)', defaultValue: 1.85, type: 'float'});
parser.addArgument(['--lm_alpha'], {help: 'Language model weight (lm_alpha). If not set, use default value from scorer.', type: 'float'});
parser.addArgument(['--lm_beta'], {help: 'Word insertion bonus (lm_beta). If not set, use default value from scorer.', type: 'float'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
var args = parser.parseArgs();
@ -60,12 +59,16 @@ console.error('Loaded model in %ds.', totalTime(model_load_end));
var desired_sample_rate = model.sampleRate();
if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['lm'], args['trie'], args['lm_alpha'], args['lm_beta']);
const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
if (args['scorer']) {
console.error('Loading scorer from file %s', args['scorer']);
const scorer_load_start = process.hrtime();
model.enableExternalScorer(args['scorer']);
const scorer_load_end = process.hrtime(scorer_load_start);
console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
if (args['lm_alpha'] && args['lm_beta']) {
model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
}
}
const buffer = Fs.readFileSync(args['audio']);

View File

@ -52,31 +52,46 @@ Model.prototype.sampleRate = function() {
}
/**
* Enable decoding using beam scoring with a KenLM language model.
* Enable decoding using an external scorer.
*
* @param {string} aScorerPath The path to the external scorer file.
*
* @return {number} Zero on success, non-zero on failure (invalid arguments).
*/
Model.prototype.enableExternalScorer = function(aScorerPath) {
return binding.EnableExternalScorer(this._impl, aScorerPath);
}
/**
* Disable decoding using an external scorer.
*
* @return {number} Zero on success, non-zero on failure (invalid arguments).
*/
Model.prototype.disableExternalScorer = function() {
return binding.DisableExternalScorer(this._impl);
}
/**
* Set hyperparameters alpha and beta of the external scorer.
*
* @param {string} aLMPath The path to the language model binary file.
* @param {string} aTriePath The path to the trie file build from the same vocabulary as the language model binary.
* @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
* @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
*
* @return {number} Zero on success, non-zero on failure (invalid arguments).
*/
Model.prototype.enableDecoderWithLM = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.EnableDecoderWithLM.apply(null, args);
Model.prototype.setScorerAlphaBeta = function(aLMAlpha, aLMBeta) {
return binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
}
/**
* Use the DeepSpeech model to perform Speech-To-Text.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
*
* @return {string} The STT result. Returns undefined on error.
*/
Model.prototype.stt = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToText.apply(null, args);
Model.prototype.stt = function(aBuffer) {
return binding.SpeechToText(this._impl, aBuffer);
}
/**
@ -84,25 +99,22 @@ Model.prototype.stt = function() {
* about the results.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToTextWithMetadata.apply(null, args);
Model.prototype.sttWithMetadata = function(aBuffer) {
return binding.SpeechToTextWithMetadata(this._impl, aBuffer);
}
/**
* Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
* Create a new streaming inference state. One can then call :js:func:`Stream.feedAudioContent` and :js:func:`Stream.finishStream` on the returned stream object.
*
* @return {object} an opaque object that represents the streaming state.
* @return {object} a :js:func:`Stream` object that represents the streaming state.
*
* @throws on error
*/
Model.prototype.createStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.CreateStream.apply(null, args);
const rets = binding.CreateStream(this._impl);
const status = rets[0];
const ctx = rets[1];
if (status !== 0) {
@ -111,55 +123,56 @@ Model.prototype.createStream = function() {
return ctx;
}
function Stream(nativeStream) {
this._impl = nativeStream;
}
/**
* Feed audio samples to an ongoing streaming inference.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
* @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in @param aBuffer.
*/
Model.prototype.feedAudioContent = function() {
binding.FeedAudioContent.apply(null, arguments);
Stream.prototype.feedAudioContent = function(aBuffer) {
binding.FeedAudioContent(this._impl, aBuffer);
}
/**
* Compute the intermediate decoding of an ongoing streaming inference.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
*
* @return {string} The STT intermediate result.
*/
Model.prototype.intermediateDecode = function() {
return binding.IntermediateDecode.apply(null, arguments);
Stream.prototype.intermediateDecode = function() {
return binding.IntermediateDecode(this._impl);
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
*
* @return {string} The STT result.
*
* This method will free the state (@param aSctx).
* This method will free the stream; it must not be used after this method is called.
*/
Model.prototype.finishStream = function() {
return binding.FinishStream.apply(null, arguments);
Stream.prototype.finishStream = function() {
const result = binding.FinishStream(this._impl);
this._impl = null;
return result;
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
*
* @param {object} aSctx A streaming state pointer returned by :js:func:`Model.setupStream`.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
* This method will free the state pointer (@param aSctx).
* This method will free the stream; it must not be used after this method is called.
*/
Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments);
Stream.prototype.finishStreamWithMetadata = function() {
const result = binding.FinishStreamWithMetadata(this._impl);
this._impl = null;
return result;
}
/**
* Frees associated resources and destroys model object.
*
@ -184,10 +197,10 @@ function FreeMetadata(metadata) {
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
* @param {Object} stream A stream object returned by :js:func:`Model.createStream`.
*/
function FreeStream(stream) {
return binding.FreeStream(stream);
return binding.FreeStream(stream._impl);
}
/**

View File

@ -21,7 +21,6 @@ import deepspeech
# rename for backwards compatibility
from deepspeech.impl import PrintVersions as printVersions
from deepspeech.impl import FreeStream as freeStream
class Model(object):
"""
@ -56,127 +55,159 @@ class Model(object):
"""
return deepspeech.impl.GetModelSampleRate(self._impl)
def enableDecoderWithLM(self, *args, **kwargs):
def enableExternalScorer(self, scorer_path):
"""
Enable decoding using beam scoring with a KenLM language model.
Enable decoding using an external scorer.
:param aLMPath: The path to the language model binary file.
:type aLMPath: str
:param scorer_path: The path to the external scorer file.
:type scorer_path: str
:param aTriePath: The path to the trie file build from the same vocabulary as the language model binary.
:type aTriePath: str
:param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
:type aLMAlpha: float
:param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
:type aLMBeta: float
:return: Zero on success, non-zero on failure (invalid arguments).
:return: Zero on success, non-zero on failure.
:type: int
"""
return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)
return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)
def stt(self, *args, **kwargs):
def disableExternalScorer(self):
"""
Disable decoding using an external scorer.
:return: Zero on success, non-zero on failure.
"""
return deepspeech.impl.DisableExternalScorer(self._impl)
def setScorerAlphaBeta(self, alpha, beta):
"""
Set hyperparameters alpha and beta of the external scorer.
:param alpha: The alpha hyperparameter of the decoder. Language model weight.
:type alpha: float
:param beta: The beta hyperparameter of the decoder. Word insertion weight.
:type beta: float
:return: Zero on success, non-zero on failure.
:type: int
"""
return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta)
def stt(self, audio_buffer):
"""
Use the DeepSpeech model to perform Speech-To-Text.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:return: The STT result.
:type: str
"""
return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)
return deepspeech.impl.SpeechToText(self._impl, audio_buffer)
def sttWithMetadata(self, *args, **kwargs):
def sttWithMetadata(self, audio_buffer):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)
def createStream(self):
"""
Create a new streaming inference state. The streaming state returned
by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
Create a new streaming inference state. One can then call
:func:`Stream.feedAudioContent()` and :func:`Stream.finishStream()` on the returned stream object.
:return: Object holding the stream
:return: Stream object representing the newly created stream
:type: :func:`Stream`
:throws: RuntimeError on error
"""
status, ctx = deepspeech.impl.CreateStream(self._impl)
if status != 0:
raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx
return Stream(ctx)
# pylint: disable=no-self-use
def feedAudioContent(self, *args, **kwargs):
class Stream(object):
def __init__(self, native_stream):
self._impl = native_stream
def __del__(self):
if self._impl:
self.freeStream()
def feedAudioContent(self, audio_buffer):
"""
Feed audio samples to an ongoing streaming inference.
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
:param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type audio_buffer: numpy.int16 array
:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in @p aBuffer.
:type aBufferSize: int
:throws: RuntimeError if the stream object is not valid
"""
deepspeech.impl.FeedAudioContent(*args, **kwargs)
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to feed an already finished stream?")
deepspeech.impl.FeedAudioContent(self._impl, audio_buffer)
# pylint: disable=no-self-use
def intermediateDecode(self, *args, **kwargs):
def intermediateDecode(self):
"""
Compute the intermediate decoding of an ongoing streaming inference.
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
:return: The STT intermediate result.
:type: str
"""
return deepspeech.impl.IntermediateDecode(*args, **kwargs)
# pylint: disable=no-self-use
def finishStream(self, *args, **kwargs):
:throws: RuntimeError if the stream object is not valid
"""
Signal the end of an audio signal to an ongoing streaming
inference, returns the STT result over the whole audio signal.
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
return deepspeech.impl.IntermediateDecode(self._impl)
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
def finishStream(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns the STT result over the whole audio signal.
:return: The STT result.
:type: str
"""
return deepspeech.impl.FinishStream(*args, **kwargs)
# pylint: disable=no-self-use
def finishStreamWithMetadata(self, *args, **kwargs):
:throws: RuntimeError if the stream object is not valid
"""
Signal the end of an audio signal to an ongoing streaming
inference, returns per-letter metadata.
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStream(self._impl)
self._impl = None
return result
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
def finishStreamWithMetadata(self):
"""
Signal the end of an audio signal to an ongoing streaming inference,
returns per-letter metadata.
:return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata`
:throws: RuntimeError if the stream object is not valid
"""
return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
self._impl = None
return result
def freeStream(self):
"""
Destroy a streaming state without decoding the computed logits. This can
be used if you no longer need the result of an ongoing streaming inference.
:throws: RuntimeError if the stream object is not valid
"""
if not self._impl:
raise RuntimeError("Stream object is not valid. Trying to free an already finished stream?")
deepspeech.impl.FreeStream(self._impl)
self._impl = None
# This is only for documentation purpose
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
@ -189,22 +220,18 @@ class MetadataItem(object):
"""
The character generated for transcription
"""
# pylint: disable=unnecessary-pass
pass
def timestep(self):
"""
Position of the character in units of 20ms
"""
# pylint: disable=unnecessary-pass
pass
def start_time(self):
"""
Position of the character in seconds
"""
# pylint: disable=unnecessary-pass
pass
class Metadata(object):
@ -218,8 +245,7 @@ class Metadata(object):
:return: A list of :func:`MetadataItem` elements
:type: list
"""
# pylint: disable=unnecessary-pass
pass
def num_items(self):
"""
@ -228,8 +254,7 @@ class Metadata(object):
:return: Size of the list of items
:type: int
"""
# pylint: disable=unnecessary-pass
pass
def confidence(self):
"""
@ -237,5 +262,4 @@ class Metadata(object):
sum of the acoustic model logit values for each timestep/character that
contributed to the creation of this transcription.
"""
# pylint: disable=unnecessary-pass
pass
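
The refactor above replaces the opaque streaming-state pointer with a Stream object that owns its native handle and invalidates itself once finished or freed. A usage sketch under the new API; the model path and audio buffer are stand-ins:

    import numpy as np
    from deepspeech import Model

    ds = Model('output_graph.pbmm', 500)
    stream = ds.createStream()               # now returns a Stream object
    audio = np.zeros(16000, dtype=np.int16)  # one second of silence, stand-in
    stream.feedAudioContent(audio)           # no stream-handle argument anymore
    print(stream.intermediateDecode())
    print(stream.finishStream())             # frees the native state; reusing
                                             # the stream raises RuntimeError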

View File

@ -72,7 +72,7 @@ def metadata_json_output(metadata):
json_result["words"] = words_from_metadata(metadata)
json_result["confidence"] = metadata.confidence
return json.dumps(json_result)
class VersionAction(argparse.Action):
@ -88,17 +88,15 @@ def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--scorer', required=False,
help='Path to the external scorer file')
parser.add_argument('--audio', required=True,
help='Path to the audio file to run (WAV format)')
parser.add_argument('--beam_width', type=int, default=500,
help='Beam width for the CTC decoder')
parser.add_argument('--lm_alpha', type=float, default=0.75,
parser.add_argument('--lm_alpha', type=float,
help='Language model weight (lm_alpha)')
parser.add_argument('--lm_beta', type=float, default=1.85,
parser.add_argument('--lm_beta', type=float,
help='Word insertion bonus (lm_beta)')
parser.add_argument('--version', action=VersionAction,
help='Print version and exits')
@ -116,12 +114,15 @@ def main():
desired_sample_rate = ds.sampleRate()
if args.lm and args.trie:
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
lm_load_start = timer()
ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
lm_load_end = timer() - lm_load_start
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)
if args.scorer:
print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
scorer_load_start = timer()
ds.enableExternalScorer(args.scorer)
scorer_load_end = timer() - scorer_load_start
print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
if args.lm_alpha and args.lm_beta:
ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
fin = wave.open(args.audio, 'rb')
fs = fin.getframerate()

View File

@ -14,21 +14,13 @@ from deepspeech import Model
# Beam width used in the CTC decoder when building candidate transcriptions
BEAM_WIDTH = 500
# The alpha hyperparameter of the CTC decoder. Language Model weight
LM_ALPHA = 0.75
# The beta hyperparameter of the CTC decoder. Word insertion bonus.
LM_BETA = 1.85
def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
help='Path to the language model trie file created with native_client/generate_trie')
parser.add_argument('--scorer', nargs='?',
help='Path to the external scorer file')
parser.add_argument('--audio1', required=True,
help='First audio file to use in interleaved streams')
parser.add_argument('--audio2', required=True,
@ -37,8 +29,8 @@ def main():
ds = Model(args.model, BEAM_WIDTH)
if args.lm and args.trie:
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
if args.scorer:
ds.enableExternalScorer(args.scorer)
fin = wave.open(args.audio1, 'rb')
fs1 = fin.getframerate()
@ -57,11 +49,11 @@ def main():
splits2 = np.array_split(audio2, 10)
for part1, part2 in zip(splits1, splits2):
ds.feedAudioContent(stream1, part1)
ds.feedAudioContent(stream2, part2)
stream1.feedAudioContent(part1)
stream2.feedAudioContent(part2)
print(ds.finishStream(stream1))
print(ds.finishStream(stream2))
print(stream1.finishStream())
print(stream2.finishStream())
if __name__ == '__main__':
main()

View File

@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"
BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}"

View File

@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"
BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}"

View File

@ -30,11 +30,11 @@ then:
image: ${build.docker_image}
env:
DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.15/models.tar.gz"
DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.1/models.tar.gz"
DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
PIP_DEFAULT_TIMEOUT: "60"
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master"
EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
command:
- "/bin/bash"

View File

@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"
if [ "${runtime}" = "tflite" ]; then

View File

@ -8,7 +8,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"
BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}"

View File

@ -49,7 +49,7 @@ deepspeech --version
pushd ${HOME}/DeepSpeech/ds/
python bin/import_ldc93s1.py data/smoke_test
python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --lm data/smoke_test/vocab.pruned.lm --trie data/smoke_test/vocab.trie --csv data/smoke_test/ldc93s1.csv
python evaluate_tflite.py --model "${TASKCLUSTER_TMP_DIR}/${model_name_mmap}" --scorer data/smoke_test/pruned_lm.scorer --csv data/smoke_test/ldc93s1.csv
popd
virtualenv_deactivate "${pyalias}" "${PYENV_NAME}"

View File

@ -378,7 +378,7 @@ run_netframework_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
}
@ -401,7 +401,7 @@ run_electronjs_inference_tests()
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
}
@ -427,7 +427,7 @@ run_basic_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status"
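
Every CLI change in these test helpers is the same mechanical substitution: the pair of --lm/--trie arguments collapses into one --scorer argument. A minimal before/after sketch with placeholder paths:

  # Before: KenLM binary and trie passed as two separate files.
  deepspeech --model output_graph.pbmm --lm lm.binary --trie trie --audio audio.wav

  # After: a single scorer package carries both.
  deepspeech --model output_graph.pbmm --scorer kenlm.scorer --audio audio.wav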
@ -444,7 +444,7 @@ run_all_inference_tests()
assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status"
set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status"
@ -457,7 +457,7 @@ run_all_inference_tests()
assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}"
set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -470,8 +470,7 @@ run_prod_concurrent_stream_tests()
set +e
output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \
--model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \
--lm ${TASKCLUSTER_TMP_DIR}/lm.binary \
--trie ${TASKCLUSTER_TMP_DIR}/trie \
--scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
--audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_16000.wav \
--audio2 ${TASKCLUSTER_TMP_DIR}/new-home-in-the-stars-16k.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
@ -489,19 +488,19 @@ run_prod_inference_tests()
local _bitrate=$1
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@ -509,7 +508,7 @@ run_prod_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -520,19 +519,19 @@ run_prodtflite_inference_tests()
local _bitrate=$1
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel "${phrase_pbmodel_withlm}" "$status" "${_bitrate}"
set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1_prodtflitemodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" "${_bitrate}"
@ -540,7 +539,7 @@ run_prodtflite_inference_tests()
# Run down-sampling warning test only when we actually perform downsampling
if [ "${ldc93s1_sample_filename}" != "LDC93S1_pcms16le_1_8000.wav" ]; then
set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
fi;
@ -555,7 +554,7 @@ run_multi_inference_tests()
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status"
set +e -o pipefail
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
status=$?
set -e +o pipefail
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status"
@ -564,7 +563,7 @@ run_multi_inference_tests()
run_cpp_only_inference_tests()
{
set +e
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer --audio ${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
status=$?
set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status"
@ -669,8 +668,7 @@ download_data()
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}"
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/pruned_lm.scorer ${TASKCLUSTER_TMP_DIR}/kenlm.scorer
cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
}
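
The smoke-test data likewise shrinks to a single pruned_lm.scorer artifact. For reference, a hedged sketch of producing such a package with data/lm/generate_package.py (the script itself is named in the flags hunk below; the flag names used here are assumptions, not confirmed by this diff):

  # Assumed flag names; this diff only confirms that
  # data/lm/generate_package.py produces the .scorer package.
  python data/lm/generate_package.py \
    --alphabet data/alphabet.txt \
    --lm lm.binary \
    --vocab vocab.txt \
    --package kenlm.scorer \
    --default_alpha 0.75 --default_beta 1.85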
@ -1562,7 +1560,6 @@ package_native_client()
fi;
${TAR} -cf - \
-C ${tensorflow_dir}/bazel-bin/native_client/ generate_trie${PLATFORM_EXE_SUFFIX} \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \
-C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib \
-C ${deepspeech_dir}/ LICENSE \
@ -1767,8 +1764,7 @@ android_setup_apk_data()
adb push \
${TASKCLUSTER_TMP_DIR}/${model_name} \
${TASKCLUSTER_TMP_DIR}/${ldc93s1_sample_filename} \
${TASKCLUSTER_TMP_DIR}/lm.binary \
${TASKCLUSTER_TMP_DIR}/trie \
${TASKCLUSTER_TMP_DIR}/kenlm.scorer \
${ANDROID_TMP_DIR}/test/
}

View File

@ -10,7 +10,6 @@ source ${DS_ROOT_TASK}/DeepSpeech/tf/tc-vars.sh
BAZEL_TARGETS="
//native_client:libdeepspeech.so
//native_client:generate_trie
"
if [ "${package_option}" = "--cuda" ]; then

View File

@ -44,7 +44,7 @@ payload:
MSYS: 'winsymlinks:nativestrict'
TENSORFLOW_BUILD_ARTIFACT: ${build.tensorflow}
EXAMPLES_CLONE_URL: "https://github.com/mozilla/DeepSpeech-examples"
EXAMPLES_CHECKOUT_TARGET: "master"
EXAMPLES_CHECKOUT_TARGET: "f3dee7910d1642e14b1e3877568f8342c1c22e05"
command:
- >-

View File

@ -29,7 +29,7 @@ def fail(message, code=1):
def transcribe_file(audio_path, tlog_path):
from DeepSpeech import create_model, try_loading # pylint: disable=cyclic-import,import-outside-toplevel
initialize_globals()
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet)
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
try:
num_processes = cpu_count()
except NotImplementedError:

View File

@ -143,10 +143,8 @@ def create_flags():
f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.')
f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
f.DEFINE_alias('lm', 'lm_binary_path')
f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
f.DEFINE_alias('trie', 'lm_trie_path')
f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py')
f.DEFINE_alias('scorer', 'scorer_path')
f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
f.DEFINE_float('lm_alpha', 0.75, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
f.DEFINE_float('lm_beta', 1.85, 'the beta hyperparameter of the CTC decoder. Word insertion weight.')
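
With the flag definitions above, a minimal training invocation now passes one scorer path; the --scorer alias works interchangeably with --scorer_path, and the alpha/beta values below match the defaults defined in this hunk:

  # Sketch using only flags defined in this hunk; all other training flags omitted.
  python -u DeepSpeech.py \
    --scorer_path data/lm/kenlm.scorer \
    --lm_alpha 0.75 --lm_beta 1.85 \
    --beam_width 1024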