Update examples to run latest DeepSpeech

Fixes #2351
This commit is contained in:
Alexandre Lissy 2019-09-17 12:03:08 +02:00
parent 5ef0117df0
commit b5a3e328da
10 changed files with 20 additions and 40 deletions

View File

@ -17,16 +17,6 @@ const LM_ALPHA = 0.75;
// The beta hyperparameter of the CTC decoder. Word insertion bonus. // The beta hyperparameter of the CTC decoder. Word insertion bonus.
const LM_BETA = 1.85; const LM_BETA = 1.85;
// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training
// Number of MFCC features to use
const N_FEATURES = 26;
// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9;
let VersionAction = function VersionAction(options) { let VersionAction = function VersionAction(options) {
options = options || {}; options = options || {};
options.nargs = 0; options.nargs = 0;
@ -55,15 +45,14 @@ function totalTime(hrtimeValue) {
console.error('Loading model from file %s', args['model']); console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime(); const model_load_start = process.hrtime();
let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH); let model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start); const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end)); console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['lm'] && args['trie']) { if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']); console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime(); const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'], model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
LM_ALPHA, LM_BETA);
const lm_load_end = process.hrtime(lm_load_start); const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end)); console.error('Loaded language model in %ds.', totalTime(lm_load_end));
} }
@ -106,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
]); ]);
let audioLength = 0; let audioLength = 0;
let sctx = model.setupStream(AUDIO_SAMPLE_RATE); let sctx = model.createStream(AUDIO_SAMPLE_RATE);
function finishStream() { function finishStream() {
const model_load_start = process.hrtime(); const model_load_start = process.hrtime();
@ -119,7 +108,7 @@ function finishStream() {
function intermediateDecode() { function intermediateDecode() {
finishStream(); finishStream();
sctx = model.setupStream(AUDIO_SAMPLE_RATE); sctx = model.createStream(AUDIO_SAMPLE_RATE);
} }
function feedAudioContent(chunk) { function feedAudioContent(chunk) {

View File

@ -8,7 +8,7 @@
}, },
"dependencies": { "dependencies": {
"argparse": "^1.0.10", "argparse": "^1.0.10",
"deepspeech": "^0.4.1", "deepspeech": "^0.6.0-alpha.5",
"node-vad": "^1.1.1", "node-vad": "^1.1.1",
"util": "^0.11.1" "util": "^0.11.1"
}, },

View File

@ -162,11 +162,11 @@ def main(ARGS):
print('Initializing model...') print('Initializing model...')
logging.info("ARGS.model: %s", ARGS.model) logging.info("ARGS.model: %s", ARGS.model)
logging.info("ARGS.alphabet: %s", ARGS.alphabet) logging.info("ARGS.alphabet: %s", ARGS.alphabet)
model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width) model = deepspeech.Model(ARGS.model, ARGS.alphabet, ARGS.beam_width)
if ARGS.lm and ARGS.trie: if ARGS.lm and ARGS.trie:
logging.info("ARGS.lm: %s", ARGS.lm) logging.info("ARGS.lm: %s", ARGS.lm)
logging.info("ARGS.trie: %s", ARGS.trie) logging.info("ARGS.trie: %s", ARGS.trie)
model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta) model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
# Start audio with VAD # Start audio with VAD
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness, vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
@ -179,7 +179,7 @@ def main(ARGS):
# Stream from microphone to DeepSpeech using VAD # Stream from microphone to DeepSpeech using VAD
spinner = None spinner = None
if not ARGS.nospinner: spinner = Halo(spinner='line') if not ARGS.nospinner: spinner = Halo(spinner='line')
stream_context = model.setupStream() stream_context = model.createStream()
wav_data = bytearray() wav_data = bytearray()
for frame in frames: for frame in frames:
if frame is not None: if frame is not None:
@ -195,15 +195,13 @@ def main(ARGS):
wav_data = bytearray() wav_data = bytearray()
text = model.finishStream(stream_context) text = model.finishStream(stream_context)
print("Recognized: %s" % text) print("Recognized: %s" % text)
stream_context = model.setupStream() stream_context = model.createStream()
if __name__ == '__main__': if __name__ == '__main__':
BEAM_WIDTH = 500 BEAM_WIDTH = 500
DEFAULT_SAMPLE_RATE = 16000 DEFAULT_SAMPLE_RATE = 16000
LM_ALPHA = 0.75 LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
N_FEATURES = 26
N_CONTEXT = 9
import argparse import argparse
parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD") parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
@ -229,10 +227,6 @@ if __name__ == '__main__':
help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().") help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE, parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.") help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
help=f"Number of MFCC features to use. Default: {N_FEATURES}")
parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,
help=f"Size of the context window used for producing timesteps in the input vector. Default: {N_CONTEXT}")
parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA, parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}") help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA, parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,

View File

@ -1,5 +1,6 @@
deepspeech~=0.4.1 deepspeech~=0.6.0a5
pyaudio~=0.2.11 pyaudio~=0.2.11
webrtcvad~=2.0.10 webrtcvad~=2.0.10
halo~=0.0.18 halo~=0.0.18
numpy~=1.15.1 numpy~=1.15.1
scipy~=1.1.0

View File

@ -6,19 +6,17 @@ const Duplex = require('stream').Duplex;
const Wav = require('node-wav'); const Wav = require('node-wav');
const BEAM_WIDTH = 1024; const BEAM_WIDTH = 1024;
const N_FEATURES = 26;
const N_CONTEXT = 9;
let modelPath = './models/output_graph.pbmm'; let modelPath = './models/output_graph.pbmm';
let alphabetPath = './models/alphabet.txt'; let alphabetPath = './models/alphabet.txt';
let model = new DeepSpeech.Model(modelPath, N_FEATURES, N_CONTEXT, alphabetPath, BEAM_WIDTH); let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
const LM_ALPHA = 0.75; const LM_ALPHA = 0.75;
const LM_BETA = 1.85; const LM_BETA = 1.85;
let lmPath = './models/lm.binary'; let lmPath = './models/lm.binary';
let triePath = './models/trie'; let triePath = './models/trie';
model.enableDecoderWithLM(alphabetPath, lmPath, triePath, LM_ALPHA, LM_BETA); model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
let audioFile = process.argv[2] || './audio/2830-3980-0043.wav'; let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';

View File

@ -8,7 +8,7 @@
}, },
"dependencies": { "dependencies": {
"argparse": "^1.0.10", "argparse": "^1.0.10",
"deepspeech": "^0.4.1", "deepspeech": "^0.6.0-alpha.5",
"node-wav": "0.0.2", "node-wav": "0.0.2",
"sox-stream": "^2.0.3", "sox-stream": "^2.0.3",
"util": "^0.11.1" "util": "^0.11.1"

View File

@ -72,7 +72,7 @@ def main(args):
logging.debug("************************************************************************************************************") logging.debug("************************************************************************************************************")
print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2])) print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
else: else:
sctx = model_retval[0].setupStream() sctx = model_retval[0].createStream()
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'), subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
bufsize=0) bufsize=0)

View File

@ -283,7 +283,7 @@ class App(QMainWindow):
logging.debug("Start Recording pressed") logging.debug("Start Recording pressed")
logging.debug("Preparing for transcription...") logging.debug("Preparing for transcription...")
sctx = self.model[0].setupStream() sctx = self.model[0].createStream()
subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'), subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
bufsize=0) bufsize=0)

View File

@ -1,3 +1,3 @@
deepspeech==0.4.1 deepspeech~=0.6.0a5
webrtcvad webrtcvad
pyqt5 pyqt5

View File

@ -16,19 +16,17 @@ Load the pre-trained model into the memory
Returns a list [DeepSpeech Object, Model Load Time, LM Load Time] Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
''' '''
def load_model(models, alphabet, lm, trie): def load_model(models, alphabet, lm, trie):
N_FEATURES = 26
N_CONTEXT = 9
BEAM_WIDTH = 500 BEAM_WIDTH = 500
LM_ALPHA = 0.75 LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
model_load_start = timer() model_load_start = timer()
ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH) ds = Model(models, alphabet, BEAM_WIDTH)
model_load_end = timer() - model_load_start model_load_end = timer() - model_load_start
logging.debug("Loaded model in %0.3fs." % (model_load_end)) logging.debug("Loaded model in %0.3fs." % (model_load_end))
lm_load_start = timer() lm_load_start = timer()
ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA) ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
lm_load_end = timer() - lm_load_start lm_load_end = timer() - lm_load_start
logging.debug('Loaded language model in %0.3fs.' % (lm_load_end)) logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))