From b5a3e328dae95207f3341a126eb006dfbf37f7a4 Mon Sep 17 00:00:00 2001
From: Alexandre Lissy
Date: Tue, 17 Sep 2019 12:03:08 +0200
Subject: [PATCH] Update examples to run latest DeepSpeech

Fixes #2351
---
 examples/ffmpeg_vad_streaming/index.js      | 19 ++++---------------
 examples/ffmpeg_vad_streaming/package.json  |  2 +-
 .../mic_vad_streaming/mic_vad_streaming.py  | 14 ++++----------
 examples/mic_vad_streaming/requirements.txt |  3 ++-
 examples/nodejs_wav/index.js                |  8 +++-----
 examples/nodejs_wav/package.json            |  2 +-
 .../vad_transcriber/audioTranscript_cmd.py  |  2 +-
 .../vad_transcriber/audioTranscript_gui.py  |  2 +-
 examples/vad_transcriber/requirements.txt   |  2 +-
 examples/vad_transcriber/wavTranscriber.py  |  6 ++----
 10 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/examples/ffmpeg_vad_streaming/index.js b/examples/ffmpeg_vad_streaming/index.js
index a2b61ea3..8aef749b 100644
--- a/examples/ffmpeg_vad_streaming/index.js
+++ b/examples/ffmpeg_vad_streaming/index.js
@@ -17,16 +17,6 @@ const LM_ALPHA = 0.75;
 // The beta hyperparameter of the CTC decoder. Word insertion bonus.
 const LM_BETA = 1.85;
 
-// These constants are tied to the shape of the graph used (changing them changes
-// the geometry of the first layer), so make sure you use the same constants that
-// were used during training
-
-// Number of MFCC features to use
-const N_FEATURES = 26;
-
-// Size of the context window used for producing timesteps in the input vector
-const N_CONTEXT = 9;
-
 let VersionAction = function VersionAction(options) {
   options = options || {};
   options.nargs = 0;
@@ -55,15 +45,14 @@ function totalTime(hrtimeValue) {
 console.error('Loading model from file %s', args['model']);
 const model_load_start = process.hrtime();
-let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
+let model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
 const model_load_end = process.hrtime(model_load_start);
 console.error('Loaded model in %ds.', totalTime(model_load_end));
 
 if (args['lm'] && args['trie']) {
   console.error('Loading language model from files %s %s', args['lm'], args['trie']);
   const lm_load_start = process.hrtime();
-  model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
-                            LM_ALPHA, LM_BETA);
+  model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
   const lm_load_end = process.hrtime(lm_load_start);
   console.error('Loaded language model in %ds.', totalTime(lm_load_end));
 }
 
@@ -106,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
 ]);
 
 let audioLength = 0;
-let sctx = model.setupStream(AUDIO_SAMPLE_RATE);
+let sctx = model.createStream(AUDIO_SAMPLE_RATE);
 
 function finishStream() {
   const model_load_start = process.hrtime();
@@ -119,7 +108,7 @@ function finishStream() {
 
 function intermediateDecode() {
   finishStream();
-  sctx = model.setupStream(AUDIO_SAMPLE_RATE);
+  sctx = model.createStream(AUDIO_SAMPLE_RATE);
 }
 
 function feedAudioContent(chunk) {
diff --git a/examples/ffmpeg_vad_streaming/package.json b/examples/ffmpeg_vad_streaming/package.json
index 9b1e3c23..d343b3b0 100644
--- a/examples/ffmpeg_vad_streaming/package.json
+++ b/examples/ffmpeg_vad_streaming/package.json
@@ -8,7 +8,7 @@
   },
   "dependencies": {
     "argparse": "^1.0.10",
-    "deepspeech": "^0.4.1",
+    "deepspeech": "^0.6.0-alpha.5",
     "node-vad": "^1.1.1",
     "util": "^0.11.1"
   },
diff --git a/examples/mic_vad_streaming/mic_vad_streaming.py b/examples/mic_vad_streaming/mic_vad_streaming.py
index 53a869e3..9e799276 100755
--- a/examples/mic_vad_streaming/mic_vad_streaming.py
+++ b/examples/mic_vad_streaming/mic_vad_streaming.py
@@ -162,11 +162,11 @@ def main(ARGS):
     print('Initializing model...')
     logging.info("ARGS.model: %s", ARGS.model)
     logging.info("ARGS.alphabet: %s", ARGS.alphabet)
-    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width)
+    model = deepspeech.Model(ARGS.model, ARGS.alphabet, ARGS.beam_width)
     if ARGS.lm and ARGS.trie:
         logging.info("ARGS.lm: %s", ARGS.lm)
         logging.info("ARGS.trie: %s", ARGS.trie)
-        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
+        model.enableDecoderWithLM(ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
 
     # Start audio with VAD
     vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
@@ -179,7 +179,7 @@ def main(ARGS):
     # Stream from microphone to DeepSpeech using VAD
     spinner = None
     if not ARGS.nospinner: spinner = Halo(spinner='line')
-    stream_context = model.setupStream()
+    stream_context = model.createStream()
     wav_data = bytearray()
     for frame in frames:
         if frame is not None:
@@ -195,15 +195,13 @@ def main(ARGS):
                 wav_data = bytearray()
             text = model.finishStream(stream_context)
             print("Recognized: %s" % text)
-            stream_context = model.setupStream()
+            stream_context = model.createStream()
 
 if __name__ == '__main__':
     BEAM_WIDTH = 500
     DEFAULT_SAMPLE_RATE = 16000
     LM_ALPHA = 0.75
     LM_BETA = 1.85
-    N_FEATURES = 26
-    N_CONTEXT = 9
 
     import argparse
     parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
@@ -229,10 +227,6 @@ if __name__ == '__main__':
                         help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
     parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
                         help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
-    parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
-                        help=f"Number of MFCC features to use. Default: {N_FEATURES}")
-    parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,
-                        help=f"Size of the context window used for producing timesteps in the input vector. Default: {N_CONTEXT}")
     parser.add_argument('-la', '--lm_alpha', type=float, default=LM_ALPHA,
                         help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_ALPHA}")
     parser.add_argument('-lb', '--lm_beta', type=float, default=LM_BETA,
diff --git a/examples/mic_vad_streaming/requirements.txt b/examples/mic_vad_streaming/requirements.txt
index 5c904be4..397d1eaf 100644
--- a/examples/mic_vad_streaming/requirements.txt
+++ b/examples/mic_vad_streaming/requirements.txt
@@ -1,5 +1,6 @@
-deepspeech~=0.4.1
+deepspeech~=0.6.0a5
 pyaudio~=0.2.11
 webrtcvad~=2.0.10
 halo~=0.0.18
 numpy~=1.15.1
+scipy~=1.1.0
diff --git a/examples/nodejs_wav/index.js b/examples/nodejs_wav/index.js
index 0b56b35c..a5432217 100644
--- a/examples/nodejs_wav/index.js
+++ b/examples/nodejs_wav/index.js
@@ -6,19 +6,17 @@ const Duplex = require('stream').Duplex;
 const Wav = require('node-wav');
 
 const BEAM_WIDTH = 1024;
-const N_FEATURES = 26;
-const N_CONTEXT = 9;
 let modelPath = './models/output_graph.pbmm';
 let alphabetPath = './models/alphabet.txt';
 
-let model = new DeepSpeech.Model(modelPath, N_FEATURES, N_CONTEXT, alphabetPath, BEAM_WIDTH);
+let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
 
 const LM_ALPHA = 0.75;
 const LM_BETA = 1.85;
 let lmPath = './models/lm.binary';
 let triePath = './models/trie';
 
-model.enableDecoderWithLM(alphabetPath, lmPath, triePath, LM_ALPHA, LM_BETA);
+model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);
 
 let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';
 
@@ -69,4 +67,4 @@ audioStream.on('finish', () => {
   let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
 
   console.log('result:', result);
-});
\ No newline at end of file
+});
diff --git a/examples/nodejs_wav/package.json b/examples/nodejs_wav/package.json
index bbf0824f..0bfef7a5 100644
--- a/examples/nodejs_wav/package.json
+++ b/examples/nodejs_wav/package.json
@@ -8,7 +8,7 @@
   },
   "dependencies": {
     "argparse": "^1.0.10",
-    "deepspeech": "^0.4.1",
+    "deepspeech": "^0.6.0-alpha.5",
     "node-wav": "0.0.2",
     "sox-stream": "^2.0.3",
     "util": "^0.11.1"
diff --git a/examples/vad_transcriber/audioTranscript_cmd.py b/examples/vad_transcriber/audioTranscript_cmd.py
index 552c58ad..78a21c15 100644
--- a/examples/vad_transcriber/audioTranscript_cmd.py
+++ b/examples/vad_transcriber/audioTranscript_cmd.py
@@ -72,7 +72,7 @@ def main(args):
             logging.debug("************************************************************************************************************")
             print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
     else:
-        sctx = model_retval[0].setupStream()
+        sctx = model_retval[0].createStream()
         subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                    stdout=subprocess.PIPE,
                                    bufsize=0)
diff --git a/examples/vad_transcriber/audioTranscript_gui.py b/examples/vad_transcriber/audioTranscript_gui.py
index 38614599..30fbb0ad 100644
--- a/examples/vad_transcriber/audioTranscript_gui.py
+++ b/examples/vad_transcriber/audioTranscript_gui.py
@@ -283,7 +283,7 @@ class App(QMainWindow):
         logging.debug("Start Recording pressed")
         logging.debug("Preparing for transcription...")
 
-        sctx = self.model[0].setupStream()
+        sctx = self.model[0].createStream()
         subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                    stdout=subprocess.PIPE,
                                    bufsize=0)
diff --git a/examples/vad_transcriber/requirements.txt b/examples/vad_transcriber/requirements.txt
index ef2b6743..62b32331 100644
--- a/examples/vad_transcriber/requirements.txt
+++ b/examples/vad_transcriber/requirements.txt
@@ -1,3 +1,3 @@
-deepspeech==0.4.1
+deepspeech~=0.6.0a5
 webrtcvad
 pyqt5
diff --git a/examples/vad_transcriber/wavTranscriber.py b/examples/vad_transcriber/wavTranscriber.py
index 2735879f..9f21f362 100644
--- a/examples/vad_transcriber/wavTranscriber.py
+++ b/examples/vad_transcriber/wavTranscriber.py
@@ -16,19 +16,17 @@ Load the pre-trained model into the memory
 Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
 '''
 def load_model(models, alphabet, lm, trie):
-    N_FEATURES = 26
-    N_CONTEXT = 9
     BEAM_WIDTH = 500
     LM_ALPHA = 0.75
     LM_BETA = 1.85
 
     model_load_start = timer()
-    ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
+    ds = Model(models, alphabet, BEAM_WIDTH)
     model_load_end = timer() - model_load_start
     logging.debug("Loaded model in %0.3fs." % (model_load_end))
 
     lm_load_start = timer()
-    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
+    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
     lm_load_end = timer() - lm_load_start
     logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))
 
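
For reference, the 0.6.0-alpha.5 call sequence these examples now share, as a minimal Python sketch assembled from the calls touched by this patch. The model/LM file names and the WAV path are placeholders borrowed from the nodejs_wav example, and the trailing sample-rate argument to stt() mirrors the model.stt(..., 16000) call kept in that example; treat it as a sketch, not part of the patch.

#!/usr/bin/env python3
# Minimal sketch of the updated DeepSpeech 0.6.0-alpha.5 API (not part of the patch).
# Paths are placeholders; point them at your downloaded model package and audio.
import wave

import numpy as np
import deepspeech

BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85

# Model() no longer takes N_FEATURES/N_CONTEXT, and enableDecoderWithLM()
# no longer takes the alphabet path.
model = deepspeech.Model('models/output_graph.pbmm', 'models/alphabet.txt', BEAM_WIDTH)
model.enableDecoderWithLM('models/lm.binary', 'models/trie', LM_ALPHA, LM_BETA)

# One-shot decoding of a 16 kHz, 16-bit mono WAV file.
with wave.open('audio/2830-3980-0043.wav', 'rb') as wav:
    audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    rate = wav.getframerate()
print(model.stt(audio, rate))

# Streaming: setupStream() is now createStream(); feed chunks, then finish.
sctx = model.createStream()
model.feedAudioContent(sctx, audio)
print(model.finishStream(sctx))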