Merge pull request #1761 from igorfritzsch/ffmpeg-VAD-inference-example
Add example for Node.js streaming from an arbitrary source with VAD and FFmpeg
commit 6177da9bc6
README.md
@@ -0,0 +1,29 @@

# FFmpeg VAD Streaming

Streaming inference from an arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
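
The core of the pipeline, condensed from index.js below (assuming `model`, `vad`, `sctx`, and the `ffmpeg` child process are already set up as in the full program): FFmpeg decodes the input to raw 16 kHz PCM chunks on stdout, node-vad classifies each chunk, and voiced audio is fed into a DeepSpeech stream that is finalized once silence returns. The full program additionally tracks a voice/silence state so that silence only finalizes a stream once.

```javascript
// Condensed sketch; see index.js below for the complete, runnable program
ffmpeg.stdout.on('data', chunk => {
  vad.processAudio(chunk, 16000).then(res => {
    if (res === VAD.Event.VOICE) {
      // Voiced audio: stream it into the decoder
      model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
    } else if (res === VAD.Event.SILENCE) {
      // Utterance boundary: emit the transcription, start a fresh stream
      console.log('Transcription:', model.finishStream(sctx));
      sctx = model.setupStream(150, 16000);
    }
  });
});
```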

This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which this script could then consume for near-real-time speech recognition.
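
For reference, a test feed can be pushed to such a server with FFmpeg itself. This is a minimal, hypothetical command; the file name, address, and stream key are placeholders matching the usage example below:

```bash
# Stream a local WAV file in real time as an AAC/FLV live feed
ffmpeg -re -i test.wav -c:a aac -f flv rtmp://<IP>:1935/live/teststream
```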

## Installation

```bash
npm install
```

FFmpeg must also be installed; on Debian/Ubuntu, for example:

```bash
sudo apt-get install ffmpeg
```

## Usage

Here is an example for a local audio file:

```bash
node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```

Here is an example for a remote RTMP stream:

```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```
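
Since the `--audio` argument is passed straight to FFmpeg's `-i` option, any source FFmpeg can read should also work; for instance (an untested assumption) an HTTP audio stream:

```bash
node ./index.js --audio http://<HOST>/stream.mp3 --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```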

index.js
@@ -0,0 +1,118 @@
#!/usr/bin/env node

const VAD = require("node-vad");
const Ds = require('deepspeech');
const argparse = require('argparse');
const util = require('util');

// These constants control the beam search decoder

// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 1024;

// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_WEIGHT = 1.50;

// Valid word insertion weight. This is used to lessen the word insertion penalty
// when the inserted word is part of the vocabulary
const VALID_WORD_COUNT_WEIGHT = 2.25;

// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training

// Number of MFCC features to use
const N_FEATURES = 26;

// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9;

let VersionAction = function VersionAction(options) {
  options = options || {};
  options.nargs = 0;
  argparse.Action.call(this, options);
};
util.inherits(VersionAction, argparse.Action);

VersionAction.prototype.call = function(parser) {
  Ds.printVersions();
  process.exit(0);
};

let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path or URL of the audio source (anything FFmpeg accepts as input)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
let args = parser.parseArgs();

function totalTime(hrtimeValue) {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));

if (args['lm'] && args['trie']) {
  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
  const lm_load_start = process.hrtime();
  model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
                            LM_WEIGHT, VALID_WORD_COUNT_WEIGHT);
  const lm_load_end = process.hrtime(lm_load_start);
  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
}

const vad = new VAD(VAD.Mode.NORMAL);
const voice = {START: true, STOP: false};
// Streaming inference state: 150 pre-allocated frames, 16 kHz input
let sctx = model.setupStream(150, 16000);
let state = voice.STOP;

function finishStream() {
  const inference_start = process.hrtime();
  console.error('Running inference.');
  console.log('Transcription: ', model.finishStream(sctx));
  const inference_end = process.hrtime(inference_start);
  console.error('Inference took %ds.', totalTime(inference_end));
}

// Decode the source to raw mono 16 kHz signed 16-bit PCM on stdout,
// band-passed to the voice frequency range
let ffmpeg = require('child_process').spawn('ffmpeg', [
  '-hide_banner',
  '-nostats',
  '-loglevel', 'fatal',
  '-i', args['audio'],
  '-af', 'highpass=f=200,lowpass=f=3000',
  '-vn',
  '-acodec', 'pcm_s16le',
  '-ac', '1',
  '-ar', '16000',
  '-f', 's16le',
  'pipe:'
]);

ffmpeg.stdout.on('data', chunk => {
  vad.processAudio(chunk, 16000).then(res => {
    switch (res) {
      case VAD.Event.SILENCE:
        if (state === voice.START) {
          // End of an utterance: transcribe it and start a fresh stream
          state = voice.STOP;
          finishStream();
          sctx = model.setupStream(150, 16000);
        }
        break;
      case VAD.Event.VOICE:
        state = voice.START;
        // Only the first half of the chunk is fed on; the bindings appear to
        // expect the length in 16-bit samples rather than bytes
        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
        break;
    }
  });
});

// A stream 'close' event carries no exit code; transcribe whatever is left
ffmpeg.stdout.on('close', () => {
  finishStream();
});

package.json
@@ -0,0 +1,16 @@
{
  "name": "ffmpeg-vad-streaming",
  "version": "1.0.0",
  "description": "Streaming inference from arbitrary source with VAD and FFmpeg",
  "main": "index.js",
  "scripts": {
    "start": "node ./index.js"
  },
  "dependencies": {
    "argparse": "^1.0.10",
    "deepspeech": "^0.3.0",
    "node-vad": "^1.1.1",
    "util": "^0.11.1"
  },
  "license": "MIT"
}