diff --git a/examples/ffmpeg_vad_streaming/README.MD b/examples/ffmpeg_vad_streaming/README.MD
new file mode 100644
index 00000000..c7886662
--- /dev/null
+++ b/examples/ffmpeg_vad_streaming/README.MD
@@ -0,0 +1,29 @@
+# FFmpeg VAD Streaming
+
+Streaming inference from an arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
+
+This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which this script could then consume for near-real-time speech recognition.
+
+## Installation
+
+```bash
+npm install
+```
+
+FFmpeg must also be installed:
+
+```bash
+sudo apt-get install ffmpeg
+```
+
+## Usage
+
+Here is an example for a local audio file:
+```bash
+node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+```
+
+Here is an example for a remote RTMP stream:
+```bash
+node ./index.js --audio rtmp://<HOST>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
+```
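The RTMP scenario described in the README above needs something publishing to the server. To try the stream path without a phone, a small Node.js script can push a local file to nginx-rtmp through FFmpeg. This is only a sketch: the localhost URL, the `live/teststream` key and the `test.wav` path are assumptions, not part of the example, and it requires an FFmpeg build with RTMP support.

```javascript
// Sketch: publish a local audio file to an nginx-rtmp server so that
// `node ./index.js --audio rtmp://localhost:1935/live/teststream ...`
// has something to consume. URL, stream key and input path are placeholders.
const { spawn } = require('child_process');

const publisher = spawn('ffmpeg', [
  '-re',                    // read the input at its native rate (simulates a live feed)
  '-i', 'test.wav',         // any local audio file FFmpeg can read
  '-c:a', 'aac',            // FLV-compatible audio for nginx-rtmp
  '-f', 'flv',
  'rtmp://localhost:1935/live/teststream'
]);

publisher.stderr.pipe(process.stderr);
publisher.on('close', code => console.log('ffmpeg publisher exited with code', code));
```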
diff --git a/examples/ffmpeg_vad_streaming/index.js b/examples/ffmpeg_vad_streaming/index.js
new file mode 100644
index 00000000..37f6d871
--- /dev/null
+++ b/examples/ffmpeg_vad_streaming/index.js
@@ -0,0 +1,118 @@
+#!/usr/bin/env node
+
+const VAD = require("node-vad");
+const Ds = require('deepspeech');
+const argparse = require('argparse');
+const util = require('util');
+
+// These constants control the beam search decoder
+
+// Beam width used in the CTC decoder when building candidate transcriptions
+const BEAM_WIDTH = 1024;
+
+// The alpha hyperparameter of the CTC decoder. Language Model weight
+const LM_WEIGHT = 1.50;
+
+// Valid word insertion weight. This is used to lessen the word insertion penalty
+// when the inserted word is part of the vocabulary
+const VALID_WORD_COUNT_WEIGHT = 2.25;
+
+// These constants are tied to the shape of the graph used (changing them changes
+// the geometry of the first layer), so make sure you use the same constants that
+// were used during training
+
+// Number of MFCC features to use
+const N_FEATURES = 26;
+
+// Size of the context window used for producing timesteps in the input vector
+const N_CONTEXT = 9;
+
+let VersionAction = function VersionAction(options) {
+  options = options || {};
+  options.nargs = 0;
+  argparse.Action.call(this, options);
+};
+
+util.inherits(VersionAction, argparse.Action);
+
+VersionAction.prototype.call = function(parser) {
+  Ds.printVersions();
+  process.exit(0);
+};
+
+let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
+parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
+parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
+parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
+parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
+parser.addArgument(['--audio'], {required: true, help: 'Path or URL of the audio source to run (anything FFmpeg can read)'});
+parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
+let args = parser.parseArgs();
+
+function totalTime(hrtimeValue) {
+  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
+}
+
+console.error('Loading model from file %s', args['model']);
+const model_load_start = process.hrtime();
+let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
+const model_load_end = process.hrtime(model_load_start);
+console.error('Loaded model in %ds.', totalTime(model_load_end));
+
+if (args['lm'] && args['trie']) {
+  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
+  const lm_load_start = process.hrtime();
+  model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
+                            LM_WEIGHT, VALID_WORD_COUNT_WEIGHT);
+  const lm_load_end = process.hrtime(lm_load_start);
+  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
+}
+
+const vad = new VAD(VAD.Mode.NORMAL);
+const voice = {START: true, STOP: false};
+let sctx = model.setupStream(150, 16000);
+let state = voice.STOP;
+
+function finishStream() {
+  const inference_start = process.hrtime();
+  console.error('Running inference.');
+  console.log('Transcription: ', model.finishStream(sctx));
+  const inference_end = process.hrtime(inference_start);
+  console.error('Inference took %ds.', totalTime(inference_end));
+}
+
+let ffmpeg = require('child_process').spawn('ffmpeg', [
+  '-hide_banner',
+  '-nostats',
+  '-loglevel', 'fatal',
+  '-i', args['audio'],
+  '-af', 'highpass=f=200,lowpass=f=3000',
+  '-vn',
+  '-acodec', 'pcm_s16le',
+  '-ac', 1,
+  '-ar', 16000,
+  '-f', 's16le',
+  'pipe:'
+]);
+
+ffmpeg.stdout.on('data', chunk => {
+  vad.processAudio(chunk, 16000).then(res => {
+    switch (res) {
+      case VAD.Event.SILENCE:
+        if (state === voice.START) {
+          state = voice.STOP;
+          finishStream();
+          sctx = model.setupStream(150, 16000);
+        }
+        break;
+      case VAD.Event.VOICE:
+        state = voice.START;
+        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
+        break;
+    }
+  });
+});
+
+ffmpeg.stdout.on('close', () => {
+  finishStream();
+});
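One caveat with the handler in index.js above: `finishStream()` runs on the very first SILENCE event after speech, so short pauses inside a sentence will split the transcript. A possible refinement, using only the DeepSpeech and node-vad calls already shown in index.js, is to require several consecutive silent chunks before finalizing. The sketch below assumes the `vad`, `model`, `sctx` and `finishStream` objects from index.js; `SILENCE_THRESHOLD` and the counter are assumptions, not part of the example.

```javascript
// Sketch only: debounce VAD silence so brief pauses do not end the utterance.
// Assumes the same `vad`, `model`, `sctx` and `finishStream` defined in index.js.
const SILENCE_THRESHOLD = 10;   // consecutive silent chunks before finalizing (assumption)
let silentChunks = 0;
let speaking = false;

function handleChunk(chunk) {
  return vad.processAudio(chunk, 16000).then(res => {
    switch (res) {
      case VAD.Event.VOICE:
        speaking = true;
        silentChunks = 0;
        // Fed the same way index.js feeds voiced chunks.
        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
        break;
      case VAD.Event.SILENCE:
        if (speaking && ++silentChunks >= SILENCE_THRESHOLD) {
          speaking = false;
          silentChunks = 0;
          finishStream();                        // print the transcription so far
          sctx = model.setupStream(150, 16000);  // start a fresh stream
        }
        break;
    }
  });
}

// Wired up the same way index.js handles FFmpeg output:
// ffmpeg.stdout.on('data', handleChunk);
```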
diff --git a/examples/ffmpeg_vad_streaming/package.json b/examples/ffmpeg_vad_streaming/package.json
new file mode 100644
index 00000000..09b8bcce
--- /dev/null
+++ b/examples/ffmpeg_vad_streaming/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "ffmpeg-vad-streaming",
+  "version": "1.0.0",
+  "description": "Streaming inference from arbitrary source with VAD and FFmpeg",
+  "main": "index.js",
+  "scripts": {
+    "start": "node ./index.js"
+  },
+  "dependencies": {
+    "argparse": "^1.0.10",
+    "deepspeech": "^0.3.0",
+    "node-vad": "^1.1.1",
+    "util": "^0.11.1"
+  },
+  "license": "MIT"
+}
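Since package.json pins `deepspeech@^0.3.0` and `node-vad@^1.1.1`, both of which ship native bindings, a tiny smoke test can confirm they load before attempting a full stream. This is a sketch, not part of the example; `Ds.printVersions()` is the same call index.js uses for `--version`.

```javascript
// Minimal smoke test (sketch): verifies the deepspeech and node-vad
// bindings load from this package's dependencies. Run after `npm install`.
const Ds = require('deepspeech');
const VAD = require('node-vad');

// Same call index.js uses for --version; prints the DeepSpeech versions.
Ds.printVersions();

// Constructing a VAD instance confirms the node-vad binding loads.
const vad = new VAD(VAD.Mode.NORMAL);
console.log('node-vad loaded, mode =', VAD.Mode.NORMAL);
```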