Merge pull request #1761 from igorfritzsch/ffmpeg-VAD-inference-example
Add example for Node.js streaming from an arbitrary source with VAD and FFmpeg
commit 6177da9bc6
README.md
@@ -0,0 +1,29 @@

# FFmpeg VAD Streaming

Streaming inference from an arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js.
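
The core of the pipeline, condensed from index.js below (assuming `model`, `vad`, `sctx`, and the `ffmpeg` child process are already set up as in the full program): FFmpeg decodes the input to raw 16 kHz PCM chunks on stdout, node-vad classifies each chunk, and voiced audio is fed into a DeepSpeech stream that is finalized once silence returns. The full program additionally tracks a voice/silence state so that silence only finalizes a stream once.

```javascript
// Condensed sketch; see index.js below for the complete, runnable program
ffmpeg.stdout.on('data', chunk => {
  vad.processAudio(chunk, 16000).then(res => {
    if (res === VAD.Event.VOICE) {
      // Voiced audio: stream it into the decoder
      model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
    } else if (res === VAD.Event.SILENCE) {
      // Utterance boundary: emit the transcription, start a fresh stream
      console.log('Transcription:', model.finishStream(sctx));
      sctx = model.setupStream(150, 16000);
    }
  });
});
```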

This example was successfully tested with a mobile phone streaming a live feed to an RTMP server (nginx-rtmp), which this script could then consume for near-real-time speech recognition.
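
For reference, a test feed can be pushed to such a server with FFmpeg itself. This is a minimal, hypothetical command; the file name, address, and stream key are placeholders matching the usage example below:

```bash
# Stream a local WAV file in real time as an AAC/FLV live feed
ffmpeg -re -i test.wav -c:a aac -f flv rtmp://<IP>:1935/live/teststream
```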

## Installation

```bash
npm install
```

FFmpeg must also be installed; on Debian/Ubuntu, for example:

```bash
sudo apt-get install ffmpeg
```

## Usage

Here is an example for a local audio file:

```bash
node ./index.js --audio <AUDIO_FILE> --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```

Here is an example for a remote RTMP stream:

```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```
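
Since the `--audio` argument is passed straight to FFmpeg's `-i` option, any source FFmpeg can read should also work; for instance (an untested assumption) an HTTP audio stream:

```bash
node ./index.js --audio http://<HOST>/stream.mp3 --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt
```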

index.js
@@ -0,0 +1,118 @@
#!/usr/bin/env node

const VAD = require("node-vad");
const Ds = require('deepspeech');
const argparse = require('argparse');
const util = require('util');

// These constants control the beam search decoder

// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 1024;

// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_WEIGHT = 1.50;

// Valid word insertion weight. This is used to lessen the word insertion penalty
// when the inserted word is part of the vocabulary
const VALID_WORD_COUNT_WEIGHT = 2.25;

// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training

// Number of MFCC features to use
const N_FEATURES = 26;

// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9;

let VersionAction = function VersionAction(options) {
  options = options || {};
  options.nargs = 0;
  argparse.Action.call(this, options);
};
util.inherits(VersionAction, argparse.Action);

VersionAction.prototype.call = function(parser) {
  Ds.printVersions();
  process.exit(0);
};

let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path or URL of the audio source (anything FFmpeg accepts as input)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exit'});
let args = parser.parseArgs();

function totalTime(hrtimeValue) {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));

if (args['lm'] && args['trie']) {
  console.error('Loading language model from files %s %s', args['lm'], args['trie']);
  const lm_load_start = process.hrtime();
  model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
                            LM_WEIGHT, VALID_WORD_COUNT_WEIGHT);
  const lm_load_end = process.hrtime(lm_load_start);
  console.error('Loaded language model in %ds.', totalTime(lm_load_end));
}

const vad = new VAD(VAD.Mode.NORMAL);
const voice = {START: true, STOP: false};
// Streaming inference state: 150 pre-allocated frames, 16 kHz input
let sctx = model.setupStream(150, 16000);
let state = voice.STOP;

function finishStream() {
  const inference_start = process.hrtime();
  console.error('Running inference.');
  console.log('Transcription: ', model.finishStream(sctx));
  const inference_end = process.hrtime(inference_start);
  console.error('Inference took %ds.', totalTime(inference_end));
}

// Decode the source to raw mono 16 kHz signed 16-bit PCM on stdout,
// band-passed to the voice frequency range
let ffmpeg = require('child_process').spawn('ffmpeg', [
  '-hide_banner',
  '-nostats',
  '-loglevel', 'fatal',
  '-i', args['audio'],
  '-af', 'highpass=f=200,lowpass=f=3000',
  '-vn',
  '-acodec', 'pcm_s16le',
  '-ac', '1',
  '-ar', '16000',
  '-f', 's16le',
  'pipe:'
]);

ffmpeg.stdout.on('data', chunk => {
  vad.processAudio(chunk, 16000).then(res => {
    switch (res) {
      case VAD.Event.SILENCE:
        if (state === voice.START) {
          // End of an utterance: transcribe it and start a fresh stream
          state = voice.STOP;
          finishStream();
          sctx = model.setupStream(150, 16000);
        }
        break;
      case VAD.Event.VOICE:
        state = voice.START;
        // Only the first half of the chunk is fed on; the bindings appear to
        // expect the length in 16-bit samples rather than bytes
        model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2));
        break;
    }
  });
});

// A stream 'close' event carries no exit code; transcribe whatever is left
ffmpeg.stdout.on('close', () => {
  finishStream();
});

package.json
@@ -0,0 +1,16 @@
{
  "name": "ffmpeg-vad-streaming",
  "version": "1.0.0",
  "description": "Streaming inference from arbitrary source with VAD and FFmpeg",
  "main": "index.js",
  "scripts": {
    "start": "node ./index.js"
  },
  "dependencies": {
    "argparse": "^1.0.10",
    "deepspeech": "^0.3.0",
    "node-vad": "^1.1.1",
    "util": "^0.11.1"
  },
  "license": "MIT"
}