#!/usr/bin/env node
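//
// Coqui STT Node.js client: load an exported model, optionally attach an
// external scorer and hot-words, convert a WAV file to the model's sample
// rate with SoX, and print the transcription. Illustrative invocation
// (paths are placeholders):
//
//   node client.js --model model.pbmm --audio audio.wav
//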

// This is required for process.versions.electron below
/// <reference types="electron" />

import * as Ds from "./index";
import * as Fs from "fs";
import Sox from "sox-stream";
import * as argparse from "argparse";

const MemoryStream = require("memory-stream");
const Wav = require("node-wav");
const Duplex = require("stream").Duplex;

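// Custom argparse action used by --version: print the Coqui STT library
// version and the runtime we are executing under, then exit.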
class VersionAction extends argparse.Action {
  call(parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: string | string[], optionString: string | null) {
    console.log('Coqui STT ' + Ds.Version());
    let runtime = 'Node';
    if (process.versions.electron) {
      runtime = 'Electron';
    }
    console.error('Runtime: ' + runtime);
    process.exit(0);
  }
}

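// Command-line interface. --model and --audio are required; the remaining
// flags tune decoding or select the streaming code path.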
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running Coqui STT inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
// Decoder tuning flags, read further down; when omitted, the defaults
// baked into the model and scorer packages are used.
parser.addArgument(['--beam_width'], {type: 'int', help: 'Beam width for the CTC decoder'});
parser.addArgument(['--lm_alpha'], {type: 'float', help: 'Language model weight (lm_alpha). If not specified, use default from the scorer package.'});
parser.addArgument(['--lm_beta'], {type: 'float', help: 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'});
parser.addArgument(['--version'], {action: VersionAction, nargs: 0, help: 'Print version and exit'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
parser.addArgument(['--stream'], {action: 'storeTrue', help: 'Use streaming code path (for tests)'});
parser.addArgument(['--hot_words'], {help: 'Hot-words and their boosts. Word:Boost pairs are comma-separated'});
let args = parser.parseArgs();

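// Convert a process.hrtime() [seconds, nanoseconds] pair to fractional
// seconds with four significant digits, for the timing logs below.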
function totalTime(hrtimeValue: number[]): string {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

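// Concatenate the per-token text of a candidate transcript into a plain
// string; each token also carries timing metadata that this client ignores.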
function candidateTranscriptToString(transcript: Ds.CandidateTranscript): string {
  let retval = "";
  for (let i = 0; i < transcript.tokens.length; ++i) {
    retval += transcript.tokens[i].text;
  }
  return retval;
}

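// Model loading is the expensive one-time step, so it is timed separately
// from inference. The sphinx-doc markers delimit the snippet that the
// documentation embeds.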
// sphinx-doc: js_ref_model_start
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model']);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));

if (args['beam_width']) {
  model.setBeamWidth(args['beam_width']);
}
// sphinx-doc: js_ref_model_stop

let desired_sample_rate = model.sampleRate();

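// An external scorer (language model) is optional; when present it
// typically improves accuracy. The alpha/beta weights only apply on top
// of a loaded scorer.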
if (args['scorer']) {
  console.error('Loading scorer from file %s', args['scorer']);
  const scorer_load_start = process.hrtime();
  model.enableExternalScorer(args['scorer']);
  const scorer_load_end = process.hrtime(scorer_load_start);
  console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));

  if (args['lm_alpha'] && args['lm_beta']) {
    model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
  }
}

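// Hot-words bias the decoder towards (or away from) specific words.
// Expected format: "word:boost[,word:boost,...]", e.g.
// --hot_words "friend:1.5,enemy:-1.0".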
if (args['hot_words']) {
  console.error('Adding hot-words %s', args['hot_words']);
  for (let word_boost of args['hot_words'].split(',')) {
    let word = word_boost.split(':');
    model.addHotWord(word[0], parseFloat(word[1]));
  }
}

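// Read the whole WAV file and decode its header so we can compare the
// file's sample rate against what the model expects.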
const buffer = Fs.readFileSync(args['audio']);
const result = Wav.decode(buffer);

if (result.sampleRate < desired_sample_rate) {
  console.error(`Warning: original sample rate (${result.sampleRate}) ` +
                `is lower than ${desired_sample_rate} Hz. ` +
                `Up-sampling might produce erratic speech recognition.`);
}

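// Exit cleanly: under Electron the idiomatic shutdown is app.quit(),
// while plain Node can simply call process.exit().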
function handleExit() {
  if (process.versions.electron) {
    const { app } = require("electron");
    app.quit();
  } else {
    process.exit(0);
  }
}

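// Wrap the in-memory WAV buffer in a readable stream so it can be piped
// through the SoX transcoder below.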
function bufferToStream(buffer: Buffer) {
  let stream = new Duplex();
  stream.push(buffer);
  stream.push(null);
  return stream;
}

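// Transcode the audio to the format the acoustic model consumes: 16-bit
// little-endian signed PCM, mono, at the model's sample rate, emitted as
// a headerless raw stream.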
let conversionStream = bufferToStream(buffer)
  .pipe(Sox({
    global: {
      'no-dither': true,
      'replay-gain': 'off',
    },
    output: {
      bits: 16,
      rate: desired_sample_rate,
      channels: 1,
      encoding: 'signed-integer',
      endian: 'little',
      compression: 0.0,
      type: 'raw'
    }
  }));

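// Two code paths from here: buffer all converted audio and run a single
// batch inference (default), or feed chunks to the streaming API as they
// arrive and print intermediate decodes (--stream).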
if (!args['stream']) {
  // Batch path: collect the converted audio, then run one inference call.
  let audioStream = new MemoryStream();
  conversionStream.pipe(audioStream);
  audioStream.on('finish', () => {
    let audioBuffer = audioStream.toBuffer();

    const inference_start = process.hrtime();
    console.error('Running inference.');
    // 16-bit samples are 2 bytes each, so length/2 samples played back at
    // desired_sample_rate samples per second gives the duration in seconds.
    const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);

    // sphinx-doc: js_ref_inference_start
    if (args['extended']) {
      let metadata = model.sttWithMetadata(audioBuffer, 1);
      console.log(candidateTranscriptToString(metadata.transcripts[0]));
      Ds.FreeMetadata(metadata);
    } else {
      console.log(model.stt(audioBuffer));
    }
    // sphinx-doc: js_ref_inference_stop
    const inference_stop = process.hrtime(inference_start);
    console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
    Ds.FreeModel(model);
    // Allow some time for resources to be released and ensure we finish
    // the process anyway
    setTimeout(() => {
      handleExit();
    }, 1000);
  });
  audioStream.on('close', () => {
    handleExit();
  });
} else {
  // Streaming path: feed converted audio chunk by chunk, printing an
  // intermediate decode after each chunk and the final transcription on
  // end-of-stream.
  let stream = model.createStream();
  conversionStream.on('data', (chunk: Buffer) => {
    stream.feedAudioContent(chunk);
    if (args['extended']) {
      let metadata = stream.intermediateDecodeWithMetadata();
      console.error('intermediate: ' + candidateTranscriptToString(metadata.transcripts[0]));
    } else {
      console.error('intermediate: ' + stream.intermediateDecode());
    }
  });
  conversionStream.on('end', () => {
    if (args['extended']) {
      let metadata = stream.finishStreamWithMetadata();
      console.log(candidateTranscriptToString(metadata.transcripts[0]));
    } else {
      console.log(stream.finishStream());
    }
  });
}