#!/usr/bin/env node
// This is required for process.versions.electron below
/// <reference types="electron" />
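// Example invocation (file names are placeholders, not shipped files):
//   node client.js --model <model file> --audio <audio.wav> [--scorer <scorer file>]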
import * as Ds from "./index";
import * as Fs from "fs";
import Sox from "sox-stream";
import * as argparse from "argparse";
const MemoryStream = require("memory-stream");
const Wav = require("node-wav");
const Duplex = require("stream").Duplex;
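// argparse action that prints the library version and the runtime
// (Node or Electron), then exits immediately.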
class VersionAction extends argparse.Action {
  call(parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: string | string[], optionString: string | null) {
    console.log('Coqui STT ' + Ds.Version());
    let runtime = 'Node';
    if (process.versions.electron) {
      runtime = 'Electron';
    }
    console.error('Runtime: ' + runtime);
    process.exit(0);
  }
}
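// Command-line interface. Note this uses the argparse v1 API
// (addArgument/parseArgs); v2 of the package renamed these to snake_case.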
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running Coqui STT inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
parser.addArgument(['--scorer'], {help: 'Path to the external scorer file'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
// The next three options are read below (setBeamWidth/setScorerAlphaBeta)
// but were never registered, so argparse would reject them on the command line.
parser.addArgument(['--beam_width'], {type: 'int', help: 'Beam width for the CTC decoder'});
parser.addArgument(['--lm_alpha'], {type: 'float', help: 'Language model weight (lm_alpha)'});
parser.addArgument(['--lm_beta'], {type: 'float', help: 'Word insertion bonus (lm_beta)'});
parser.addArgument(['--version'], {action: VersionAction, nargs: 0, help: 'Print version and exit'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
parser.addArgument(['--stream'], {action: 'storeTrue', help: 'Use streaming code path (for tests)'});
parser.addArgument(['--hot_words'], {help: 'Hot-words and their boosts. Word:Boost pairs are comma-separated'});
let args = parser.parseArgs();
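// Convert a process.hrtime() [seconds, nanoseconds] tuple into fractional
// seconds with four significant digits.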
function totalTime(hrtimeValue: number[]): string {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}
function candidateTranscriptToString(transcript: Ds.CandidateTranscript): string {
  let retval = "";
  for (let i = 0; i < transcript.tokens.length; ++i) {
    retval += transcript.tokens[i].text;
  }
  return retval;
}
// sphinx-doc: js_ref_model_start
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
let model = new Ds.Model(args['model']);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['beam_width']) {
  model.setBeamWidth(args['beam_width']);
}
// sphinx-doc: js_ref_model_stop
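// The model reports the sample rate it expects; the SoX pipeline below
// resamples the input audio to match.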
let desired_sample_rate = model.sampleRate();
if (args['scorer']) {
  console.error('Loading scorer from file %s', args['scorer']);
  const scorer_load_start = process.hrtime();
  model.enableExternalScorer(args['scorer']);
  const scorer_load_end = process.hrtime(scorer_load_start);
  console.error('Loaded scorer in %ds.', totalTime(scorer_load_end));
  if (args['lm_alpha'] && args['lm_beta']) {
    model.setScorerAlphaBeta(args['lm_alpha'], args['lm_beta']);
  }
}
if (args['hot_words']) {
  console.error('Adding hot-words %s', args['hot_words']);
  for (let word_boost of args['hot_words'].split(',')) {
    let word = word_boost.split(':');
    model.addHotWord(word[0], parseFloat(word[1]));
  }
}
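// Decode the WAV header to learn the file's native sample rate; the raw
// buffer itself is what gets piped into SoX below.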
const buffer = Fs.readFileSync(args['audio']);
const result = Wav.decode(buffer);
if (result.sampleRate < desired_sample_rate) {
  console.error(`Warning: original sample rate (${result.sampleRate} Hz) ` +
                `is lower than ${desired_sample_rate} Hz. ` +
                `Up-sampling might produce erratic speech recognition.`);
}
function handleExit() {
  if (process.versions.electron) {
    const { app } = require("electron");
    app.quit();
  } else {
    process.exit(0);
  }
}
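// Wrap an in-memory Buffer in a readable stream so it can be piped into SoX.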
function bufferToStream(buffer: Buffer) {
  let stream = new Duplex();
  stream.push(buffer);
  stream.push(null);
  return stream;
}
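// Normalize the input to what the model expects: mono, 16-bit signed
// little-endian raw PCM at the model's sample rate.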
let conversionStream = bufferToStream(buffer).pipe(Sox({
  global: {
    'no-dither': true,
    'replay-gain': 'off',
  },
  output: {
    bits: 16,
    rate: desired_sample_rate,
    channels: 1,
    encoding: 'signed-integer',
    endian: 'little',
    compression: 0.0,
    type: 'raw'
  }
}));
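// Two code paths: by default the converted audio is buffered in full and a
// single batch inference is run; with --stream, chunks are fed through the
// streaming API as SoX emits them.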
if (!args['stream']) {
  let audioStream = new MemoryStream();
  conversionStream.pipe(audioStream);
  audioStream.on('finish', () => {
    let audioBuffer = audioStream.toBuffer();
    const inference_start = process.hrtime();
    console.error('Running inference.');
    // 16-bit samples: two bytes per sample, mono.
    const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);
    // sphinx-doc: js_ref_inference_start
    if (args['extended']) {
      let metadata = model.sttWithMetadata(audioBuffer, 1);
      console.log(candidateTranscriptToString(metadata.transcripts[0]));
      Ds.FreeMetadata(metadata);
    } else {
      console.log(model.stt(audioBuffer));
    }
    // sphinx-doc: js_ref_inference_stop
    const inference_stop = process.hrtime(inference_start);
    console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
    Ds.FreeModel(model);
    // Give native resources a moment to be released, then make sure the
    // process exits even if something keeps the event loop alive.
    setTimeout(() => {
      handleExit();
    }, 1000);
  });
  audioStream.on('close', () => {
    handleExit();
  });
} else {
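  // Streaming path: feed each converted chunk as it arrives, printing an
  // intermediate transcript; the final transcript is printed on 'end'.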
  let stream = model.createStream();
  conversionStream.on('data', (chunk: Buffer) => {
    stream.feedAudioContent(chunk);
    if (args['extended']) {
      let metadata = stream.intermediateDecodeWithMetadata();
      console.error('intermediate: ' + candidateTranscriptToString(metadata.transcripts[0]));
    } else {
      console.error('intermediate: ' + stream.intermediateDecode());
    }
  });
  conversionStream.on('end', () => {
    if (args['extended']) {
      let metadata = stream.finishStreamWithMetadata();
      console.log(candidateTranscriptToString(metadata.transcripts[0]));
    } else {
      console.log(stream.finishStream());
    }
  });
}