import binary from 'node-pre-gyp';
import path from 'path';

// 'lib', 'binding', 'v0.1.1', ['node', 'v' + process.versions.modules, process.platform, process.arch].join('-'), 'stt-bindings.node')
const binding_path = binary.find(path.resolve(path.join(__dirname, 'package.json')));

// On Windows, we can't rely on RPATH being set to $ORIGIN/../ or on
// @loader_path/../ but we can change the PATH to include the proper directory
// for the dynamic linker
if (process.platform === 'win32') {
  var dslib_path = path.resolve(path.join(binding_path, '../..'));
  // electron-builder does weird magic hand-in-hand with electronjs,
  // and messes with the path where we expect things to be for the Windows
  // linker.
  if ('electron' in process.versions) {
    dslib_path = dslib_path.replace("app.asar", "app.asar.unpacked");
  }
  var oldPath = process.env.PATH;
  process.env['PATH'] = `${dslib_path};${process.env.PATH}`;
}

const binding = require(binding_path);

if (process.platform === 'win32') {
  process.env['PATH'] = oldPath;
}

/**
 * Stores text of an individual token, along with its timing information
 */
export interface TokenMetadata {
  /** The text corresponding to this token */
  text: string;

  /** Position of the token in units of 20ms */
  timestep: number;

  /** Position of the token in seconds */
  start_time: number;
}

/**
 * A single transcript computed by the model, including a confidence value and
 * the metadata for its constituent tokens.
 */
export interface CandidateTranscript {
  tokens: TokenMetadata[];

  /**
   * Approximated confidence value for this transcription. This is roughly the
   * sum of the acoustic model logit values for each timestep/token that
   * contributed to the creation of this transcription.
   */
  confidence: number;
}

/**
 * An array of CandidateTranscript objects computed by the model.
 */
export interface Metadata {
  transcripts: CandidateTranscript[];
}

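// Illustrative sketch (not part of the original API): shows how the interfaces
// above fit together. Given a Metadata result, rebuild the transcript text and
// per-token timing of its best candidate. Assumes candidates are ordered
// best-first, which matches how results are typically consumed.
function describeBestCandidate(metadata: Metadata): string {
  if (metadata.transcripts.length === 0) {
    return '';
  }
  const best: CandidateTranscript = metadata.transcripts[0];
  const text = best.tokens.map((token: TokenMetadata) => token.text).join('');
  const lines = best.tokens.map(
    (token: TokenMetadata) => `${token.start_time.toFixed(2)}s\t"${token.text}"`
  );
  return `${text} (confidence ${best.confidence})\n${lines.join('\n')}`;
}
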
/**
 * Provides an interface to a Coqui STT stream. The constructor cannot be called
 * directly, use :js:func:`Model.createStream`.
 */
class StreamImpl {
  /** @internal */
  _impl: any;

  /**
   * @param nativeStream SWIG wrapper for native StreamingState object.
   */
  constructor(nativeStream: object) {
    this._impl = nativeStream;
  }

  /**
   * Feed audio samples to an ongoing streaming inference.
   *
   * @param aBuffer An array of 16-bit, mono raw audio samples at the
   * appropriate sample rate (matching what the model was trained on).
   */
  feedAudioContent(aBuffer: Buffer): void {
    binding.FeedAudioContent(this._impl, aBuffer);
  }

  /**
   * Compute the intermediate decoding of an ongoing streaming inference.
   *
   * @return The STT intermediate result.
   */
  intermediateDecode(): string {
    return binding.IntermediateDecode(this._impl);
  }

  /**
   * Compute the intermediate decoding of an ongoing streaming inference, returning results that include metadata.
   *
   * @param aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
   *
   * @return :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
   */
  intermediateDecodeWithMetadata(aNumResults: number = 1): Metadata {
    return binding.IntermediateDecodeWithMetadata(this._impl, aNumResults);
  }

  /**
   * Compute the final decoding of an ongoing streaming inference and return the result. Signals the end of an ongoing streaming inference.
   *
   * @return The STT result.
   *
   * This method frees the stream; the stream must not be used after this method is called.
   */
  finishStream(): string {
    const result = binding.FinishStream(this._impl);
    this._impl = null;
    return result;
  }

  /**
   * Compute the final decoding of an ongoing streaming inference and return the results including metadata. Signals the end of an ongoing streaming inference.
   *
   * @param aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this. Default value is 1 if not specified.
   *
   * @return A :js:func:`Metadata` struct of individual tokens along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
   *
   * This method frees the stream; the stream must not be used after this method is called.
   */
  finishStreamWithMetadata(aNumResults: number = 1): Metadata {
    const result = binding.FinishStreamWithMetadata(this._impl, aNumResults);
    this._impl = null;
    return result;
  }
}

/**
 * Exposes the type of Stream without exposing the class itself: the Stream
 * class should not be instantiated directly, but instead created via
 * :js:func:`Model.createStream`.
 */
export type Stream = StreamImpl;

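// Illustrative sketch (not part of the original API): feed pre-chunked audio
// into an existing stream and collect the final transcript. The stream is
// assumed to come from Model.createStream(), and each chunk is assumed to be
// 16-bit mono PCM at the model's sample rate.
function transcribeChunks(stream: Stream, chunks: Buffer[]): string {
  for (const chunk of chunks) {
    stream.feedAudioContent(chunk);
    // Intermediate results can be polled at any point while feeding audio.
    console.log('partial:', stream.intermediateDecode());
  }
  // finishStream() frees the underlying native stream; do not reuse it afterwards.
  return stream.finishStream();
}
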
/**
 * An object providing an interface to a trained Coqui STT model.
 */
export class Model {
  /** @internal */
  _impl: any;

  /**
   * @param aModelPath The path to the frozen model graph.
   *
   * @throws on error
   */
  constructor(aModelPath: string) {
    this._impl = null;

    const [status, impl] = binding.CreateModel(aModelPath);
    if (status !== 0) {
      throw `CreateModel failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }

    this._impl = impl;
  }

  /**
   * Get beam width value used by the model. If :js:func:`Model.setBeamWidth` was
   * not called before, will return the default value loaded from the model file.
   *
   * @return Beam width value used by the model.
   */
  beamWidth(): number {
    return binding.GetModelBeamWidth(this._impl);
  }

  /**
   * Set beam width value used by the model.
   *
   * @param aBeamWidth The beam width used by the model. A larger beam width value generates better results at the cost of decoding time.
   *
   * @throws on error
   */
  setBeamWidth(aBeamWidth: number): void {
    const status = binding.SetModelBeamWidth(this._impl, aBeamWidth);
    if (status !== 0) {
      throw `SetModelBeamWidth failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Add a hot-word and its boost.
   *
   * Words that don't occur in the scorer (e.g. proper nouns) or strings that contain spaces won't be taken into account.
   *
   * @param aWord The word to boost.
   * @param aBoost The boost value. A positive value increases and a negative value decreases the chance of the word occurring in a transcription. An excessive positive boost might lead to the letters of the word following the hot-word being split up.
   *
   * @throws on error
   */
  addHotWord(aWord: string, aBoost: number): void {
    const status = binding.AddHotWord(this._impl, aWord, aBoost);
    if (status !== 0) {
      throw `addHotWord failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Erase the entry for a hot-word.
   *
   * @param aWord The word whose entry should be removed.
   *
   * @throws on error
   */
  eraseHotWord(aWord: string): void {
    const status = binding.EraseHotWord(this._impl, aWord);
    if (status !== 0) {
      throw `eraseHotWord failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Clear all hot-word entries.
   *
   * @throws on error
   */
  clearHotWords(): void {
    const status = binding.ClearHotWords(this._impl);
    if (status !== 0) {
      throw `clearHotWords failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Return the sample rate expected by the model.
   *
   * @return Sample rate.
   */
  sampleRate(): number {
    return binding.GetModelSampleRate(this._impl);
  }

  /**
   * Enable decoding using an external scorer.
   *
   * @param aScorerPath The path to the external scorer file.
   *
   * @throws on error
   */
  enableExternalScorer(aScorerPath: string): void {
    const status = binding.EnableExternalScorer(this._impl, aScorerPath);
    if (status !== 0) {
      throw `EnableExternalScorer failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Disable decoding using an external scorer.
   *
   * @throws on error
   */
  disableExternalScorer(): void {
    const status = binding.DisableExternalScorer(this._impl);
    if (status !== 0) {
      throw `DisableExternalScorer failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Set hyperparameters alpha and beta of the external scorer.
   *
   * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
   * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
   *
   * @throws on error
   */
  setScorerAlphaBeta(aLMAlpha: number, aLMBeta: number): void {
    const status = binding.SetScorerAlphaBeta(this._impl, aLMAlpha, aLMBeta);
    if (status !== 0) {
      throw `SetScorerAlphaBeta failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
  }

  /**
   * Use the Coqui STT model to perform Speech-To-Text.
   *
   * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
   *
   * @return The STT result. Returns undefined on error.
   */
  stt(aBuffer: Buffer): string {
    return binding.SpeechToText(this._impl, aBuffer);
  }

  /**
   * Use the Coqui STT model to perform Speech-To-Text and output metadata
   * about the results.
   *
   * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
   * @param aNumResults Maximum number of candidate transcripts to return. Returned list might be smaller than this.
   * Default value is 1 if not specified.
   *
   * @return :js:func:`Metadata` object containing multiple candidate transcripts. Each transcript has per-token metadata including timing information.
   * The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
   */
  sttWithMetadata(aBuffer: Buffer, aNumResults: number = 1): Metadata {
    return binding.SpeechToTextWithMetadata(this._impl, aBuffer, aNumResults);
  }

  /**
   * Create a new streaming inference state. One can then call :js:func:`StreamImpl.feedAudioContent` and :js:func:`StreamImpl.finishStream` on the returned stream object.
   *
   * @return a :js:func:`StreamImpl` object that represents the streaming state.
   *
   * @throws on error
   */
  createStream(): StreamImpl {
    const [status, ctx] = binding.CreateStream(this._impl);
    if (status !== 0) {
      throw `CreateStream failed: ${binding.ErrorCodeToErrorMessage(status)} (0x${status.toString(16)})`;
    }
    return new StreamImpl(ctx);
  }
}

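// Illustrative sketch (not part of the original API): typical batch usage of
// the Model class. The file paths, hyperparameter values, and the raw PCM
// input are hypothetical; the buffer is assumed to already be 16-bit mono PCM
// at model.sampleRate().
function exampleBatchTranscription(audio: Buffer): string {
  const model = new Model('model.tflite');          // hypothetical model path
  model.enableExternalScorer('huge-vocab.scorer');  // hypothetical scorer path
  model.setScorerAlphaBeta(0.931, 1.834);           // example hyperparameters
  model.addHotWord('coqui', 10.0);                  // optional: boost a proper noun
  const transcript = model.stt(audio);
  FreeModel(model);                                 // release native resources when done
  return transcript;
}
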
/**
 * Frees associated resources and destroys model object.
 *
 * @param model A model pointer returned by :js:func:`Model`
 */
export function FreeModel(model: Model): void {
  binding.FreeModel(model._impl);
}

/**
 * Free memory allocated for metadata information.
 *
 * @param metadata Object containing metadata as returned by :js:func:`Model.sttWithMetadata` or :js:func:`StreamImpl.finishStreamWithMetadata`
 */
export function FreeMetadata(metadata: Metadata): void {
  binding.FreeMetadata(metadata);
}

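// Illustrative sketch (not part of the original API): the ownership pattern for
// metadata results. Metadata returned by sttWithMetadata() (or the
// *WithMetadata stream methods) is backed by native memory and must be released
// with FreeMetadata() once it has been consumed. `model` and `audio` are
// assumed to be set up as in the earlier sketches.
function exampleMetadataUsage(model: Model, audio: Buffer): string {
  const metadata = model.sttWithMetadata(audio, 3); // ask for up to 3 candidates
  const text = describeBestCandidate(metadata);     // helper sketched above
  FreeMetadata(metadata);                           // release the native allocation
  return text;
}
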
/**
 * Destroy a streaming state without decoding the computed logits. This
 * can be used if you no longer need the result of an ongoing streaming
 * inference and don't want to perform a costly decode operation.
 *
 * @param stream A streaming state pointer returned by :js:func:`Model.createStream`.
 */
export function FreeStream(stream: StreamImpl): void {
  binding.FreeStream(stream._impl);
}

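// Illustrative sketch (not part of the original API): abandoning a stream.
// When a caller decides mid-utterance that the result is no longer needed
// (e.g. the user cancelled), FreeStream() discards the native state without
// paying for a final decode, in contrast to finishStream().
function exampleCancelStream(model: Model, partialAudio: Buffer): void {
  const stream = model.createStream();
  stream.feedAudioContent(partialAudio);
  FreeStream(stream); // discard without decoding; do not use the stream afterwards
}
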
/**
 * Returns the version of this library. The returned version is a semantic
 * version (SemVer 2.0.0).
 */
export function Version(): string {
  return binding.Version();
}