Merge pull request #2350 from mozilla/breaking-api-cleanup

[BREAKING] API cleanup

commit 6b7ebf47f2
@@ -29,13 +29,11 @@ Then run with a TF Lite model, alphabet, LM/trie and a CSV test file
 BEAM_WIDTH = 500
 LM_ALPHA = 0.75
 LM_BETA = 1.85
-N_FEATURES = 26
-N_CONTEXT = 9

 def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
     os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
-    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
-    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
+    ds = Model(model, alphabet, BEAM_WIDTH)
+    ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)

     while True:
         msg = queue_in.get()
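For Python callers the migration is mechanical: drop the two geometry arguments from the Model constructor and the alphabet path from enableDecoderWithLM; the model graph now carries its own feature geometry. A minimal before/after sketch (file names and hyperparameter values are illustrative, not taken from this diff):

    from deepspeech import Model

    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    # Old API: ds = Model('graph.tflite', 26, 9, 'alphabet.txt', BEAM_WIDTH)
    # Old API: ds.enableDecoderWithLM('alphabet.txt', 'lm.binary', 'trie', LM_ALPHA, LM_BETA)
    ds = Model('graph.tflite', 'alphabet.txt', BEAM_WIDTH)
    ds.enableDecoderWithLM('lm.binary', 'trie', LM_ALPHA, LM_BETA)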
@@ -22,8 +22,6 @@ namespace DeepSpeechWPF
 {
     private readonly IDeepSpeech _sttClient;

-    private const uint N_CEP = 26;
-    private const uint N_CONTEXT = 9;
     private const uint BEAM_WIDTH = 500;
     private const float LM_ALPHA = 0.75f;
     private const float LM_BETA = 1.85f;

@@ -79,7 +77,7 @@ namespace DeepSpeechWPF
 {
     try
     {
-        _sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH);
+        _sttClient.CreateModel("output_graph.pbmm", "alphabet.txt", BEAM_WIDTH);
         Dispatcher.Invoke(() => { EnableControls(); });
     }
     catch (Exception ex)

@@ -155,7 +153,7 @@ namespace DeepSpeechWPF
 {
     try
     {
-        _sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA);
+        _sttClient.EnableDecoderWithLM("lm.binary", "trie", LM_ALPHA, LM_BETA);
         Dispatcher.Invoke(() => lblStatus.Content = "LM loaded.");
     }
     catch (Exception ex)

@@ -198,7 +196,7 @@ namespace DeepSpeechWPF
     _soundInSource.Dispose();
     _convertedSource.Dispose();
     _audioCapture.DataAvailable -= _capture_DataAvailable;
-    _sttClient.DiscardStream(); //this a good example of discardstream, the user changed the audio input, so we no longer need the current stream
+    _sttClient.FreeStream(); //this a good example of FreeStream, the user changed the audio input, so we no longer need the current stream
 }
 if (_audioCaptureDevices!=null)
 {

@@ -252,7 +250,7 @@ namespace DeepSpeechWPF

 private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
 {
-    _sttClient.SetupStream(16000);
+    _sttClient.CreateStream(16000);
     _audioCapture.Start();
     btnStartRecording.IsEnabled = false;
     btnStopRecording.IsEnabled = true;
@@ -33,8 +33,6 @@
 #include "deepspeech.h"
 #include "args.h"

-#define N_CEP 26
-#define N_CONTEXT 9
 #define BEAM_WIDTH 500
 #define LM_ALPHA 0.75f
 #define LM_BETA 1.85f

@@ -72,7 +70,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
     DS_FreeMetadata(metadata);
   } else if (stream_size > 0) {
     StreamingState* ctx;
-    int status = DS_SetupStream(aCtx, aSampleRate, &ctx);
+    int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
     if (status != DS_ERR_OK) {
       res.string = strdup("");
       return res;

@@ -377,7 +375,7 @@ main(int argc, char **argv)

   // Initialise DeepSpeech
   ModelState* ctx;
-  int status = DS_CreateModel(model, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH, &ctx);
+  int status = DS_CreateModel(model, alphabet, BEAM_WIDTH, &ctx);
   if (status != 0) {
     fprintf(stderr, "Could not create model.\n");
     return 1;

@@ -385,7 +383,6 @@ main(int argc, char **argv)

   if (lm && (trie || load_without_trie)) {
     int status = DS_EnableDecoderWithLM(ctx,
-                                        alphabet,
                                         lm,
                                         trie,
                                         LM_ALPHA,

@@ -449,7 +446,7 @@ main(int argc, char **argv)
   sox_quit();
 #endif // NO_SOX

-  DS_DestroyModel(ctx);
+  DS_FreeModel(ctx);

   return 0;
 }
@@ -257,8 +257,6 @@ StreamingState::processBatch(const vector<float>& buf, unsigned int n_steps)

 int
 DS_CreateModel(const char* aModelPath,
-               unsigned int aNCep,
-               unsigned int aNContext,
                const char* aAlphabetConfigPath,
                unsigned int aBeamWidth,
                ModelState** retval)

@@ -285,7 +283,7 @@ DS_CreateModel(const char* aModelPath,
     return DS_ERR_FAIL_CREATE_MODEL;
   }

-  int err = model->init(aModelPath, aNCep, aNContext, aAlphabetConfigPath, aBeamWidth);
+  int err = model->init(aModelPath, aAlphabetConfigPath, aBeamWidth);
   if (err != DS_ERR_OK) {
     return err;
   }

@@ -295,14 +293,13 @@ DS_CreateModel(const char* aModelPath,
 }

 void
-DS_DestroyModel(ModelState* ctx)
+DS_FreeModel(ModelState* ctx)
 {
   delete ctx;
 }

 int
 DS_EnableDecoderWithLM(ModelState* aCtx,
-                       const char* aAlphabetConfigPath,
                        const char* aLMPath,
                        const char* aTriePath,
                        float aLMAlpha,

@@ -320,9 +317,9 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
 }

 int
-DS_SetupStream(ModelState* aCtx,
-               unsigned int aSampleRate,
-               StreamingState** retval)
+DS_CreateStream(ModelState* aCtx,
+                unsigned int aSampleRate,
+                StreamingState** retval)
 {
   *retval = nullptr;

@@ -371,7 +368,7 @@ char*
 DS_FinishStream(StreamingState* aSctx)
 {
   char* str = aSctx->finishStream();
-  DS_DiscardStream(aSctx);
+  DS_FreeStream(aSctx);
   return str;
 }

@@ -379,18 +376,18 @@ Metadata*
 DS_FinishStreamWithMetadata(StreamingState* aSctx)
 {
   Metadata* metadata = aSctx->finishStreamWithMetadata();
-  DS_DiscardStream(aSctx);
+  DS_FreeStream(aSctx);
   return metadata;
 }

 StreamingState*
-SetupStreamAndFeedAudioContent(ModelState* aCtx,
-                               const short* aBuffer,
-                               unsigned int aBufferSize,
-                               unsigned int aSampleRate)
+CreateStreamAndFeedAudioContent(ModelState* aCtx,
+                                const short* aBuffer,
+                                unsigned int aBufferSize,
+                                unsigned int aSampleRate)
 {
   StreamingState* ctx;
-  int status = DS_SetupStream(aCtx, aSampleRate, &ctx);
+  int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
   if (status != DS_ERR_OK) {
     return nullptr;
   }

@@ -404,7 +401,7 @@ DS_SpeechToText(ModelState* aCtx,
                 unsigned int aBufferSize,
                 unsigned int aSampleRate)
 {
-  StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
   return DS_FinishStream(ctx);
 }

@@ -414,12 +411,12 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             unsigned int aBufferSize,
                             unsigned int aSampleRate)
 {
-  StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
   return DS_FinishStreamWithMetadata(ctx);
 }

 void
-DS_DiscardStream(StreamingState* aSctx)
+DS_FreeStream(StreamingState* aSctx)
 {
   delete aSctx;
 }
@@ -63,8 +63,6 @@ enum DeepSpeech_Error_Codes
  * @brief An object providing an interface to a trained DeepSpeech model.
  *
  * @param aModelPath The path to the frozen model graph.
- * @param aNCep The number of cepstrum the model was trained with.
- * @param aNContext The context window the model was trained with.
  * @param aAlphabetConfigPath The path to the configuration file specifying
  *                            the alphabet used by the network. See alphabet.h.
  * @param aBeamWidth The beam width used by the decoder. A larger beam

@@ -76,8 +74,6 @@ enum DeepSpeech_Error_Codes
  */
 DEEPSPEECH_EXPORT
 int DS_CreateModel(const char* aModelPath,
-                   unsigned int aNCep,
-                   unsigned int aNContext,
                    const char* aAlphabetConfigPath,
                    unsigned int aBeamWidth,
                    ModelState** retval);

@@ -86,7 +82,7 @@ int DS_CreateModel(const char* aModelPath,
  * @brief Frees associated resources and destroys model object.
  */
 DEEPSPEECH_EXPORT
-void DS_DestroyModel(ModelState* ctx);
+void DS_FreeModel(ModelState* ctx);

 /**
  * @brief Enable decoding using beam scoring with a KenLM language model.

@@ -106,7 +102,6 @@ void DS_DestroyModel(ModelState* ctx);
  */
 DEEPSPEECH_EXPORT
 int DS_EnableDecoderWithLM(ModelState* aCtx,
-                           const char* aAlphabetConfigPath,
                            const char* aLMPath,
                            const char* aTriePath,
                            float aLMAlpha,

@@ -145,9 +140,9 @@ char* DS_SpeechToText(ModelState* aCtx,
  */
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
-                                      const short* aBuffer,
-                                      unsigned int aBufferSize,
-                                      unsigned int aSampleRate);
+                                     const short* aBuffer,
+                                     unsigned int aBufferSize,
+                                     unsigned int aSampleRate);

 /**
  * @brief Create a new streaming inference state. The streaming state returned

@@ -162,14 +157,14 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
  * @return Zero for success, non-zero on failure.
  */
 DEEPSPEECH_EXPORT
-int DS_SetupStream(ModelState* aCtx,
-                   unsigned int aSampleRate,
-                   StreamingState** retval);
+int DS_CreateStream(ModelState* aCtx,
+                    unsigned int aSampleRate,
+                    StreamingState** retval);

 /**
  * @brief Feed audio samples to an ongoing streaming inference.
  *
- * @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  * @param aBuffer An array of 16-bit, mono raw audio samples at the
  *                appropriate sample rate.
  * @param aBufferSize The number of samples in @p aBuffer.

@@ -185,7 +180,7 @@ void DS_FeedAudioContent(StreamingState* aSctx,
  *        currently capable of streaming, so it always starts from the beginning
  *        of the audio.
  *
- * @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
  * @return The STT intermediate result. The user is responsible for freeing the
  *         string using {@link DS_FreeString()}.

@@ -197,7 +192,7 @@ char* DS_IntermediateDecode(StreamingState* aSctx);
  * @brief Signal the end of an audio signal to an ongoing streaming
  *        inference, returns the STT result over the whole audio signal.
  *
- * @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
  * @return The STT result. The user is responsible for freeing the string using
  *         {@link DS_FreeString()}.

@@ -211,7 +206,7 @@ char* DS_FinishStream(StreamingState* aSctx);
  * @brief Signal the end of an audio signal to an ongoing streaming
  *        inference, returns per-letter metadata.
  *
- * @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
  * @return Outputs a struct of individual letters along with their timing information.
  *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.

@@ -226,12 +221,12 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
  *        can be used if you no longer need the result of an ongoing streaming
  *        inference and don't want to perform a costly decode operation.
  *
- * @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  *
  * @note This method will free the state pointer (@p aSctx).
  */
 DEEPSPEECH_EXPORT
-void DS_DiscardStream(StreamingState* aSctx);
+void DS_FreeStream(StreamingState* aSctx);

 /**
  * @brief Free memory allocated for metadata information.
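In short, the C API now pairs every constructor with a matching destructor: DS_CreateModel/DS_FreeModel and DS_CreateStream/DS_FreeStream replace DS_CreateModel/DS_DestroyModel and DS_SetupStream/DS_DiscardStream, while DS_FinishStream still frees the stream as a side effect. A sketch of the renamed streaming flow as seen through the Python binding (method names follow the binding changes later in this commit; `audio` is assumed to be an int16 NumPy array loaded elsewhere):

    import numpy as np
    from deepspeech import Model

    ds = Model('output_graph.pbmm', 'alphabet.txt', 500)
    stream = ds.createStream(sample_rate=16000)  # DS_CreateStream under the hood
    for chunk in np.array_split(audio, 10):      # audio: np.int16 samples (assumed)
        ds.feedAudioContent(stream, chunk)       # DS_FeedAudioContent
    print(ds.finishStream(stream))               # DS_FinishStream; frees the stream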
@@ -0,0 +1,101 @@
+#ifndef DEEPSPEECH_COMPAT_H
+#define DEEPSPEECH_COMPAT_H
+
+#include "deepspeech.h"
+
+#warning This header is a convenience wrapper for compatibility with \
+the previous API, it has deprecated function names and arguments. \
+If possible, update your code instead of using this header.
+
+/**
+ * @brief An object providing an interface to a trained DeepSpeech model.
+ *
+ * @param aModelPath The path to the frozen model graph.
+ * @param aNCep UNUSED, DEPRECATED.
+ * @param aNContext UNUSED, DEPRECATED.
+ * @param aAlphabetConfigPath The path to the configuration file specifying
+ *                            the alphabet used by the network. See alphabet.h.
+ * @param aBeamWidth The beam width used by the decoder. A larger beam
+ *                   width generates better results at the cost of decoding
+ *                   time.
+ * @param[out] retval a ModelState pointer
+ *
+ * @return Zero on success, non-zero on failure.
+ */
+int DS_CreateModel(const char* aModelPath,
+                   unsigned int /*aNCep*/,
+                   unsigned int /*aNContext*/,
+                   const char* aAlphabetConfigPath,
+                   unsigned int aBeamWidth,
+                   ModelState** retval)
+{
+  return DS_CreateModel(aModelPath, aAlphabetConfigPath, aBeamWidth, retval);
+}
+
+/**
+ * @brief Frees associated resources and destroys model object.
+ */
+void DS_DestroyModel(ModelState* ctx)
+{
+  return DS_FreeModel(ctx);
+}
+
+/**
+ * @brief Enable decoding using beam scoring with a KenLM language model.
+ *
+ * @param aCtx The ModelState pointer for the model being changed.
+ * @param aAlphabetConfigPath UNUSED, DEPRECATED.
+ * @param aLMPath The path to the language model binary file.
+ * @param aTriePath The path to the trie file build from the same vocabu-
+ *                  lary as the language model binary.
+ * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
+ *                 weight.
+ * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
+ *                weight.
+ *
+ * @return Zero on success, non-zero on failure (invalid arguments).
+ */
+int DS_EnableDecoderWithLM(ModelState* aCtx,
+                           const char* /*aAlphabetConfigPath*/,
+                           const char* aLMPath,
+                           const char* aTriePath,
+                           float aLMAlpha,
+                           float aLMBeta)
+{
+  return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
+}
+
+/**
+ * @brief Create a new streaming inference state. The streaming state returned
+ *        by this function can then be passed to {@link DS_FeedAudioContent()}
+ *        and {@link DS_FinishStream()}.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aSampleRate The sample-rate of the audio signal.
+ * @param[out] retval an opaque pointer that represents the streaming state. Can
+ *                    be NULL if an error occurs.
+ *
+ * @return Zero for success, non-zero on failure.
+ */
+int DS_SetupStream(ModelState* aCtx,
+                   unsigned int aSampleRate,
+                   StreamingState** retval)
+{
+  return DS_CreateStream(aCtx, aSampleRate, retval);
+}
+
+/**
+ * @brief Destroy a streaming state without decoding the computed logits. This
+ *        can be used if you no longer need the result of an ongoing streaming
+ *        inference and don't want to perform a costly decode operation.
+ *
+ * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
+ *
+ * @note This method will free the state pointer (@p aSctx).
+ */
+void DS_DiscardStream(StreamingState* aSctx)
+{
+  return DS_FreeStream(aSctx);
+}
+
+#endif /* DEEPSPEECH_COMPAT_H */
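Note that deepspeech_compat.h only papers over C and C++ call sites; the bindings rename their wrappers outright. A hypothetical helper for Python code that has to run against both the old and the new binding (the method names and the sample_rate keyword are as they appear in this commit's Python changes; the helper itself is not part of the commit):

    def open_stream(model, sample_rate=16000):
        # Prefer the new name, fall back to the pre-#2350 one.
        create = getattr(model, 'createStream', None) or model.setupStream
        return create(sample_rate=sample_rate)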
@@ -32,13 +32,11 @@ namespace DeepSpeechClient
 /// Create an object providing an interface to a trained DeepSpeech model.
 /// </summary>
 /// <param name="aModelPath">The path to the frozen model graph.</param>
-/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
-/// <param name="aNContext">The context window the model was trained with.</param>
 /// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
 /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
-public unsafe void CreateModel(string aModelPath, uint aNCep,
-    uint aNContext, string aAlphabetConfigPath, uint aBeamWidth)
+public unsafe void CreateModel(string aModelPath,
+    string aAlphabetConfigPath, uint aBeamWidth)
 {
     string exceptionMessage = null;
     if (string.IsNullOrWhiteSpace(aModelPath))

@@ -63,8 +61,6 @@ namespace DeepSpeechClient
     throw new FileNotFoundException(exceptionMessage);
 }
 var resultCode = NativeImp.DS_CreateModel(aModelPath,
-    aNCep,
-    aNContext,
     aAlphabetConfigPath,
     aBeamWidth,
     ref _modelStatePP);

@@ -116,20 +112,18 @@ namespace DeepSpeechClient
 /// </summary>
 public unsafe void Dispose()
 {
-    NativeImp.DS_DestroyModel(_modelStatePP);
+    NativeImp.DS_FreeModel(_modelStatePP);
 }

 /// <summary>
 /// Enable decoding using beam scoring with a KenLM language model.
 /// </summary>
-/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
 /// <param name="aLMPath">The path to the language model binary file.</param>
 /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
 /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
 /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-public unsafe void EnableDecoderWithLM(string aAlphabetConfigPath,
-    string aLMPath, string aTriePath,
+public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
     float aLMAlpha, float aLMBeta)
 {
     string exceptionMessage = null;

@@ -148,7 +142,6 @@ namespace DeepSpeechClient
 }

 var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
-    aAlphabetConfigPath,
     aLMPath,
     aTriePath,
     aLMAlpha,

@@ -206,9 +199,9 @@ namespace DeepSpeechClient
 /// </summary>
 /// <param name="aSampleRate">The sample-rate of the audio signal</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-public unsafe void SetupStream(uint aSampleRate)
+public unsafe void CreateStream(uint aSampleRate)
 {
-    var resultCode = NativeImp.DS_SetupStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
+    var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
     EvaluateResultCode(resultCode);
 }

@@ -217,9 +210,9 @@ namespace DeepSpeechClient
 /// This can be used if you no longer need the result of an ongoing streaming
 /// inference and don't want to perform a costly decode operation.
 /// </summary>
-public unsafe void DiscardStream()
+public unsafe void FreeStream()
 {
-    NativeImp.DS_DiscardStream(ref _streamingStatePP);
+    NativeImp.DS_FreeStream(ref _streamingStatePP);
 }

 /// <summary>
@@ -17,27 +17,22 @@ namespace DeepSpeechClient.Interfaces
 /// Create an object providing an interface to a trained DeepSpeech model.
 /// </summary>
 /// <param name="aModelPath">The path to the frozen model graph.</param>
-/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
-/// <param name="aNContext">The context window the model was trained with.</param>
 /// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
 /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
-unsafe void CreateModel(string aModelPath, uint aNCep,
-    uint aNContext,
+unsafe void CreateModel(string aModelPath,
     string aAlphabetConfigPath,
     uint aBeamWidth);

 /// <summary>
 /// Enable decoding using beam scoring with a KenLM language model.
 /// </summary>
-/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
 /// <param name="aLMPath">The path to the language model binary file.</param>
 /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
 /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
 /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
-unsafe void EnableDecoderWithLM(string aAlphabetConfigPath,
-    string aLMPath,
+unsafe void EnableDecoderWithLM(string aLMPath,
     string aTriePath,
     float aLMAlpha,
     float aLMBeta);

@@ -69,7 +64,7 @@ namespace DeepSpeechClient.Interfaces
 /// This can be used if you no longer need the result of an ongoing streaming
 /// inference and don't want to perform a costly decode operation.
 /// </summary>
-unsafe void DiscardStream();
+unsafe void FreeStream();

 /// <summary>
 /// Free a DeepSpeech allocated string

@@ -86,7 +81,7 @@ namespace DeepSpeechClient.Interfaces
 /// </summary>
 /// <param name="aSampleRate">The sample-rate of the audio signal</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-unsafe void SetupStream(uint aSampleRate);
+unsafe void CreateStream(uint aSampleRate);

 /// <summary>
 /// Feeds audio samples to an ongoing streaming inference.
@@ -17,15 +17,12 @@ namespace DeepSpeechClient

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
-    uint aNCep,
-    uint aNContext,
     string aAlphabetConfigPath,
     uint aBeamWidth,
     ref ModelState** pint);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx,
-    string aAlphabetConfigPath,
     string aLMPath,
     string aTriePath,
     float aLMAlpha,

@@ -45,14 +42,14 @@ namespace DeepSpeechClient
     uint aSampleRate);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-internal static unsafe extern void DS_DestroyModel(ModelState** aCtx);
+internal static unsafe extern void DS_FreeModel(ModelState** aCtx);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-internal static unsafe extern ErrorCodes DS_SetupStream(ModelState** aCtx,
+internal static unsafe extern ErrorCodes DS_CreateStream(ModelState** aCtx,
     uint aSampleRate, ref StreamingState** retval);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx);
+internal static unsafe extern void DS_FreeStream(ref StreamingState** aSctx);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);
@@ -7,6 +7,8 @@ using GraphDef = System.IntPtr;

 namespace DeepSpeechClient.Structs
 {
+    //FIXME: ModelState is an opaque pointer to the API, why is this code reverse
+    //       engineering its contents?
     [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
     public unsafe struct ModelState
     {
@@ -50,8 +50,6 @@ namespace CSharpExamples
     extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
 }

-const uint N_CEP = 26;
-const uint N_CONTEXT = 9;
 const uint BEAM_WIDTH = 500;
 const float LM_ALPHA = 0.75f;
 const float LM_BETA = 1.85f;

@@ -66,7 +64,6 @@ namespace CSharpExamples
 stopwatch.Start();
 sttClient.CreateModel(
     model ?? "output_graph.pbmm",
-    N_CEP, N_CONTEXT,
     alphabet ?? "alphabet.txt",
     BEAM_WIDTH);
 stopwatch.Stop();

@@ -77,7 +74,6 @@ namespace CSharpExamples
 {
     Console.WriteLine("Loadin LM...");
     sttClient.EnableDecoderWithLM(
-        alphabet ?? "alphabet.txt",
         lm ?? "lm.binary",
         trie ?? "trie",
         LM_ALPHA, LM_BETA);
@@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {

 Button _startInference;

-final int N_CEP = 26;
-final int N_CONTEXT = 9;
 final int BEAM_WIDTH = 50;
 final float LM_ALPHA = 0.75f;
 final float LM_BETA = 1.85f;

@@ -54,7 +52,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
 private void newModel(String tfliteModel, String alphabet) {
     this._tfliteStatus.setText("Creating model");
     if (this._m == null) {
-        this._m = new DeepSpeechModel(tfliteModel, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH);
+        this._m = new DeepSpeechModel(tfliteModel, alphabet, BEAM_WIDTH);
     }
 }

@@ -167,7 +165,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
     super.onDestroy();

     if (this._m != null) {
-        this._m.destroyModel();
+        this._m.freeModel();
     }
 }
}
@@ -35,8 +35,6 @@ public class BasicTest {
 public static final String trieFile = "/data/local/tmp/test/trie";
 public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";

-public static final int N_CEP = 26;
-public static final int N_CONTEXT = 9;
 public static final int BEAM_WIDTH = 50;

 public static final float LM_ALPHA = 0.75f;

@@ -66,8 +64,8 @@ public class BasicTest {

 @Test
 public void loadDeepSpeech_basic() {
-    DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
-    m.destroyModel();
+    DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+    m.freeModel();
 }

 private String metadataToString(Metadata m) {

@@ -123,39 +121,39 @@ public class BasicTest {

 @Test
 public void loadDeepSpeech_stt_noLM() {
-    DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
+    DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);

     String decoded = doSTT(m, false);
     assertEquals("she had your dark suit in greasy wash water all year", decoded);
-    m.destroyModel();
+    m.freeModel();
 }

 @Test
 public void loadDeepSpeech_stt_withLM() {
-    DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
-    m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
+    DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+    m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);

     String decoded = doSTT(m, false);
     assertEquals("she had your dark suit in greasy wash water all year", decoded);
-    m.destroyModel();
+    m.freeModel();
 }

 @Test
 public void loadDeepSpeech_sttWithMetadata_noLM() {
-    DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
+    DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);

     String decoded = doSTT(m, true);
     assertEquals("she had your dark suit in greasy wash water all year", decoded);
-    m.destroyModel();
+    m.freeModel();
 }

 @Test
 public void loadDeepSpeech_sttWithMetadata_withLM() {
-    DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
-    m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
+    DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+    m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);

     String decoded = doSTT(m, true);
     assertEquals("she had your dark suit in greasy wash water all year", decoded);
-    m.destroyModel();
+    m.freeModel();
 }
 }
@@ -11,18 +11,18 @@ public class DeepSpeechModel {
 SWIGTYPE_p_p_ModelState _mspp;
 SWIGTYPE_p_ModelState _msp;

-public DeepSpeechModel(String modelPath, int n_cep, int n_context, String alphabetPath, int beam_width) {
+public DeepSpeechModel(String modelPath, String alphabetPath, int beam_width) {
     this._mspp = impl.new_modelstatep();
-    impl.CreateModel(modelPath, n_cep, n_context, alphabetPath, beam_width, this._mspp);
+    impl.CreateModel(modelPath, alphabetPath, beam_width, this._mspp);
     this._msp = impl.modelstatep_value(this._mspp);
 }

-public void destroyModel() {
-    impl.DestroyModel(this._msp);
+public void freeModel() {
+    impl.FreeModel(this._msp);
 }

-public void enableDecoderWihLM(String alphabet, String lm, String trie, float lm_alpha, float lm_beta) {
-    impl.EnableDecoderWithLM(this._msp, alphabet, lm, trie, lm_alpha, lm_beta);
+public void enableDecoderWihLM(String lm, String trie, float lm_alpha, float lm_beta) {
+    impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
 }

 public String stt(short[] buffer, int buffer_size, int sample_rate) {

@@ -33,9 +33,9 @@ public class DeepSpeechModel {
     return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
 }

-public DeepSpeechStreamingState setupStream(int sample_rate) {
+public DeepSpeechStreamingState createStream(int sample_rate) {
     SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-    impl.SetupStream(this._msp, sample_rate, ssp);
+    impl.CreateStream(this._msp, sample_rate, ssp);
     return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
 }
@@ -22,16 +22,6 @@ const LM_ALPHA = 0.75;
 const LM_BETA = 1.85;


-// These constants are tied to the shape of the graph used (changing them changes
-// the geometry of the first layer), so make sure you use the same constants that
-// were used during training
-
-// Number of MFCC features to use
-const N_FEATURES = 26;
-
-// Size of the context window used for producing timesteps in the input vector
-const N_CONTEXT = 9;
-
 var VersionAction = function VersionAction(options) {
   options = options || {};
   options.nargs = 0;

@@ -109,15 +99,14 @@ audioStream.on('finish', () => {

 console.error('Loading model from file %s', args['model']);
 const model_load_start = process.hrtime();
-var model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
+var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
 const model_load_end = process.hrtime(model_load_start);
 console.error('Loaded model in %ds.', totalTime(model_load_end));

 if (args['lm'] && args['trie']) {
   console.error('Loading language model from files %s %s', args['lm'], args['trie']);
   const lm_load_start = process.hrtime();
-  model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
-                            LM_ALPHA, LM_BETA);
+  model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
   const lm_load_end = process.hrtime(lm_load_start);
   console.error('Loaded language model in %ds.', totalTime(lm_load_end));
 }

@@ -135,6 +124,6 @@ audioStream.on('finish', () => {
 }
 const inference_stop = process.hrtime(inference_start);
 console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
-Ds.DestroyModel(model);
+Ds.FreeModel(model);
 process.exit(0);
});
@@ -47,7 +47,7 @@ using namespace node;
 }


-// convert double pointer retval in SetupStream to an output
+// convert double pointer retval in CreateStream to an output
 %typemap(in, numinputs=0) StreamingState **retval (StreamingState *ret) {
   ret = NULL;
   $1 = &ret;
@@ -48,13 +48,13 @@ Model.prototype.sttWithMetadata = function() {
     return binding.SpeechToTextWithMetadata.apply(null, args);
 }

-Model.prototype.setupStream = function() {
+Model.prototype.createStream = function() {
     const args = [this._impl].concat(Array.prototype.slice.call(arguments));
-    const rets = binding.SetupStream.apply(null, args);
+    const rets = binding.CreateStream.apply(null, args);
     const status = rets[0];
     const ctx = rets[1];
     if (status !== 0) {
-        throw "SetupStream failed with error code " + status;
+        throw "CreateStream failed with error code " + status;
     }
     return ctx;
 }

@@ -75,13 +75,14 @@ Model.prototype.finishStreamWithMetadata = function() {
     return binding.FinishStreamWithMetadata.apply(null, arguments);
 }

-function DestroyModel(model) {
-    return binding.DestroyModel(model._impl);
+function FreeModel(model) {
+    return binding.FreeModel(model._impl);
 }

 module.exports = {
     Model: Model,
     printVersions: binding.PrintVersions,
-    DestroyModel: DestroyModel,
+    FreeModel: FreeModel,
+    FreeStream: binding.FreeStream,
     FreeMetadata: binding.FreeMetadata
 };
@@ -25,13 +25,9 @@ ModelState::~ModelState()

 int
 ModelState::init(const char* model_path,
-                 unsigned int n_features,
-                 unsigned int n_context,
                  const char* alphabet_path,
                  unsigned int beam_width)
 {
-  n_features_ = n_features;
-  n_context_ = n_context;
   if (alphabet_.init(alphabet_path)) {
     return DS_ERR_INVALID_ALPHABET;
   }
@@ -35,8 +35,6 @@ struct ModelState {
 virtual ~ModelState();

 virtual int init(const char* model_path,
-                 unsigned int n_features,
-                 unsigned int n_context,
                  const char* alphabet_path,
                  unsigned int beam_width);

@@ -1,6 +1,9 @@
 import os
 import platform

+#The API is not snake case which triggers linter errors
+#pylint: disable=invalid-name
+
 # On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on
 # @loader_path/lib but we can change the PATH to include the proper directory
 # for the dynamic linker

@@ -12,6 +15,7 @@ import deepspeech

 # rename for backwards compatibility
 from deepspeech.impl import PrintVersions as printVersions
+from deepspeech.impl import FreeStream as freeStream

 class Model(object):
     def __init__(self, *args, **kwargs):

@@ -25,7 +29,7 @@ class Model(object):

     def __del__(self):
         if self._impl:
-            deepspeech.impl.DestroyModel(self._impl)
+            deepspeech.impl.FreeModel(self._impl)
             self._impl = None

     def enableDecoderWithLM(self, *args, **kwargs):

@@ -37,11 +41,11 @@ class Model(object):
     def sttWithMetadata(self, *args, **kwargs):
         return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)

-    def setupStream(self, sample_rate=16000):
-        status, ctx = deepspeech.impl.SetupStream(self._impl,
-                                                  aSampleRate=sample_rate)
+    def createStream(self, sample_rate=16000):
+        status, ctx = deepspeech.impl.CreateStream(self._impl,
+                                                   aSampleRate=sample_rate)
         if status != 0:
-            raise RuntimeError("SetupStream failed with error code {}".format(status))
+            raise RuntimeError("CreateStream failed with error code {}".format(status))
         return ctx

     def feedAudioContent(self, *args, **kwargs):
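One behavioral detail worth noting in the wrapper above: createStream raises RuntimeError rather than returning an error code, so callers should be prepared to catch it. A small usage sketch (model construction elided; the error-message format is the one set in this commit):

    try:
        stream = ds.createStream(sample_rate=16000)
    except RuntimeError as err:
        # The message carries the numeric DS error code.
        print('Failed to create stream:', err)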
@@ -32,17 +32,6 @@ LM_ALPHA = 0.75
 LM_BETA = 1.85


-# These constants are tied to the shape of the graph used (changing them changes
-# the geometry of the first layer), so make sure you use the same constants that
-# were used during training
-
-# Number of MFCC features to use
-N_FEATURES = 26
-
-# Size of the context window used for producing timesteps in the input vector
-N_CONTEXT = 9
-
-
 def convert_samplerate(audio_path):
     sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE)
     try:

@@ -88,14 +77,14 @@ def main():

     print('Loading model from file {}'.format(args.model), file=sys.stderr)
     model_load_start = timer()
-    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
+    ds = Model(args.model, args.alphabet, BEAM_WIDTH)
     model_load_end = timer() - model_load_start
     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

     if args.lm and args.trie:
         print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
         lm_load_start = timer()
-        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
+        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
         lm_load_end = timer() - lm_load_start
         print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

@@ -19,7 +19,7 @@ import_array();
 }

 %typemap(argout) ModelState **retval {
-  // not owned, Python wrapper in __init__.py calls DS_DestroyModel
+  // not owned, Python wrapper in __init__.py calls DS_FreeModel
   %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
 }

@@ -21,17 +21,6 @@ LM_ALPHA = 0.75
 LM_BETA = 1.85


-# These constants are tied to the shape of the graph used (changing them changes
-# the geometry of the first layer), so make sure you use the same constants that
-# were used during training
-
-# Number of MFCC features to use
-N_FEATURES = 26
-
-# Size of the context window used for producing timesteps in the input vector
-N_CONTEXT = 9
-
-
 def main():
     parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
     parser.add_argument('--model', required=True,

@@ -48,10 +37,10 @@ def main():
                         help='Second audio file to use in interleaved streams')
     args = parser.parse_args()

-    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
+    ds = Model(args.model, args.alphabet, BEAM_WIDTH)

     if args.lm and args.trie:
-        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
+        ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)

     fin = wave.open(args.audio1, 'rb')
     fs1 = fin.getframerate()

@@ -63,8 +52,8 @@ def main():
     audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
     fin.close()

-    stream1 = ds.setupStream(sample_rate=fs1)
-    stream2 = ds.setupStream(sample_rate=fs2)
+    stream1 = ds.createStream(sample_rate=fs1)
+    stream2 = ds.createStream(sample_rate=fs2)

     splits1 = np.array_split(audio1, 10)
     splits2 = np.array_split(audio2, 10)
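This two-stream script demonstrates the pattern behind the rename: a model owns no single stream, and several can be created and fed concurrently. Condensed to its core (names as in the script; the chunk count of 10 is arbitrary):

    stream1 = ds.createStream(sample_rate=fs1)
    stream2 = ds.createStream(sample_rate=fs2)
    for part1, part2 in zip(np.array_split(audio1, 10), np.array_split(audio2, 10)):
        ds.feedAudioContent(stream1, part1)  # interleave feeding of both streams
        ds.feedAudioContent(stream2, part2)
    print(ds.finishStream(stream1))          # each finish frees its own stream
    print(ds.finishStream(stream2))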
@@ -89,12 +89,10 @@ TFLiteModelState::~TFLiteModelState()

 int
 TFLiteModelState::init(const char* model_path,
-                       unsigned int n_features,
-                       unsigned int n_context,
                        const char* alphabet_path,
                        unsigned int beam_width)
 {
-  int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width);
+  int err = ModelState::init(model_path, alphabet_path, beam_width);
   if (err != DS_ERR_OK) {
     return err;
   }
@@ -31,8 +31,6 @@ struct TFLiteModelState : public ModelState
 virtual ~TFLiteModelState();

 virtual int init(const char* model_path,
-                 unsigned int n_features,
-                 unsigned int n_context,
                  const char* alphabet_path,
                  unsigned int beam_width) override;

@@ -25,12 +25,10 @@ TFModelState::~TFModelState()

 int
 TFModelState::init(const char* model_path,
-                   unsigned int n_features,
-                   unsigned int n_context,
                    const char* alphabet_path,
                    unsigned int beam_width)
 {
-  int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width);
+  int err = ModelState::init(model_path, alphabet_path, beam_width);
   if (err != DS_ERR_OK) {
     return err;
   }
@@ -19,8 +19,6 @@ struct TFModelState : public ModelState
 virtual ~TFModelState();

 virtual int init(const char* model_path,
-                 unsigned int n_features,
-                 unsigned int n_context,
                  const char* alphabet_path,
                  unsigned int beam_width) override;
