Merge pull request #2350 from mozilla/breaking-api-cleanup

[BREAKING] API cleanup
This commit is contained in:
Reuben Morais 2019-09-09 21:28:43 +02:00 committed by GitHub
commit 6b7ebf47f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 206 additions and 183 deletions

View File

@ -29,13 +29,11 @@ Then run with a TF Lite model, alphabet, LM/trie and a CSV test file
BEAM_WIDTH = 500 BEAM_WIDTH = 500
LM_ALPHA = 0.75 LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
N_FEATURES = 26
N_CONTEXT = 9
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask): def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask) os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH) ds = Model(model, alphabet, BEAM_WIDTH)
ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA) ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
while True: while True:
msg = queue_in.get() msg = queue_in.get()

View File

@ -22,8 +22,6 @@ namespace DeepSpeechWPF
{ {
private readonly IDeepSpeech _sttClient; private readonly IDeepSpeech _sttClient;
private const uint N_CEP = 26;
private const uint N_CONTEXT = 9;
private const uint BEAM_WIDTH = 500; private const uint BEAM_WIDTH = 500;
private const float LM_ALPHA = 0.75f; private const float LM_ALPHA = 0.75f;
private const float LM_BETA = 1.85f; private const float LM_BETA = 1.85f;
@ -79,7 +77,7 @@ namespace DeepSpeechWPF
{ {
try try
{ {
_sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH); _sttClient.CreateModel("output_graph.pbmm", "alphabet.txt", BEAM_WIDTH);
Dispatcher.Invoke(() => { EnableControls(); }); Dispatcher.Invoke(() => { EnableControls(); });
} }
catch (Exception ex) catch (Exception ex)
@ -155,7 +153,7 @@ namespace DeepSpeechWPF
{ {
try try
{ {
_sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA); _sttClient.EnableDecoderWithLM("lm.binary", "trie", LM_ALPHA, LM_BETA);
Dispatcher.Invoke(() => lblStatus.Content = "LM loaded."); Dispatcher.Invoke(() => lblStatus.Content = "LM loaded.");
} }
catch (Exception ex) catch (Exception ex)
@ -198,7 +196,7 @@ namespace DeepSpeechWPF
_soundInSource.Dispose(); _soundInSource.Dispose();
_convertedSource.Dispose(); _convertedSource.Dispose();
_audioCapture.DataAvailable -= _capture_DataAvailable; _audioCapture.DataAvailable -= _capture_DataAvailable;
_sttClient.DiscardStream(); //this a good example of discardstream, the user changed the audio input, so we no longer need the current stream _sttClient.FreeStream(); //this a good example of FreeStream, the user changed the audio input, so we no longer need the current stream
} }
if (_audioCaptureDevices!=null) if (_audioCaptureDevices!=null)
{ {
@ -252,7 +250,7 @@ namespace DeepSpeechWPF
private void BtnStartRecording_Click(object sender, RoutedEventArgs e) private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
{ {
_sttClient.SetupStream(16000); _sttClient.CreateStream(16000);
_audioCapture.Start(); _audioCapture.Start();
btnStartRecording.IsEnabled = false; btnStartRecording.IsEnabled = false;
btnStopRecording.IsEnabled = true; btnStopRecording.IsEnabled = true;

View File

@ -33,8 +33,6 @@
#include "deepspeech.h" #include "deepspeech.h"
#include "args.h" #include "args.h"
#define N_CEP 26
#define N_CONTEXT 9
#define BEAM_WIDTH 500 #define BEAM_WIDTH 500
#define LM_ALPHA 0.75f #define LM_ALPHA 0.75f
#define LM_BETA 1.85f #define LM_BETA 1.85f
@ -72,7 +70,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
DS_FreeMetadata(metadata); DS_FreeMetadata(metadata);
} else if (stream_size > 0) { } else if (stream_size > 0) {
StreamingState* ctx; StreamingState* ctx;
int status = DS_SetupStream(aCtx, aSampleRate, &ctx); int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
if (status != DS_ERR_OK) { if (status != DS_ERR_OK) {
res.string = strdup(""); res.string = strdup("");
return res; return res;
@ -377,7 +375,7 @@ main(int argc, char **argv)
// Initialise DeepSpeech // Initialise DeepSpeech
ModelState* ctx; ModelState* ctx;
int status = DS_CreateModel(model, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH, &ctx); int status = DS_CreateModel(model, alphabet, BEAM_WIDTH, &ctx);
if (status != 0) { if (status != 0) {
fprintf(stderr, "Could not create model.\n"); fprintf(stderr, "Could not create model.\n");
return 1; return 1;
@ -385,7 +383,6 @@ main(int argc, char **argv)
if (lm && (trie || load_without_trie)) { if (lm && (trie || load_without_trie)) {
int status = DS_EnableDecoderWithLM(ctx, int status = DS_EnableDecoderWithLM(ctx,
alphabet,
lm, lm,
trie, trie,
LM_ALPHA, LM_ALPHA,
@ -449,7 +446,7 @@ main(int argc, char **argv)
sox_quit(); sox_quit();
#endif // NO_SOX #endif // NO_SOX
DS_DestroyModel(ctx); DS_FreeModel(ctx);
return 0; return 0;
} }

View File

@ -257,8 +257,6 @@ StreamingState::processBatch(const vector<float>& buf, unsigned int n_steps)
int int
DS_CreateModel(const char* aModelPath, DS_CreateModel(const char* aModelPath,
unsigned int aNCep,
unsigned int aNContext,
const char* aAlphabetConfigPath, const char* aAlphabetConfigPath,
unsigned int aBeamWidth, unsigned int aBeamWidth,
ModelState** retval) ModelState** retval)
@ -285,7 +283,7 @@ DS_CreateModel(const char* aModelPath,
return DS_ERR_FAIL_CREATE_MODEL; return DS_ERR_FAIL_CREATE_MODEL;
} }
int err = model->init(aModelPath, aNCep, aNContext, aAlphabetConfigPath, aBeamWidth); int err = model->init(aModelPath, aAlphabetConfigPath, aBeamWidth);
if (err != DS_ERR_OK) { if (err != DS_ERR_OK) {
return err; return err;
} }
@ -295,14 +293,13 @@ DS_CreateModel(const char* aModelPath,
} }
void void
DS_DestroyModel(ModelState* ctx) DS_FreeModel(ModelState* ctx)
{ {
delete ctx; delete ctx;
} }
int int
DS_EnableDecoderWithLM(ModelState* aCtx, DS_EnableDecoderWithLM(ModelState* aCtx,
const char* aAlphabetConfigPath,
const char* aLMPath, const char* aLMPath,
const char* aTriePath, const char* aTriePath,
float aLMAlpha, float aLMAlpha,
@ -320,7 +317,7 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
} }
int int
DS_SetupStream(ModelState* aCtx, DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate, unsigned int aSampleRate,
StreamingState** retval) StreamingState** retval)
{ {
@ -371,7 +368,7 @@ char*
DS_FinishStream(StreamingState* aSctx) DS_FinishStream(StreamingState* aSctx)
{ {
char* str = aSctx->finishStream(); char* str = aSctx->finishStream();
DS_DiscardStream(aSctx); DS_FreeStream(aSctx);
return str; return str;
} }
@ -379,18 +376,18 @@ Metadata*
DS_FinishStreamWithMetadata(StreamingState* aSctx) DS_FinishStreamWithMetadata(StreamingState* aSctx)
{ {
Metadata* metadata = aSctx->finishStreamWithMetadata(); Metadata* metadata = aSctx->finishStreamWithMetadata();
DS_DiscardStream(aSctx); DS_FreeStream(aSctx);
return metadata; return metadata;
} }
StreamingState* StreamingState*
SetupStreamAndFeedAudioContent(ModelState* aCtx, CreateStreamAndFeedAudioContent(ModelState* aCtx,
const short* aBuffer, const short* aBuffer,
unsigned int aBufferSize, unsigned int aBufferSize,
unsigned int aSampleRate) unsigned int aSampleRate)
{ {
StreamingState* ctx; StreamingState* ctx;
int status = DS_SetupStream(aCtx, aSampleRate, &ctx); int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
if (status != DS_ERR_OK) { if (status != DS_ERR_OK) {
return nullptr; return nullptr;
} }
@ -404,7 +401,7 @@ DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize, unsigned int aBufferSize,
unsigned int aSampleRate) unsigned int aSampleRate)
{ {
StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate); StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
return DS_FinishStream(ctx); return DS_FinishStream(ctx);
} }
@ -414,12 +411,12 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
unsigned int aBufferSize, unsigned int aBufferSize,
unsigned int aSampleRate) unsigned int aSampleRate)
{ {
StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate); StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
return DS_FinishStreamWithMetadata(ctx); return DS_FinishStreamWithMetadata(ctx);
} }
void void
DS_DiscardStream(StreamingState* aSctx) DS_FreeStream(StreamingState* aSctx)
{ {
delete aSctx; delete aSctx;
} }

View File

@ -63,8 +63,6 @@ enum DeepSpeech_Error_Codes
* @brief An object providing an interface to a trained DeepSpeech model. * @brief An object providing an interface to a trained DeepSpeech model.
* *
* @param aModelPath The path to the frozen model graph. * @param aModelPath The path to the frozen model graph.
* @param aNCep The number of cepstrum the model was trained with.
* @param aNContext The context window the model was trained with.
* @param aAlphabetConfigPath The path to the configuration file specifying * @param aAlphabetConfigPath The path to the configuration file specifying
* the alphabet used by the network. See alphabet.h. * the alphabet used by the network. See alphabet.h.
* @param aBeamWidth The beam width used by the decoder. A larger beam * @param aBeamWidth The beam width used by the decoder. A larger beam
@ -76,8 +74,6 @@ enum DeepSpeech_Error_Codes
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT
int DS_CreateModel(const char* aModelPath, int DS_CreateModel(const char* aModelPath,
unsigned int aNCep,
unsigned int aNContext,
const char* aAlphabetConfigPath, const char* aAlphabetConfigPath,
unsigned int aBeamWidth, unsigned int aBeamWidth,
ModelState** retval); ModelState** retval);
@ -86,7 +82,7 @@ int DS_CreateModel(const char* aModelPath,
* @brief Frees associated resources and destroys model object. * @brief Frees associated resources and destroys model object.
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT
void DS_DestroyModel(ModelState* ctx); void DS_FreeModel(ModelState* ctx);
/** /**
* @brief Enable decoding using beam scoring with a KenLM language model. * @brief Enable decoding using beam scoring with a KenLM language model.
@ -106,7 +102,6 @@ void DS_DestroyModel(ModelState* ctx);
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT
int DS_EnableDecoderWithLM(ModelState* aCtx, int DS_EnableDecoderWithLM(ModelState* aCtx,
const char* aAlphabetConfigPath,
const char* aLMPath, const char* aLMPath,
const char* aTriePath, const char* aTriePath,
float aLMAlpha, float aLMAlpha,
@ -162,14 +157,14 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
* @return Zero for success, non-zero on failure. * @return Zero for success, non-zero on failure.
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT
int DS_SetupStream(ModelState* aCtx, int DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate, unsigned int aSampleRate,
StreamingState** retval); StreamingState** retval);
/** /**
* @brief Feed audio samples to an ongoing streaming inference. * @brief Feed audio samples to an ongoing streaming inference.
* *
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}. * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aBuffer An array of 16-bit, mono raw audio samples at the * @param aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate. * appropriate sample rate.
* @param aBufferSize The number of samples in @p aBuffer. * @param aBufferSize The number of samples in @p aBuffer.
@ -185,7 +180,7 @@ void DS_FeedAudioContent(StreamingState* aSctx,
* currently capable of streaming, so it always starts from the beginning * currently capable of streaming, so it always starts from the beginning
* of the audio. * of the audio.
* *
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}. * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* *
* @return The STT intermediate result. The user is responsible for freeing the * @return The STT intermediate result. The user is responsible for freeing the
* string using {@link DS_FreeString()}. * string using {@link DS_FreeString()}.
@ -197,7 +192,7 @@ char* DS_IntermediateDecode(StreamingState* aSctx);
* @brief Signal the end of an audio signal to an ongoing streaming * @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal. * inference, returns the STT result over the whole audio signal.
* *
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}. * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* *
* @return The STT result. The user is responsible for freeing the string using * @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. * {@link DS_FreeString()}.
@ -211,7 +206,7 @@ char* DS_FinishStream(StreamingState* aSctx);
* @brief Signal the end of an audio signal to an ongoing streaming * @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata. * inference, returns per-letter metadata.
* *
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}. * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* *
* @return Outputs a struct of individual letters along with their timing information. * @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error. * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@ -226,12 +221,12 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
* can be used if you no longer need the result of an ongoing streaming * can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation. * inference and don't want to perform a costly decode operation.
* *
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}. * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* *
* @note This method will free the state pointer (@p aSctx). * @note This method will free the state pointer (@p aSctx).
*/ */
DEEPSPEECH_EXPORT DEEPSPEECH_EXPORT
void DS_DiscardStream(StreamingState* aSctx); void DS_FreeStream(StreamingState* aSctx);
/** /**
* @brief Free memory allocated for metadata information. * @brief Free memory allocated for metadata information.

View File

@ -0,0 +1,101 @@
#ifndef DEEPSPEECH_COMPAT_H
#define DEEPSPEECH_COMPAT_H
#include "deepspeech.h"
#warning This header is a convenience wrapper for compatibility with \
the previous API, it has deprecated function names and arguments. \
If possible, update your code instead of using this header.
/**
 * @brief An object providing an interface to a trained DeepSpeech model.
 *
 * Deprecated compatibility overload: forwards to the new DS_CreateModel()
 * that no longer takes the cepstrum/context parameters.
 *
 * @param aModelPath The path to the frozen model graph.
 * @param aNCep UNUSED, DEPRECATED.
 * @param aNContext UNUSED, DEPRECATED.
 * @param aAlphabetConfigPath The path to the configuration file specifying
 *                            the alphabet used by the network. See alphabet.h.
 * @param aBeamWidth The beam width used by the decoder. A larger beam
 *                   width generates better results at the cost of decoding
 *                   time.
 * @param[out] retval a ModelState pointer
 *
 * @return Zero on success, non-zero on failure.
 */
inline int DS_CreateModel(const char* aModelPath,
                          unsigned int /*aNCep*/,
                          unsigned int /*aNContext*/,
                          const char* aAlphabetConfigPath,
                          unsigned int aBeamWidth,
                          ModelState** retval)
{
  // `inline` is required: this function is *defined* in a header, so without
  // it every translation unit including this header would emit its own
  // definition and the link would fail with multiple-definition errors.
  //
  // NOTE(review): forwarding to a same-named function relies on C++ overload
  // resolution (the new API takes fewer arguments). This cannot work if
  // deepspeech.h wraps its declarations in extern "C" — confirm it does not.
  return DS_CreateModel(aModelPath, aAlphabetConfigPath, aBeamWidth, retval);
}
/**
 * @brief Frees associated resources and destroys the model object.
 *
 * Deprecated compatibility alias for DS_FreeModel().
 *
 * @param ctx A ModelState pointer returned by DS_CreateModel().
 */
inline void DS_DestroyModel(ModelState* ctx)
{
  // `inline` avoids multiple-definition link errors for this header-defined
  // function; plain call instead of `return`-ing a void expression.
  DS_FreeModel(ctx);
}
/**
 * @brief Enable decoding using beam scoring with a KenLM language model.
 *
 * Deprecated compatibility overload: forwards to the new
 * DS_EnableDecoderWithLM() that no longer takes the alphabet path.
 *
 * @param aCtx The ModelState pointer for the model being changed.
 * @param aAlphabetConfigPath UNUSED, DEPRECATED.
 * @param aLMPath The path to the language model binary file.
 * @param aTriePath The path to the trie file built from the same vocabulary
 *                  as the language model binary.
 * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
 *                 weight.
 * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
 *                weight.
 *
 * @return Zero on success, non-zero on failure (invalid arguments).
 */
inline int DS_EnableDecoderWithLM(ModelState* aCtx,
                                  const char* /*aAlphabetConfigPath*/,
                                  const char* aLMPath,
                                  const char* aTriePath,
                                  float aLMAlpha,
                                  float aLMBeta)
{
  // `inline` is required for a function defined in a header (ODR).
  // The alphabet path is ignored: the new API reads the alphabet from the
  // model configuration instead.
  return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
}
/**
 * @brief Create a new streaming inference state. The streaming state returned
 *        by this function can then be passed to {@link DS_FeedAudioContent()}
 *        and {@link DS_FinishStream()}.
 *
 * Deprecated compatibility alias for DS_CreateStream().
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aSampleRate The sample-rate of the audio signal.
 * @param[out] retval an opaque pointer that represents the streaming state. Can
 *                    be NULL if an error occurs.
 *
 * @return Zero for success, non-zero on failure.
 */
inline int DS_SetupStream(ModelState* aCtx,
                          unsigned int aSampleRate,
                          StreamingState** retval)
{
  // `inline` avoids multiple-definition link errors for this header-defined
  // function.
  return DS_CreateStream(aCtx, aSampleRate, retval);
}
/**
 * @brief Destroy a streaming state without decoding the computed logits. This
 *        can be used if you no longer need the result of an ongoing streaming
 *        inference and don't want to perform a costly decode operation.
 *
 * Deprecated compatibility alias for DS_FreeStream().
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
inline void DS_DiscardStream(StreamingState* aSctx)
{
  // `inline` avoids multiple-definition link errors for this header-defined
  // function; plain call instead of `return`-ing a void expression.
  DS_FreeStream(aSctx);
}
#endif /* DEEPSPEECH_COMPAT_H */

View File

@ -32,13 +32,11 @@ namespace DeepSpeechClient
/// Create an object providing an interface to a trained DeepSpeech model. /// Create an object providing an interface to a trained DeepSpeech model.
/// </summary> /// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param> /// <param name="aModelPath">The path to the frozen model graph.</param>
/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
/// <param name="aNContext">The context window the model was trained with.</param>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param> /// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param> /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
public unsafe void CreateModel(string aModelPath, uint aNCep, public unsafe void CreateModel(string aModelPath,
uint aNContext, string aAlphabetConfigPath, uint aBeamWidth) string aAlphabetConfigPath, uint aBeamWidth)
{ {
string exceptionMessage = null; string exceptionMessage = null;
if (string.IsNullOrWhiteSpace(aModelPath)) if (string.IsNullOrWhiteSpace(aModelPath))
@ -63,8 +61,6 @@ namespace DeepSpeechClient
throw new FileNotFoundException(exceptionMessage); throw new FileNotFoundException(exceptionMessage);
} }
var resultCode = NativeImp.DS_CreateModel(aModelPath, var resultCode = NativeImp.DS_CreateModel(aModelPath,
aNCep,
aNContext,
aAlphabetConfigPath, aAlphabetConfigPath,
aBeamWidth, aBeamWidth,
ref _modelStatePP); ref _modelStatePP);
@ -116,20 +112,18 @@ namespace DeepSpeechClient
/// </summary> /// </summary>
public unsafe void Dispose() public unsafe void Dispose()
{ {
NativeImp.DS_DestroyModel(_modelStatePP); NativeImp.DS_FreeModel(_modelStatePP);
} }
/// <summary> /// <summary>
/// Enable decoding using beam scoring with a KenLM language model. /// Enable decoding using beam scoring with a KenLM language model.
/// </summary> /// </summary>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aLMPath">The path to the language model binary file.</param> /// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param> /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param> /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param> /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
public unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
string aLMPath, string aTriePath,
float aLMAlpha, float aLMBeta) float aLMAlpha, float aLMBeta)
{ {
string exceptionMessage = null; string exceptionMessage = null;
@ -148,7 +142,6 @@ namespace DeepSpeechClient
} }
var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP, var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
aAlphabetConfigPath,
aLMPath, aLMPath,
aTriePath, aTriePath,
aLMAlpha, aLMAlpha,
@ -206,9 +199,9 @@ namespace DeepSpeechClient
/// </summary> /// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param> /// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
public unsafe void SetupStream(uint aSampleRate) public unsafe void CreateStream(uint aSampleRate)
{ {
var resultCode = NativeImp.DS_SetupStream(_modelStatePP, aSampleRate, ref _streamingStatePP); var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
EvaluateResultCode(resultCode); EvaluateResultCode(resultCode);
} }
@ -217,9 +210,9 @@ namespace DeepSpeechClient
/// This can be used if you no longer need the result of an ongoing streaming /// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation. /// inference and don't want to perform a costly decode operation.
/// </summary> /// </summary>
public unsafe void DiscardStream() public unsafe void FreeStream()
{ {
NativeImp.DS_DiscardStream(ref _streamingStatePP); NativeImp.DS_FreeStream(ref _streamingStatePP);
} }
/// <summary> /// <summary>

View File

@ -17,27 +17,22 @@ namespace DeepSpeechClient.Interfaces
/// Create an object providing an interface to a trained DeepSpeech model. /// Create an object providing an interface to a trained DeepSpeech model.
/// </summary> /// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param> /// <param name="aModelPath">The path to the frozen model graph.</param>
/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
/// <param name="aNContext">The context window the model was trained with.</param>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param> /// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param> /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
unsafe void CreateModel(string aModelPath, uint aNCep, unsafe void CreateModel(string aModelPath,
uint aNContext,
string aAlphabetConfigPath, string aAlphabetConfigPath,
uint aBeamWidth); uint aBeamWidth);
/// <summary> /// <summary>
/// Enable decoding using beam scoring with a KenLM language model. /// Enable decoding using beam scoring with a KenLM language model.
/// </summary> /// </summary>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aLMPath">The path to the language model binary file.</param> /// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param> /// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param> /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param> /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, unsafe void EnableDecoderWithLM(string aLMPath,
string aLMPath,
string aTriePath, string aTriePath,
float aLMAlpha, float aLMAlpha,
float aLMBeta); float aLMBeta);
@ -69,7 +64,7 @@ namespace DeepSpeechClient.Interfaces
/// This can be used if you no longer need the result of an ongoing streaming /// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation. /// inference and don't want to perform a costly decode operation.
/// </summary> /// </summary>
unsafe void DiscardStream(); unsafe void FreeStream();
/// <summary> /// <summary>
/// Free a DeepSpeech allocated string /// Free a DeepSpeech allocated string
@ -86,7 +81,7 @@ namespace DeepSpeechClient.Interfaces
/// </summary> /// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param> /// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception> /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
unsafe void SetupStream(uint aSampleRate); unsafe void CreateStream(uint aSampleRate);
/// <summary> /// <summary>
/// Feeds audio samples to an ongoing streaming inference. /// Feeds audio samples to an ongoing streaming inference.

View File

@ -17,15 +17,12 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
uint aNCep,
uint aNContext,
string aAlphabetConfigPath, string aAlphabetConfigPath,
uint aBeamWidth, uint aBeamWidth,
ref ModelState** pint); ref ModelState** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx, internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx,
string aAlphabetConfigPath,
string aLMPath, string aLMPath,
string aTriePath, string aTriePath,
float aLMAlpha, float aLMAlpha,
@ -45,14 +42,14 @@ namespace DeepSpeechClient
uint aSampleRate); uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DestroyModel(ModelState** aCtx); internal static unsafe extern void DS_FreeModel(ModelState** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_SetupStream(ModelState** aCtx, internal static unsafe extern ErrorCodes DS_CreateStream(ModelState** aCtx,
uint aSampleRate, ref StreamingState** retval); uint aSampleRate, ref StreamingState** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx); internal static unsafe extern void DS_FreeStream(ref StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);

View File

@ -7,6 +7,8 @@ using GraphDef = System.IntPtr;
namespace DeepSpeechClient.Structs namespace DeepSpeechClient.Structs
{ {
//FIXME: ModelState is an opaque pointer to the API, why is this code reverse
// engineering its contents?
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)] [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
public unsafe struct ModelState public unsafe struct ModelState
{ {

View File

@ -50,8 +50,6 @@ namespace CSharpExamples
extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended")); extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
} }
const uint N_CEP = 26;
const uint N_CONTEXT = 9;
const uint BEAM_WIDTH = 500; const uint BEAM_WIDTH = 500;
const float LM_ALPHA = 0.75f; const float LM_ALPHA = 0.75f;
const float LM_BETA = 1.85f; const float LM_BETA = 1.85f;
@ -66,7 +64,6 @@ namespace CSharpExamples
stopwatch.Start(); stopwatch.Start();
sttClient.CreateModel( sttClient.CreateModel(
model ?? "output_graph.pbmm", model ?? "output_graph.pbmm",
N_CEP, N_CONTEXT,
alphabet ?? "alphabet.txt", alphabet ?? "alphabet.txt",
BEAM_WIDTH); BEAM_WIDTH);
stopwatch.Stop(); stopwatch.Stop();
@ -77,7 +74,6 @@ namespace CSharpExamples
{ {
Console.WriteLine("Loadin LM..."); Console.WriteLine("Loadin LM...");
sttClient.EnableDecoderWithLM( sttClient.EnableDecoderWithLM(
alphabet ?? "alphabet.txt",
lm ?? "lm.binary", lm ?? "lm.binary",
trie ?? "trie", trie ?? "trie",
LM_ALPHA, LM_BETA); LM_ALPHA, LM_BETA);

View File

@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
Button _startInference; Button _startInference;
final int N_CEP = 26;
final int N_CONTEXT = 9;
final int BEAM_WIDTH = 50; final int BEAM_WIDTH = 50;
final float LM_ALPHA = 0.75f; final float LM_ALPHA = 0.75f;
final float LM_BETA = 1.85f; final float LM_BETA = 1.85f;
@ -54,7 +52,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
private void newModel(String tfliteModel, String alphabet) { private void newModel(String tfliteModel, String alphabet) {
this._tfliteStatus.setText("Creating model"); this._tfliteStatus.setText("Creating model");
if (this._m == null) { if (this._m == null) {
this._m = new DeepSpeechModel(tfliteModel, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH); this._m = new DeepSpeechModel(tfliteModel, alphabet, BEAM_WIDTH);
} }
} }
@ -167,7 +165,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
super.onDestroy(); super.onDestroy();
if (this._m != null) { if (this._m != null) {
this._m.destroyModel(); this._m.freeModel();
} }
} }
} }

View File

@ -35,8 +35,6 @@ public class BasicTest {
public static final String trieFile = "/data/local/tmp/test/trie"; public static final String trieFile = "/data/local/tmp/test/trie";
public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav"; public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";
public static final int N_CEP = 26;
public static final int N_CONTEXT = 9;
public static final int BEAM_WIDTH = 50; public static final int BEAM_WIDTH = 50;
public static final float LM_ALPHA = 0.75f; public static final float LM_ALPHA = 0.75f;
@ -66,8 +64,8 @@ public class BasicTest {
@Test @Test
public void loadDeepSpeech_basic() { public void loadDeepSpeech_basic() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH); DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.destroyModel(); m.freeModel();
} }
private String metadataToString(Metadata m) { private String metadataToString(Metadata m) {
@ -123,39 +121,39 @@ public class BasicTest {
@Test @Test
public void loadDeepSpeech_stt_noLM() { public void loadDeepSpeech_stt_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH); DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, false); String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded); assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel(); m.freeModel();
} }
@Test @Test
public void loadDeepSpeech_stt_withLM() { public void loadDeepSpeech_stt_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH); DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA); m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, false); String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded); assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel(); m.freeModel();
} }
@Test @Test
public void loadDeepSpeech_sttWithMetadata_noLM() { public void loadDeepSpeech_sttWithMetadata_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH); DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, true); String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded); assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel(); m.freeModel();
} }
@Test @Test
public void loadDeepSpeech_sttWithMetadata_withLM() { public void loadDeepSpeech_sttWithMetadata_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH); DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA); m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, true); String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded); assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel(); m.freeModel();
} }
} }

View File

@ -11,18 +11,18 @@ public class DeepSpeechModel {
SWIGTYPE_p_p_ModelState _mspp; SWIGTYPE_p_p_ModelState _mspp;
SWIGTYPE_p_ModelState _msp; SWIGTYPE_p_ModelState _msp;
public DeepSpeechModel(String modelPath, int n_cep, int n_context, String alphabetPath, int beam_width) { public DeepSpeechModel(String modelPath, String alphabetPath, int beam_width) {
this._mspp = impl.new_modelstatep(); this._mspp = impl.new_modelstatep();
impl.CreateModel(modelPath, n_cep, n_context, alphabetPath, beam_width, this._mspp); impl.CreateModel(modelPath, alphabetPath, beam_width, this._mspp);
this._msp = impl.modelstatep_value(this._mspp); this._msp = impl.modelstatep_value(this._mspp);
} }
public void destroyModel() { public void freeModel() {
impl.DestroyModel(this._msp); impl.FreeModel(this._msp);
} }
public void enableDecoderWihLM(String alphabet, String lm, String trie, float lm_alpha, float lm_beta) { public void enableDecoderWihLM(String lm, String trie, float lm_alpha, float lm_beta) {
impl.EnableDecoderWithLM(this._msp, alphabet, lm, trie, lm_alpha, lm_beta); impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
} }
public String stt(short[] buffer, int buffer_size, int sample_rate) { public String stt(short[] buffer, int buffer_size, int sample_rate) {
@ -33,9 +33,9 @@ public class DeepSpeechModel {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate); return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
} }
public DeepSpeechStreamingState setupStream(int sample_rate) { public DeepSpeechStreamingState createStream(int sample_rate) {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep(); SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.SetupStream(this._msp, sample_rate, ssp); impl.CreateStream(this._msp, sample_rate, ssp);
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp)); return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
} }

View File

@ -22,16 +22,6 @@ const LM_ALPHA = 0.75;
const LM_BETA = 1.85; const LM_BETA = 1.85;
// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training
// Number of MFCC features to use
const N_FEATURES = 26;
// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9;
var VersionAction = function VersionAction(options) { var VersionAction = function VersionAction(options) {
options = options || {}; options = options || {};
options.nargs = 0; options.nargs = 0;
@ -109,15 +99,14 @@ audioStream.on('finish', () => {
console.error('Loading model from file %s', args['model']); console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime(); const model_load_start = process.hrtime();
var model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH); var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start); const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end)); console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['lm'] && args['trie']) { if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']); console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime(); const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'], model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
LM_ALPHA, LM_BETA);
const lm_load_end = process.hrtime(lm_load_start); const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end)); console.error('Loaded language model in %ds.', totalTime(lm_load_end));
} }
@ -135,6 +124,6 @@ audioStream.on('finish', () => {
} }
const inference_stop = process.hrtime(inference_start); const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4)); console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
Ds.DestroyModel(model); Ds.FreeModel(model);
process.exit(0); process.exit(0);
}); });

View File

@ -47,7 +47,7 @@ using namespace node;
} }
// convert double pointer retval in SetupStream to an output // convert double pointer retval in CreateStream to an output
%typemap(in, numinputs=0) StreamingState **retval (StreamingState *ret) { %typemap(in, numinputs=0) StreamingState **retval (StreamingState *ret) {
ret = NULL; ret = NULL;
$1 = &ret; $1 = &ret;

View File

@ -48,13 +48,13 @@ Model.prototype.sttWithMetadata = function() {
return binding.SpeechToTextWithMetadata.apply(null, args); return binding.SpeechToTextWithMetadata.apply(null, args);
} }
Model.prototype.setupStream = function() { Model.prototype.createStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments)); const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.SetupStream.apply(null, args); const rets = binding.CreateStream.apply(null, args);
const status = rets[0]; const status = rets[0];
const ctx = rets[1]; const ctx = rets[1];
if (status !== 0) { if (status !== 0) {
throw "SetupStream failed with error code " + status; throw "CreateStream failed with error code " + status;
} }
return ctx; return ctx;
} }
@ -75,13 +75,14 @@ Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments); return binding.FinishStreamWithMetadata.apply(null, arguments);
} }
function DestroyModel(model) { function FreeModel(model) {
return binding.DestroyModel(model._impl); return binding.FreeModel(model._impl);
} }
module.exports = { module.exports = {
Model: Model, Model: Model,
printVersions: binding.PrintVersions, printVersions: binding.PrintVersions,
DestroyModel: DestroyModel, FreeModel: FreeModel,
FreeStream: binding.FreeStream,
FreeMetadata: binding.FreeMetadata FreeMetadata: binding.FreeMetadata
}; };

View File

@ -25,13 +25,9 @@ ModelState::~ModelState()
int int
ModelState::init(const char* model_path, ModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width) unsigned int beam_width)
{ {
n_features_ = n_features;
n_context_ = n_context;
if (alphabet_.init(alphabet_path)) { if (alphabet_.init(alphabet_path)) {
return DS_ERR_INVALID_ALPHABET; return DS_ERR_INVALID_ALPHABET;
} }

View File

@ -35,8 +35,6 @@ struct ModelState {
virtual ~ModelState(); virtual ~ModelState();
virtual int init(const char* model_path, virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width); unsigned int beam_width);

View File

@ -1,6 +1,9 @@
import os import os
import platform import platform
#The API is not snake case which triggers linter errors
#pylint: disable=invalid-name
# On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on # On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on
# @loader_path/lib but we can change the PATH to include the proper directory # @loader_path/lib but we can change the PATH to include the proper directory
# for the dynamic linker # for the dynamic linker
@ -12,6 +15,7 @@ import deepspeech
# rename for backwards compatibility # rename for backwards compatibility
from deepspeech.impl import PrintVersions as printVersions from deepspeech.impl import PrintVersions as printVersions
from deepspeech.impl import FreeStream as freeStream
class Model(object): class Model(object):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -25,7 +29,7 @@ class Model(object):
def __del__(self): def __del__(self):
if self._impl: if self._impl:
deepspeech.impl.DestroyModel(self._impl) deepspeech.impl.FreeModel(self._impl)
self._impl = None self._impl = None
def enableDecoderWithLM(self, *args, **kwargs): def enableDecoderWithLM(self, *args, **kwargs):
@ -37,11 +41,11 @@ class Model(object):
def sttWithMetadata(self, *args, **kwargs): def sttWithMetadata(self, *args, **kwargs):
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs) return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
def setupStream(self, sample_rate=16000): def createStream(self, sample_rate=16000):
status, ctx = deepspeech.impl.SetupStream(self._impl, status, ctx = deepspeech.impl.CreateStream(self._impl,
aSampleRate=sample_rate) aSampleRate=sample_rate)
if status != 0: if status != 0:
raise RuntimeError("SetupStream failed with error code {}".format(status)) raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx return ctx
def feedAudioContent(self, *args, **kwargs): def feedAudioContent(self, *args, **kwargs):

View File

@ -32,17 +32,6 @@ LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9
def convert_samplerate(audio_path): def convert_samplerate(audio_path):
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE) sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE)
try: try:
@ -88,14 +77,14 @@ def main():
print('Loading model from file {}'.format(args.model), file=sys.stderr) print('Loading model from file {}'.format(args.model), file=sys.stderr)
model_load_start = timer() model_load_start = timer()
ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH) ds = Model(args.model, args.alphabet, BEAM_WIDTH)
model_load_end = timer() - model_load_start model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr) print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
if args.lm and args.trie: if args.lm and args.trie:
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr) print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
lm_load_start = timer() lm_load_start = timer()
ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA) ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
lm_load_end = timer() - lm_load_start lm_load_end = timer() - lm_load_start
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr) print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

View File

@ -19,7 +19,7 @@ import_array();
} }
%typemap(argout) ModelState **retval { %typemap(argout) ModelState **retval {
// not owned, Python wrapper in __init__.py calls DS_DestroyModel // not owned, Python wrapper in __init__.py calls DS_FreeModel
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0)); %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
} }

View File

@ -21,17 +21,6 @@ LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9
def main(): def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.') parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True, parser.add_argument('--model', required=True,
@ -48,10 +37,10 @@ def main():
help='Second audio file to use in interleaved streams') help='Second audio file to use in interleaved streams')
args = parser.parse_args() args = parser.parse_args()
ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH) ds = Model(args.model, args.alphabet, BEAM_WIDTH)
if args.lm and args.trie: if args.lm and args.trie:
ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA) ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
fin = wave.open(args.audio1, 'rb') fin = wave.open(args.audio1, 'rb')
fs1 = fin.getframerate() fs1 = fin.getframerate()
@ -63,8 +52,8 @@ def main():
audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close() fin.close()
stream1 = ds.setupStream(sample_rate=fs1) stream1 = ds.createStream(sample_rate=fs1)
stream2 = ds.setupStream(sample_rate=fs2) stream2 = ds.createStream(sample_rate=fs2)
splits1 = np.array_split(audio1, 10) splits1 = np.array_split(audio1, 10)
splits2 = np.array_split(audio2, 10) splits2 = np.array_split(audio2, 10)

View File

@ -89,12 +89,10 @@ TFLiteModelState::~TFLiteModelState()
int int
TFLiteModelState::init(const char* model_path, TFLiteModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width) unsigned int beam_width)
{ {
int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width); int err = ModelState::init(model_path, alphabet_path, beam_width);
if (err != DS_ERR_OK) { if (err != DS_ERR_OK) {
return err; return err;
} }

View File

@ -31,8 +31,6 @@ struct TFLiteModelState : public ModelState
virtual ~TFLiteModelState(); virtual ~TFLiteModelState();
virtual int init(const char* model_path, virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width) override; unsigned int beam_width) override;

View File

@ -25,12 +25,10 @@ TFModelState::~TFModelState()
int int
TFModelState::init(const char* model_path, TFModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width) unsigned int beam_width)
{ {
int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width); int err = ModelState::init(model_path, alphabet_path, beam_width);
if (err != DS_ERR_OK) { if (err != DS_ERR_OK) {
return err; return err;
} }

View File

@ -19,8 +19,6 @@ struct TFModelState : public ModelState
virtual ~TFModelState(); virtual ~TFModelState();
virtual int init(const char* model_path, virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path, const char* alphabet_path,
unsigned int beam_width) override; unsigned int beam_width) override;