Merge pull request #2350 from mozilla/breaking-api-cleanup

[BREAKING] API cleanup
Reuben Morais 2019-09-09 21:28:43 +02:00 committed by GitHub
commit 6b7ebf47f2
27 changed files with 206 additions and 183 deletions
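The cleanup is mechanical and lands the same way in every binding: DS_SetupStream becomes DS_CreateStream, DS_DiscardStream becomes DS_FreeStream, DS_DestroyModel becomes DS_FreeModel, DS_CreateModel drops the unused aNCep/aNContext parameters, and DS_EnableDecoderWithLM drops its aAlphabetConfigPath parameter. A new deepspeech_compat.h header keeps old call sites building; short sketches of the new C API follow the native-client, deepspeech.h, and compat-header diffs below.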

View File

@ -29,13 +29,11 @@ Then run with a TF Lite model, alphabet, LM/trie and a CSV test file
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
N_FEATURES = 26
N_CONTEXT = 9
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
ds = Model(model, alphabet, BEAM_WIDTH)
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
while True:
msg = queue_in.get()

View File

@ -22,8 +22,6 @@ namespace DeepSpeechWPF
{
private readonly IDeepSpeech _sttClient;
private const uint N_CEP = 26;
private const uint N_CONTEXT = 9;
private const uint BEAM_WIDTH = 500;
private const float LM_ALPHA = 0.75f;
private const float LM_BETA = 1.85f;
@ -79,7 +77,7 @@ namespace DeepSpeechWPF
{
try
{
_sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH);
_sttClient.CreateModel("output_graph.pbmm", "alphabet.txt", BEAM_WIDTH);
Dispatcher.Invoke(() => { EnableControls(); });
}
catch (Exception ex)
@ -155,7 +153,7 @@ namespace DeepSpeechWPF
{
try
{
_sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA);
_sttClient.EnableDecoderWithLM("lm.binary", "trie", LM_ALPHA, LM_BETA);
Dispatcher.Invoke(() => lblStatus.Content = "LM loaded.");
}
catch (Exception ex)
@ -198,7 +196,7 @@ namespace DeepSpeechWPF
_soundInSource.Dispose();
_convertedSource.Dispose();
_audioCapture.DataAvailable -= _capture_DataAvailable;
_sttClient.DiscardStream(); //this is a good example of DiscardStream: the user changed the audio input, so we no longer need the current stream
_sttClient.FreeStream(); //this is a good example of FreeStream: the user changed the audio input, so we no longer need the current stream
}
if (_audioCaptureDevices!=null)
{
@ -252,7 +250,7 @@ namespace DeepSpeechWPF
private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
{
_sttClient.SetupStream(16000);
_sttClient.CreateStream(16000);
_audioCapture.Start();
btnStartRecording.IsEnabled = false;
btnStopRecording.IsEnabled = true;

View File

@ -33,8 +33,6 @@
#include "deepspeech.h"
#include "args.h"
#define N_CEP 26
#define N_CONTEXT 9
#define BEAM_WIDTH 500
#define LM_ALPHA 0.75f
#define LM_BETA 1.85f
@ -72,7 +70,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
DS_FreeMetadata(metadata);
} else if (stream_size > 0) {
StreamingState* ctx;
int status = DS_SetupStream(aCtx, aSampleRate, &ctx);
int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
if (status != DS_ERR_OK) {
res.string = strdup("");
return res;
@ -377,7 +375,7 @@ main(int argc, char **argv)
// Initialise DeepSpeech
ModelState* ctx;
int status = DS_CreateModel(model, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH, &ctx);
int status = DS_CreateModel(model, alphabet, BEAM_WIDTH, &ctx);
if (status != 0) {
fprintf(stderr, "Could not create model.\n");
return 1;
@ -385,7 +383,6 @@ main(int argc, char **argv)
if (lm && (trie || load_without_trie)) {
int status = DS_EnableDecoderWithLM(ctx,
alphabet,
lm,
trie,
LM_ALPHA,
@ -449,7 +446,7 @@ main(int argc, char **argv)
sox_quit();
#endif // NO_SOX
DS_DestroyModel(ctx);
DS_FreeModel(ctx);
return 0;
}
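
The migrated native client above reduces to a short lifecycle. A minimal sketch under the new signatures (the file paths and the silent test buffer are placeholders, not part of the PR; the hyperparameter values are the constants from the client diff):

#include <cstdio>
#include "deepspeech.h"

int main() {
  ModelState* ctx;
  // New four-argument form: no aNCep/aNContext.
  int status = DS_CreateModel("output_graph.pbmm", "alphabet.txt", 500, &ctx);
  if (status != 0) {
    fprintf(stderr, "Could not create model.\n");
    return 1;
  }
  // The decoder no longer takes the alphabet path; the model already has it.
  DS_EnableDecoderWithLM(ctx, "lm.binary", "trie", 0.75f, 1.85f);

  short buffer[16000] = {};  // one second of silence at 16 kHz
  char* text = DS_SpeechToText(ctx, buffer, 16000, 16000);
  printf("%s\n", text ? text : "");
  DS_FreeString(text);
  DS_FreeModel(ctx);  // was DS_DestroyModel
  return 0;
}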

View File

@ -257,8 +257,6 @@ StreamingState::processBatch(const vector<float>& buf, unsigned int n_steps)
int
DS_CreateModel(const char* aModelPath,
unsigned int aNCep,
unsigned int aNContext,
const char* aAlphabetConfigPath,
unsigned int aBeamWidth,
ModelState** retval)
@ -285,7 +283,7 @@ DS_CreateModel(const char* aModelPath,
return DS_ERR_FAIL_CREATE_MODEL;
}
int err = model->init(aModelPath, aNCep, aNContext, aAlphabetConfigPath, aBeamWidth);
int err = model->init(aModelPath, aAlphabetConfigPath, aBeamWidth);
if (err != DS_ERR_OK) {
return err;
}
@ -295,14 +293,13 @@ DS_CreateModel(const char* aModelPath,
}
void
DS_DestroyModel(ModelState* ctx)
DS_FreeModel(ModelState* ctx)
{
delete ctx;
}
int
DS_EnableDecoderWithLM(ModelState* aCtx,
const char* aAlphabetConfigPath,
const char* aLMPath,
const char* aTriePath,
float aLMAlpha,
@ -320,9 +317,9 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
}
int
DS_SetupStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval)
DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval)
{
*retval = nullptr;
@ -371,7 +368,7 @@ char*
DS_FinishStream(StreamingState* aSctx)
{
char* str = aSctx->finishStream();
DS_DiscardStream(aSctx);
DS_FreeStream(aSctx);
return str;
}
@ -379,18 +376,18 @@ Metadata*
DS_FinishStreamWithMetadata(StreamingState* aSctx)
{
Metadata* metadata = aSctx->finishStreamWithMetadata();
DS_DiscardStream(aSctx);
DS_FreeStream(aSctx);
return metadata;
}
StreamingState*
SetupStreamAndFeedAudioContent(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate)
CreateStreamAndFeedAudioContent(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate)
{
StreamingState* ctx;
int status = DS_SetupStream(aCtx, aSampleRate, &ctx);
int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
if (status != DS_ERR_OK) {
return nullptr;
}
@ -404,7 +401,7 @@ DS_SpeechToText(ModelState* aCtx,
unsigned int aBufferSize,
unsigned int aSampleRate)
{
StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
return DS_FinishStream(ctx);
}
@ -414,12 +411,12 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
unsigned int aBufferSize,
unsigned int aSampleRate)
{
StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
return DS_FinishStreamWithMetadata(ctx);
}
void
DS_DiscardStream(StreamingState* aSctx)
DS_FreeStream(StreamingState* aSctx)
{
delete aSctx;
}

View File

@ -63,8 +63,6 @@ enum DeepSpeech_Error_Codes
* @brief An object providing an interface to a trained DeepSpeech model.
*
* @param aModelPath The path to the frozen model graph.
* @param aNCep The number of cepstrum the model was trained with.
* @param aNContext The context window the model was trained with.
* @param aAlphabetConfigPath The path to the configuration file specifying
* the alphabet used by the network. See alphabet.h.
* @param aBeamWidth The beam width used by the decoder. A larger beam
@ -76,8 +74,6 @@ enum DeepSpeech_Error_Codes
*/
DEEPSPEECH_EXPORT
int DS_CreateModel(const char* aModelPath,
unsigned int aNCep,
unsigned int aNContext,
const char* aAlphabetConfigPath,
unsigned int aBeamWidth,
ModelState** retval);
@ -86,7 +82,7 @@ int DS_CreateModel(const char* aModelPath,
* @brief Frees associated resources and destroys model object.
*/
DEEPSPEECH_EXPORT
void DS_DestroyModel(ModelState* ctx);
void DS_FreeModel(ModelState* ctx);
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
@ -106,7 +102,6 @@ void DS_DestroyModel(ModelState* ctx);
*/
DEEPSPEECH_EXPORT
int DS_EnableDecoderWithLM(ModelState* aCtx,
const char* aAlphabetConfigPath,
const char* aLMPath,
const char* aTriePath,
float aLMAlpha,
@ -145,9 +140,9 @@ char* DS_SpeechToText(ModelState* aCtx,
*/
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate);
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate);
/**
* @brief Create a new streaming inference state. The streaming state returned
@ -162,14 +157,14 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
* @return Zero for success, non-zero on failure.
*/
DEEPSPEECH_EXPORT
int DS_SetupStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval);
int DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval);
/**
* @brief Feed audio samples to an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* @param aBufferSize The number of samples in @p aBuffer.
@ -185,7 +180,7 @@ void DS_FeedAudioContent(StreamingState* aSctx,
* currently capable of streaming, so it always starts from the beginning
* of the audio.
*
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @return The STT intermediate result. The user is responsible for freeing the
* string using {@link DS_FreeString()}.
@ -197,7 +192,7 @@ char* DS_IntermediateDecode(StreamingState* aSctx);
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
*
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}.
@ -211,7 +206,7 @@ char* DS_FinishStream(StreamingState* aSctx);
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@ -226,12 +221,12 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param aSctx A streaming state pointer returned by {@link DS_SetupStream()}.
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @note This method will free the state pointer (@p aSctx).
*/
DEEPSPEECH_EXPORT
void DS_DiscardStream(StreamingState* aSctx);
void DS_FreeStream(StreamingState* aSctx);
/**
* @brief Free memory allocated for metadata information.
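
A minimal streaming sketch against the renamed API (chunk size and the silent audio are placeholders). As the implementation diff above shows, DS_FinishStream and DS_FinishStreamWithMetadata free the stream state themselves, so DS_FreeStream is only needed when abandoning a stream without decoding:

#include "deepspeech.h"

char* StreamSilence(ModelState* ctx) {
  StreamingState* sctx;
  if (DS_CreateStream(ctx, 16000, &sctx) != DS_ERR_OK) {  // was DS_SetupStream
    return nullptr;
  }
  short chunk[320] = {};  // 20 ms of silence at 16 kHz
  for (int i = 0; i < 50; ++i) {
    DS_FeedAudioContent(sctx, chunk, 320);
  }
  char* partial = DS_IntermediateDecode(sctx);  // safe mid-stream
  DS_FreeString(partial);
  // Caller frees the returned string with DS_FreeString(); sctx is freed
  // internally via DS_FreeStream.
  return DS_FinishStream(sctx);
}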

View File

@ -0,0 +1,101 @@
#ifndef DEEPSPEECH_COMPAT_H
#define DEEPSPEECH_COMPAT_H
#include "deepspeech.h"
#warning This header is a convenience wrapper for compatibility with \
the previous API, it has deprecated function names and arguments. \
If possible, update your code instead of using this header.
/**
* @brief An object providing an interface to a trained DeepSpeech model.
*
* @param aModelPath The path to the frozen model graph.
* @param aNCep UNUSED, DEPRECATED.
* @param aNContext UNUSED, DEPRECATED.
* @param aAlphabetConfigPath The path to the configuration file specifying
* the alphabet used by the network. See alphabet.h.
* @param aBeamWidth The beam width used by the decoder. A larger beam
* width generates better results at the cost of decoding
* time.
* @param[out] retval a ModelState pointer
*
* @return Zero on success, non-zero on failure.
*/
int DS_CreateModel(const char* aModelPath,
unsigned int /*aNCep*/,
unsigned int /*aNContext*/,
const char* aAlphabetConfigPath,
unsigned int aBeamWidth,
ModelState** retval)
{
return DS_CreateModel(aModelPath, aAlphabetConfigPath, aBeamWidth, retval);
}
/**
* @brief Frees associated resources and destroys model object.
*/
void DS_DestroyModel(ModelState* ctx)
{
return DS_FreeModel(ctx);
}
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlphabetConfigPath UNUSED, DEPRECATED.
* @param aLMPath The path to the language model binary file.
 * @param aTriePath The path to the trie file built from the same vocabulary
 *                  as the language model binary.
 * @param aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model
 *                 weight.
 * @param aLMBeta The beta hyperparameter of the CTC decoder. Word insertion
 *                weight.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
int DS_EnableDecoderWithLM(ModelState* aCtx,
const char* /*aAlphabetConfigPath*/,
const char* aLMPath,
const char* aTriePath,
float aLMAlpha,
float aLMBeta)
{
return DS_EnableDecoderWithLM(aCtx, aLMPath, aTriePath, aLMAlpha, aLMBeta);
}
/**
* @brief Create a new streaming inference state. The streaming state returned
* by this function can then be passed to {@link DS_FeedAudioContent()}
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aSampleRate The sample-rate of the audio signal.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*/
int DS_SetupStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval)
{
return DS_CreateStream(aCtx, aSampleRate, retval);
}
/**
* @brief Destroy a streaming state without decoding the computed logits. This
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @note This method will free the state pointer (@p aSctx).
*/
void DS_DiscardStream(StreamingState* aSctx)
{
return DS_FreeStream(aSctx);
}
#endif /* DEEPSPEECH_COMPAT_H */
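
Because the shim defines the old arities alongside the new names from deepspeech.h, unmigrated C++ call sites keep compiling (with the #warning) through overload resolution. A hypothetical legacy call site, with placeholder paths and constants:

#include "deepspeech_compat.h"

int LegacyInit(ModelState** ctx) {
  // Old six-parameter form still compiles; aNCep (26) and aNContext (9) are
  // accepted and discarded, and the wrapper forwards to the new
  // four-argument DS_CreateModel.
  return DS_CreateModel("output_graph.pbmm", 26, 9, "alphabet.txt", 500, ctx);
}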

View File

@ -32,13 +32,11 @@ namespace DeepSpeechClient
/// Create an object providing an interface to a trained DeepSpeech model.
/// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param>
/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
/// <param name="aNContext">The context window the model was trained with.</param>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
public unsafe void CreateModel(string aModelPath, uint aNCep,
uint aNContext, string aAlphabetConfigPath, uint aBeamWidth)
public unsafe void CreateModel(string aModelPath,
string aAlphabetConfigPath, uint aBeamWidth)
{
string exceptionMessage = null;
if (string.IsNullOrWhiteSpace(aModelPath))
@ -63,8 +61,6 @@ namespace DeepSpeechClient
throw new FileNotFoundException(exceptionMessage);
}
var resultCode = NativeImp.DS_CreateModel(aModelPath,
aNCep,
aNContext,
aAlphabetConfigPath,
aBeamWidth,
ref _modelStatePP);
@ -116,20 +112,18 @@ namespace DeepSpeechClient
/// </summary>
public unsafe void Dispose()
{
NativeImp.DS_DestroyModel(_modelStatePP);
NativeImp.DS_FreeModel(_modelStatePP);
}
/// <summary>
/// Enable decoding using beam scoring with a KenLM language model.
/// </summary>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
public unsafe void EnableDecoderWithLM(string aAlphabetConfigPath,
string aLMPath, string aTriePath,
public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
float aLMAlpha, float aLMBeta)
{
string exceptionMessage = null;
@ -148,7 +142,6 @@ namespace DeepSpeechClient
}
var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP,
aAlphabetConfigPath,
aLMPath,
aTriePath,
aLMAlpha,
@ -206,9 +199,9 @@ namespace DeepSpeechClient
/// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
public unsafe void SetupStream(uint aSampleRate)
public unsafe void CreateStream(uint aSampleRate)
{
var resultCode = NativeImp.DS_SetupStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
EvaluateResultCode(resultCode);
}
@ -217,9 +210,9 @@ namespace DeepSpeechClient
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
public unsafe void DiscardStream()
public unsafe void FreeStream()
{
NativeImp.DS_DiscardStream(ref _streamingStatePP);
NativeImp.DS_FreeStream(ref _streamingStatePP);
}
/// <summary>

View File

@ -17,27 +17,22 @@ namespace DeepSpeechClient.Interfaces
/// Create an object providing an interface to a trained DeepSpeech model.
/// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param>
/// <param name="aNCep">The number of cepstrum the model was trained with.</param>
/// <param name="aNContext">The context window the model was trained with.</param>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
unsafe void CreateModel(string aModelPath, uint aNCep,
uint aNContext,
unsafe void CreateModel(string aModelPath,
string aAlphabetConfigPath,
uint aBeamWidth);
/// <summary>
/// Enable decoding using beam scoring with a KenLM language model.
/// </summary>
/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aLMPath">The path to the language model binary file.</param>
/// <param name="aTriePath">The path to the trie file build from the same vocabulary as the language model binary.</param>
/// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
/// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
unsafe void EnableDecoderWithLM(string aAlphabetConfigPath,
string aLMPath,
unsafe void EnableDecoderWithLM(string aLMPath,
string aTriePath,
float aLMAlpha,
float aLMBeta);
@ -69,7 +64,7 @@ namespace DeepSpeechClient.Interfaces
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
unsafe void DiscardStream();
unsafe void FreeStream();
/// <summary>
/// Free a DeepSpeech allocated string
@ -86,7 +81,7 @@ namespace DeepSpeechClient.Interfaces
/// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
unsafe void SetupStream(uint aSampleRate);
unsafe void CreateStream(uint aSampleRate);
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.

View File

@ -17,15 +17,12 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
uint aNCep,
uint aNContext,
string aAlphabetConfigPath,
uint aBeamWidth,
ref ModelState** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx,
string aAlphabetConfigPath,
string aLMPath,
string aTriePath,
float aLMAlpha,
@ -45,14 +42,14 @@ namespace DeepSpeechClient
uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DestroyModel(ModelState** aCtx);
internal static unsafe extern void DS_FreeModel(ModelState** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_SetupStream(ModelState** aCtx,
internal static unsafe extern ErrorCodes DS_CreateStream(ModelState** aCtx,
uint aSampleRate, ref StreamingState** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx);
internal static unsafe extern void DS_FreeStream(ref StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);

View File

@ -7,6 +7,8 @@ using GraphDef = System.IntPtr;
namespace DeepSpeechClient.Structs
{
//FIXME: ModelState is an opaque pointer to the API, why is this code reverse
// engineering its contents?
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
public unsafe struct ModelState
{

View File

@ -50,8 +50,6 @@ namespace CSharpExamples
extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
}
const uint N_CEP = 26;
const uint N_CONTEXT = 9;
const uint BEAM_WIDTH = 500;
const float LM_ALPHA = 0.75f;
const float LM_BETA = 1.85f;
@ -66,7 +64,6 @@ namespace CSharpExamples
stopwatch.Start();
sttClient.CreateModel(
model ?? "output_graph.pbmm",
N_CEP, N_CONTEXT,
alphabet ?? "alphabet.txt",
BEAM_WIDTH);
stopwatch.Stop();
@ -77,7 +74,6 @@ namespace CSharpExamples
{
Console.WriteLine("Loadin LM...");
sttClient.EnableDecoderWithLM(
alphabet ?? "alphabet.txt",
lm ?? "lm.binary",
trie ?? "trie",
LM_ALPHA, LM_BETA);

View File

@ -31,8 +31,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
Button _startInference;
final int N_CEP = 26;
final int N_CONTEXT = 9;
final int BEAM_WIDTH = 50;
final float LM_ALPHA = 0.75f;
final float LM_BETA = 1.85f;
@ -54,7 +52,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
private void newModel(String tfliteModel, String alphabet) {
this._tfliteStatus.setText("Creating model");
if (this._m == null) {
this._m = new DeepSpeechModel(tfliteModel, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH);
this._m = new DeepSpeechModel(tfliteModel, alphabet, BEAM_WIDTH);
}
}
@ -167,7 +165,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
super.onDestroy();
if (this._m != null) {
this._m.destroyModel();
this._m.freeModel();
}
}
}

View File

@ -35,8 +35,6 @@ public class BasicTest {
public static final String trieFile = "/data/local/tmp/test/trie";
public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";
public static final int N_CEP = 26;
public static final int N_CONTEXT = 9;
public static final int BEAM_WIDTH = 50;
public static final float LM_ALPHA = 0.75f;
@ -66,8 +64,8 @@ public class BasicTest {
@Test
public void loadDeepSpeech_basic() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.destroyModel();
DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.freeModel();
}
private String metadataToString(Metadata m) {
@ -123,39 +121,39 @@ public class BasicTest {
@Test
public void loadDeepSpeech_stt_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
m.freeModel();
}
@Test
public void loadDeepSpeech_stt_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
m.freeModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
m.freeModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
m.freeModel();
}
}

View File

@ -11,18 +11,18 @@ public class DeepSpeechModel {
SWIGTYPE_p_p_ModelState _mspp;
SWIGTYPE_p_ModelState _msp;
public DeepSpeechModel(String modelPath, int n_cep, int n_context, String alphabetPath, int beam_width) {
public DeepSpeechModel(String modelPath, String alphabetPath, int beam_width) {
this._mspp = impl.new_modelstatep();
impl.CreateModel(modelPath, n_cep, n_context, alphabetPath, beam_width, this._mspp);
impl.CreateModel(modelPath, alphabetPath, beam_width, this._mspp);
this._msp = impl.modelstatep_value(this._mspp);
}
public void destroyModel() {
impl.DestroyModel(this._msp);
public void freeModel() {
impl.FreeModel(this._msp);
}
public void enableDecoderWihLM(String alphabet, String lm, String trie, float lm_alpha, float lm_beta) {
impl.EnableDecoderWithLM(this._msp, alphabet, lm, trie, lm_alpha, lm_beta);
public void enableDecoderWihLM(String lm, String trie, float lm_alpha, float lm_beta) {
impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
}
public String stt(short[] buffer, int buffer_size, int sample_rate) {
@ -33,9 +33,9 @@ public class DeepSpeechModel {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
}
public DeepSpeechStreamingState setupStream(int sample_rate) {
public DeepSpeechStreamingState createStream(int sample_rate) {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.SetupStream(this._msp, sample_rate, ssp);
impl.CreateStream(this._msp, sample_rate, ssp);
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
}

View File

@ -22,16 +22,6 @@ const LM_ALPHA = 0.75;
const LM_BETA = 1.85;
// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training
// Number of MFCC features to use
const N_FEATURES = 26;
// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9;
var VersionAction = function VersionAction(options) {
options = options || {};
options.nargs = 0;
@ -109,15 +99,14 @@ audioStream.on('finish', () => {
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
var model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH);
var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));
if (args['lm'] && args['trie']) {
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
const lm_load_start = process.hrtime();
model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'],
LM_ALPHA, LM_BETA);
model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
const lm_load_end = process.hrtime(lm_load_start);
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
}
@ -135,6 +124,6 @@ audioStream.on('finish', () => {
}
const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
Ds.DestroyModel(model);
Ds.FreeModel(model);
process.exit(0);
});

View File

@ -47,7 +47,7 @@ using namespace node;
}
// convert double pointer retval in SetupStream to an output
// convert double pointer retval in CreateStream to an output
%typemap(in, numinputs=0) StreamingState **retval (StreamingState *ret) {
ret = NULL;
$1 = &ret;

View File

@ -48,13 +48,13 @@ Model.prototype.sttWithMetadata = function() {
return binding.SpeechToTextWithMetadata.apply(null, args);
}
Model.prototype.setupStream = function() {
Model.prototype.createStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.SetupStream.apply(null, args);
const rets = binding.CreateStream.apply(null, args);
const status = rets[0];
const ctx = rets[1];
if (status !== 0) {
throw "SetupStream failed with error code " + status;
throw "CreateStream failed with error code " + status;
}
return ctx;
}
@ -75,13 +75,14 @@ Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments);
}
function DestroyModel(model) {
return binding.DestroyModel(model._impl);
function FreeModel(model) {
return binding.FreeModel(model._impl);
}
module.exports = {
Model: Model,
printVersions: binding.PrintVersions,
DestroyModel: DestroyModel,
FreeModel: FreeModel,
FreeStream: binding.FreeStream,
FreeMetadata: binding.FreeMetadata
};

View File

@ -25,13 +25,9 @@ ModelState::~ModelState()
int
ModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width)
{
n_features_ = n_features;
n_context_ = n_context;
if (alphabet_.init(alphabet_path)) {
return DS_ERR_INVALID_ALPHABET;
}

View File

@ -35,8 +35,6 @@ struct ModelState {
virtual ~ModelState();
virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width);

View File

@ -1,6 +1,9 @@
import os
import platform
#The API is not snake case which triggers linter errors
#pylint: disable=invalid-name
# On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on
# @loader_path/lib but we can change the PATH to include the proper directory
# for the dynamic linker
@ -12,6 +15,7 @@ import deepspeech
# rename for backwards compatibility
from deepspeech.impl import PrintVersions as printVersions
from deepspeech.impl import FreeStream as freeStream
class Model(object):
def __init__(self, *args, **kwargs):
@ -25,7 +29,7 @@ class Model(object):
def __del__(self):
if self._impl:
deepspeech.impl.DestroyModel(self._impl)
deepspeech.impl.FreeModel(self._impl)
self._impl = None
def enableDecoderWithLM(self, *args, **kwargs):
@ -37,11 +41,11 @@ class Model(object):
def sttWithMetadata(self, *args, **kwargs):
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
def setupStream(self, sample_rate=16000):
status, ctx = deepspeech.impl.SetupStream(self._impl,
aSampleRate=sample_rate)
def createStream(self, sample_rate=16000):
status, ctx = deepspeech.impl.CreateStream(self._impl,
aSampleRate=sample_rate)
if status != 0:
raise RuntimeError("SetupStream failed with error code {}".format(status))
raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx
def feedAudioContent(self, *args, **kwargs):

View File

@ -32,17 +32,6 @@ LM_ALPHA = 0.75
LM_BETA = 1.85
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9
def convert_samplerate(audio_path):
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE)
try:
@ -88,14 +77,14 @@ def main():
print('Loading model from file {}'.format(args.model), file=sys.stderr)
model_load_start = timer()
ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
ds = Model(args.model, args.alphabet, BEAM_WIDTH)
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
if args.lm and args.trie:
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
lm_load_start = timer()
ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
lm_load_end = timer() - lm_load_start
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

View File

@ -19,7 +19,7 @@ import_array();
}
%typemap(argout) ModelState **retval {
// not owned, Python wrapper in __init__.py calls DS_DestroyModel
// not owned, Python wrapper in __init__.py calls DS_FreeModel
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}

View File

@ -21,17 +21,6 @@ LM_ALPHA = 0.75
LM_BETA = 1.85
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9
def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
@ -48,10 +37,10 @@ def main():
help='Second audio file to use in interleaved streams')
args = parser.parse_args()
ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
ds = Model(args.model, args.alphabet, BEAM_WIDTH)
if args.lm and args.trie:
ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)
fin = wave.open(args.audio1, 'rb')
fs1 = fin.getframerate()
@ -63,8 +52,8 @@ def main():
audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
stream1 = ds.setupStream(sample_rate=fs1)
stream2 = ds.setupStream(sample_rate=fs2)
stream1 = ds.createStream(sample_rate=fs1)
stream2 = ds.createStream(sample_rate=fs2)
splits1 = np.array_split(audio1, 10)
splits2 = np.array_split(audio2, 10)

View File

@ -89,12 +89,10 @@ TFLiteModelState::~TFLiteModelState()
int
TFLiteModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width)
{
int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width);
int err = ModelState::init(model_path, alphabet_path, beam_width);
if (err != DS_ERR_OK) {
return err;
}

View File

@ -31,8 +31,6 @@ struct TFLiteModelState : public ModelState
virtual ~TFLiteModelState();
virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width) override;

View File

@ -25,12 +25,10 @@ TFModelState::~TFModelState()
int
TFModelState::init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width)
{
int err = ModelState::init(model_path, n_features, n_context, alphabet_path, beam_width);
int err = ModelState::init(model_path, alphabet_path, beam_width);
if (err != DS_ERR_OK) {
return err;
}

View File

@ -19,8 +19,6 @@ struct TFModelState : public ModelState
virtual ~TFModelState();
virtual int init(const char* model_path,
unsigned int n_features,
unsigned int n_context,
const char* alphabet_path,
unsigned int beam_width) override;