From a8c53d21542bcea377a69113bbf4f36010c4936b Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 9 Sep 2019 11:54:53 +0200 Subject: [PATCH] Update .NET bindings and client --- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 23 +++++++------------ .../Interfaces/IDeepSpeech.cs | 13 ++++------- .../dotnet/DeepSpeechClient/NativeImp.cs | 9 +++----- .../DeepSpeechClient/Structs/ModelState.cs | 2 ++ .../dotnet/DeepSpeechConsole/Program.cs | 4 ---- 5 files changed, 17 insertions(+), 34 deletions(-) diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index d0b2a7d3..21b3dc06 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -32,13 +32,11 @@ namespace DeepSpeechClient /// Create an object providing an interface to a trained DeepSpeech model. /// /// The path to the frozen model graph. - /// The number of cepstrum the model was trained with. - /// The context window the model was trained with. /// The path to the configuration file specifying the alphabet used by the network. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. /// Thrown when the native binary failed to create the model. 
- public unsafe void CreateModel(string aModelPath, uint aNCep, - uint aNContext, string aAlphabetConfigPath, uint aBeamWidth) + public unsafe void CreateModel(string aModelPath, + string aAlphabetConfigPath, uint aBeamWidth) { string exceptionMessage = null; if (string.IsNullOrWhiteSpace(aModelPath)) @@ -63,8 +61,6 @@ namespace DeepSpeechClient throw new FileNotFoundException(exceptionMessage); } var resultCode = NativeImp.DS_CreateModel(aModelPath, - aNCep, - aNContext, aAlphabetConfigPath, aBeamWidth, ref _modelStatePP); @@ -116,20 +112,18 @@ namespace DeepSpeechClient /// public unsafe void Dispose() { - NativeImp.DS_DestroyModel(_modelStatePP); + NativeImp.DS_FreeModel(_modelStatePP); } /// /// Enable decoding using beam scoring with a KenLM language model. /// - /// The path to the configuration file specifying the alphabet used by the network. /// The path to the language model binary file. /// The path to the trie file build from the same vocabulary as the language model binary. /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. - public unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, - string aLMPath, string aTriePath, + public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, float aLMBeta) { string exceptionMessage = null; @@ -148,7 +142,6 @@ namespace DeepSpeechClient } var resultCode = NativeImp.DS_EnableDecoderWithLM(_modelStatePP, - aAlphabetConfigPath, aLMPath, aTriePath, aLMAlpha, @@ -206,9 +199,9 @@ namespace DeepSpeechClient /// /// The sample-rate of the audio signal /// Thrown when the native binary failed to initialize the streaming mode. 
- public unsafe void SetupStream(uint aSampleRate) + public unsafe void CreateStream(uint aSampleRate) { - var resultCode = NativeImp.DS_SetupStream(_modelStatePP, aSampleRate, ref _streamingStatePP); + var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP); EvaluateResultCode(resultCode); } @@ -217,9 +210,9 @@ namespace DeepSpeechClient /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - public unsafe void DiscardStream() + public unsafe void FreeStream() { - NativeImp.DS_DiscardStream(ref _streamingStatePP); + NativeImp.DS_FreeStream(ref _streamingStatePP); } /// diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 3c00b996..04ad086c 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -17,27 +17,22 @@ namespace DeepSpeechClient.Interfaces /// Create an object providing an interface to a trained DeepSpeech model. /// /// The path to the frozen model graph. - /// The number of cepstrum the model was trained with. - /// The context window the model was trained with. /// The path to the configuration file specifying the alphabet used by the network. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. /// Thrown when the native binary failed to create the model. - unsafe void CreateModel(string aModelPath, uint aNCep, - uint aNContext, + unsafe void CreateModel(string aModelPath, string aAlphabetConfigPath, uint aBeamWidth); /// /// Enable decoding using beam scoring with a KenLM language model. /// - /// The path to the configuration file specifying the alphabet used by the network. /// The path to the language model binary file. 
/// The path to the trie file build from the same vocabulary as the language model binary. /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. - unsafe void EnableDecoderWithLM(string aAlphabetConfigPath, - string aLMPath, + unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, float aLMBeta); @@ -69,7 +64,7 @@ namespace DeepSpeechClient.Interfaces /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - unsafe void DiscardStream(); + unsafe void FreeStream(); /// /// Free a DeepSpeech allocated string @@ -86,7 +81,7 @@ namespace DeepSpeechClient.Interfaces /// /// The sample-rate of the audio signal /// Thrown when the native binary failed to initialize the streaming mode. - unsafe void SetupStream(uint aSampleRate); + unsafe void CreateStream(uint aSampleRate); /// /// Feeds audio samples to an ongoing streaming inference. 
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index cde7beac..f57b973d 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -17,15 +17,12 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - uint aNCep, - uint aNContext, string aAlphabetConfigPath, uint aBeamWidth, ref ModelState** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(ModelState** aCtx, - string aAlphabetConfigPath, string aLMPath, string aTriePath, float aLMAlpha, @@ -45,14 +42,14 @@ namespace DeepSpeechClient uint aSampleRate); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_DestroyModel(ModelState** aCtx); + internal static unsafe extern void DS_FreeModel(ModelState** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_SetupStream(ModelState** aCtx, + internal static unsafe extern ErrorCodes DS_CreateStream(ModelState** aCtx, uint aSampleRate, ref StreamingState** retval); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx); + internal static unsafe extern void DS_FreeStream(ref StreamingState** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); diff --git a/native_client/dotnet/DeepSpeechClient/Structs/ModelState.cs b/native_client/dotnet/DeepSpeechClient/Structs/ModelState.cs index a90bdb77..abb53758 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/ModelState.cs +++ 
b/native_client/dotnet/DeepSpeechClient/Structs/ModelState.cs @@ -7,6 +7,8 @@ using GraphDef = System.IntPtr; namespace DeepSpeechClient.Structs { + // FIXME: ModelState is an opaque pointer as far as the API is concerned; why is + // this code reverse-engineering its contents? [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)] public unsafe struct ModelState { diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index 315a1a40..1cfbc686 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -50,8 +50,6 @@ namespace CSharpExamples extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended")); } - const uint N_CEP = 26; - const uint N_CONTEXT = 9; const uint BEAM_WIDTH = 500; const float LM_ALPHA = 0.75f; const float LM_BETA = 1.85f; @@ -66,7 +64,6 @@ namespace CSharpExamples stopwatch.Start(); sttClient.CreateModel( model ?? "output_graph.pbmm", - N_CEP, N_CONTEXT, alphabet ?? "alphabet.txt", BEAM_WIDTH); stopwatch.Stop(); @@ -77,7 +74,6 @@ namespace CSharpExamples { Console.WriteLine("Loadin LM..."); sttClient.EnableDecoderWithLM( - alphabet ?? "alphabet.txt", lm ?? "lm.binary", trie ?? "trie", LM_ALPHA, LM_BETA);