diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index f9818d64..2ba3415f 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -15,6 +15,13 @@ DeepSpeech Class :project: deepspeech-dotnet :members: +DeepSpeechStream Class +---------------------- + +.. doxygenclass:: DeepSpeechClient::DeepSpeechStream + :project: deepspeech-dotnet + :members: + ErrorCodes ---------- diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index a674c699..1260d926 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -4,6 +4,7 @@ using DeepSpeechClient.Extensions; using System; using System.IO; using DeepSpeechClient.Enums; +using DeepSpeechClient.Models; namespace DeepSpeechClient { @@ -13,14 +14,16 @@ namespace DeepSpeechClient public class DeepSpeech : IDeepSpeech { private unsafe IntPtr** _modelStatePP; - private unsafe IntPtr** _streamingStatePP; - - - - - public DeepSpeech() + + /// + /// Initializes a new instance of the DeepSpeech class and creates a new acoustic model. + /// + /// The path to the frozen model graph. + /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. + /// Thrown when the native binary failed to create the model. + public DeepSpeech(string aModelPath, uint aBeamWidth) { - + CreateModel(aModelPath, aBeamWidth); } #region IDeepSpeech @@ -31,7 +34,7 @@ namespace DeepSpeechClient /// The path to the frozen model graph. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. /// Thrown when the native binary failed to create the model. - public unsafe void CreateModel(string aModelPath, + private unsafe void CreateModel(string aModelPath, uint aBeamWidth) { string exceptionMessage = null; @@ -118,10 +121,19 @@ namespace DeepSpeechClient /// The alpha hyperparameter of the CTC decoder. Language Model weight. 
/// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. + /// Thrown when the language model or trie file cannot be found. public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, float aLMBeta) { string exceptionMessage = null; + if (string.IsNullOrWhiteSpace(aLMPath)) + { + exceptionMessage = "Path to the language model file cannot be empty."; + } + if (!File.Exists(aLMPath)) + { + exceptionMessage = $"Cannot find the language model file: {aLMPath}"; + } if (string.IsNullOrWhiteSpace(aTriePath)) { exceptionMessage = "Path to the trie file cannot be empty."; @@ -147,37 +159,41 @@ namespace DeepSpeechClient /// /// Feeds audio samples to an ongoing streaming inference. /// + /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize) + public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize) { - NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize); + NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); } /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// The STT result. The user is responsible for freeing the string. - public unsafe string FinishStream() + /// Instance of the stream to finish. + /// The STT result. + public unsafe string FinishStream(DeepSpeechStream stream) { - return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString(); + return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString(); } /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// The extended metadata. The user is responsible for freeing the struct. 
- public unsafe Models.Metadata FinishStreamWithMetadata() + /// Instance of the stream to finish. + /// The extended metadata result. + public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream) { - return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata(); } /// /// Computes the intermediate decoding of an ongoing streaming inference. /// - /// The STT intermediate result. The user is responsible for freeing the string. - public unsafe string IntermediateDecode() + /// Instance of the stream to decode. + /// The STT intermediate result. + public unsafe string IntermediateDecode(DeepSpeechStream stream) { - return NativeImp.DS_IntermediateDecode(_streamingStatePP); + return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()); } /// @@ -191,11 +207,12 @@ namespace DeepSpeechClient /// /// Creates a new streaming inference state. /// - /// Thrown when the native binary failed to initialize the streaming mode. - public unsafe void CreateStream() + public unsafe DeepSpeechStream CreateStream() { - var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP); + IntPtr** streamingStatePointer = null; + var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer); EvaluateResultCode(resultCode); + return new DeepSpeechStream(streamingStatePointer); } /// @@ -203,25 +220,10 @@ namespace DeepSpeechClient /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. 
/// - public unsafe void FreeStream() + public unsafe void FreeStream(DeepSpeechStream stream) { - NativeImp.DS_FreeStream(ref _streamingStatePP); - } - - /// - /// Free a DeepSpeech allocated string - /// - public unsafe void FreeString(IntPtr intPtr) - { - NativeImp.DS_FreeString(intPtr); - } - - /// - /// Free a DeepSpeech allocated Metadata struct - /// - public unsafe void FreeMetadata(IntPtr intPtr) - { - NativeImp.DS_FreeMetadata(intPtr); + NativeImp.DS_FreeStream(stream.GetNativePointer()); + stream.Dispose(); } /// @@ -229,7 +231,7 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The STT result. The user is responsible for freeing the string. Returns NULL on error. + /// The STT result. Returns NULL on error. public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize) { return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); @@ -240,8 +242,8 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The extended metadata. The user is responsible for freeing the struct. Returns NULL on error. - public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize) + /// The extended metadata. Returns NULL on error. 
+ public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize) { return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata(); } diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj index 320ecde5..b9077361 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj @@ -48,6 +48,7 @@ + diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index c47c25a1..734f4240 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -1,10 +1,11 @@ using DeepSpeechClient.Models; using System; +using System.IO; namespace DeepSpeechClient.Interfaces { /// - /// Client interface of the Mozilla's deepspeech implementation. + /// Client interface of the Mozilla's DeepSpeech implementation. /// public interface IDeepSpeech : IDisposable { @@ -13,15 +14,6 @@ namespace DeepSpeechClient.Interfaces /// void PrintVersions(); - /// - /// Create an object providing an interface to a trained DeepSpeech model. - /// - /// The path to the frozen model graph. - /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. - /// Thrown when the native binary failed to create the model. - unsafe void CreateModel(string aModelPath, - uint aBeamWidth); - /// /// Return the sample rate expected by the model. /// @@ -36,6 +28,7 @@ namespace DeepSpeechClient.Interfaces /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. 
+ /// Thrown when the language model or trie file cannot be found. unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, @@ -46,7 +39,7 @@ namespace DeepSpeechClient.Interfaces /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The STT result. The user is responsible for freeing the string. Returns NULL on error. + /// The STT result. Returns NULL on error. unsafe string SpeechToText(short[] aBuffer, uint aBufferSize); @@ -55,7 +48,7 @@ namespace DeepSpeechClient.Interfaces /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error. + /// The extended metadata. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize); @@ -64,46 +57,39 @@ namespace DeepSpeechClient.Interfaces /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - unsafe void FreeStream(); - - /// - /// Free a DeepSpeech allocated string - /// - unsafe void FreeString(IntPtr intPtr); - - /// - /// Free a DeepSpeech allocated Metadata struct - /// - unsafe void FreeMetadata(IntPtr intPtr); + unsafe void FreeStream(DeepSpeechStream stream); /// /// Creates a new streaming inference state. /// - /// Thrown when the native binary failed to initialize the streaming mode. - unsafe void CreateStream(); + unsafe DeepSpeechStream CreateStream(); /// /// Feeds audio samples to an ongoing streaming inference. /// + /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). 
- unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize); + unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize); /// /// Computes the intermediate decoding of an ongoing streaming inference. /// - /// The STT intermediate result. The user is responsible for freeing the string. - unsafe string IntermediateDecode(); + /// Instance of the stream to decode. + /// The STT intermediate result. + unsafe string IntermediateDecode(DeepSpeechStream stream); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// The STT result. The user is responsible for freeing the string. - unsafe string FinishStream(); + /// Instance of the stream to finish. + /// The STT result. + unsafe string FinishStream(DeepSpeechStream stream); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// The extended metadata result. The user is responsible for freeing the struct. - unsafe Metadata FinishStreamWithMetadata(); + /// Instance of the stream to finish. + /// The extended metadata result. + unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream); } } diff --git a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs new file mode 100644 index 00000000..e4605f5e --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs @@ -0,0 +1,35 @@ +using System; + +namespace DeepSpeechClient.Models +{ + /// + /// Wrapper of the pointer used for the decoding stream. + /// + public class DeepSpeechStream : IDisposable + { + private unsafe IntPtr** _streamingStatePp; + + /// + /// Initializes a new instance of DeepSpeechStream. + /// + /// Native pointer of the native stream. + public unsafe DeepSpeechStream(IntPtr** streamingStatePP) + { + _streamingStatePp = streamingStatePP; + } + + /// + /// Gets the native pointer. 
+ /// + /// Thrown when the stream has been disposed or not yet initialized. + /// Native pointer of the stream. + internal unsafe IntPtr** GetNativePointer() + { + if (_streamingStatePp == null) + throw new InvalidOperationException("Cannot use a disposed or uninitialized stream."); + return _streamingStatePp; + } + + public unsafe void Dispose() => _streamingStatePp = null; + } +} diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 0ea331d8..572055c0 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -48,7 +48,7 @@ namespace DeepSpeechClient ref IntPtr** retval); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx); + internal static unsafe extern void DS_FreeStream(IntPtr** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index 364cab71..8c75a481 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -53,16 +53,13 @@ namespace CSharpExamples const float LM_BETA = 1.85f; Stopwatch stopwatch = new Stopwatch(); - - using (IDeepSpeech sttClient = new DeepSpeech()) + try { - try + Console.WriteLine("Loading model..."); + stopwatch.Start(); + using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm", + BEAM_WIDTH)) { - Console.WriteLine("Loading model..."); - stopwatch.Start(); - sttClient.CreateModel( - model ?? 
"output_graph.pbmm", - BEAM_WIDTH); stopwatch.Stop(); Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms"); @@ -88,12 +85,14 @@ namespace CSharpExamples string speechResult; if (extended) { - Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2)); + Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, + Convert.ToUInt32(waveBuffer.MaxSize / 2)); speechResult = MetadataToString(metaResult); } else { - speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2)); + speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, + Convert.ToUInt32(waveBuffer.MaxSize / 2)); } stopwatch.Stop(); @@ -104,10 +103,10 @@ namespace CSharpExamples } waveBuffer.Clear(); } - catch (Exception ex) - { - Console.WriteLine(ex.Message); - } + } + catch (Exception ex) + { + Console.WriteLine(ex.Message); } } }