From 03a822b6707bf1ec83ca3e127ed2d63e0caac8f9 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 10 Dec 2019 14:04:39 +0100 Subject: [PATCH] Revert "Multi-stream support .NET" --- doc/DotNet-API.rst | 7 -- .../net_framework/DeepSpeechWPF/App.xaml.cs | 12 +-- .../ViewModels/MainWindowViewModel.cs | 13 +-- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 88 +++++++++---------- .../DeepSpeechClient/DeepSpeechClient.csproj | 1 - .../Interfaces/IDeepSpeech.cs | 50 +++++++---- .../Models/DeepSpeechStream.cs | 35 -------- .../dotnet/DeepSpeechClient/NativeImp.cs | 2 +- .../dotnet/DeepSpeechConsole/Program.cs | 27 +++--- 9 files changed, 99 insertions(+), 136 deletions(-) delete mode 100644 native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 2ba3415f..f9818d64 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -15,13 +15,6 @@ DeepSpeech Class :project: deepspeech-dotnet :members: -DeepSpeechStream Class ----------------- - -.. doxygenclass:: DeepSpeechClient::DeepSpeechStream - :project: deepspeech-dotnet - :members: - ErrorCodes ---------- diff --git a/examples/net_framework/DeepSpeechWPF/App.xaml.cs b/examples/net_framework/DeepSpeechWPF/App.xaml.cs index 95c550ae..67dad8ed 100644 --- a/examples/net_framework/DeepSpeechWPF/App.xaml.cs +++ b/examples/net_framework/DeepSpeechWPF/App.xaml.cs @@ -18,20 +18,20 @@ namespace DeepSpeechWPF const int BEAM_WIDTH = 500; + //Register instance of DeepSpeech + DeepSpeechClient.DeepSpeech deepSpeechClient = new DeepSpeechClient.DeepSpeech(); try { - //Register instance of DeepSpeech - DeepSpeechClient.DeepSpeech deepSpeechClient = - new DeepSpeechClient.DeepSpeech("output_graph.pbmm", BEAM_WIDTH); - - SimpleIoc.Default.Register(() => deepSpeechClient); - SimpleIoc.Default.Register(); + deepSpeechClient.CreateModel("output_graph.pbmm", BEAM_WIDTH); } catch (System.Exception ex) { MessageBox.Show(ex.Message); Current.Shutdown(); } + + SimpleIoc.Default.Register(() => deepSpeechClient); + SimpleIoc.Default.Register(); } protected override void OnExit(ExitEventArgs e) diff --git a/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs b/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs index 207e209a..81b33ce8 100644 --- a/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs +++ b/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs @@ -4,7 +4,6 @@ using CSCore.CoreAudioAPI; using CSCore.SoundIn; using CSCore.Streams; using DeepSpeechClient.Interfaces; -using DeepSpeechClient.Models; using GalaSoft.MvvmLight.CommandWpf; using Microsoft.Win32; using System; @@ -59,12 +58,6 @@ namespace DeepSpeech.WPF.ViewModels #endregion #region Streaming - - /// - /// Stream used to feed data into the acoustic model. - /// - private DeepSpeechStream _sttStream; - /// /// Records the audio of the selected device. /// @@ -315,7 +308,7 @@ namespace DeepSpeech.WPF.ViewModels if (_bufferQueue.TryDequeue(out short[] buffer)) { StreamingIsBusy = true; - _sttClient.FeedAudioContent(_sttStream, buffer, Convert.ToUInt32(buffer.Length)); + _sttClient.FeedAudioContent(buffer, Convert.ToUInt32(buffer.Length)); StreamingIsBusy = false; } } @@ -393,7 +386,7 @@ namespace DeepSpeech.WPF.ViewModels { await Task.Delay(90); } - Transcription = _sttClient.FinishStream(_sttStream); + Transcription = _sttClient.FinishStream(); EnableStartRecord = true; } @@ -402,7 +395,7 @@ namespace DeepSpeech.WPF.ViewModels /// private void StartRecording() { - _sttStream =_sttClient.CreateStream(); + _sttClient.CreateStream(); _audioCapture.Start(); EnableStartRecord = false; EnableStopRecord = true; diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 1260d926..a674c699 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -4,7 +4,6 @@ using DeepSpeechClient.Extensions; using System; using System.IO; using DeepSpeechClient.Enums; -using DeepSpeechClient.Models; namespace DeepSpeechClient { @@ -14,16 +13,14 @@ namespace DeepSpeechClient public class DeepSpeech : IDeepSpeech { private unsafe IntPtr** _modelStatePP; - - /// - /// Initializes a new instance of class and creates a new acoustic model. - /// - /// The path to the frozen model graph. - /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. - /// Thrown when the native binary failed to create the model. - public DeepSpeech(string aModelPath, uint aBeamWidth) + private unsafe IntPtr** _streamingStatePP; + + + + + public DeepSpeech() { - CreateModel(aModelPath, aBeamWidth); + } #region IDeepSpeech @@ -34,7 +31,7 @@ namespace DeepSpeechClient /// The path to the frozen model graph. /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. /// Thrown when the native binary failed to create the model. - private unsafe void CreateModel(string aModelPath, + public unsafe void CreateModel(string aModelPath, uint aBeamWidth) { string exceptionMessage = null; @@ -121,19 +118,10 @@ namespace DeepSpeechClient /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. - /// Thrown when cannot find the language model or trie file. public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, float aLMBeta) { string exceptionMessage = null; - if (string.IsNullOrWhiteSpace(aLMPath)) - { - exceptionMessage = "Path to the language model file cannot be empty."; - } - if (!File.Exists(aLMPath)) - { - exceptionMessage = $"Cannot find the language model file: {aLMPath}"; - } if (string.IsNullOrWhiteSpace(aTriePath)) { exceptionMessage = "Path to the trie file cannot be empty."; @@ -159,41 +147,37 @@ namespace DeepSpeechClient /// /// Feeds audio samples to an ongoing streaming inference. /// - /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize) + public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize) { - NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); + NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize); } /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// Instance of the stream to finish. - /// The STT result. - public unsafe string FinishStream(DeepSpeechStream stream) + /// The STT result. The user is responsible for freeing the string. + public unsafe string FinishStream() { - return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString(); + return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString(); } /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// Instance of the stream to finish. - /// The extended metadata result. - public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream) + /// The extended metadata. The user is responsible for freeing the struct. + public unsafe Models.Metadata FinishStreamWithMetadata() { - return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata(); } /// /// Computes the intermediate decoding of an ongoing streaming inference. /// - /// Instance of the stream to decode. - /// The STT intermediate result. - public unsafe string IntermediateDecode(DeepSpeechStream stream) + /// The STT intermediate result. The user is responsible for freeing the string. + public unsafe string IntermediateDecode() { - return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()); + return NativeImp.DS_IntermediateDecode(_streamingStatePP); } /// @@ -207,12 +191,11 @@ namespace DeepSpeechClient /// /// Creates a new streaming inference state. /// - public unsafe DeepSpeechStream CreateStream() + /// Thrown when the native binary failed to initialize the streaming mode. + public unsafe void CreateStream() { - IntPtr** streamingStatePointer = null; - var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer); + var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP); EvaluateResultCode(resultCode); - return new DeepSpeechStream(streamingStatePointer); } /// @@ -220,10 +203,25 @@ namespace DeepSpeechClient /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - public unsafe void FreeStream(DeepSpeechStream stream) + public unsafe void FreeStream() { - NativeImp.DS_FreeStream(stream.GetNativePointer()); - stream.Dispose(); + NativeImp.DS_FreeStream(ref _streamingStatePP); + } + + /// + /// Free a DeepSpeech allocated string + /// + public unsafe void FreeString(IntPtr intPtr) + { + NativeImp.DS_FreeString(intPtr); + } + + /// + /// Free a DeepSpeech allocated Metadata struct + /// + public unsafe void FreeMetadata(IntPtr intPtr) + { + NativeImp.DS_FreeMetadata(intPtr); } /// @@ -231,7 +229,7 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The STT result. Returns NULL on error. + /// The STT result. The user is responsible for freeing the string. Returns NULL on error. public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize) { return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); @@ -242,8 +240,8 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The extended metadata. Returns NULL on error. - public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize) + /// The extended metadata. The user is responsible for freeing the struct. Returns NULL on error. + public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize) { return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata(); } diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj index b9077361..320ecde5 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj @@ -48,7 +48,6 @@ - diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 734f4240..c47c25a1 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -1,11 +1,10 @@ using DeepSpeechClient.Models; using System; -using System.IO; namespace DeepSpeechClient.Interfaces { /// - /// Client interface of the Mozilla's DeepSpeech implementation. + /// Client interface of the Mozilla's deepspeech implementation. /// public interface IDeepSpeech : IDisposable { @@ -14,6 +13,15 @@ namespace DeepSpeechClient.Interfaces /// void PrintVersions(); + /// + /// Create an object providing an interface to a trained DeepSpeech model. + /// + /// The path to the frozen model graph. + /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time. + /// Thrown when the native binary failed to create the model. + unsafe void CreateModel(string aModelPath, + uint aBeamWidth); + /// /// Return the sample rate expected by the model. /// @@ -28,7 +36,6 @@ namespace DeepSpeechClient.Interfaces /// The alpha hyperparameter of the CTC decoder. Language Model weight. /// The beta hyperparameter of the CTC decoder. Word insertion weight. /// Thrown when the native binary failed to enable decoding with a language model. - /// Thrown when cannot find the language model or trie file. unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath, float aLMAlpha, @@ -39,7 +46,7 @@ namespace DeepSpeechClient.Interfaces /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The STT result. Returns NULL on error. + /// The STT result. The user is responsible for freeing the string. Returns NULL on error. unsafe string SpeechToText(short[] aBuffer, uint aBufferSize); @@ -48,7 +55,7 @@ namespace DeepSpeechClient.Interfaces /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. - /// The extended metadata. Returns NULL on error. + /// The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize); @@ -57,39 +64,46 @@ namespace DeepSpeechClient.Interfaces /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - unsafe void FreeStream(DeepSpeechStream stream); + unsafe void FreeStream(); + + /// + /// Free a DeepSpeech allocated string + /// + unsafe void FreeString(IntPtr intPtr); + + /// + /// Free a DeepSpeech allocated Metadata struct + /// + unsafe void FreeMetadata(IntPtr intPtr); /// /// Creates a new streaming inference state. /// - unsafe DeepSpeechStream CreateStream(); + /// Thrown when the native binary failed to initialize the streaming mode. + unsafe void CreateStream(); /// /// Feeds audio samples to an ongoing streaming inference. /// - /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize); + unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize); /// /// Computes the intermediate decoding of an ongoing streaming inference. /// - /// Instance of the stream to decode. - /// The STT intermediate result. - unsafe string IntermediateDecode(DeepSpeechStream stream); + /// The STT intermediate result. The user is responsible for freeing the string. + unsafe string IntermediateDecode(); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// Instance of the stream to finish. - /// The STT result. - unsafe string FinishStream(DeepSpeechStream stream); + /// The STT result. The user is responsible for freeing the string. + unsafe string FinishStream(); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// - /// Instance of the stream to finish. - /// The extended metadata result. - unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream); + /// The extended metadata result. The user is responsible for freeing the struct. + unsafe Metadata FinishStreamWithMetadata(); } } diff --git a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs deleted file mode 100644 index e4605f5e..00000000 --- a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs +++ /dev/null @@ -1,35 +0,0 @@ -using System; - -namespace DeepSpeechClient.Models -{ - /// - /// Wrapper of the pointer used for the decoding stream. - /// - public class DeepSpeechStream : IDisposable - { - private unsafe IntPtr** _streamingStatePp; - - /// - /// Initializes a new instance of . - /// - /// Native pointer of the native stream. - public unsafe DeepSpeechStream(IntPtr** streamingStatePP) - { - _streamingStatePp = streamingStatePP; - } - - /// - /// Gets the native pointer. - /// - /// Thrown when the stream has been disposed or not yet initialized. - /// Native pointer of the stream. - internal unsafe IntPtr** GetNativePointer() - { - if (_streamingStatePp == null) - throw new InvalidOperationException("Cannot use a disposed or uninitialized stream."); - return _streamingStatePp; - } - - public unsafe void Dispose() => _streamingStatePp = null; - } -} diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 572055c0..0ea331d8 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -48,7 +48,7 @@ namespace DeepSpeechClient ref IntPtr** retval); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeStream(IntPtr** aSctx); + internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index 8c75a481..364cab71 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -53,13 +53,16 @@ namespace CSharpExamples const float LM_BETA = 1.85f; Stopwatch stopwatch = new Stopwatch(); - try + + using (IDeepSpeech sttClient = new DeepSpeech()) { - Console.WriteLine("Loading model..."); - stopwatch.Start(); - using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm", - BEAM_WIDTH)) + try { + Console.WriteLine("Loading model..."); + stopwatch.Start(); + sttClient.CreateModel( + model ?? "output_graph.pbmm", + BEAM_WIDTH); stopwatch.Stop(); Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms"); @@ -85,14 +88,12 @@ namespace CSharpExamples string speechResult; if (extended) { - Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, - Convert.ToUInt32(waveBuffer.MaxSize / 2)); + Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2)); speechResult = MetadataToString(metaResult); } else { - speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, - Convert.ToUInt32(waveBuffer.MaxSize / 2)); + speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2)); } stopwatch.Stop(); @@ -103,10 +104,10 @@ namespace CSharpExamples } waveBuffer.Clear(); } - } - catch (Exception ex) - { - Console.WriteLine(ex.Message); + catch (Exception ex) + { + Console.WriteLine(ex.Message); + } } } }