Merge pull request #2591 from mozilla/revert-2548-net-streams

Revert "Multi-stream support .NET"
2019-12-10 16:16:16 +01:00 · 2019-12-10 16:16:16 +01:00 · 911743a0b8
commit 911743a0b8
parent 35e04d383e 03a822b670
9 changed files with 99 additions and 136 deletions
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@ -15,13 +15,6 @@ DeepSpeech Class
   :project: deepspeech-dotnet
   :members:
 DeepSpeechStream Class
 ----------------------
 .. doxygenclass:: DeepSpeechClient::DeepSpeechStream
   :project: deepspeech-dotnet
   :members:
 ErrorCodes
 ----------
--- a/examples/net_framework/DeepSpeechWPF/App.xaml.cs
+++ b/examples/net_framework/DeepSpeechWPF/App.xaml.cs
@ -18,20 +18,20 @@ namespace DeepSpeechWPF
            const int BEAM_WIDTH = 500;
            //Register instance of DeepSpeech
            DeepSpeechClient.DeepSpeech deepSpeechClient = new DeepSpeechClient.DeepSpeech();
            try
            {
-                //Register instance of DeepSpeech
+                deepSpeechClient.CreateModel("output_graph.pbmm", BEAM_WIDTH);
                DeepSpeechClient.DeepSpeech deepSpeechClient =
                    new DeepSpeechClient.DeepSpeech("output_graph.pbmm", BEAM_WIDTH);
                SimpleIoc.Default.Register<IDeepSpeech>(() => deepSpeechClient);
                SimpleIoc.Default.Register<MainWindowViewModel>();
            }
            catch (System.Exception ex)
            {
                MessageBox.Show(ex.Message);
                Current.Shutdown();
            }
            SimpleIoc.Default.Register<IDeepSpeech>(() => deepSpeechClient);
            SimpleIoc.Default.Register<MainWindowViewModel>();
        }
        protected override void OnExit(ExitEventArgs e)
--- a/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs
+++ b/examples/net_framework/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs
@ -4,7 +4,6 @@ using CSCore.CoreAudioAPI;
 using CSCore.SoundIn;
 using CSCore.Streams;
 using DeepSpeechClient.Interfaces;
 using DeepSpeechClient.Models;
 using GalaSoft.MvvmLight.CommandWpf;
 using Microsoft.Win32;
 using System;
@ -59,12 +58,6 @@ namespace DeepSpeech.WPF.ViewModels
        #endregion
        #region Streaming
        /// <summary>
        /// Stream used to feed data into the acoustic model.
        /// </summary>
        private DeepSpeechStream _sttStream;
        /// <summary>
        /// Records the audio of the selected device.
        /// </summary>
@ -315,7 +308,7 @@ namespace DeepSpeech.WPF.ViewModels
                if (_bufferQueue.TryDequeue(out short[] buffer))
                {
                    StreamingIsBusy = true;
-                    _sttClient.FeedAudioContent(_sttStream, buffer, Convert.ToUInt32(buffer.Length));
+                    _sttClient.FeedAudioContent(buffer, Convert.ToUInt32(buffer.Length));
                    StreamingIsBusy = false;
                }
            }
@ -393,7 +386,7 @@ namespace DeepSpeech.WPF.ViewModels
            {
                await Task.Delay(90);
            }
-            Transcription = _sttClient.FinishStream(_sttStream);
+            Transcription = _sttClient.FinishStream();
            EnableStartRecord = true;
        }
@ -402,7 +395,7 @@ namespace DeepSpeech.WPF.ViewModels
        /// </summary>
        private void StartRecording()
        {
-            _sttStream =_sttClient.CreateStream();
+            _sttClient.CreateStream();
            _audioCapture.Start();
            EnableStartRecord = false;
            EnableStopRecord = true;
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@ -4,7 +4,6 @@ using DeepSpeechClient.Extensions;
 using System;
 using System.IO;
 using DeepSpeechClient.Enums;
 using DeepSpeechClient.Models;
 namespace DeepSpeechClient
 {
@ -14,16 +13,14 @@ namespace DeepSpeechClient
    public class DeepSpeech : IDeepSpeech
    {
        private unsafe IntPtr** _modelStatePP;
-        
+        private unsafe IntPtr** _streamingStatePP;
-        /// <summary>
+
-        /// Initializes a new instance of <see cref="DeepSpeech"/> class and creates a new acoustic model.
+
-        /// </summary>
+
-        /// <param name="aModelPath">The path to the frozen model graph.</param>
+
-        /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
+        public DeepSpeech()
        /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
        public DeepSpeech(string aModelPath, uint aBeamWidth)
        {
-            CreateModel(aModelPath, aBeamWidth);
+
        }
        #region IDeepSpeech
@ -34,7 +31,7 @@ namespace DeepSpeechClient
        /// <param name="aModelPath">The path to the frozen model graph.</param>
        /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
-        private unsafe void CreateModel(string aModelPath,
+        public unsafe void CreateModel(string aModelPath,
            uint aBeamWidth)
        {
            string exceptionMessage = null;
@ -121,19 +118,10 @@ namespace DeepSpeechClient
        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
        public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
            float aLMAlpha, float aLMBeta)
        {
            string exceptionMessage = null;
            if (string.IsNullOrWhiteSpace(aLMPath))
            {
                exceptionMessage = "Path to the language model file cannot be empty.";
            }
            if (!File.Exists(aLMPath))
            {
                exceptionMessage = $"Cannot find the language model file: {aLMPath}";
            }
            if (string.IsNullOrWhiteSpace(aTriePath))
            {
                exceptionMessage = "Path to the trie file cannot be empty.";
@ -159,41 +147,37 @@ namespace DeepSpeechClient
        /// <summary>
        /// Feeds audio samples to an ongoing streaming inference.
        /// </summary>
        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in <paramref name="aBuffer"/>.</param>
-        public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize)
+        public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
        {
-            NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize);
+            NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
        }
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
-        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <returns>The STT result. The user is responsible for freeing the string.</returns>
-        /// <returns>The STT result.</returns>
+        public unsafe string FinishStream()
        public unsafe string FinishStream(DeepSpeechStream stream)
        {
-            return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString();
+            return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
        }
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
-        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <returns>The extended metadata. The user is responsible for freeing the struct.</returns>
-        /// <returns>The extended metadata result.</returns>
+        public unsafe Models.Metadata FinishStreamWithMetadata()
        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
        {
-            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+            return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
        }
        /// <summary>
        /// Computes the intermediate decoding of an ongoing streaming inference.
        /// </summary>
-        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
-        /// <returns>The STT intermediate result.</returns>
+        public unsafe string IntermediateDecode()
        public unsafe string IntermediateDecode(DeepSpeechStream stream)
        {
-            return NativeImp.DS_IntermediateDecode(stream.GetNativePointer());
+            return NativeImp.DS_IntermediateDecode(_streamingStatePP);
        }
        /// <summary>
@ -207,12 +191,11 @@ namespace DeepSpeechClient
        /// <summary>
        /// Creates a new streaming inference state.
        /// </summary>
-        public unsafe DeepSpeechStream CreateStream()
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
        public unsafe void CreateStream()
        {
-            IntPtr** streamingStatePointer = null;
+            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer);
            EvaluateResultCode(resultCode);
            return new DeepSpeechStream(streamingStatePointer);
        }
        /// <summary>
@ -220,10 +203,25 @@ namespace DeepSpeechClient
        /// This can be used if you no longer need the result of an ongoing streaming
        /// inference and don't want to perform a costly decode operation.
        /// </summary>
-        public unsafe void FreeStream(DeepSpeechStream stream)
+        public unsafe void FreeStream()
        {
-            NativeImp.DS_FreeStream(stream.GetNativePointer());
+            NativeImp.DS_FreeStream(ref _streamingStatePP);
-            stream.Dispose();
+        }
        /// <summary>
        /// Releases a string that was allocated by DeepSpeech.
        /// </summary>
        /// <param name="intPtr">Pointer that is forwarded unchanged to the native <c>DS_FreeString</c> call.</param>
        public unsafe void FreeString(IntPtr intPtr) => NativeImp.DS_FreeString(intPtr);
        /// <summary>
        /// Releases a Metadata struct that was allocated by DeepSpeech.
        /// </summary>
        /// <param name="intPtr">Pointer that is forwarded unchanged to the native <c>DS_FreeMetadata</c> call.</param>
        public unsafe void FreeMetadata(IntPtr intPtr) => NativeImp.DS_FreeMetadata(intPtr);
        /// <summary>
@ -231,7 +229,7 @@ namespace DeepSpeechClient
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <returns>The STT result. Returns NULL on error.</returns>
+        /// <returns>The STT result. The user is responsible for freeing the string.  Returns NULL on error.</returns>
        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
        {
            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
@ -242,8 +240,8 @@ namespace DeepSpeechClient
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <returns>The extended metadata. Returns NULL on error.</returns>
+        /// <returns>The extended metadata. The user is responsible for freeing the struct.  Returns NULL on error.</returns>
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
        {
            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
        }
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@ -48,7 +48,6 @@
    <Compile Include="Enums\ErrorCodes.cs" />
    <Compile Include="Interfaces\IDeepSpeech.cs" />
    <Compile Include="Extensions\NativeExtensions.cs" />
    <Compile Include="Models\DeepSpeechStream.cs" />
    <Compile Include="Models\Metadata.cs" />
    <Compile Include="Models\MetadataItem.cs" />
    <Compile Include="NativeImp.cs" />
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@ -1,11 +1,10 @@
 using DeepSpeechClient.Models;
 using System;
 using System.IO;
 namespace DeepSpeechClient.Interfaces
 {
    /// <summary>
-    /// Client interface of the Mozilla's DeepSpeech implementation.
+    /// Client interface of Mozilla's DeepSpeech implementation.
    /// </summary>
    public interface IDeepSpeech : IDisposable
    {
@ -14,6 +13,15 @@ namespace DeepSpeechClient.Interfaces
        /// </summary>
        void PrintVersions();
        /// <summary>
        /// Create an object providing an interface to a trained DeepSpeech model.
        /// </summary>
        /// <param name="aModelPath">The path to the frozen model graph.</param>
        /// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
        unsafe void CreateModel(string aModelPath,
                   uint aBeamWidth);
        /// <summary>
        /// Return the sample rate expected by the model.
        /// </summary>
@ -28,7 +36,6 @@ namespace DeepSpeechClient.Interfaces
        /// <param name="aLMAlpha">The alpha hyperparameter of the CTC decoder. Language Model weight.</param>
        /// <param name="aLMBeta">The beta hyperparameter of the CTC decoder. Word insertion weight.</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to enable decoding with a language model.</exception>
        /// <exception cref="FileNotFoundException">Thrown when cannot find the language model or trie file.</exception>
        unsafe void EnableDecoderWithLM(string aLMPath,
                  string aTriePath,
                  float aLMAlpha,
@ -39,7 +46,7 @@ namespace DeepSpeechClient.Interfaces
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <returns>The STT result. Returns NULL on error.</returns>
+        /// <returns>The STT result. The user is responsible for freeing the string.  Returns NULL on error.</returns>
        unsafe string SpeechToText(short[] aBuffer,
                uint aBufferSize);
@ -48,7 +55,7 @@ namespace DeepSpeechClient.Interfaces
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <returns>The extended metadata. Returns NULL on error.</returns>
+        /// <returns>The extended metadata result. The user is responsible for freeing the struct.  Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
                uint aBufferSize);
@ -57,39 +64,46 @@ namespace DeepSpeechClient.Interfaces
        /// This can be used if you no longer need the result of an ongoing streaming
        /// inference and don't want to perform a costly decode operation.
        /// </summary>
-        unsafe void FreeStream(DeepSpeechStream stream);
+        unsafe void FreeStream();
        /// <summary>
        /// Free a DeepSpeech allocated string
        /// </summary>
        unsafe void FreeString(IntPtr intPtr);
        /// <summary>
        /// Free a DeepSpeech allocated Metadata struct
        /// </summary>
        unsafe void FreeMetadata(IntPtr intPtr);
        /// <summary>
        /// Creates a new streaming inference state.
        /// </summary>
-        unsafe DeepSpeechStream CreateStream();
+        /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
        unsafe void CreateStream();
        /// <summary>
        /// Feeds audio samples to an ongoing streaming inference.
        /// </summary>
        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in <paramref name="aBuffer"/>.</param>
-        unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize);
+        unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
        /// <summary>
        /// Computes the intermediate decoding of an ongoing streaming inference.
        /// </summary>
-        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
-        /// <returns>The STT intermediate result.</returns>
+        unsafe string IntermediateDecode();
        unsafe string IntermediateDecode(DeepSpeechStream stream);
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
-        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <returns>The STT result. The user is responsible for freeing the string.</returns>
-        /// <returns>The STT result.</returns>
+        unsafe string FinishStream();
        unsafe string FinishStream(DeepSpeechStream stream);
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
-        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <returns>The extended metadata result. The user is responsible for freeing the struct.</returns>
-        /// <returns>The extended metadata result.</returns>
+        unsafe Metadata FinishStreamWithMetadata();
        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs
@ -1,35 +0,0 @@
 using System;
 namespace DeepSpeechClient.Models
 {
    /// <summary>
    /// Holds the native pointer that identifies a decoding stream.
    /// </summary>
    public class DeepSpeechStream : IDisposable
    {
        private unsafe IntPtr** _nativeStreamState;
        /// <summary>
        /// Initializes a new instance of <see cref="DeepSpeechStream"/> around an existing native stream pointer.
        /// </summary>
        /// <param name="streamingStatePP">Native pointer of the native stream.</param>
        public unsafe DeepSpeechStream(IntPtr** streamingStatePP)
        {
            _nativeStreamState = streamingStatePP;
        }
        /// <summary>
        /// Returns the wrapped native pointer.
        /// </summary>
        /// <exception cref="InvalidOperationException">Thrown when the stored pointer is null, i.e. the stream has been disposed or was never initialized.</exception>
        /// <returns>Native pointer of the stream.</returns>
        internal unsafe IntPtr** GetNativePointer()
        {
            if (_nativeStreamState != null)
            {
                return _nativeStreamState;
            }
            throw new InvalidOperationException("Cannot use a disposed or uninitialized stream.");
        }
        /// <summary>
        /// Clears the stored pointer so later calls to <see cref="GetNativePointer"/> throw.
        /// This method only resets the field; it makes no native call itself.
        /// </summary>
        public unsafe void Dispose()
        {
            _nativeStreamState = null;
        }
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@ -48,7 +48,7 @@ namespace DeepSpeechClient
               ref IntPtr** retval);
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-        internal static unsafe extern void DS_FreeStream(IntPtr** aSctx);
+        internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@ -53,13 +53,16 @@ namespace CSharpExamples
            const float LM_BETA = 1.85f;
            Stopwatch stopwatch = new Stopwatch();
-            try
+
            using (IDeepSpeech sttClient = new DeepSpeech())
            {
-                Console.WriteLine("Loading model...");
+                try
                stopwatch.Start();
                using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm",
                    BEAM_WIDTH))
                {
                    Console.WriteLine("Loading model...");
                    stopwatch.Start();
                    sttClient.CreateModel(
                        model ?? "output_graph.pbmm",
                        BEAM_WIDTH);
                    stopwatch.Stop();
                    Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
@ -85,14 +88,12 @@ namespace CSharpExamples
                        string speechResult;
                        if (extended)
                        {
-                            Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
+                            Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                                Convert.ToUInt32(waveBuffer.MaxSize / 2));
                            speechResult = MetadataToString(metaResult);
                        }
                        else
                        {
-                            speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer,
+                            speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                                Convert.ToUInt32(waveBuffer.MaxSize / 2));
                        }
                        stopwatch.Stop();
@ -103,10 +104,10 @@ namespace CSharpExamples
                    }
                    waveBuffer.Clear();
                }
-            }
+                catch (Exception ex)
-            catch (Exception ex)
+                {
-            {
+                    Console.WriteLine(ex.Message);
-                Console.WriteLine(ex.Message);
+                }
            }
        }
    }