Adapt .NET bindings to new API

2020-02-25 14:18:23 +01:00 · 2020-02-25 14:18:23 +01:00 · bb709ff955
commit bb709ff955
parent 09048e2ea2
12 changed files with 162 additions and 66 deletions
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@ -202,10 +202,11 @@ namespace DeepSpeechClient
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
        /// <returns>The extended metadata result.</returns>
-        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+        public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
        {
-            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+            return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
        }

        /// <summary>
@ -218,6 +219,17 @@ namespace DeepSpeechClient
            return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
        }

+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <returns>The extended metadata result.</returns>
+        public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+        {
+            return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+        }
+
        /// <summary>
        /// Return version of this library. The returned version is a semantic version
        /// (SemVer 2.0.0).
@ -265,10 +277,11 @@ namespace DeepSpeechClient
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
-        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+        public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
        {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
        }

        #endregion
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@ -50,11 +50,13 @@
    <Compile Include="Extensions\NativeExtensions.cs" />
    <Compile Include="Models\DeepSpeechStream.cs" />
    <Compile Include="Models\Metadata.cs" />
-    <Compile Include="Models\MetadataItem.cs" />
+    <Compile Include="Models\CandidateTranscript.cs" />
+    <Compile Include="Models\TokenMetadata.cs" />
    <Compile Include="NativeImp.cs" />
    <Compile Include="Properties\AssemblyInfo.cs" />
    <Compile Include="Structs\Metadata.cs" />
-    <Compile Include="Structs\MetadataItem.cs" />
+    <Compile Include="Structs\CandidateTranscript.cs" />
+    <Compile Include="Structs\TokenMetadata.cs" />
  </ItemGroup>
  <ItemGroup />
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
--- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
+++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
        }

        /// <summary>
-        /// Converts a pointer into managed metadata object.
+        /// Converts a pointer into managed TokenMetadata object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>TokenMetadata managed object.</returns>
+        private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+        {
+            var token = Marshal.PtrToStructure<TokenMetadata>(intPtr);
+            var managedToken = new Models.TokenMetadata
+            {
+                Timestep = token.timestep,
+                StartTime = token.start_time,
+                Text = token.text.PtrToString(releasePtr: false)
+            };
+            return managedToken;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed CandidateTranscript object.
+        /// </summary>
+        /// <param name="intPtr">Native pointer.</param>
+        /// <returns>CandidateTranscript managed object.</returns>
+        private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+        {
+            var managedTranscript = new Models.CandidateTranscript();
+            var transcript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
+
+            managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+            managedTranscript.Confidence = transcript.confidence;
+
+            //we need to manually read each item from the native ptr using its size
+            var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+            for (int i = 0; i < transcript.num_tokens; i++)
+            {
+                managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+                transcript.tokens += sizeOfTokenMetadata;
+            }
+
+            return managedTranscript;
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed Metadata object.
        /// </summary>
        /// <param name="intPtr">Native pointer.</param>
        /// <returns>Metadata managed object.</returns>
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
        {
-            var managedMetaObject = new Models.Metadata();
-            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
-            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
-            managedMetaObject.Confidence = metaData.confidence;
+            var managedMetadata = new Models.Metadata();
+            var metadata = Marshal.PtrToStructure<Metadata>(intPtr);

+            managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];

            //we need to manually read each item from the native ptr using its size
-            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
-            for (int i = 0; i < metaData.num_items; i++)
+            var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+            for (int i = 0; i < metadata.num_transcripts; i++)
            {
-                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
-                managedMetaObject.Items[i] = new Models.MetadataItem
-                {
-                    Timestep = tempItem.timestep,
-                    StartTime = tempItem.start_time,
-                    Character = tempItem.character.PtrToString(releasePtr: false)
-                };
-                //we keep the offset on each read
-                metaData.items += sizeOfMetaItem;
+                managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+                metadata.transcripts += sizeOfCandidateTranscript;
            }
+
            NativeImp.DS_FreeMetadata(intPtr);
-            return managedMetaObject;
+            return managedMetadata;
        }
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
        /// <returns>The extended metadata. Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                uint aBufferSize);
+                uint aBufferSize,
+                uint aNumResults);

        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
        /// <returns>The STT intermediate result.</returns>
        unsafe string IntermediateDecode(DeepSpeechStream stream);

+        /// <summary>
+        /// Computes the intermediate decoding of an ongoing streaming inference.
+        /// </summary>
+        /// <param name="stream">Instance of the stream to decode.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
+        /// <returns>The extended metadata result.</returns>
+        unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
        /// <param name="stream">Instance of the stream to finish.</param>
+        /// <param name="aNumResults">Number of candidate transcripts to return.</param>
        /// <returns>The extended metadata result.</returns>
-        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+        unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+    /// <summary>
+    /// Stores a single candidate transcript: its confidence value and an array of token metadata objects.
+    /// </summary>
+    public class CandidateTranscript
+    {
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        public double Confidence { get; set; }
+        /// <summary>
+        /// List of metadata tokens containing text, timestep, and time offset.
+        /// </summary>
+        public TokenMetadata[] Tokens { get; set; }
+    }
+}
--- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
@ -6,12 +6,8 @@
    public class Metadata
    {
        /// <summary>
-        /// Approximated confidence value for this transcription.
+        /// List of candidate transcripts.
        /// </summary>
-        public double Confidence { get; set; }
-        /// <summary>
-        /// List of metada items containing char, timespet, and time offset.
-        /// </summary>
-        public MetadataItem[] Items { get; set; }
+        public CandidateTranscript[] Transcripts { get; set; }
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
@ -3,12 +3,12 @@
    /// <summary>
    /// Stores each individual character, along with its timing information.
    /// </summary>
-    public class MetadataItem
+    public class TokenMetadata
    {
        /// <summary>
        /// Char of the current timestep.
        /// </summary>
-        public string Character;
+        public string Text;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@ -55,7 +55,8 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
            short[] aBuffer,
-                uint aBufferSize);
+            uint aBufferSize,
+            uint aNumResults);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@ -82,12 +83,17 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);

+        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+        internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
+            uint aNumResults);
+
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
            CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
-        internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
+        internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
+            uint aNumResults);
        #endregion
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs
+++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs
@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+    [StructLayout(LayoutKind.Sequential)]
+    internal unsafe struct CandidateTranscript
+    {
+        /// <summary>
+        /// Native list of tokens.
+        /// </summary>
+        internal unsafe IntPtr tokens;
+        /// <summary>
+        /// Count of tokens from the native side.
+        /// </summary>
+        internal unsafe int num_tokens;
+        /// <summary>
+        /// Approximated confidence value for this transcription.
+        /// </summary>
+        internal unsafe double confidence;
+    }
+}
--- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs
@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
    internal unsafe struct Metadata
    {
        /// <summary>
-        /// Native list of items.
+        /// Native list of candidate transcripts.
        /// </summary>
-        internal unsafe IntPtr items;
+        internal unsafe IntPtr transcripts;
        /// <summary>
-        /// Count of items from the native side.
+        /// Count of transcripts from the native side.
        /// </summary>
-        internal unsafe int num_items;
-        /// <summary>
-        /// Approximated confidence value for this transcription.
-        /// </summary>
-        internal unsafe double confidence;
+        internal unsafe int num_transcripts;
    }
 }
--- a/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs
@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
 namespace DeepSpeechClient.Structs
 {
    [StructLayout(LayoutKind.Sequential)]
-    internal unsafe struct MetadataItem
+    internal unsafe struct TokenMetadata
    {
        /// <summary>
-        /// Native character.
+        /// Native text.
        /// </summary>
-        internal unsafe IntPtr character;
+        internal unsafe IntPtr text;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@ -21,14 +21,14 @@ namespace CSharpExamples
        static string GetArgument(IEnumerable<string> args, string option)
        => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();

-        static string MetadataToString(Metadata meta)
+        static string MetadataToString(CandidateTranscript transcript)
        {
            var nl = Environment.NewLine;
            string retval =
-             Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
-             + $"Confidence: {meta?.Confidence} {nl}"
-             + $"Item count: {meta?.Items?.Length} {nl}"
-             + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+             Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+             + $"Confidence: {transcript?.Confidence} {nl}"
+             + $"Item count: {transcript?.Tokens?.Length} {nl}"
+             + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
            return retval;
        }

@ -75,8 +75,8 @@ namespace CSharpExamples
                        if (extended)
                        {
                            Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
-                                Convert.ToUInt32(waveBuffer.MaxSize / 2));
-                            speechResult = MetadataToString(metaResult);
+                                Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+                            speechResult = MetadataToString(metaResult.Transcripts[0]);
                        }
                        else
                        {