From bb709ff9553f513afa20bde601fe03b7539a6759 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 25 Feb 2020 14:18:23 +0100 Subject: [PATCH] Adapt .NET bindings to new API --- .../dotnet/DeepSpeechClient/DeepSpeech.cs | 21 ++++-- .../DeepSpeechClient/DeepSpeechClient.csproj | 6 +- .../Extensions/NativeExtensions.cs | 69 ++++++++++++++----- .../Interfaces/IDeepSpeech.cs | 15 +++- .../Models/CandidateTranscript.cs | 17 +++++ .../DeepSpeechClient/Models/Metadata.cs | 8 +-- .../{MetadataItem.cs => TokenMetadata.cs} | 4 +- .../dotnet/DeepSpeechClient/NativeImp.cs | 34 +++++---- .../Structs/CandidateTranscript.cs | 22 ++++++ .../DeepSpeechClient/Structs/Metadata.cs | 12 ++-- .../{MetadataItem.cs => TokenMetadata.cs} | 6 +- .../dotnet/DeepSpeechConsole/Program.cs | 14 ++-- 12 files changed, 162 insertions(+), 66 deletions(-) create mode 100644 native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Models/{MetadataItem.cs => TokenMetadata.cs} (89%) create mode 100644 native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs rename native_client/dotnet/DeepSpeechClient/Structs/{MetadataItem.cs => TokenMetadata.cs} (80%) diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 576ed308..ce184cf4 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -202,10 +202,11 @@ namespace DeepSpeechClient /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. + /// Number of candidate transcripts to return. /// The extended metadata result. 
- public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream) + public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { - return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -218,6 +219,17 @@ namespace DeepSpeechClient return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString(); } + /// + /// Computes the intermediate decoding of an ongoing streaming inference. + /// + /// Instance of the stream to decode. + /// Number of candidate transcripts to return. + /// The extended metadata result. + public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) + { + return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + } + /// /// Return version of this library. The returned version is a semantic version /// (SemVer 2.0.0). @@ -265,10 +277,11 @@ namespace DeepSpeechClient /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. + /// Number of candidate transcripts to return. /// The extended metadata. Returns NULL on error. 
- public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize) + public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { - return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata(); + return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata(); } #endregion diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj index b9077361..0139b3e8 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj @@ -50,11 +50,13 @@ - + + - + + diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs index 6b7f4c6a..9325f4b8 100644 --- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs +++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs @@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions } /// - /// Converts a pointer into managed metadata object. + /// Converts a pointer into managed TokenMetadata object. + /// + /// Native pointer. + /// TokenMetadata managed object. + private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr) + { + var token = Marshal.PtrToStructure(intPtr); + var managedToken = new Models.TokenMetadata + { + Timestep = token.timestep, + StartTime = token.start_time, + Text = token.text.PtrToString(releasePtr: false) + }; + return managedToken; + } + + /// + /// Converts a pointer into managed CandidateTranscript object. + /// + /// Native pointer. + /// CandidateTranscript managed object. 
+ private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr) + { + var managedTranscript = new Models.CandidateTranscript(); + var transcript = Marshal.PtrToStructure(intPtr); + + managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens]; + managedTranscript.Confidence = transcript.confidence; + + //we need to manually read each item from the native ptr using its size + var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata)); + for (int i = 0; i < transcript.num_tokens; i++) + { + managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata(); + transcript.tokens += sizeOfTokenMetadata; + } + + return managedTranscript; + } + + /// + /// Converts a pointer into managed Metadata object. /// /// Native pointer. /// Metadata managed object. internal static Models.Metadata PtrToMetadata(this IntPtr intPtr) { - var managedMetaObject = new Models.Metadata(); - var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata)); - - managedMetaObject.Items = new Models.MetadataItem[metaData.num_items]; - managedMetaObject.Confidence = metaData.confidence; + var managedMetadata = new Models.Metadata(); + var metadata = Marshal.PtrToStructure(intPtr); + managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts]; //we need to manually read each item from the native ptr using its size - var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem)); - for (int i = 0; i < metaData.num_items; i++) + var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript)); + for (int i = 0; i < metadata.num_transcripts; i++) { - var tempItem = Marshal.PtrToStructure(metaData.items); - managedMetaObject.Items[i] = new Models.MetadataItem - { - Timestep = tempItem.timestep, - StartTime = tempItem.start_time, - Character = tempItem.character.PtrToString(releasePtr: false) - }; - //we keep the offset on each read - metaData.items += sizeOfMetaItem; + managedMetadata.Transcripts[i] = 
metadata.transcripts.PtrToCandidateTranscript(); + metadata.transcripts += sizeOfCandidateTranscript; } + NativeImp.DS_FreeMetadata(intPtr); - return managedMetaObject; + return managedMetadata; } } } diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 18677abc..ae3e72cf 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. + /// Number of candidate transcripts to return. /// The extended metadata. Returns NULL on error. unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, - uint aBufferSize); + uint aBufferSize, + uint aNumResults); /// /// Destroy a streaming state without decoding the computed logits. @@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces /// The STT intermediate result. unsafe string IntermediateDecode(DeepSpeechStream stream); + /// + /// Computes the intermediate decoding of an ongoing streaming inference. + /// + /// Instance of the stream to decode. + /// Number of candidate transcripts to return. + /// The extended metadata result. + unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); + /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// @@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. + /// Number of candidate transcripts to return. /// The extended metadata result. 
- unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream); + unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); } } diff --git a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs new file mode 100644 index 00000000..cc6b5d28 --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs @@ -0,0 +1,17 @@ +namespace DeepSpeechClient.Models +{ + /// + /// Stores a single candidate transcript as an array of token metadata objects, together with its confidence. + /// + public class CandidateTranscript + { + /// + /// Approximated confidence value for this transcription. + /// + public double Confidence { get; set; } + /// + /// List of metadata tokens containing text, timestep, and time offset. + /// + public TokenMetadata[] Tokens { get; set; } + } +} \ No newline at end of file diff --git a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs index 870eb162..fb6c613d 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs @@ -6,12 +6,8 @@ public class Metadata { /// - /// Approximated confidence value for this transcription. + /// List of candidate transcripts. /// - public double Confidence { get; set; } - /// - /// List of metada items containing char, timespet, and time offset. 
- /// - public MetadataItem[] Items { get; set; } + public CandidateTranscript[] Transcripts { get; set; } } } \ No newline at end of file diff --git a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs similarity index 89% rename from native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs rename to native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs index e329c6cb..5f2dea56 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs @@ -3,12 +3,12 @@ /// /// Stores each individual character, along with its timing information. /// - public class MetadataItem + public class TokenMetadata { /// /// Char of the current timestep. /// - public string Character; + public string Text; /// /// Position of the character in units of 20ms. /// diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 6c3494b6..eabbfe48 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -17,45 +17,46 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - ref IntPtr** pint); + ref IntPtr** pint); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx, - uint aBeamWidth); + uint aBeamWidth); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - uint aBeamWidth, - ref IntPtr** pint); + uint aBeamWidth, + ref IntPtr** pint); 
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, - string aScorerPath); + string aScorerPath); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, - float aAlpha, - float aBeta); + float aAlpha, + float aBeta); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize, + uint aNumResults); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern void DS_FreeModel(IntPtr** aCtx); @@ -76,18 +77,23 @@ namespace DeepSpeechClient [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx, - short[] aBuffer, - uint aBufferSize); + short[] aBuffer, + uint aBufferSize); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe 
extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx, + uint aNumResults); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi, SetLastError = true)] internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx); [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx); + internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx, + uint aNumResults); #endregion } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs new file mode 100644 index 00000000..54581f6f --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs @@ -0,0 +1,22 @@ +using System; +using System.Runtime.InteropServices; + +namespace DeepSpeechClient.Structs +{ + [StructLayout(LayoutKind.Sequential)] + internal unsafe struct CandidateTranscript + { + /// + /// Native list of tokens. + /// + internal unsafe IntPtr tokens; + /// + /// Count of tokens from the native side. + /// + internal unsafe int num_tokens; + /// + /// Approximated confidence value for this transcription. + /// + internal unsafe double confidence; + } +} diff --git a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs index 411da9f2..0a9beddc 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs @@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs internal unsafe struct Metadata { /// - /// Native list of items. + /// Native list of candidate transcripts. /// - internal unsafe IntPtr items; + internal unsafe IntPtr transcripts; /// - /// Count of items from the native side. + /// Count of transcripts from the native side. 
/// - internal unsafe int num_items; - /// - /// Approximated confidence value for this transcription. - /// - internal unsafe double confidence; + internal unsafe int num_transcripts; } } diff --git a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs similarity index 80% rename from native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs rename to native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs index 10092742..1c660c71 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs @@ -4,12 +4,12 @@ using System.Runtime.InteropServices; namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] - internal unsafe struct MetadataItem + internal unsafe struct TokenMetadata { /// - /// Native character. + /// Native text. /// - internal unsafe IntPtr character; + internal unsafe IntPtr text; /// /// Position of the character in units of 20ms. 
/// diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs index b35c7046..a08e44b6 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -21,14 +21,14 @@ namespace CSharpExamples static string GetArgument(IEnumerable args, string option) => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault(); - static string MetadataToString(Metadata meta) + static string MetadataToString(CandidateTranscript transcript) { var nl = Environment.NewLine; string retval = - Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}" - + $"Confidence: {meta?.Confidence} {nl}" - + $"Item count: {meta?.Items?.Length} {nl}" - + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}")); + Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}" + + $"Confidence: {transcript?.Confidence} {nl}" + + $"Item count: {transcript?.Tokens?.Length} {nl}" + + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}")); return retval; } @@ -75,8 +75,8 @@ namespace CSharpExamples if (extended) { Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, - Convert.ToUInt32(waveBuffer.MaxSize / 2)); - speechResult = MetadataToString(metaResult); + Convert.ToUInt32(waveBuffer.MaxSize / 2), 1); + speechResult = MetadataToString(metaResult.Transcripts[0]); } else {