Adapt .NET bindings to new API

This commit is contained in:
Reuben Morais 2020-02-25 14:18:23 +01:00
parent 09048e2ea2
commit bb709ff955
12 changed files with 162 additions and 66 deletions

View File

@ -202,10 +202,11 @@ namespace DeepSpeechClient
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
}
/// <summary>
@ -218,6 +219,17 @@ namespace DeepSpeechClient
return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
    // Hand the stream's native pointer to the native decoder, requesting aNumResults candidate transcripts.
    var nativeStream = stream.GetNativePointer();
    var nativeMetadata = NativeImp.DS_IntermediateDecodeWithMetadata(nativeStream, aNumResults);

    // Convert the returned native pointer into the managed Metadata object.
    return nativeMetadata.PtrToMetadata();
}
/// <summary>
/// Return version of this library. The returned version is a semantic version
/// (SemVer 2.0.0).
@ -265,10 +277,11 @@ namespace DeepSpeechClient
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
}
#endregion

View File

@ -50,11 +50,13 @@
<Compile Include="Extensions\NativeExtensions.cs" />
<Compile Include="Models\DeepSpeechStream.cs" />
<Compile Include="Models\Metadata.cs" />
<Compile Include="Models\MetadataItem.cs" />
<Compile Include="Models\CandidateTranscript.cs" />
<Compile Include="Models\TokenMetadata.cs" />
<Compile Include="NativeImp.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Structs\Metadata.cs" />
<Compile Include="Structs\MetadataItem.cs" />
<Compile Include="Structs\CandidateTranscript.cs" />
<Compile Include="Structs\TokenMetadata.cs" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

View File

@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
}
/// <summary>
/// Converts a pointer into managed metadata object.
/// Converts a pointer into managed TokenMetadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>TokenMetadata managed object.</returns>
private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
{
    // Copy the native struct into a managed value before reading its fields.
    var nativeToken = Marshal.PtrToStructure<TokenMetadata>(intPtr);

    return new Models.TokenMetadata
    {
        Timestep = nativeToken.timestep,
        StartTime = nativeToken.start_time,
        // releasePtr: false is passed through to PtrToString.
        // NOTE(review): presumably the native string is freed together with the owning metadata — confirm against PtrToString.
        Text = nativeToken.text.PtrToString(releasePtr: false)
    };
}
/// <summary>
/// Converts a pointer into managed CandidateTranscript object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>CandidateTranscript managed object.</returns>
private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
{
    var nativeTranscript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
    var tokens = new Models.TokenMetadata[nativeTranscript.num_tokens];

    // The tokens pointer is walked as a contiguous run of TokenMetadata structs:
    // read one struct, then advance the cursor by the marshalled struct size.
    var stride = Marshal.SizeOf(typeof(TokenMetadata));
    var cursor = nativeTranscript.tokens;
    for (var index = 0; index < nativeTranscript.num_tokens; index++)
    {
        tokens[index] = cursor.PtrToTokenMetadata();
        cursor += stride;
    }

    return new Models.CandidateTranscript
    {
        Tokens = tokens,
        Confidence = nativeTranscript.confidence
    };
}
/// <summary>
/// Converts a pointer into managed Metadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>Metadata managed object.</returns>
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
{
var managedMetaObject = new Models.Metadata();
var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
managedMetaObject.Confidence = metaData.confidence;
var managedMetadata = new Models.Metadata();
var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
//we need to manually read each item from the native ptr using its size
var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
for (int i = 0; i < metaData.num_items; i++)
var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
for (int i = 0; i < metadata.num_transcripts; i++)
{
var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
managedMetaObject.Items[i] = new Models.MetadataItem
{
Timestep = tempItem.timestep,
StartTime = tempItem.start_time,
Character = tempItem.character.PtrToString(releasePtr: false)
};
//we keep the offset on each read
metaData.items += sizeOfMetaItem;
managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
metadata.transcripts += sizeOfCandidateTranscript;
}
NativeImp.DS_FreeMetadata(intPtr);
return managedMetaObject;
return managedMetadata;
}
}
}

View File

@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize);
uint aBufferSize,
uint aNumResults);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
/// <returns>The STT intermediate result.</returns>
unsafe string IntermediateDecode(DeepSpeechStream stream);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference, including metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <param name="stream">Instance of the stream to finish.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The extended metadata result.</returns>
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
}

View File

@ -0,0 +1,17 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// A single candidate transcript: a confidence value together with the
/// array of token metadata that makes up the transcription.
/// </summary>
public class CandidateTranscript
{
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
public double Confidence { get; set; }
/// <summary>
/// Array of metadata tokens containing text, timestep, and time offset.
/// </summary>
public TokenMetadata[] Tokens { get; set; }
}
}

View File

@ -6,12 +6,8 @@
public class Metadata
{
/// <summary>
/// Approximated confidence value for this transcription.
/// List of candidate transcripts.
/// </summary>
public double Confidence { get; set; }
/// <summary>
/// List of metada items containing char, timespet, and time offset.
/// </summary>
public MetadataItem[] Items { get; set; }
public CandidateTranscript[] Transcripts { get; set; }
}
}

View File

@ -3,12 +3,12 @@
/// <summary>
/// Stores the text of an individual token, along with its timing information.
/// </summary>
public class MetadataItem
public class TokenMetadata
{
/// <summary>
/// Text of the current token.
/// </summary>
public string Character;
public string Text;
/// <summary>
/// Position of the token in units of 20ms.
/// </summary>

View File

@ -55,7 +55,8 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
short[] aBuffer,
uint aBufferSize);
uint aBufferSize,
uint aNumResults);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@ -82,12 +83,17 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
/// <summary>
/// P/Invoke binding for the DS_IntermediateDecodeWithMetadata export of libdeepspeech.so (Cdecl calling convention).
/// </summary>
/// <param name="aSctx">Native stream pointer, as obtained from DeepSpeechStream.GetNativePointer().</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>Native pointer to the metadata result; callers convert it to a managed object with PtrToMetadata.</returns>
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
uint aNumResults);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
uint aNumResults);
#endregion
}
}

View File

@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
/// <summary>
/// Sequential-layout struct used to marshal one candidate transcript from a native pointer.
/// NOTE(review): field order and types presumably mirror the native CandidateTranscript
/// struct — confirm against the native header before changing anything here.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct CandidateTranscript
{
/// <summary>
/// Native list of tokens.
/// </summary>
internal unsafe IntPtr tokens;
/// <summary>
/// Count of tokens from the native side.
/// </summary>
internal unsafe int num_tokens;
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
internal unsafe double confidence;
}
}

View File

@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
internal unsafe struct Metadata
{
/// <summary>
/// Native list of items.
/// Native list of candidate transcripts.
/// </summary>
internal unsafe IntPtr items;
internal unsafe IntPtr transcripts;
/// <summary>
/// Count of items from the native side.
/// Count of transcripts from the native side.
/// </summary>
internal unsafe int num_items;
/// <summary>
/// Approximated confidence value for this transcription.
/// </summary>
internal unsafe double confidence;
internal unsafe int num_transcripts;
}
}

View File

@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct MetadataItem
internal unsafe struct TokenMetadata
{
/// <summary>
/// Native character.
/// Native text.
/// </summary>
internal unsafe IntPtr character;
internal unsafe IntPtr text;
/// <summary>
/// Position of the token in units of 20ms.
/// </summary>

View File

@ -21,14 +21,14 @@ namespace CSharpExamples
static string GetArgument(IEnumerable<string> args, string option)
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
static string MetadataToString(Metadata meta)
static string MetadataToString(CandidateTranscript transcript)
{
var nl = Environment.NewLine;
string retval =
Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
+ $"Confidence: {meta?.Confidence} {nl}"
+ $"Item count: {meta?.Items?.Length} {nl}"
+ string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+ $"Confidence: {transcript?.Confidence} {nl}"
+ $"Item count: {transcript?.Tokens?.Length} {nl}"
+ string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
return retval;
}
@ -75,8 +75,8 @@ namespace CSharpExamples
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
Convert.ToUInt32(waveBuffer.MaxSize / 2));
speechResult = MetadataToString(metaResult);
Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
speechResult = MetadataToString(metaResult.Transcripts[0]);
}
else
{