Adapt .NET bindings to new API
This commit is contained in:
parent
09048e2ea2
commit
bb709ff955
@ -202,10 +202,11 @@ namespace DeepSpeechClient
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to finish.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
|
||||
public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
|
||||
{
|
||||
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
|
||||
return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -218,6 +219,17 @@ namespace DeepSpeechClient
|
||||
return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
|
||||
}
|
||||
|
||||
/// <summary>
/// Runs an intermediate decoding on an ongoing streaming inference and returns the result as metadata.
/// </summary>
/// <param name="stream">Instance of the stream to decode.</param>
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
/// <returns>The managed metadata converted from the native result.</returns>
public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
    // Resolve the native stream handle first, then hand it to the native decoder.
    var nativeStream = stream.GetNativePointer();
    var nativeMetadata = NativeImp.DS_IntermediateDecodeWithMetadata(nativeStream, aNumResults);

    // Convert the returned native pointer into the managed Metadata model.
    return nativeMetadata.PtrToMetadata();
}
|
||||
|
||||
/// <summary>
|
||||
/// Return version of this library. The returned version is a semantic version
|
||||
/// (SemVer 2.0.0).
|
||||
@ -265,10 +277,11 @@ namespace DeepSpeechClient
|
||||
/// </summary>
|
||||
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
|
||||
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <returns>The extended metadata. Returns NULL on error.</returns>
|
||||
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
|
||||
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
|
||||
{
|
||||
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
|
||||
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
@ -50,11 +50,13 @@
|
||||
<Compile Include="Extensions\NativeExtensions.cs" />
|
||||
<Compile Include="Models\DeepSpeechStream.cs" />
|
||||
<Compile Include="Models\Metadata.cs" />
|
||||
<Compile Include="Models\MetadataItem.cs" />
|
||||
<Compile Include="Models\CandidateTranscript.cs" />
|
||||
<Compile Include="Models\TokenMetadata.cs" />
|
||||
<Compile Include="NativeImp.cs" />
|
||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
<Compile Include="Structs\Metadata.cs" />
|
||||
<Compile Include="Structs\MetadataItem.cs" />
|
||||
<Compile Include="Structs\CandidateTranscript.cs" />
|
||||
<Compile Include="Structs\TokenMetadata.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup />
|
||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||
|
@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a pointer into managed metadata object.
|
||||
/// Converts a pointer into managed TokenMetadata object.
|
||||
/// </summary>
|
||||
/// <param name="intPtr">Native pointer.</param>
|
||||
/// <returns>TokenMetadata managed object.</returns>
|
||||
private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
{
    // Copy the native struct into a managed value before reading its fields.
    var nativeToken = Marshal.PtrToStructure<TokenMetadata>(intPtr);

    var result = new Models.TokenMetadata();
    // The text pointer is deliberately not released here (releasePtr: false);
    // NOTE(review): presumably it is freed later together with the whole native metadata - confirm.
    result.Text = nativeToken.text.PtrToString(releasePtr: false);
    result.Timestep = nativeToken.timestep;
    result.StartTime = nativeToken.start_time;
    return result;
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a pointer into managed CandidateTranscript object.
|
||||
/// </summary>
|
||||
/// <param name="intPtr">Native pointer.</param>
|
||||
/// <returns>CandidateTranscript managed object.</returns>
|
||||
private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
{
    // Copy the native transcript header (tokens pointer, token count, confidence).
    var nativeTranscript = Marshal.PtrToStructure<CandidateTranscript>(intPtr);
    var tokenCount = nativeTranscript.num_tokens;
    var tokens = new Models.TokenMetadata[tokenCount];

    // The native tokens are read one by one: each element is converted, then the
    // cursor is advanced by the marshalled size of a single TokenMetadata struct.
    var tokenStride = Marshal.SizeOf(typeof(TokenMetadata));
    var cursor = nativeTranscript.tokens;
    for (var index = 0; index < tokenCount; index++)
    {
        tokens[index] = cursor.PtrToTokenMetadata();
        cursor += tokenStride;
    }

    return new Models.CandidateTranscript
    {
        Confidence = nativeTranscript.confidence,
        Tokens = tokens
    };
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a pointer into managed Metadata object.
|
||||
/// </summary>
|
||||
/// <param name="intPtr">Native pointer.</param>
|
||||
/// <returns>Metadata managed object.</returns>
|
||||
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
|
||||
{
|
||||
var managedMetaObject = new Models.Metadata();
|
||||
var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
|
||||
|
||||
managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
|
||||
managedMetaObject.Confidence = metaData.confidence;
|
||||
var managedMetadata = new Models.Metadata();
|
||||
var metadata = Marshal.PtrToStructure<Metadata>(intPtr);
|
||||
|
||||
managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
|
||||
|
||||
//we need to manually read each item from the native ptr using its size
|
||||
var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
|
||||
for (int i = 0; i < metaData.num_items; i++)
|
||||
var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
|
||||
for (int i = 0; i < metadata.num_transcripts; i++)
|
||||
{
|
||||
var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
|
||||
managedMetaObject.Items[i] = new Models.MetadataItem
|
||||
{
|
||||
Timestep = tempItem.timestep,
|
||||
StartTime = tempItem.start_time,
|
||||
Character = tempItem.character.PtrToString(releasePtr: false)
|
||||
};
|
||||
//we keep the offset on each read
|
||||
metaData.items += sizeOfMetaItem;
|
||||
managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
|
||||
metadata.transcripts += sizeOfCandidateTranscript;
|
||||
}
|
||||
|
||||
NativeImp.DS_FreeMetadata(intPtr);
|
||||
return managedMetaObject;
|
||||
return managedMetadata;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
|
||||
/// </summary>
|
||||
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
|
||||
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <returns>The extended metadata. Returns NULL on error.</returns>
|
||||
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
uint aBufferSize,
|
||||
uint aNumResults);
|
||||
|
||||
/// <summary>
|
||||
/// Destroy a streaming state without decoding the computed logits.
|
||||
@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
|
||||
/// <returns>The STT intermediate result.</returns>
|
||||
unsafe string IntermediateDecode(DeepSpeechStream stream);
|
||||
|
||||
/// <summary>
|
||||
/// Computes the intermediate decoding of an ongoing streaming inference.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to decode.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
|
||||
|
||||
/// <summary>
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
/// </summary>
|
||||
@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
|
||||
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
|
||||
/// </summary>
|
||||
/// <param name="stream">Instance of the stream to finish.</param>
|
||||
/// <param name="aNumResults">Number of candidate transcripts to return.</param>
|
||||
/// <returns>The extended metadata result.</returns>
|
||||
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
|
||||
unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,17 @@
|
||||
namespace DeepSpeechClient.Models
|
||||
{
|
||||
/// <summary>
|
||||
/// A single candidate transcript, with its confidence value and the metadata of its tokens.
|
||||
/// </summary>
|
||||
public class CandidateTranscript
|
||||
{
|
||||
/// <summary>
|
||||
/// Approximated confidence value for this transcription.
|
||||
/// </summary>
|
||||
public double Confidence { get; set; }
|
||||
/// <summary>
|
||||
/// List of metadata tokens containing text, timestep, and time offset.
|
||||
/// </summary>
|
||||
public TokenMetadata[] Tokens { get; set; }
|
||||
}
|
||||
}
|
@ -6,12 +6,8 @@
|
||||
public class Metadata
|
||||
{
|
||||
/// <summary>
|
||||
/// Approximated confidence value for this transcription.
|
||||
/// List of candidate transcripts.
|
||||
/// </summary>
|
||||
public double Confidence { get; set; }
|
||||
/// <summary>
|
||||
/// List of metadata items containing char, timestep, and time offset.
|
||||
/// </summary>
|
||||
public MetadataItem[] Items { get; set; }
|
||||
public CandidateTranscript[] Transcripts { get; set; }
|
||||
}
|
||||
}
|
@ -3,12 +3,12 @@
|
||||
/// <summary>
|
||||
/// Stores each individual character, along with its timing information.
|
||||
/// </summary>
|
||||
public class MetadataItem
|
||||
public class TokenMetadata
|
||||
{
|
||||
/// <summary>
|
||||
/// Char of the current timestep.
|
||||
/// </summary>
|
||||
public string Character;
|
||||
public string Text;
|
||||
/// <summary>
|
||||
/// Position of the character in units of 20ms.
|
||||
/// </summary>
|
@ -17,45 +17,46 @@ namespace DeepSpeechClient
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
|
||||
ref IntPtr** pint);
|
||||
ref IntPtr** pint);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx,
|
||||
uint aBeamWidth);
|
||||
uint aBeamWidth);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
|
||||
uint aBeamWidth,
|
||||
ref IntPtr** pint);
|
||||
uint aBeamWidth,
|
||||
ref IntPtr** pint);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
|
||||
string aScorerPath);
|
||||
string aScorerPath);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
|
||||
float aAlpha,
|
||||
float aBeta);
|
||||
float aAlpha,
|
||||
float aBeta);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
||||
CharSet = CharSet.Ansi, SetLastError = true)]
|
||||
internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
|
||||
short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
|
||||
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
|
||||
short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
short[] aBuffer,
|
||||
uint aBufferSize,
|
||||
uint aNumResults);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
|
||||
@ -76,18 +77,23 @@ namespace DeepSpeechClient
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
||||
CharSet = CharSet.Ansi, SetLastError = true)]
|
||||
internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx,
|
||||
short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
short[] aBuffer,
|
||||
uint aBufferSize);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
|
||||
uint aNumResults);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
|
||||
CharSet = CharSet.Ansi, SetLastError = true)]
|
||||
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
|
||||
|
||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
|
||||
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
|
||||
uint aNumResults);
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,22 @@
|
||||
using System;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace DeepSpeechClient.Structs
|
||||
{
|
||||
[StructLayout(LayoutKind.Sequential)]
|
||||
internal unsafe struct CandidateTranscript
|
||||
{
|
||||
/// <summary>
|
||||
/// Native list of tokens.
|
||||
/// </summary>
|
||||
internal unsafe IntPtr tokens;
|
||||
/// <summary>
|
||||
/// Count of tokens from the native side.
|
||||
/// </summary>
|
||||
internal unsafe int num_tokens;
|
||||
/// <summary>
|
||||
/// Approximated confidence value for this transcription.
|
||||
/// </summary>
|
||||
internal unsafe double confidence;
|
||||
}
|
||||
}
|
@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
|
||||
internal unsafe struct Metadata
|
||||
{
|
||||
/// <summary>
|
||||
/// Native list of items.
|
||||
/// Native list of candidate transcripts.
|
||||
/// </summary>
|
||||
internal unsafe IntPtr items;
|
||||
internal unsafe IntPtr transcripts;
|
||||
/// <summary>
|
||||
/// Count of items from the native side.
|
||||
/// Count of transcripts from the native side.
|
||||
/// </summary>
|
||||
internal unsafe int num_items;
|
||||
/// <summary>
|
||||
/// Approximated confidence value for this transcription.
|
||||
/// </summary>
|
||||
internal unsafe double confidence;
|
||||
internal unsafe int num_transcripts;
|
||||
}
|
||||
}
|
||||
|
@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
|
||||
namespace DeepSpeechClient.Structs
|
||||
{
|
||||
[StructLayout(LayoutKind.Sequential)]
|
||||
internal unsafe struct MetadataItem
|
||||
internal unsafe struct TokenMetadata
|
||||
{
|
||||
/// <summary>
|
||||
/// Native character.
|
||||
/// Native text.
|
||||
/// </summary>
|
||||
internal unsafe IntPtr character;
|
||||
internal unsafe IntPtr text;
|
||||
/// <summary>
|
||||
/// Position of the character in units of 20ms.
|
||||
/// </summary>
|
@ -21,14 +21,14 @@ namespace CSharpExamples
|
||||
static string GetArgument(IEnumerable<string> args, string option)
|
||||
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
|
||||
|
||||
static string MetadataToString(Metadata meta)
|
||||
static string MetadataToString(CandidateTranscript transcript)
|
||||
{
|
||||
var nl = Environment.NewLine;
|
||||
string retval =
|
||||
Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
|
||||
+ $"Confidence: {meta?.Confidence} {nl}"
|
||||
+ $"Item count: {meta?.Items?.Length} {nl}"
|
||||
+ string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
|
||||
Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
|
||||
+ $"Confidence: {transcript?.Confidence} {nl}"
|
||||
+ $"Item count: {transcript?.Tokens?.Length} {nl}"
|
||||
+ string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
|
||||
return retval;
|
||||
}
|
||||
|
||||
@ -75,8 +75,8 @@ namespace CSharpExamples
|
||||
if (extended)
|
||||
{
|
||||
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
|
||||
Convert.ToUInt32(waveBuffer.MaxSize / 2));
|
||||
speechResult = MetadataToString(metaResult);
|
||||
Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
|
||||
speechResult = MetadataToString(metaResult.Transcripts[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user