diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index 576ed308..ce184cf4 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -202,10 +202,11 @@ namespace DeepSpeechClient
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
/// Instance of the stream to finish.
+ /// Number of candidate transcripts to return.
/// The extended metadata result.
- public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
+ public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults)
{
- return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
+ return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
}
///
@@ -218,6 +219,17 @@ namespace DeepSpeechClient
return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString();
}
+ ///
+ /// Computes the intermediate decoding of an ongoing streaming inference.
+ ///
+ /// Instance of the stream to decode.
+ /// Number of candidate transcripts to return.
+ /// The extended metadata result.
+ public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults)
+ {
+ return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata();
+ }
+
///
/// Return version of this library. The returned version is a semantic version
/// (SemVer 2.0.0).
@@ -265,10 +277,11 @@ namespace DeepSpeechClient
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
+ /// Number of candidate transcripts to return.
/// The extended metadata. Returns NULL on error.
- public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+ public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults)
{
- return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
+ return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata();
}
#endregion
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
index b9077361..0139b3e8 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@@ -50,11 +50,13 @@
-
+
+
-
+
+
diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
index 6b7f4c6a..9325f4b8 100644
--- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
+++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
@@ -26,35 +26,68 @@ namespace DeepSpeechClient.Extensions
}
///
- /// Converts a pointer into managed metadata object.
+ /// Converts a pointer into managed TokenMetadata object.
+ ///
+ /// Native pointer.
+ /// TokenMetadata managed object.
+ private static Models.TokenMetadata PtrToTokenMetadata(this IntPtr intPtr)
+ {
+ var token = Marshal.PtrToStructure(intPtr);
+ var managedToken = new Models.TokenMetadata
+ {
+ Timestep = token.timestep,
+ StartTime = token.start_time,
+ Text = token.text.PtrToString(releasePtr: false)
+ };
+ return managedToken;
+ }
+
+ ///
+ /// Converts a pointer into managed CandidateTranscript object.
+ ///
+ /// Native pointer.
+ /// CandidateTranscript managed object.
+ private static Models.CandidateTranscript PtrToCandidateTranscript(this IntPtr intPtr)
+ {
+ var managedTranscript = new Models.CandidateTranscript();
+ var transcript = Marshal.PtrToStructure(intPtr);
+
+ managedTranscript.Tokens = new Models.TokenMetadata[transcript.num_tokens];
+ managedTranscript.Confidence = transcript.confidence;
+
+ //we need to manually read each item from the native ptr using its size
+ var sizeOfTokenMetadata = Marshal.SizeOf(typeof(TokenMetadata));
+ for (int i = 0; i < transcript.num_tokens; i++)
+ {
+ managedTranscript.Tokens[i] = transcript.tokens.PtrToTokenMetadata();
+ transcript.tokens += sizeOfTokenMetadata;
+ }
+
+ return managedTranscript;
+ }
+
+ ///
+ /// Converts a pointer into managed Metadata object.
///
/// Native pointer.
/// Metadata managed object.
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
{
- var managedMetaObject = new Models.Metadata();
- var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
-
- managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
- managedMetaObject.Confidence = metaData.confidence;
+ var managedMetadata = new Models.Metadata();
+ var metadata = Marshal.PtrToStructure(intPtr);
+ managedMetadata.Transcripts = new Models.CandidateTranscript[metadata.num_transcripts];
//we need to manually read each item from the native ptr using its size
- var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
- for (int i = 0; i < metaData.num_items; i++)
+ var sizeOfCandidateTranscript = Marshal.SizeOf(typeof(CandidateTranscript));
+ for (int i = 0; i < metadata.num_transcripts; i++)
{
- var tempItem = Marshal.PtrToStructure(metaData.items);
- managedMetaObject.Items[i] = new Models.MetadataItem
- {
- Timestep = tempItem.timestep,
- StartTime = tempItem.start_time,
- Character = tempItem.character.PtrToString(releasePtr: false)
- };
- //we keep the offset on each read
- metaData.items += sizeOfMetaItem;
+ managedMetadata.Transcripts[i] = metadata.transcripts.PtrToCandidateTranscript();
+ metadata.transcripts += sizeOfCandidateTranscript;
}
+
NativeImp.DS_FreeMetadata(intPtr);
- return managedMetaObject;
+ return managedMetadata;
}
}
}
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index 18677abc..ae3e72cf 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -72,9 +72,11 @@ namespace DeepSpeechClient.Interfaces
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
+ /// Number of candidate transcripts to return.
/// The extended metadata. Returns NULL on error.
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
- uint aBufferSize);
+ uint aBufferSize,
+ uint aNumResults);
///
/// Destroy a streaming state without decoding the computed logits.
@@ -102,6 +104,14 @@ namespace DeepSpeechClient.Interfaces
/// The STT intermediate result.
unsafe string IntermediateDecode(DeepSpeechStream stream);
+ ///
+ /// Computes the intermediate decoding of an ongoing streaming inference.
+ ///
+ /// Instance of the stream to decode.
+ /// Number of candidate transcripts to return.
+ /// The extended metadata result.
+ unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults);
+
///
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
@@ -113,7 +123,8 @@ namespace DeepSpeechClient.Interfaces
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
/// Instance of the stream to finish.
+ /// Number of candidate transcripts to return.
/// The extended metadata result.
- unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
+ unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults);
}
}
diff --git a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
new file mode 100644
index 00000000..cc6b5d28
--- /dev/null
+++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs
@@ -0,0 +1,17 @@
+namespace DeepSpeechClient.Models
+{
+ ///
+ /// Stores a single candidate transcript as an array of token metadata objects, along with its confidence value.
+ ///
+ public class CandidateTranscript
+ {
+ ///
+ /// Approximated confidence value for this transcription.
+ ///
+ public double Confidence { get; set; }
+ ///
+ /// List of metadata tokens containing text, timestep, and time offset.
+ ///
+ public TokenMetadata[] Tokens { get; set; }
+ }
+}
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
index 870eb162..fb6c613d 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs
@@ -6,12 +6,8 @@
public class Metadata
{
///
- /// Approximated confidence value for this transcription.
+ /// List of candidate transcripts.
///
- public double Confidence { get; set; }
- ///
- /// List of metada items containing char, timespet, and time offset.
- ///
- public MetadataItem[] Items { get; set; }
+ public CandidateTranscript[] Transcripts { get; set; }
}
}
\ No newline at end of file
diff --git a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
similarity index 89%
rename from native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
rename to native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
index e329c6cb..5f2dea56 100644
--- a/native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs
+++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs
@@ -3,12 +3,12 @@
///
/// Stores each individual character, along with its timing information.
///
- public class MetadataItem
+ public class TokenMetadata
{
///
/// Char of the current timestep.
///
- public string Character;
+ public string Text;
///
/// Position of the character in units of 20ms.
///
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
index 6c3494b6..eabbfe48 100644
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@@ -17,45 +17,46 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
- ref IntPtr** pint);
+ ref IntPtr** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx,
- uint aBeamWidth);
+ uint aBeamWidth);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
- uint aBeamWidth,
- ref IntPtr** pint);
+ uint aBeamWidth,
+ ref IntPtr** pint);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx,
- string aScorerPath);
+ string aScorerPath);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx,
- float aAlpha,
- float aBeta);
+ float aAlpha,
+ float aBeta);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
- short[] aBuffer,
- uint aBufferSize);
+ short[] aBuffer,
+ uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
- short[] aBuffer,
- uint aBufferSize);
+ short[] aBuffer,
+ uint aBufferSize,
+ uint aNumResults);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
@@ -76,18 +77,23 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx,
- short[] aBuffer,
- uint aBufferSize);
+ short[] aBuffer,
+ uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx);
+ [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
+ internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx,
+ uint aNumResults);
+
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
- internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx);
+ internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx,
+ uint aNumResults);
#endregion
}
}
diff --git a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs
new file mode 100644
index 00000000..54581f6f
--- /dev/null
+++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs
@@ -0,0 +1,22 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace DeepSpeechClient.Structs
+{
+ [StructLayout(LayoutKind.Sequential)]
+ internal unsafe struct CandidateTranscript
+ {
+ ///
+ /// Native list of tokens.
+ ///
+ internal unsafe IntPtr tokens;
+ ///
+ /// Count of tokens from the native side.
+ ///
+ internal unsafe int num_tokens;
+ ///
+ /// Approximated confidence value for this transcription.
+ ///
+ internal unsafe double confidence;
+ }
+}
diff --git a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs
index 411da9f2..0a9beddc 100644
--- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs
+++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs
@@ -7,16 +7,12 @@ namespace DeepSpeechClient.Structs
internal unsafe struct Metadata
{
///
- /// Native list of items.
+ /// Native list of candidate transcripts.
///
- internal unsafe IntPtr items;
+ internal unsafe IntPtr transcripts;
///
- /// Count of items from the native side.
+ /// Count of transcripts from the native side.
///
- internal unsafe int num_items;
- ///
- /// Approximated confidence value for this transcription.
- ///
- internal unsafe double confidence;
+ internal unsafe int num_transcripts;
}
}
diff --git a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs
similarity index 80%
rename from native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs
rename to native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs
index 10092742..1c660c71 100644
--- a/native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs
+++ b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs
@@ -4,12 +4,12 @@ using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
- internal unsafe struct MetadataItem
+ internal unsafe struct TokenMetadata
{
///
- /// Native character.
+ /// Native text.
///
- internal unsafe IntPtr character;
+ internal unsafe IntPtr text;
///
/// Position of the character in units of 20ms.
///
diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs
index b35c7046..a08e44b6 100644
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@@ -21,14 +21,14 @@ namespace CSharpExamples
static string GetArgument(IEnumerable args, string option)
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
- static string MetadataToString(Metadata meta)
+ static string MetadataToString(CandidateTranscript transcript)
{
var nl = Environment.NewLine;
string retval =
- Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
- + $"Confidence: {meta?.Confidence} {nl}"
- + $"Item count: {meta?.Items?.Length} {nl}"
- + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
+ Environment.NewLine + $"Recognized text: {string.Join("", transcript?.Tokens?.Select(x => x.Text))} {nl}"
+ + $"Confidence: {transcript?.Confidence} {nl}"
+ + $"Item count: {transcript?.Tokens?.Length} {nl}"
+ + string.Join(nl, transcript?.Tokens?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Text}"));
return retval;
}
@@ -75,8 +75,8 @@ namespace CSharpExamples
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
- Convert.ToUInt32(waveBuffer.MaxSize / 2));
- speechResult = MetadataToString(metaResult);
+ Convert.ToUInt32(waveBuffer.MaxSize / 2), 1);
+ speechResult = MetadataToString(metaResult.Transcripts[0]);
}
else
{