diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst
index f9818d64..2ba3415f 100644
--- a/doc/DotNet-API.rst
+++ b/doc/DotNet-API.rst
@@ -15,6 +15,13 @@ DeepSpeech Class
:project: deepspeech-dotnet
:members:
+DeepSpeechStream Class
+----------------------
+
+.. doxygenclass:: DeepSpeechClient::DeepSpeechStream
+ :project: deepspeech-dotnet
+ :members:
+
ErrorCodes
----------
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index a674c699..1260d926 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -4,6 +4,7 @@ using DeepSpeechClient.Extensions;
using System;
using System.IO;
using DeepSpeechClient.Enums;
+using DeepSpeechClient.Models;
namespace DeepSpeechClient
{
@@ -13,14 +14,16 @@ namespace DeepSpeechClient
public class DeepSpeech : IDeepSpeech
{
private unsafe IntPtr** _modelStatePP;
- private unsafe IntPtr** _streamingStatePP;
-
-
-
-
- public DeepSpeech()
+
+ ///
+ /// Initializes a new instance of the DeepSpeech class and creates a new acoustic model.
+ ///
+ /// The path to the frozen model graph.
+ /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
+ /// Thrown when the native binary failed to create the model.
+ public DeepSpeech(string aModelPath, uint aBeamWidth)
{
-
+ CreateModel(aModelPath, aBeamWidth);
}
#region IDeepSpeech
@@ -31,7 +34,7 @@ namespace DeepSpeechClient
/// The path to the frozen model graph.
/// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
/// Thrown when the native binary failed to create the model.
- public unsafe void CreateModel(string aModelPath,
+ private unsafe void CreateModel(string aModelPath,
uint aBeamWidth)
{
string exceptionMessage = null;
@@ -118,10 +121,19 @@ namespace DeepSpeechClient
/// The alpha hyperparameter of the CTC decoder. Language Model weight.
/// The beta hyperparameter of the CTC decoder. Word insertion weight.
/// Thrown when the native binary failed to enable decoding with a language model.
+ /// Thrown when the language model or trie file cannot be found.
public unsafe void EnableDecoderWithLM(string aLMPath, string aTriePath,
float aLMAlpha, float aLMBeta)
{
string exceptionMessage = null;
+ if (string.IsNullOrWhiteSpace(aLMPath))
+ {
+ exceptionMessage = "Path to the language model file cannot be empty.";
+ }
+ if (!File.Exists(aLMPath))
+ {
+ exceptionMessage = $"Cannot find the language model file: {aLMPath}";
+ }
if (string.IsNullOrWhiteSpace(aTriePath))
{
exceptionMessage = "Path to the trie file cannot be empty.";
@@ -147,37 +159,41 @@ namespace DeepSpeechClient
///
/// Feeds audio samples to an ongoing streaming inference.
///
+ /// Instance of the stream to feed the data.
/// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
- public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
+ public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize)
{
- NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
+ NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize);
}
///
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
- /// The STT result. The user is responsible for freeing the string.
- public unsafe string FinishStream()
+ /// Instance of the stream to finish.
+ /// The STT result.
+ public unsafe string FinishStream(DeepSpeechStream stream)
{
- return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
+ return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString();
}
///
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
- /// The extended metadata. The user is responsible for freeing the struct.
- public unsafe Models.Metadata FinishStreamWithMetadata()
+ /// Instance of the stream to finish.
+ /// The extended metadata result.
+ public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream)
{
- return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
+ return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer()).PtrToMetadata();
}
///
/// Computes the intermediate decoding of an ongoing streaming inference.
///
- /// The STT intermediate result. The user is responsible for freeing the string.
- public unsafe string IntermediateDecode()
+ /// Instance of the stream to decode.
+ /// The STT intermediate result.
+ public unsafe string IntermediateDecode(DeepSpeechStream stream)
{
- return NativeImp.DS_IntermediateDecode(_streamingStatePP);
+ return NativeImp.DS_IntermediateDecode(stream.GetNativePointer());
}
///
@@ -191,11 +207,12 @@ namespace DeepSpeechClient
///
/// Creates a new streaming inference state.
///
- /// Thrown when the native binary failed to initialize the streaming mode.
- public unsafe void CreateStream()
+ public unsafe DeepSpeechStream CreateStream()
{
- var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
+ IntPtr** streamingStatePointer = null;
+ var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer);
EvaluateResultCode(resultCode);
+ return new DeepSpeechStream(streamingStatePointer);
}
///
@@ -203,25 +220,10 @@ namespace DeepSpeechClient
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
///
- public unsafe void FreeStream()
+ public unsafe void FreeStream(DeepSpeechStream stream)
{
- NativeImp.DS_FreeStream(ref _streamingStatePP);
- }
-
- ///
- /// Free a DeepSpeech allocated string
- ///
- public unsafe void FreeString(IntPtr intPtr)
- {
- NativeImp.DS_FreeString(intPtr);
- }
-
- ///
- /// Free a DeepSpeech allocated Metadata struct
- ///
- public unsafe void FreeMetadata(IntPtr intPtr)
- {
- NativeImp.DS_FreeMetadata(intPtr);
+ NativeImp.DS_FreeStream(stream.GetNativePointer());
+ stream.Dispose();
}
///
@@ -229,7 +231,7 @@ namespace DeepSpeechClient
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The STT result. The user is responsible for freeing the string. Returns NULL on error.
+ /// The STT result. Returns NULL on error.
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
{
return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
@@ -240,8 +242,8 @@ namespace DeepSpeechClient
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.
- public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
+ /// The extended metadata. Returns NULL on error.
+ public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
}
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
index 320ecde5..b9077361 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@@ -48,6 +48,7 @@
+
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index c47c25a1..734f4240 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -1,10 +1,11 @@
using DeepSpeechClient.Models;
using System;
+using System.IO;
namespace DeepSpeechClient.Interfaces
{
///
- /// Client interface of the Mozilla's deepspeech implementation.
+ /// Client interface for Mozilla's DeepSpeech implementation.
///
public interface IDeepSpeech : IDisposable
{
@@ -13,15 +14,6 @@ namespace DeepSpeechClient.Interfaces
///
void PrintVersions();
- ///
- /// Create an object providing an interface to a trained DeepSpeech model.
- ///
- /// The path to the frozen model graph.
- /// The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
- /// Thrown when the native binary failed to create the model.
- unsafe void CreateModel(string aModelPath,
- uint aBeamWidth);
-
///
/// Return the sample rate expected by the model.
///
@@ -36,6 +28,7 @@ namespace DeepSpeechClient.Interfaces
/// The alpha hyperparameter of the CTC decoder. Language Model weight.
/// The beta hyperparameter of the CTC decoder. Word insertion weight.
/// Thrown when the native binary failed to enable decoding with a language model.
+ /// Thrown when the language model or trie file cannot be found.
unsafe void EnableDecoderWithLM(string aLMPath,
string aTriePath,
float aLMAlpha,
@@ -46,7 +39,7 @@ namespace DeepSpeechClient.Interfaces
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The STT result. The user is responsible for freeing the string. Returns NULL on error.
+ /// The STT result. Returns NULL on error.
unsafe string SpeechToText(short[] aBuffer,
uint aBufferSize);
@@ -55,7 +48,7 @@ namespace DeepSpeechClient.Interfaces
///
/// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.
+ /// The extended metadata. Returns NULL on error.
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize);
@@ -64,46 +57,39 @@ namespace DeepSpeechClient.Interfaces
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
///
- unsafe void FreeStream();
-
- ///
- /// Free a DeepSpeech allocated string
- ///
- unsafe void FreeString(IntPtr intPtr);
-
- ///
- /// Free a DeepSpeech allocated Metadata struct
- ///
- unsafe void FreeMetadata(IntPtr intPtr);
+ unsafe void FreeStream(DeepSpeechStream stream);
///
/// Creates a new streaming inference state.
///
- /// Thrown when the native binary failed to initialize the streaming mode.
- unsafe void CreateStream();
+ unsafe DeepSpeechStream CreateStream();
///
/// Feeds audio samples to an ongoing streaming inference.
///
+ /// Instance of the stream to feed the data.
/// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
- unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
+ unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize);
///
/// Computes the intermediate decoding of an ongoing streaming inference.
///
- /// The STT intermediate result. The user is responsible for freeing the string.
- unsafe string IntermediateDecode();
+ /// Instance of the stream to decode.
+ /// The STT intermediate result.
+ unsafe string IntermediateDecode(DeepSpeechStream stream);
///
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
- /// The STT result. The user is responsible for freeing the string.
- unsafe string FinishStream();
+ /// Instance of the stream to finish.
+ /// The STT result.
+ unsafe string FinishStream(DeepSpeechStream stream);
///
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
///
- /// The extended metadata result. The user is responsible for freeing the struct.
- unsafe Metadata FinishStreamWithMetadata();
+ /// Instance of the stream to finish.
+ /// The extended metadata result.
+ unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream);
}
}
diff --git a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs
new file mode 100644
index 00000000..e4605f5e
--- /dev/null
+++ b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs
@@ -0,0 +1,35 @@
+using System;
+
+namespace DeepSpeechClient.Models
+{
+ ///
+ /// Wrapper of the pointer used for the decoding stream.
+ ///
+ public class DeepSpeechStream : IDisposable
+ {
+ private unsafe IntPtr** _streamingStatePp;
+
+ ///
+ /// Initializes a new instance of the DeepSpeechStream class.
+ ///
+ /// Native pointer of the native stream.
+ public unsafe DeepSpeechStream(IntPtr** streamingStatePP)
+ {
+ _streamingStatePp = streamingStatePP;
+ }
+
+ ///
+ /// Gets the native pointer.
+ ///
+ /// Thrown when the stream has been disposed or not yet initialized.
+ /// Native pointer of the stream.
+ internal unsafe IntPtr** GetNativePointer()
+ {
+ if (_streamingStatePp == null)
+ throw new InvalidOperationException("Cannot use a disposed or uninitialized stream.");
+ return _streamingStatePp;
+ }
+
+ public unsafe void Dispose() => _streamingStatePp = null;
+ }
+}
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
index 0ea331d8..572055c0 100644
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@@ -48,7 +48,7 @@ namespace DeepSpeechClient
ref IntPtr** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
- internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);
+ internal static unsafe extern void DS_FreeStream(IntPtr** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);
diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs
index 364cab71..8c75a481 100644
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@@ -53,16 +53,13 @@ namespace CSharpExamples
const float LM_BETA = 1.85f;
Stopwatch stopwatch = new Stopwatch();
-
- using (IDeepSpeech sttClient = new DeepSpeech())
+ try
{
- try
+ Console.WriteLine("Loading model...");
+ stopwatch.Start();
+ using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm",
+ BEAM_WIDTH))
{
- Console.WriteLine("Loading model...");
- stopwatch.Start();
- sttClient.CreateModel(
- model ?? "output_graph.pbmm",
- BEAM_WIDTH);
stopwatch.Stop();
Console.WriteLine($"Model loaded - {stopwatch.Elapsed.Milliseconds} ms");
@@ -88,12 +85,14 @@ namespace CSharpExamples
string speechResult;
if (extended)
{
- Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
+ Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer,
+ Convert.ToUInt32(waveBuffer.MaxSize / 2));
speechResult = MetadataToString(metaResult);
}
else
{
- speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
+ speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer,
+ Convert.ToUInt32(waveBuffer.MaxSize / 2));
}
stopwatch.Stop();
@@ -104,10 +103,10 @@ namespace CSharpExamples
}
waveBuffer.Clear();
}
- catch (Exception ex)
- {
- Console.WriteLine(ex.Message);
- }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine(ex.Message);
}
}
}