Merge pull request #2022 from lissyx/expose-metadata

Expose extended metadata information to bindings
lissyx 2019-04-24 23:06:33 +02:00 committed by GitHub
commit 9815d54218
24 changed files with 519 additions and 116 deletions
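For orientation before the per-file changes: each binding gains a SpeechToTextWithMetadata / FinishStreamWithMetadata entry point that returns per-character metadata (character, timestep, start time, plus an overall probability) instead of a flat transcript. Below is a minimal sketch of that surface through the Python binding touched in this PR; the file paths are placeholders, and the 26 / 9 / 500 constructor values mirror the N_CEP / N_CONTEXT / BEAM_WIDTH constants used by the example clients rather than anything defined in this diff.

import wave

import numpy as np
from deepspeech import Model

# Placeholder paths and constants, following the shape used by the other clients here.
ds = Model('output_graph.pbmm', 26, 9, 'alphabet.txt', 500)

fin = wave.open('LDC93S1.wav', 'rb')
fs = fin.getframerate()
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()

metadata = ds.sttWithMetadata(audio, fs)
print('probability:', metadata.probability)
for i in range(metadata.num_items):
    item = metadata.items[i]
    print(item.character, item.timestep, item.start_time)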

View File

@ -28,6 +28,8 @@ bool has_versions = false;
bool extended_metadata = false;
bool json_output = false;
void PrintHelp(const char* bin)
{
std::cout <<
@ -41,7 +43,8 @@ void PrintHelp(const char* bin)
" --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
" --audio AUDIO Path to the audio file to run (WAV format)\n"
" -t Run in benchmark mode, output mfcc & inference time\n"
" -e Extended output, shows word timings as CSV (word, start time, duration)\n"
" --extended Output string from extended metadata\n"
" --json Extended output, shows word timings as JSON\n"
" --help Show help\n"
" --version Print version and exits\n";
DS_PrintVersions();
@ -59,7 +62,8 @@ bool ProcessArgs(int argc, char** argv)
{"audio", required_argument, nullptr, 'w'},
{"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
{"t", no_argument, nullptr, 't'},
{"e", no_argument, nullptr, 'e'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"help", no_argument, nullptr, 'h'},
{"version", no_argument, nullptr, 'v'},
{nullptr, no_argument, nullptr, 0}
@ -110,6 +114,10 @@ bool ProcessArgs(int argc, char** argv)
extended_metadata = true;
break;
case 'j':
json_output = true;
break;
case 'h': // -h or --help
case '?': // Unrecognized option
default:

View File

@ -50,24 +50,29 @@ struct meta_word {
float duration;
};
char* metadataToString(Metadata* metadata);
std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
char* JSONOutput(Metadata* metadata);
ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
int aSampleRate, bool extended_output)
int aSampleRate, bool extended_output, bool json_output)
{
ds_result res = {0};
clock_t ds_start_time = clock();
if (extended_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
} else if (json_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
} else {
res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
}
clock_t ds_end_infer = clock();
@ -241,7 +246,8 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
(const short*)audio.buffer,
audio.buffer_size / 2,
audio.sample_rate,
extended_metadata);
extended_metadata,
json_output);
free(audio.buffer);
if (result.string) {
@ -255,6 +261,17 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
}
}
char*
metadataToString(Metadata* metadata)
{
std::string retval = "";
for (int i = 0; i < metadata->num_items; i++) {
MetadataItem item = metadata->items[i];
retval += item.character;
}
return strdup(retval.c_str());
}
std::vector<meta_word>
WordsFromMetadata(Metadata* metadata)
{
@ -274,16 +291,16 @@ WordsFromMetadata(Metadata* metadata)
}
// Word boundary is either a space or the last character in the array
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
|| i == metadata->num_items-1) {
float word_duration = item.start_time - word_start_time;
if (word_duration < 0) {
word_duration = 0;
}
meta_word w;
w.word = word;
w.start_time = word_start_time;
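The word-boundary logic above is straightforward to replicate in any of the bindings. A rough Python equivalent follows; it is a sketch that assumes the Metadata/MetadataItem fields exposed elsewhere in this PR and is not part of the diff.

def words_from_metadata(metadata):
    words, word, word_start = [], '', 0.0
    for i in range(metadata.num_items):
        item = metadata.items[i]
        if item.character != ' ':
            if not word:
                word_start = item.start_time  # first character of a new word
            word += item.character
        # a space (the C++ also checks an ideographic space) or the last item
        # closes the current word
        if item.character == ' ' or i == metadata.num_items - 1:
            if word:
                words.append({'word': word,
                              'start_time': word_start,
                              'duration': max(item.start_time - word_start, 0.0)})
            word = ''
    return words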

View File

@ -1,4 +1,3 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
@ -8,21 +7,24 @@ EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FC035D95-DBFD-4050-885A-A2DD9134B3AD}
EndGlobalSection
EndGlobal

View File

@ -1,9 +1,10 @@
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Extensions;
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
namespace DeepSpeechClient
{
@ -16,7 +17,7 @@ namespace DeepSpeechClient
private unsafe ModelState* _modelStateP;
private unsafe StreamingState** _streamingStatePP;
public DeepSpeech()
@ -119,7 +120,7 @@ namespace DeepSpeechClient
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
{
NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@ -131,11 +132,20 @@ namespace DeepSpeechClient
/// <returns>The STT result. The user is responsible for freeing the string.</returns>
public unsafe string FinishStream()
{
return NativeImp.DS_FinishStream(_streamingStatePP);
return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <returns>The extended metadata. The user is responsible for freeing the struct.</returns>
public unsafe Models.Metadata FinishStreamWithMetadata()
{
return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// currently capable of streaming, so it always starts from the beginning of the audio.
/// </summary>
/// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@ -156,7 +166,7 @@ namespace DeepSpeechClient
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aPreAllocFrames">Number of timestep frames to reserve.
/// One timestep is equivalent to two window lengths(20ms).
/// One timestep is equivalent to two window lengths(20ms).
/// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <returns>Zero for success, non-zero on failure</returns>
@ -166,7 +176,7 @@ namespace DeepSpeechClient
}
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// Destroy a streaming state without decoding the computed logits.
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
@ -175,6 +185,22 @@ namespace DeepSpeechClient
NativeImp.DS_DiscardStream(ref _streamingStatePP);
}
/// <summary>
/// Free a DeepSpeech allocated string
/// </summary>
public unsafe void FreeString(IntPtr intPtr)
{
NativeImp.DS_FreeString(intPtr);
}
/// <summary>
/// Free a DeepSpeech allocated Metadata struct
/// </summary>
public unsafe void FreeMetadata(IntPtr intPtr)
{
NativeImp.DS_FreeMetadata(intPtr);
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
@ -184,18 +210,24 @@ namespace DeepSpeechClient
/// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
var res = NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate);
int len = 0;
while (Marshal.ReadByte(res, len) != 0) ++len;
byte[] buffer = new byte[len];
Marshal.Copy(res, buffer, 0, buffer.Length);
return Encoding.UTF8.GetString(buffer);
return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
}
#endregion
}
}

View File

@ -13,25 +13,6 @@
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
@ -65,10 +46,15 @@
<ItemGroup>
<Compile Include="DeepSpeech.cs" />
<Compile Include="Interfaces\IDeepSpeech.cs" />
<Compile Include="Extensions\NativeExtensions.cs" />
<Compile Include="Models\Metadata.cs" />
<Compile Include="Models\MetadataItem.cs" />
<Compile Include="NativeImp.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Structs\ModelState.cs" />
<Compile Include="Structs\StreamingState.cs" />
<Compile Include="Structs\Metadata.cs" />
<Compile Include="Structs\MetadataItem.cs" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

View File

@ -0,0 +1,60 @@
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
using System.Text;
namespace DeepSpeechClient.Extensions
{
internal static class NativeExtensions
{
/// <summary>
/// Converts native pointer to UTF-8 encoded string.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <param name="releasePtr">Optional parameter to release the native pointer.</param>
/// <returns>Result string.</returns>
internal static string PtrToString(this IntPtr intPtr, bool releasePtr = true)
{
int len = 0;
while (Marshal.ReadByte(intPtr, len) != 0) ++len;
byte[] buffer = new byte[len];
Marshal.Copy(intPtr, buffer, 0, buffer.Length);
if (releasePtr)
NativeImp.DS_FreeString(intPtr);
string result = Encoding.UTF8.GetString(buffer);
return result;
}
/// <summary>
/// Converts a pointer into managed metadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>Metadata managed object.</returns>
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
{
var managedMetaObject = new Models.Metadata();
var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
managedMetaObject.Probability = metaData.probability;
//we need to manually read each item from the native ptr using its size
var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
for (int i = 0; i < metaData.num_items; i++)
{
var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
managedMetaObject.Items[i] = new Models.MetadataItem
{
Timestep = tempItem.timestep,
StartTime = tempItem.start_time,
Character = tempItem.character.PtrToString(releasePtr: false)
};
//we keep the offset on each read
metaData.items += sizeOfMetaItem;
}
NativeImp.DS_FreeMetadata(intPtr);
return managedMetaObject;
}
}
}

View File

@ -1,4 +1,5 @@
using System;
using DeepSpeechClient.Models;
using System;
namespace DeepSpeechClient.Interfaces
{
@ -53,17 +54,38 @@ namespace DeepSpeechClient.Interfaces
uint aSampleRate);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
unsafe void DiscardStream();
/// <summary>
/// Free a DeepSpeech allocated string
/// </summary>
unsafe void FreeString(IntPtr intPtr);
/// <summary>
/// Free a DeepSpeech allocated Metadata struct
/// </summary>
unsafe void FreeMetadata(IntPtr intPtr);
/// <summary>
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aPreAllocFrames">Number of timestep frames to reserve.
/// One timestep is equivalent to two window lengths(20ms).
/// One timestep is equivalent to two window lengths(20ms).
/// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <returns>Zero for success, non-zero on failure</returns>
@ -72,11 +94,11 @@ namespace DeepSpeechClient.Interfaces
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// currently capable of streaming, so it always starts from the beginning of the audio.
/// </summary>
/// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@ -87,5 +109,11 @@ namespace DeepSpeechClient.Interfaces
/// </summary>
/// <returns>The STT result. The user is responsible for freeing the string.</returns>
unsafe string FinishStream();
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <returns>The extended metadata result. The user is responsible for freeing the struct.</returns>
unsafe Metadata FinishStreamWithMetadata();
}
}

View File

@ -0,0 +1,17 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// Stores the entire CTC output as an array of character metadata objects.
/// </summary>
public class Metadata
{
/// <summary>
/// Approximated probability (confidence value) for this transcription.
/// </summary>
public double Probability { get; set; }
/// <summary>
/// List of metadata items containing char, timestep, and time offset.
/// </summary>
public MetadataItem[] Items { get; set; }
}
}

View File

@ -0,0 +1,21 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// Stores each individual character, along with its timing information.
/// </summary>
public class MetadataItem
{
/// <summary>
/// Char of the current timestep.
/// </summary>
public string Character;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>
public int Timestep;
/// <summary>
/// Position of the character in seconds.
/// </summary>
public float StartTime;
}
}
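The two time fields are related by the 20 ms frame size mentioned in the comments above. A small illustrative Python sketch (the native library fills in StartTime itself; exact rounding there is not guaranteed to match):

def timestep_to_seconds(timestep, frame_seconds=0.02):
    # One timestep = 20 ms per the doc comments above; illustration only.
    return timestep * frame_seconds

print(timestep_to_seconds(150))  # -> 3.0 seconds, i.e. the default pre-allocation window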

View File

@ -1,5 +1,6 @@
using System;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient
@ -36,6 +37,12 @@ namespace DeepSpeechClient
uint aBufferSize,
uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(ModelState** aCtx,
short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DestroyModel(ModelState** aCtx);
@ -44,10 +51,15 @@ namespace DeepSpeechClient
uint aPreAllocFrames,
uint aSampleRate, ref StreamingState** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeString(IntPtr str);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern void DS_FeedAudioContent(StreamingState** aSctx,
@ -57,8 +69,12 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern string DS_IntermediateDecode(StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_FinishStream( StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern string DS_FinishStream( StreamingState** aSctx);
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(StreamingState** aSctx);
#endregion
}
}

View File

@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct Metadata
{
/// <summary>
/// Native list of items.
/// </summary>
internal unsafe IntPtr items;
/// <summary>
/// Count of items from the native side.
/// </summary>
internal unsafe int num_items;
/// <summary>
/// Approximated probability (confidence value) for this transcription.
/// </summary>
internal unsafe double probability;
}
}

View File

@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct MetadataItem
{
/// <summary>
/// Native character.
/// </summary>
internal unsafe IntPtr character;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>
internal unsafe int timestep;
/// <summary>
/// Position of the character in seconds.
/// </summary>
internal unsafe float start_time;
}
}

View File

@ -13,25 +13,6 @@
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>

View File

@ -1,5 +1,6 @@
using DeepSpeechClient;
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Models;
using NAudio.Wave;
using System;
using System.Collections.Generic;
@ -20,6 +21,17 @@ namespace CSharpExamples
static string GetArgument(IEnumerable<string> args, string option)
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
static string MetadataToString(Metadata meta)
{
var nl = Environment.NewLine;
string retval =
Environment.NewLine +$"Recognized text: {string.Join("", meta?.Items?.Select(x=>x.Character))} {nl}"
+ $"Prob: {meta?.Probability} {nl}"
+ $"Item count: {meta?.Items?.Length} {nl}"
+ string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
return retval;
}
static void Main(string[] args)
{
string model = null;
@ -27,6 +39,7 @@ namespace CSharpExamples
string lm = null;
string trie = null;
string audio = null;
bool extended = false;
if (args.Length > 0)
{
model = GetArgument(args, "--model");
@ -34,6 +47,7 @@ namespace CSharpExamples
lm = GetArgument(args, "--lm");
trie = GetArgument(args, "--trie");
audio = GetArgument(args, "--audio");
extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
}
const uint N_CEP = 26;
@ -50,9 +64,9 @@ namespace CSharpExamples
Console.WriteLine("Loading model...");
stopwatch.Start();
try
{
{
result = sttClient.CreateModel(
model ?? "output_graph.pbmm",
model ?? "output_graph.pbmm",
N_CEP, N_CONTEXT,
alphabet ?? "alphabet.txt",
BEAM_WIDTH);
@ -62,7 +76,6 @@ namespace CSharpExamples
Console.WriteLine("Error loading lm.");
Console.WriteLine(ex.Message);
}
stopwatch.Stop();
if (result == 0)
{
@ -95,13 +108,22 @@ namespace CSharpExamples
stopwatch.Start();
string speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
string speechResult;
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
speechResult = MetadataToString(metaResult);
}
else
{
speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
}
stopwatch.Stop();
Console.WriteLine($"Audio duration: {waveInfo.TotalTime.ToString()}");
Console.WriteLine($"Inference took: {stopwatch.Elapsed.ToString()}");
Console.WriteLine($"Recognized text: {speechResult}");
Console.WriteLine((extended ? $"Extended result: ": "Recognized text: ") + speechResult);
}
waveBuffer.Clear();
}

View File

@ -16,6 +16,25 @@
%pointer_functions(StreamingState*, streamingstatep);
%typemap(newfree) char* "DS_FreeString($1);";
%include "carrays.i"
%array_functions(struct MetadataItem, metadataItem_array);
%extend struct Metadata {
MetadataItem getItem(int i) {
return metadataItem_array_getitem(self->items, i);
}
~Metadata() {
DS_FreeMetadata(self);
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;

View File

@ -12,6 +12,7 @@ import org.junit.runners.MethodSorters;
import static org.junit.Assert.*;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.Metadata;
import java.io.RandomAccessFile;
import java.io.FileNotFoundException;
@ -66,10 +67,18 @@ public class BasicTest {
@Test
public void loadDeepSpeech_basic() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.destroyModel();
m.destroyModel();
}
private String doSTT(DeepSpeechModel m) {
private String metadataToString(Metadata m) {
String retval = "";
for (int i = 0; i < m.getNum_items(); ++i) {
retval += m.getItem(i).getCharacter();
}
return retval;
}
private String doSTT(DeepSpeechModel m, boolean extendedMetadata) {
try {
RandomAccessFile wave = new RandomAccessFile(wavFile, "r");
@ -96,7 +105,11 @@ public class BasicTest {
// to turn bytes to shorts as either big endian or little endian.
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
return m.stt(shorts, shorts.length, sampleRate);
if (extendedMetadata) {
return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
} else {
return m.stt(shorts, shorts.length, sampleRate);
}
} catch (FileNotFoundException ex) {
} catch (IOException ex) {
@ -105,25 +118,44 @@ public class BasicTest {
}
return "";
return "";
}
@Test
public void loadDeepSpeech_stt_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_stt_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
}

View File

@ -29,6 +29,10 @@ public class DeepSpeechModel {
return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
}
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
}
public DeepSpeechStreamingState setupStream(int prealloc_frames, int sample_rate) {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.SetupStream(this._msp, prealloc_frames, sample_rate, ssp);
@ -46,4 +50,8 @@ public class DeepSpeechModel {
public String finishStream(DeepSpeechStreamingState ctx) {
return impl.FinishStream(ctx.get());
}
public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
return impl.FinishStreamWithMetadata(ctx.get());
}
}

View File

@ -56,13 +56,22 @@ parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configura
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'})
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
var args = parser.parseArgs();
function totalTime(hrtimeValue) {
return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}
function metadataToString(metadata) {
var retval = ""
for (var i = 0; i < metadata.num_items; ++i) {
retval += metadata.items[i].character;
}
return retval;
}
const buffer = Fs.readFileSync(args['audio']);
const result = Wav.decode(buffer);
@ -119,7 +128,11 @@ audioStream.on('finish', () => {
// We take half of the buffer_size because buffer is a char* while
// LocalDsSTT() expected a short*
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
if (args['extended']) {
console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
} else {
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
}
const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
process.exit(0);

View File

@ -28,6 +28,8 @@ using namespace node;
// make sure the string returned by SpeechToText is freed
%typemap(newfree) char* "DS_FreeString($1);";
%typemap(newfree) Metadata* "DS_FreeMetadata($1);";
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
@ -41,7 +43,7 @@ using namespace node;
%typemap(argout) ModelState **retval {
$result = SWIGV8_ARRAY_NEW();
SWIGV8_AppendOutput($result, SWIG_From_int(result));
// owned by SWIG, ModelState destructor gets called when the Python object is finalized (see below)
// owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
}
@ -60,7 +62,7 @@ using namespace node;
}
// extend ModelState with a destructor so that DestroyModel will be called
// when the Python object gets finalized.
// when the JavaScript object gets finalized.
%nodefaultctor ModelState;
%nodefaultdtor ModelState;
@ -72,6 +74,31 @@ struct ModelState {};
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%extend Metadata {
v8::Handle<v8::Value> items;
v8::Handle<v8::Value> items_get() {
v8::Handle<v8::Value> jsresult = SWIGV8_ARRAY_NEW();
for (int i = 0; i < self->num_items; ++i) {
jsresult = SWIGV8_AppendOutput(jsresult, SWIG_NewPointerObj(SWIG_as_voidptr(&self->items[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
}
fail:
return jsresult;
}
v8::Handle<v8::Value> items_set(const v8::Handle<v8::Value> arg) {
fail:
v8::Handle<v8::Value> result = SWIGV8_ARRAY_NEW();
return result;
}
~Metadata() {
DS_FreeMetadata($self);
}
}
%rename ("%(strip:[DS_])s") "";
%include "../deepspeech.h"

View File

@ -43,6 +43,11 @@ Model.prototype.stt = function() {
return binding.SpeechToText.apply(null, args);
}
Model.prototype.sttWithMetadata = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToTextWithMetadata.apply(null, args);
}
Model.prototype.setupStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.SetupStream.apply(null, args);
@ -66,6 +71,10 @@ Model.prototype.finishStream = function() {
return binding.FinishStream.apply(null, arguments);
}
Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments);
}
module.exports = {
Model: Model,
printVersions: binding.PrintVersions

View File

@ -34,6 +34,9 @@ class Model(object):
def stt(self, *args, **kwargs):
    return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)

def sttWithMetadata(self, *args, **kwargs):
    return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
def setupStream(self, pre_alloc_frames=150, sample_rate=16000):
status, ctx = deepspeech.impl.SetupStream(self._impl,
aPreAllocFrames=pre_alloc_frames,
@ -50,3 +53,6 @@ class Model(object):
def finishStream(self, *args, **kwargs):
    return deepspeech.impl.FinishStream(*args, **kwargs)

def finishStreamWithMetadata(self, *args, **kwargs):
    return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
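As with stt/sttWithMetadata, the streaming path now has a metadata-returning twin. A hedged usage sketch, reusing a `ds` Model instance and assuming `chunks` is any iterable of int16 NumPy buffers; feedAudioContent is the pre-existing streaming feed method, not something added here:

ctx = ds.setupStream(sample_rate=16000)
for chunk in chunks:                       # chunks: iterable of np.int16 buffers (assumed)
    ds.feedAudioContent(ctx, chunk)
metadata = ds.finishStreamWithMetadata(ctx)
print(''.join(metadata.items[i].character for i in range(metadata.num_items)))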

View File

@ -50,6 +50,12 @@ def convert_samplerate(audio_path):
return 16000, np.frombuffer(output, np.int16)
def metadata_to_string(metadata):
    retval = ''
    for item in range(metadata.num_items):
        retval += metadata.items[item].character
    return retval
class VersionAction(argparse.Action):
def __init__(self, *args, **kwargs):
@ -73,6 +79,8 @@ def main():
help='Path to the audio file to run (WAV format)')
parser.add_argument('--version', action=VersionAction,
help='Print version and exits')
parser.add_argument('--extended', required=False, action='store_true',
help='Output string from extended metadata')
args = parser.parse_args()
print('Loading model from file {}'.format(args.model), file=sys.stderr)
@ -101,7 +109,10 @@ def main():
print('Running inference.', file=sys.stderr)
inference_start = timer()
print(ds.stt(audio, fs))
if args.extended:
    print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
else:
    print(ds.stt(audio, fs))
inference_end = timer() - inference_start
print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
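A richer formatter in the spirit of the C# MetadataToString earlier in this PR, printing per-character timings as well; a hypothetical extension of this client, not part of the change:

def metadata_to_report(metadata):
    text = ''.join(metadata.items[i].character for i in range(metadata.num_items))
    lines = ['Recognized text: ' + text,
             'Prob: %f' % metadata.probability,
             'Item count: %d' % metadata.num_items]
    for i in range(metadata.num_items):
        item = metadata.items[i]
        lines.append('Timestep: %d  TimeOffset: %.2f  Char: %r'
                     % (item.timestep, item.start_time, item.character))
    return '\n'.join(lines)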

View File

@ -33,7 +33,30 @@ import_array();
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}
%extend struct MetadataItem {
MetadataItem* __getitem__(size_t i) {
return &$self[i];
}
}
%typemap(out) Metadata* {
// owned, extended destructor needs to be called by SWIG
%append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
}
%extend struct Metadata {
~Metadata() {
DS_FreeMetadata($self);
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%typemap(newfree) char* "DS_FreeString($1);";
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
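Because the Metadata* typemap above hands ownership to Python and the extended destructor calls DS_FreeMetadata, callers never free the struct by hand. A small sketch, reusing the ds/audio/fs names from the client example:

metadata = ds.sttWithMetadata(audio, fs)
text = ''.join(metadata.items[i].character for i in range(metadata.num_items))
del metadata  # dropping the last reference lets the extended destructor free the native struct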

View File

@ -312,6 +312,11 @@ run_tflite_basic_inference_tests()
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
}
run_netframework_inference_tests()
@ -321,6 +326,11 @@ run_netframework_inference_tests()
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended yes 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
@ -339,6 +349,11 @@ run_electronjs_inference_tests()
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
@ -358,6 +373,12 @@ run_basic_inference_tests()
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
@ -822,12 +843,12 @@ do_deepspeech_netframework_build()
# We need MSYS2_ARG_CONV_EXCL='/' otherwise the '/' of CLI parameters gets mangled and disappears
# We build the .NET Client for .NET Framework v4.5,v4.6,v4.7
MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
DeepSpeechClient/DeepSpeechClient.csproj \
/p:Configuration=Release \
/p:Platform=x64 \
/p:TargetFrameworkVersion="v4.5" \
/p:TargetFrameworkVersion="v4.5.2" \
/p:OutputPath=bin/nuget/x64/v4.5
MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
@ -863,13 +884,13 @@ do_nuget_build()
cp ${DS_TFDIR}/bazel-bin/native_client/libdeepspeech.so nupkg/build
# We copy the generated clients for .NET into the Nuget framework dirs
mkdir -p nupkg/lib/net45/
cp DeepSpeechClient/bin/nuget/x64/v4.5/DeepSpeechClient.dll nupkg/lib/net45/
mkdir -p nupkg/lib/net46/
cp DeepSpeechClient/bin/nuget/x64/v4.6/DeepSpeechClient.dll nupkg/lib/net46/
mkdir -p nupkg/lib/net47/
cp DeepSpeechClient/bin/nuget/x64/v4.7/DeepSpeechClient.dll nupkg/lib/net47/