Merge pull request #2022 from lissyx/expose-metadata
Expose extended metadata information to bindings
commit 9815d54218
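The change threads the new Metadata / MetadataItem structs through every binding touched below (C++ client, .NET, Java, NodeJS, Python) via DS_SpeechToTextWithMetadata() and DS_FinishStreamWithMetadata(). As a minimal sketch of the native calls the bindings wrap — using only names that appear in this diff; ctx, buffer, buffer_size and sample_rate stand in for caller-provided values:

// Hedged sketch, not part of the diff: run recognition, walk the
// per-character results, then free the native struct.
Metadata *metadata = DS_SpeechToTextWithMetadata(ctx, buffer, buffer_size, sample_rate);
for (int i = 0; i < metadata->num_items; ++i) {
  MetadataItem item = metadata->items[i];
  // character (UTF-8 string), timestep (units of 20ms) and start_time
  // (seconds) are the same fields the bindings below marshal.
  printf("%s @ %.2fs\n", item.character, item.start_time);
}
DS_FreeMetadata(metadata);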
@@ -28,6 +28,8 @@ bool has_versions = false;
bool extended_metadata = false;

bool json_output = false;

void PrintHelp(const char* bin)
{
    std::cout <<
@@ -41,7 +43,8 @@ void PrintHelp(const char* bin)
    " --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
    " --audio AUDIO Path to the audio file to run (WAV format)\n"
    " -t Run in benchmark mode, output mfcc & inference time\n"
    " -e Extended output, shows word timings as CSV (word, start time, duration)\n"
    " --extended Output string from extended metadata\n"
    " --json Extended output, shows word timings as JSON\n"
    " --help Show help\n"
    " --version Print version and exits\n";
    DS_PrintVersions();
@@ -59,7 +62,8 @@ bool ProcessArgs(int argc, char** argv)
    {"audio", required_argument, nullptr, 'w'},
    {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
    {"t", no_argument, nullptr, 't'},
    {"e", no_argument, nullptr, 'e'},
    {"extended", no_argument, nullptr, 'e'},
    {"json", no_argument, nullptr, 'j'},
    {"help", no_argument, nullptr, 'h'},
    {"version", no_argument, nullptr, 'v'},
    {nullptr, no_argument, nullptr, 0}
@@ -110,6 +114,10 @@ bool ProcessArgs(int argc, char** argv)
      extended_metadata = true;
      break;

    case 'j':
      json_output = true;
      break;

    case 'h': // -h or --help
    case '?': // Unrecognized option
    default:
@@ -50,24 +50,29 @@ struct meta_word {
  float duration;
};

char* metadataToString(Metadata* metadata);
std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
char* JSONOutput(Metadata* metadata);

ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
           int aSampleRate, bool extended_output)
           int aSampleRate, bool extended_output, bool json_output)
{
  ds_result res = {0};

  clock_t ds_start_time = clock();

  if (extended_output) {
    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
    res.string = metadataToString(metadata);
    DS_FreeMetadata(metadata);
  } else if (json_output) {
    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
    res.string = JSONOutput(metadata);
    DS_FreeMetadata(metadata);
  } else {
    res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
  }

  clock_t ds_end_infer = clock();

@@ -241,7 +246,8 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
                            (const short*)audio.buffer,
                            audio.buffer_size / 2,
                            audio.sample_rate,
                            extended_metadata);
                            extended_metadata,
                            json_output);
  free(audio.buffer);

  if (result.string) {
@@ -255,6 +261,17 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
  }
}

char*
metadataToString(Metadata* metadata)
{
  std::string retval = "";
  for (int i = 0; i < metadata->num_items; i++) {
    MetadataItem item = metadata->items[i];
    retval += item.character;
  }
  return strdup(retval.c_str());
}

std::vector<meta_word>
WordsFromMetadata(Metadata* metadata)
{
@@ -274,16 +291,16 @@ WordsFromMetadata(Metadata* metadata)
    }

    // Word boundary is either a space or the last character in the array
    if (strcmp(item.character, " ") == 0
        || strcmp(item.character, u8" ") == 0
        || i == metadata->num_items-1) {

      float word_duration = item.start_time - word_start_time;

      if (word_duration < 0) {
        word_duration = 0;
      }

      meta_word w;
      w.word = word;
      w.start_time = word_start_time;
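JSONOutput() is declared above but its body falls outside the hunks shown here. A plausible sketch of it — assuming it reuses WordsFromMetadata() and <sstream>, neither of which this excerpt confirms — could be:

char*
JSONOutput(Metadata* metadata)
{
  // Hypothetical reconstruction: serialize the word timings gathered by
  // WordsFromMetadata() as the JSON promised by the --json help text.
  std::vector<meta_word> words = WordsFromMetadata(metadata);

  std::ostringstream out;
  out << "{\"words\":[";
  for (size_t i = 0; i < words.size(); i++) {
    out << "{\"word\":\"" << words[i].word << "\","
        << "\"time\":" << words[i].start_time << ","
        << "\"duration\":" << words[i].duration << "}";
    if (i + 1 < words.size()) {
      out << ",";
    }
  }
  out << "]}";

  return strdup(out.str().c_str());
}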
@@ -1,4 +1,3 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
@@ -8,21 +7,24 @@ EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}"
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|x64 = Debug|x64
        Release|x64 = Release|x64
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {FC035D95-DBFD-4050-885A-A2DD9134B3AD}
    EndGlobalSection
EndGlobal
@@ -1,9 +1,10 @@
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Extensions;

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace DeepSpeechClient
{
@@ -16,7 +17,7 @@ namespace DeepSpeechClient
        private unsafe ModelState* _modelStateP;
        private unsafe StreamingState** _streamingStatePP;

        public DeepSpeech()
@@ -119,7 +120,7 @@ namespace DeepSpeechClient
        /// <summary>
        /// Feeds audio samples to an ongoing streaming inference.
        /// </summary>
        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
        public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
        {
            NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -131,11 +132,20 @@ namespace DeepSpeechClient
        /// <returns>The STT result. The user is responsible for freeing the string.</returns>
        public unsafe string FinishStream()
        {
            return NativeImp.DS_FinishStream(_streamingStatePP);
            return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
        }

        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
        /// <returns>The extended metadata. The user is responsible for freeing the struct.</returns>
        public unsafe Models.Metadata FinishStreamWithMetadata()
        {
            return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
        }

        /// <summary>
        /// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
        /// currently capable of streaming, so it always starts from the beginning of the audio.
        /// </summary>
        /// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@@ -156,7 +166,7 @@ namespace DeepSpeechClient
        /// Creates a new streaming inference state.
        /// </summary>
        /// <param name="aPreAllocFrames">Number of timestep frames to reserve.
        /// One timestep is equivalent to two window lengths(20ms).
        /// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
        /// <param name="aSampleRate">The sample-rate of the audio signal</param>
        /// <returns>Zero for success, non-zero on failure</returns>
@@ -166,7 +176,7 @@ namespace DeepSpeechClient
        }

        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
        /// This can be used if you no longer need the result of an ongoing streaming
        /// inference and don't want to perform a costly decode operation.
        /// </summary>
@@ -175,6 +185,22 @@ namespace DeepSpeechClient
            NativeImp.DS_DiscardStream(ref _streamingStatePP);
        }

        /// <summary>
        /// Free a DeepSpeech allocated string
        /// </summary>
        public unsafe void FreeString(IntPtr intPtr)
        {
            NativeImp.DS_FreeString(intPtr);
        }

        /// <summary>
        /// Free a DeepSpeech allocated Metadata struct
        /// </summary>
        public unsafe void FreeMetadata(IntPtr intPtr)
        {
            NativeImp.DS_FreeMetadata(intPtr);
        }

        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
@@ -184,18 +210,24 @@ namespace DeepSpeechClient
        /// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
        {
            var res = NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate);

            int len = 0;
            while (Marshal.ReadByte(res, len) != 0) ++len;
            byte[] buffer = new byte[len];
            Marshal.Copy(res, buffer, 0, buffer.Length);
            return Encoding.UTF8.GetString(buffer);
            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
        }

        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
        {
            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
        }

        #endregion
    }
}
@@ -13,25 +13,6 @@
    <FileAlignment>512</FileAlignment>
    <Deterministic>true</Deterministic>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
    <DebugSymbols>true</DebugSymbols>
    <DebugType>full</DebugType>
    <Optimize>false</Optimize>
    <OutputPath>bin\Debug\</OutputPath>
    <DefineConstants>DEBUG;TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
    <DebugType>pdbonly</DebugType>
    <Optimize>true</Optimize>
    <OutputPath>bin\Release\</OutputPath>
    <DefineConstants>TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
    <DebugSymbols>true</DebugSymbols>
    <OutputPath>bin\x64\Debug\</OutputPath>
@@ -65,10 +46,15 @@
  <ItemGroup>
    <Compile Include="DeepSpeech.cs" />
    <Compile Include="Interfaces\IDeepSpeech.cs" />
    <Compile Include="Extensions\NativeExtensions.cs" />
    <Compile Include="Models\Metadata.cs" />
    <Compile Include="Models\MetadataItem.cs" />
    <Compile Include="NativeImp.cs" />
    <Compile Include="Properties\AssemblyInfo.cs" />
    <Compile Include="Structs\ModelState.cs" />
    <Compile Include="Structs\StreamingState.cs" />
    <Compile Include="Structs\Metadata.cs" />
    <Compile Include="Structs\MetadataItem.cs" />
  </ItemGroup>
  <ItemGroup />
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
@@ -0,0 +1,60 @@
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace DeepSpeechClient.Extensions
{
    internal static class NativeExtensions
    {
        /// <summary>
        /// Converts a native pointer to a UTF-8 encoded string.
        /// </summary>
        /// <param name="intPtr">Native pointer.</param>
        /// <param name="releasePtr">Optional parameter to release the native pointer.</param>
        /// <returns>Result string.</returns>
        internal static string PtrToString(this IntPtr intPtr, bool releasePtr = true)
        {
            int len = 0;
            while (Marshal.ReadByte(intPtr, len) != 0) ++len;
            byte[] buffer = new byte[len];
            Marshal.Copy(intPtr, buffer, 0, buffer.Length);
            if (releasePtr)
                NativeImp.DS_FreeString(intPtr);
            string result = Encoding.UTF8.GetString(buffer);
            return result;
        }

        /// <summary>
        /// Converts a pointer into a managed metadata object.
        /// </summary>
        /// <param name="intPtr">Native pointer.</param>
        /// <returns>Metadata managed object.</returns>
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
        {
            var managedMetaObject = new Models.Metadata();
            var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));

            managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
            managedMetaObject.Probability = metaData.probability;

            // we need to manually read each item from the native ptr using its size
            var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
            for (int i = 0; i < metaData.num_items; i++)
            {
                var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
                managedMetaObject.Items[i] = new Models.MetadataItem
                {
                    Timestep = tempItem.timestep,
                    StartTime = tempItem.start_time,
                    Character = tempItem.character.PtrToString(releasePtr: false)
                };
                // advance the native pointer by one item size after each read
                metaData.items += sizeOfMetaItem;
            }
            NativeImp.DS_FreeMetadata(intPtr);
            return managedMetaObject;
        }
    }
}
@@ -1,4 +1,5 @@
using DeepSpeechClient.Models;

using System;

namespace DeepSpeechClient.Interfaces
{
@@ -53,17 +54,38 @@ namespace DeepSpeechClient.Interfaces
                              uint aSampleRate);

        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
                                                 uint aBufferSize,
                                                 uint aSampleRate);

        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
        /// This can be used if you no longer need the result of an ongoing streaming
        /// inference and don't want to perform a costly decode operation.
        /// </summary>
        unsafe void DiscardStream();

        /// <summary>
        /// Free a DeepSpeech allocated string
        /// </summary>
        unsafe void FreeString(IntPtr intPtr);

        /// <summary>
        /// Free a DeepSpeech allocated Metadata struct
        /// </summary>
        unsafe void FreeMetadata(IntPtr intPtr);

        /// <summary>
        /// Creates a new streaming inference state.
        /// </summary>
        /// <param name="aPreAllocFrames">Number of timestep frames to reserve.
        /// One timestep is equivalent to two window lengths(20ms).
        /// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
        /// <param name="aSampleRate">The sample-rate of the audio signal</param>
        /// <returns>Zero for success, non-zero on failure</returns>
@@ -72,11 +94,11 @@ namespace DeepSpeechClient.Interfaces
        /// <summary>
        /// Feeds audio samples to an ongoing streaming inference.
        /// </summary>
        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
        unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);

        /// <summary>
        /// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
        /// currently capable of streaming, so it always starts from the beginning of the audio.
        /// </summary>
        /// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@@ -87,5 +109,11 @@ namespace DeepSpeechClient.Interfaces
        /// </summary>
        /// <returns>The STT result. The user is responsible for freeing the string.</returns>
        unsafe string FinishStream();

        /// <summary>
        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
        /// </summary>
        /// <returns>The extended metadata result. The user is responsible for freeing the struct.</returns>
        unsafe Metadata FinishStreamWithMetadata();
    }
}
native_client/dotnet/DeepSpeechClient/Models/Metadata.cs (new file, 17 lines)
@@ -0,0 +1,17 @@
namespace DeepSpeechClient.Models
{
    /// <summary>
    /// Stores the entire CTC output as an array of character metadata objects.
    /// </summary>
    public class Metadata
    {
        /// <summary>
        /// Approximated probability (confidence value) for this transcription.
        /// </summary>
        public double Probability { get; set; }
        /// <summary>
        /// List of metadata items containing char, timestep, and time offset.
        /// </summary>
        public MetadataItem[] Items { get; set; }
    }
}
native_client/dotnet/DeepSpeechClient/Models/MetadataItem.cs (new file, 21 lines)
@@ -0,0 +1,21 @@
namespace DeepSpeechClient.Models
{
    /// <summary>
    /// Stores each individual character, along with its timing information.
    /// </summary>
    public class MetadataItem
    {
        /// <summary>
        /// Char of the current timestep.
        /// </summary>
        public string Character;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
        public int Timestep;
        /// <summary>
        /// Position of the character in seconds.
        /// </summary>
        public float StartTime;
    }
}
@@ -1,5 +1,6 @@
using DeepSpeechClient.Structs;

using System;
using System.Runtime.InteropServices;

namespace DeepSpeechClient
@@ -36,6 +37,12 @@ namespace DeepSpeechClient
                                                           uint aBufferSize,
                                                           uint aSampleRate);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(ModelState** aCtx,
                                                                         short[] aBuffer,
                                                                         uint aBufferSize,
                                                                         uint aSampleRate);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_DestroyModel(ModelState** aCtx);

@@ -44,10 +51,15 @@ namespace DeepSpeechClient
                                                      uint aPreAllocFrames,
                                                      uint aSampleRate, ref StreamingState** retval);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeString(IntPtr str);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
                   CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern void DS_FeedAudioContent(StreamingState** aSctx,
@@ -57,8 +69,12 @@ namespace DeepSpeechClient
        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern string DS_IntermediateDecode(StreamingState** aSctx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
                   CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern string DS_FinishStream(StreamingState** aSctx);
        internal static unsafe extern IntPtr DS_FinishStream(StreamingState** aSctx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(StreamingState** aSctx);
        #endregion
    }
}
native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs (new file, 22 lines)
@@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;

namespace DeepSpeechClient.Structs
{
    [StructLayout(LayoutKind.Sequential)]
    internal unsafe struct Metadata
    {
        /// <summary>
        /// Native list of items.
        /// </summary>
        internal unsafe IntPtr items;
        /// <summary>
        /// Count of items from the native side.
        /// </summary>
        internal unsafe int num_items;
        /// <summary>
        /// Approximated probability (confidence value) for this transcription.
        /// </summary>
        internal unsafe double probability;
    }
}
native_client/dotnet/DeepSpeechClient/Structs/MetadataItem.cs (new file, 22 lines)
@@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;

namespace DeepSpeechClient.Structs
{
    [StructLayout(LayoutKind.Sequential)]
    internal unsafe struct MetadataItem
    {
        /// <summary>
        /// Native character.
        /// </summary>
        internal unsafe IntPtr character;
        /// <summary>
        /// Position of the character in units of 20ms.
        /// </summary>
        internal unsafe int timestep;
        /// <summary>
        /// Position of the character in seconds.
        /// </summary>
        internal unsafe float start_time;
    }
}
@@ -13,25 +13,6 @@
    <AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
    <Deterministic>true</Deterministic>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugSymbols>true</DebugSymbols>
    <DebugType>full</DebugType>
    <Optimize>false</Optimize>
    <OutputPath>bin\Debug\</OutputPath>
    <DefineConstants>DEBUG;TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugType>pdbonly</DebugType>
    <Optimize>true</Optimize>
    <OutputPath>bin\Release\</OutputPath>
    <DefineConstants>TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
    <DebugSymbols>true</DebugSymbols>
    <OutputPath>bin\x64\Debug\</OutputPath>
@@ -1,5 +1,6 @@
using DeepSpeechClient;
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Models;
using NAudio.Wave;
using System;
using System.Collections.Generic;
@@ -20,6 +21,17 @@ namespace CSharpExamples
        static string GetArgument(IEnumerable<string> args, string option)
            => args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();

        static string MetadataToString(Metadata meta)
        {
            var nl = Environment.NewLine;
            string retval =
                Environment.NewLine + $"Recognized text: {string.Join("", meta?.Items?.Select(x => x.Character))} {nl}"
                + $"Prob: {meta?.Probability} {nl}"
                + $"Item count: {meta?.Items?.Length} {nl}"
                + string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
            return retval;
        }

        static void Main(string[] args)
        {
            string model = null;
@@ -27,6 +39,7 @@ namespace CSharpExamples
            string lm = null;
            string trie = null;
            string audio = null;
            bool extended = false;
            if (args.Length > 0)
            {
                model = GetArgument(args, "--model");
@@ -34,6 +47,7 @@ namespace CSharpExamples
                lm = GetArgument(args, "--lm");
                trie = GetArgument(args, "--trie");
                audio = GetArgument(args, "--audio");
                extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
            }

            const uint N_CEP = 26;
@@ -50,9 +64,9 @@ namespace CSharpExamples
            Console.WriteLine("Loading model...");
            stopwatch.Start();
            try
            {
                result = sttClient.CreateModel(
                    model ?? "output_graph.pbmm",
                    N_CEP, N_CONTEXT,
                    alphabet ?? "alphabet.txt",
                    BEAM_WIDTH);
@@ -62,7 +76,6 @@ namespace CSharpExamples
                Console.WriteLine("Error loading lm.");
                Console.WriteLine(ex.Message);
            }

            stopwatch.Stop();
            if (result == 0)
            {
@@ -95,13 +108,22 @@ namespace CSharpExamples

                    stopwatch.Start();

                    string speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
                    string speechResult;
                    if (extended)
                    {
                        Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
                        speechResult = MetadataToString(metaResult);
                    }
                    else
                    {
                        speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
                    }

                    stopwatch.Stop();

                    Console.WriteLine($"Audio duration: {waveInfo.TotalTime.ToString()}");
                    Console.WriteLine($"Inference took: {stopwatch.Elapsed.ToString()}");
                    Console.WriteLine($"Recognized text: {speechResult}");
                    Console.WriteLine((extended ? "Extended result: " : "Recognized text: ") + speechResult);
                }
                waveBuffer.Clear();
            }
@@ -16,6 +16,25 @@
%pointer_functions(StreamingState*, streamingstatep);

%typemap(newfree) char* "DS_FreeString($1);";

%include "carrays.i"
%array_functions(struct MetadataItem, metadataItem_array);

%extend struct Metadata {
  MetadataItem getItem(int i) {
    return metadataItem_array_getitem(self->items, i);
  }

  ~Metadata() {
    DS_FreeMetadata(self);
  }
}

%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;

%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
@@ -12,6 +12,7 @@ import org.junit.runners.MethodSorters;
import static org.junit.Assert.*;

import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.Metadata;

import java.io.RandomAccessFile;
import java.io.FileNotFoundException;
@@ -66,10 +67,18 @@ public class BasicTest {
    @Test
    public void loadDeepSpeech_basic() {
        DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
        m.destroyModel();
    }

    private String metadataToString(Metadata m) {
        String retval = "";
        for (int i = 0; i < m.getNum_items(); ++i) {
            retval += m.getItem(i).getCharacter();
        }
        return retval;
    }

    private String doSTT(DeepSpeechModel m) {
    private String doSTT(DeepSpeechModel m, boolean extendedMetadata) {
        try {
            RandomAccessFile wave = new RandomAccessFile(wavFile, "r");
@@ -96,7 +105,11 @@ public class BasicTest {
            // to turn bytes to shorts as either big endian or little endian.
            ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

            return m.stt(shorts, shorts.length, sampleRate);
            if (extendedMetadata) {
                return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
            } else {
                return m.stt(shorts, shorts.length, sampleRate);
            }
        } catch (FileNotFoundException ex) {

        } catch (IOException ex) {
@@ -105,25 +118,44 @@ public class BasicTest {

        }

        return "";
    }

    @Test
    public void loadDeepSpeech_stt_noLM() {
        DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);

        String decoded = doSTT(m);
        String decoded = doSTT(m, false);
        assertEquals("she had your dark suit in greasy wash water all year", decoded);
        m.destroyModel();
    }

    @Test
    public void loadDeepSpeech_stt_withLM() {
        DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
        m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);

        String decoded = doSTT(m);
        String decoded = doSTT(m, false);
        assertEquals("she had your dark suit in greasy wash water all year", decoded);
        m.destroyModel();
    }

    @Test
    public void loadDeepSpeech_sttWithMetadata_noLM() {
        DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);

        String decoded = doSTT(m, true);
        assertEquals("she had your dark suit in greasy wash water all year", decoded);
        m.destroyModel();
    }

    @Test
    public void loadDeepSpeech_sttWithMetadata_withLM() {
        DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
        m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);

        String decoded = doSTT(m, true);
        assertEquals("she had your dark suit in greasy wash water all year", decoded);
        m.destroyModel();
    }
}
@@ -29,6 +29,10 @@ public class DeepSpeechModel {
        return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
    }

    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
    }

    public DeepSpeechStreamingState setupStream(int prealloc_frames, int sample_rate) {
        SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
        impl.SetupStream(this._msp, prealloc_frames, sample_rate, ssp);
@@ -46,4 +50,8 @@ public class DeepSpeechModel {
    public String finishStream(DeepSpeechStreamingState ctx) {
        return impl.FinishStream(ctx.get());
    }

    public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
        return impl.FinishStreamWithMetadata(ctx.get());
    }
}
@@ -56,13 +56,22 @@ parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'})
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
var args = parser.parseArgs();

function totalTime(hrtimeValue) {
  return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}

function metadataToString(metadata) {
  var retval = "";
  for (var i = 0; i < metadata.num_items; ++i) {
    retval += metadata.items[i].character;
  }
  return retval;
}

const buffer = Fs.readFileSync(args['audio']);
const result = Wav.decode(buffer);

@@ -119,7 +128,11 @@ audioStream.on('finish', () => {

  // We take half of the buffer_size because buffer is a char* while
  // LocalDsSTT() expected a short*
  if (args['extended']) {
    console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
  } else {
    console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
  }
  const inference_stop = process.hrtime(inference_start);
  console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
  process.exit(0);
@@ -28,6 +28,8 @@ using namespace node;

// make sure the string returned by SpeechToText is freed
%typemap(newfree) char* "DS_FreeString($1);";
%typemap(newfree) Metadata* "DS_FreeMetadata($1);";

%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
@@ -41,7 +43,7 @@ using namespace node;
%typemap(argout) ModelState **retval {
  $result = SWIGV8_ARRAY_NEW();
  SWIGV8_AppendOutput($result, SWIG_From_int(result));
  // owned by SWIG, ModelState destructor gets called when the Python object is finalized (see below)
  // owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
}

@@ -60,7 +62,7 @@ using namespace node;
}

// extend ModelState with a destructor so that DestroyModel will be called
// when the Python object gets finalized.
// when the JavaScript object gets finalized.
%nodefaultctor ModelState;
%nodefaultdtor ModelState;

@@ -72,6 +74,31 @@ struct ModelState {};
  }
}

%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;

%extend Metadata {
  v8::Handle<v8::Value> items;
  v8::Handle<v8::Value> items_get() {
    v8::Handle<v8::Value> jsresult = SWIGV8_ARRAY_NEW();
    for (int i = 0; i < self->num_items; ++i) {
      jsresult = SWIGV8_AppendOutput(jsresult, SWIG_NewPointerObj(SWIG_as_voidptr(&self->items[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
    }
fail:
    return jsresult;
  }
  v8::Handle<v8::Value> items_set(const v8::Handle<v8::Value> arg) {
fail:
    v8::Handle<v8::Value> result = SWIGV8_ARRAY_NEW();
    return result;
  }
  ~Metadata() {
    DS_FreeMetadata($self);
  }
}

%rename ("%(strip:[DS_])s") "";

%include "../deepspeech.h"
@@ -43,6 +43,11 @@ Model.prototype.stt = function() {
  return binding.SpeechToText.apply(null, args);
}

Model.prototype.sttWithMetadata = function() {
  const args = [this._impl].concat(Array.prototype.slice.call(arguments));
  return binding.SpeechToTextWithMetadata.apply(null, args);
}

Model.prototype.setupStream = function() {
  const args = [this._impl].concat(Array.prototype.slice.call(arguments));
  const rets = binding.SetupStream.apply(null, args);
@@ -66,6 +71,10 @@ Model.prototype.finishStream = function() {
  return binding.FinishStream.apply(null, arguments);
}

Model.prototype.finishStreamWithMetadata = function() {
  return binding.FinishStreamWithMetadata.apply(null, arguments);
}

module.exports = {
  Model: Model,
  printVersions: binding.PrintVersions
@@ -34,6 +34,9 @@ class Model(object):
    def stt(self, *args, **kwargs):
        return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)

    def sttWithMetadata(self, *args, **kwargs):
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)

    def setupStream(self, pre_alloc_frames=150, sample_rate=16000):
        status, ctx = deepspeech.impl.SetupStream(self._impl,
                                                  aPreAllocFrames=pre_alloc_frames,
@@ -50,3 +53,6 @@ class Model(object):

    def finishStream(self, *args, **kwargs):
        return deepspeech.impl.FinishStream(*args, **kwargs)

    def finishStreamWithMetadata(self, *args, **kwargs):
        return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
@@ -50,6 +50,12 @@ def convert_samplerate(audio_path):

    return 16000, np.frombuffer(output, np.int16)

def metadata_to_string(metadata):
    retval = ''
    for item in range(metadata.num_items):
        retval += metadata.items[item].character
    return retval


class VersionAction(argparse.Action):
    def __init__(self, *args, **kwargs):
@@ -73,6 +79,8 @@ def main():
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
@@ -101,7 +109,10 @@ def main():

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
@@ -33,7 +33,30 @@ import_array();
  %append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}

%extend struct MetadataItem {
  MetadataItem* __getitem__(size_t i) {
    return &$self[i];
  }
}

%typemap(out) Metadata* {
  // owned, extended destructor needs to be called by SWIG
  %append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
}

%extend struct Metadata {
  ~Metadata() {
    DS_FreeMetadata($self);
  }
}

%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;

%typemap(newfree) char* "DS_FreeString($1);";

%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
@@ -312,6 +312,11 @@ run_tflite_basic_inference_tests()
  phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
  assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

  set +e
  phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
  assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
}

run_netframework_inference_tests()
@@ -321,6 +326,11 @@ run_netframework_inference_tests()
  set -e
  assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

  set +e
  phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended yes 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
  assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

  set +e
  phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
@@ -339,6 +349,11 @@ run_electronjs_inference_tests()
  set -e
  assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

  set +e
  phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
  assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"

  set +e
  phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
  set -e
@@ -358,6 +373,12 @@ run_basic_inference_tests()
  set -e
  assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"

  set +e
  phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
  status=$?
  set -e
  assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"

  set +e
  phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
  status=$?
@@ -822,12 +843,12 @@ do_deepspeech_netframework_build()

  # We need MSYS2_ARG_CONV_EXCL='/' otherwise the '/' of CLI parameters gets mangled and disappears
  # We build the .NET Client for .NET Framework v4.5,v4.6,v4.7

  MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
    DeepSpeechClient/DeepSpeechClient.csproj \
    /p:Configuration=Release \
    /p:Platform=x64 \
    /p:TargetFrameworkVersion="v4.5" \
    /p:TargetFrameworkVersion="v4.5.2" \
    /p:OutputPath=bin/nuget/x64/v4.5

  MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
@@ -863,13 +884,13 @@ do_nuget_build()
  cp ${DS_TFDIR}/bazel-bin/native_client/libdeepspeech.so nupkg/build

  # We copy the generated clients for .NET into the Nuget framework dirs

  mkdir -p nupkg/lib/net45/
  cp DeepSpeechClient/bin/nuget/x64/v4.5/DeepSpeechClient.dll nupkg/lib/net45/

  mkdir -p nupkg/lib/net46/
  cp DeepSpeechClient/bin/nuget/x64/v4.6/DeepSpeechClient.dll nupkg/lib/net46/

  mkdir -p nupkg/lib/net47/
  cp DeepSpeechClient/bin/nuget/x64/v4.7/DeepSpeechClient.dll nupkg/lib/net47/