Merge pull request #2022 from lissyx/expose-metadata

Expose extended metadata information to bindings
lissyx 2019-04-24 23:06:33 +02:00 committed by GitHub
commit 9815d54218
24 changed files with 519 additions and 116 deletions
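For orientation before the per-file changes: each binding gains a SpeechToTextWithMetadata / FinishStreamWithMetadata entry point that returns per-character metadata (character, timestep, start time, plus an overall probability) instead of a flat transcript. Below is a minimal sketch of that surface through the Python binding touched in this PR; the file paths are placeholders, and the 26 / 9 / 500 constructor values mirror the N_CEP / N_CONTEXT / BEAM_WIDTH constants used by the example clients rather than anything defined in this diff.

import wave

import numpy as np
from deepspeech import Model

# Placeholder paths and constants, following the shape used by the other clients here.
ds = Model('output_graph.pbmm', 26, 9, 'alphabet.txt', 500)

fin = wave.open('LDC93S1.wav', 'rb')
fs = fin.getframerate()
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()

metadata = ds.sttWithMetadata(audio, fs)
print('probability:', metadata.probability)
for i in range(metadata.num_items):
    item = metadata.items[i]
    print(item.character, item.timestep, item.start_time)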

View File

@ -28,6 +28,8 @@ bool has_versions = false;
bool extended_metadata = false;
bool json_output = false;
void PrintHelp(const char* bin)
{
std::cout <<
@ -41,7 +43,8 @@ void PrintHelp(const char* bin)
" --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
" --audio AUDIO Path to the audio file to run (WAV format)\n"
" -t Run in benchmark mode, output mfcc & inference time\n"
" -e Extended output, shows word timings as CSV (word, start time, duration)\n"
" --extended Output string from extended metadata\n"
" --json Extended output, shows word timings as JSON\n"
" --help Show help\n"
" --version Print version and exits\n";
DS_PrintVersions();
@ -59,7 +62,8 @@ bool ProcessArgs(int argc, char** argv)
{"audio", required_argument, nullptr, 'w'},
{"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
{"t", no_argument, nullptr, 't'},
{"e", no_argument, nullptr, 'e'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"help", no_argument, nullptr, 'h'},
{"version", no_argument, nullptr, 'v'},
{nullptr, no_argument, nullptr, 0}
@ -110,6 +114,10 @@ bool ProcessArgs(int argc, char** argv)
extended_metadata = true;
break;
case 'j':
json_output = true;
break;
case 'h': // -h or --help
case '?': // Unrecognized option
default:

View File

@ -50,24 +50,29 @@ struct meta_word {
float duration;
};
char* metadataToString(Metadata* metadata);
std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
char* JSONOutput(Metadata* metadata);
ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
int aSampleRate, bool extended_output)
int aSampleRate, bool extended_output, bool json_output)
{
ds_result res = {0};
clock_t ds_start_time = clock();
if (extended_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
} else if (json_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
} else {
res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
}
clock_t ds_end_infer = clock();
@ -241,7 +246,8 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
(const short*)audio.buffer,
audio.buffer_size / 2,
audio.sample_rate,
extended_metadata);
extended_metadata,
json_output);
free(audio.buffer);
if (result.string) {
@ -255,6 +261,17 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
}
}
char*
metadataToString(Metadata* metadata)
{
std::string retval = "";
for (int i = 0; i < metadata->num_items; i++) {
MetadataItem item = metadata->items[i];
retval += item.character;
}
return strdup(retval.c_str());
}
std::vector<meta_word>
WordsFromMetadata(Metadata* metadata)
{
@ -274,16 +291,16 @@ WordsFromMetadata(Metadata* metadata)
}
// Word boundary is either a space or the last character in the array
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
|| i == metadata->num_items-1) {
float word_duration = item.start_time - word_start_time;
if (word_duration < 0) {
word_duration = 0;
}
meta_word w;
w.word = word;
w.start_time = word_start_time;
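The word-boundary logic above is straightforward to replicate in any of the bindings. A rough Python equivalent follows; it is a sketch that assumes the Metadata/MetadataItem fields exposed elsewhere in this PR and is not part of the diff.

def words_from_metadata(metadata):
    words, word, word_start = [], '', 0.0
    for i in range(metadata.num_items):
        item = metadata.items[i]
        if item.character != ' ':
            if not word:
                word_start = item.start_time  # first character of a new word
            word += item.character
        # a space (the C++ also checks an ideographic space) or the last item
        # closes the current word
        if item.character == ' ' or i == metadata.num_items - 1:
            if word:
                words.append({'word': word,
                              'start_time': word_start,
                              'duration': max(item.start_time - word_start, 0.0)})
            word = ''
    return words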

View File

@ -1,4 +1,3 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
@ -8,21 +7,24 @@ EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FC035D95-DBFD-4050-885A-A2DD9134B3AD}
EndGlobalSection
EndGlobal

View File

@ -1,9 +1,10 @@
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Extensions;
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
namespace DeepSpeechClient
{
@ -16,7 +17,7 @@ namespace DeepSpeechClient
private unsafe ModelState* _modelStateP;
private unsafe StreamingState** _streamingStatePP;
public DeepSpeech()
@ -119,7 +120,7 @@ namespace DeepSpeechClient
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
{
NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@ -131,11 +132,20 @@ namespace DeepSpeechClient
/// <returns>The STT result. The user is responsible for freeing the string.</returns>
public unsafe string FinishStream()
{
return NativeImp.DS_FinishStream(_streamingStatePP);
return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <returns>The extended metadata. The user is responsible for freeing the struct.</returns>
public unsafe Models.Metadata FinishStreamWithMetadata()
{
return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
}
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// currently capable of streaming, so it always starts from the beginning of the audio.
/// </summary>
/// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@ -156,7 +166,7 @@ namespace DeepSpeechClient
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aPreAllocFrames">Number of timestep frames to reserve.
/// One timestep is equivalent to two window lengths(20ms).
/// One timestep is equivalent to two window lengths(20ms).
/// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <returns>Zero for success, non-zero on failure</returns>
@ -166,7 +176,7 @@ namespace DeepSpeechClient
}
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// Destroy a streaming state without decoding the computed logits.
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
@ -175,6 +185,22 @@ namespace DeepSpeechClient
NativeImp.DS_DiscardStream(ref _streamingStatePP);
}
/// <summary>
/// Free a DeepSpeech allocated string
/// </summary>
public unsafe void FreeString(IntPtr intPtr)
{
NativeImp.DS_FreeString(intPtr);
}
/// <summary>
/// Free a DeepSpeech allocated Metadata struct
/// </summary>
public unsafe void FreeMetadata(IntPtr intPtr)
{
NativeImp.DS_FreeMetadata(intPtr);
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
@ -184,18 +210,24 @@ namespace DeepSpeechClient
/// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
var res = NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate);
int len = 0;
while (Marshal.ReadByte(res, len) != 0) ++len;
byte[] buffer = new byte[len];
Marshal.Copy(res, buffer, 0, buffer.Length);
return Encoding.UTF8.GetString(buffer);
return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
}
#endregion
}
}

View File

@ -13,25 +13,6 @@
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
@ -65,10 +46,15 @@
<ItemGroup>
<Compile Include="DeepSpeech.cs" />
<Compile Include="Interfaces\IDeepSpeech.cs" />
<Compile Include="Extensions\NativeExtensions.cs" />
<Compile Include="Models\Metadata.cs" />
<Compile Include="Models\MetadataItem.cs" />
<Compile Include="NativeImp.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Structs\ModelState.cs" />
<Compile Include="Structs\StreamingState.cs" />
<Compile Include="Structs\Metadata.cs" />
<Compile Include="Structs\MetadataItem.cs" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

View File

@ -0,0 +1,60 @@
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
using System.Text;
namespace DeepSpeechClient.Extensions
{
internal static class NativeExtensions
{
/// <summary>
/// Converts native pointer to UTF-8 encoded string.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <param name="releasePtr">Optional parameter to release the native pointer.</param>
/// <returns>Result string.</returns>
internal static string PtrToString(this IntPtr intPtr, bool releasePtr = true)
{
int len = 0;
while (Marshal.ReadByte(intPtr, len) != 0) ++len;
byte[] buffer = new byte[len];
Marshal.Copy(intPtr, buffer, 0, buffer.Length);
if (releasePtr)
NativeImp.DS_FreeString(intPtr);
string result = Encoding.UTF8.GetString(buffer);
return result;
}
/// <summary>
/// Converts a pointer into managed metadata object.
/// </summary>
/// <param name="intPtr">Native pointer.</param>
/// <returns>Metadata managed object.</returns>
internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
{
var managedMetaObject = new Models.Metadata();
var metaData = (Metadata)Marshal.PtrToStructure(intPtr, typeof(Metadata));
managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
managedMetaObject.Probability = metaData.probability;
//we need to manually read each item from the native ptr using its size
var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
for (int i = 0; i < metaData.num_items; i++)
{
var tempItem = Marshal.PtrToStructure<MetadataItem>(metaData.items);
managedMetaObject.Items[i] = new Models.MetadataItem
{
Timestep = tempItem.timestep,
StartTime = tempItem.start_time,
Character = tempItem.character.PtrToString(releasePtr: false)
};
//we keep the offset on each read
metaData.items += sizeOfMetaItem;
}
NativeImp.DS_FreeMetadata(intPtr);
return managedMetaObject;
}
}
}

View File

@ -1,4 +1,5 @@
using System;
using DeepSpeechClient.Models;
using System;
namespace DeepSpeechClient.Interfaces
{
@ -53,17 +54,38 @@ namespace DeepSpeechClient.Interfaces
uint aSampleRate);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
unsafe void DiscardStream();
/// <summary>
/// Free a DeepSpeech allocated string
/// </summary>
unsafe void FreeString(IntPtr intPtr);
/// <summary>
/// Free a DeepSpeech allocated Metadata struct
/// </summary>
unsafe void FreeMetadata(IntPtr intPtr);
/// <summary>
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aPreAllocFrames">Number of timestep frames to reserve.
/// One timestep is equivalent to two window lengths(20ms).
/// One timestep is equivalent to two window lengths(20ms).
/// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <returns>Zero for success, non-zero on failure</returns>
@ -72,11 +94,11 @@ namespace DeepSpeechClient.Interfaces
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// currently capable of streaming, so it always starts from the beginning of the audio.
/// </summary>
/// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@ -87,5 +109,11 @@ namespace DeepSpeechClient.Interfaces
/// </summary>
/// <returns>The STT result. The user is responsible for freeing the string.</returns>
unsafe string FinishStream();
/// <summary>
/// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
/// </summary>
/// <returns>The extended metadata result. The user is responsible for freeing the struct.</returns>
unsafe Metadata FinishStreamWithMetadata();
}
}

View File

@ -0,0 +1,17 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// Stores the entire CTC output as an array of character metadata objects.
/// </summary>
public class Metadata
{
/// <summary>
/// Approximated probability (confidence value) for this transcription.
/// </summary>
public double Probability { get; set; }
/// <summary>
/// List of metadata items containing char, timestep, and time offset.
/// </summary>
public MetadataItem[] Items { get; set; }
}
}

View File

@ -0,0 +1,21 @@
namespace DeepSpeechClient.Models
{
/// <summary>
/// Stores each individual character, along with its timing information.
/// </summary>
public class MetadataItem
{
/// <summary>
/// Char of the current timestep.
/// </summary>
public string Character;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>
public int Timestep;
/// <summary>
/// Position of the character in seconds.
/// </summary>
public float StartTime;
}
}
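The two time fields are related by the 20 ms frame size mentioned in the comments above. A small illustrative Python sketch (the native library fills in StartTime itself; exact rounding there is not guaranteed to match):

def timestep_to_seconds(timestep, frame_seconds=0.02):
    # One timestep = 20 ms per the doc comments above; illustration only.
    return timestep * frame_seconds

print(timestep_to_seconds(150))  # -> 3.0 seconds, i.e. the default pre-allocation window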

View File

@ -1,5 +1,6 @@
using System;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient
@ -36,6 +37,12 @@ namespace DeepSpeechClient
uint aBufferSize,
uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(ModelState** aCtx,
short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DestroyModel(ModelState** aCtx);
@ -44,10 +51,15 @@ namespace DeepSpeechClient
uint aPreAllocFrames,
uint aSampleRate, ref StreamingState** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_DiscardStream(ref StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeMetadata(IntPtr metadata);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeString(IntPtr str);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern void DS_FeedAudioContent(StreamingState** aSctx,
@ -57,8 +69,12 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern string DS_IntermediateDecode(StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl,
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_FinishStream( StreamingState** aSctx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern string DS_FinishStream( StreamingState** aSctx);
internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(StreamingState** aSctx);
#endregion
}
}

View File

@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct Metadata
{
/// <summary>
/// Native list of items.
/// </summary>
internal unsafe IntPtr items;
/// <summary>
/// Count of items from the native side.
/// </summary>
internal unsafe int num_items;
/// <summary>
/// Approximated probability (confidence value) for this transcription.
/// </summary>
internal unsafe double probability;
}
}

View File

@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace DeepSpeechClient.Structs
{
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct MetadataItem
{
/// <summary>
/// Native character.
/// </summary>
internal unsafe IntPtr character;
/// <summary>
/// Position of the character in units of 20ms.
/// </summary>
internal unsafe int timestep;
/// <summary>
/// Position of the character in seconds.
/// </summary>
internal unsafe float start_time;
}
}

View File

@ -13,25 +13,6 @@
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>

View File

@ -1,5 +1,6 @@
using DeepSpeechClient;
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Models;
using NAudio.Wave;
using System;
using System.Collections.Generic;
@ -20,6 +21,17 @@ namespace CSharpExamples
static string GetArgument(IEnumerable<string> args, string option)
=> args.SkipWhile(i => i != option).Skip(1).Take(1).FirstOrDefault();
static string MetadataToString(Metadata meta)
{
var nl = Environment.NewLine;
string retval =
Environment.NewLine +$"Recognized text: {string.Join("", meta?.Items?.Select(x=>x.Character))} {nl}"
+ $"Prob: {meta?.Probability} {nl}"
+ $"Item count: {meta?.Items?.Length} {nl}"
+ string.Join(nl, meta?.Items?.Select(x => $"Timestep : {x.Timestep} TimeOffset: {x.StartTime} Char: {x.Character}"));
return retval;
}
static void Main(string[] args)
{
string model = null;
@ -27,6 +39,7 @@ namespace CSharpExamples
string lm = null;
string trie = null;
string audio = null;
bool extended = false;
if (args.Length > 0)
{
model = GetArgument(args, "--model");
@ -34,6 +47,7 @@ namespace CSharpExamples
lm = GetArgument(args, "--lm");
trie = GetArgument(args, "--trie");
audio = GetArgument(args, "--audio");
extended = !string.IsNullOrWhiteSpace(GetArgument(args, "--extended"));
}
const uint N_CEP = 26;
@ -50,9 +64,9 @@ namespace CSharpExamples
Console.WriteLine("Loading model...");
stopwatch.Start();
try
{
{
result = sttClient.CreateModel(
model ?? "output_graph.pbmm",
model ?? "output_graph.pbmm",
N_CEP, N_CONTEXT,
alphabet ?? "alphabet.txt",
BEAM_WIDTH);
@ -62,7 +76,6 @@ namespace CSharpExamples
Console.WriteLine("Error loading lm.");
Console.WriteLine(ex.Message);
}
stopwatch.Stop();
if (result == 0)
{
@ -95,13 +108,22 @@ namespace CSharpExamples
stopwatch.Start();
string speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
string speechResult;
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
speechResult = MetadataToString(metaResult);
}
else
{
speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
}
stopwatch.Stop();
Console.WriteLine($"Audio duration: {waveInfo.TotalTime.ToString()}");
Console.WriteLine($"Inference took: {stopwatch.Elapsed.ToString()}");
Console.WriteLine($"Recognized text: {speechResult}");
Console.WriteLine((extended ? $"Extended result: ": "Recognized text: ") + speechResult);
}
waveBuffer.Clear();
}

View File

@ -16,6 +16,25 @@
%pointer_functions(StreamingState*, streamingstatep);
%typemap(newfree) char* "DS_FreeString($1);";
%include "carrays.i"
%array_functions(struct MetadataItem, metadataItem_array);
%extend struct Metadata {
MetadataItem getItem(int i) {
return metadataItem_array_getitem(self->items, i);
}
~Metadata() {
DS_FreeMetadata(self);
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;

View File

@ -12,6 +12,7 @@ import org.junit.runners.MethodSorters;
import static org.junit.Assert.*;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.Metadata;
import java.io.RandomAccessFile;
import java.io.FileNotFoundException;
@ -66,10 +67,18 @@ public class BasicTest {
@Test
public void loadDeepSpeech_basic() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.destroyModel();
m.destroyModel();
}
private String doSTT(DeepSpeechModel m) {
private String metadataToString(Metadata m) {
String retval = "";
for (int i = 0; i < m.getNum_items(); ++i) {
retval += m.getItem(i).getCharacter();
}
return retval;
}
private String doSTT(DeepSpeechModel m, boolean extendedMetadata) {
try {
RandomAccessFile wave = new RandomAccessFile(wavFile, "r");
@ -96,7 +105,11 @@ public class BasicTest {
// to turn bytes to shorts as either big endian or little endian.
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
return m.stt(shorts, shorts.length, sampleRate);
if (extendedMetadata) {
return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
} else {
return m.stt(shorts, shorts.length, sampleRate);
}
} catch (FileNotFoundException ex) {
} catch (IOException ex) {
@ -105,25 +118,44 @@ public class BasicTest {
}
return "";
return "";
}
@Test
public void loadDeepSpeech_stt_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_stt_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_noLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
@Test
public void loadDeepSpeech_sttWithMetadata_withLM() {
DeepSpeechModel m = new DeepSpeechModel(modelFile, N_CEP, N_CONTEXT, alphabetFile, BEAM_WIDTH);
m.enableDecoderWihLM(alphabetFile, lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
m.destroyModel();
}
}

View File

@ -29,6 +29,10 @@ public class DeepSpeechModel {
return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
}
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
}
public DeepSpeechStreamingState setupStream(int prealloc_frames, int sample_rate) {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.SetupStream(this._msp, prealloc_frames, sample_rate, ssp);
@ -46,4 +50,8 @@ public class DeepSpeechModel {
public String finishStream(DeepSpeechStreamingState ctx) {
return impl.FinishStream(ctx.get());
}
public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
return impl.FinishStreamWithMetadata(ctx.get());
}
}

View File

@ -56,13 +56,22 @@ parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configura
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'})
parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'});
parser.addArgument(['--extended'], {action: 'storeTrue', help: 'Output string from extended metadata'});
var args = parser.parseArgs();
function totalTime(hrtimeValue) {
return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4);
}
function metadataToString(metadata) {
var retval = ""
for (var i = 0; i < metadata.num_items; ++i) {
retval += metadata.items[i].character;
}
return retval;
}
const buffer = Fs.readFileSync(args['audio']);
const result = Wav.decode(buffer);
@ -119,7 +128,11 @@ audioStream.on('finish', () => {
// We take half of the buffer_size because buffer is a char* while
// LocalDsSTT() expected a short*
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
if (args['extended']) {
console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
} else {
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
}
const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
process.exit(0);

View File

@ -28,6 +28,8 @@ using namespace node;
// make sure the string returned by SpeechToText is freed
%typemap(newfree) char* "DS_FreeString($1);";
%typemap(newfree) Metadata* "DS_FreeMetadata($1);";
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
@ -41,7 +43,7 @@ using namespace node;
%typemap(argout) ModelState **retval {
$result = SWIGV8_ARRAY_NEW();
SWIGV8_AppendOutput($result, SWIG_From_int(result));
// owned by SWIG, ModelState destructor gets called when the Python object is finalized (see below)
// owned by SWIG, ModelState destructor gets called when the JavaScript object is finalized (see below)
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, SWIG_POINTER_OWN));
}
@ -60,7 +62,7 @@ using namespace node;
}
// extend ModelState with a destructor so that DestroyModel will be called
// when the Python object gets finalized.
// when the JavaScript object gets finalized.
%nodefaultctor ModelState;
%nodefaultdtor ModelState;
@ -72,6 +74,31 @@ struct ModelState {};
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%extend Metadata {
v8::Handle<v8::Value> items;
v8::Handle<v8::Value> items_get() {
v8::Handle<v8::Value> jsresult = SWIGV8_ARRAY_NEW();
for (int i = 0; i < self->num_items; ++i) {
jsresult = SWIGV8_AppendOutput(jsresult, SWIG_NewPointerObj(SWIG_as_voidptr(&self->items[i]), SWIGTYPE_p_MetadataItem, SWIG_POINTER_OWN));
}
fail:
return jsresult;
}
v8::Handle<v8::Value> items_set(const v8::Handle<v8::Value> arg) {
fail:
v8::Handle<v8::Value> result = SWIGV8_ARRAY_NEW();
return result;
}
~Metadata() {
DS_FreeMetadata($self);
}
}
%rename ("%(strip:[DS_])s") "";
%include "../deepspeech.h"

View File

@ -43,6 +43,11 @@ Model.prototype.stt = function() {
return binding.SpeechToText.apply(null, args);
}
Model.prototype.sttWithMetadata = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToTextWithMetadata.apply(null, args);
}
Model.prototype.setupStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.SetupStream.apply(null, args);
@ -66,6 +71,10 @@ Model.prototype.finishStream = function() {
return binding.FinishStream.apply(null, arguments);
}
Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments);
}
module.exports = {
Model: Model,
printVersions: binding.PrintVersions

View File

@ -34,6 +34,9 @@ class Model(object):
def stt(self, *args, **kwargs):
    return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)

def sttWithMetadata(self, *args, **kwargs):
    return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
def setupStream(self, pre_alloc_frames=150, sample_rate=16000):
status, ctx = deepspeech.impl.SetupStream(self._impl,
aPreAllocFrames=pre_alloc_frames,
@ -50,3 +53,6 @@ class Model(object):
def finishStream(self, *args, **kwargs):
    return deepspeech.impl.FinishStream(*args, **kwargs)

def finishStreamWithMetadata(self, *args, **kwargs):
    return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
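As with stt/sttWithMetadata, the streaming path now has a metadata-returning twin. A hedged usage sketch, reusing a `ds` Model instance and assuming `chunks` is any iterable of int16 NumPy buffers; feedAudioContent is the pre-existing streaming feed method, not something added here:

ctx = ds.setupStream(sample_rate=16000)
for chunk in chunks:                       # chunks: iterable of np.int16 buffers (assumed)
    ds.feedAudioContent(ctx, chunk)
metadata = ds.finishStreamWithMetadata(ctx)
print(''.join(metadata.items[i].character for i in range(metadata.num_items)))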

View File

@ -50,6 +50,12 @@ def convert_samplerate(audio_path):
return 16000, np.frombuffer(output, np.int16)
def metadata_to_string(metadata):
    retval = ''
    for item in range(metadata.num_items):
        retval += metadata.items[item].character
    return retval
class VersionAction(argparse.Action):
def __init__(self, *args, **kwargs):
@ -73,6 +79,8 @@ def main():
help='Path to the audio file to run (WAV format)')
parser.add_argument('--version', action=VersionAction,
help='Print version and exits')
parser.add_argument('--extended', required=False, action='store_true',
help='Output string from extended metadata')
args = parser.parse_args()
print('Loading model from file {}'.format(args.model), file=sys.stderr)
@ -101,7 +109,10 @@ def main():
print('Running inference.', file=sys.stderr)
inference_start = timer()
print(ds.stt(audio, fs))
if args.extended:
    print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
else:
    print(ds.stt(audio, fs))
inference_end = timer() - inference_start
print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
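A richer formatter in the spirit of the C# MetadataToString earlier in this PR, printing per-character timings as well; a hypothetical extension of this client, not part of the change:

def metadata_to_report(metadata):
    text = ''.join(metadata.items[i].character for i in range(metadata.num_items))
    lines = ['Recognized text: ' + text,
             'Prob: %f' % metadata.probability,
             'Item count: %d' % metadata.num_items]
    for i in range(metadata.num_items):
        item = metadata.items[i]
        lines.append('Timestep: %d  TimeOffset: %.2f  Char: %r'
                     % (item.timestep, item.start_time, item.character))
    return '\n'.join(lines)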

View File

@ -33,7 +33,30 @@ import_array();
%append_output(SWIG_NewPointerObj(%as_voidptr(*$1), $*1_descriptor, 0));
}
%extend struct MetadataItem {
MetadataItem* __getitem__(size_t i) {
return &$self[i];
}
}
%typemap(out) Metadata* {
// owned, extended destructor needs to be called by SWIG
%append_output(SWIG_NewPointerObj(%as_voidptr($1), $1_descriptor, SWIG_POINTER_OWN));
}
%extend struct Metadata {
~Metadata() {
DS_FreeMetadata($self);
}
}
%nodefaultdtor Metadata;
%nodefaultctor Metadata;
%nodefaultctor MetadataItem;
%nodefaultdtor MetadataItem;
%typemap(newfree) char* "DS_FreeString($1);";
%newobject DS_SpeechToText;
%newobject DS_IntermediateDecode;
%newobject DS_FinishStream;
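Because the Metadata* typemap above hands ownership to Python and the extended destructor calls DS_FreeMetadata, callers never free the struct by hand. A small sketch, reusing the ds/audio/fs names from the client example:

metadata = ds.sttWithMetadata(audio, fs)
text = ''.join(metadata.items[i].character for i in range(metadata.num_items))
del metadata  # dropping the last reference lets the extended destructor free the native struct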

View File

@ -312,6 +312,11 @@ run_tflite_basic_inference_tests()
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${ANDROID_TMP_DIR}/ds/${model_name} --alphabet ${ANDROID_TMP_DIR}/ds/alphabet.txt --audio ${ANDROID_TMP_DIR}/ds/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
}
run_netframework_inference_tests()
@ -321,6 +326,11 @@ run_netframework_inference_tests()
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended yes 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
@ -339,6 +349,11 @@ run_electronjs_inference_tests()
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e
@ -358,6 +373,12 @@ run_basic_inference_tests()
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$?
@ -822,12 +843,12 @@ do_deepspeech_netframework_build()
# We need MSYS2_ARG_CONV_EXCL='/' otherwise the '/' of CLI parameters gets mangled and disappears
# We build the .NET Client for .NET Framework v4.5,v4.6,v4.7
MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
DeepSpeechClient/DeepSpeechClient.csproj \
/p:Configuration=Release \
/p:Platform=x64 \
/p:TargetFrameworkVersion="v4.5" \
/p:TargetFrameworkVersion="v4.5.2" \
/p:OutputPath=bin/nuget/x64/v4.5
MSYS2_ARG_CONV_EXCL='/' "${MSBUILD}" \
@ -863,13 +884,13 @@ do_nuget_build()
cp ${DS_TFDIR}/bazel-bin/native_client/libdeepspeech.so nupkg/build
# We copy the generated clients for .NET into the Nuget framework dirs
mkdir -p nupkg/lib/net45/
cp DeepSpeechClient/bin/nuget/x64/v4.5/DeepSpeechClient.dll nupkg/lib/net45/
mkdir -p nupkg/lib/net46/
cp DeepSpeechClient/bin/nuget/x64/v4.6/DeepSpeechClient.dll nupkg/lib/net46/
mkdir -p nupkg/lib/net47/
cp DeepSpeechClient/bin/nuget/x64/v4.7/DeepSpeechClient.dll nupkg/lib/net47/