Merge pull request #2420 from mozilla/remove-sr-param

Remove unused sample rate param from API
Reuben Morais, 2019-10-10 21:05:29 +02:00, committed by GitHub
commit 315a67bf69
21 changed files with 114 additions and 114 deletions
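
The change is mechanical across every binding: each one-shot and streaming entry point drops its sample-rate argument, which the native code never consulted. Audio must already be at the sample rate the model was trained on (16 kHz for the released models), so the parameter was dead weight. A minimal before/after sketch in the Python binding (the file name is a placeholder, and the `Model` constructor arguments, unchanged by this PR, are elided):

```python
import wave

import numpy as np
from deepspeech import Model

ds = Model(...)  # constructor arguments unchanged by this PR; elided here

with wave.open('audio.wav', 'rb') as fin:
    fs = fin.getframerate()  # still useful locally (duration, sanity checks)
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

# Before: decoded = ds.stt(audio, fs)
decoded = ds.stt(audio)  # after: no sample rate argument
```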

View File

@@ -45,7 +45,7 @@ def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
         audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
         fin.close()
-        decoded = ds.stt(audio, fs)
+        decoded = ds.stt(audio)
         queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
         print(queue_out.qsize(), end='\r') # Update the current progress

View File

@@ -95,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
 ]);

 let audioLength = 0;
-let sctx = model.createStream(AUDIO_SAMPLE_RATE);
+let sctx = model.createStream();

 function finishStream() {
   const model_load_start = process.hrtime();
@@ -108,7 +108,7 @@ function finishStream() {

 function intermediateDecode() {
   finishStream();
-  sctx = model.createStream(AUDIO_SAMPLE_RATE);
+  sctx = model.createStream();
 }

 function feedAudioContent(chunk) {
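
The hunk above shows the Node streaming pattern: finish the stream to get an intermediate result, then create a fresh one. The same pattern in the Python binding, as a sketch (`ds` is a loaded `deepspeech.Model`; `finishStream` is assumed to keep its existing one-argument form, which this PR does not touch):

```python
ctx = ds.createStream()  # sample rate argument removed

def intermediate_decode():
    """Decode everything fed so far, then start over with a fresh stream."""
    global ctx
    print(ds.finishStream(ctx))  # finishing consumes the stream state
    ctx = ds.createStream()
```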

View File

@@ -130,7 +130,7 @@ namespace DeepSpeechWPF
             watch.Start();
             await Task.Run(() =>
             {
-                string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                 watch.Stop();
                 Dispatcher.Invoke(() =>
                 {
@@ -250,7 +250,7 @@ namespace DeepSpeechWPF

         private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
         {
-            _sttClient.CreateStream(16000);
+            _sttClient.CreateStream();
             _audioCapture.Start();
             btnStartRecording.IsEnabled = false;
             btnStopRecording.IsEnabled = true;

View File

@@ -64,7 +64,7 @@ audioStream.on('finish', () => {
  const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
  console.log('audio length', audioLength);

-  let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
+  let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));

  console.log('result:', result);
 });

View File

@@ -44,12 +44,12 @@ Returns a list [Inference, Inference Time, Audio Length]
 '''
 def stt(ds, audio, fs):
     inference_time = 0.0
-    audio_length = len(audio) * (1 / 16000)
+    audio_length = len(audio) * (1 / fs)

     # Run Deepspeech
     logging.debug('Running inference...')
     inference_start = timer()
-    output = ds.stt(audio, fs)
+    output = ds.stt(audio)
     inference_end = timer() - inference_start
     inference_time += inference_end
     logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))

View File

@@ -54,23 +54,23 @@ char* JSONOutput(Metadata* metadata);
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
-           int aSampleRate, bool extended_output, bool json_output)
+           bool extended_output, bool json_output)
 {
   ds_result res = {0};

   clock_t ds_start_time = clock();

   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
     res.string = metadataToString(metadata);
     DS_FreeMetadata(metadata);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
     res.string = JSONOutput(metadata);
     DS_FreeMetadata(metadata);
   } else if (stream_size > 0) {
     StreamingState* ctx;
-    int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+    int status = DS_CreateStream(aCtx, &ctx);
     if (status != DS_ERR_OK) {
       res.string = strdup("");
       return res;
@@ -94,7 +94,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
     }
     res.string = DS_FinishStream(ctx);
   } else {
-    res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
+    res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize);
   }

   clock_t ds_end_infer = clock();
@@ -108,7 +108,6 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
 typedef struct {
   char* buffer;
   size_t buffer_size;
-  int sample_rate;
 } ds_audio_buffer;

 ds_audio_buffer
@@ -159,8 +158,6 @@ GetAudioBuffer(const char* path)
   assert(output);

-  res.sample_rate = (int)output->signal.rate;
-
   if ((int)input->signal.rate < 16000) {
     fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
   }
@@ -221,7 +218,6 @@ GetAudioBuffer(const char* path)
   unsigned int sample_rate;
   fseek(wave, 24, SEEK_SET); rv = fread(&sample_rate, 4, 1, wave);
-  res.sample_rate = (int)sample_rate;

   unsigned short bits_per_sample;
   fseek(wave, 34, SEEK_SET); rv = fread(&bits_per_sample, 2, 1, wave);
@@ -269,7 +265,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   ds_result result = LocalDsSTT(context,
                                 (const short*)audio.buffer,
                                 audio.buffer_size / 2,
-                                audio.sample_rate,
                                 extended_metadata,
                                 json_output);
   free(audio.buffer);

View File

@@ -318,7 +318,6 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
 int
 DS_CreateStream(ModelState* aCtx,
-                unsigned int aSampleRate,
                 StreamingState** retval)
 {
   *retval = nullptr;
@@ -383,11 +382,10 @@ DS_FinishStreamWithMetadata(StreamingState* aSctx)
 StreamingState*
 CreateStreamAndFeedAudioContent(ModelState* aCtx,
                                 const short* aBuffer,
-                                unsigned int aBufferSize,
-                                unsigned int aSampleRate)
+                                unsigned int aBufferSize)
 {
   StreamingState* ctx;
-  int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+  int status = DS_CreateStream(aCtx, &ctx);
   if (status != DS_ERR_OK) {
     return nullptr;
   }
@@ -398,20 +396,18 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx,
 char*
 DS_SpeechToText(ModelState* aCtx,
                 const short* aBuffer,
-                unsigned int aBufferSize,
-                unsigned int aSampleRate)
+                unsigned int aBufferSize)
 {
-  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
   return DS_FinishStream(ctx);
 }

 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize,
-                            unsigned int aSampleRate)
+                            unsigned int aBufferSize)
 {
-  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
   return DS_FinishStreamWithMetadata(ctx);
 }
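
After this refactor the one-shot entry points are thin wrappers: `DS_SpeechToText` is literally "create a stream, feed the whole buffer, finish". The equivalence carries over to the bindings; in Python (a sketch under the same assumptions as the earlier examples):

```python
# One-shot call...
text = ds.stt(audio)

# ...does the same work as the streaming sequence it wraps:
ctx = ds.createStream()
ds.feedAudioContent(ctx, audio, len(audio))
text = ds.finishStream(ctx)
```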

View File

@@ -124,9 +124,8 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate.
+ *                sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
  *
  * @return The STT result. The user is responsible for freeing the string using
  *         {@link DS_FreeString()}. Returns NULL on error.
@@ -134,8 +133,7 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 char* DS_SpeechToText(ModelState* aCtx,
                       const short* aBuffer,
-                      unsigned int aBufferSize,
-                      unsigned int aSampleRate);
+                      unsigned int aBufferSize);

 /**
  * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
@@ -143,9 +141,8 @@ char* DS_SpeechToText(ModelState* aCtx,
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate.
+ *                sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
  *
  * @return Outputs a struct of individual letters along with their timing information.
  *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -153,8 +150,7 @@ char* DS_SpeechToText(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize,
-                                      unsigned int aSampleRate);
+                                      unsigned int aBufferSize);

 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -162,7 +158,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
  *            and {@link DS_FinishStream()}.
  *
  * @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
  * @param[out] retval an opaque pointer that represents the streaming state. Can
  *             be NULL if an error occurs.
  *
@@ -170,7 +165,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
  */
 DEEPSPEECH_EXPORT
 int DS_CreateStream(ModelState* aCtx,
-                    unsigned int aSampleRate,
                     StreamingState** retval);

 /**
@@ -178,7 +172,7 @@ int DS_CreateStream(ModelState* aCtx,
  *
  * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
  * @param aBuffer An array of 16-bit, mono raw audio samples at the
- *                appropriate sample rate.
+ *                appropriate sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in @p aBuffer.
  */
 DEEPSPEECH_EXPORT

View File

@@ -71,17 +71,17 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
  *            and {@link DS_FinishStream()}.
  *
  * @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
  * @param[out] retval an opaque pointer that represents the streaming state. Can
  *             be NULL if an error occurs.
  *
  * @return Zero for success, non-zero on failure.
  */
 int DS_SetupStream(ModelState* aCtx,
-                   unsigned int aSampleRate,
+                   unsigned int /*aSampleRate*/,
                    StreamingState** retval)
 {
-  return DS_CreateStream(aCtx, aSampleRate, retval);
+  return DS_CreateStream(aCtx, retval);
 }

@@ -98,4 +98,45 @@ void DS_DiscardStream(StreamingState* aSctx)
   return DS_FreeStream(aSctx);
 }

+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ *                sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return The STT result. The user is responsible for freeing the string using
+ *         {@link DS_FreeString()}. Returns NULL on error.
+ */
+char* DS_SpeechToText(ModelState* aCtx,
+                      const short* aBuffer,
+                      unsigned int aBufferSize,
+                      unsigned int /*aSampleRate*/)
+{
+  return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
+}
+
+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
+ *        about the results.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ *                sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return Outputs a struct of individual letters along with their timing information.
+ *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ */
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                      const short* aBuffer,
+                                      unsigned int aBufferSize,
+                                      unsigned int /*aSampleRate*/)
+{
+  return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
+}
+
 #endif /* DEEPSPEECH_COMPAT_H */
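
Note that this compatibility shim only helps C and C++ callers: the old signatures still compile and forward to the new entry points. Users of the scripting bindings must drop the argument themselves. In Python, for instance, the old keyword form now fails at the call site:

```python
ds.createStream(sample_rate=16000)  # now raises TypeError (unexpected keyword argument)
ds.createStream()                   # the new, argument-free form
```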

View File

@@ -148,7 +148,7 @@ namespace DeepSpeechClient
         /// <summary>
         /// Feeds audio samples to an ongoing streaming inference.
         /// </summary>
-        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
+        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
        public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
        {
            NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -193,11 +193,10 @@ namespace DeepSpeechClient
        /// <summary>
        /// Creates a new streaming inference state.
        /// </summary>
-        /// <param name="aSampleRate">The sample-rate of the audio signal</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-        public unsafe void CreateStream(uint aSampleRate)
+        public unsafe void CreateStream()
        {
-            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
+            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
            EvaluateResultCode(resultCode);
        }
@@ -230,25 +229,23 @@ namespace DeepSpeechClient
        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
-        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
-        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
        {
-            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
+            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
        }

        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
-        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
-        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
        {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
        }

        #endregion

View File

@@ -40,24 +40,20 @@ namespace DeepSpeechClient.Interfaces
        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
-        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
        unsafe string SpeechToText(short[] aBuffer,
-                uint aBufferSize,
-                uint aSampleRate);
+                uint aBufferSize);

        /// <summary>
        /// Use the DeepSpeech model to perform Speech-To-Text.
        /// </summary>
-        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
        /// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
        unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                uint aBufferSize,
-                uint aSampleRate);
+                uint aBufferSize);

        /// <summary>
        /// Destroy a streaming state without decoding the computed logits.
@@ -79,14 +75,13 @@ namespace DeepSpeechClient.Interfaces
        /// <summary>
        /// Creates a new streaming inference state.
        /// </summary>
-        /// <param name="aSampleRate">The sample-rate of the audio signal</param>
        /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-        unsafe void CreateStream(uint aSampleRate);
+        unsafe void CreateStream();

        /// <summary>
        /// Feeds audio samples to an ongoing streaming inference.
        /// </summary>
-        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
+        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
        unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);

        /// <summary>
/// <summary> /// <summary>

View File

@@ -31,21 +31,19 @@ namespace DeepSpeechClient
                     CharSet = CharSet.Ansi, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
                     short[] aBuffer,
-                     uint aBufferSize,
-                     uint aSampleRate);
+                     uint aBufferSize);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
        internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
                     short[] aBuffer,
-                     uint aBufferSize,
-                     uint aSampleRate);
+                     uint aBufferSize);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx,
-                     uint aSampleRate,
                     ref IntPtr** retval);

        [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
        internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);

View File

@@ -91,12 +91,12 @@ namespace CSharpExamples
                string speechResult;
                if (extended)
                {
-                    Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                    Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                    speechResult = MetadataToString(metaResult);
                }
                else
                {
-                    speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                    speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                }

                stopwatch.Stop();

View File

@@ -100,7 +100,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
                long inferenceStartTime = System.currentTimeMillis();

-                String decoded = this._m.stt(shorts, shorts.length, sampleRate);
+                String decoded = this._m.stt(shorts, shorts.length);

                inferenceExecTime = System.currentTimeMillis() - inferenceStartTime;

View File

@@ -104,9 +104,9 @@ public class BasicTest {
            ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

            if (extendedMetadata) {
-                return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
+                return metadataToString(m.sttWithMetadata(shorts, shorts.length));
            } else {
-                return m.stt(shorts, shorts.length, sampleRate);
+                return m.stt(shorts, shorts.length);
            }
        } catch (FileNotFoundException ex) {

View File

@@ -57,14 +57,13 @@ public class DeepSpeechModel {
     * @brief Use the DeepSpeech model to perform Speech-To-Text.
     *
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
-     *               sample rate.
+     *               sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
-     * @param sample_rate The sample-rate of the audio signal.
     *
     * @return The STT result.
     */
-    public String stt(short[] buffer, int buffer_size, int sample_rate) {
-        return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
+    public String stt(short[] buffer, int buffer_size) {
+        return impl.SpeechToText(this._msp, buffer, buffer_size);
    }

    /**
@@ -72,14 +71,13 @@ public class DeepSpeechModel {
     * about the results.
     *
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
-     *               sample rate.
+     *               sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
-     * @param sample_rate The sample-rate of the audio signal.
     *
     * @return Outputs a Metadata object of individual letters along with their timing information.
     */
-    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
-        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
+    public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
+        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
    }

    /**
@@ -87,12 +85,11 @@ public class DeepSpeechModel {
     * by this function can then be passed to feedAudioContent()
     * and finishStream().
     *
-     * @param sample_rate The sample-rate of the audio signal.
     * @return An opaque object that represents the streaming state.
     */
-    public DeepSpeechStreamingState createStream(int sample_rate) {
+    public DeepSpeechStreamingState createStream() {
        SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-        impl.CreateStream(this._msp, sample_rate, ssp);
+        impl.CreateStream(this._msp, ssp);
        return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
    }

@@ -101,7 +98,7 @@ public class DeepSpeechModel {
     *
     * @param cctx A streaming state pointer returned by createStream().
     * @param buffer An array of 16-bit, mono raw audio samples at the
-     *               appropriate sample rate.
+     *               appropriate sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in @p buffer.
     */
    public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {

View File

@@ -118,9 +118,9 @@ audioStream.on('finish', () => {
  // We take half of the buffer_size because buffer is a char* while
  // LocalDsSTT() expected a short*
  if (args['extended']) {
-    console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
+    console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2))));
  } else {
-    console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
+    console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2)));
  }
  const inference_stop = process.hrtime(inference_start);
  console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));

View File

@@ -64,9 +64,8 @@ Model.prototype.enableDecoderWithLM = function() {
 /**
 * Use the DeepSpeech model to perform Speech-To-Text.
 *
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {string} The STT result. Returns undefined on error.
 */
@@ -79,9 +78,8 @@ Model.prototype.stt = function() {
 /**
 * Use the DeepSpeech model to perform Speech-To-Text and output metadata
 * about the results.
 *
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
@@ -93,7 +91,6 @@ Model.prototype.sttWithMetadata = function() {
 /**
 * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
 *
- * @param {number} aSampleRate The sample-rate of the audio signal.
 * @return {object} an opaque object that represents the streaming state.
 *
 * @throws on error
@@ -114,7 +111,7 @@ Model.prototype.createStream = function() {
 *
 * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
 * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
- *                 appropriate sample rate.
+ *                 appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in @param aBuffer.
 */
 Model.prototype.feedAudioContent = function() {

View File

@@ -69,15 +69,12 @@ class Model(object):
        """
        Use the DeepSpeech model to perform Speech-To-Text.

-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int

-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
        :return: The STT result.
        :type: str
        """
@@ -87,34 +84,27 @@ class Model(object):
        """
        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.

-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int

-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
        :return: Outputs a struct of individual letters along with their timing information.
        :type: :func:`Metadata`
        """
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)

-    def createStream(self, sample_rate=16000):
+    def createStream(self):
        """
        Create a new streaming inference state. The streaming state returned
        by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.

-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
        :return: Object holding the stream

        :throws: RuntimeError on error
        """
-        status, ctx = deepspeech.impl.CreateStream(self._impl,
-                                                   aSampleRate=sample_rate)
+        status, ctx = deepspeech.impl.CreateStream(self._impl)
        if status != 0:
            raise RuntimeError("CreateStream failed with error code {}".format(status))
        return ctx

@@ -127,7 +117,7 @@ class Model(object):
        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object

-        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
        :type aBuffer: int array

        :param aBufferSize: The number of samples in @p aBuffer.
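
With the parameter gone, the Python streaming lifecycle reduces to the sketch below (the chunk count is arbitrary; `feedAudioContent` and `finishStream` are assumed to keep the signatures documented above, which this PR does not touch):

```python
ctx = ds.createStream()
for chunk in np.array_split(audio, 10):  # any chunking works
    ds.feedAudioContent(ctx, chunk, len(chunk))
print(ds.finishStream(ctx))
```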

View File

@@ -102,9 +102,9 @@ def main():
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
+        print(metadata_to_string(ds.sttWithMetadata(audio)))
    else:
-        print(ds.stt(audio, fs))
+        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

View File

@@ -52,8 +52,8 @@ def main():
    audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

-    stream1 = ds.createStream(sample_rate=fs1)
-    stream2 = ds.createStream(sample_rate=fs2)
+    stream1 = ds.createStream()
+    stream2 = ds.createStream()

    splits1 = np.array_split(audio1, 10)
    splits2 = np.array_split(audio2, 10)
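
Because each stream carries its own decoding state, the two streams in this test can be fed in any interleaving. A plausible continuation, reusing the names above (with the same hedges on `feedAudioContent` and `finishStream` as before):

```python
# Interleave the two files chunk by chunk; each stream decodes independently.
for s1, s2 in zip(splits1, splits2):
    ds.feedAudioContent(stream1, s1, len(s1))
    ds.feedAudioContent(stream2, s2, len(s2))

print(ds.finishStream(stream1))
print(ds.finishStream(stream2))
```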