diff --git a/evaluate_tflite.py b/evaluate_tflite.py
index 878f29b0..02e45af5 100644
--- a/evaluate_tflite.py
+++ b/evaluate_tflite.py
@@ -45,7 +45,7 @@ def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
         audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
         fin.close()
 
-        decoded = ds.stt(audio, fs)
+        decoded = ds.stt(audio)
 
         queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
         print(queue_out.qsize(), end='\r') # Update the current progress
diff --git a/examples/ffmpeg_vad_streaming/index.js b/examples/ffmpeg_vad_streaming/index.js
index 8aef749b..d64cc9f8 100644
--- a/examples/ffmpeg_vad_streaming/index.js
+++ b/examples/ffmpeg_vad_streaming/index.js
@@ -95,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
 ]);
 
 let audioLength = 0;
-let sctx = model.createStream(AUDIO_SAMPLE_RATE);
+let sctx = model.createStream();
 
 function finishStream() {
     const model_load_start = process.hrtime();
@@ -108,7 +108,7 @@ function finishStream() {
 
 function intermediateDecode() {
     finishStream();
-    sctx = model.createStream(AUDIO_SAMPLE_RATE);
+    sctx = model.createStream();
 }
 
 function feedAudioContent(chunk) {
diff --git a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
index 8b38316c..e332da6d 100644
--- a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
+++ b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
@@ -130,7 +130,7 @@ namespace DeepSpeechWPF
             watch.Start();
             await Task.Run(() =>
             {
-                string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                 watch.Stop();
                 Dispatcher.Invoke(() =>
                 {
@@ -250,7 +250,7 @@ namespace DeepSpeechWPF
 
         private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
         {
-            _sttClient.CreateStream(16000);
+            _sttClient.CreateStream();
             _audioCapture.Start();
             btnStartRecording.IsEnabled = false;
             btnStopRecording.IsEnabled = true;
diff --git a/examples/nodejs_wav/index.js b/examples/nodejs_wav/index.js
index a5432217..20ccb2ab 100644
--- a/examples/nodejs_wav/index.js
+++ b/examples/nodejs_wav/index.js
@@ -64,7 +64,7 @@ audioStream.on('finish', () => {
     const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
     console.log('audio length', audioLength);
 
-    let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
+    let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
 
     console.log('result:', result);
 });
diff --git a/examples/vad_transcriber/wavTranscriber.py b/examples/vad_transcriber/wavTranscriber.py
index 9f21f362..727dc5cf 100644
--- a/examples/vad_transcriber/wavTranscriber.py
+++ b/examples/vad_transcriber/wavTranscriber.py
@@ -44,12 +44,12 @@ Returns a list [Inference, Inference Time, Audio Length]
 '''
 def stt(ds, audio, fs):
     inference_time = 0.0
-    audio_length = len(audio) * (1 / 16000)
+    audio_length = len(audio) * (1 / fs)
 
     # Run Deepspeech
     logging.debug('Running inference...')
     inference_start = timer()
-    output = ds.stt(audio, fs)
+    output = ds.stt(audio)
     inference_end = timer() - inference_start
     inference_time += inference_end
     logging.debug('Inference took %0.3fs for %0.3fs audio file.'
                   % (inference_end, audio_length))
diff --git a/native_client/client.cc b/native_client/client.cc
index afde6cd8..358f527f 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -54,23 +54,23 @@ char* JSONOutput(Metadata* metadata);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
-           int aSampleRate, bool extended_output, bool json_output)
+           bool extended_output, bool json_output)
 {
   ds_result res = {0};
 
   clock_t ds_start_time = clock();
 
   if (extended_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
     res.string = metadataToString(metadata);
     DS_FreeMetadata(metadata);
   } else if (json_output) {
-    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
     res.string = JSONOutput(metadata);
     DS_FreeMetadata(metadata);
   } else if (stream_size > 0) {
     StreamingState* ctx;
-    int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+    int status = DS_CreateStream(aCtx, &ctx);
     if (status != DS_ERR_OK) {
       res.string = strdup("");
       return res;
@@ -94,7 +94,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
     }
     res.string = DS_FinishStream(ctx);
   } else {
-    res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
+    res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize);
   }
 
   clock_t ds_end_infer = clock();
@@ -108,7 +108,6 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
 }
 
 typedef struct {
   char*  buffer;
   size_t buffer_size;
-  int    sample_rate;
 } ds_audio_buffer;
 
 ds_audio_buffer
@@ -159,8 +158,6 @@ GetAudioBuffer(const char* path)
 
   assert(output);
 
-  res.sample_rate = (int)output->signal.rate;
-
   if ((int)input->signal.rate < 16000) {
     fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
   }
@@ -221,7 +218,6 @@ GetAudioBuffer(const char* path)
   unsigned int sample_rate;
   fseek(wave, 24, SEEK_SET);
   rv = fread(&sample_rate, 4, 1, wave);
-  res.sample_rate = (int)sample_rate;
 
   unsigned short bits_per_sample;
   fseek(wave, 34, SEEK_SET);
   rv = fread(&bits_per_sample, 2, 1, wave);
@@ -269,7 +265,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   ds_result result = LocalDsSTT(context,
                                 (const short*)audio.buffer,
                                 audio.buffer_size / 2,
-                                audio.sample_rate,
                                 extended_metadata,
                                 json_output);
   free(audio.buffer);
diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index bf4706dd..439702a6 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -318,7 +318,6 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
 
 int
 DS_CreateStream(ModelState* aCtx,
-                unsigned int aSampleRate,
                 StreamingState** retval)
 {
   *retval = nullptr;
@@ -383,11 +382,10 @@ DS_FinishStreamWithMetadata(StreamingState* aSctx)
 StreamingState*
 CreateStreamAndFeedAudioContent(ModelState* aCtx,
                                 const short* aBuffer,
-                                unsigned int aBufferSize,
-                                unsigned int aSampleRate)
+                                unsigned int aBufferSize)
 {
   StreamingState* ctx;
-  int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+  int status = DS_CreateStream(aCtx, &ctx);
   if (status != DS_ERR_OK) {
     return nullptr;
   }
@@ -398,20 +396,18 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx,
 
 char*
 DS_SpeechToText(ModelState* aCtx,
                 const short* aBuffer,
-                unsigned int aBufferSize,
-                unsigned int aSampleRate)
+                unsigned int aBufferSize)
 {
-  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
   return DS_FinishStream(ctx);
 }
 
 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             const short* aBuffer,
-                            unsigned int aBufferSize,
-                            unsigned int aSampleRate)
+                            unsigned int aBufferSize)
 {
-  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
   return DS_FinishStreamWithMetadata(ctx);
 }
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 5c64d998..ef25e985 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -124,9 +124,8 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate.
+ *                sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
  *
  * @return The STT result. The user is responsible for freeing the string using
  *         {@link DS_FreeString()}. Returns NULL on error.
@@ -134,8 +133,7 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 char* DS_SpeechToText(ModelState* aCtx,
                       const short* aBuffer,
-                      unsigned int aBufferSize,
-                      unsigned int aSampleRate);
+                      unsigned int aBufferSize);
 
 /**
  * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
  *        about the results.
@@ -143,9 +141,8 @@ char* DS_SpeechToText(ModelState* aCtx,
  *
  * @param aCtx The ModelState pointer for the model to use.
  * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- *                sample rate.
+ *                sample rate (matching what the model was trained on).
  * @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -153,8 +150,7 @@ char* DS_SpeechToText(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
                                       const short* aBuffer,
-                                      unsigned int aBufferSize,
-                                      unsigned int aSampleRate);
+                                      unsigned int aBufferSize);
 
 /**
  * @brief Create a new streaming inference state. The streaming state returned
@@ -162,7 +158,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
  *        and {@link DS_FinishStream()}.
 *
 * @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
 * @param[out] retval an opaque pointer that represents the streaming state. Can
 *             be NULL if an error occurs.
 *
@@ -170,7 +165,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
 * @return Zero for success, non-zero on failure.
 */
 DEEPSPEECH_EXPORT
 int DS_CreateStream(ModelState* aCtx,
-                    unsigned int aSampleRate,
                     StreamingState** retval);
 
 /**
@@ -178,7 +172,7 @@ int DS_CreateStream(ModelState* aCtx,
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 * @param aBuffer An array of 16-bit, mono raw audio samples at the
- *                  appropriate sample rate.
+ *                  appropriate sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in @p aBuffer.
 */
 DEEPSPEECH_EXPORT
diff --git a/native_client/deepspeech_compat.h b/native_client/deepspeech_compat.h
index fa1db328..57f3d16c 100644
--- a/native_client/deepspeech_compat.h
+++ b/native_client/deepspeech_compat.h
@@ -71,17 +71,17 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 *        and {@link DS_FinishStream()}.
 *
 * @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
 * @param[out] retval an opaque pointer that represents the streaming state. Can
 *             be NULL if an error occurs.
 *
 * @return Zero for success, non-zero on failure.
 */
 int DS_SetupStream(ModelState* aCtx,
-                   unsigned int aSampleRate,
+                   unsigned int /*aSampleRate*/,
                    StreamingState** retval)
 {
-  return DS_CreateStream(aCtx, aSampleRate, retval);
+  return DS_CreateStream(aCtx, retval);
 }
 
 /**
@@ -98,4 +98,45 @@ void DS_DiscardStream(StreamingState* aSctx)
   return DS_FreeStream(aSctx);
 }
 
+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ *                sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return The STT result. The user is responsible for freeing the string using
+ *         {@link DS_FreeString()}. Returns NULL on error.
+ */
+char* DS_SpeechToText(ModelState* aCtx,
+                      const short* aBuffer,
+                      unsigned int aBufferSize,
+                      unsigned int /*aSampleRate*/)
+{
+  return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
+}
+
+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
+ *        about the results.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ *                sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return Outputs a struct of individual letters along with their timing information.
+ *         The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ */
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+                                      const short* aBuffer,
+                                      unsigned int aBufferSize,
+                                      unsigned int /*aSampleRate*/)
+{
+  return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
+}
+
 #endif /* DEEPSPEECH_COMPAT_H */
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index 19247507..25fcc109 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -148,7 +148,7 @@ namespace DeepSpeechClient
         ///
         /// Feeds audio samples to an ongoing streaming inference.
         ///
-        /// An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+        /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
         public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
         {
             NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -193,11 +193,10 @@ namespace DeepSpeechClient
         ///
         /// Creates a new streaming inference state.
         ///
-        /// The sample-rate of the audio signal
         /// Thrown when the native binary failed to initialize the streaming mode.
-        public unsafe void CreateStream(uint aSampleRate)
+        public unsafe void CreateStream()
         {
-            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
+            var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
             EvaluateResultCode(resultCode);
         }
@@ -230,25 +229,23 @@ namespace DeepSpeechClient
         ///
         /// Use the DeepSpeech model to perform Speech-To-Text.
         ///
-        /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+        /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
-        /// The sample-rate of the audio signal.
         /// The STT result. The user is responsible for freeing the string. Returns NULL on error.
-        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+        public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
         {
-            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
+            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
         }
 
         ///
         /// Use the DeepSpeech model to perform Speech-To-Text.
         ///
-        /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+        /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
-        /// The sample-rate of the audio signal.
         /// The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.
-        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
         {
-            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
         }
         #endregion
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index 04ad086c..79af2964 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -40,24 +40,20 @@ namespace DeepSpeechClient.Interfaces
         ///
         /// Use the DeepSpeech model to perform Speech-To-Text.
         ///
-        /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+        /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
-        /// The sample-rate of the audio signal.
         /// The STT result. The user is responsible for freeing the string. Returns NULL on error.
         unsafe string SpeechToText(short[] aBuffer,
-                uint aBufferSize,
-                uint aSampleRate);
+                uint aBufferSize);
 
         ///
         /// Use the DeepSpeech model to perform Speech-To-Text.
         ///
-        /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+        /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         /// The number of samples in the audio signal.
-        /// The sample-rate of the audio signal.
         /// The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.
         unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-                uint aBufferSize,
-                uint aSampleRate);
+                uint aBufferSize);
 
         ///
         /// Destroy a streaming state without decoding the computed logits.
@@ -79,14 +75,13 @@ namespace DeepSpeechClient.Interfaces
         ///
         /// Creates a new streaming inference state.
         ///
-        /// The sample-rate of the audio signal
         /// Thrown when the native binary failed to initialize the streaming mode.
-        unsafe void CreateStream(uint aSampleRate);
+        unsafe void CreateStream();
 
         ///
         /// Feeds audio samples to an ongoing streaming inference.
         ///
-        /// An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+        /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
         unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
 
         ///
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
index 3f126acc..74de9197 100644
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@@ -31,21 +31,19 @@ namespace DeepSpeechClient
             CharSet = CharSet.Ansi, SetLastError = true)]
         internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
             short[] aBuffer,
-            uint aBufferSize,
-            uint aSampleRate);
+            uint aBufferSize);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
         internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
             short[] aBuffer,
-            uint aBufferSize,
-            uint aSampleRate);
+            uint aBufferSize);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
         internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
         internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx,
-            uint aSampleRate, ref IntPtr** retval);
+            ref IntPtr** retval);
 
         [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
         internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);
diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs
index 0940e63c..5085fd21 100644
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@@ -91,12 +91,12 @@ namespace CSharpExamples
                 string speechResult;
                 if (extended)
                 {
-                    Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                    Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                     speechResult = MetadataToString(metaResult);
                 }
                 else
                 {
-                    speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+                    speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
                 }
 
                 stopwatch.Stop();
diff --git a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
index f39e22fc..6b9c45b3 100644
--- a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
+++ b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
@@ -100,7 +100,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
 
             long inferenceStartTime = System.currentTimeMillis();
 
-            String decoded = this._m.stt(shorts, shorts.length, sampleRate);
+            String decoded = this._m.stt(shorts, shorts.length);
 
             inferenceExecTime = System.currentTimeMillis() - inferenceStartTime;
diff --git a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
index f4fc5bf6..5af94da9 100644
--- a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
+++ b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
@@ -104,9 +104,9 @@ public class BasicTest {
             ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
 
             if (extendedMetadata) {
-                return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
+                return metadataToString(m.sttWithMetadata(shorts, shorts.length));
             } else {
-                return m.stt(shorts, shorts.length, sampleRate);
+                return m.stt(shorts, shorts.length);
             }
 
         } catch (FileNotFoundException ex) {
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
index 288b27f3..3a665c5e 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
@@ -57,14 +57,13 @@ public class DeepSpeechModel {
     * @brief Use the DeepSpeech model to perform Speech-To-Text.
     *
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
-    *               sample rate.
+    *               sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
-    * @param sample_rate The sample-rate of the audio signal.
     *
     * @return The STT result.
     */
-    public String stt(short[] buffer, int buffer_size, int sample_rate) {
-        return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
+    public String stt(short[] buffer, int buffer_size) {
+        return impl.SpeechToText(this._msp, buffer, buffer_size);
     }
 
     /**
@@ -72,14 +71,13 @@ public class DeepSpeechModel {
     *        about the results.
     *
     * @param buffer A 16-bit, mono raw audio signal at the appropriate
-    *               sample rate.
+    *               sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in the audio signal.
-    * @param sample_rate The sample-rate of the audio signal.
     *
     * @return Outputs a Metadata object of individual letters along with their timing information.
     */
-    public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
-        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
+    public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
+        return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
     }
 
     /**
@@ -87,12 +85,11 @@ public class DeepSpeechModel {
     *        by this function can then be passed to feedAudioContent()
     *        and finishStream().
     *
-    * @param sample_rate The sample-rate of the audio signal.
     * @return An opaque object that represents the streaming state.
     */
-    public DeepSpeechStreamingState createStream(int sample_rate) {
+    public DeepSpeechStreamingState createStream() {
         SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-        impl.CreateStream(this._msp, sample_rate, ssp);
+        impl.CreateStream(this._msp, ssp);
         return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
     }
 
@@ -101,7 +98,7 @@ public class DeepSpeechModel {
     *
     * @param cctx A streaming state pointer returned by createStream().
     * @param buffer An array of 16-bit, mono raw audio samples at the
-    *               appropriate sample rate.
+    *               appropriate sample rate (matching what the model was trained on).
     * @param buffer_size The number of samples in @p buffer.
     */
    public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {
diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js
index 8e274fe7..e356c2e8 100644
--- a/native_client/javascript/client.js
+++ b/native_client/javascript/client.js
@@ -118,9 +118,9 @@ audioStream.on('finish', () => {
   // We take half of the buffer_size because buffer is a char* while
   // LocalDsSTT() expected a short*
   if (args['extended']) {
-    console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
+    console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2))));
   } else {
-    console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
+    console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2)));
   }
   const inference_stop = process.hrtime(inference_start);
   console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 3c524cc3..f6446f4d 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -64,9 +64,8 @@ Model.prototype.enableDecoderWithLM = function() {
 /**
  * Use the DeepSpeech model to perform Speech-To-Text.
 *
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {string} The STT result. Returns undefined on error.
 */
@@ -79,9 +78,8 @@ Model.prototype.stt = function() {
 /**
  * Use the DeepSpeech model to perform Speech-To-Text and output metadata
  * about the results.
 *
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
@@ -93,7 +91,6 @@ Model.prototype.sttWithMetadata = function() {
 /**
  * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
 *
- * @param {number} aSampleRate The sample-rate of the audio signal.
 * @return {object} an opaque object that represents the streaming state.
 *
 * @throws on error
 */
@@ -114,7 +111,7 @@ Model.prototype.createStream = function() {
 *
 * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
 * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
- *                         appropriate sample rate.
+ *                         appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in @param aBuffer.
 */
 Model.prototype.feedAudioContent = function() {
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index d10a4c98..62ea1eb5 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -69,15 +69,12 @@ class Model(object):
         """
         Use the DeepSpeech model to perform Speech-To-Text.
 
-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type aBuffer: int array
 
         :param aBufferSize: The number of samples in the audio signal.
         :type aBufferSize: int
 
-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
         :return: The STT result.
         :type: str
         """
@@ -87,34 +84,27 @@ class Model(object):
         """
         Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
 
-        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
         :type aBuffer: int array
 
         :param aBufferSize: The number of samples in the audio signal.
         :type aBufferSize: int
 
-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
         :return: Outputs a struct of individual letters along with their timing information.
         :type: :func:`Metadata`
         """
         return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
 
-    def createStream(self, sample_rate=16000):
+    def createStream(self):
         """
         Create a new streaming inference state. The streaming state returned by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
 
-        :param aSampleRate: The sample-rate of the audio signal.
-        :type aSampleRate: int
-
         :return: Object holding the stream
 
         :throws: RuntimeError on error
         """
-        status, ctx = deepspeech.impl.CreateStream(self._impl,
-                                                   aSampleRate=sample_rate)
+        status, ctx = deepspeech.impl.CreateStream(self._impl)
         if status != 0:
             raise RuntimeError("CreateStream failed with error code {}".format(status))
         return ctx
@@ -127,7 +117,7 @@ class Model(object):
         :param aSctx: A streaming state pointer returned by :func:`createStream()`.
         :type aSctx: object
 
-        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
         :type aBuffer: int array
 
         :param aBufferSize: The number of samples in @p aBuffer.
diff --git a/native_client/python/client.py b/native_client/python/client.py
index 3d6e71f9..b44c5122 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -102,9 +102,9 @@ def main():
     print('Running inference.', file=sys.stderr)
     inference_start = timer()
     if args.extended:
-        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
+        print(metadata_to_string(ds.sttWithMetadata(audio)))
     else:
-        print(ds.stt(audio, fs))
+        print(ds.stt(audio))
     inference_end = timer() - inference_start
     print('Inference took %0.3fs for %0.3fs audio file.'
           % (inference_end, audio_length), file=sys.stderr)
diff --git a/native_client/test/concurrent_streams.py b/native_client/test/concurrent_streams.py
index 54e66fed..e069aef7 100644
--- a/native_client/test/concurrent_streams.py
+++ b/native_client/test/concurrent_streams.py
@@ -52,8 +52,8 @@ def main():
         audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
         fin.close()
 
-    stream1 = ds.createStream(sample_rate=fs1)
-    stream2 = ds.createStream(sample_rate=fs2)
+    stream1 = ds.createStream()
+    stream2 = ds.createStream()
 
    splits1 = np.array_split(audio1, 10)
    splits2 = np.array_split(audio2, 10)
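
Note appended after the patch (not part of the diff itself): once this change lands, none of the STT or streaming entry points take a sample-rate argument any more; the caller simply has to supply audio recorded at the rate the model was trained on. The following is a minimal Python sketch of the updated calling convention. It assumes `ds` is an already-initialized `deepspeech.Model`, that `audio.wav` is a hypothetical 16-bit mono WAV file at the model's sample rate, and that the streaming helpers keep the `feedAudioContent`/`finishStream` signatures used elsewhere in this patch.

# Hypothetical usage sketch; illustrates only the API surface changed above.
import wave
import numpy as np

with wave.open('audio.wav', 'rb') as fin:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

# Batch decoding: previously ds.stt(audio, fs), now just:
print(ds.stt(audio))

# Streaming decoding: previously ds.createStream(sample_rate=fs), now just:
stream = ds.createStream()
for chunk in np.array_split(audio, 10):
    ds.feedAudioContent(stream, chunk)
print(ds.finishStream(stream))

The concurrent-streams test in the last hunk exercises the same pattern with two independent streams created from a single model.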