diff --git a/evaluate_tflite.py b/evaluate_tflite.py
index 878f29b0..02e45af5 100644
--- a/evaluate_tflite.py
+++ b/evaluate_tflite.py
@@ -45,7 +45,7 @@ def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
- decoded = ds.stt(audio, fs)
+ decoded = ds.stt(audio)
queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
print(queue_out.qsize(), end='\r') # Update the current progress
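
The hunk above drops the per-call sample rate from the Python batch API. A minimal sketch of the new calling convention follows; the model path and constructor arguments are placeholders that vary by release, and only `ds.stt(audio)` reflects this diff:

```python
# Hedged sketch of batch inference with the new signature.
import wave

import numpy as np
import deepspeech

ds = deepspeech.Model('output_graph.pbmm')  # placeholder; add ctor args your release requires

with wave.open('audio.wav', 'rb') as fin:
    # The caller is now responsible for supplying audio at the rate the model
    # was trained on (16 kHz for the released English models).
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

print(ds.stt(audio))  # no sample-rate argument any more
```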
diff --git a/examples/ffmpeg_vad_streaming/index.js b/examples/ffmpeg_vad_streaming/index.js
index 8aef749b..d64cc9f8 100644
--- a/examples/ffmpeg_vad_streaming/index.js
+++ b/examples/ffmpeg_vad_streaming/index.js
@@ -95,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
]);
let audioLength = 0;
-let sctx = model.createStream(AUDIO_SAMPLE_RATE);
+let sctx = model.createStream();
function finishStream() {
const model_load_start = process.hrtime();
@@ -108,7 +108,7 @@ function finishStream() {
function intermediateDecode() {
finishStream();
- sctx = model.createStream(AUDIO_SAMPLE_RATE);
+ sctx = model.createStream();
}
function feedAudioContent(chunk) {
diff --git a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
index 8b38316c..e332da6d 100644
--- a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
+++ b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs
@@ -130,7 +130,7 @@ namespace DeepSpeechWPF
watch.Start();
await Task.Run(() =>
{
- string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+ string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
watch.Stop();
Dispatcher.Invoke(() =>
{
@@ -250,7 +250,7 @@ namespace DeepSpeechWPF
private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
{
- _sttClient.CreateStream(16000);
+ _sttClient.CreateStream();
_audioCapture.Start();
btnStartRecording.IsEnabled = false;
btnStopRecording.IsEnabled = true;
diff --git a/examples/nodejs_wav/index.js b/examples/nodejs_wav/index.js
index a5432217..20ccb2ab 100644
--- a/examples/nodejs_wav/index.js
+++ b/examples/nodejs_wav/index.js
@@ -64,7 +64,7 @@ audioStream.on('finish', () => {
const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
console.log('audio length', audioLength);
- let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
+ let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
console.log('result:', result);
});
diff --git a/examples/vad_transcriber/wavTranscriber.py b/examples/vad_transcriber/wavTranscriber.py
index 9f21f362..727dc5cf 100644
--- a/examples/vad_transcriber/wavTranscriber.py
+++ b/examples/vad_transcriber/wavTranscriber.py
@@ -44,12 +44,12 @@ Returns a list [Inference, Inference Time, Audio Length]
'''
def stt(ds, audio, fs):
inference_time = 0.0
- audio_length = len(audio) * (1 / 16000)
+ audio_length = len(audio) * (1 / fs)
# Run Deepspeech
logging.debug('Running inference...')
inference_start = timer()
- output = ds.stt(audio, fs)
+ output = ds.stt(audio)
inference_end = timer() - inference_start
inference_time += inference_end
logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
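
Since the duration above is now derived from `fs` rather than a hard-coded 16000, callers need to pass the file's real rate. A hedged sketch of pulling `fs` from the WAV header; the helper name and path are illustrative, not part of this diff:

```python
# Derive fs from the WAV header so the duration calculation uses the actual rate.
import wave

def read_wave(path):
    with wave.open(path, 'rb') as fin:
        return fin.readframes(fin.getnframes()), fin.getframerate()

frames, fs = read_wave('utterance.wav')
audio_length = (len(frames) / 2) * (1 / fs)  # frames are bytes; 2 bytes per 16-bit sample
print('%0.3fs of audio at %d Hz' % (audio_length, fs))
```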
diff --git a/native_client/client.cc b/native_client/client.cc
index afde6cd8..358f527f 100644
--- a/native_client/client.cc
+++ b/native_client/client.cc
@@ -54,23 +54,23 @@ char* JSONOutput(Metadata* metadata);
ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
- int aSampleRate, bool extended_output, bool json_output)
+ bool extended_output, bool json_output)
{
ds_result res = {0};
clock_t ds_start_time = clock();
if (extended_output) {
- Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+ Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
} else if (json_output) {
- Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+ Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
} else if (stream_size > 0) {
StreamingState* ctx;
- int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+ int status = DS_CreateStream(aCtx, &ctx);
if (status != DS_ERR_OK) {
res.string = strdup("");
return res;
@@ -94,7 +94,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
}
res.string = DS_FinishStream(ctx);
} else {
- res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
+ res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize);
}
clock_t ds_end_infer = clock();
@@ -108,7 +108,6 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
typedef struct {
char* buffer;
size_t buffer_size;
- int sample_rate;
} ds_audio_buffer;
ds_audio_buffer
@@ -159,8 +158,6 @@ GetAudioBuffer(const char* path)
assert(output);
- res.sample_rate = (int)output->signal.rate;
-
if ((int)input->signal.rate < 16000) {
fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
}
@@ -221,7 +218,6 @@ GetAudioBuffer(const char* path)
unsigned int sample_rate;
fseek(wave, 24, SEEK_SET); rv = fread(&sample_rate, 4, 1, wave);
- res.sample_rate = (int)sample_rate;
unsigned short bits_per_sample;
fseek(wave, 34, SEEK_SET); rv = fread(&bits_per_sample, 2, 1, wave);
@@ -269,7 +265,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
ds_result result = LocalDsSTT(context,
(const short*)audio.buffer,
audio.buffer_size / 2,
- audio.sample_rate,
extended_metadata,
json_output);
free(audio.buffer);
diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index bf4706dd..439702a6 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -318,7 +318,6 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
int
DS_CreateStream(ModelState* aCtx,
- unsigned int aSampleRate,
StreamingState** retval)
{
*retval = nullptr;
@@ -383,11 +382,10 @@ DS_FinishStreamWithMetadata(StreamingState* aSctx)
StreamingState*
CreateStreamAndFeedAudioContent(ModelState* aCtx,
const short* aBuffer,
- unsigned int aBufferSize,
- unsigned int aSampleRate)
+ unsigned int aBufferSize)
{
StreamingState* ctx;
- int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+ int status = DS_CreateStream(aCtx, &ctx);
if (status != DS_ERR_OK) {
return nullptr;
}
@@ -398,20 +396,18 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx,
char*
DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
- unsigned int aBufferSize,
- unsigned int aSampleRate)
+ unsigned int aBufferSize)
{
- StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+ StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
return DS_FinishStream(ctx);
}
Metadata*
DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
- unsigned int aBufferSize,
- unsigned int aSampleRate)
+ unsigned int aBufferSize)
{
- StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+ StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
return DS_FinishStreamWithMetadata(ctx);
}
diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h
index 5c64d998..ef25e985 100644
--- a/native_client/deepspeech.h
+++ b/native_client/deepspeech.h
@@ -124,9 +124,8 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- * sample rate.
+ * sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. Returns NULL on error.
@@ -134,8 +133,7 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
DEEPSPEECH_EXPORT
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
- unsigned int aBufferSize,
- unsigned int aSampleRate);
+ unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
@@ -143,9 +141,8 @@ char* DS_SpeechToText(ModelState* aCtx,
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
- * sample rate.
+ * sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -153,8 +150,7 @@ char* DS_SpeechToText(ModelState* aCtx,
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
- unsigned int aBufferSize,
- unsigned int aSampleRate);
+ unsigned int aBufferSize);
/**
* @brief Create a new streaming inference state. The streaming state returned
@@ -162,7 +158,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
@@ -170,7 +165,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
*/
DEEPSPEECH_EXPORT
int DS_CreateStream(ModelState* aCtx,
- unsigned int aSampleRate,
StreamingState** retval);
/**
@@ -178,7 +172,7 @@ int DS_CreateStream(ModelState* aCtx,
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aBuffer An array of 16-bit, mono raw audio samples at the
- * appropriate sample rate.
+ * appropriate sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in @p aBuffer.
*/
DEEPSPEECH_EXPORT
diff --git a/native_client/deepspeech_compat.h b/native_client/deepspeech_compat.h
index fa1db328..57f3d16c 100644
--- a/native_client/deepspeech_compat.h
+++ b/native_client/deepspeech_compat.h
@@ -71,17 +71,17 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
- * @param aSampleRate The sample-rate of the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*/
int DS_SetupStream(ModelState* aCtx,
- unsigned int aSampleRate,
+ unsigned int /*aSampleRate*/,
StreamingState** retval)
{
- return DS_CreateStream(aCtx, aSampleRate, retval);
+ return DS_CreateStream(aCtx, retval);
}
/**
@@ -98,4 +98,45 @@ void DS_DiscardStream(StreamingState* aSctx)
return DS_FreeStream(aSctx);
}
+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ * sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return The STT result. The user is responsible for freeing the string using
+ * {@link DS_FreeString()}. Returns NULL on error.
+ */
+char* DS_SpeechToText(ModelState* aCtx,
+ const short* aBuffer,
+ unsigned int aBufferSize,
+ unsigned int /*aSampleRate*/)
+{
+ return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
+}
+
+/**
+ * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
+ * about the results.
+ *
+ * @param aCtx The ModelState pointer for the model to use.
+ * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+ * sample rate (matching what the model was trained on).
+ * @param aBufferSize The number of samples in the audio signal.
+ * @param aSampleRate UNUSED, DEPRECATED.
+ *
+ * @return Outputs a struct of individual letters along with their timing information.
+ * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+ */
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+ const short* aBuffer,
+ unsigned int aBufferSize,
+ unsigned int /*aSampleRate*/)
+{
+ return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
+}
+
#endif /* DEEPSPEECH_COMPAT_H */
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
index 19247507..25fcc109 100644
--- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -148,7 +148,7 @@ namespace DeepSpeechClient
///
/// Feeds audio samples to an ongoing streaming inference.
///
- /// An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+ /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
{
NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -193,11 +193,10 @@ namespace DeepSpeechClient
///
/// Creates a new streaming inference state.
///
- /// The sample-rate of the audio signal
/// Thrown when the native binary failed to initialize the streaming mode.
- public unsafe void CreateStream(uint aSampleRate)
+ public unsafe void CreateStream()
{
- var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
+ var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
EvaluateResultCode(resultCode);
}
@@ -230,25 +229,23 @@ namespace DeepSpeechClient
///
/// Use the DeepSpeech model to perform Speech-To-Text.
///
- /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+ /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The sample-rate of the audio signal.
/// The STT result. The user is responsible for freeing the string. Returns NULL on error.
- public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+ public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
{
- return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
+ return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
}
///
/// Use the DeepSpeech model to perform Speech-To-Text.
///
- /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+ /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The sample-rate of the audio signal.
/// The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.
- public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+ public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
{
- return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
+ return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
}
#endregion
diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
index 04ad086c..79af2964 100644
--- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
+++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs
@@ -40,24 +40,20 @@ namespace DeepSpeechClient.Interfaces
///
/// Use the DeepSpeech model to perform Speech-To-Text.
///
- /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+ /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The sample-rate of the audio signal.
/// The STT result. The user is responsible for freeing the string. Returns NULL on error.
unsafe string SpeechToText(short[] aBuffer,
- uint aBufferSize,
- uint aSampleRate);
+ uint aBufferSize);
///
/// Use the DeepSpeech model to perform Speech-To-Text.
///
- /// A 16-bit, mono raw audio signal at the appropriate sample rate.
+ /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
/// The number of samples in the audio signal.
- /// The sample-rate of the audio signal.
/// The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
- uint aBufferSize,
- uint aSampleRate);
+ uint aBufferSize);
///
/// Destroy a streaming state without decoding the computed logits.
@@ -79,14 +75,13 @@ namespace DeepSpeechClient.Interfaces
///
/// Creates a new streaming inference state.
///
- /// The sample-rate of the audio signal
/// Thrown when the native binary failed to initialize the streaming mode.
- unsafe void CreateStream(uint aSampleRate);
+ unsafe void CreateStream();
///
/// Feeds audio samples to an ongoing streaming inference.
///
- /// An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+ /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
///
diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
index 3f126acc..74de9197 100644
--- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs
+++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs
@@ -31,21 +31,19 @@ namespace DeepSpeechClient
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
short[] aBuffer,
- uint aBufferSize,
- uint aSampleRate);
+ uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
short[] aBuffer,
- uint aBufferSize,
- uint aSampleRate);
+ uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx,
- uint aSampleRate, ref IntPtr** retval);
+ ref IntPtr** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);
diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs
index 0940e63c..5085fd21 100644
--- a/native_client/dotnet/DeepSpeechConsole/Program.cs
+++ b/native_client/dotnet/DeepSpeechConsole/Program.cs
@@ -91,12 +91,12 @@ namespace CSharpExamples
string speechResult;
if (extended)
{
- Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+ Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
speechResult = MetadataToString(metaResult);
}
else
{
- speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+ speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
}
stopwatch.Stop();
diff --git a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
index f39e22fc..6b9c45b3 100644
--- a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
+++ b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java
@@ -100,7 +100,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
long inferenceStartTime = System.currentTimeMillis();
- String decoded = this._m.stt(shorts, shorts.length, sampleRate);
+ String decoded = this._m.stt(shorts, shorts.length);
inferenceExecTime = System.currentTimeMillis() - inferenceStartTime;
diff --git a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
index f4fc5bf6..5af94da9 100644
--- a/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
+++ b/native_client/java/libdeepspeech/src/androidTest/java/org/mozilla/deepspeech/libdeepspeech/test/BasicTest.java
@@ -104,9 +104,9 @@ public class BasicTest {
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
if (extendedMetadata) {
- return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
+ return metadataToString(m.sttWithMetadata(shorts, shorts.length));
} else {
- return m.stt(shorts, shorts.length, sampleRate);
+ return m.stt(shorts, shorts.length);
}
} catch (FileNotFoundException ex) {
diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
index 288b27f3..3a665c5e 100644
--- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
+++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java
@@ -57,14 +57,13 @@ public class DeepSpeechModel {
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
- * sample rate.
+ * sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
- * @param sample_rate The sample-rate of the audio signal.
*
* @return The STT result.
*/
- public String stt(short[] buffer, int buffer_size, int sample_rate) {
- return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
+ public String stt(short[] buffer, int buffer_size) {
+ return impl.SpeechToText(this._msp, buffer, buffer_size);
}
/**
@@ -72,14 +71,13 @@ public class DeepSpeechModel {
* about the results.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
- * sample rate.
+ * sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
- * @param sample_rate The sample-rate of the audio signal.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
*/
- public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
- return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
+ public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
+ return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
}
/**
@@ -87,12 +85,11 @@ public class DeepSpeechModel {
* by this function can then be passed to feedAudioContent()
* and finishStream().
*
- * @param sample_rate The sample-rate of the audio signal.
* @return An opaque object that represents the streaming state.
*/
- public DeepSpeechStreamingState createStream(int sample_rate) {
+ public DeepSpeechStreamingState createStream() {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
- impl.CreateStream(this._msp, sample_rate, ssp);
+ impl.CreateStream(this._msp, ssp);
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
}
@@ -101,7 +98,7 @@ public class DeepSpeechModel {
*
* @param cctx A streaming state pointer returned by createStream().
* @param buffer An array of 16-bit, mono raw audio samples at the
- * appropriate sample rate.
+ * appropriate sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in @p buffer.
*/
public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {
diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js
index 8e274fe7..e356c2e8 100644
--- a/native_client/javascript/client.js
+++ b/native_client/javascript/client.js
@@ -118,9 +118,9 @@ audioStream.on('finish', () => {
// We take half of the buffer_size because buffer is a char* while
// LocalDsSTT() expected a short*
if (args['extended']) {
- console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
+ console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2))));
} else {
- console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
+ console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2)));
}
const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));
diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js
index 3c524cc3..f6446f4d 100644
--- a/native_client/javascript/index.js
+++ b/native_client/javascript/index.js
@@ -64,9 +64,8 @@ Model.prototype.enableDecoderWithLM = function() {
/**
* Use the DeepSpeech model to perform Speech-To-Text.
*
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {string} The STT result. Returns undefined on error.
*/
@@ -79,9 +78,8 @@ Model.prototype.stt = function() {
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
- * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+ * @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
- * @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
@@ -93,7 +91,6 @@ Model.prototype.sttWithMetadata = function() {
/**
* Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
*
- * @param {number} aSampleRate The sample-rate of the audio signal.
* @return {object} an opaque object that represents the streaming state.
*
* @throws on error
@@ -114,7 +111,7 @@ Model.prototype.createStream = function() {
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
* @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
- * appropriate sample rate.
+ * appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in @param aBuffer.
*/
Model.prototype.feedAudioContent = function() {
diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index d10a4c98..62ea1eb5 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -69,15 +69,12 @@ class Model(object):
"""
Use the DeepSpeech model to perform Speech-To-Text.
- :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+ :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
- :param aSampleRate: The sample-rate of the audio signal.
- :type aSampleRate: int
-
:return: The STT result.
:type: str
"""
@@ -87,34 +84,27 @@ class Model(object):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
- :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+ :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
- :param aSampleRate: The sample-rate of the audio signal.
- :type aSampleRate: int
-
:return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
- def createStream(self, sample_rate=16000):
+ def createStream(self):
"""
Create a new streaming inference state. The streaming state returned
by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
- :param aSampleRate: The sample-rate of the audio signal.
- :type aSampleRate: int
-
:return: Object holding the stream
:throws: RuntimeError on error
"""
- status, ctx = deepspeech.impl.CreateStream(self._impl,
- aSampleRate=sample_rate)
+ status, ctx = deepspeech.impl.CreateStream(self._impl)
if status != 0:
raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx
@@ -127,7 +117,7 @@ class Model(object):
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
- :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+ :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in @p aBuffer.
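
With the sample-rate argument gone from `createStream()`, a streaming caller now looks roughly like the sketch below. Constructor arguments, file name, and the 10-way split are placeholders; the stream calls mirror the updated docstrings above:

```python
# Rough streaming sketch against the new createStream() signature.
import wave

import numpy as np
import deepspeech

ds = deepspeech.Model('output_graph.pbmm')  # placeholder ctor args

with wave.open('audio.wav', 'rb') as fin:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

ctx = ds.createStream()  # sample_rate keyword no longer accepted
for chunk in np.array_split(audio, 10):
    ds.feedAudioContent(ctx, chunk)
print(ds.finishStream(ctx))
```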
diff --git a/native_client/python/client.py b/native_client/python/client.py
index 3d6e71f9..b44c5122 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -102,9 +102,9 @@ def main():
print('Running inference.', file=sys.stderr)
inference_start = timer()
if args.extended:
- print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
+ print(metadata_to_string(ds.sttWithMetadata(audio)))
else:
- print(ds.stt(audio, fs))
+ print(ds.stt(audio))
inference_end = timer() - inference_start
print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
diff --git a/native_client/test/concurrent_streams.py b/native_client/test/concurrent_streams.py
index 54e66fed..e069aef7 100644
--- a/native_client/test/concurrent_streams.py
+++ b/native_client/test/concurrent_streams.py
@@ -52,8 +52,8 @@ def main():
audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
- stream1 = ds.createStream(sample_rate=fs1)
- stream2 = ds.createStream(sample_rate=fs2)
+ stream1 = ds.createStream()
+ stream2 = ds.createStream()
splits1 = np.array_split(audio1, 10)
splits2 = np.array_split(audio2, 10)
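
For reference, a self-contained sketch of the interleaved flow this test exercises after the change; paths, constructor arguments, and the split count are illustrative:

```python
# Two concurrent streams on one model, neither carrying its own sample rate.
import wave

import numpy as np
import deepspeech

def load(path):
    with wave.open(path, 'rb') as fin:
        return np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

ds = deepspeech.Model('output_graph.pbmm')  # placeholder ctor args
stream1, stream2 = ds.createStream(), ds.createStream()

for c1, c2 in zip(np.array_split(load('one.wav'), 10), np.array_split(load('two.wav'), 10)):
    ds.feedAudioContent(stream1, c1)
    ds.feedAudioContent(stream2, c2)

print(ds.finishStream(stream1))
print(ds.finishStream(stream2))
```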