Merge pull request #2420 from mozilla/remove-sr-param

Remove unused sample rate param from API
commit 315a67bf69
Reuben Morais 2019-10-10 21:05:29 +02:00 committed by GitHub
21 changed files with 114 additions and 114 deletions
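
For orientation, the net effect on callers is that the sample-rate argument disappears from the inference calls; the expected sample rate is now implied by the model itself. A minimal sketch against the Python bindings, assuming a deepspeech.Model has already been constructed elsewhere (model loading is unchanged by this PR); transcribe and the WAV path are illustrative names, not part of the diff:

import wave

import numpy as np

def transcribe(ds, wav_path):
    """Transcribe a 16-bit mono WAV file with an already-loaded deepspeech.Model `ds`."""
    fin = wave.open(wav_path, 'rb')
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()
    # Before this change the call site also passed the sample rate:
    #   decoded = ds.stt(audio, fs)
    # After this change only the audio buffer is passed:
    decoded = ds.stt(audio)
    return decoded

The same shape of change applies to sttWithMetadata, createStream, and the corresponding C, C#, Java, and JavaScript entry points shown in the file diffs below.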

View File

@ -45,7 +45,7 @@ def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
decoded = ds.stt(audio, fs)
decoded = ds.stt(audio)
queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
print(queue_out.qsize(), end='\r') # Update the current progress

View File

@ -95,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
]);
let audioLength = 0;
let sctx = model.createStream(AUDIO_SAMPLE_RATE);
let sctx = model.createStream();
function finishStream() {
const model_load_start = process.hrtime();
@ -108,7 +108,7 @@ function finishStream() {
function intermediateDecode() {
finishStream();
sctx = model.createStream(AUDIO_SAMPLE_RATE);
sctx = model.createStream();
}
function feedAudioContent(chunk) {

View File

@ -130,7 +130,7 @@ namespace DeepSpeechWPF
watch.Start();
await Task.Run(() =>
{
string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
watch.Stop();
Dispatcher.Invoke(() =>
{
@ -250,7 +250,7 @@ namespace DeepSpeechWPF
private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
{
_sttClient.CreateStream(16000);
_sttClient.CreateStream();
_audioCapture.Start();
btnStartRecording.IsEnabled = false;
btnStopRecording.IsEnabled = true;

View File

@ -64,7 +64,7 @@ audioStream.on('finish', () => {
const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
console.log('audio length', audioLength);
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
console.log('result:', result);
});

View File

@ -44,12 +44,12 @@ Returns a list [Inference, Inference Time, Audio Length]
'''
def stt(ds, audio, fs):
inference_time = 0.0
audio_length = len(audio) * (1 / 16000)
audio_length = len(audio) * (1 / fs)
# Run Deepspeech
logging.debug('Running inference...')
inference_start = timer()
output = ds.stt(audio, fs)
output = ds.stt(audio)
inference_end = timer() - inference_start
inference_time += inference_end
logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
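
Pieced together, the updated helper from this hunk reads roughly as follows. The imports and the trailing return are assumptions about the rest of the file (the return value follows the docstring "Returns a list [Inference, Inference Time, Audio Length]"):

import logging
from timeit import default_timer as timer

def stt(ds, audio, fs):
    inference_time = 0.0
    audio_length = len(audio) * (1 / fs)  # fs is still needed to report the audio duration
    # Run Deepspeech
    logging.debug('Running inference...')
    inference_start = timer()
    output = ds.stt(audio)  # sample rate no longer passed to the API
    inference_end = timer() - inference_start
    inference_time += inference_end
    logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))
    return [output, inference_time, audio_length]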

View File

@ -54,23 +54,23 @@ char* JSONOutput(Metadata* metadata);
ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
int aSampleRate, bool extended_output, bool json_output)
bool extended_output, bool json_output)
{
ds_result res = {0};
clock_t ds_start_time = clock();
if (extended_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
} else if (json_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
} else if (stream_size > 0) {
StreamingState* ctx;
int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
int status = DS_CreateStream(aCtx, &ctx);
if (status != DS_ERR_OK) {
res.string = strdup("");
return res;
@ -94,7 +94,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
}
res.string = DS_FinishStream(ctx);
} else {
res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize);
}
clock_t ds_end_infer = clock();
@ -108,7 +108,6 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
typedef struct {
char* buffer;
size_t buffer_size;
int sample_rate;
} ds_audio_buffer;
ds_audio_buffer
@ -159,8 +158,6 @@ GetAudioBuffer(const char* path)
assert(output);
res.sample_rate = (int)output->signal.rate;
if ((int)input->signal.rate < 16000) {
fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
}
@ -221,7 +218,6 @@ GetAudioBuffer(const char* path)
unsigned int sample_rate;
fseek(wave, 24, SEEK_SET); rv = fread(&sample_rate, 4, 1, wave);
res.sample_rate = (int)sample_rate;
unsigned short bits_per_sample;
fseek(wave, 34, SEEK_SET); rv = fread(&bits_per_sample, 2, 1, wave);
@ -269,7 +265,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
ds_result result = LocalDsSTT(context,
(const short*)audio.buffer,
audio.buffer_size / 2,
audio.sample_rate,
extended_metadata,
json_output);
free(audio.buffer);

View File

@ -318,7 +318,6 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
int
DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval)
{
*retval = nullptr;
@ -383,11 +382,10 @@ DS_FinishStreamWithMetadata(StreamingState* aSctx)
StreamingState*
CreateStreamAndFeedAudioContent(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate)
unsigned int aBufferSize)
{
StreamingState* ctx;
int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
int status = DS_CreateStream(aCtx, &ctx);
if (status != DS_ERR_OK) {
return nullptr;
}
@ -398,20 +396,18 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx,
char*
DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate)
unsigned int aBufferSize)
{
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
return DS_FinishStream(ctx);
}
Metadata*
DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate)
unsigned int aBufferSize)
{
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
return DS_FinishStreamWithMetadata(ctx);
}

View File

@ -124,9 +124,8 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate The sample-rate of the audio signal.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. Returns NULL on error.
@ -134,8 +133,7 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
DEEPSPEECH_EXPORT
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate);
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
@ -143,9 +141,8 @@ char* DS_SpeechToText(ModelState* aCtx,
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate The sample-rate of the audio signal.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@ -153,8 +150,7 @@ char* DS_SpeechToText(ModelState* aCtx,
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aSampleRate);
unsigned int aBufferSize);
/**
* @brief Create a new streaming inference state. The streaming state returned
@ -162,7 +158,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aSampleRate The sample-rate of the audio signal.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
@ -170,7 +165,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
*/
DEEPSPEECH_EXPORT
int DS_CreateStream(ModelState* aCtx,
unsigned int aSampleRate,
StreamingState** retval);
/**
@ -178,7 +172,7 @@ int DS_CreateStream(ModelState* aCtx,
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* appropriate sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in @p aBuffer.
*/
DEEPSPEECH_EXPORT

View File

@ -71,17 +71,17 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
* and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aSampleRate The sample-rate of the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
* @param[out] retval an opaque pointer that represents the streaming state. Can
* be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*/
int DS_SetupStream(ModelState* aCtx,
unsigned int aSampleRate,
unsigned int /*aSampleRate*/,
StreamingState** retval)
{
return DS_CreateStream(aCtx, aSampleRate, retval);
return DS_CreateStream(aCtx, retval);
}
/**
@ -98,4 +98,45 @@ void DS_DiscardStream(StreamingState* aSctx)
return DS_FreeStream(aSctx);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return The STT result. The user is responsible for freeing the string using
* {@link DS_FreeString()}. Returns NULL on error.
*/
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
* sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples in the audio signal.
* @param aSampleRate UNUSED, DEPRECATED.
*
* @return Outputs a struct of individual letters along with their timing information.
* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
*/
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int /*aSampleRate*/)
{
return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
}
#endif /* DEEPSPEECH_COMPAT_H */

View File

@ -148,7 +148,7 @@ namespace DeepSpeechClient
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
{
NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@ -193,11 +193,10 @@ namespace DeepSpeechClient
/// <summary>
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
public unsafe void CreateStream(uint aSampleRate)
public unsafe void CreateStream()
{
var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
EvaluateResultCode(resultCode);
}
@ -230,25 +229,23 @@ namespace DeepSpeechClient
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
{
return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
}
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
{
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
}
#endregion

View File

@ -40,24 +40,20 @@ namespace DeepSpeechClient.Interfaces
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
unsafe string SpeechToText(short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
uint aBufferSize);
/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
uint aBufferSize);
/// <summary>
/// Destroy a streaming state without decoding the computed logits.
@ -79,14 +75,13 @@ namespace DeepSpeechClient.Interfaces
/// <summary>
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
unsafe void CreateStream(uint aSampleRate);
unsafe void CreateStream();
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);
/// <summary>

View File

@ -31,21 +31,19 @@ namespace DeepSpeechClient
CharSet = CharSet.Ansi, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
short[] aBuffer,
uint aBufferSize,
uint aSampleRate);
uint aBufferSize);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx,
uint aSampleRate, ref IntPtr** retval);
ref IntPtr** retval);
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);

View File

@ -91,12 +91,12 @@ namespace CSharpExamples
string speechResult;
if (extended)
{
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
speechResult = MetadataToString(metaResult);
}
else
{
speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
}
stopwatch.Stop();

View File

@ -100,7 +100,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
long inferenceStartTime = System.currentTimeMillis();
String decoded = this._m.stt(shorts, shorts.length, sampleRate);
String decoded = this._m.stt(shorts, shorts.length);
inferenceExecTime = System.currentTimeMillis() - inferenceStartTime;

View File

@ -104,9 +104,9 @@ public class BasicTest {
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);
if (extendedMetadata) {
return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
return metadataToString(m.sttWithMetadata(shorts, shorts.length));
} else {
return m.stt(shorts, shorts.length, sampleRate);
return m.stt(shorts, shorts.length);
}
} catch (FileNotFoundException ex) {

View File

@ -57,14 +57,13 @@ public class DeepSpeechModel {
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
* @param sample_rate The sample-rate of the audio signal.
*
* @return The STT result.
*/
public String stt(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
public String stt(short[] buffer, int buffer_size) {
return impl.SpeechToText(this._msp, buffer, buffer_size);
}
/**
@ -72,14 +71,13 @@ public class DeepSpeechModel {
* about the results.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in the audio signal.
* @param sample_rate The sample-rate of the audio signal.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
}
/**
@ -87,12 +85,11 @@ public class DeepSpeechModel {
* by this function can then be passed to feedAudioContent()
* and finishStream().
*
* @param sample_rate The sample-rate of the audio signal.
* @return An opaque object that represents the streaming state.
*/
public DeepSpeechStreamingState createStream(int sample_rate) {
public DeepSpeechStreamingState createStream() {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.CreateStream(this._msp, sample_rate, ssp);
impl.CreateStream(this._msp, ssp);
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
}
@ -101,7 +98,7 @@ public class DeepSpeechModel {
*
* @param cctx A streaming state pointer returned by createStream().
* @param buffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* appropriate sample rate (matching what the model was trained on).
* @param buffer_size The number of samples in @p buffer.
*/
public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {

View File

@ -118,9 +118,9 @@ audioStream.on('finish', () => {
// We take half of the buffer_size because buffer is a char* while
// LocalDsSTT() expected a short*
if (args['extended']) {
console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2))));
} else {
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2)));
}
const inference_stop = process.hrtime(inference_start);
console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));

View File

@ -64,9 +64,8 @@ Model.prototype.enableDecoderWithLM = function() {
/**
* Use the DeepSpeech model to perform Speech-To-Text.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
* @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {string} The STT result. Returns undefined on error.
*/
@ -79,9 +78,8 @@ Model.prototype.stt = function() {
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in the audio signal.
* @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
@ -93,7 +91,6 @@ Model.prototype.sttWithMetadata = function() {
/**
* Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
*
* @param {number} aSampleRate The sample-rate of the audio signal.
* @return {object} an opaque object that represents the streaming state.
*
* @throws on error
@ -114,7 +111,7 @@ Model.prototype.createStream = function() {
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
* @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* appropriate sample rate (matching what the model was trained on).
* @param {number} aBufferSize The number of samples in @param aBuffer.
*/
Model.prototype.feedAudioContent = function() {

View File

@ -69,15 +69,12 @@ class Model(object):
"""
Use the DeepSpeech model to perform Speech-To-Text.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:param aSampleRate: The sample-rate of the audio signal.
:type aSampleRate: int
:return: The STT result.
:type: str
"""
@ -87,34 +84,27 @@ class Model(object):
"""
Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in the audio signal.
:type aBufferSize: int
:param aSampleRate: The sample-rate of the audio signal.
:type aSampleRate: int
:return: Outputs a struct of individual letters along with their timing information.
:type: :func:`Metadata`
"""
return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
def createStream(self, sample_rate=16000):
def createStream(self):
"""
Create a new streaming inference state. The streaming state returned
by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
:param aSampleRate: The sample-rate of the audio signal.
:type aSampleRate: int
:return: Object holding the stream
:throws: RuntimeError on error
"""
status, ctx = deepspeech.impl.CreateStream(self._impl,
aSampleRate=sample_rate)
status, ctx = deepspeech.impl.CreateStream(self._impl)
if status != 0:
raise RuntimeError("CreateStream failed with error code {}".format(status))
return ctx
@ -127,7 +117,7 @@ class Model(object):
:param aSctx: A streaming state pointer returned by :func:`createStream()`.
:type aSctx: object
:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
:type aBuffer: int array
:param aBufferSize: The number of samples in @p aBuffer.
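
As a usage sketch of the streaming API after this change: createStream, feedAudioContent, and finishStream come from the docstrings above, while streaming_transcribe and num_chunks are illustrative names, and it is assumed (per those docstrings) that the binding derives the buffer size from the array itself:

import numpy as np

def streaming_transcribe(ds, audio, num_chunks=10):
    """Feed an int16 numpy buffer to a loaded deepspeech.Model `ds` in pieces."""
    ctx = ds.createStream()  # no sample_rate argument anymore
    for chunk in np.array_split(audio, num_chunks):
        ds.feedAudioContent(ctx, chunk)
    return ds.finishStream(ctx)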

View File

@ -102,9 +102,9 @@ def main():
print('Running inference.', file=sys.stderr)
inference_start = timer()
if args.extended:
print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
print(metadata_to_string(ds.sttWithMetadata(audio)))
else:
print(ds.stt(audio, fs))
print(ds.stt(audio))
inference_end = timer() - inference_start
print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

View File

@ -52,8 +52,8 @@ def main():
audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
stream1 = ds.createStream(sample_rate=fs1)
stream2 = ds.createStream(sample_rate=fs2)
stream1 = ds.createStream()
stream2 = ds.createStream()
splits1 = np.array_split(audio1, 10)
splits2 = np.array_split(audio2, 10)
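
The continuation of this test script is not shown in the hunk; a hedged sketch of how the two concurrent streams would then be driven with the new API, reusing ds, stream1, stream2, splits1, and splits2 from above (the loop body is an assumption about how the test proceeds):

for split1, split2 in zip(splits1, splits2):
    ds.feedAudioContent(stream1, split1)
    ds.feedAudioContent(stream2, split2)

print(ds.finishStream(stream1))
print(ds.finishStream(stream2))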