Merge pull request #2420 from mozilla/remove-sr-param
Remove unused sample rate param from API
commit 315a67bf69
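
In practice this means every stt, sttWithMetadata and createStream/DS_CreateStream entry point loses its sample-rate argument, and callers simply supply audio at the rate the model was trained on (16 kHz for the released models). A minimal, hypothetical sketch of the caller-side change for the Python bindings (not part of this commit; it assumes ds is an already-initialized deepspeech.Model):

    # Hypothetical helper, not part of the commit: illustrates the caller-side change.
    # Assumes `ds` is an already-initialized deepspeech.Model.
    import wave

    import numpy as np

    def transcribe(ds, wav_path):
        fin = wave.open(wav_path, 'rb')
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()
        # Before this commit: decoded = ds.stt(audio, fs)
        # After: the sample-rate argument is gone; the audio must already be at
        # the model's sample rate.
        return ds.stt(audio)

For the C API, source compatibility is kept through the deprecated wrappers added to deepspeech_compat.h further down, which still accept an aSampleRate argument but ignore it.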
@@ -45,7 +45,7 @@ def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
 audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
 fin.close()

-decoded = ds.stt(audio, fs)
+decoded = ds.stt(audio)

 queue_out.put({'wav': wavname, 'prediction': decoded, 'ground_truth': msg['transcript']})
 print(queue_out.qsize(), end='\r') # Update the current progress

@@ -95,7 +95,7 @@ const ffmpeg = spawn('ffmpeg', [
 ]);

 let audioLength = 0;
-let sctx = model.createStream(AUDIO_SAMPLE_RATE);
+let sctx = model.createStream();

 function finishStream() {
 const model_load_start = process.hrtime();
@@ -108,7 +108,7 @@ function finishStream() {

 function intermediateDecode() {
 finishStream();
-sctx = model.createStream(AUDIO_SAMPLE_RATE);
+sctx = model.createStream();
 }

 function feedAudioContent(chunk) {

@@ -130,7 +130,7 @@ namespace DeepSpeechWPF
 watch.Start();
 await Task.Run(() =>
 {
-string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+string speechResult = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
 watch.Stop();
 Dispatcher.Invoke(() =>
 {
@@ -250,7 +250,7 @@ namespace DeepSpeechWPF

 private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
 {
-_sttClient.CreateStream(16000);
+_sttClient.CreateStream();
 _audioCapture.Start();
 btnStartRecording.IsEnabled = false;
 btnStopRecording.IsEnabled = true;

@@ -64,7 +64,7 @@ audioStream.on('finish', () => {
 const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
 console.log('audio length', audioLength);

-let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000);
+let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));

 console.log('result:', result);
 });

@@ -44,12 +44,12 @@ Returns a list [Inference, Inference Time, Audio Length]
 '''
 def stt(ds, audio, fs):
 inference_time = 0.0
-audio_length = len(audio) * (1 / 16000)
+audio_length = len(audio) * (1 / fs)

 # Run Deepspeech
 logging.debug('Running inference...')
 inference_start = timer()
-output = ds.stt(audio, fs)
+output = ds.stt(audio)
 inference_end = timer() - inference_start
 inference_time += inference_end
 logging.debug('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length))

@@ -54,23 +54,23 @@ char* JSONOutput(Metadata* metadata);

 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
-int aSampleRate, bool extended_output, bool json_output)
+bool extended_output, bool json_output)
 {
 ds_result res = {0};

 clock_t ds_start_time = clock();

 if (extended_output) {
-Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
 res.string = metadataToString(metadata);
 DS_FreeMetadata(metadata);
 } else if (json_output) {
-Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
 res.string = JSONOutput(metadata);
 DS_FreeMetadata(metadata);
 } else if (stream_size > 0) {
 StreamingState* ctx;
-int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+int status = DS_CreateStream(aCtx, &ctx);
 if (status != DS_ERR_OK) {
 res.string = strdup("");
 return res;
@@ -94,7 +94,7 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
 }
 res.string = DS_FinishStream(ctx);
 } else {
-res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
+res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize);
 }

 clock_t ds_end_infer = clock();
@@ -108,7 +108,6 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
 typedef struct {
 char* buffer;
 size_t buffer_size;
-int sample_rate;
 } ds_audio_buffer;

 ds_audio_buffer
@@ -159,8 +158,6 @@ GetAudioBuffer(const char* path)

 assert(output);

-res.sample_rate = (int)output->signal.rate;
-
 if ((int)input->signal.rate < 16000) {
 fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
 }
@@ -221,7 +218,6 @@ GetAudioBuffer(const char* path)

 unsigned int sample_rate;
 fseek(wave, 24, SEEK_SET); rv = fread(&sample_rate, 4, 1, wave);
-res.sample_rate = (int)sample_rate;

 unsigned short bits_per_sample;
 fseek(wave, 34, SEEK_SET); rv = fread(&bits_per_sample, 2, 1, wave);
@@ -269,7 +265,6 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
 ds_result result = LocalDsSTT(context,
 (const short*)audio.buffer,
 audio.buffer_size / 2,
-audio.sample_rate,
 extended_metadata,
 json_output);
 free(audio.buffer);

@@ -318,7 +318,6 @@ DS_EnableDecoderWithLM(ModelState* aCtx,

 int
 DS_CreateStream(ModelState* aCtx,
-unsigned int aSampleRate,
 StreamingState** retval)
 {
 *retval = nullptr;
@@ -383,11 +382,10 @@ DS_FinishStreamWithMetadata(StreamingState* aSctx)
 StreamingState*
 CreateStreamAndFeedAudioContent(ModelState* aCtx,
 const short* aBuffer,
-unsigned int aBufferSize,
-unsigned int aSampleRate)
+unsigned int aBufferSize)
 {
 StreamingState* ctx;
-int status = DS_CreateStream(aCtx, aSampleRate, &ctx);
+int status = DS_CreateStream(aCtx, &ctx);
 if (status != DS_ERR_OK) {
 return nullptr;
 }
@@ -398,20 +396,18 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx,
 char*
 DS_SpeechToText(ModelState* aCtx,
 const short* aBuffer,
-unsigned int aBufferSize,
-unsigned int aSampleRate)
+unsigned int aBufferSize)
 {
-StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
 return DS_FinishStream(ctx);
 }

 Metadata*
 DS_SpeechToTextWithMetadata(ModelState* aCtx,
 const short* aBuffer,
-unsigned int aBufferSize,
-unsigned int aSampleRate)
+unsigned int aBufferSize)
 {
-StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize);
 return DS_FinishStreamWithMetadata(ctx);
 }

@@ -124,9 +124,8 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
-* sample rate.
+* sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
-* @param aSampleRate The sample-rate of the audio signal.
 *
 * @return The STT result. The user is responsible for freeing the string using
 * {@link DS_FreeString()}. Returns NULL on error.
@@ -134,8 +133,7 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 char* DS_SpeechToText(ModelState* aCtx,
 const short* aBuffer,
-unsigned int aBufferSize,
-unsigned int aSampleRate);
+unsigned int aBufferSize);

 /**
 * @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
@@ -143,9 +141,8 @@ char* DS_SpeechToText(ModelState* aCtx,
 *
 * @param aCtx The ModelState pointer for the model to use.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate
-* sample rate.
+* sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in the audio signal.
-* @param aSampleRate The sample-rate of the audio signal.
 *
 * @return Outputs a struct of individual letters along with their timing information.
 * The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
@@ -153,8 +150,7 @@ char* DS_SpeechToText(ModelState* aCtx,
 DEEPSPEECH_EXPORT
 Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
 const short* aBuffer,
-unsigned int aBufferSize,
-unsigned int aSampleRate);
+unsigned int aBufferSize);

 /**
 * @brief Create a new streaming inference state. The streaming state returned
@@ -162,7 +158,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
 * and {@link DS_FinishStream()}.
 *
 * @param aCtx The ModelState pointer for the model to use.
-* @param aSampleRate The sample-rate of the audio signal.
 * @param[out] retval an opaque pointer that represents the streaming state. Can
 * be NULL if an error occurs.
 *
@@ -170,7 +165,6 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
 */
 DEEPSPEECH_EXPORT
 int DS_CreateStream(ModelState* aCtx,
-unsigned int aSampleRate,
 StreamingState** retval);

 /**
@@ -178,7 +172,7 @@ int DS_CreateStream(ModelState* aCtx,
 *
 * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
 * @param aBuffer An array of 16-bit, mono raw audio samples at the
-* appropriate sample rate.
+* appropriate sample rate (matching what the model was trained on).
 * @param aBufferSize The number of samples in @p aBuffer.
 */
 DEEPSPEECH_EXPORT

@@ -71,17 +71,17 @@ int DS_EnableDecoderWithLM(ModelState* aCtx,
 * and {@link DS_FinishStream()}.
 *
 * @param aCtx The ModelState pointer for the model to use.
-* @param aSampleRate The sample-rate of the audio signal.
+* @param aSampleRate UNUSED, DEPRECATED.
 * @param[out] retval an opaque pointer that represents the streaming state. Can
 * be NULL if an error occurs.
 *
 * @return Zero for success, non-zero on failure.
 */
 int DS_SetupStream(ModelState* aCtx,
-unsigned int aSampleRate,
+unsigned int /*aSampleRate*/,
 StreamingState** retval)
 {
-return DS_CreateStream(aCtx, aSampleRate, retval);
+return DS_CreateStream(aCtx, retval);
 }

 /**
@@ -98,4 +98,45 @@ void DS_DiscardStream(StreamingState* aSctx)
 return DS_FreeStream(aSctx);
 }
+
+/**
+* @brief Use the DeepSpeech model to perform Speech-To-Text.
+*
+* @param aCtx The ModelState pointer for the model to use.
+* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+* sample rate (matching what the model was trained on).
+* @param aBufferSize The number of samples in the audio signal.
+* @param aSampleRate UNUSED, DEPRECATED.
+*
+* @return The STT result. The user is responsible for freeing the string using
+* {@link DS_FreeString()}. Returns NULL on error.
+*/
+char* DS_SpeechToText(ModelState* aCtx,
+const short* aBuffer,
+unsigned int aBufferSize,
+unsigned int /*aSampleRate*/)
+{
+return DS_SpeechToText(aCtx, aBuffer, aBufferSize);
+}
+
+/**
+* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
+* about the results.
+*
+* @param aCtx The ModelState pointer for the model to use.
+* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
+* sample rate (matching what the model was trained on).
+* @param aBufferSize The number of samples in the audio signal.
+* @param aSampleRate UNUSED, DEPRECATED.
+*
+* @return Outputs a struct of individual letters along with their timing information.
+* The user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. Returns NULL on error.
+*/
+Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
+const short* aBuffer,
+unsigned int aBufferSize,
+unsigned int /*aSampleRate*/)
+{
+return DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize);
+}

 #endif /* DEEPSPEECH_COMPAT_H */

@@ -148,7 +148,7 @@ namespace DeepSpeechClient
 /// <summary>
 /// Feeds audio samples to an ongoing streaming inference.
 /// </summary>
-/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
+/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
 public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
 {
 NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -193,11 +193,10 @@ namespace DeepSpeechClient
 /// <summary>
 /// Creates a new streaming inference state.
 /// </summary>
-/// <param name="aSampleRate">The sample-rate of the audio signal</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-public unsafe void CreateStream(uint aSampleRate)
+public unsafe void CreateStream()
 {
-var resultCode = NativeImp.DS_CreateStream(_modelStatePP, aSampleRate, ref _streamingStatePP);
+var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref _streamingStatePP);
 EvaluateResultCode(resultCode);
 }

@@ -230,25 +229,23 @@ namespace DeepSpeechClient
 /// <summary>
 /// Use the DeepSpeech model to perform Speech-To-Text.
 /// </summary>
-/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
 /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
 /// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
-public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize)
 {
-return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
+return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString();
 }

 /// <summary>
 /// Use the DeepSpeech model to perform Speech-To-Text.
 /// </summary>
-/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
 /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
 /// <returns>The extended metadata. The user is responsible for freeing the struct. Returns NULL on error.</returns>
-public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize)
 {
-return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
+return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize).PtrToMetadata();
 }

 #endregion

@@ -40,24 +40,20 @@ namespace DeepSpeechClient.Interfaces
 /// <summary>
 /// Use the DeepSpeech model to perform Speech-To-Text.
 /// </summary>
-/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
 /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
 /// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
 unsafe string SpeechToText(short[] aBuffer,
-uint aBufferSize,
-uint aSampleRate);
+uint aBufferSize);

 /// <summary>
 /// Use the DeepSpeech model to perform Speech-To-Text.
 /// </summary>
-/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).</param>
 /// <param name="aBufferSize">The number of samples in the audio signal.</param>
-/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
 /// <returns>The extended metadata result. The user is responsible for freeing the struct. Returns NULL on error.</returns>
 unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer,
-uint aBufferSize,
-uint aSampleRate);
+uint aBufferSize);

 /// <summary>
 /// Destroy a streaming state without decoding the computed logits.
@@ -79,14 +75,13 @@ namespace DeepSpeechClient.Interfaces
 /// <summary>
 /// Creates a new streaming inference state.
 /// </summary>
-/// <param name="aSampleRate">The sample-rate of the audio signal</param>
 /// <exception cref="ArgumentException">Thrown when the native binary failed to initialize the streaming mode.</exception>
-unsafe void CreateStream(uint aSampleRate);
+unsafe void CreateStream();

 /// <summary>
 /// Feeds audio samples to an ongoing streaming inference.
 /// </summary>
-/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
+/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).</param>
 unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize);

 /// <summary>

@@ -31,21 +31,19 @@ namespace DeepSpeechClient
 CharSet = CharSet.Ansi, SetLastError = true)]
 internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx,
 short[] aBuffer,
-uint aBufferSize,
-uint aSampleRate);
+uint aBufferSize);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)]
 internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx,
 short[] aBuffer,
-uint aBufferSize,
-uint aSampleRate);
+uint aBufferSize);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal static unsafe extern void DS_FreeModel(IntPtr** aCtx);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx,
-uint aSampleRate, ref IntPtr** retval);
+ref IntPtr** retval);

 [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
 internal static unsafe extern void DS_FreeStream(ref IntPtr** aSctx);

@@ -91,12 +91,12 @@ namespace CSharpExamples
 string speechResult;
 if (extended)
 {
-Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+Metadata metaResult = sttClient.SpeechToTextWithMetadata(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
 speechResult = MetadataToString(metaResult);
 }
 else
 {
-speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
+speechResult = sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2));
 }

 stopwatch.Stop();

@@ -100,7 +100,7 @@ public class DeepSpeechActivity extends AppCompatActivity {

 long inferenceStartTime = System.currentTimeMillis();

-String decoded = this._m.stt(shorts, shorts.length, sampleRate);
+String decoded = this._m.stt(shorts, shorts.length);

 inferenceExecTime = System.currentTimeMillis() - inferenceStartTime;

@@ -104,9 +104,9 @@ public class BasicTest {
 ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts);

 if (extendedMetadata) {
-return metadataToString(m.sttWithMetadata(shorts, shorts.length, sampleRate));
+return metadataToString(m.sttWithMetadata(shorts, shorts.length));
 } else {
-return m.stt(shorts, shorts.length, sampleRate);
+return m.stt(shorts, shorts.length);
 }
 } catch (FileNotFoundException ex) {

@@ -57,14 +57,13 @@ public class DeepSpeechModel {
 * @brief Use the DeepSpeech model to perform Speech-To-Text.
 *
 * @param buffer A 16-bit, mono raw audio signal at the appropriate
-* sample rate.
+* sample rate (matching what the model was trained on).
 * @param buffer_size The number of samples in the audio signal.
-* @param sample_rate The sample-rate of the audio signal.
 *
 * @return The STT result.
 */
-public String stt(short[] buffer, int buffer_size, int sample_rate) {
-return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
+public String stt(short[] buffer, int buffer_size) {
+return impl.SpeechToText(this._msp, buffer, buffer_size);
 }

 /**
@@ -72,14 +71,13 @@ public class DeepSpeechModel {
 * about the results.
 *
 * @param buffer A 16-bit, mono raw audio signal at the appropriate
-* sample rate.
+* sample rate (matching what the model was trained on).
 * @param buffer_size The number of samples in the audio signal.
-* @param sample_rate The sample-rate of the audio signal.
 *
 * @return Outputs a Metadata object of individual letters along with their timing information.
 */
-public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
-return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
+public Metadata sttWithMetadata(short[] buffer, int buffer_size) {
+return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size);
 }

 /**
@@ -87,12 +85,11 @@ public class DeepSpeechModel {
 * by this function can then be passed to feedAudioContent()
 * and finishStream().
 *
-* @param sample_rate The sample-rate of the audio signal.
 * @return An opaque object that represents the streaming state.
 */
-public DeepSpeechStreamingState createStream(int sample_rate) {
+public DeepSpeechStreamingState createStream() {
 SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
-impl.CreateStream(this._msp, sample_rate, ssp);
+impl.CreateStream(this._msp, ssp);
 return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
 }

@@ -101,7 +98,7 @@ public class DeepSpeechModel {
 *
 * @param cctx A streaming state pointer returned by createStream().
 * @param buffer An array of 16-bit, mono raw audio samples at the
-* appropriate sample rate.
+* appropriate sample rate (matching what the model was trained on).
 * @param buffer_size The number of samples in @p buffer.
 */
 public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {

@@ -118,9 +118,9 @@ audioStream.on('finish', () => {
 // We take half of the buffer_size because buffer is a char* while
 // LocalDsSTT() expected a short*
 if (args['extended']) {
-console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2), 16000)));
+console.log(metadataToString(model.sttWithMetadata(audioBuffer.slice(0, audioBuffer.length / 2))));
 } else {
-console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000));
+console.log(model.stt(audioBuffer.slice(0, audioBuffer.length / 2)));
 }
 const inference_stop = process.hrtime(inference_start);
 console.error('Inference took %ds for %ds audio file.', totalTime(inference_stop), audioLength.toPrecision(4));

@@ -64,9 +64,8 @@ Model.prototype.enableDecoderWithLM = function() {
 /**
 * Use the DeepSpeech model to perform Speech-To-Text.
 *
-* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
-* @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {string} The STT result. Returns undefined on error.
 */
@@ -79,9 +78,8 @@ Model.prototype.stt = function() {
 * Use the DeepSpeech model to perform Speech-To-Text and output metadata
 * about the results.
 *
-* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
+* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in the audio signal.
-* @param {number} aSampleRate The sample-rate of the audio signal.
 *
 * @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
 */
@@ -93,7 +91,6 @@ Model.prototype.sttWithMetadata = function() {
 /**
 * Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
 *
-* @param {number} aSampleRate The sample-rate of the audio signal.
 * @return {object} an opaque object that represents the streaming state.
 *
 * @throws on error
@@ -114,7 +111,7 @@ Model.prototype.createStream = function() {
 *
 * @param {object} aSctx A streaming state returned by :js:func:`Model.setupStream`.
 * @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
-* appropriate sample rate.
+* appropriate sample rate (matching what the model was trained on).
 * @param {number} aBufferSize The number of samples in @param aBuffer.
 */
 Model.prototype.feedAudioContent = function() {

@@ -69,15 +69,12 @@ class Model(object):
 """
 Use the DeepSpeech model to perform Speech-To-Text.

-:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 :type aBuffer: int array

 :param aBufferSize: The number of samples in the audio signal.
 :type aBufferSize: int

-:param aSampleRate: The sample-rate of the audio signal.
-:type aSampleRate: int
-
 :return: The STT result.
 :type: str
 """
@@ -87,34 +84,27 @@ class Model(object):
 """
 Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.

-:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
+:param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
 :type aBuffer: int array

 :param aBufferSize: The number of samples in the audio signal.
 :type aBufferSize: int

-:param aSampleRate: The sample-rate of the audio signal.
-:type aSampleRate: int
-
 :return: Outputs a struct of individual letters along with their timing information.
 :type: :func:`Metadata`
 """
 return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)

-def createStream(self, sample_rate=16000):
+def createStream(self):
 """
 Create a new streaming inference state. The streaming state returned
 by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.

-:param aSampleRate: The sample-rate of the audio signal.
-:type aSampleRate: int
-
 :return: Object holding the stream

 :throws: RuntimeError on error
 """
-status, ctx = deepspeech.impl.CreateStream(self._impl,
-aSampleRate=sample_rate)
+status, ctx = deepspeech.impl.CreateStream(self._impl)
 if status != 0:
 raise RuntimeError("CreateStream failed with error code {}".format(status))
 return ctx
@@ -127,7 +117,7 @@ class Model(object):
 :param aSctx: A streaming state pointer returned by :func:`createStream()`.
 :type aSctx: object

-:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
+:param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on).
 :type aBuffer: int array

 :param aBufferSize: The number of samples in @p aBuffer.

@@ -102,9 +102,9 @@ def main():
 print('Running inference.', file=sys.stderr)
 inference_start = timer()
 if args.extended:
-print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
+print(metadata_to_string(ds.sttWithMetadata(audio)))
 else:
-print(ds.stt(audio, fs))
+print(ds.stt(audio))
 inference_end = timer() - inference_start
 print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

@@ -52,8 +52,8 @@ def main():
 audio2 = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
 fin.close()

-stream1 = ds.createStream(sample_rate=fs1)
-stream2 = ds.createStream(sample_rate=fs2)
+stream1 = ds.createStream()
+stream2 = ds.createStream()

 splits1 = np.array_split(audio1, 10)
 splits2 = np.array_split(audio2, 10)
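
The streaming path changes the same way: createStream() no longer takes a sample rate. A hypothetical sketch (not part of the commit), assuming the feedAudioContent(stream, chunk) and finishStream(stream) call forms described in the docstrings above and an already-initialized deepspeech.Model:

    # Hypothetical streaming sketch, not part of the commit.
    import numpy as np

    def transcribe_streaming(ds, audio, n_chunks=10):
        stream = ds.createStream()  # previously ds.createStream(sample_rate=...)
        for chunk in np.array_split(audio, n_chunks):
            ds.feedAudioContent(stream, chunk)
        return ds.finishStream(stream)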