From 0241f725cdf2b905ada67fb17550d522f5fde8a4 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 21:45:33 +0200 Subject: [PATCH 1/7] Expose model sample rate in API --- native_client/deepspeech.cc | 6 ++++++ native_client/deepspeech.h | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index 439702a6..9aee0f8e 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -292,6 +292,12 @@ DS_CreateModel(const char* aModelPath, return DS_ERR_OK; } +int +DS_GetModelSampleRate(ModelState* aCtx) +{ + return aCtx->sample_rate_; +} + void DS_FreeModel(ModelState* ctx) { diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index ef25e985..ed9d8638 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -92,6 +92,16 @@ int DS_CreateModel(const char* aModelPath, unsigned int aBeamWidth, ModelState** retval); +/** + * @brief Return the sample rate expected by a model. + * + * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. + * + * @return Sample rate expected by the model for its input. + */ +DEEPSPEECH_EXPORT +int DS_GetModelSampleRate(ModelState* aCtx); + /** * @brief Frees associated resources and destroys model object. 
*/ From c1ed6d711d68bc7e5e59e0f35054adda261fac77 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 21:46:01 +0200 Subject: [PATCH 2/7] Use model sample rate in client.cc --- native_client/client.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/native_client/client.cc b/native_client/client.cc index 358f527f..80663fe6 100644 --- a/native_client/client.cc +++ b/native_client/client.cc @@ -111,7 +111,7 @@ typedef struct { } ds_audio_buffer; ds_audio_buffer -GetAudioBuffer(const char* path) +GetAudioBuffer(const char* path, int desired_sample_rate) { ds_audio_buffer res = {0}; @@ -121,7 +121,7 @@ GetAudioBuffer(const char* path) // Resample/reformat the audio so we can pass it through the MFCC functions sox_signalinfo_t target_signal = { - 16000, // Rate + static_cast<double>(desired_sample_rate), // Rate 1, // Channels 16, // Precision SOX_UNSPEC, // Length @@ -158,8 +158,10 @@ GetAudioBuffer(const char* path) assert(output); - if ((int)input->signal.rate < 16000) { - fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate); + if ((int)input->signal.rate < desired_sample_rate) { + fprintf(stderr, "Warning: original sample rate (%d) is lower than %dHz. 
" + "Up-sampling might produce erratic speech recognition.\n", + (int)input->signal.rate, desired_sample_rate); } // Setup the effects chain to decode/resample @@ -205,7 +207,7 @@ GetAudioBuffer(const char* path) #endif // NO_SOX #ifdef NO_SOX - // FIXME: Hack and support only 16kHz mono 16-bits PCM + // FIXME: Hack and support only mono 16-bits PCM with standard SoX header FILE* wave = fopen(path, "r"); size_t rv; @@ -224,12 +226,12 @@ GetAudioBuffer(const char* path) assert(audio_format == 1); // 1 is PCM assert(num_channels == 1); // MONO - assert(sample_rate == 16000); // 16000 Hz + assert(sample_rate == desired_sample_rate); // at desired sample rate assert(bits_per_sample == 16); // 16 bits per sample fprintf(stderr, "audio_format=%d\n", audio_format); fprintf(stderr, "num_channels=%d\n", num_channels); - fprintf(stderr, "sample_rate=%d\n", sample_rate); + fprintf(stderr, "sample_rate=%d (desired=%d)\n", sample_rate, desired_sample_rate); fprintf(stderr, "bits_per_sample=%d\n", bits_per_sample); fseek(wave, 40, SEEK_SET); rv = fread(&res.buffer_size, 4, 1, wave); @@ -257,7 +259,7 @@ GetAudioBuffer(const char* path) void ProcessFile(ModelState* context, const char* path, bool show_times) { - ds_audio_buffer audio = GetAudioBuffer(path); + ds_audio_buffer audio = GetAudioBuffer(path, DS_GetModelSampleRate(context)); // Pass audio to DeepSpeech // We take half of buffer_size because buffer is a char* while From afea2b423189411c41234ab94b27a1e5d50a2a89 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 21:50:15 +0200 Subject: [PATCH 3/7] Expose and use model sample rate in Python --- native_client/python/__init__.py | 9 +++++++++ native_client/python/client.py | 21 ++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py index 62ea1eb5..b9166632 100644 --- a/native_client/python/__init__.py +++ b/native_client/python/__init__.py @@ -44,6 +44,15 @@ 
class Model(object): deepspeech.impl.FreeModel(self._impl) self._impl = None + def sampleRate(self): + """ + Return the sample rate expected by the model. + + :return: Sample rate. + :type: int + """ + return deepspeech.impl.GetModelSampleRate(self._impl) + def enableDecoderWithLM(self, *args, **kwargs): """ Enable decoding using beam scoring with a KenLM language model. diff --git a/native_client/python/client.py b/native_client/python/client.py index b44c5122..3792a406 100644 --- a/native_client/python/client.py +++ b/native_client/python/client.py @@ -17,9 +17,6 @@ try: except ImportError: from pipes import quote -# Define the sample rate for audio - -SAMPLE_RATE = 16000 # These constants control the beam search decoder # Beam width used in the CTC decoder when building candidate transcriptions @@ -32,16 +29,16 @@ LM_ALPHA = 0.75 LM_BETA = 1.85 -def convert_samplerate(audio_path): - sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE) +def convert_samplerate(audio_path, desired_sample_rate): + sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate) try: output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE) except subprocess.CalledProcessError as e: raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr)) except OSError as e: - raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(SAMPLE_RATE, e.strerror)) + raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror)) - return SAMPLE_RATE, np.frombuffer(output, np.int16) + return desired_sample_rate, np.frombuffer(output, np.int16) def metadata_to_string(metadata): @@ -81,6 +78,8 @@ def main(): model_load_end = timer() - model_load_start 
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr) + desired_sample_rate = ds.sampleRate() + if args.lm and args.trie: print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr) lm_load_start = timer() @@ -90,13 +89,13 @@ def main(): fin = wave.open(args.audio, 'rb') fs = fin.getframerate() - if fs != SAMPLE_RATE: - print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr) - fs, audio = convert_samplerate(args.audio) + if fs != desired_sample_rate: + print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr) + fs, audio = convert_samplerate(args.audio, desired_sample_rate) else: audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) - audio_length = fin.getnframes() * (1/SAMPLE_RATE) + audio_length = fin.getnframes() * (1/fin.getframerate()) fin.close() print('Running inference.', file=sys.stderr) From 0be2787e4ec96edfb92d1e2ac80c4e6f74327198 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 21:55:08 +0200 Subject: [PATCH 4/7] Expose and use model sample rate in JavaScript --- native_client/javascript/client.js | 40 ++++++++++++++++-------------- native_client/javascript/index.js | 9 +++++++ 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/native_client/javascript/client.js b/native_client/javascript/client.js index e356c2e8..8bbdce12 100644 --- a/native_client/javascript/client.js +++ b/native_client/javascript/client.js @@ -62,11 +62,29 @@ function metadataToString(metadata) { return retval; } +console.error('Loading model from file %s', args['model']); +const model_load_start = process.hrtime(); +var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH); +const model_load_end = process.hrtime(model_load_start); +console.error('Loaded model in %ds.', 
totalTime(model_load_end)); + +var desired_sample_rate = model.sampleRate(); + +if (args['lm'] && args['trie']) { + console.error('Loading language model from files %s %s', args['lm'], args['trie']); + const lm_load_start = process.hrtime(); + model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA); + const lm_load_end = process.hrtime(lm_load_start); + console.error('Loaded language model in %ds.', totalTime(lm_load_end)); +} + const buffer = Fs.readFileSync(args['audio']); const result = Wav.decode(buffer); -if (result.sampleRate < 16000) { - console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than 16kHz. Up-sampling might produce erratic speech recognition.'); +if (result.sampleRate < desired_sample_rate) { + console.error('Warning: original sample rate (' + result.sampleRate + ') ' + + 'is lower than ' + desired_sample_rate + 'Hz. ' + + 'Up-sampling might produce erratic speech recognition.'); } function bufferToStream(buffer) { @@ -84,7 +102,7 @@ bufferToStream(buffer). }, output: { bits: 16, - rate: 16000, + rate: desired_sample_rate, channels: 1, encoding: 'signed-integer', endian: 'little', @@ -97,23 +115,9 @@ bufferToStream(buffer). 
audioStream.on('finish', () => { let audioBuffer = audioStream.toBuffer(); - console.error('Loading model from file %s', args['model']); - const model_load_start = process.hrtime(); - var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH); - const model_load_end = process.hrtime(model_load_start); - console.error('Loaded model in %ds.', totalTime(model_load_end)); - - if (args['lm'] && args['trie']) { - console.error('Loading language model from files %s %s', args['lm'], args['trie']); - const lm_load_start = process.hrtime(); - model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA); - const lm_load_end = process.hrtime(lm_load_start); - console.error('Loaded language model in %ds.', totalTime(lm_load_end)); - } - const inference_start = process.hrtime(); console.error('Running inference.'); - const audioLength = (audioBuffer.length / 2) * ( 1 / 16000); + const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate); // We take half of the buffer_size because buffer is a char* while // LocalDsSTT() expected a short* diff --git a/native_client/javascript/index.js b/native_client/javascript/index.js index f6446f4d..ad639099 100644 --- a/native_client/javascript/index.js +++ b/native_client/javascript/index.js @@ -45,6 +45,15 @@ function Model() { this._impl = impl; } +/** + * Return the sample rate expected by the model. + * + * @return {number} Sample rate. + */ +Model.prototype.sampleRate = function() { + return binding.GetModelSampleRate(this._impl); +} + /** * Enable decoding using beam scoring with a KenLM language model. 
* From 5cb15ca6ed80d35529e536b4024cc123845955ae Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 22:04:33 +0200 Subject: [PATCH 5/7] Use model sample rate in examples --- .../net_framework/DeepSpeechWPF/MainWindow.xaml.cs | 8 +++----- examples/nodejs_wav/index.js | 11 ++++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs index e332da6d..31b1f9d4 100644 --- a/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs +++ b/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs @@ -202,15 +202,13 @@ namespace DeepSpeechWPF { _audioCapture.Device = _audioCaptureDevices[cbxAudioInputs.SelectedIndex]; } - InitilizeAudioCapture(); + InitializeAudioCapture(_sttClient.GetModelSampleRate()); } - - /// /// Initializes the recorder and setup the native stream. /// - private void InitilizeAudioCapture() + private void InitializeAudioCapture(int desiredSampleRate) { _audioCapture.Initialize(); _audioCapture.DataAvailable += _capture_DataAvailable; @@ -218,7 +216,7 @@ namespace DeepSpeechWPF //create a source, that converts the data provided by the //soundInSource to required by the deepspeech model _convertedSource = _soundInSource - .ChangeSampleRate(16000) // sample rate + .ChangeSampleRate(desiredSampleRate) // sample rate .ToSampleSource() .ToWaveSource(16); //bits per sample diff --git a/examples/nodejs_wav/index.js b/examples/nodejs_wav/index.js index 20ccb2ab..7883a010 100644 --- a/examples/nodejs_wav/index.js +++ b/examples/nodejs_wav/index.js @@ -11,6 +11,8 @@ let alphabetPath = './models/alphabet.txt'; let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH); +let desiredSampleRate = model.sampleRate(); + const LM_ALPHA = 0.75; const LM_BETA = 1.85; let lmPath = './models/lm.binary'; @@ -28,8 +30,8 @@ if (!Fs.existsSync(audioFile)) { const buffer = Fs.readFileSync(audioFile); const result = 
Wav.decode(buffer); -if (result.sampleRate < 16000) { - console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than 16kHz. Up-sampling might produce erratic speech recognition.'); +if (result.sampleRate < desiredSampleRate) { + console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than ' + desiredSampleRate + 'Hz. Up-sampling might produce erratic speech recognition.'); } function bufferToStream(buffer) { @@ -47,7 +49,7 @@ pipe(Sox({ }, output: { bits: 16, - rate: 16000, + rate: desiredSampleRate, channels: 1, encoding: 'signed-integer', endian: 'little', @@ -58,10 +60,9 @@ pipe(Sox({ pipe(audioStream); audioStream.on('finish', () => { - let audioBuffer = audioStream.toBuffer(); - const audioLength = (audioBuffer.length / 2) * ( 1 / 16000); + const audioLength = (audioBuffer.length / 2) * (1 / desiredSampleRate); console.log('audio length', audioLength); let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2)); From 4dc18dd8ee943438b649861bb483ebcf140815ca Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 22:04:44 +0200 Subject: [PATCH 6/7] Expose and use model sample rate in .NET --- native_client/dotnet/DeepSpeechClient/DeepSpeech.cs | 9 +++++++++ .../dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs | 6 ++++++ native_client/dotnet/DeepSpeechClient/NativeImp.cs | 3 +++ 3 files changed, 18 insertions(+) diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index 25fcc109..9bbf5e3c 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -64,6 +64,15 @@ namespace DeepSpeechClient EvaluateResultCode(resultCode); } + /// + /// Return the sample rate expected by the model. + /// + /// Sample rate. 
+ public unsafe int GetModelSampleRate() + { + return NativeImp.DS_GetModelSampleRate(_modelStatePP); + } + /// /// Evaluate the result code and will raise an exception if necessary. /// diff --git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index 79af2964..f7bbee98 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -24,6 +24,12 @@ namespace DeepSpeechClient.Interfaces string aAlphabetConfigPath, uint aBeamWidth); + /// + /// Return the sample rate expected by the model. + /// + /// Sample rate. + unsafe int GetModelSampleRate(); + /// /// Enable decoding using beam scoring with a KenLM language model. /// diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs index 74de9197..92cdb150 100644 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -20,6 +20,9 @@ namespace DeepSpeechClient uint aBeamWidth, ref IntPtr** pint); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx, string aLMPath, From 673d620a67644f73aace97964851b4b9405350c1 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 10 Oct 2019 22:07:30 +0200 Subject: [PATCH 7/7] Expose and use model sample rate in Java --- .../java/org/mozilla/deepspeech/DeepSpeechActivity.java | 2 +- .../deepspeech/libdeepspeech/DeepSpeechModel.java | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java 
b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java index 6b9c45b3..b44fdfab 100644 --- a/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java +++ b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java @@ -77,7 +77,7 @@ public class DeepSpeechActivity extends AppCompatActivity { // tv_numChannels.setText("numChannels=" + (numChannels == 1 ? "MONO" : "!MONO")); wave.seek(24); int sampleRate = this.readLEInt(wave); - assert (sampleRate == 16000); // 16000 Hz + assert (sampleRate == this._m.sampleRate()); // desired sample rate // tv_sampleRate.setText("sampleRate=" + (sampleRate == 16000 ? "16kHz" : "!16kHz")); wave.seek(34); char bitsPerSample = this.readLEChar(wave); diff --git a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java index 3a665c5e..0bbc8fcc 100644 --- a/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java +++ b/native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/DeepSpeechModel.java @@ -32,6 +32,15 @@ public class DeepSpeechModel { this._msp = impl.modelstatep_value(this._mspp); } + /** + * @brief Return the sample rate expected by the model. + * + * @return Sample rate. + */ + public int sampleRate() { + return impl.GetModelSampleRate(this._msp); + } + /** * @brief Frees associated resources and destroys model object. */