From afea2b423189411c41234ab94b27a1e5d50a2a89 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Thu, 10 Oct 2019 21:50:15 +0200
Subject: [PATCH] Expose and use model sample rate in Python

---
Review notes (free text placed before the diffstat; not part of the diff):
 * __init__.py: adds Model.sampleRate(), which returns
   deepspeech.impl.GetModelSampleRate(self._impl).
 * client.py: drops the hard-coded SAMPLE_RATE constant; main() reads the
   rate from the loaded model and passes it to convert_samplerate().
 * client.py: audio_length is derived from fin.getframerate() rather than
   `fs`. `fs` is overwritten with the resampled rate when SoX is used, while
   fin.getnframes() still counts frames at the file's original rate, so
   dividing by `fs` would report a wrong duration for resampled input.

 native_client/python/__init__.py |  9 +++++++++
 native_client/python/client.py   | 21 ++++++++++-----------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/native_client/python/__init__.py b/native_client/python/__init__.py
index 62ea1eb5..b9166632 100644
--- a/native_client/python/__init__.py
+++ b/native_client/python/__init__.py
@@ -44,6 +44,15 @@ class Model(object):
             deepspeech.impl.FreeModel(self._impl)
             self._impl = None
 
+    def sampleRate(self):
+        """
+        Return the sample rate expected by the model.
+
+        :return: Sample rate.
+        :type: int
+        """
+        return deepspeech.impl.GetModelSampleRate(self._impl)
+
     def enableDecoderWithLM(self, *args, **kwargs):
         """
         Enable decoding using beam scoring with a KenLM language model.
diff --git a/native_client/python/client.py b/native_client/python/client.py
index b44c5122..3792a406 100644
--- a/native_client/python/client.py
+++ b/native_client/python/client.py
@@ -17,9 +17,6 @@ try:
 except ImportError:
     from pipes import quote
 
-# Define the sample rate for audio
-
-SAMPLE_RATE = 16000
 # These constants control the beam search decoder
 
 # Beam width used in the CTC decoder when building candidate transcriptions
@@ -32,16 +29,16 @@ LM_ALPHA = 0.75
 LM_BETA = 1.85
 
 
-def convert_samplerate(audio_path):
-    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE)
+def convert_samplerate(audio_path, desired_sample_rate):
+    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
     try:
         output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
         raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
     except OSError as e:
-        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(SAMPLE_RATE, e.strerror))
+        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))
 
-    return SAMPLE_RATE, np.frombuffer(output, np.int16)
+    return desired_sample_rate, np.frombuffer(output, np.int16)
 
 
 def metadata_to_string(metadata):
@@ -81,6 +78,8 @@ def main():
     model_load_end = timer() - model_load_start
     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
 
+    desired_sample_rate = ds.sampleRate()
+
     if args.lm and args.trie:
         print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
         lm_load_start = timer()
@@ -90,13 +89,13 @@ def main():
 
     fin = wave.open(args.audio, 'rb')
     fs = fin.getframerate()
-    if fs != SAMPLE_RATE:
-        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
-        fs, audio = convert_samplerate(args.audio)
+    if fs != desired_sample_rate:
+        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
+        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
     else:
         audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
 
-    audio_length = fin.getnframes() * (1/SAMPLE_RATE)
+    audio_length = fin.getnframes() * (1/fin.getframerate())
     fin.close()
 
     print('Running inference.', file=sys.stderr)