Add Input Rate to examples/mic_vad_streaming.

Add -r for input device sample rate and -d for device index by PyAudio
This commit is contained in:
Jordan Olafsen 2019-02-20 21:57:27 +13:00
parent 4b2e3bc714
commit 56df4ebf03

View File

@@ -1,6 +1,7 @@
import time, logging import time, logging
from datetime import datetime from datetime import datetime
import threading, collections, queue, os, os.path import threading, collections, queue, os, os.path
import audioop
import wave import wave
import pyaudio import pyaudio
import webrtcvad import webrtcvad
@@ -10,35 +11,64 @@ import numpy as np
logging.basicConfig(level=20) logging.basicConfig(level=20)
class Audio(object): class Audio(object):
"""Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from.""" """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
FORMAT = pyaudio.paInt16 FORMAT = pyaudio.paInt16
RATE = 16000 # Network/VAD rate-space
RATE_PROCESS = 16000
CHANNELS = 1 CHANNELS = 1
BLOCKS_PER_SECOND = 50 BLOCKS_PER_SECOND = 50
BLOCK_SIZE = int(RATE / float(BLOCKS_PER_SECOND))
def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS):
    """Open a PyAudio input stream and start capturing.

    Args:
        callback: called with each raw audio block from the capture
            thread; defaults to queueing blocks on ``self.buffer_queue``
            so ``read()`` can consume them.
        device: PyAudio input device index as listed by
            ``pyaudio.PyAudio.get_device_info_by_index()``, or None for
            the system default input device.
        input_rate (int): sample rate the device captures at; blocks are
            resampled to RATE_PROCESS later (see ``resample``).
    """
    def proxy_callback(in_data, frame_count, time_info, status):
        callback(in_data)
        return (None, pyaudio.paContinue)
    if callback is None:
        callback = lambda in_data: self.buffer_queue.put(in_data)
    self.buffer_queue = queue.Queue()
    self.device = device
    self.input_rate = input_rate
    self.sample_rate = self.RATE_PROCESS
    # Block sizes in frames: one in the network/VAD rate-space, one in
    # the device's native rate-space.
    self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
    self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
    self.pa = pyaudio.PyAudio()
    kwargs = {
        'format': self.FORMAT,
        'channels': self.CHANNELS,
        'rate': self.input_rate,
        'input': True,
        'frames_per_buffer': self.block_size_input,
        'stream_callback': proxy_callback,
    }
    # Only override PyAudio's default device when an index was given.
    # Must test `is not None`, not truthiness: device index 0 is a
    # valid PyAudio device and would otherwise be silently ignored.
    if self.device is not None:
        kwargs['input_device_index'] = self.device
    self.stream = self.pa.open(**kwargs)
    self.stream.start_stream()
def resample(self, data, input_rate):
    """Convert a block of 16-bit mono PCM to the processing rate.

    The microphone may not support the native processing sample rate,
    so each captured block is converted from *input_rate* to
    RATE_PROCESS for webrtcvad and deepspeech.

    Args:
        data: raw audio block (16-bit samples, one channel).
        input_rate (int): sample rate the block was captured at.
    """
    # NOTE(review): state=None on every call discards resampler state
    # between consecutive blocks; confirm boundary artifacts are
    # acceptable for VAD purposes.
    resampled, _state = audioop.ratecv(
        data, 2, 1, input_rate, self.sample_rate, None)
    return resampled
def read(self):
    """Return a block of audio data resampled to RATE_PROCESS,
    blocking if necessary.

    Skips the conversion entirely when the device already captures at
    the processing rate (``audioop.ratecv`` at equal rates is an
    identity transform, so this only removes wasted work).
    """
    data = self.buffer_queue.get()
    if self.input_rate == self.sample_rate:
        return data
    return self.resample(data, self.input_rate)
def destroy(self): def destroy(self):
self.stream.stop_stream() self.stream.stop_stream()
@@ -58,11 +88,12 @@ class Audio(object):
wf.writeframes(data) wf.writeframes(data)
wf.close() wf.close()
class VADAudio(Audio): class VADAudio(Audio):
"""Filter & segment audio with voice activity detection.""" """Filter & segment audio with voice activity detection."""
def __init__(self, aggressiveness=3, device=None, input_rate=None):
    """Set up VAD-filtered audio capture.

    Args:
        aggressiveness (int): webrtcvad aggressiveness, 0-3.
        device: PyAudio input device index, or None for the default.
        input_rate (int): device capture rate; None means the native
            processing rate (Audio.RATE_PROCESS).
    """
    # Guard against the None default: Audio.__init__ computes
    # int(input_rate / BLOCKS_PER_SECOND), which would raise TypeError.
    if input_rate is None:
        input_rate = Audio.RATE_PROCESS
    super().__init__(device=device, input_rate=input_rate)
    self.vad = webrtcvad.Vad(aggressiveness)
def frame_generator(self): def frame_generator(self):
@@ -121,7 +152,9 @@ def main(ARGS):
model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta) model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_alpha, ARGS.lm_beta)
# Start audio with VAD # Start audio with VAD
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness) vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
device=ARGS.device,
input_rate=ARGS.rate)
print("Listening (ctrl-C to exit)...") print("Listening (ctrl-C to exit)...")
frames = vad_audio.vad_collector() frames = vad_audio.vad_collector()
@@ -146,8 +179,10 @@ def main(ARGS):
print("Recognized: %s" % text) print("Recognized: %s" % text)
stream_context = model.setupStream() stream_context = model.setupStream()
if __name__ == '__main__': if __name__ == '__main__':
BEAM_WIDTH = 500 BEAM_WIDTH = 500
DEFAULT_SAMPLE_RATE = 16000
LM_ALPHA = 0.75 LM_ALPHA = 0.75
LM_BETA = 1.85 LM_BETA = 1.85
N_FEATURES = 26 N_FEATURES = 26
@@ -171,6 +206,10 @@ if __name__ == '__main__':
help="Path to the language model binary file. Default: lm.binary") help="Path to the language model binary file. Default: lm.binary")
parser.add_argument('-t', '--trie', default='trie', parser.add_argument('-t', '--trie', default='trie',
help="Path to the language model trie file created with native_client/generate_trie. Default: trie") help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
parser.add_argument('-d', '--device', type=int, default=None,
help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index()")
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES, parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
help=f"Number of MFCC features to use. Default: {N_FEATURES}") help=f"Number of MFCC features to use. Default: {N_FEATURES}")
parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT, parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,