Add example for Python streaming from mic with VAD

daanzu 2018-11-09 10:18:01 -05:00
parent eee92c232d
commit 74cebb83b6
3 changed files with 255 additions and 0 deletions

README.md

@@ -0,0 +1,63 @@
# Microphone VAD Streaming
Stream from microphone to DeepSpeech, using VAD (voice activity detection) to detect and segment utterances. A fairly simple example demonstrating the DeepSpeech streaming API in Python, also useful for quick, real-time testing of models and decoding parameters.
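For orientation, the streaming API this example exercises boils down to a setup/feed/finish cycle. Below is a minimal sketch of that cycle on a pre-recorded file; the file names are placeholders and the `26, 9, 500` values simply mirror this example's defaults, with the WAV assumed to be 16 kHz, 16-bit mono:
```python
import wave
import numpy as np
import deepspeech

# Placeholder paths -- substitute your own model, alphabet, and audio files.
model = deepspeech.Model('output_graph.pb', 26, 9, 'alphabet.txt', 500)

stream = model.setupStream()
with wave.open('test.wav', 'rb') as wf:            # 16 kHz, 16-bit, mono
    while True:
        data = wf.readframes(320)                  # 320 frames = 20 ms at 16 kHz
        if not data:
            break
        model.feedAudioContent(stream, np.frombuffer(data, np.int16))
print(model.finishStream(stream))
```
The script in this example does the same thing, except that the audio comes from the microphone and `finishStream` is called at each utterance boundary detected by the VAD.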
## Installation
```bash
pip install -r requirements.txt
```
This uses PortAudio for microphone access, so on Linux you may need to install its header files in order to compile the `pyaudio` package:
```bash
sudo apt install portaudio19-dev
```
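Not part of the example itself, but if you want to confirm that `pyaudio` can see your microphone after installation, a short check like the following (hypothetical snippet) lists the available input devices and their default sample rates:
```python
import pyaudio

pa = pyaudio.PyAudio()
for i in range(pa.get_device_count()):
    info = pa.get_device_info_by_index(i)
    if info.get('maxInputChannels', 0) > 0:
        # Print index, device name, and default sample rate for each input device.
        print(i, info['name'], int(info['defaultSampleRate']))
pa.terminate()
```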
## Usage
```
usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
                            [-w SAVEWAV] -m MODEL [-a ALPHABET] [-l LM]
                            [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
                            [-lw LM_WEIGHT] [-vwcw VALID_WORD_COUNT_WEIGHT]
                            [-bw BEAM_WIDTH]

Stream from microphone to DeepSpeech using VAD

optional arguments:
  -h, --help            show this help message and exit
  -v VAD_AGGRESSIVENESS, --vad_aggressiveness VAD_AGGRESSIVENESS
                        Set aggressiveness of VAD: an integer between 0 and 3,
                        0 being the least aggressive about filtering out non-
                        speech, 3 the most aggressive. Default: 3
  --nospinner           Disable spinner
  -w SAVEWAV, --savewav SAVEWAV
                        Save .wav files of utterances to given directory
  -m MODEL, --model MODEL
                        Path to the model (protocol buffer binary file, or
                        entire directory containing all standard-named files
                        for model)
  -a ALPHABET, --alphabet ALPHABET
                        Path to the configuration file specifying the alphabet
                        used by the network. Default: alphabet.txt
  -l LM, --lm LM        Path to the language model binary file. Default:
                        lm.binary
  -t TRIE, --trie TRIE  Path to the language model trie file created with
                        native_client/generate_trie. Default: trie
  -nf N_FEATURES, --n_features N_FEATURES
                        Number of MFCC features to use. Default: 26
  -nc N_CONTEXT, --n_context N_CONTEXT
                        Size of the context window used for producing
                        timesteps in the input vector. Default: 9
  -lw LM_WEIGHT, --lm_weight LM_WEIGHT
                        The alpha hyperparameter of the CTC decoder. Language
                        Model weight. Default: 1.5
  -vwcw VALID_WORD_COUNT_WEIGHT, --valid_word_count_weight VALID_WORD_COUNT_WEIGHT
                        Valid word insertion weight. This is used to lessen
                        the word insertion penalty when the inserted word is
                        part of the vocabulary. Default: 2.1
  -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                        Beam width used in the CTC decoder when building
                        candidate transcriptions. Default: 500
```
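For example, assuming the DeepSpeech 0.3 model files have been extracted into a `models/` directory (a hypothetical path), `python mic_vad_streaming.py -m models/ -w wav/` transcribes from the default microphone and saves a `.wav` file of each detected utterance under `wav/`.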

mic_vad_streaming.py

@@ -0,0 +1,187 @@
import time, logging
from datetime import datetime
import threading, collections, queue, os, os.path
import wave
import pyaudio
import webrtcvad
from halo import Halo
import deepspeech
import numpy as np
logging.basicConfig(level=20)
class Audio(object):
    """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""

    FORMAT = pyaudio.paInt16
    RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50
    BLOCK_SIZE = int(RATE / float(BLOCKS_PER_SECOND))

    def __init__(self, callback=None):
        def proxy_callback(in_data, frame_count, time_info, status):
            callback(in_data)
            return (None, pyaudio.paContinue)
        if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
        self.buffer_queue = queue.Queue()
        self.sample_rate = self.RATE
        self.block_size = self.BLOCK_SIZE
        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.sample_rate,
                                   input=True,
                                   frames_per_buffer=self.block_size,
                                   stream_callback=proxy_callback)
        self.stream.start_stream()

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)

    def write_wav(self, filename, data):
        logging.info("write wav %s", filename)
        wf = wave.open(filename, 'wb')
        wf.setnchannels(self.CHANNELS)
        # wf.setsampwidth(self.pa.get_sample_size(FORMAT))
        assert self.FORMAT == pyaudio.paInt16
        wf.setsampwidth(2)
        wf.setframerate(self.sample_rate)
        wf.writeframes(data)
        wf.close()
class VADAudio(Audio):
    """Filter & segment audio with voice activity detection."""

    def __init__(self, aggressiveness=3):
        super().__init__()
        self.vad = webrtcvad.Vad(aggressiveness)

    def frame_generator(self):
        """Generator that yields all audio frames from microphone."""
        while True:
            yield self.read()

    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
        if frames is None: frames = self.frame_generator()
        num_padding_frames = padding_ms // self.frame_duration_ms
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False
        for frame in frames:
            is_speech = self.vad.is_speech(frame, self.sample_rate)

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_weight, ARGS.valid_word_count_weight)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner: spinner = Halo(spinner='line')
    stream_context = model.setupStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.setupStream()
if __name__ == '__main__':
    BEAM_WIDTH = 500
    LM_WEIGHT = 1.50
    VALID_WORD_COUNT_WEIGHT = 2.10
    N_FEATURES = 26
    N_CONTEXT = 9

    import argparse
    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
    parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                        help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable spinner")
    parser.add_argument('-w', '--savewav',
                        help="Save .wav files of utterances to given directory")
    parser.add_argument('-m', '--model', required=True,
                        help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
    parser.add_argument('-a', '--alphabet', default='alphabet.txt',
                        help="Path to the configuration file specifying the alphabet used by the network. Default: alphabet.txt")
    parser.add_argument('-l', '--lm', default='lm.binary',
                        help="Path to the language model binary file. Default: lm.binary")
    parser.add_argument('-t', '--trie', default='trie',
                        help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
    parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
                        help=f"Number of MFCC features to use. Default: {N_FEATURES}")
    parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,
                        help=f"Size of the context window used for producing timesteps in the input vector. Default: {N_CONTEXT}")
    parser.add_argument('-lw', '--lm_weight', type=float, default=LM_WEIGHT,
                        help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_WEIGHT}")
    parser.add_argument('-vwcw', '--valid_word_count_weight', type=float, default=VALID_WORD_COUNT_WEIGHT,
                        help=f"Valid word insertion weight. This is used to lessen the word insertion penalty when the inserted word is part of the vocabulary. Default: {VALID_WORD_COUNT_WEIGHT}")
    parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
                        help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")

    ARGS = parser.parse_args()
    if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
    main(ARGS)

requirements.txt

@@ -0,0 +1,5 @@
deepspeech~=0.3.0
pyaudio~=0.2.11
webrtcvad~=2.0.10
halo~=0.0.18
numpy~=1.15.1