Add example for Python streaming from mic with VAD
parent eee92c232d
commit 74cebb83b6
examples/mic_vad_streaming/README.md (new file, 63 lines)

# Microphone VAD Streaming

Stream from microphone to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Python. Also useful for quick, real-time testing of models and decoding parameters.
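
At its core the script just drives the DeepSpeech streaming API in a loop. A minimal sketch of that pattern (the model paths and the `frames` iterable are placeholders, not part of this example):

```python
import numpy as np
import deepspeech

# Placeholder paths: point these at your own model files.
model = deepspeech.Model('output_graph.pb', 26, 9, 'alphabet.txt', 500)

stream = model.setupStream()
for frame in frames:  # `frames`: 16-bit, 16 kHz mono PCM blocks
    model.feedAudioContent(stream, np.frombuffer(frame, np.int16))
print(model.finishStream(stream))
```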

## Installation

```bash
pip install -r requirements.txt
```

Uses portaudio for microphone access, so on Linux, you may need to install its header files to compile the `pyaudio` package:

```bash
sudo apt install portaudio19-dev
```

## Usage

```
usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
                            [-w SAVEWAV] -m MODEL [-a ALPHABET] [-l LM]
                            [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
                            [-lw LM_WEIGHT] [-vwcw VALID_WORD_COUNT_WEIGHT]
                            [-bw BEAM_WIDTH]

Stream from microphone to DeepSpeech using VAD

optional arguments:
  -h, --help            show this help message and exit
  -v VAD_AGGRESSIVENESS, --vad_aggressiveness VAD_AGGRESSIVENESS
                        Set aggressiveness of VAD: an integer between 0 and 3,
                        0 being the least aggressive about filtering out non-
                        speech, 3 the most aggressive. Default: 3
  --nospinner           Disable spinner
  -w SAVEWAV, --savewav SAVEWAV
                        Save .wav files of utterances to given directory
  -m MODEL, --model MODEL
                        Path to the model (protocol buffer binary file, or
                        entire directory containing all standard-named files
                        for model)
  -a ALPHABET, --alphabet ALPHABET
                        Path to the configuration file specifying the alphabet
                        used by the network. Default: alphabet.txt
  -l LM, --lm LM        Path to the language model binary file. Default:
                        lm.binary
  -t TRIE, --trie TRIE  Path to the language model trie file created with
                        native_client/generate_trie. Default: trie
  -nf N_FEATURES, --n_features N_FEATURES
                        Number of MFCC features to use. Default: 26
  -nc N_CONTEXT, --n_context N_CONTEXT
                        Size of the context window used for producing
                        timesteps in the input vector. Default: 9
  -lw LM_WEIGHT, --lm_weight LM_WEIGHT
                        The alpha hyperparameter of the CTC decoder. Language
                        Model weight. Default: 1.5
  -vwcw VALID_WORD_COUNT_WEIGHT, --valid_word_count_weight VALID_WORD_COUNT_WEIGHT
                        Valid word insertion weight. This is used to lessen
                        the word insertion penalty when the inserted word is
                        part of the vocabulary. Default: 2.1
  -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                        Beam width used in the CTC decoder when building
                        candidate transcriptions. Default: 500
```
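
For example, pointing `-m` at a directory containing the standard model files and saving each utterance to a `.wav` with `-w` (both paths here are hypothetical):

```bash
python mic_vad_streaming.py -m models/ -w wavs/
```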

examples/mic_vad_streaming/mic_vad_streaming.py (new file, 187 lines)
import time, logging
from datetime import datetime
import threading, collections, queue, os, os.path
import wave
import pyaudio
import webrtcvad
from halo import Halo
import deepspeech
import numpy as np

logging.basicConfig(level=logging.INFO)


class Audio(object):
    """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""

    FORMAT = pyaudio.paInt16
    RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50
    BLOCK_SIZE = int(RATE / float(BLOCKS_PER_SECOND))  # 320 samples = 20 ms blocks, a frame size webrtcvad accepts

    def __init__(self, callback=None):
        def proxy_callback(in_data, frame_count, time_info, status):
            # PyAudio invokes this on its own thread for every captured block.
            callback(in_data)
            return (None, pyaudio.paContinue)
        if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
        self.buffer_queue = queue.Queue()
        self.sample_rate = self.RATE
        self.block_size = self.BLOCK_SIZE
        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.sample_rate,
                                   input=True,
                                   frames_per_buffer=self.block_size,
                                   stream_callback=proxy_callback)
        self.stream.start_stream()

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)

    def write_wav(self, filename, data):
        logging.info("write wav %s", filename)
        wf = wave.open(filename, 'wb')
        wf.setnchannels(self.CHANNELS)
        # wf.setsampwidth(self.pa.get_sample_size(FORMAT))
        assert self.FORMAT == pyaudio.paInt16
        wf.setsampwidth(2)
        wf.setframerate(self.sample_rate)
        wf.writeframes(data)
        wf.close()

class VADAudio(Audio):
    """Filter & segment audio with voice activity detection."""

    def __init__(self, aggressiveness=3):
        super().__init__()
        self.vad = webrtcvad.Vad(aggressiveness)

    def frame_generator(self):
        """Generator that yields all audio frames from microphone."""
        while True:
            yield self.read()

    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
        if frames is None: frames = self.frame_generator()
        num_padding_frames = padding_ms // self.frame_duration_ms
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        for frame in frames:
            is_speech = self.vad.is_speech(frame, self.sample_rate)

            if not triggered:
                # Not yet in an utterance: buffer frames and trigger once enough of the
                # buffered frames are voiced, yielding the buffered (padding) frames first.
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                # In an utterance: keep yielding frames until enough recent frames are
                # unvoiced, then yield None to mark the end of the utterance.
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()

def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_weight, ARGS.valid_word_count_weight)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner: spinner = Halo(spinner='line')
    stream_context = model.setupStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            # Voiced frame: feed it into the current stream.
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            # End of utterance: finish the stream, print the transcript, start a new stream.
            if spinner: spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.setupStream()

if __name__ == '__main__':
    BEAM_WIDTH = 500
    LM_WEIGHT = 1.50
    VALID_WORD_COUNT_WEIGHT = 2.10
    N_FEATURES = 26
    N_CONTEXT = 9

    import argparse
    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")

    parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                        help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable spinner")
    parser.add_argument('-w', '--savewav',
                        help="Save .wav files of utterances to given directory")

    parser.add_argument('-m', '--model', required=True,
                        help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
    parser.add_argument('-a', '--alphabet', default='alphabet.txt',
                        help="Path to the configuration file specifying the alphabet used by the network. Default: alphabet.txt")
    parser.add_argument('-l', '--lm', default='lm.binary',
                        help="Path to the language model binary file. Default: lm.binary")
    parser.add_argument('-t', '--trie', default='trie',
                        help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
    parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
                        help=f"Number of MFCC features to use. Default: {N_FEATURES}")
    parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,
                        help=f"Size of the context window used for producing timesteps in the input vector. Default: {N_CONTEXT}")
    parser.add_argument('-lw', '--lm_weight', type=float, default=LM_WEIGHT,
                        help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_WEIGHT}")
    parser.add_argument('-vwcw', '--valid_word_count_weight', type=float, default=VALID_WORD_COUNT_WEIGHT,
                        help=f"Valid word insertion weight. This is used to lessen the word insertion penalty when the inserted word is part of the vocabulary. Default: {VALID_WORD_COUNT_WEIGHT}")
    parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
                        help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")

    ARGS = parser.parse_args()
    if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
    main(ARGS)

examples/mic_vad_streaming/requirements.txt (new file, 5 lines)
deepspeech~=0.3.0
pyaudio~=0.2.11
webrtcvad~=2.0.10
halo~=0.0.18
numpy~=1.15.1