Add example for Python streaming from mic with VAD

daanzu 2018-11-09 10:18:01 -05:00
parent eee92c232d
commit 74cebb83b6
3 changed files with 255 additions and 0 deletions

README.md

@@ -0,0 +1,63 @@
# Microphone VAD Streaming
Stream from microphone to DeepSpeech, using VAD (voice activity detection) to detect and segment utterances. A fairly simple example demonstrating the DeepSpeech streaming API in Python, also useful for quick, real-time testing of models and decoding parameters.
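For orientation, the streaming API this example exercises boils down to a setup/feed/finish cycle. Below is a minimal sketch of that cycle on a pre-recorded file; the file names are placeholders and the `26, 9, 500` values simply mirror this example's defaults, with the WAV assumed to be 16 kHz, 16-bit mono:
```python
import wave
import numpy as np
import deepspeech

# Placeholder paths -- substitute your own model, alphabet, and audio files.
model = deepspeech.Model('output_graph.pb', 26, 9, 'alphabet.txt', 500)

stream = model.setupStream()
with wave.open('test.wav', 'rb') as wf:            # 16 kHz, 16-bit, mono
    while True:
        data = wf.readframes(320)                  # 320 frames = 20 ms at 16 kHz
        if not data:
            break
        model.feedAudioContent(stream, np.frombuffer(data, np.int16))
print(model.finishStream(stream))
```
The script in this example does the same thing, except that the audio comes from the microphone and `finishStream` is called at each utterance boundary detected by the VAD.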
## Installation
```bash
pip install -r requirements.txt
```
This uses PortAudio for microphone access, so on Linux you may need to install its header files in order to compile the `pyaudio` package:
```bash
sudo apt install portaudio19-dev
```
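Not part of the example itself, but if you want to confirm that `pyaudio` can see your microphone after installation, a short check like the following (hypothetical snippet) lists the available input devices and their default sample rates:
```python
import pyaudio

pa = pyaudio.PyAudio()
for i in range(pa.get_device_count()):
    info = pa.get_device_info_by_index(i)
    if info.get('maxInputChannels', 0) > 0:
        # Print index, device name, and default sample rate for each input device.
        print(i, info['name'], int(info['defaultSampleRate']))
pa.terminate()
```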
## Usage
```
usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
                            [-w SAVEWAV] -m MODEL [-a ALPHABET] [-l LM]
                            [-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
                            [-lw LM_WEIGHT] [-vwcw VALID_WORD_COUNT_WEIGHT]
                            [-bw BEAM_WIDTH]

Stream from microphone to DeepSpeech using VAD

optional arguments:
  -h, --help            show this help message and exit
  -v VAD_AGGRESSIVENESS, --vad_aggressiveness VAD_AGGRESSIVENESS
                        Set aggressiveness of VAD: an integer between 0 and 3,
                        0 being the least aggressive about filtering out non-
                        speech, 3 the most aggressive. Default: 3
  --nospinner           Disable spinner
  -w SAVEWAV, --savewav SAVEWAV
                        Save .wav files of utterances to given directory
  -m MODEL, --model MODEL
                        Path to the model (protocol buffer binary file, or
                        entire directory containing all standard-named files
                        for model)
  -a ALPHABET, --alphabet ALPHABET
                        Path to the configuration file specifying the alphabet
                        used by the network. Default: alphabet.txt
  -l LM, --lm LM        Path to the language model binary file. Default:
                        lm.binary
  -t TRIE, --trie TRIE  Path to the language model trie file created with
                        native_client/generate_trie. Default: trie
  -nf N_FEATURES, --n_features N_FEATURES
                        Number of MFCC features to use. Default: 26
  -nc N_CONTEXT, --n_context N_CONTEXT
                        Size of the context window used for producing
                        timesteps in the input vector. Default: 9
  -lw LM_WEIGHT, --lm_weight LM_WEIGHT
                        The alpha hyperparameter of the CTC decoder. Language
                        Model weight. Default: 1.5
  -vwcw VALID_WORD_COUNT_WEIGHT, --valid_word_count_weight VALID_WORD_COUNT_WEIGHT
                        Valid word insertion weight. This is used to lessen
                        the word insertion penalty when the inserted word is
                        part of the vocabulary. Default: 2.1
  -bw BEAM_WIDTH, --beam_width BEAM_WIDTH
                        Beam width used in the CTC decoder when building
                        candidate transcriptions. Default: 500
```
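For example, assuming the DeepSpeech 0.3 model files have been extracted into a `models/` directory (a hypothetical path), `python mic_vad_streaming.py -m models/ -w wav/` transcribes from the default microphone and saves a `.wav` file of each detected utterance under `wav/`.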

mic_vad_streaming.py

@@ -0,0 +1,187 @@
import time, logging
from datetime import datetime
import threading, collections, queue, os, os.path
import wave
import pyaudio
import webrtcvad
from halo import Halo
import deepspeech
import numpy as np
logging.basicConfig(level=20)
class Audio(object):
    """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""

    FORMAT = pyaudio.paInt16
    RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50
    BLOCK_SIZE = int(RATE / float(BLOCKS_PER_SECOND))

    def __init__(self, callback=None):
        def proxy_callback(in_data, frame_count, time_info, status):
            callback(in_data)
            return (None, pyaudio.paContinue)
        if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
        self.buffer_queue = queue.Queue()
        self.sample_rate = self.RATE
        self.block_size = self.BLOCK_SIZE
        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.sample_rate,
                                   input=True,
                                   frames_per_buffer=self.block_size,
                                   stream_callback=proxy_callback)
        self.stream.start_stream()

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)

    def write_wav(self, filename, data):
        logging.info("write wav %s", filename)
        wf = wave.open(filename, 'wb')
        wf.setnchannels(self.CHANNELS)
        # wf.setsampwidth(self.pa.get_sample_size(FORMAT))
        assert self.FORMAT == pyaudio.paInt16
        wf.setsampwidth(2)
        wf.setframerate(self.sample_rate)
        wf.writeframes(data)
        wf.close()
class VADAudio(Audio):
    """Filter & segment audio with voice activity detection."""

    def __init__(self, aggressiveness=3):
        super().__init__()
        self.vad = webrtcvad.Vad(aggressiveness)

    def frame_generator(self):
        """Generator that yields all audio frames from microphone."""
        while True:
            yield self.read()

    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
        if frames is None: frames = self.frame_generator()
        num_padding_frames = padding_ms // self.frame_duration_ms
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False
        for frame in frames:
            is_speech = self.vad.is_speech(frame, self.sample_rate)

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()
def main(ARGS):
    # Load DeepSpeech model
    if os.path.isdir(ARGS.model):
        model_dir = ARGS.model
        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
        ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
        ARGS.lm = os.path.join(model_dir, ARGS.lm)
        ARGS.trie = os.path.join(model_dir, ARGS.trie)

    print('Initializing model...')
    logging.info("ARGS.model: %s", ARGS.model)
    logging.info("ARGS.alphabet: %s", ARGS.alphabet)
    model = deepspeech.Model(ARGS.model, ARGS.n_features, ARGS.n_context, ARGS.alphabet, ARGS.beam_width)
    if ARGS.lm and ARGS.trie:
        logging.info("ARGS.lm: %s", ARGS.lm)
        logging.info("ARGS.trie: %s", ARGS.trie)
        model.enableDecoderWithLM(ARGS.alphabet, ARGS.lm, ARGS.trie, ARGS.lm_weight, ARGS.valid_word_count_weight)

    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness)
    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner: spinner = Halo(spinner='line')
    stream_context = model.setupStream()
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()
            logging.debug("streaming frame")
            model.feedAudioContent(stream_context, np.frombuffer(frame, np.int16))
            if ARGS.savewav: wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            logging.debug("end utterance")
            if ARGS.savewav:
                vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
                wav_data = bytearray()
            text = model.finishStream(stream_context)
            print("Recognized: %s" % text)
            stream_context = model.setupStream()
if __name__ == '__main__':
    BEAM_WIDTH = 500
    LM_WEIGHT = 1.50
    VALID_WORD_COUNT_WEIGHT = 2.10
    N_FEATURES = 26
    N_CONTEXT = 9

    import argparse
    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
    parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                        help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable spinner")
    parser.add_argument('-w', '--savewav',
                        help="Save .wav files of utterances to given directory")
    parser.add_argument('-m', '--model', required=True,
                        help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
    parser.add_argument('-a', '--alphabet', default='alphabet.txt',
                        help="Path to the configuration file specifying the alphabet used by the network. Default: alphabet.txt")
    parser.add_argument('-l', '--lm', default='lm.binary',
                        help="Path to the language model binary file. Default: lm.binary")
    parser.add_argument('-t', '--trie', default='trie',
                        help="Path to the language model trie file created with native_client/generate_trie. Default: trie")
    parser.add_argument('-nf', '--n_features', type=int, default=N_FEATURES,
                        help=f"Number of MFCC features to use. Default: {N_FEATURES}")
    parser.add_argument('-nc', '--n_context', type=int, default=N_CONTEXT,
                        help=f"Size of the context window used for producing timesteps in the input vector. Default: {N_CONTEXT}")
    parser.add_argument('-lw', '--lm_weight', type=float, default=LM_WEIGHT,
                        help=f"The alpha hyperparameter of the CTC decoder. Language Model weight. Default: {LM_WEIGHT}")
    parser.add_argument('-vwcw', '--valid_word_count_weight', type=float, default=VALID_WORD_COUNT_WEIGHT,
                        help=f"Valid word insertion weight. This is used to lessen the word insertion penalty when the inserted word is part of the vocabulary. Default: {VALID_WORD_COUNT_WEIGHT}")
    parser.add_argument('-bw', '--beam_width', type=int, default=BEAM_WIDTH,
                        help=f"Beam width used in the CTC decoder when building candidate transcriptions. Default: {BEAM_WIDTH}")

    ARGS = parser.parse_args()
    if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
    main(ARGS)

requirements.txt

@@ -0,0 +1,5 @@
deepspeech~=0.3.0
pyaudio~=0.2.11
webrtcvad~=2.0.10
halo~=0.0.18
numpy~=1.15.1