Add libdeepspeech python bindings

This commit is contained in:
Chris Lord 2017-05-02 10:41:59 +01:00
parent c9cd4ff6f6
commit 10068fc40e
11 changed files with 3285 additions and 12 deletions

7
.gitignore vendored
View File

@ -1,7 +1,14 @@
.ipynb_checkpoints .ipynb_checkpoints
*.pyc *.pyc
*.swp
*.DS_Store *.DS_Store
/werlog.js /werlog.js
/data /data
/logs /logs
/exports /exports
/native_client/setup.cfg
/native_client/build
/native_client/deepspeech.egg-info
/native_client/dist
/native_client/python/deepspeech.py
/native_client/python/deepspeech_wrap.cpp

View File

@ -9,9 +9,12 @@
### $ make -C native_client/ TARGET=rpi3 TFDIR=../../tensorflow/tensorflow/ ### $ make -C native_client/ TARGET=rpi3 TFDIR=../../tensorflow/tensorflow/
### ###
.PHONY: clean run bindings
TARGET ?= host TARGET ?= host
TFDIR ?= ../../tensorflow TFDIR ?= ../../tensorflow
CXX ?= c++ CXX ?= c++
PREFIX ?= /usr/local
ifeq ($(TARGET),host) ifeq ($(TARGET),host)
TOOLCHAIN := TOOLCHAIN :=
@ -38,10 +41,30 @@ endif
default: deepspeech default: deepspeech
clean: clean:
rm -f deepspeech rm -rf build dist deepspeech.egg-info
rm -f deepspeech setup.cfg python/deepspeech_wrap.cpp python/deepspeech.py
deepspeech: client.cc deepspeech: client.cc
$(TOOLCHAIN)$(CXX) -o deepspeech $(CFLAGS) client.cc $(LDFLAGS) $(TOOLCHAIN)$(CXX) -o deepspeech $(CFLAGS) client.cc $(LDFLAGS)
setup.cfg: setup.cfg.in
sed -e 's:@LIBDIRS@:${TFDIR}/bazel-bin/tensorflow\:${TFDIR}/bazel-bin/native_client:g' setup.cfg.in > setup.cfg
bindings: setup.cfg
python ./setup.py bdist_wheel
run: deepspeech run: deepspeech
${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/tensorflow:${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS} ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/tensorflow:${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS}
install:
install -d ${PREFIX}/lib
install -m 0644 ${TFDIR}/bazel-bin/tensorflow/libtensorflow.so ${PREFIX}/lib/
install -m 0644 ${TFDIR}/bazel-bin/native_client/libkissfft.so ${PREFIX}/lib/
install -m 0644 ${TFDIR}/bazel-bin/native_client/libc_speech_features.so ${PREFIX}/lib/
install -m 0644 ${TFDIR}/bazel-bin/native_client/libdeepspeech.so ${PREFIX}/lib/
uninstall:
rm -f ${PREFIX}/lib/libtensorflow.so
rm -f ${PREFIX}/lib/libkissfft.so
rm -f ${PREFIX}/lib/libc_speech_features.so
rm -f ${PREFIX}/lib/libdeepspeech.so

View File

@ -46,3 +46,15 @@ The client can be run via the `Makefile`. The client will accept audio of any fo
``` ```
ARGS="/path/to/output_graph.pb /path/to/audio/file.ogg" make run ARGS="/path/to/output_graph.pb /path/to/audio/file.ogg" make run
``` ```
## Python bindings
Included is a set of generated Python bindings. After following the above build instructions, they can be installed by executing the following commands (or their equivalents on your system):
```
PREFIX=/usr/local make install
make bindings
sudo pip install dist/deepspeech*
```
It is assumed that `$PREFIX/lib` exists in the library path, otherwise you may need to alter your environment. The API mirrors the C++ API and is demonstrated in [client.py](client.py). Refer to [deepspeech.h](deepspeech.h) for documentation.

7
native_client/client.py Normal file
View File

@ -0,0 +1,7 @@
import sys
import scipy.io.wavfile as wav
from deepspeech import DeepSpeech

# Simple demo client for the Python bindings:
#   client.py <model.pb> <audio.wav>
# Fail with a usage message instead of a bare IndexError when arguments
# are missing.
if len(sys.argv) < 3:
    sys.stderr.write('Usage: %s <model.pb> <audio.wav>\n' % sys.argv[0])
    sys.exit(1)

# Load the frozen TensorFlow graph. 26 and 9 are presumably the cepstrum
# size and context-window size the model was trained with -- confirm
# against the training configuration if the model changes.
ds = DeepSpeech(sys.argv[1], 26, 9)

# wav.read returns (sample_rate, sample_array).
fs, audio = wav.read(sys.argv[2])

# print as a function call so the demo runs under both Python 2 and 3.
print(ds.stt(audio, fs))

View File

@ -36,14 +36,14 @@ DeepSpeech::DeepSpeech(const char* aModelPath, int aNCep, int aNContext)
status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def); status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def);
if (!status.ok()) { if (!status.ok()) {
mPriv->session->Close(); mPriv->session->Close();
mPriv->session = nullptr; mPriv->session = NULL;
return; return;
} }
status = mPriv->session->Create(mPriv->graph_def); status = mPriv->session->Create(mPriv->graph_def);
if (!status.ok()) { if (!status.ok()) {
mPriv->session->Close(); mPriv->session->Close();
mPriv->session = nullptr; mPriv->session = NULL;
return; return;
} }
@ -136,7 +136,7 @@ DeepSpeech::getMfccFrames(const short* aBuffer, unsigned int aBufferSize,
*aNFrames = ds_input_length; *aNFrames = ds_input_length;
} }
if (aFrameLen) { if (aFrameLen) {
*aFrameLen = contextSize; *aFrameLen = frameSize;
} }
} }
@ -144,7 +144,7 @@ char*
DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen) DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
{ {
if (!mPriv->session) { if (!mPriv->session) {
return nullptr; return NULL;
} }
const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext); const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
@ -153,7 +153,7 @@ DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
} else if (aFrameLen < frameSize) { } else if (aFrameLen < frameSize) {
std::cerr << "mfcc features array is too small (expected " << std::cerr << "mfcc features array is too small (expected " <<
frameSize << ", got " << aFrameLen << ")\n"; frameSize << ", got " << aFrameLen << ")\n";
return nullptr; return NULL;
} }
Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize})); Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize}));
@ -175,7 +175,7 @@ DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
{"output_node"}, {}, &outputs); {"output_node"}, {}, &outputs);
if (!status.ok()) { if (!status.ok()) {
std::cerr << "Error running session: " << status.ToString() << "\n"; std::cerr << "Error running session: " << status.ToString() << "\n";
return nullptr; return NULL;
} }
// Output is an array of shape (1, n_results, result_length). // Output is an array of shape (1, n_results, result_length).
@ -199,7 +199,7 @@ DeepSpeech::stt(const short* aBuffer, unsigned int aBufferSize, int aSampleRate)
char* string; char* string;
int n_frames; int n_frames;
getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, nullptr); getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, NULL);
string = infer(mfcc, n_frames); string = infer(mfcc, n_frames);
free(mfcc); free(mfcc);
return string; return string;

View File

@ -2,6 +2,8 @@
#ifndef __DEEPSPEECH_H__ #ifndef __DEEPSPEECH_H__
#define __DEEPSPEECH_H__ #define __DEEPSPEECH_H__
#include <cstddef>
typedef struct _DeepSpeechPrivate DeepSpeechPrivate; typedef struct _DeepSpeechPrivate DeepSpeechPrivate;
class DeepSpeech { class DeepSpeech {
@ -27,7 +29,6 @@ class DeepSpeech {
* Extracts MFCC features from a given audio signal and adds the appropriate * Extracts MFCC features from a given audio signal and adds the appropriate
* amount of context to run inference with the given DeepSpeech context. * amount of context to run inference with the given DeepSpeech context.
* *
* @param aCtx A DeepSpeech context.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
* rate. * rate.
* @param aBufferSize The sample-length of the audio signal. * @param aBufferSize The sample-length of the audio signal.
@ -43,8 +44,8 @@ class DeepSpeech {
unsigned int aBufferSize, unsigned int aBufferSize,
int aSampleRate, int aSampleRate,
float** aMfcc, float** aMfcc,
int* aNFrames = nullptr, int* aNFrames = NULL,
int* aFrameLen = nullptr); int* aFrameLen = NULL);
/** /**
* @brief Run inference on the given audio. * @brief Run inference on the given audio.
@ -52,7 +53,6 @@ class DeepSpeech {
* Runs inference on the given MFCC audio features with the given DeepSpeech * Runs inference on the given MFCC audio features with the given DeepSpeech
* context. See DsGetMfccFrames(). * context. See DsGetMfccFrames().
* *
* @param aCtx A DeepSpeech context.
* @param aMfcc MFCC features with the appropriate amount of context per * @param aMfcc MFCC features with the appropriate amount of context per
* frame. * frame.
* @param aNFrames The number of frames in @p aMfcc. * @param aNFrames The number of frames in @p aMfcc.

View File

@ -0,0 +1 @@
# Re-export the SWIG-generated deepspeech module's public names at package level.
from .deepspeech import *

View File

@ -0,0 +1,17 @@
// SWIG interface definition for the deepspeech Python extension module.
%module deepspeech
// Code copied verbatim into the generated wrapper; SWIG_FILE_WITH_INIT
// tells numpy.i that this wrapper owns module initialisation.
%{
#define SWIG_FILE_WITH_INIT
#include "deepspeech.h"
%}
// numpy.i supplies the typemaps used in the %apply directives below.
%include "numpy.i"
// Run numpy's C-API initialisation when the module is imported.
%init %{
import_array();
%}
// Map the (buffer, size) C argument pairs onto single numpy-array
// Python arguments, matching the signatures in deepspeech.h:
// - stt() takes a 16-bit sample buffer plus its length.
%apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};
// - getMfccFrames() returns a caller-owned 2-D float array via out-params.
%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {(float** aMfcc, int* aNFrames, int* aFrameLen)};
// - infer() consumes a 2-D float feature array.
%apply (float* IN_ARRAY2, int DIM1, int DIM2) {(float* aMfcc, int aNFrames, int aFrameLen)};
// Wrap everything declared in the public header.
%include "../deepspeech.h"

3166
native_client/python/numpy.i Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
# Options consumed by the build_ext command when setup.py compiles the
# SWIG extension (this file is generated from setup.cfg.in by the Makefile).
[build_ext]
# deepspeech.h lives in the native_client directory itself.
include-dirs=./
# @LIBDIRS@ is replaced by the Makefile's setup.cfg rule with the
# bazel-bin paths containing libtensorflow/libdeepspeech and friends.
library-dirs=@LIBDIRS@
# -c++ wraps the header as C++; -keyword enables Python keyword arguments.
swig-opts=-c++ -keyword

36
native_client/setup.py Executable file
View File

@ -0,0 +1,36 @@
#! /usr/bin/env python
"""setuptools build script for the libdeepspeech Python bindings.

Builds the SWIG-generated `_deepspeech` extension and packages the
pure-Python wrapper from the python/ directory.
"""
from setuptools import setup, Extension
from distutils.command.build import build
import numpy

# Locate numpy's C headers; very old numpy releases spelled the
# accessor get_numpy_include() instead of get_include().
try:
    numpy_include = numpy.get_include()
except AttributeError:
    numpy_include = numpy.get_numpy_include()

class BuildExtFirst(build):
    """Run build_ext before build_py.

    The SWIG step of build_ext generates python/deepspeech.py, which
    build_py must see, so the default command order is rearranged to
    put build_ext first.
    """
    sub_commands = [('build_ext', build.has_ext_modules),
                    ('build_py', build.has_pure_modules),
                    ('build_clib', build.has_c_libraries),
                    ('build_scripts', build.has_scripts)]

# The extension is driven by the SWIG interface file; include/library
# directories come from setup.cfg (generated by the Makefile).
deepspeech = Extension('_deepspeech',
                       ['python/deepspeech.i'],
                       include_dirs = [numpy_include],
                       libraries = ['tensorflow', 'deepspeech', 'c_speech_features', 'kissfft'])

setup(name = 'deepspeech',
      description = 'A library for running inference on a DeepSpeech model',
      author = 'Chris Lord',
      author_email='chrislord.net@gmail.com',
      version = '0.0.1',
      package_dir = {'deepspeech': 'python'},
      packages = [ 'deepspeech' ],
      cmdclass = { 'build': BuildExtFirst },
      license = 'MPL-2.0',
      url = 'https://github.com/mozilla/DeepSpeech',
      ext_modules = [deepspeech])