Add libdeepspeech python bindings
This commit is contained in:
parent
c9cd4ff6f6
commit
10068fc40e
|
@ -1,7 +1,14 @@
|
||||||
.ipynb_checkpoints
|
.ipynb_checkpoints
|
||||||
*.pyc
|
*.pyc
|
||||||
|
*.swp
|
||||||
*.DS_Store
|
*.DS_Store
|
||||||
/werlog.js
|
/werlog.js
|
||||||
/data
|
/data
|
||||||
/logs
|
/logs
|
||||||
/exports
|
/exports
|
||||||
|
/native_client/setup.cfg
|
||||||
|
/native_client/build
|
||||||
|
/native_client/deepspeech.egg-info
|
||||||
|
/native_client/dist
|
||||||
|
/native_client/python/deepspeech.py
|
||||||
|
/native_client/python/deepspeech_wrap.cpp
|
||||||
|
|
|
@ -9,9 +9,12 @@
|
||||||
### $ make -C native_client/ TARGET=rpi3 TFDIR=../../tensorflow/tensorflow/
|
### $ make -C native_client/ TARGET=rpi3 TFDIR=../../tensorflow/tensorflow/
|
||||||
###
|
###
|
||||||
|
|
||||||
|
# install/uninstall are command targets, not files; declare them phony so a
# stray file or directory with the same name cannot make them look up to date.
.PHONY: clean run bindings install uninstall
|
||||||
|
|
||||||
TARGET ?= host
|
TARGET ?= host
|
||||||
TFDIR ?= ../../tensorflow
|
TFDIR ?= ../../tensorflow
|
||||||
CXX ?= c++
|
CXX ?= c++
|
||||||
|
PREFIX ?= /usr/local
|
||||||
|
|
||||||
ifeq ($(TARGET),host)
|
ifeq ($(TARGET),host)
|
||||||
TOOLCHAIN :=
|
TOOLCHAIN :=
|
||||||
|
@ -38,10 +41,30 @@ endif
|
||||||
default: deepspeech
|
default: deepspeech
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f deepspeech
|
rm -rf build dist deepspeech.egg-info
|
||||||
|
rm -f deepspeech setup.cfg python/deepspeech_wrap.cpp python/deepspeech.py
|
||||||
|
|
||||||
deepspeech: client.cc
|
deepspeech: client.cc
|
||||||
$(TOOLCHAIN)$(CXX) -o deepspeech $(CFLAGS) client.cc $(LDFLAGS)
|
$(TOOLCHAIN)$(CXX) -o deepspeech $(CFLAGS) client.cc $(LDFLAGS)
|
||||||
|
|
||||||
|
setup.cfg: setup.cfg.in
|
||||||
|
sed -e 's:@LIBDIRS@:${TFDIR}/bazel-bin/tensorflow\:${TFDIR}/bazel-bin/native_client:g' setup.cfg.in > setup.cfg
|
||||||
|
|
||||||
|
bindings: setup.cfg
|
||||||
|
python ./setup.py bdist_wheel
|
||||||
|
|
||||||
run: deepspeech
|
run: deepspeech
|
||||||
${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/tensorflow:${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS}
|
${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/tensorflow:${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS}
|
||||||
|
|
||||||
|
install:
|
||||||
|
install -d ${PREFIX}/lib
|
||||||
|
install -m 0644 ${TFDIR}/bazel-bin/tensorflow/libtensorflow.so ${PREFIX}/lib/
|
||||||
|
install -m 0644 ${TFDIR}/bazel-bin/native_client/libkissfft.so ${PREFIX}/lib/
|
||||||
|
install -m 0644 ${TFDIR}/bazel-bin/native_client/libc_speech_features.so ${PREFIX}/lib/
|
||||||
|
install -m 0644 ${TFDIR}/bazel-bin/native_client/libdeepspeech.so ${PREFIX}/lib/
|
||||||
|
|
||||||
|
uninstall:
|
||||||
|
rm -f ${PREFIX}/lib/libtensorflow.so
|
||||||
|
rm -f ${PREFIX}/lib/libkissfft.so
|
||||||
|
rm -f ${PREFIX}/lib/libc_speech_features.so
|
||||||
|
rm -f ${PREFIX}/lib/libdeepspeech.so
|
||||||
|
|
|
@ -46,3 +46,15 @@ The client can be run via the `Makefile`. The client will accept audio of any fo
|
||||||
```
|
```
|
||||||
ARGS="/path/to/output_graph.pb /path/to/audio/file.ogg" make run
|
ARGS="/path/to/output_graph.pb /path/to/audio/file.ogg" make run
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Python bindings
|
||||||
|
|
||||||
|
Included is a set of generated Python bindings. After following the above build instructions, they can be installed by executing the following commands (or the equivalent on your system):
|
||||||
|
|
||||||
|
```
|
||||||
|
PREFIX=/usr/local make install
|
||||||
|
make bindings
|
||||||
|
sudo pip install dist/deepspeech*
|
||||||
|
```
|
||||||
|
|
||||||
|
It is assumed that `$PREFIX/lib` is on the library search path; otherwise, you may need to alter your environment (for example, by adding it to `LD_LIBRARY_PATH` on Linux). The API mirrors the C++ API and is demonstrated in [client.py](client.py). Refer to [deepspeech.h](deepspeech.h) for documentation.
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
import sys
|
||||||
|
import scipy.io.wavfile as wav
|
||||||
|
from deepspeech import DeepSpeech
|
||||||
|
|
||||||
|
ds = DeepSpeech(sys.argv[1], 26, 9)
|
||||||
|
fs, audio = wav.read(sys.argv[2])
|
||||||
|
# Run speech-to-text on the loaded audio and print the result. The
# parenthesised form works under both Python 2 and Python 3, whereas the bare
# print statement is a SyntaxError on Python 3.
print(ds.stt(audio, fs))
|
|
@ -36,14 +36,14 @@ DeepSpeech::DeepSpeech(const char* aModelPath, int aNCep, int aNContext)
|
||||||
status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def);
|
status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
mPriv->session->Close();
|
mPriv->session->Close();
|
||||||
mPriv->session = nullptr;
|
mPriv->session = NULL;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
status = mPriv->session->Create(mPriv->graph_def);
|
status = mPriv->session->Create(mPriv->graph_def);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
mPriv->session->Close();
|
mPriv->session->Close();
|
||||||
mPriv->session = nullptr;
|
mPriv->session = NULL;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,7 +136,7 @@ DeepSpeech::getMfccFrames(const short* aBuffer, unsigned int aBufferSize,
|
||||||
*aNFrames = ds_input_length;
|
*aNFrames = ds_input_length;
|
||||||
}
|
}
|
||||||
if (aFrameLen) {
|
if (aFrameLen) {
|
||||||
*aFrameLen = contextSize;
|
*aFrameLen = frameSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ char*
|
||||||
DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
|
DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
|
||||||
{
|
{
|
||||||
if (!mPriv->session) {
|
if (!mPriv->session) {
|
||||||
return nullptr;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
|
const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
|
||||||
|
@ -153,7 +153,7 @@ DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
|
||||||
} else if (aFrameLen < frameSize) {
|
} else if (aFrameLen < frameSize) {
|
||||||
std::cerr << "mfcc features array is too small (expected " <<
|
std::cerr << "mfcc features array is too small (expected " <<
|
||||||
frameSize << ", got " << aFrameLen << ")\n";
|
frameSize << ", got " << aFrameLen << ")\n";
|
||||||
return nullptr;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize}));
|
Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize}));
|
||||||
|
@ -175,7 +175,7 @@ DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
|
||||||
{"output_node"}, {}, &outputs);
|
{"output_node"}, {}, &outputs);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
std::cerr << "Error running session: " << status.ToString() << "\n";
|
std::cerr << "Error running session: " << status.ToString() << "\n";
|
||||||
return nullptr;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output is an array of shape (1, n_results, result_length).
|
// Output is an array of shape (1, n_results, result_length).
|
||||||
|
@ -199,7 +199,7 @@ DeepSpeech::stt(const short* aBuffer, unsigned int aBufferSize, int aSampleRate)
|
||||||
char* string;
|
char* string;
|
||||||
int n_frames;
|
int n_frames;
|
||||||
|
|
||||||
getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, nullptr);
|
getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, NULL);
|
||||||
string = infer(mfcc, n_frames);
|
string = infer(mfcc, n_frames);
|
||||||
free(mfcc);
|
free(mfcc);
|
||||||
return string;
|
return string;
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
#ifndef __DEEPSPEECH_H__
|
#ifndef __DEEPSPEECH_H__
|
||||||
#define __DEEPSPEECH_H__
|
#define __DEEPSPEECH_H__
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
|
||||||
typedef struct _DeepSpeechPrivate DeepSpeechPrivate;
|
typedef struct _DeepSpeechPrivate DeepSpeechPrivate;
|
||||||
|
|
||||||
class DeepSpeech {
|
class DeepSpeech {
|
||||||
|
@ -27,7 +29,6 @@ class DeepSpeech {
|
||||||
* Extracts MFCC features from a given audio signal and adds the appropriate
|
* Extracts MFCC features from a given audio signal and adds the appropriate
|
||||||
* amount of context to run inference with the given DeepSpeech context.
|
* amount of context to run inference with the given DeepSpeech context.
|
||||||
*
|
*
|
||||||
* @param aCtx A DeepSpeech context.
|
|
||||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
|
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
|
||||||
* rate.
|
* rate.
|
||||||
* @param aBufferSize The sample-length of the audio signal.
|
* @param aBufferSize The sample-length of the audio signal.
|
||||||
|
@ -43,8 +44,8 @@ class DeepSpeech {
|
||||||
unsigned int aBufferSize,
|
unsigned int aBufferSize,
|
||||||
int aSampleRate,
|
int aSampleRate,
|
||||||
float** aMfcc,
|
float** aMfcc,
|
||||||
int* aNFrames = nullptr,
|
int* aNFrames = NULL,
|
||||||
int* aFrameLen = nullptr);
|
int* aFrameLen = NULL);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Run inference on the given audio.
|
* @brief Run inference on the given audio.
|
||||||
|
@ -52,7 +53,6 @@ class DeepSpeech {
|
||||||
* Runs inference on the given MFCC audio features with the given DeepSpeech
|
* Runs inference on the given MFCC audio features with the given DeepSpeech
|
||||||
* context. See DsGetMfccFrames().
|
* context. See DsGetMfccFrames().
|
||||||
*
|
*
|
||||||
* @param aCtx A DeepSpeech context.
|
|
||||||
* @param aMfcc MFCC features with the appropriate amount of context per
|
* @param aMfcc MFCC features with the appropriate amount of context per
|
||||||
* frame.
|
* frame.
|
||||||
* @param aNFrames The number of frames in @p aMfcc.
|
* @param aNFrames The number of frames in @p aMfcc.
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
from .deepspeech import *
|
|
@ -0,0 +1,17 @@
|
||||||
|
%module deepspeech
|
||||||
|
|
||||||
|
%{
|
||||||
|
#define SWIG_FILE_WITH_INIT
|
||||||
|
#include "deepspeech.h"
|
||||||
|
%}
|
||||||
|
|
||||||
|
%include "numpy.i"
|
||||||
|
%init %{
|
||||||
|
import_array();
|
||||||
|
%}
|
||||||
|
|
||||||
|
%apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};
|
||||||
|
%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {(float** aMfcc, int* aNFrames, int* aFrameLen)};
|
||||||
|
%apply (float* IN_ARRAY2, int DIM1, int DIM2) {(float* aMfcc, int aNFrames, int aFrameLen)};
|
||||||
|
|
||||||
|
%include "../deepspeech.h"
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,4 @@
|
||||||
|
[build_ext]
|
||||||
|
include-dirs=./
|
||||||
|
library-dirs=@LIBDIRS@
|
||||||
|
swig-opts=-c++ -keyword
|
|
@ -0,0 +1,36 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
from setuptools import setup, Extension
|
||||||
|
from distutils.command.build import build
|
||||||
|
|
||||||
|
import os
|
||||||
|
import numpy
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
try:
|
||||||
|
numpy_include = numpy.get_include()
|
||||||
|
except AttributeError:
|
||||||
|
numpy_include = numpy.get_numpy_include()
|
||||||
|
|
||||||
|
class BuildExtFirst(build):
|
||||||
|
sub_commands = [('build_ext', build.has_ext_modules),
|
||||||
|
('build_py', build.has_pure_modules),
|
||||||
|
('build_clib', build.has_c_libraries),
|
||||||
|
('build_scripts', build.has_scripts)]
|
||||||
|
|
||||||
|
deepspeech = Extension('_deepspeech',
|
||||||
|
['python/deepspeech.i'],
|
||||||
|
include_dirs = [numpy_include],
|
||||||
|
libraries = ['tensorflow', 'deepspeech', 'c_speech_features', 'kissfft'])
|
||||||
|
|
||||||
|
setup(name = 'deepspeech',
|
||||||
|
description = 'A library for running inference on a DeepSpeech model',
|
||||||
|
author = 'Chris Lord',
|
||||||
|
author_email='chrislord.net@gmail.com',
|
||||||
|
version = '0.0.1',
|
||||||
|
package_dir = {'deepspeech': 'python'},
|
||||||
|
packages = [ 'deepspeech' ],
|
||||||
|
cmdclass = { 'build': BuildExtFirst },
|
||||||
|
license = 'MPL-2.0',
|
||||||
|
url = 'https://github.com/mozilla/DeepSpeech',
|
||||||
|
ext_modules = [deepspeech])
|
Loading…
Reference in New Issue