Convert libdeepspeech into a proper C++ library to ease use/bindings

Formerly, libdeepspeech was essentially a C library: initialisation returned
a context struct, and every other function required that struct as an
argument. Instead, provide a DeepSpeech class that handles its own destruction.
Chris Lord 2017-04-25 18:43:26 +01:00
parent 70da866e03
commit c9cd4ff6f6
3 changed files with 166 additions and 143 deletions
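
In caller terms, the change looks roughly like the sketch below. This is illustrative only: the function name `example` and its arguments are stand-ins for real audio data, and 26/9 are the values of the client's N_CEP/N_CONTEXT constants.

#include <stdlib.h>
#include "deepspeech.h"

// Hypothetical caller; buffer/bufferSize/sampleRate stand in for real audio.
void example(const char* modelPath, const short* buffer,
             unsigned int bufferSize, int sampleRate)
{
  // Before this commit: C-style API with an explicit context.
  //   DeepSpeechContext* ctx = DsInit(modelPath, N_CEP, N_CONTEXT);
  //   char* result = DsSTT(ctx, buffer, bufferSize, sampleRate);
  //   DsClose(ctx);

  // After: the context is an object that closes its TensorFlow session
  // in its destructor, so there is no DsClose() to forget.
  DeepSpeech ctx(modelPath, 26 /* N_CEP */, 9 /* N_CONTEXT */);
  char* result = ctx.stt(buffer, bufferSize, sampleRate);
  free(result);  // the caller still owns the returned string
}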

View File

@@ -22,7 +22,7 @@ struct ds_result {
 // DsSTT() instrumented
 struct ds_result*
-LocalDsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
+LocalDsSTT(DeepSpeech& aCtx, const short* aBuffer, size_t aBufferSize,
            int aSampleRate)
 {
   float* mfcc;
@@ -34,11 +34,11 @@ LocalDsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
   clock_t ds_start_time = clock();
   clock_t ds_end_mfcc = 0, ds_end_infer = 0;
 
-  int n_frames =
-    DsGetMfccFrames(aCtx, aBuffer, aBufferSize, aSampleRate, &mfcc);
+  int n_frames = 0;
+  aCtx.getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames);
   ds_end_mfcc = clock();
 
-  res->string = DsInfer(aCtx, mfcc, n_frames);
+  res->string = aCtx.infer(mfcc, n_frames);
   ds_end_infer = clock();
 
   free(mfcc);
@@ -66,8 +66,7 @@ main(int argc, char **argv)
   }
 
   // Initialise DeepSpeech
-  DeepSpeechContext* ctx = DsInit(argv[1], N_CEP, N_CONTEXT);
-  assert(ctx);
+  DeepSpeech ctx = DeepSpeech(argv[1], N_CEP, N_CONTEXT);
 
   // Initialise SOX
   assert(sox_init() == SOX_SUCCESS);
@@ -178,7 +177,6 @@ main(int argc, char **argv)
   }
 
   // Deinitialise and quit
-  DsClose(ctx);
   sox_quit();
 
   return 0;
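
Note that the old assert(ctx) after DsInit() has no direct replacement above, since a constructor cannot return NULL. Under this commit, the load-failure paths null out the internal session, so a bad model only surfaces later, when infer() (and therefore stt()) returns nullptr. A hedged sketch of a caller that restores an early check; checkedSTT and its arguments are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include "deepspeech.h"

// Sketch: recover an assert(ctx)-style failure check by testing the
// result, since stt() returns nullptr once the session is gone.
int checkedSTT(DeepSpeech& ctx, const short* buffer,
               unsigned int bufferSize, int sampleRate)
{
  char* result = ctx.stt(buffer, bufferSize, sampleRate);
  if (!result) {
    fprintf(stderr, "STT failed (bad model path or inference error)\n");
    return 1;
  }
  printf("%s\n", result);
  free(result);
  return 0;
}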

View File

@@ -1,7 +1,7 @@
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/platform/env.h"
 #include "deepspeech.h"
 #include "c_speech_features.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/env.h"
 
 #define COEFF 0.97f
 #define WIN_LEN 0.025f
@@ -9,76 +9,69 @@
 #define N_FFT 512
 #define N_FILTERS 26
 #define LOWFREQ 0
-#define N_CEP 26
 #define CEP_LIFTER 22
-#define N_CONTEXT 9
 
 using namespace tensorflow;
 
-struct _DeepSpeechContext {
+struct _DeepSpeechPrivate {
   Session* session;
   GraphDef graph_def;
   int ncep;
   int ncontext;
 };
 
-DeepSpeechContext*
-DsInit(const char* aModelPath, int aNCep, int aNContext)
+DeepSpeech::DeepSpeech(const char* aModelPath, int aNCep, int aNContext)
 {
+  mPriv = new DeepSpeechPrivate;
+
   if (!aModelPath) {
-    return NULL;
-  }
-
-  DeepSpeechContext* ctx = new DeepSpeechContext;
-
-  Status status = NewSession(SessionOptions(), &ctx->session);
-  if (!status.ok()) {
-    delete ctx;
-    return NULL;
-  }
-
-  status = ReadBinaryProto(Env::Default(), aModelPath, &ctx->graph_def);
-  if (!status.ok()) {
-    ctx->session->Close();
-    delete ctx;
-    return NULL;
-  }
-
-  status = ctx->session->Create(ctx->graph_def);
-  if (!status.ok()) {
-    ctx->session->Close();
-    delete ctx;
-    return NULL;
-  }
-
-  ctx->ncep = aNCep;
-  ctx->ncontext = aNContext;
-
-  return ctx;
-}
-
-void
-DsClose(DeepSpeechContext* aCtx)
-{
-  if (!aCtx) {
     return;
   }
-
-  aCtx->session->Close();
-  delete aCtx;
+
+  Status status = NewSession(SessionOptions(), &mPriv->session);
+  if (!status.ok()) {
+    return;
+  }
+
+  status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def);
+  if (!status.ok()) {
+    mPriv->session->Close();
+    mPriv->session = nullptr;
+    return;
+  }
+
+  status = mPriv->session->Create(mPriv->graph_def);
+  if (!status.ok()) {
+    mPriv->session->Close();
+    mPriv->session = nullptr;
+    return;
+  }
+
+  mPriv->ncep = aNCep;
+  mPriv->ncontext = aNContext;
 }
 
-int
-DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
-                size_t aBufferSize, int aSampleRate, float** aMfcc)
+DeepSpeech::~DeepSpeech()
 {
-  const int contextSize = aCtx->ncep * aCtx->ncontext;
-  const int frameSize = aCtx->ncep + (2 * aCtx->ncep * aCtx->ncontext);
+  if (mPriv->session) {
+    mPriv->session->Close();
+  }
+  delete mPriv;
+}
+
+void
+DeepSpeech::getMfccFrames(const short* aBuffer, unsigned int aBufferSize,
+                          int aSampleRate, float** aMfcc, int* aNFrames,
+                          int* aFrameLen)
+{
+  const int contextSize = mPriv->ncep * mPriv->ncontext;
+  const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
 
   // Compute MFCC features
   float* mfcc;
   int n_frames = csf_mfcc(aBuffer, aBufferSize, aSampleRate,
-                          WIN_LEN, WIN_STEP, aCtx->ncep, N_FILTERS, N_FFT,
+                          WIN_LEN, WIN_STEP, mPriv->ncep, N_FILTERS, N_FFT,
                           LOWFREQ, aSampleRate/2, COEFF, CEP_LIFTER, 1, NULL,
                           &mfcc);
@@ -87,30 +80,30 @@ DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
   // TODO: Use MFCC of silence instead of zero
   float* ds_input = (float*)calloc(sizeof(float), ds_input_length * frameSize);
   for (int i = 0, idx = 0, mfcc_idx = 0; i < ds_input_length;
-       i++, idx += frameSize, mfcc_idx += aCtx->ncep * 2) {
+       i++, idx += frameSize, mfcc_idx += mPriv->ncep * 2) {
     // Past context
-    for (int j = N_CONTEXT; j > 0; j--) {
+    for (int j = mPriv->ncontext; j > 0; j--) {
       int frame_index = (i * 2) - (j * 2);
       if (frame_index < 0) { continue; }
-      int mfcc_base = frame_index * aCtx->ncep;
-      int base = (N_CONTEXT - j) * N_CEP;
-      for (int k = 0; k < N_CEP; k++) {
+      int mfcc_base = frame_index * mPriv->ncep;
+      int base = (mPriv->ncontext - j) * mPriv->ncep;
+      for (int k = 0; k < mPriv->ncep; k++) {
         ds_input[idx + base + k] = mfcc[mfcc_base + k];
       }
     }
 
     // Present context
-    for (int j = 0; j < N_CEP; j++) {
+    for (int j = 0; j < mPriv->ncep; j++) {
       ds_input[idx + j + contextSize] = mfcc[mfcc_idx + j];
     }
 
     // Future context
-    for (int j = 1; j <= N_CONTEXT; j++) {
+    for (int j = 1; j <= mPriv->ncontext; j++) {
       int frame_index = (i * 2) + (j * 2);
       if (frame_index >= n_frames) { continue; }
-      int mfcc_base = frame_index * aCtx->ncep;
-      int base = contextSize + N_CEP + ((j - 1) * N_CEP);
-      for (int k = 0; k < N_CEP; k++) {
+      int mfcc_base = frame_index * mPriv->ncep;
+      int base = contextSize + mPriv->ncep + ((j - 1) * mPriv->ncep);
+      for (int k = 0; k < mPriv->ncep; k++) {
         ds_input[idx + base + k] = mfcc[mfcc_base + k];
       }
     }
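
The windowing above lays each output frame out as [past context | present frame | future context]. With the values of the deleted N_CEP/N_CONTEXT defines (26 and 9, now carried in mPriv), the offsets work out as in this small sketch; it is only the arithmetic the loop performs, not new behaviour:

// The per-frame layout built by the loop above, for ncep = 26, ncontext = 9.
constexpr int ncep = 26;
constexpr int ncontext = 9;
constexpr int contextSize = ncep * ncontext;            // 234 floats
constexpr int frameSize = ncep + 2 * ncep * ncontext;   // 26 + 468 = 494 floats

static_assert(contextSize == 234, "one context block (past or future)");
static_assert(frameSize == 494, "past + present + future per frame");

// Offsets within one frame of ds_input:
//   [0, 234)    past context   (9 older frames x 26 cepstra)
//   [234, 260)  present frame  (26 cepstra)
//   [260, 494)  future context (9 newer frames x 26 cepstra)
// Frames near the clip edges leave missing neighbours zeroed
// (see the TODO about using the MFCC of silence instead of zero).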
@@ -136,14 +129,33 @@ DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
     ds_input[idx] = (float)((ds_input[idx] - mean) / stddev);
   }
 
-  *aMfcc = ds_input;
-  return ds_input_length;
+  if (aMfcc) {
+    *aMfcc = ds_input;
+  }
+  if (aNFrames) {
+    *aNFrames = ds_input_length;
+  }
+  if (aFrameLen) {
+    *aFrameLen = contextSize;
+  }
 }
 
 char*
-DsInfer(DeepSpeechContext* aCtx, float* aMfcc, int aNFrames)
+DeepSpeech::infer(float* aMfcc, int aNFrames, int aFrameLen)
 {
-  const int frameSize = aCtx->ncep + (2 * aCtx->ncep * aCtx->ncontext);
+  if (!mPriv->session) {
+    return nullptr;
+  }
+
+  const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
+
+  if (aFrameLen == 0) {
+    aFrameLen = frameSize;
+  } else if (aFrameLen < frameSize) {
+    std::cerr << "mfcc features array is too small (expected " <<
+                 frameSize << ", got " << aFrameLen << ")\n";
+    return nullptr;
+  }
 
   Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize}));
 
   auto input_mapped = input.tensor<float, 3>();
@@ -151,18 +163,19 @@ DsInfer(DeepSpeechContext* aCtx, float* aMfcc, int aNFrames)
     for (int j = 0; j < frameSize; j++, idx++) {
       input_mapped(0, i, j) = aMfcc[idx];
     }
+    idx += (aFrameLen - frameSize);
   }
 
   Tensor n_frames(DT_INT32, TensorShape({1}));
   n_frames.scalar<int>()() = aNFrames;
 
   std::vector<Tensor> outputs;
-  Status status =
-    aCtx->session->Run({{ "input_node", input }, { "input_lengths", n_frames }},
-                       {"output_node"}, {}, &outputs);
+  Status status = mPriv->session->Run(
+    {{ "input_node", input }, { "input_lengths", n_frames }},
+    {"output_node"}, {}, &outputs);
 
   if (!status.ok()) {
     std::cerr << "Error running session: " << status.ToString() << "\n";
-    return NULL;
+    return nullptr;
   }
 
   // Output is an array of shape (1, n_results, result_length).
@@ -180,14 +193,14 @@ DsInfer(DeepSpeechContext* aCtx, float* aMfcc, int aNFrames)
 }
 
 char*
-DsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
-      int aSampleRate)
+DeepSpeech::stt(const short* aBuffer, unsigned int aBufferSize, int aSampleRate)
 {
   float* mfcc;
   char* string;
-  int n_frames =
-    DsGetMfccFrames(aCtx, aBuffer, aBufferSize, aSampleRate, &mfcc);
-  string = DsInfer(aCtx, mfcc, n_frames);
+  int n_frames;
+
+  getMfccFrames(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, nullptr);
+  string = infer(mfcc, n_frames);
   free(mfcc);
   return string;
 }
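
The new aFrameLen argument changes how infer() walks the features array: 0 means each row is exactly frameSize floats, a larger value makes the copy loop skip the excess per row (the new idx adjustment above), and a smaller value is rejected. A hedged sketch of the three cases; the function and buffer names are illustrative, not part of the commit:

#include <stdlib.h>
#include "deepspeech.h"

// Illustrative only: how infer() interprets aFrameLen under this commit.
// For ncep = 26, ncontext = 9, infer() recomputes frameSize as 494.
void inferExamples(DeepSpeech& ctx, float* mfcc, float* padded_mfcc,
                   float* short_mfcc, int n_frames)
{
  // Default (aFrameLen = 0): each row of mfcc is exactly frameSize floats.
  char* a = ctx.infer(mfcc, n_frames);

  // aFrameLen > frameSize: rows are wider than the model expects; the
  // copy loop reads frameSize floats per row, then skips the remainder
  // via `idx += (aFrameLen - frameSize)`.
  char* b = ctx.infer(padded_mfcc, n_frames, 512);

  // aFrameLen < frameSize: too small; infer() prints an error to stderr
  // and returns nullptr.
  char* c = ctx.infer(short_mfcc, n_frames, 400);

  free(a); free(b); free(c);  // free(nullptr) is a no-op
}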

View File

@@ -2,9 +2,14 @@
 #ifndef __DEEPSPEECH_H__
 #define __DEEPSPEECH_H__
 
-typedef struct _DeepSpeechContext DeepSpeechContext;
+typedef struct _DeepSpeechPrivate DeepSpeechPrivate;
 
-/**
+class DeepSpeech {
+ private:
+  DeepSpeechPrivate* mPriv;
+
+ public:
+  /**
    * @brief Initialise a DeepSpeech context.
    *
    * @param aModelPath The path to the frozen model graph.
@@ -13,16 +18,10 @@ typedef struct _DeepSpeechContext DeepSpeechContext;
    *
    * @return A DeepSpeech context.
    */
-DeepSpeechContext* DsInit(const char* aModelPath, int aNCep, int aNContext);
+  DeepSpeech(const char* aModelPath, int aNCep, int aNContext);
+  ~DeepSpeech();
 
   /**
-   * @brief De-initialise a DeepSpeech context.
-   *
-   * @param aCtx A DeepSpeech context.
-   */
-void DsClose(DeepSpeechContext* aCtx);
-
-/**
    * @brief Extract MFCC features from a given audio signal and add context.
    *
    * Extracts MFCC features from a given audio signal and adds the appropriate
@@ -34,30 +33,41 @@ void DsClose(DeepSpeechContext* aCtx);
    * @param aBufferSize The sample-length of the audio signal.
    * @param aSampleRate The sample-rate of the audio signal.
-   * @param[out] aMFCC An array containing features, of shape
-   *                   (frames, ncep * ncontext). The user is responsible for
-   *                   freeing the array.
-   *
-   * @return The number of frames in @p aMFCC.
+   * @param[out] aMFCC An array containing features, of shape
+   *                   (@p aNFrames, ncep * ncontext). The user is responsible
+   *                   for freeing the array.
+   * @param[out] aNFrames (optional) The number of frames in @p aMFCC.
+   * @param[out] aFrameLen (optional) The length of each frame
+   *                       (ncep * ncontext) in @p aMFCC.
    */
-int DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
-                    size_t aBufferSize, int aSampleRate, float** aMfcc);
+  void getMfccFrames(const short* aBuffer,
+                     unsigned int aBufferSize,
+                     int aSampleRate,
+                     float** aMfcc,
+                     int* aNFrames = nullptr,
+                     int* aFrameLen = nullptr);
 
   /**
    * @brief Run inference on the given audio.
    *
    * Runs inference on the given MFCC audio features with the given DeepSpeech
    * context. See DsGetMfccFrames().
    *
-   * @param aCtx A DeepSpeech context.
-   * @param aMfcc MFCC features with the appropriate amount of context per frame.
+   * @param aMfcc MFCC features with the appropriate amount of context per
+   *              frame.
    * @param aNFrames The number of frames in @p aMfcc.
+   * @param aFrameLen (optional) The length of each frame in @p aMfcc. If
+   *                  specified, this will be used to verify the array is
+   *                  large enough.
    *
    * @return The resulting string after running inference. The user is
    *         responsible for freeing this string.
    */
-char* DsInfer(DeepSpeechContext* aCtx, float* aMfcc, int aNFrames);
+  char* infer(float* aMfcc,
+              int aNFrames,
+              int aFrameLen = 0);
 
   /**
    * @brief Use DeepSpeech to perform Speech-To-Text.
    *
    * @param aMfcc An MFCC features array.
@@ -68,7 +78,9 @@ char* DsInfer(DeepSpeechContext* aCtx, float* aMfcc, int aNFrames);
    *
    * @return The STT result. The user is responsible for freeing this string.
    */
-char* DsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
-            int aSampleRate);
+  char* stt(const short* aBuffer,
+            unsigned int aBufferSize,
+            int aSampleRate);
+};
 
 #endif /* __DEEPSPEECH_H__ */
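
Since the two new out-parameters of getMfccFrames() default to nullptr, two-step callers only ask for what they need. A brief illustrative sketch; featureExamples and its arguments are stand-ins, not part of the commit:

#include <stdlib.h>
#include "deepspeech.h"

// Illustrative: the out-params of getMfccFrames(), of which the last
// two default to nullptr and may be omitted.
void featureExamples(DeepSpeech& ctx, const short* buffer,
                     unsigned int bufferSize, int sampleRate)
{
  float* mfcc;
  int n_frames, frame_len;

  // Ask for everything: features, frame count and per-frame length.
  ctx.getMfccFrames(buffer, bufferSize, sampleRate,
                    &mfcc, &n_frames, &frame_len);
  free(mfcc);

  // Or just the features and the frame count, as stt() does internally.
  ctx.getMfccFrames(buffer, bufferSize, sampleRate, &mfcc, &n_frames);
  free(mfcc);
}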