Remove DS_AudioToInputVector and dep on c_speech_features

Reuben Morais 2019-02-28 17:13:48 -03:00
parent 1cea2b0fe8
commit 51f80744c6
7 changed files with 2 additions and 150 deletions

native_client/BUILD

@@ -62,14 +62,6 @@ tf_cc_shared_object(
     srcs = ["deepspeech.cc",
             "deepspeech.h",
             "alphabet.h",
-            "c_speech_features/c_speech_features.cpp",
-            "kiss_fft130/kiss_fft.c",
-            "kiss_fft130/tools/kiss_fftr.c",
-            "c_speech_features/c_speech_features.h",
-            "c_speech_features/c_speech_features_config.h",
-            "kiss_fft130/kiss_fft.h",
-            "kiss_fft130/_kiss_fft_guts.h",
-            "kiss_fft130/tools/kiss_fftr.h",
             "ds_version.h"] +
            DECODER_SOURCES,
    copts = select({
@@ -134,7 +126,7 @@ tf_cc_shared_object(
    }) + if_cuda([
        "//tensorflow/core:core",
    ]),
-   includes = ["c_speech_features", "kiss_fft130"] + DECODER_INCLUDES,
+   includes = DECODER_INCLUDES,
    defines = ["KENLM_MAX_ORDER=6"],
)

native_client/deepspeech.cc

@@ -23,8 +23,6 @@
 #include "tensorflow/lite/kernels/register.h"
 #endif // USE_TFLITE

-#include "c_speech_features.h"
-
 #include "ctcdecode/ctc_beam_search_decoder.h"

 #ifdef __ANDROID__
@@ -50,12 +48,6 @@ constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP *
 constexpr unsigned int MFCC_FEATURES = 26;

 constexpr float PREEMPHASIS_COEFF = 0.97f;
-
-constexpr unsigned int N_FFT = 512;
-constexpr unsigned int N_FILTERS = 26;
-constexpr unsigned int LOWFREQ = 0;
-constexpr unsigned int CEP_LIFTER = 22;
-
 constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;

 std::array<float, WINDOW_SIZE> calc_hamming_window() {
@@ -890,76 +882,8 @@ DS_DiscardStream(StreamingState* aSctx)
 }

 void
-DS_AudioToInputVector(const short* aBuffer,
-                      unsigned int aBufferSize,
-                      unsigned int aSampleRate,
-                      unsigned int aNCep,
-                      unsigned int aNContext,
-                      float** aMfcc,
-                      int* aNFrames,
-                      int* aFrameLen)
+DS_FreeMetadata(Metadata* m)
 {
-  const int contextSize = aNCep * aNContext;
-  const int frameSize = aNCep + (2 * aNCep * aNContext);
-
-  // Compute MFCC features
-  float* mfcc;
-  int n_frames = csf_mfcc(aBuffer, aBufferSize, aSampleRate,
-                          AUDIO_WIN_LEN, AUDIO_WIN_STEP, aNCep, N_FILTERS, N_FFT,
-                          LOWFREQ, aSampleRate/2, PREEMPHASIS_COEFF, CEP_LIFTER,
-                          1, NULL, &mfcc);
-
-  // Take every other frame (BiRNN stride of 2) and add past/future context
-  int ds_input_length = (n_frames + 1) / 2;
-  // TODO: Use MFCC of silence instead of zero
-  float* ds_input = (float*)calloc(ds_input_length * frameSize, sizeof(float));
-  for (int i = 0, idx = 0, mfcc_idx = 0; i < ds_input_length;
-       i++, idx += frameSize, mfcc_idx += aNCep * 2) {
-    // Past context
-    for (int j = aNContext; j > 0; j--) {
-      int frame_index = (i - j) * 2;
-      if (frame_index < 0) { continue; }
-      int mfcc_base = frame_index * aNCep;
-      int base = (aNContext - j) * aNCep;
-      for (int k = 0; k < aNCep; k++) {
-        ds_input[idx + base + k] = mfcc[mfcc_base + k];
-      }
-    }
-
-    // Present context
-    for (int j = 0; j < aNCep; j++) {
-      ds_input[idx + j + contextSize] = mfcc[mfcc_idx + j];
-    }
-
-    // Future context
-    for (int j = 1; j <= aNContext; j++) {
-      int frame_index = (i + j) * 2;
-      if (frame_index >= n_frames) { break; }
-      int mfcc_base = frame_index * aNCep;
-      int base = contextSize + aNCep + ((j - 1) * aNCep);
-      for (int k = 0; k < aNCep; k++) {
-        ds_input[idx + base + k] = mfcc[mfcc_base + k];
-      }
-    }
-  }
-
-  // Free mfcc array
-  free(mfcc);
-
-  if (aMfcc) {
-    *aMfcc = ds_input;
-  }
-  if (aNFrames) {
-    *aNFrames = ds_input_length;
-  }
-  if (aFrameLen) {
-    *aFrameLen = frameSize;
-  }
-}
-
-void
-DS_FreeMetadata(Metadata* m)
-{
   if (m) {
     delete(m->items);
     delete(m);

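For code that depended on the removed API, the packing step is easy to reproduce outside the library. Below is a hypothetical NumPy port of the stride-2 context windowing deleted above; the audio_to_input_vector name is ours, not part of any API, and it assumes the MFCC matrix has already been computed elsewhere, e.g. with python_speech_features (the library c_speech_features was ported from), using the parameters visible in this diff (numcep=26, nfilt=26, nfft=512, lowfreq=0, preemph=0.97, ceplifter=22).

import numpy as np

def audio_to_input_vector(mfcc, n_cep, n_context):
    """Replicate the removed DS_AudioToInputVector packing step.

    mfcc: (n_frames, n_cep) feature matrix. Takes every other frame
    (BiRNN stride of 2) and concatenates n_context past and n_context
    future strided frames around it, zero-padding past the edges.
    """
    n_frames = mfcc.shape[0]
    frame_size = n_cep * (2 * n_context + 1)  # frameSize in the C code
    ds_input_length = (n_frames + 1) // 2     # stride-2 downsampling
    ds_input = np.zeros((ds_input_length, frame_size), dtype=np.float32)
    for i in range(ds_input_length):
        # Slots 0..2*n_context hold strided frames (i-n_context)*2 ..
        # (i+n_context)*2; out-of-range slots stay zero (the calloc above).
        for slot, w in enumerate(range(i - n_context, i + n_context + 1)):
            frame_index = w * 2
            if 0 <= frame_index < n_frames:
                ds_input[i, slot * n_cep:(slot + 1) * n_cep] = mfcc[frame_index]
    return ds_input

For example, a (100, 26) MFCC matrix with n_context=9 packs into shape (50, 494): 50 = (100 + 1) // 2 strided frames, each holding 26 * (2 * 9 + 1) = 494 floats.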
native_client/deepspeech.h

@@ -228,41 +228,9 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_DiscardStream(StreamingState* aSctx);

-/**
- * @brief Given audio, return a vector suitable for input to a DeepSpeech
- *        model trained with the given parameters.
- *
- * Extracts MFCC features from a given audio signal and adds the appropriate
- * amount of context to run inference on a DeepSpeech model trained with
- * the given parameters.
- *
- * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
- *                rate.
- * @param aBufferSize The sample-length of the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
- * @param aNCep The number of cepstrum.
- * @param aNContext The size of the context window.
- * @param[out] aMfcc An array containing features, of shape
- *                   (@p aNFrames, ncep * ncontext). The user is responsible
- *                   for freeing the array.
- * @param[out] aNFrames (optional) The number of frames in @p aMfcc.
- * @param[out] aFrameLen (optional) The length of each frame
- *                       (ncep * ncontext) in @p aMfcc.
- */
-DEEPSPEECH_EXPORT
-void DS_AudioToInputVector(const short* aBuffer,
-                           unsigned int aBufferSize,
-                           unsigned int aSampleRate,
-                           unsigned int aNCep,
-                           unsigned int aNContext,
-                           float** aMfcc,
-                           int* aNFrames = NULL,
-                           int* aFrameLen = NULL);
-
 /**
  * @brief Free memory allocated for metadata information.
  */
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);

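One nit in the removed documentation: it gives the output shape as (aNFrames, ncep * ncontext), but the implementation above allocates aNCep + 2 * aNCep * aNContext floats per frame, i.e. aNCep * (2 * aNContext + 1). A quick worked check in Python, using 26 cepstra and a context of 9 (typical DeepSpeech values, assumed here rather than taken from this commit):

n_cep, n_context = 26, 9                     # assumed typical values
n_frames_mfcc = 100                          # example MFCC frame count

frame_len = n_cep + 2 * n_cep * n_context    # frameSize in the removed code
ds_frames = (n_frames_mfcc + 1) // 2         # stride-2 frame count

assert frame_len == n_cep * (2 * n_context + 1) == 494
assert ds_frames == 50

So each packed frame holds 494 floats, not the 26 * 9 = 234 the comment suggested.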
native_client/javascript/deepspeech.i

@@ -26,33 +26,6 @@ using namespace node;
 %apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};

-// convert DS_AudioToInputVector return values to a Node Buffer
-%typemap(in,numinputs=0)
-  (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
-  (float* data_temp, unsigned int dim1_temp, unsigned int dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-
-%typemap(argout)
-  (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
-{
-  Handle<Array> array = Array::New(Isolate::GetCurrent(), *$2);
-  for (unsigned int i = 0, idx = 0; i < *$2; i++) {
-    Handle<ArrayBuffer> buffer =
-      ArrayBuffer::New(Isolate::GetCurrent(), *$1, *$3 * sizeof(float));
-    memcpy(buffer->GetContents().Data(),
-           (*$1) + (idx += *$3), *$3 * sizeof(float));
-    Handle<Float32Array> inner = Float32Array::New(buffer, 0, *$3);
-    array->Set(i, inner);
-  }
-  free(*$1);
-  $result = array;
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2) {(float** aMfcc, unsigned int* aNFrames, unsigned int* aFrameLen)};
-
 // make sure the string returned by SpeechToText is freed
 %typemap(newfree) char* "free($1);";
 %newobject DS_SpeechToText;

native_client/javascript/index.js

@@ -66,6 +66,5 @@ Model.prototype.finishStream = function() {

 module.exports = {
   Model: Model,
-  audioToInputVector: binding.AudioToInputVector,
   printVersions: binding.PrintVersions
 };

native_client/python/__init__.py

@@ -1,7 +1,6 @@
 import deepspeech

 # rename for backwards compatibility
-from deepspeech.impl import AudioToInputVector as audioToInputVector
 from deepspeech.impl import PrintVersions as printVersions

 class Model(object):

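Python callers that imported audioToInputVector have no drop-in replacement after this commit; the intended path is to hand raw audio to the model and let feature computation happen internally. A minimal sketch, assuming the contemporary (v0.4-era) Python API; the constructor and stt signatures may differ in your checkout, and native_client/python/client.py is the reference:

import numpy as np
from deepspeech import Model

# 26 features, context of 9, beam width 500 match what client.py used at the
# time; treat them as placeholder arguments, not values from this commit.
ds = Model('output_graph.pb', 26, 9, 'alphabet.txt', 500)

audio = np.zeros(16000, dtype=np.int16)  # one second of silence at 16 kHz
print(ds.stt(audio, 16000))              # MFCCs are computed inside the model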
native_client/python/impl.i

@@ -13,9 +13,6 @@ import_array();
 // apply NumPy conversion typemap to DS_FeedAudioContent and DS_SpeechToText
 %apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};

-// apply NumPy conversion typemap to DS_AudioToInputVector
-%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {(float** aMfcc, int* aNFrames, int* aFrameLen)};
-
 %typemap(in, numinputs=0) ModelState **retval (ModelState *ret) {
   ret = NULL;
   $1 = &ret;