Remove DS_AudioToInputVector and dep on c_speech_features
This commit is contained in:
parent
1cea2b0fe8
commit
51f80744c6
@ -62,14 +62,6 @@ tf_cc_shared_object(
|
||||
srcs = ["deepspeech.cc",
|
||||
"deepspeech.h",
|
||||
"alphabet.h",
|
||||
"c_speech_features/c_speech_features.cpp",
|
||||
"kiss_fft130/kiss_fft.c",
|
||||
"kiss_fft130/tools/kiss_fftr.c",
|
||||
"c_speech_features/c_speech_features.h",
|
||||
"c_speech_features/c_speech_features_config.h",
|
||||
"kiss_fft130/kiss_fft.h",
|
||||
"kiss_fft130/_kiss_fft_guts.h",
|
||||
"kiss_fft130/tools/kiss_fftr.h",
|
||||
"ds_version.h"] +
|
||||
DECODER_SOURCES,
|
||||
copts = select({
|
||||
@ -134,7 +126,7 @@ tf_cc_shared_object(
|
||||
}) + if_cuda([
|
||||
"//tensorflow/core:core",
|
||||
]),
|
||||
includes = ["c_speech_features", "kiss_fft130"] + DECODER_INCLUDES,
|
||||
includes = DECODER_INCLUDES,
|
||||
defines = ["KENLM_MAX_ORDER=6"],
|
||||
)
|
||||
|
||||
|
@ -23,8 +23,6 @@
|
||||
#include "tensorflow/lite/kernels/register.h"
|
||||
#endif // USE_TFLITE
|
||||
|
||||
#include "c_speech_features.h"
|
||||
|
||||
#include "ctcdecode/ctc_beam_search_decoder.h"
|
||||
|
||||
#ifdef __ANDROID__
|
||||
@ -50,12 +48,6 @@ constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP *
|
||||
|
||||
constexpr unsigned int MFCC_FEATURES = 26;
|
||||
|
||||
constexpr float PREEMPHASIS_COEFF = 0.97f;
|
||||
constexpr unsigned int N_FFT = 512;
|
||||
constexpr unsigned int N_FILTERS = 26;
|
||||
constexpr unsigned int LOWFREQ = 0;
|
||||
constexpr unsigned int CEP_LIFTER = 22;
|
||||
|
||||
constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;
|
||||
|
||||
std::array<float, WINDOW_SIZE> calc_hamming_window() {
|
||||
@ -890,76 +882,8 @@ DS_DiscardStream(StreamingState* aSctx)
|
||||
}
|
||||
|
||||
void
|
||||
DS_AudioToInputVector(const short* aBuffer,
|
||||
unsigned int aBufferSize,
|
||||
unsigned int aSampleRate,
|
||||
unsigned int aNCep,
|
||||
unsigned int aNContext,
|
||||
float** aMfcc,
|
||||
int* aNFrames,
|
||||
int* aFrameLen)
|
||||
DS_FreeMetadata(Metadata* m)
|
||||
{
|
||||
const int contextSize = aNCep * aNContext;
|
||||
const int frameSize = aNCep + (2 * aNCep * aNContext);
|
||||
|
||||
// Compute MFCC features
|
||||
float* mfcc;
|
||||
int n_frames = csf_mfcc(aBuffer, aBufferSize, aSampleRate,
|
||||
AUDIO_WIN_LEN, AUDIO_WIN_STEP, aNCep, N_FILTERS, N_FFT,
|
||||
LOWFREQ, aSampleRate/2, PREEMPHASIS_COEFF, CEP_LIFTER,
|
||||
1, NULL, &mfcc);
|
||||
|
||||
// Take every other frame (BiRNN stride of 2) and add past/future context
|
||||
int ds_input_length = (n_frames + 1) / 2;
|
||||
// TODO: Use MFCC of silence instead of zero
|
||||
float* ds_input = (float*)calloc(ds_input_length * frameSize, sizeof(float));
|
||||
for (int i = 0, idx = 0, mfcc_idx = 0; i < ds_input_length;
|
||||
i++, idx += frameSize, mfcc_idx += aNCep * 2) {
|
||||
// Past context
|
||||
for (int j = aNContext; j > 0; j--) {
|
||||
int frame_index = (i - j) * 2;
|
||||
if (frame_index < 0) { continue; }
|
||||
int mfcc_base = frame_index * aNCep;
|
||||
int base = (aNContext - j) * aNCep;
|
||||
for (int k = 0; k < aNCep; k++) {
|
||||
ds_input[idx + base + k] = mfcc[mfcc_base + k];
|
||||
}
|
||||
}
|
||||
|
||||
// Present context
|
||||
for (int j = 0; j < aNCep; j++) {
|
||||
ds_input[idx + j + contextSize] = mfcc[mfcc_idx + j];
|
||||
}
|
||||
|
||||
// Future context
|
||||
for (int j = 1; j <= aNContext; j++) {
|
||||
int frame_index = (i + j) * 2;
|
||||
if (frame_index >= n_frames) { break; }
|
||||
int mfcc_base = frame_index * aNCep;
|
||||
int base = contextSize + aNCep + ((j - 1) * aNCep);
|
||||
for (int k = 0; k < aNCep; k++) {
|
||||
ds_input[idx + base + k] = mfcc[mfcc_base + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Free mfcc array
|
||||
free(mfcc);
|
||||
|
||||
if (aMfcc) {
|
||||
*aMfcc = ds_input;
|
||||
}
|
||||
if (aNFrames) {
|
||||
*aNFrames = ds_input_length;
|
||||
}
|
||||
if (aFrameLen) {
|
||||
*aFrameLen = frameSize;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
DS_FreeMetadata(Metadata* m)
|
||||
{
|
||||
if (m) {
|
||||
delete(m->items);
|
||||
delete(m);
|
||||
|
@ -228,41 +228,9 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
|
||||
DEEPSPEECH_EXPORT
|
||||
void DS_DiscardStream(StreamingState* aSctx);
|
||||
|
||||
/**
|
||||
* @brief Given audio, return a vector suitable for input to a DeepSpeech
|
||||
* model trained with the given parameters.
|
||||
*
|
||||
* Extracts MFCC features from a given audio signal and adds the appropriate
|
||||
* amount of context to run inference on a DeepSpeech model trained with
|
||||
* the given parameters.
|
||||
*
|
||||
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
|
||||
* rate.
|
||||
* @param aBufferSize The sample-length of the audio signal.
|
||||
* @param aSampleRate The sample-rate of the audio signal.
|
||||
* @param aNCep The number of cepstral coefficients.
|
||||
* @param aNContext The number of context frames on each side of the present frame.
|
||||
* @param[out] aMfcc An array containing features, of shape
|
||||
* (@p aNFrames, ncep + 2 * ncep * ncontext). The user is responsible
|
||||
* for freeing the array.
|
||||
* @param[out] aNFrames (optional) The number of frames in @p aMfcc.
|
||||
* @param[out] aFrameLen (optional) The length of each frame
|
||||
* (ncep + 2 * ncep * ncontext) in @p aMfcc.
|
||||
*/
|
||||
DEEPSPEECH_EXPORT
|
||||
void DS_AudioToInputVector(const short* aBuffer,
|
||||
unsigned int aBufferSize,
|
||||
unsigned int aSampleRate,
|
||||
unsigned int aNCep,
|
||||
unsigned int aNContext,
|
||||
float** aMfcc,
|
||||
int* aNFrames = NULL,
|
||||
int* aFrameLen = NULL);
|
||||
|
||||
/**
|
||||
* @brief Free memory allocated for metadata information.
|
||||
*/
|
||||
|
||||
DEEPSPEECH_EXPORT
|
||||
void DS_FreeMetadata(Metadata* m);
|
||||
|
||||
|
@ -26,33 +26,6 @@ using namespace node;
|
||||
%apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};
|
||||
|
||||
|
||||
// convert DS_AudioToInputVector return values to a Node Array of Float32Array rows
|
||||
%typemap(in,numinputs=0)
|
||||
(float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
|
||||
(float* data_temp, unsigned int dim1_temp, unsigned int dim2_temp)
|
||||
{
|
||||
$1 = &data_temp;
|
||||
$2 = &dim1_temp;
|
||||
$3 = &dim2_temp;
|
||||
}
|
||||
%typemap(argout)
|
||||
(float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
|
||||
{
|
||||
Handle<Array> array = Array::New(Isolate::GetCurrent(), *$2);
|
||||
for (unsigned int i = 0, idx = 0; i < *$2; i++) {
|
||||
Handle<ArrayBuffer> buffer =
|
||||
ArrayBuffer::New(Isolate::GetCurrent(), *$1, *$3 * sizeof(float));
|
||||
memcpy(buffer->GetContents().Data(),
|
||||
(*$1) + (idx += *$3), *$3 * sizeof(float));
|
||||
Handle<Float32Array> inner = Float32Array::New(buffer, 0, *$3);
|
||||
array->Set(i, inner);
|
||||
}
|
||||
free(*$1);
|
||||
$result = array;
|
||||
}
|
||||
|
||||
%apply (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2) {(float** aMfcc, unsigned int* aNFrames, unsigned int* aFrameLen)};
|
||||
|
||||
// make sure the string returned by SpeechToText is freed
|
||||
%typemap(newfree) char* "free($1);";
|
||||
%newobject DS_SpeechToText;
|
||||
|
@ -66,6 +66,5 @@ Model.prototype.finishStream = function() {
|
||||
|
||||
module.exports = {
|
||||
Model: Model,
|
||||
audioToInputVector: binding.AudioToInputVector,
|
||||
printVersions: binding.PrintVersions
|
||||
};
|
||||
|
@ -1,7 +1,6 @@
|
||||
import deepspeech
|
||||
|
||||
# rename for backwards compatibility
|
||||
from deepspeech.impl import AudioToInputVector as audioToInputVector
|
||||
from deepspeech.impl import PrintVersions as printVersions
|
||||
|
||||
class Model(object):
|
||||
|
@ -13,9 +13,6 @@ import_array();
|
||||
// apply NumPy conversion typemap to DS_FeedAudioContent and DS_SpeechToText
|
||||
%apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};
|
||||
|
||||
// apply NumPy conversion typemap to DS_AudioToInputVector
|
||||
%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {(float** aMfcc, int* aNFrames, int* aFrameLen)};
|
||||
|
||||
%typemap(in, numinputs=0) ModelState **retval (ModelState *ret) {
|
||||
ret = NULL;
|
||||
$1 = &ret;
|
||||
|
Loading…
x
Reference in New Issue
Block a user