From 1df9602c954ba83e88d61cb8aeb07d8ed1169305 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Mon, 10 Dec 2018 11:08:48 -0200
Subject: [PATCH] Use longer MFCC step instead of throwing away features.

---
 native_client/deepspeech.cc | 47 ++++++++++++++++++++-----------------
 util/audio.py               |  5 +---
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
index 2ca61b89..6f627a83 100644
--- a/native_client/deepspeech.cc
+++ b/native_client/deepspeech.cc
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <array>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -20,23 +21,35 @@
 #include "ctcdecode/ctc_beam_search_decoder.h"
 
 //TODO: infer batch size from model/use dynamic batch size
-const unsigned int BATCH_SIZE = 1;
+constexpr unsigned int BATCH_SIZE = 1;
 
 //TODO: use dynamic sample rate
-const unsigned int SAMPLE_RATE = 16000;
+constexpr unsigned int SAMPLE_RATE = 16000;
 
-const float AUDIO_WIN_LEN = 0.025f;
-const float AUDIO_WIN_STEP = 0.01f;
-const unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
-const unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
+constexpr float AUDIO_WIN_LEN = 0.032f;
+constexpr float AUDIO_WIN_STEP = 0.02f;
+constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
+constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
 
-const unsigned int MFCC_FEATURES = 26;
+constexpr unsigned int MFCC_FEATURES = 26;
 
-const float PREEMPHASIS_COEFF = 0.97f;
-const unsigned int N_FFT = 512;
-const unsigned int N_FILTERS = 26;
-const unsigned int LOWFREQ = 0;
-const unsigned int CEP_LIFTER = 22;
+constexpr float PREEMPHASIS_COEFF = 0.97f;
+constexpr unsigned int N_FFT = 512;
+constexpr unsigned int N_FILTERS = 26;
+constexpr unsigned int LOWFREQ = 0;
+constexpr unsigned int CEP_LIFTER = 22;
+
+constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;
+
+std::array<float, WINDOW_SIZE> calc_hamming_window() {
+  std::array<float, WINDOW_SIZE> a{0};
+  for (int i = 0; i < WINDOW_SIZE; ++i) {
+    a[i] = 0.54 - 0.46 * std::cos(2*M_PI*i/(WINDOW_SIZE-1));
+  }
+  return a;
+}
+
+std::array<float, WINDOW_SIZE> hamming_window = calc_hamming_window();
 
 using namespace tensorflow;
 
@@ -77,7 +90,6 @@ struct StreamingState {
   float last_sample; // used for preemphasis
   vector<float> mfcc_buffer;
   vector<float> batch_buffer;
-  bool skip_next_mfcc;
   ModelState* model;
 
   void feedAudioContent(const short* buffer, unsigned int buffer_size);
@@ -214,16 +226,11 @@ StreamingState::finishStream()
 void
 StreamingState::processAudioWindow(const vector<float>& buf)
 {
-  skip_next_mfcc = !skip_next_mfcc;
-  if (!skip_next_mfcc) { // Was true
-    return;
-  }
-
   // Compute MFCC features
   float* mfcc;
   int n_frames = csf_mfcc(buf.data(), buf.size(), SAMPLE_RATE, AUDIO_WIN_LEN,
                           AUDIO_WIN_STEP, MFCC_FEATURES, N_FILTERS, N_FFT,
-                          LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, nullptr,
+                          LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, hamming_window.data(),
                           &mfcc);
 
   assert(n_frames == 1);
@@ -518,8 +525,6 @@ DS_SetupStream(ModelState* aCtx,
   ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
   ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);
 
-  ctx->skip_next_mfcc = false;
-
   ctx->model = aCtx;
 
   *retval = ctx.release();
diff --git a/util/audio.py b/util/audio.py
index 2b68e108..ac9dde63 100644
--- a/util/audio.py
+++ b/util/audio.py
@@ -15,10 +15,7 @@ def audiofile_to_input_vector(audio_filename, numcep, numcontext):
     fs, audio = wav.read(audio_filename)
 
     # Get mfcc coefficients
-    features = mfcc(audio, samplerate=fs, numcep=numcep)
-
-    # We only keep every second feature (BiRNN stride = 2)
-    features = features[::2]
+    features = mfcc(audio, samplerate=fs, numcep=numcep, winlen=0.032, winstep=0.02, winfunc=np.hamming)
 
     # Add empty initial and final contexts
     empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
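
Note: the old pipeline computed MFCCs every 10 ms and then discarded every
second frame to match the model's stride of 2; the new one computes them
directly every 20 ms over a 32 ms Hamming window, so the feature rate is
unchanged but no features are computed only to be thrown away. A quick
sanity-check sketch of that equivalence (not part of the patch; it assumes
the mfcc here is python_speech_features', which the winlen/winstep/winfunc
keyword arguments suggest, and 16 kHz input):

    import numpy as np
    from python_speech_features import mfcc

    fs = 16000
    audio = np.random.randn(fs).astype(np.float32)  # one second of audio

    # Old pipeline: 25 ms window, 10 ms step, keep every second frame.
    old = mfcc(audio, samplerate=fs, numcep=26)[::2]

    # New pipeline: 32 ms window, 20 ms step, explicit Hamming window.
    new = mfcc(audio, samplerate=fs, numcep=26, winlen=0.032, winstep=0.02,
               winfunc=np.hamming)

    # Both yield ~50 feature frames per second of audio.
    print(old.shape, new.shape)

    # np.hamming(N) evaluates 0.54 - 0.46*cos(2*pi*i/(N-1)) for i in
    # 0..N-1, the same curve calc_hamming_window() precomputes in C++.
    N = int(0.032 * fs)  # 512 samples, matching WINDOW_SIZE (and N_FFT)
    manual = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(N) / (N - 1))
    assert np.allclose(manual, np.hamming(N))

Because the 20 ms step produces frames at the same rate the stride-2
subsampling used to, the network's input shape is unchanged; only the
window length (25 ms to 32 ms) and window function differ.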