Merge pull request #1773 from mozilla/mfcc-striding

Use longer MFCC step instead of throwing away features (Fixes #1744)
This commit is contained in:
Reuben Morais 2018-12-10 13:14:39 -02:00 committed by GitHub
commit 2a8128b8fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 25 deletions

View File

@ -1,4 +1,5 @@
#include <algorithm> #include <algorithm>
#include <cmath>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <string> #include <string>
@ -20,23 +21,35 @@
#include "ctcdecode/ctc_beam_search_decoder.h" #include "ctcdecode/ctc_beam_search_decoder.h"
//TODO: infer batch size from model/use dynamic batch size //TODO: infer batch size from model/use dynamic batch size
const unsigned int BATCH_SIZE = 1; constexpr unsigned int BATCH_SIZE = 1;
//TODO: use dynamic sample rate //TODO: use dynamic sample rate
const unsigned int SAMPLE_RATE = 16000; constexpr unsigned int SAMPLE_RATE = 16000;
const float AUDIO_WIN_LEN = 0.025f; constexpr float AUDIO_WIN_LEN = 0.032f;
const float AUDIO_WIN_STEP = 0.01f; constexpr float AUDIO_WIN_STEP = 0.02f;
const unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE); constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
const unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE); constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
const unsigned int MFCC_FEATURES = 26; constexpr unsigned int MFCC_FEATURES = 26;
const float PREEMPHASIS_COEFF = 0.97f; constexpr float PREEMPHASIS_COEFF = 0.97f;
const unsigned int N_FFT = 512; constexpr unsigned int N_FFT = 512;
const unsigned int N_FILTERS = 26; constexpr unsigned int N_FILTERS = 26;
const unsigned int LOWFREQ = 0; constexpr unsigned int LOWFREQ = 0;
const unsigned int CEP_LIFTER = 22; constexpr unsigned int CEP_LIFTER = 22;
constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;
std::array<float, WINDOW_SIZE> calc_hamming_window() {
std::array<float, WINDOW_SIZE> a{0};
for (int i = 0; i < WINDOW_SIZE; ++i) {
a[i] = 0.54 - 0.46 * std::cos(2*M_PI*i/(WINDOW_SIZE-1));
}
return a;
}
std::array<float, WINDOW_SIZE> hamming_window = calc_hamming_window();
using namespace tensorflow; using namespace tensorflow;
@ -77,7 +90,6 @@ struct StreamingState {
float last_sample; // used for preemphasis float last_sample; // used for preemphasis
vector<float> mfcc_buffer; vector<float> mfcc_buffer;
vector<float> batch_buffer; vector<float> batch_buffer;
bool skip_next_mfcc;
ModelState* model; ModelState* model;
void feedAudioContent(const short* buffer, unsigned int buffer_size); void feedAudioContent(const short* buffer, unsigned int buffer_size);
@ -214,16 +226,11 @@ StreamingState::finishStream()
void void
StreamingState::processAudioWindow(const vector<float>& buf) StreamingState::processAudioWindow(const vector<float>& buf)
{ {
skip_next_mfcc = !skip_next_mfcc;
if (!skip_next_mfcc) { // Was true
return;
}
// Compute MFCC features // Compute MFCC features
float* mfcc; float* mfcc;
int n_frames = csf_mfcc(buf.data(), buf.size(), SAMPLE_RATE, int n_frames = csf_mfcc(buf.data(), buf.size(), SAMPLE_RATE,
AUDIO_WIN_LEN, AUDIO_WIN_STEP, MFCC_FEATURES, N_FILTERS, N_FFT, AUDIO_WIN_LEN, AUDIO_WIN_STEP, MFCC_FEATURES, N_FILTERS, N_FFT,
LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, nullptr, LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, hamming_window.data(),
&mfcc); &mfcc);
assert(n_frames == 1); assert(n_frames == 1);
@ -518,8 +525,6 @@ DS_SetupStream(ModelState* aCtx,
ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f); ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep); ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);
ctx->skip_next_mfcc = false;
ctx->model = aCtx; ctx->model = aCtx;
*retval = ctx.release(); *retval = ctx.release();

View File

@ -15,10 +15,7 @@ def audiofile_to_input_vector(audio_filename, numcep, numcontext):
fs, audio = wav.read(audio_filename) fs, audio = wav.read(audio_filename)
# Get mfcc coefficients # Get mfcc coefficients
features = mfcc(audio, samplerate=fs, numcep=numcep) features = mfcc(audio, samplerate=fs, numcep=numcep, winlen=0.032, winstep=0.02, winfunc=np.hamming)
# We only keep every second feature (BiRNN stride = 2)
features = features[::2]
# Add empty initial and final contexts # Add empty initial and final contexts
empty_context = np.zeros((numcontext, numcep), dtype=features.dtype) empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)