Infer number of MFCC features from input shape

Reuben Morais 2019-04-01 16:52:38 -03:00
parent d6babfb8f3
commit 4e9e78fefe
2 changed files with 62 additions and 46 deletions


@@ -555,6 +555,9 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
 mfccs_len = tf.identity(mfccs_len, name='mfccs_len')
 # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
+# This shape is read by the native_client in DS_CreateModel to know the
+# value of n_steps, n_context and n_input. Make sure you update the code
+# there if this shape is changed.
 input_tensor = tf.placeholder(tf.float32, [batch_size, n_steps if n_steps > 0 else None, 2 * Config.n_context + 1, Config.n_input], name='input_node')
 seq_length = tf.placeholder(tf.int32, [batch_size], name='input_lengths')
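
The comment above is the contract DS_CreateModel relies on: everything the native client needs is recoverable from the placeholder's static shape. Below is a minimal sketch of that decoding, assuming the default export shape [1, 16, 19, 26] (n_steps=16 is this function's default; n_context=9 and n_input=26 are assumptions here, since any given configuration can override them):

    #include <cstdio>

    int main() {
      // Hypothetical shape [batch_size, n_steps, 2*n_context+1, n_input];
      // the values [1, 16, 19, 26] are assumed, not read from a real model.
      int dims[4] = {1, 16, 19, 26};
      int n_steps    = dims[1];            // 16
      int n_context  = (dims[2] - 1) / 2;  // (19 - 1) / 2 = 9
      int n_features = dims[3];            // 26
      std::printf("n_steps=%d n_context=%d n_features=%d\n",
                  n_steps, n_context, n_features);
      return 0;
    }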


@@ -46,8 +46,6 @@ constexpr float AUDIO_WIN_STEP = 0.02f;
 constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
 constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
-constexpr unsigned int MFCC_FEATURES = 26;
 constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;
 std::array<float, WINDOW_SIZE> calc_hamming_window() {
@@ -112,7 +110,7 @@ struct StreamingState {
 void processAudioWindow(const vector<float>& buf);
 void processMfccWindow(const vector<float>& buf);
-void pushMfccBuffer(const float* buf, unsigned int len);
+void pushMfccBuffer(const vector<float>& buf);
 void addZeroMfccWindow();
 void processBatch(const vector<float>& buf, unsigned int n_steps);
 };
@@ -132,8 +130,9 @@ struct ModelState {
 Scorer* scorer;
 unsigned int beam_width;
 unsigned int n_steps;
-unsigned int mfcc_feats_per_timestep;
 unsigned int n_context;
+unsigned int n_features;
+unsigned int mfcc_feats_per_timestep;
 #ifdef USE_TFLITE
 size_t previous_state_size;
@@ -200,10 +199,10 @@ struct ModelState {
 */
 void infer(const float* mfcc, unsigned int n_frames, vector<float>& logits_output);
-void compute_mfcc(const vector<float> audio_buffer, vector<float>& mfcc_output);
+void compute_mfcc(const vector<float>& audio_buffer, vector<float>& mfcc_output);
 };

-StreamingState* setupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
+StreamingState* SetupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
 unsigned int aBufferSize, unsigned int aSampleRate);

 ModelState::ModelState()
@@ -221,8 +220,9 @@ ModelState::ModelState()
 , scorer(nullptr)
 , beam_width(0)
 , n_steps(-1)
-, mfcc_feats_per_timestep(-1)
 , n_context(-1)
+, n_features(-1)
+, mfcc_feats_per_timestep(-1)
 #ifdef USE_TFLITE
 , previous_state_size(0)
 , previous_state_c_(nullptr)
@@ -247,6 +247,14 @@ ModelState::~ModelState()
 delete alphabet;
 }

+template<typename T>
+void
+shift_buffer_left(vector<T>& buf, int shift_amount)
+{
+std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
+buf.resize(buf.size() - shift_amount);
+}
+
 void
 StreamingState::feedAudioContent(const short* buffer,
 unsigned int buffer_size)
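
The new shift_buffer_left helper factors out the rotate-then-truncate idiom that previously appeared inline in feedAudioContent and is reused below for the MFCC buffer. A standalone check of its behavior; the helper body is copied from the hunk above and the sample data is invented:

    #include <algorithm>
    #include <cassert>
    #include <vector>
    using std::vector;

    // Same definition as the helper added above, repeated so this compiles alone.
    template<typename T>
    void shift_buffer_left(vector<T>& buf, int shift_amount) {
      // Rotate the oldest shift_amount elements to the back, then drop them,
      // leaving the remaining elements at the front in their original order.
      std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
      buf.resize(buf.size() - shift_amount);
    }

    int main() {
      vector<int> buf = {1, 2, 3, 4, 5};
      shift_buffer_left(buf, 2);               // discard the two oldest elements
      assert((buf == vector<int>{3, 4, 5}));   // remaining data keeps its order
      return 0;
    }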
@@ -265,8 +273,7 @@ StreamingState::feedAudioContent(const short* buffer,
 if (audio_buffer.size() == AUDIO_WIN_LEN_SAMPLES) {
 processAudioWindow(audio_buffer);
 // Shift data by one step
-std::rotate(audio_buffer.begin(), audio_buffer.begin() + AUDIO_WIN_STEP_SAMPLES, audio_buffer.end());
-audio_buffer.resize(audio_buffer.size() - AUDIO_WIN_STEP_SAMPLES);
+shift_buffer_left(audio_buffer, AUDIO_WIN_STEP_SAMPLES);
 }
 // Repeat until buffer empty
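
In sample terms, the loop above processes overlapping audio windows. A small arithmetic sketch; only AUDIO_WIN_STEP = 0.02f is visible in this diff, so the 16 kHz sample rate and 32 ms window length are assumptions:

    #include <cstdio>

    int main() {
      // Assumed constants; only AUDIO_WIN_STEP appears in the hunk above.
      const unsigned int SAMPLE_RATE = 16000;  // assumption: 16 kHz input
      const float AUDIO_WIN_LEN  = 0.032f;     // assumption: 32 ms window
      const float AUDIO_WIN_STEP = 0.02f;      // from the diff
      // 512-sample windows advanced 320 samples at a time: 192 samples overlap.
      std::printf("win=%u step=%u\n",
                  (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE),    // 512
                  (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE));  // 320
      return 0;
    }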
@@ -283,7 +290,6 @@ char*
 StreamingState::finishStream()
 {
 finalizeStream();
-
 return model->decode(accumulated_logits);
 }
@@ -291,7 +297,6 @@ Metadata*
 StreamingState::finishStreamWithMetadata()
 {
 finalizeStream();
-
 return model->decode_metadata(accumulated_logits);
 }
@@ -300,10 +305,9 @@ StreamingState::processAudioWindow(const vector<float>& buf)
 {
 // Compute MFCC features
 vector<float> mfcc;
-mfcc.reserve(MFCC_FEATURES);
+mfcc.reserve(model->n_features);
 model->compute_mfcc(buf, mfcc);
-pushMfccBuffer(mfcc.data(), MFCC_FEATURES);
+pushMfccBuffer(mfcc);
 }

 void
@@ -326,25 +330,35 @@ StreamingState::finalizeStream()
 void
 StreamingState::addZeroMfccWindow()
 {
-static const float zero_buffer[MFCC_FEATURES] = {0.f};
-pushMfccBuffer(zero_buffer, MFCC_FEATURES);
+vector<float> zero_buffer(model->n_features, 0.f);
+pushMfccBuffer(zero_buffer);
 }

+template<typename InputIt, typename OutputIt>
+InputIt
+copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
+{
+int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
+std::copy_n(from_begin, next_copy_amount, to_begin);
+return from_begin + next_copy_amount;
+}
+
 void
-StreamingState::pushMfccBuffer(const float* buf, unsigned int len)
+StreamingState::pushMfccBuffer(const vector<float>& buf)
 {
-while (len > 0) {
-unsigned int next_copy_amount = std::min(len, (unsigned int)(model->mfcc_feats_per_timestep - mfcc_buffer.size()));
-mfcc_buffer.insert(mfcc_buffer.end(), buf, buf + next_copy_amount);
-buf += next_copy_amount;
-len -= next_copy_amount;
+auto start = buf.begin();
+auto end = buf.end();
+while (start != end) {
+// Copy from input buffer to mfcc_buffer, stopping if we have a full context window
+start = copy_up_to_n(start, end, std::back_inserter(mfcc_buffer),
+model->mfcc_feats_per_timestep - mfcc_buffer.size());
 assert(mfcc_buffer.size() <= model->mfcc_feats_per_timestep);
+// If we have a full context window
 if (mfcc_buffer.size() == model->mfcc_feats_per_timestep) {
 processMfccWindow(mfcc_buffer);
 // Shift data by one step of one mfcc feature vector
-std::rotate(mfcc_buffer.begin(), mfcc_buffer.begin() + MFCC_FEATURES, mfcc_buffer.end());
-mfcc_buffer.resize(mfcc_buffer.size() - MFCC_FEATURES);
+shift_buffer_left(mfcc_buffer, model->n_features);
 }
 }
 }
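
copy_up_to_n is a bounded std::copy_n: it copies at most max_elems elements and reports how far it got, which is what lets both push loops stop exactly at a window or batch boundary. A standalone demonstration; the definition is copied from the hunk above and the sample data is invented:

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <vector>
    using std::vector;

    // Same definition as above, repeated so the example stands alone.
    template<typename InputIt, typename OutputIt>
    InputIt
    copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
    {
      int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
      std::copy_n(from_begin, next_copy_amount, to_begin);
      return from_begin + next_copy_amount;
    }

    int main() {
      vector<int> src = {1, 2, 3, 4, 5};
      vector<int> dst;
      // Room for only 3 more elements, so only 3 of the 5 inputs are consumed.
      auto it = copy_up_to_n(src.begin(), src.end(), std::back_inserter(dst), 3);
      assert(dst.size() == 3 && it == src.begin() + 3);
      return 0;
    }

In pushMfccBuffer the bound is mfcc_feats_per_timestep minus the current fill level; once the window fills, shift_buffer_left drops exactly one feature vector (n_features floats), so the context window slides forward one timestep per processed window.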
@@ -355,11 +369,12 @@ StreamingState::processMfccWindow(const vector<float>& buf)
 auto start = buf.begin();
 auto end = buf.end();
 while (start != end) {
-unsigned int next_copy_amount = std::min<unsigned int>(std::distance(start, end), (unsigned int)(model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size()));
-batch_buffer.insert(batch_buffer.end(), start, start + next_copy_amount);
-start += next_copy_amount;
+// Copy from input buffer to batch_buffer, stopping if we have a full batch
+start = copy_up_to_n(start, end, std::back_inserter(batch_buffer),
+model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size());
 assert(batch_buffer.size() <= model->n_steps * model->mfcc_feats_per_timestep);
+// If we have a full batch
 if (batch_buffer.size() == model->n_steps * model->mfcc_feats_per_timestep) {
 processBatch(batch_buffer, model->n_steps);
 batch_buffer.resize(0);
@@ -379,7 +394,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
 const size_t num_classes = alphabet->GetSize() + 1; // +1 for blank

 #ifndef USE_TFLITE
-Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, MFCC_FEATURES}));
+Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, n_features}));
 auto input_mapped = input.flat<float>();
 int i;
@@ -446,7 +461,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
 }

 void
-ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
+ModelState::compute_mfcc(const vector<float>& samples, vector<float>& mfcc_output)
 {
 #ifndef USE_TFLITE
 Tensor input(DT_FLOAT, TensorShape({static_cast<long long>(samples.size())}));
@@ -467,7 +482,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
 int n_windows = mfcc_len_mapped(0);
 auto mfcc_mapped = outputs[0].flat<float>();
-for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+for (int i = 0; i < n_windows * n_features; ++i) {
 mfcc_output.push_back(mfcc_mapped(i));
 }
 #else
@@ -486,7 +501,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
 int n_windows = *interpreter->typed_tensor<float>(mfccs_len_idx);
 float* outputs = interpreter->typed_tensor<float>(mfccs_idx);
-for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+for (int i = 0; i < n_windows * n_features; ++i) {
 mfcc_output.push_back(outputs[i]);
 }
 #endif
@@ -645,6 +660,7 @@ DS_CreateModel(const char* aModelPath,
 const auto& shape = node.attr().at("shape").shape();
 model->n_steps = shape.dim(1).size();
 model->n_context = (shape.dim(2).size()-1)/2;
+model->n_features = shape.dim(3).size();
 model->mfcc_feats_per_timestep = shape.dim(2).size() * shape.dim(3).size();
 } else if (node.name() == "logits_shape") {
 Tensor logits_shape = Tensor(DT_INT32, TensorShape({3}));
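
mfcc_feats_per_timestep, set just above as dim(2) * dim(3), is the unit every streaming buffer is sized in: one timestep spans the whole context window. Worked numbers under the same assumed default shape as in the earlier sketch:

    #include <cstdio>

    int main() {
      // Assumed default export shape [1, 16, 19, 26]; a real model file may
      // declare anything, which is why DS_CreateModel reads it at load time.
      const int n_steps = 16, window_size = 19, n_features = 26;
      const int mfcc_feats_per_timestep = window_size * n_features;  // 19 * 26 = 494
      std::printf("per-timestep=%d per-batch=%d floats\n",
                  mfcc_feats_per_timestep,
                  n_steps * mfcc_feats_per_timestep);                // 494, 7904
      return 0;
    }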
@@ -665,12 +681,10 @@ DS_CreateModel(const char* aModelPath,
 }
 }

-if (model->n_context == -1) {
-std::cerr << "Error: Could not infer context window size from model file. "
-<< "Make sure input_node is a 3D tensor with the last dimension "
-<< "of size MFCC_FEATURES * ((2 * context window) + 1). If you "
-<< "changed the number of features in the input, adjust the "
-<< "MFCC_FEATURES constant in " __FILE__
+if (model->n_context == -1 || model->n_features == -1) {
+std::cerr << "Error: Could not infer input shape from model file. "
+<< "Make sure input_node is a 4D tensor with shape "
+<< "[batch_size=1, time, window_size, n_features]."
 << std::endl;
 return DS_ERR_INVALID_SHAPE;
 }
@@ -710,6 +724,7 @@ DS_CreateModel(const char* aModelPath,
 model->n_steps = dims_input_node->data[1];
 model->n_context = (dims_input_node->data[2] - 1 ) / 2;
+model->n_features = dims_input_node->data[3];
 model->mfcc_feats_per_timestep = dims_input_node->data[2] * dims_input_node->data[3];

 TfLiteIntArray* dims_logits = model->interpreter->tensor(model->logits_idx)->dims;
@@ -772,7 +787,7 @@ DS_SpeechToText(ModelState* aCtx,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
 {
-StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
 return DS_FinishStream(ctx);
 }
@@ -782,12 +797,12 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
 {
-StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
 return DS_FinishStreamWithMetadata(ctx);
 }

 StreamingState*
-setupStreamAndFeedAudioContent(ModelState* aCtx,
+SetupStreamAndFeedAudioContent(ModelState* aCtx,
 const short* aBuffer,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
@@ -797,9 +812,7 @@ setupStreamAndFeedAudioContent(ModelState* aCtx,
 if (status != DS_ERR_OK) {
 return nullptr;
 }
-
 DS_FeedAudioContent(ctx, aBuffer, aBufferSize);
-
 return ctx;
 }
@@ -836,7 +849,7 @@ DS_SetupStream(ModelState* aCtx,
 ctx->audio_buffer.reserve(AUDIO_WIN_LEN_SAMPLES);
 ctx->mfcc_buffer.reserve(aCtx->mfcc_feats_per_timestep);
-ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
+ctx->mfcc_buffer.resize(aCtx->n_features*aCtx->n_context, 0.f);
 ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);

 ctx->model = aCtx;
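
Note the resize above: mfcc_buffer starts pre-filled with n_features * n_context zeros, so the first real feature vector sits in a left-zero-padded context window, with addZeroMfccWindow supplying the matching padding at the end of the stream. Under the same assumed defaults:

    #include <cstdio>

    int main() {
      // Assumed defaults as in the earlier sketches: n_context=9, n_features=26.
      const int n_context = 9, n_features = 26;
      const int padding = n_features * n_context;            // 234 zeros pre-filled
      const int window  = (2 * n_context + 1) * n_features;  // 494 floats per window
      // 260 / 26 = 10 real feature vectors complete the first context window.
      std::printf("padding=%d window=%d remaining=%d\n",
                  padding, window, window - padding);        // 234, 494, 260
      return 0;
    }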