Infer number of MFCC features from input shape
This commit is contained in: parent d6babfb8f3 · commit 4e9e78fefe
@@ -555,6 +555,9 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
     mfccs_len = tf.identity(mfccs_len, name='mfccs_len')

     # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
+    # This shape is read by the native_client in DS_CreateModel to know the
+    # value of n_steps, n_context and n_input. Make sure you update the code
+    # there if this shape is changed.
     input_tensor = tf.placeholder(tf.float32, [batch_size, n_steps if n_steps > 0 else None, 2 * Config.n_context + 1, Config.n_input], name='input_node')
     seq_length = tf.placeholder(tf.int32, [batch_size], name='input_lengths')

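The comment block added here pins down a contract: the native client infers its runtime geometry entirely from the static shape of `input_node`, instead of relying on a hard-coded feature count. A minimal sketch of that recovery, using illustrative dims (the `{1, 16, 19, 26}` values below are assumptions for the example; the real values come from the frozen graph):

```cpp
#include <cassert>
#include <cstdio>

int main() {
  // Hypothetical static shape of input_node: [batch_size, n_steps, 2*n_context+1, n_input]
  const long long dims[4] = {1, 16, 19, 26};

  // The same arithmetic DS_CreateModel applies to the graph's shape attribute
  const long long n_steps = dims[1];                       // 16 timesteps per batch
  const long long n_context = (dims[2] - 1) / 2;           // (19 - 1) / 2 = 9 context frames
  const long long n_features = dims[3];                    // 26 MFCC features
  const long long feats_per_timestep = dims[2] * dims[3];  // 19 * 26 = 494 floats

  assert(n_context == 9 && n_features == 26);
  printf("n_steps=%lld n_context=%lld n_features=%lld feats_per_timestep=%lld\n",
         n_steps, n_context, n_features, feats_per_timestep);
  return 0;
}
```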
@@ -46,8 +46,6 @@ constexpr float AUDIO_WIN_STEP = 0.02f;
 constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
 constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);

-constexpr unsigned int MFCC_FEATURES = 26;
-
 constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;

 std::array<float, WINDOW_SIZE> calc_hamming_window() {
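With `MFCC_FEATURES` gone, only the audio-window constants remain compile-time. Their arithmetic, spelled out as a runnable check: `SAMPLE_RATE` and `AUDIO_WIN_LEN` are not visible in this hunk, so the 16 kHz and 25 ms values below are assumptions, while `AUDIO_WIN_STEP = 0.02f` comes from the context line above.

```cpp
#include <cstdio>

// Assumed values; only AUDIO_WIN_STEP appears in the hunk context above.
constexpr unsigned int SAMPLE_RATE = 16000;  // assumption: 16 kHz input
constexpr float AUDIO_WIN_LEN = 0.025f;      // assumption: 25 ms analysis window
constexpr float AUDIO_WIN_STEP = 0.02f;      // from the diff context

constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);

int main() {
  // 0.025 * 16000 = 400-sample window, 0.02 * 16000 = 320-sample hop
  printf("win=%u samples, step=%u samples\n", AUDIO_WIN_LEN_SAMPLES, AUDIO_WIN_STEP_SAMPLES);
  return 0;
}
```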
@@ -112,7 +110,7 @@ struct StreamingState {

   void processAudioWindow(const vector<float>& buf);
   void processMfccWindow(const vector<float>& buf);
-  void pushMfccBuffer(const float* buf, unsigned int len);
+  void pushMfccBuffer(const vector<float>& buf);
   void addZeroMfccWindow();
   void processBatch(const vector<float>& buf, unsigned int n_steps);
 };
@@ -132,8 +130,9 @@ struct ModelState {
   Scorer* scorer;
   unsigned int beam_width;
   unsigned int n_steps;
-  unsigned int mfcc_feats_per_timestep;
   unsigned int n_context;
+  unsigned int n_features;
+  unsigned int mfcc_feats_per_timestep;

 #ifdef USE_TFLITE
   size_t previous_state_size;
@@ -200,10 +199,10 @@ struct ModelState {
   */
   void infer(const float* mfcc, unsigned int n_frames, vector<float>& logits_output);

-  void compute_mfcc(const vector<float> audio_buffer, vector<float>& mfcc_output);
+  void compute_mfcc(const vector<float>& audio_buffer, vector<float>& mfcc_output);
 };

-StreamingState* setupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
+StreamingState* SetupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
                                                unsigned int aBufferSize, unsigned int aSampleRate);

 ModelState::ModelState()
@@ -221,8 +220,9 @@ ModelState::ModelState()
   , scorer(nullptr)
   , beam_width(0)
   , n_steps(-1)
-  , mfcc_feats_per_timestep(-1)
   , n_context(-1)
+  , n_features(-1)
+  , mfcc_feats_per_timestep(-1)
 #ifdef USE_TFLITE
   , previous_state_size(0)
   , previous_state_c_(nullptr)
@@ -247,6 +247,14 @@ ModelState::~ModelState()
   delete alphabet;
 }

+template<typename T>
+void
+shift_buffer_left(vector<T>& buf, int shift_amount)
+{
+  std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
+  buf.resize(buf.size() - shift_amount);
+}
+
 void
 StreamingState::feedAudioContent(const short* buffer,
                                  unsigned int buffer_size)
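The new `shift_buffer_left` helper factors out a rotate-then-truncate idiom that previously appeared inline in two places. A standalone sketch of its behavior; the helper matches the diff, while the `main()` driver is illustrative only:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

using std::vector;

// Same helper as in the diff: drop the first shift_amount elements,
// keeping the remainder contiguous at the front of the vector.
template<typename T>
void
shift_buffer_left(vector<T>& buf, int shift_amount)
{
  std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
  buf.resize(buf.size() - shift_amount);
}

int main() {
  vector<int> buf{1, 2, 3, 4, 5};
  shift_buffer_left(buf, 2);
  // Prints "3 4 5": the two oldest samples are discarded, order preserved.
  for (int v : buf) printf("%d ", v);
  printf("\n");
  return 0;
}
```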
@@ -265,8 +273,7 @@ StreamingState::feedAudioContent(const short* buffer,
     if (audio_buffer.size() == AUDIO_WIN_LEN_SAMPLES) {
       processAudioWindow(audio_buffer);
       // Shift data by one step
-      std::rotate(audio_buffer.begin(), audio_buffer.begin() + AUDIO_WIN_STEP_SAMPLES, audio_buffer.end());
-      audio_buffer.resize(audio_buffer.size() - AUDIO_WIN_STEP_SAMPLES);
+      shift_buffer_left(audio_buffer, AUDIO_WIN_STEP_SAMPLES);
     }

     // Repeat until buffer empty
@@ -283,7 +290,6 @@ char*
 StreamingState::finishStream()
 {
   finalizeStream();
-
   return model->decode(accumulated_logits);
 }

@@ -291,7 +297,6 @@ Metadata*
 StreamingState::finishStreamWithMetadata()
 {
   finalizeStream();
-
   return model->decode_metadata(accumulated_logits);
 }

@@ -300,10 +305,9 @@ StreamingState::processAudioWindow(const vector<float>& buf)
 {
   // Compute MFCC features
   vector<float> mfcc;
-  mfcc.reserve(MFCC_FEATURES);
+  mfcc.reserve(model->n_features);
   model->compute_mfcc(buf, mfcc);
-
-  pushMfccBuffer(mfcc.data(), MFCC_FEATURES);
+  pushMfccBuffer(mfcc);
 }

 void
@@ -326,25 +330,35 @@ StreamingState::finalizeStream()
 void
 StreamingState::addZeroMfccWindow()
 {
-  static const float zero_buffer[MFCC_FEATURES] = {0.f};
-  pushMfccBuffer(zero_buffer, MFCC_FEATURES);
+  vector<float> zero_buffer(model->n_features, 0.f);
+  pushMfccBuffer(zero_buffer);
 }

+template<typename InputIt, typename OutputIt>
+InputIt
+copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
+{
+  int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
+  std::copy_n(from_begin, next_copy_amount, to_begin);
+  return from_begin + next_copy_amount;
+}
+
 void
-StreamingState::pushMfccBuffer(const float* buf, unsigned int len)
+StreamingState::pushMfccBuffer(const vector<float>& buf)
 {
-  while (len > 0) {
-    unsigned int next_copy_amount = std::min(len, (unsigned int)(model->mfcc_feats_per_timestep - mfcc_buffer.size()));
-    mfcc_buffer.insert(mfcc_buffer.end(), buf, buf + next_copy_amount);
-    buf += next_copy_amount;
-    len -= next_copy_amount;
+  auto start = buf.begin();
+  auto end = buf.end();
+  while (start != end) {
+    // Copy from input buffer to mfcc_buffer, stopping if we have a full context window
+    start = copy_up_to_n(start, end, std::back_inserter(mfcc_buffer),
+                         model->mfcc_feats_per_timestep - mfcc_buffer.size());
+    assert(mfcc_buffer.size() <= model->mfcc_feats_per_timestep);

     // If we have a full context window
     if (mfcc_buffer.size() == model->mfcc_feats_per_timestep) {
       processMfccWindow(mfcc_buffer);
       // Shift data by one step of one mfcc feature vector
-      std::rotate(mfcc_buffer.begin(), mfcc_buffer.begin() + MFCC_FEATURES, mfcc_buffer.end());
-      mfcc_buffer.resize(mfcc_buffer.size() - MFCC_FEATURES);
+      shift_buffer_left(mfcc_buffer, model->n_features);
     }
   }
 }
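`copy_up_to_n` is the piece that lets `pushMfccBuffer` consume an arbitrary-length feature vector against a fixed-capacity window: it copies as much as fits and returns where it stopped, so the caller can loop. A minimal standalone sketch (the helper matches the diff; the driver and its 3-element "timestep" are illustrative):

```cpp
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <vector>

// Same helper as in the diff: copy at most max_elems elements and
// report how far the input was consumed.
template<typename InputIt, typename OutputIt>
InputIt
copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
{
  int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
  std::copy_n(from_begin, next_copy_amount, to_begin);
  return from_begin + next_copy_amount;
}

int main() {
  const std::vector<float> input{0, 1, 2, 3, 4, 5, 6};
  std::vector<float> window;  // pretend capacity: 3 elements per "timestep"
  auto it = input.begin();
  while (it != input.end()) {
    it = copy_up_to_n(it, input.end(), std::back_inserter(window), 3 - (int)window.size());
    if (window.size() == 3) {
      printf("full window: %.0f %.0f %.0f\n", window[0], window[1], window[2]);
      window.clear();  // the real code shifts by one feature vector instead
    }
  }
  // The trailing element {6} stays buffered, waiting for more input.
  return 0;
}
```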
@@ -355,11 +369,12 @@ StreamingState::processMfccWindow(const vector<float>& buf)
   auto start = buf.begin();
   auto end = buf.end();
   while (start != end) {
-    unsigned int next_copy_amount = std::min<unsigned int>(std::distance(start, end), (unsigned int)(model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size()));
-    batch_buffer.insert(batch_buffer.end(), start, start + next_copy_amount);
-    start += next_copy_amount;
+    // Copy from input buffer to batch_buffer, stopping if we have a full batch
+    start = copy_up_to_n(start, end, std::back_inserter(batch_buffer),
+                         model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size());
+    assert(batch_buffer.size() <= model->n_steps * model->mfcc_feats_per_timestep);

     // If we have a full batch
     if (batch_buffer.size() == model->n_steps * model->mfcc_feats_per_timestep) {
       processBatch(batch_buffer, model->n_steps);
       batch_buffer.resize(0);
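Both call sites now share one shape of loop: `copy_up_to_n` fills a fixed-size buffer, and a full buffer triggers the next stage (a timestep for `mfcc_buffer`, an inference for `batch_buffer`). A toy two-stage simulation of that hand-off, with made-up sizes (a context window of 3 feature vectors, 2 floats each, 2 timesteps per batch), just to show the flow:

```cpp
#include <cstdio>
#include <vector>

using std::vector;

int main() {
  // Toy geometry, invented for the demo.
  const size_t n_features = 2;
  const size_t feats_per_timestep = 3 * n_features;  // 3-frame context window = 6 floats
  const size_t n_steps = 2;

  vector<float> mfcc_buffer, batch_buffer;
  int windows = 0, batches = 0;

  // Feed 10 feature vectors, one at a time, like processAudioWindow does.
  for (int frame = 0; frame < 10; ++frame) {
    for (size_t f = 0; f < n_features; ++f)
      mfcc_buffer.push_back((float)frame);

    if (mfcc_buffer.size() == feats_per_timestep) {
      ++windows;  // processMfccWindow: one full context window becomes a timestep
      batch_buffer.insert(batch_buffer.end(), mfcc_buffer.begin(), mfcc_buffer.end());
      // Slide the context window forward by one feature vector
      mfcc_buffer.erase(mfcc_buffer.begin(), mfcc_buffer.begin() + n_features);

      if (batch_buffer.size() == n_steps * feats_per_timestep) {
        ++batches;  // processBatch: run inference, then start a fresh batch
        batch_buffer.clear();
      }
    }
  }
  // 10 frames with a 3-frame window -> 8 overlapping windows -> 4 batches of 2
  printf("windows=%d batches=%d\n", windows, batches);
  return 0;
}
```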
@@ -379,7 +394,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
   const size_t num_classes = alphabet->GetSize() + 1; // +1 for blank

 #ifndef USE_TFLITE
-  Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, MFCC_FEATURES}));
+  Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, n_features}));

   auto input_mapped = input.flat<float>();
   int i;
@@ -446,7 +461,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
 }

 void
-ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
+ModelState::compute_mfcc(const vector<float>& samples, vector<float>& mfcc_output)
 {
 #ifndef USE_TFLITE
   Tensor input(DT_FLOAT, TensorShape({static_cast<long long>(samples.size())}));
@@ -467,7 +482,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
   int n_windows = mfcc_len_mapped(0);

   auto mfcc_mapped = outputs[0].flat<float>();
-  for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+  for (int i = 0; i < n_windows * n_features; ++i) {
     mfcc_output.push_back(mfcc_mapped(i));
   }
 #else
@@ -486,7 +501,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
   int n_windows = *interpreter->typed_tensor<float>(mfccs_len_idx);

   float* outputs = interpreter->typed_tensor<float>(mfccs_idx);
-  for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+  for (int i = 0; i < n_windows * n_features; ++i) {
     mfcc_output.push_back(outputs[i]);
   }
 #endif
@@ -645,6 +660,7 @@ DS_CreateModel(const char* aModelPath,
       const auto& shape = node.attr().at("shape").shape();
       model->n_steps = shape.dim(1).size();
       model->n_context = (shape.dim(2).size()-1)/2;
+      model->n_features = shape.dim(3).size();
       model->mfcc_feats_per_timestep = shape.dim(2).size() * shape.dim(3).size();
     } else if (node.name() == "logits_shape") {
       Tensor logits_shape = Tensor(DT_INT32, TensorShape({3}));
@@ -665,12 +681,10 @@ DS_CreateModel(const char* aModelPath,
     }
   }

-  if (model->n_context == -1) {
-    std::cerr << "Error: Could not infer context window size from model file. "
-              << "Make sure input_node is a 3D tensor with the last dimension "
-              << "of size MFCC_FEATURES * ((2 * context window) + 1). If you "
-              << "changed the number of features in the input, adjust the "
-              << "MFCC_FEATURES constant in " __FILE__
+  if (model->n_context == -1 || model->n_features == -1) {
+    std::cerr << "Error: Could not infer input shape from model file. "
+              << "Make sure input_node is a 4D tensor with shape "
+              << "[batch_size=1, time, window_size, n_features]."
               << std::endl;
     return DS_ERR_INVALID_SHAPE;
   }
@@ -710,6 +724,7 @@ DS_CreateModel(const char* aModelPath,

   model->n_steps = dims_input_node->data[1];
   model->n_context = (dims_input_node->data[2] - 1 ) / 2;
+  model->n_features = dims_input_node->data[3];
   model->mfcc_feats_per_timestep = dims_input_node->data[2] * dims_input_node->data[3];

   TfLiteIntArray* dims_logits = model->interpreter->tensor(model->logits_idx)->dims;
@@ -772,7 +787,7 @@ DS_SpeechToText(ModelState* aCtx,
                 unsigned int aBufferSize,
                 unsigned int aSampleRate)
 {
-  StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
   return DS_FinishStream(ctx);
 }

@@ -782,24 +797,22 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
                             unsigned int aBufferSize,
                             unsigned int aSampleRate)
 {
-  StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+  StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
   return DS_FinishStreamWithMetadata(ctx);
 }

 StreamingState*
-setupStreamAndFeedAudioContent(ModelState* aCtx,
-                               const short* aBuffer,
-                               unsigned int aBufferSize,
-                               unsigned int aSampleRate)
+SetupStreamAndFeedAudioContent(ModelState* aCtx,
+                               const short* aBuffer,
+                               unsigned int aBufferSize,
+                               unsigned int aSampleRate)
 {
   StreamingState* ctx;
   int status = DS_SetupStream(aCtx, 0, aSampleRate, &ctx);
   if (status != DS_ERR_OK) {
     return nullptr;
   }
-
   DS_FeedAudioContent(ctx, aBuffer, aBufferSize);
-
   return ctx;
 }

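Taken together, the one-shot entry points now share the helper's three-step flow: set up a stream, feed the entire buffer, finish. A sketch of that call order; the `DS_*` stub bodies below are invented placeholders so the example compiles standalone, while the real implementations are the ones in this diff:

```cpp
#include <cstdio>

// Invented stubs so the sketch compiles standalone; the real DS_* functions
// are defined in the library this diff modifies.
struct ModelState {};
struct StreamingState {};
enum { DS_ERR_OK = 0 };

int DS_SetupStream(ModelState*, unsigned int, unsigned int, StreamingState** retval) {
  static StreamingState stream;
  *retval = &stream;
  return DS_ERR_OK;
}
void DS_FeedAudioContent(StreamingState*, const short*, unsigned int) {}
char* DS_FinishStream(StreamingState*) { return nullptr; }

// Mirrors the renamed helper: setup, feed everything, hand back the stream.
StreamingState*
SetupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
                               unsigned int aBufferSize, unsigned int aSampleRate)
{
  StreamingState* ctx;
  if (DS_SetupStream(aCtx, 0, aSampleRate, &ctx) != DS_ERR_OK) {
    return nullptr;
  }
  DS_FeedAudioContent(ctx, aBuffer, aBufferSize);
  return ctx;
}

int main() {
  ModelState model;
  short samples[160] = {0};
  StreamingState* ctx = SetupStreamAndFeedAudioContent(&model, samples, 160, 16000);
  // DS_SpeechToText is exactly this plus decoding the accumulated logits
  char* text = DS_FinishStream(ctx);
  printf("decoded: %s\n", text ? text : "(stub)");
  return 0;
}
```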
@@ -836,7 +849,7 @@ DS_SetupStream(ModelState* aCtx,

   ctx->audio_buffer.reserve(AUDIO_WIN_LEN_SAMPLES);
   ctx->mfcc_buffer.reserve(aCtx->mfcc_feats_per_timestep);
-  ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
+  ctx->mfcc_buffer.resize(aCtx->n_features*aCtx->n_context, 0.f);
   ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);

   ctx->model = aCtx;