Infer number of MFCC features from input shape

Reuben Morais 2019-04-01 16:52:38 -03:00
parent d6babfb8f3
commit 4e9e78fefe
2 changed files with 62 additions and 46 deletions


@@ -555,6 +555,9 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
 mfccs_len = tf.identity(mfccs_len, name='mfccs_len')
 # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
+# This shape is read by the native_client in DS_CreateModel to know the
+# value of n_steps, n_context and n_input. Make sure you update the code
+# there if this shape is changed.
 input_tensor = tf.placeholder(tf.float32, [batch_size, n_steps if n_steps > 0 else None, 2 * Config.n_context + 1, Config.n_input], name='input_node')
 seq_length = tf.placeholder(tf.int32, [batch_size], name='input_lengths')
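
The comment above is the contract DS_CreateModel relies on: everything the native client needs is recoverable from the placeholder's static shape. Below is a minimal sketch of that decoding, assuming the default export shape [1, 16, 19, 26] (n_steps=16 is this function's default; n_context=9 and n_input=26 are assumptions here, since any given configuration can override them):

    #include <cstdio>

    int main() {
      // Hypothetical shape [batch_size, n_steps, 2*n_context+1, n_input];
      // the values [1, 16, 19, 26] are assumed, not read from a real model.
      int dims[4] = {1, 16, 19, 26};
      int n_steps    = dims[1];            // 16
      int n_context  = (dims[2] - 1) / 2;  // (19 - 1) / 2 = 9
      int n_features = dims[3];            // 26
      std::printf("n_steps=%d n_context=%d n_features=%d\n",
                  n_steps, n_context, n_features);
      return 0;
    }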


@@ -46,8 +46,6 @@ constexpr float AUDIO_WIN_STEP = 0.02f;
 constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
 constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
-constexpr unsigned int MFCC_FEATURES = 26;
 constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;
 std::array<float, WINDOW_SIZE> calc_hamming_window() {
@@ -112,7 +110,7 @@ struct StreamingState {
 void processAudioWindow(const vector<float>& buf);
 void processMfccWindow(const vector<float>& buf);
-void pushMfccBuffer(const float* buf, unsigned int len);
+void pushMfccBuffer(const vector<float>& buf);
 void addZeroMfccWindow();
 void processBatch(const vector<float>& buf, unsigned int n_steps);
 };
@@ -132,8 +130,9 @@ struct ModelState {
 Scorer* scorer;
 unsigned int beam_width;
 unsigned int n_steps;
-unsigned int mfcc_feats_per_timestep;
 unsigned int n_context;
+unsigned int n_features;
+unsigned int mfcc_feats_per_timestep;
 #ifdef USE_TFLITE
 size_t previous_state_size;
@@ -200,10 +199,10 @@ struct ModelState {
 */
 void infer(const float* mfcc, unsigned int n_frames, vector<float>& logits_output);
-void compute_mfcc(const vector<float> audio_buffer, vector<float>& mfcc_output);
+void compute_mfcc(const vector<float>& audio_buffer, vector<float>& mfcc_output);
 };

-StreamingState* setupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
+StreamingState* SetupStreamAndFeedAudioContent(ModelState* aCtx, const short* aBuffer,
 unsigned int aBufferSize, unsigned int aSampleRate);

 ModelState::ModelState()
@@ -221,8 +220,9 @@ ModelState::ModelState()
 , scorer(nullptr)
 , beam_width(0)
 , n_steps(-1)
-, mfcc_feats_per_timestep(-1)
 , n_context(-1)
+, n_features(-1)
+, mfcc_feats_per_timestep(-1)
 #ifdef USE_TFLITE
 , previous_state_size(0)
 , previous_state_c_(nullptr)
@@ -247,6 +247,14 @@ ModelState::~ModelState()
 delete alphabet;
 }

+template<typename T>
+void
+shift_buffer_left(vector<T>& buf, int shift_amount)
+{
+std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
+buf.resize(buf.size() - shift_amount);
+}
+
 void
 StreamingState::feedAudioContent(const short* buffer,
 unsigned int buffer_size)
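
The new shift_buffer_left helper factors out the rotate-then-truncate idiom that previously appeared inline in feedAudioContent and is reused below for the MFCC buffer. A standalone check of its behavior; the helper body is copied from the hunk above and the sample data is invented:

    #include <algorithm>
    #include <cassert>
    #include <vector>
    using std::vector;

    // Same definition as the helper added above, repeated so this compiles alone.
    template<typename T>
    void shift_buffer_left(vector<T>& buf, int shift_amount) {
      // Rotate the oldest shift_amount elements to the back, then drop them,
      // leaving the remaining elements at the front in their original order.
      std::rotate(buf.begin(), buf.begin() + shift_amount, buf.end());
      buf.resize(buf.size() - shift_amount);
    }

    int main() {
      vector<int> buf = {1, 2, 3, 4, 5};
      shift_buffer_left(buf, 2);               // discard the two oldest elements
      assert((buf == vector<int>{3, 4, 5}));   // remaining data keeps its order
      return 0;
    }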
@@ -265,8 +273,7 @@ StreamingState::feedAudioContent(const short* buffer,
 if (audio_buffer.size() == AUDIO_WIN_LEN_SAMPLES) {
 processAudioWindow(audio_buffer);
 // Shift data by one step
-std::rotate(audio_buffer.begin(), audio_buffer.begin() + AUDIO_WIN_STEP_SAMPLES, audio_buffer.end());
-audio_buffer.resize(audio_buffer.size() - AUDIO_WIN_STEP_SAMPLES);
+shift_buffer_left(audio_buffer, AUDIO_WIN_STEP_SAMPLES);
 }
 // Repeat until buffer empty
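
In sample terms, the loop above processes overlapping audio windows. A small arithmetic sketch; only AUDIO_WIN_STEP = 0.02f is visible in this diff, so the 16 kHz sample rate and 32 ms window length are assumptions:

    #include <cstdio>

    int main() {
      // Assumed constants; only AUDIO_WIN_STEP appears in the hunk above.
      const unsigned int SAMPLE_RATE = 16000;  // assumption: 16 kHz input
      const float AUDIO_WIN_LEN  = 0.032f;     // assumption: 32 ms window
      const float AUDIO_WIN_STEP = 0.02f;      // from the diff
      // 512-sample windows advanced 320 samples at a time: 192 samples overlap.
      std::printf("win=%u step=%u\n",
                  (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE),    // 512
                  (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE));  // 320
      return 0;
    }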
@@ -283,7 +290,6 @@ char*
 StreamingState::finishStream()
 {
 finalizeStream();
-
 return model->decode(accumulated_logits);
 }
@@ -291,7 +297,6 @@ Metadata*
 StreamingState::finishStreamWithMetadata()
 {
 finalizeStream();
-
 return model->decode_metadata(accumulated_logits);
 }
@@ -300,10 +305,9 @@ StreamingState::processAudioWindow(const vector<float>& buf)
 {
 // Compute MFCC features
 vector<float> mfcc;
-mfcc.reserve(MFCC_FEATURES);
+mfcc.reserve(model->n_features);
 model->compute_mfcc(buf, mfcc);
-pushMfccBuffer(mfcc.data(), MFCC_FEATURES);
+pushMfccBuffer(mfcc);
 }

 void
@@ -326,25 +330,35 @@ StreamingState::finalizeStream()
 void
 StreamingState::addZeroMfccWindow()
 {
-static const float zero_buffer[MFCC_FEATURES] = {0.f};
-pushMfccBuffer(zero_buffer, MFCC_FEATURES);
+vector<float> zero_buffer(model->n_features, 0.f);
+pushMfccBuffer(zero_buffer);
 }

+template<typename InputIt, typename OutputIt>
+InputIt
+copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
+{
+int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
+std::copy_n(from_begin, next_copy_amount, to_begin);
+return from_begin + next_copy_amount;
+}
+
 void
-StreamingState::pushMfccBuffer(const float* buf, unsigned int len)
+StreamingState::pushMfccBuffer(const vector<float>& buf)
 {
-while (len > 0) {
-unsigned int next_copy_amount = std::min(len, (unsigned int)(model->mfcc_feats_per_timestep - mfcc_buffer.size()));
-mfcc_buffer.insert(mfcc_buffer.end(), buf, buf + next_copy_amount);
-buf += next_copy_amount;
-len -= next_copy_amount;
+auto start = buf.begin();
+auto end = buf.end();
+while (start != end) {
+// Copy from input buffer to mfcc_buffer, stopping if we have a full context window
+start = copy_up_to_n(start, end, std::back_inserter(mfcc_buffer),
+model->mfcc_feats_per_timestep - mfcc_buffer.size());
 assert(mfcc_buffer.size() <= model->mfcc_feats_per_timestep);
+// If we have a full context window
 if (mfcc_buffer.size() == model->mfcc_feats_per_timestep) {
 processMfccWindow(mfcc_buffer);
 // Shift data by one step of one mfcc feature vector
-std::rotate(mfcc_buffer.begin(), mfcc_buffer.begin() + MFCC_FEATURES, mfcc_buffer.end());
-mfcc_buffer.resize(mfcc_buffer.size() - MFCC_FEATURES);
+shift_buffer_left(mfcc_buffer, model->n_features);
 }
 }
 }
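
copy_up_to_n is a bounded std::copy_n: it copies at most max_elems elements and reports how far it got, which is what lets both push loops stop exactly at a window or batch boundary. A standalone demonstration; the definition is copied from the hunk above and the sample data is invented:

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <vector>
    using std::vector;

    // Same definition as above, repeated so the example stands alone.
    template<typename InputIt, typename OutputIt>
    InputIt
    copy_up_to_n(InputIt from_begin, InputIt from_end, OutputIt to_begin, int max_elems)
    {
      int next_copy_amount = std::min<int>(std::distance(from_begin, from_end), max_elems);
      std::copy_n(from_begin, next_copy_amount, to_begin);
      return from_begin + next_copy_amount;
    }

    int main() {
      vector<int> src = {1, 2, 3, 4, 5};
      vector<int> dst;
      // Room for only 3 more elements, so only 3 of the 5 inputs are consumed.
      auto it = copy_up_to_n(src.begin(), src.end(), std::back_inserter(dst), 3);
      assert(dst.size() == 3 && it == src.begin() + 3);
      return 0;
    }

In pushMfccBuffer the bound is mfcc_feats_per_timestep minus the current fill level; once the window fills, shift_buffer_left drops exactly one feature vector (n_features floats), so the context window slides forward one timestep per processed window.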
@@ -355,11 +369,12 @@ StreamingState::processMfccWindow(const vector<float>& buf)
 auto start = buf.begin();
 auto end = buf.end();
 while (start != end) {
-unsigned int next_copy_amount = std::min<unsigned int>(std::distance(start, end), (unsigned int)(model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size()));
-batch_buffer.insert(batch_buffer.end(), start, start + next_copy_amount);
-start += next_copy_amount;
+// Copy from input buffer to batch_buffer, stopping if we have a full batch
+start = copy_up_to_n(start, end, std::back_inserter(batch_buffer),
+model->n_steps * model->mfcc_feats_per_timestep - batch_buffer.size());
 assert(batch_buffer.size() <= model->n_steps * model->mfcc_feats_per_timestep);
+// If we have a full batch
 if (batch_buffer.size() == model->n_steps * model->mfcc_feats_per_timestep) {
 processBatch(batch_buffer, model->n_steps);
 batch_buffer.resize(0);
@@ -379,7 +394,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
 const size_t num_classes = alphabet->GetSize() + 1; // +1 for blank

 #ifndef USE_TFLITE
-Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, MFCC_FEATURES}));
+Tensor input(DT_FLOAT, TensorShape({BATCH_SIZE, n_steps, 2*n_context+1, n_features}));
 auto input_mapped = input.flat<float>();
 int i;
@@ -446,7 +461,7 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logits_output)
 }

 void
-ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
+ModelState::compute_mfcc(const vector<float>& samples, vector<float>& mfcc_output)
 {
 #ifndef USE_TFLITE
 Tensor input(DT_FLOAT, TensorShape({static_cast<long long>(samples.size())}));
@@ -467,7 +482,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
 int n_windows = mfcc_len_mapped(0);
 auto mfcc_mapped = outputs[0].flat<float>();
-for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+for (int i = 0; i < n_windows * n_features; ++i) {
 mfcc_output.push_back(mfcc_mapped(i));
 }
 #else
@@ -486,7 +501,7 @@ ModelState::compute_mfcc(const vector<float> samples, vector<float>& mfcc_output)
 int n_windows = *interpreter->typed_tensor<float>(mfccs_len_idx);
 float* outputs = interpreter->typed_tensor<float>(mfccs_idx);
-for (int i = 0; i < n_windows * MFCC_FEATURES; ++i) {
+for (int i = 0; i < n_windows * n_features; ++i) {
 mfcc_output.push_back(outputs[i]);
 }
 #endif
@@ -645,6 +660,7 @@ DS_CreateModel(const char* aModelPath,
 const auto& shape = node.attr().at("shape").shape();
 model->n_steps = shape.dim(1).size();
 model->n_context = (shape.dim(2).size()-1)/2;
+model->n_features = shape.dim(3).size();
 model->mfcc_feats_per_timestep = shape.dim(2).size() * shape.dim(3).size();
 } else if (node.name() == "logits_shape") {
 Tensor logits_shape = Tensor(DT_INT32, TensorShape({3}));
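
mfcc_feats_per_timestep, set just above as dim(2) * dim(3), is the unit every streaming buffer is sized in: one timestep spans the whole context window. Worked numbers under the same assumed default shape as in the earlier sketch:

    #include <cstdio>

    int main() {
      // Assumed default export shape [1, 16, 19, 26]; a real model file may
      // declare anything, which is why DS_CreateModel reads it at load time.
      const int n_steps = 16, window_size = 19, n_features = 26;
      const int mfcc_feats_per_timestep = window_size * n_features;  // 19 * 26 = 494
      std::printf("per-timestep=%d per-batch=%d floats\n",
                  mfcc_feats_per_timestep,
                  n_steps * mfcc_feats_per_timestep);                // 494, 7904
      return 0;
    }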
@@ -665,12 +681,10 @@ DS_CreateModel(const char* aModelPath,
 }
 }

-if (model->n_context == -1) {
-std::cerr << "Error: Could not infer context window size from model file. "
-<< "Make sure input_node is a 3D tensor with the last dimension "
-<< "of size MFCC_FEATURES * ((2 * context window) + 1). If you "
-<< "changed the number of features in the input, adjust the "
-<< "MFCC_FEATURES constant in " __FILE__
+if (model->n_context == -1 || model->n_features == -1) {
+std::cerr << "Error: Could not infer input shape from model file. "
+<< "Make sure input_node is a 4D tensor with shape "
+<< "[batch_size=1, time, window_size, n_features]."
 << std::endl;
 return DS_ERR_INVALID_SHAPE;
 }
@@ -710,6 +724,7 @@ DS_CreateModel(const char* aModelPath,
 model->n_steps = dims_input_node->data[1];
 model->n_context = (dims_input_node->data[2] - 1 ) / 2;
+model->n_features = dims_input_node->data[3];
 model->mfcc_feats_per_timestep = dims_input_node->data[2] * dims_input_node->data[3];

 TfLiteIntArray* dims_logits = model->interpreter->tensor(model->logits_idx)->dims;
@@ -772,7 +787,7 @@ DS_SpeechToText(ModelState* aCtx,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
 {
-StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
 return DS_FinishStream(ctx);
 }
@@ -782,12 +797,12 @@ DS_SpeechToTextWithMetadata(ModelState* aCtx,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
 {
-StreamingState* ctx = setupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
+StreamingState* ctx = SetupStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize, aSampleRate);
 return DS_FinishStreamWithMetadata(ctx);
 }

 StreamingState*
-setupStreamAndFeedAudioContent(ModelState* aCtx,
+SetupStreamAndFeedAudioContent(ModelState* aCtx,
 const short* aBuffer,
 unsigned int aBufferSize,
 unsigned int aSampleRate)
@@ -797,9 +812,7 @@ setupStreamAndFeedAudioContent(ModelState* aCtx,
 if (status != DS_ERR_OK) {
 return nullptr;
 }
-
 DS_FeedAudioContent(ctx, aBuffer, aBufferSize);
-
 return ctx;
 }
@@ -836,7 +849,7 @@ DS_SetupStream(ModelState* aCtx,
 ctx->audio_buffer.reserve(AUDIO_WIN_LEN_SAMPLES);
 ctx->mfcc_buffer.reserve(aCtx->mfcc_feats_per_timestep);
-ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
+ctx->mfcc_buffer.resize(aCtx->n_features*aCtx->n_context, 0.f);
 ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);

 ctx->model = aCtx;
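
Note the resize above: mfcc_buffer starts pre-filled with n_features * n_context zeros, so the first real feature vector sits in a left-zero-padded context window, with addZeroMfccWindow supplying the matching padding at the end of the stream. Under the same assumed defaults:

    #include <cstdio>

    int main() {
      // Assumed defaults as in the earlier sketches: n_context=9, n_features=26.
      const int n_context = 9, n_features = 26;
      const int padding = n_features * n_context;            // 234 zeros pre-filled
      const int window  = (2 * n_context + 1) * n_features;  // 494 floats per window
      // 260 / 26 = 10 real feature vectors complete the first context window.
      std::printf("padding=%d window=%d remaining=%d\n",
                  padding, window, window - padding);        // 234, 494, 260
      return 0;
    }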