Remove DS_AudioToInputVector and dep on c_speech_features

Reuben Morais 2019-02-28 17:13:48 -03:00
parent 1cea2b0fe8
commit 51f80744c6
7 changed files with 2 additions and 150 deletions

native_client/BUILD

@@ -62,14 +62,6 @@ tf_cc_shared_object(
     srcs = ["deepspeech.cc",
             "deepspeech.h",
             "alphabet.h",
-            "c_speech_features/c_speech_features.cpp",
-            "kiss_fft130/kiss_fft.c",
-            "kiss_fft130/tools/kiss_fftr.c",
-            "c_speech_features/c_speech_features.h",
-            "c_speech_features/c_speech_features_config.h",
-            "kiss_fft130/kiss_fft.h",
-            "kiss_fft130/_kiss_fft_guts.h",
-            "kiss_fft130/tools/kiss_fftr.h",
             "ds_version.h"] +
            DECODER_SOURCES,
    copts = select({
@@ -134,7 +126,7 @@ tf_cc_shared_object(
    }) + if_cuda([
        "//tensorflow/core:core",
    ]),
-   includes = ["c_speech_features", "kiss_fft130"] + DECODER_INCLUDES,
+   includes = DECODER_INCLUDES,
    defines = ["KENLM_MAX_ORDER=6"],
)

native_client/deepspeech.cc

@@ -23,8 +23,6 @@
 #include "tensorflow/lite/kernels/register.h"
 #endif // USE_TFLITE

-#include "c_speech_features.h"
-
 #include "ctcdecode/ctc_beam_search_decoder.h"

 #ifdef __ANDROID__
@@ -50,12 +48,6 @@ constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP *
 constexpr unsigned int MFCC_FEATURES = 26;

 constexpr float PREEMPHASIS_COEFF = 0.97f;
-
-constexpr unsigned int N_FFT = 512;
-constexpr unsigned int N_FILTERS = 26;
-constexpr unsigned int LOWFREQ = 0;
-constexpr unsigned int CEP_LIFTER = 22;
-
 constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;

 std::array<float, WINDOW_SIZE> calc_hamming_window() {
@@ -890,76 +882,8 @@ DS_DiscardStream(StreamingState* aSctx)
 }

 void
-DS_AudioToInputVector(const short* aBuffer,
-                      unsigned int aBufferSize,
-                      unsigned int aSampleRate,
-                      unsigned int aNCep,
-                      unsigned int aNContext,
-                      float** aMfcc,
-                      int* aNFrames,
-                      int* aFrameLen)
+DS_FreeMetadata(Metadata* m)
 {
-  const int contextSize = aNCep * aNContext;
-  const int frameSize = aNCep + (2 * aNCep * aNContext);
-
-  // Compute MFCC features
-  float* mfcc;
-  int n_frames = csf_mfcc(aBuffer, aBufferSize, aSampleRate,
-                          AUDIO_WIN_LEN, AUDIO_WIN_STEP, aNCep, N_FILTERS, N_FFT,
-                          LOWFREQ, aSampleRate/2, PREEMPHASIS_COEFF, CEP_LIFTER,
-                          1, NULL, &mfcc);
-
-  // Take every other frame (BiRNN stride of 2) and add past/future context
-  int ds_input_length = (n_frames + 1) / 2;
-  // TODO: Use MFCC of silence instead of zero
-  float* ds_input = (float*)calloc(ds_input_length * frameSize, sizeof(float));
-  for (int i = 0, idx = 0, mfcc_idx = 0; i < ds_input_length;
-       i++, idx += frameSize, mfcc_idx += aNCep * 2) {
-    // Past context
-    for (int j = aNContext; j > 0; j--) {
-      int frame_index = (i - j) * 2;
-      if (frame_index < 0) { continue; }
-      int mfcc_base = frame_index * aNCep;
-      int base = (aNContext - j) * aNCep;
-      for (int k = 0; k < aNCep; k++) {
-        ds_input[idx + base + k] = mfcc[mfcc_base + k];
-      }
-    }
-
-    // Present context
-    for (int j = 0; j < aNCep; j++) {
-      ds_input[idx + j + contextSize] = mfcc[mfcc_idx + j];
-    }
-
-    // Future context
-    for (int j = 1; j <= aNContext; j++) {
-      int frame_index = (i + j) * 2;
-      if (frame_index >= n_frames) { break; }
-      int mfcc_base = frame_index * aNCep;
-      int base = contextSize + aNCep + ((j - 1) * aNCep);
-      for (int k = 0; k < aNCep; k++) {
-        ds_input[idx + base + k] = mfcc[mfcc_base + k];
-      }
-    }
-  }
-
-  // Free mfcc array
-  free(mfcc);
-
-  if (aMfcc) {
-    *aMfcc = ds_input;
-  }
-  if (aNFrames) {
-    *aNFrames = ds_input_length;
-  }
-  if (aFrameLen) {
-    *aFrameLen = frameSize;
-  }
-}
-
-void
-DS_FreeMetadata(Metadata* m)
-{
   if (m) {
     delete(m->items);
     delete(m);

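For code that depended on the removed API, the packing step is easy to reproduce outside the library. Below is a hypothetical NumPy port of the stride-2 context windowing deleted above; the audio_to_input_vector name is ours, not part of any API, and it assumes the MFCC matrix has already been computed elsewhere, e.g. with python_speech_features (the library c_speech_features was ported from), using the parameters visible in this diff (numcep=26, nfilt=26, nfft=512, lowfreq=0, preemph=0.97, ceplifter=22).

import numpy as np

def audio_to_input_vector(mfcc, n_cep, n_context):
    """Replicate the removed DS_AudioToInputVector packing step.

    mfcc: (n_frames, n_cep) feature matrix. Takes every other frame
    (BiRNN stride of 2) and concatenates n_context past and n_context
    future strided frames around it, zero-padding past the edges.
    """
    n_frames = mfcc.shape[0]
    frame_size = n_cep * (2 * n_context + 1)  # frameSize in the C code
    ds_input_length = (n_frames + 1) // 2     # stride-2 downsampling
    ds_input = np.zeros((ds_input_length, frame_size), dtype=np.float32)
    for i in range(ds_input_length):
        # Slots 0..2*n_context hold strided frames (i-n_context)*2 ..
        # (i+n_context)*2; out-of-range slots stay zero (the calloc above).
        for slot, w in enumerate(range(i - n_context, i + n_context + 1)):
            frame_index = w * 2
            if 0 <= frame_index < n_frames:
                ds_input[i, slot * n_cep:(slot + 1) * n_cep] = mfcc[frame_index]
    return ds_input

For example, a (100, 26) MFCC matrix with n_context=9 packs into shape (50, 494): 50 = (100 + 1) // 2 strided frames, each holding 26 * (2 * 9 + 1) = 494 floats.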
native_client/deepspeech.h

@@ -228,41 +228,9 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx);
 DEEPSPEECH_EXPORT
 void DS_DiscardStream(StreamingState* aSctx);

-/**
- * @brief Given audio, return a vector suitable for input to a DeepSpeech
- *        model trained with the given parameters.
- *
- * Extracts MFCC features from a given audio signal and adds the appropriate
- * amount of context to run inference on a DeepSpeech model trained with
- * the given parameters.
- *
- * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
- *                rate.
- * @param aBufferSize The sample-length of the audio signal.
- * @param aSampleRate The sample-rate of the audio signal.
- * @param aNCep The number of cepstrum.
- * @param aNContext The size of the context window.
- * @param[out] aMfcc An array containing features, of shape
- *                   (@p aNFrames, ncep * ncontext). The user is responsible
- *                   for freeing the array.
- * @param[out] aNFrames (optional) The number of frames in @p aMfcc.
- * @param[out] aFrameLen (optional) The length of each frame
- *                       (ncep * ncontext) in @p aMfcc.
- */
-DEEPSPEECH_EXPORT
-void DS_AudioToInputVector(const short* aBuffer,
-                           unsigned int aBufferSize,
-                           unsigned int aSampleRate,
-                           unsigned int aNCep,
-                           unsigned int aNContext,
-                           float** aMfcc,
-                           int* aNFrames = NULL,
-                           int* aFrameLen = NULL);
-
 /**
  * @brief Free memory allocated for metadata information.
  */
 DEEPSPEECH_EXPORT
 void DS_FreeMetadata(Metadata* m);

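One nit in the removed documentation: it gives the output shape as (aNFrames, ncep * ncontext), but the implementation above allocates aNCep + 2 * aNCep * aNContext floats per frame, i.e. aNCep * (2 * aNContext + 1). A quick worked check in Python, using 26 cepstra and a context of 9 (typical DeepSpeech values, assumed here rather than taken from this commit):

n_cep, n_context = 26, 9                     # assumed typical values
n_frames_mfcc = 100                          # example MFCC frame count

frame_len = n_cep + 2 * n_cep * n_context    # frameSize in the removed code
ds_frames = (n_frames_mfcc + 1) // 2         # stride-2 frame count

assert frame_len == n_cep * (2 * n_context + 1) == 494
assert ds_frames == 50

So each packed frame holds 494 floats, not the 26 * 9 = 234 the comment suggested.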
native_client/javascript/deepspeech.i

@@ -26,33 +26,6 @@ using namespace node;
 %apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};

-// convert DS_AudioToInputVector return values to a Node Buffer
-%typemap(in,numinputs=0)
-  (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
-  (float* data_temp, unsigned int dim1_temp, unsigned int dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-
-%typemap(argout)
-  (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2)
-{
-  Handle<Array> array = Array::New(Isolate::GetCurrent(), *$2);
-  for (unsigned int i = 0, idx = 0; i < *$2; i++) {
-    Handle<ArrayBuffer> buffer =
-      ArrayBuffer::New(Isolate::GetCurrent(), *$1, *$3 * sizeof(float));
-    memcpy(buffer->GetContents().Data(),
-           (*$1) + (idx += *$3), *$3 * sizeof(float));
-    Handle<Float32Array> inner = Float32Array::New(buffer, 0, *$3);
-    array->Set(i, inner);
-  }
-  free(*$1);
-  $result = array;
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY2, unsigned int* DIM1, unsigned int* DIM2) {(float** aMfcc, unsigned int* aNFrames, unsigned int* aFrameLen)};
-
 // make sure the string returned by SpeechToText is freed
 %typemap(newfree) char* "free($1);";
 %newobject DS_SpeechToText;

native_client/javascript/index.js

@@ -66,6 +66,5 @@ Model.prototype.finishStream = function() {

 module.exports = {
   Model: Model,
-  audioToInputVector: binding.AudioToInputVector,
   printVersions: binding.PrintVersions
 };

native_client/python/__init__.py

@@ -1,7 +1,6 @@
 import deepspeech

 # rename for backwards compatibility
-from deepspeech.impl import AudioToInputVector as audioToInputVector
 from deepspeech.impl import PrintVersions as printVersions

 class Model(object):

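Python callers that imported audioToInputVector have no drop-in replacement after this commit; the intended path is to hand raw audio to the model and let feature computation happen internally. A minimal sketch, assuming the contemporary (v0.4-era) Python API; the constructor and stt signatures may differ in your checkout, and native_client/python/client.py is the reference:

import numpy as np
from deepspeech import Model

# 26 features, context of 9, beam width 500 match what client.py used at the
# time; treat them as placeholder arguments, not values from this commit.
ds = Model('output_graph.pb', 26, 9, 'alphabet.txt', 500)

audio = np.zeros(16000, dtype=np.int16)  # one second of silence at 16 kHz
print(ds.stt(audio, 16000))              # MFCCs are computed inside the model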
native_client/python/impl.i

@@ -13,9 +13,6 @@ import_array();
 // apply NumPy conversion typemap to DS_FeedAudioContent and DS_SpeechToText
 %apply (short* IN_ARRAY1, int DIM1) {(const short* aBuffer, unsigned int aBufferSize)};

-// apply NumPy conversion typemap to DS_AudioToInputVector
-%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {(float** aMfcc, int* aNFrames, int* aFrameLen)};
-
 %typemap(in, numinputs=0) ModelState **retval (ModelState *ret) {
   ret = NULL;
   $1 = &ret;