Merge pull request #2425 from mozilla/expose-sample-rate-value
Expose sample rate value in API and use it in in-tree consumers
This commit is contained in:
commit
fcbebbe71a
@ -202,15 +202,13 @@ namespace DeepSpeechWPF
|
|||||||
{
|
{
|
||||||
_audioCapture.Device = _audioCaptureDevices[cbxAudioInputs.SelectedIndex];
|
_audioCapture.Device = _audioCaptureDevices[cbxAudioInputs.SelectedIndex];
|
||||||
}
|
}
|
||||||
InitilizeAudioCapture();
|
InitializeAudioCapture(_sttClient.GetModelSampleRate());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Initializes the recorder and setup the native stream.
|
/// Initializes the recorder and setup the native stream.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private void InitilizeAudioCapture()
|
private void InitializeAudioCapture(int desiredSampleRate)
|
||||||
{
|
{
|
||||||
_audioCapture.Initialize();
|
_audioCapture.Initialize();
|
||||||
_audioCapture.DataAvailable += _capture_DataAvailable;
|
_audioCapture.DataAvailable += _capture_DataAvailable;
|
||||||
@ -218,7 +216,7 @@ namespace DeepSpeechWPF
|
|||||||
//create a source, that converts the data provided by the
|
//create a source, that converts the data provided by the
|
||||||
//soundInSource to required by the deepspeech model
|
//soundInSource to required by the deepspeech model
|
||||||
_convertedSource = _soundInSource
|
_convertedSource = _soundInSource
|
||||||
.ChangeSampleRate(16000) // sample rate
|
.ChangeSampleRate(desiredSampleRate) // sample rate
|
||||||
.ToSampleSource()
|
.ToSampleSource()
|
||||||
.ToWaveSource(16); //bits per sample
|
.ToWaveSource(16); //bits per sample
|
||||||
|
|
||||||
|
@ -11,6 +11,8 @@ let alphabetPath = './models/alphabet.txt';
|
|||||||
|
|
||||||
let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
|
let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
|
||||||
|
|
||||||
|
let desiredSampleRate = model.sampleRate();
|
||||||
|
|
||||||
const LM_ALPHA = 0.75;
|
const LM_ALPHA = 0.75;
|
||||||
const LM_BETA = 1.85;
|
const LM_BETA = 1.85;
|
||||||
let lmPath = './models/lm.binary';
|
let lmPath = './models/lm.binary';
|
||||||
@ -28,8 +30,8 @@ if (!Fs.existsSync(audioFile)) {
|
|||||||
const buffer = Fs.readFileSync(audioFile);
|
const buffer = Fs.readFileSync(audioFile);
|
||||||
const result = Wav.decode(buffer);
|
const result = Wav.decode(buffer);
|
||||||
|
|
||||||
if (result.sampleRate < 16000) {
|
if (result.sampleRate < desiredSampleRate) {
|
||||||
console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than 16kHz. Up-sampling might produce erratic speech recognition.');
|
console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than ' + desiredSampleRate + 'Hz. Up-sampling might produce erratic speech recognition.');
|
||||||
}
|
}
|
||||||
|
|
||||||
function bufferToStream(buffer) {
|
function bufferToStream(buffer) {
|
||||||
@ -47,7 +49,7 @@ pipe(Sox({
|
|||||||
},
|
},
|
||||||
output: {
|
output: {
|
||||||
bits: 16,
|
bits: 16,
|
||||||
rate: 16000,
|
rate: desiredSampleRate,
|
||||||
channels: 1,
|
channels: 1,
|
||||||
encoding: 'signed-integer',
|
encoding: 'signed-integer',
|
||||||
endian: 'little',
|
endian: 'little',
|
||||||
@ -58,10 +60,9 @@ pipe(Sox({
|
|||||||
pipe(audioStream);
|
pipe(audioStream);
|
||||||
|
|
||||||
audioStream.on('finish', () => {
|
audioStream.on('finish', () => {
|
||||||
|
|
||||||
let audioBuffer = audioStream.toBuffer();
|
let audioBuffer = audioStream.toBuffer();
|
||||||
|
|
||||||
const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
|
const audioLength = (audioBuffer.length / 2) * (1 / desiredSampleRate);
|
||||||
console.log('audio length', audioLength);
|
console.log('audio length', audioLength);
|
||||||
|
|
||||||
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
|
let result = model.stt(audioBuffer.slice(0, audioBuffer.length / 2));
|
||||||
|
@ -111,7 +111,7 @@ typedef struct {
|
|||||||
} ds_audio_buffer;
|
} ds_audio_buffer;
|
||||||
|
|
||||||
ds_audio_buffer
|
ds_audio_buffer
|
||||||
GetAudioBuffer(const char* path)
|
GetAudioBuffer(const char* path, int desired_sample_rate)
|
||||||
{
|
{
|
||||||
ds_audio_buffer res = {0};
|
ds_audio_buffer res = {0};
|
||||||
|
|
||||||
@ -121,7 +121,7 @@ GetAudioBuffer(const char* path)
|
|||||||
|
|
||||||
// Resample/reformat the audio so we can pass it through the MFCC functions
|
// Resample/reformat the audio so we can pass it through the MFCC functions
|
||||||
sox_signalinfo_t target_signal = {
|
sox_signalinfo_t target_signal = {
|
||||||
16000, // Rate
|
static_cast<sox_rate_t>(desired_sample_rate), // Rate
|
||||||
1, // Channels
|
1, // Channels
|
||||||
16, // Precision
|
16, // Precision
|
||||||
SOX_UNSPEC, // Length
|
SOX_UNSPEC, // Length
|
||||||
@ -158,8 +158,10 @@ GetAudioBuffer(const char* path)
|
|||||||
|
|
||||||
assert(output);
|
assert(output);
|
||||||
|
|
||||||
if ((int)input->signal.rate < 16000) {
|
if ((int)input->signal.rate < desired_sample_rate) {
|
||||||
fprintf(stderr, "Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.\n", (int)input->signal.rate);
|
fprintf(stderr, "Warning: original sample rate (%d) is lower than %dHz. "
|
||||||
|
"Up-sampling might produce erratic speech recognition.\n",
|
||||||
|
(int)input->signal.rate, desired_sample_rate); // order matches the placeholders: file rate first, then model rate (both in Hz)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup the effects chain to decode/resample
|
// Setup the effects chain to decode/resample
|
||||||
@ -205,7 +207,7 @@ GetAudioBuffer(const char* path)
|
|||||||
#endif // NO_SOX
|
#endif // NO_SOX
|
||||||
|
|
||||||
#ifdef NO_SOX
|
#ifdef NO_SOX
|
||||||
// FIXME: Hack and support only 16kHz mono 16-bits PCM
|
// FIXME: Hack and support only mono 16-bits PCM with standard SoX header
|
||||||
FILE* wave = fopen(path, "r");
|
FILE* wave = fopen(path, "r");
|
||||||
|
|
||||||
size_t rv;
|
size_t rv;
|
||||||
@ -224,12 +226,12 @@ GetAudioBuffer(const char* path)
|
|||||||
|
|
||||||
assert(audio_format == 1); // 1 is PCM
|
assert(audio_format == 1); // 1 is PCM
|
||||||
assert(num_channels == 1); // MONO
|
assert(num_channels == 1); // MONO
|
||||||
assert(sample_rate == 16000); // 16000 Hz
|
assert(sample_rate == desired_sample_rate); // at desired sample rate
|
||||||
assert(bits_per_sample == 16); // 16 bits per sample
|
assert(bits_per_sample == 16); // 16 bits per sample
|
||||||
|
|
||||||
fprintf(stderr, "audio_format=%d\n", audio_format);
|
fprintf(stderr, "audio_format=%d\n", audio_format);
|
||||||
fprintf(stderr, "num_channels=%d\n", num_channels);
|
fprintf(stderr, "num_channels=%d\n", num_channels);
|
||||||
fprintf(stderr, "sample_rate=%d\n", sample_rate);
|
fprintf(stderr, "sample_rate=%d (desired=%d)\n", sample_rate, desired_sample_rate);
|
||||||
fprintf(stderr, "bits_per_sample=%d\n", bits_per_sample);
|
fprintf(stderr, "bits_per_sample=%d\n", bits_per_sample);
|
||||||
|
|
||||||
fseek(wave, 40, SEEK_SET); rv = fread(&res.buffer_size, 4, 1, wave);
|
fseek(wave, 40, SEEK_SET); rv = fread(&res.buffer_size, 4, 1, wave);
|
||||||
@ -257,7 +259,7 @@ GetAudioBuffer(const char* path)
|
|||||||
void
|
void
|
||||||
ProcessFile(ModelState* context, const char* path, bool show_times)
|
ProcessFile(ModelState* context, const char* path, bool show_times)
|
||||||
{
|
{
|
||||||
ds_audio_buffer audio = GetAudioBuffer(path);
|
ds_audio_buffer audio = GetAudioBuffer(path, DS_GetModelSampleRate(context));
|
||||||
|
|
||||||
// Pass audio to DeepSpeech
|
// Pass audio to DeepSpeech
|
||||||
// We take half of buffer_size because buffer is a char* while
|
// We take half of buffer_size because buffer is a char* while
|
||||||
|
@ -292,6 +292,12 @@ DS_CreateModel(const char* aModelPath,
|
|||||||
return DS_ERR_OK;
|
return DS_ERR_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
DS_GetModelSampleRate(ModelState* aCtx)
|
||||||
|
{
|
||||||
|
return aCtx->sample_rate_;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
DS_FreeModel(ModelState* ctx)
|
DS_FreeModel(ModelState* ctx)
|
||||||
{
|
{
|
||||||
|
@ -92,6 +92,16 @@ int DS_CreateModel(const char* aModelPath,
|
|||||||
unsigned int aBeamWidth,
|
unsigned int aBeamWidth,
|
||||||
ModelState** retval);
|
ModelState** retval);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Return the sample rate expected by a model.
|
||||||
|
*
|
||||||
|
* @param aCtx A ModelState pointer created with {@link DS_CreateModel}.
|
||||||
|
*
|
||||||
|
* @return Sample rate expected by the model for its input.
|
||||||
|
*/
|
||||||
|
DEEPSPEECH_EXPORT
|
||||||
|
int DS_GetModelSampleRate(ModelState* aCtx);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Frees associated resources and destroys model object.
|
* @brief Frees associated resources and destroys model object.
|
||||||
*/
|
*/
|
||||||
|
@ -64,6 +64,15 @@ namespace DeepSpeechClient
|
|||||||
EvaluateResultCode(resultCode);
|
EvaluateResultCode(resultCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Return the sample rate expected by the model.
|
||||||
|
/// </summary>
|
||||||
|
/// <returns>Sample rate.</returns>
|
||||||
|
public unsafe int GetModelSampleRate()
|
||||||
|
{
|
||||||
|
return NativeImp.DS_GetModelSampleRate(_modelStatePP);
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Evaluate the result code and will raise an exception if necessary.
|
/// Evaluate the result code and will raise an exception if necessary.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@ -24,6 +24,12 @@ namespace DeepSpeechClient.Interfaces
|
|||||||
string aAlphabetConfigPath,
|
string aAlphabetConfigPath,
|
||||||
uint aBeamWidth);
|
uint aBeamWidth);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Return the sample rate expected by the model.
|
||||||
|
/// </summary>
|
||||||
|
/// <returns>Sample rate.</returns>
|
||||||
|
unsafe int GetModelSampleRate();
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Enable decoding using beam scoring with a KenLM language model.
|
/// Enable decoding using beam scoring with a KenLM language model.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@ -20,6 +20,9 @@ namespace DeepSpeechClient
|
|||||||
uint aBeamWidth,
|
uint aBeamWidth,
|
||||||
ref IntPtr** pint);
|
ref IntPtr** pint);
|
||||||
|
|
||||||
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
|
internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx);
|
||||||
|
|
||||||
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
|
||||||
internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
|
internal static unsafe extern ErrorCodes DS_EnableDecoderWithLM(IntPtr** aCtx,
|
||||||
string aLMPath,
|
string aLMPath,
|
||||||
|
@ -77,7 +77,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
|
|||||||
// tv_numChannels.setText("numChannels=" + (numChannels == 1 ? "MONO" : "!MONO"));
|
// tv_numChannels.setText("numChannels=" + (numChannels == 1 ? "MONO" : "!MONO"));
|
||||||
|
|
||||||
wave.seek(24); int sampleRate = this.readLEInt(wave);
|
wave.seek(24); int sampleRate = this.readLEInt(wave);
|
||||||
assert (sampleRate == 16000); // 16000 Hz
|
assert (sampleRate == this._m.sampleRate()); // desired sample rate
|
||||||
// tv_sampleRate.setText("sampleRate=" + (sampleRate == 16000 ? "16kHz" : "!16kHz"));
|
// tv_sampleRate.setText("sampleRate=" + (sampleRate == 16000 ? "16kHz" : "!16kHz"));
|
||||||
|
|
||||||
wave.seek(34); char bitsPerSample = this.readLEChar(wave);
|
wave.seek(34); char bitsPerSample = this.readLEChar(wave);
|
||||||
|
@ -32,6 +32,15 @@ public class DeepSpeechModel {
|
|||||||
this._msp = impl.modelstatep_value(this._mspp);
|
this._msp = impl.modelstatep_value(this._mspp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Return the sample rate expected by the model.
|
||||||
|
*
|
||||||
|
* @return Sample rate.
|
||||||
|
*/
|
||||||
|
public int sampleRate() {
|
||||||
|
return impl.GetModelSampleRate(this._msp);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Frees associated resources and destroys model object.
|
* @brief Frees associated resources and destroys model object.
|
||||||
*/
|
*/
|
||||||
|
@ -62,11 +62,29 @@ function metadataToString(metadata) {
|
|||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.error('Loading model from file %s', args['model']);
|
||||||
|
const model_load_start = process.hrtime();
|
||||||
|
var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
|
||||||
|
const model_load_end = process.hrtime(model_load_start);
|
||||||
|
console.error('Loaded model in %ds.', totalTime(model_load_end));
|
||||||
|
|
||||||
|
var desired_sample_rate = model.sampleRate();
|
||||||
|
|
||||||
|
if (args['lm'] && args['trie']) {
|
||||||
|
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
|
||||||
|
const lm_load_start = process.hrtime();
|
||||||
|
model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
|
||||||
|
const lm_load_end = process.hrtime(lm_load_start);
|
||||||
|
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
|
||||||
|
}
|
||||||
|
|
||||||
const buffer = Fs.readFileSync(args['audio']);
|
const buffer = Fs.readFileSync(args['audio']);
|
||||||
const result = Wav.decode(buffer);
|
const result = Wav.decode(buffer);
|
||||||
|
|
||||||
if (result.sampleRate < 16000) {
|
if (result.sampleRate < desired_sample_rate) {
|
||||||
console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than 16kHz. Up-sampling might produce erratic speech recognition.');
|
console.error('Warning: original sample rate (' + result.sampleRate + ') ' +
|
||||||
|
'is lower than ' + desired_sample_rate + 'Hz. ' +
|
||||||
|
'Up-sampling might produce erratic speech recognition.');
|
||||||
}
|
}
|
||||||
|
|
||||||
function bufferToStream(buffer) {
|
function bufferToStream(buffer) {
|
||||||
@ -84,7 +102,7 @@ bufferToStream(buffer).
|
|||||||
},
|
},
|
||||||
output: {
|
output: {
|
||||||
bits: 16,
|
bits: 16,
|
||||||
rate: 16000,
|
rate: desired_sample_rate,
|
||||||
channels: 1,
|
channels: 1,
|
||||||
encoding: 'signed-integer',
|
encoding: 'signed-integer',
|
||||||
endian: 'little',
|
endian: 'little',
|
||||||
@ -97,23 +115,9 @@ bufferToStream(buffer).
|
|||||||
audioStream.on('finish', () => {
|
audioStream.on('finish', () => {
|
||||||
let audioBuffer = audioStream.toBuffer();
|
let audioBuffer = audioStream.toBuffer();
|
||||||
|
|
||||||
console.error('Loading model from file %s', args['model']);
|
|
||||||
const model_load_start = process.hrtime();
|
|
||||||
var model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
|
|
||||||
const model_load_end = process.hrtime(model_load_start);
|
|
||||||
console.error('Loaded model in %ds.', totalTime(model_load_end));
|
|
||||||
|
|
||||||
if (args['lm'] && args['trie']) {
|
|
||||||
console.error('Loading language model from files %s %s', args['lm'], args['trie']);
|
|
||||||
const lm_load_start = process.hrtime();
|
|
||||||
model.enableDecoderWithLM(args['lm'], args['trie'], LM_ALPHA, LM_BETA);
|
|
||||||
const lm_load_end = process.hrtime(lm_load_start);
|
|
||||||
console.error('Loaded language model in %ds.', totalTime(lm_load_end));
|
|
||||||
}
|
|
||||||
|
|
||||||
const inference_start = process.hrtime();
|
const inference_start = process.hrtime();
|
||||||
console.error('Running inference.');
|
console.error('Running inference.');
|
||||||
const audioLength = (audioBuffer.length / 2) * ( 1 / 16000);
|
const audioLength = (audioBuffer.length / 2) * (1 / desired_sample_rate);
|
||||||
|
|
||||||
// We take half of the buffer_size because buffer is a char* while
|
// We take half of the buffer_size because buffer is a char* while
|
||||||
// LocalDsSTT() expected a short*
|
// LocalDsSTT() expected a short*
|
||||||
|
@ -45,6 +45,15 @@ function Model() {
|
|||||||
this._impl = impl;
|
this._impl = impl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the sample rate expected by the model.
|
||||||
|
*
|
||||||
|
* @return {number} Sample rate.
|
||||||
|
*/
|
||||||
|
Model.prototype.sampleRate = function() {
|
||||||
|
return binding.GetModelSampleRate(this._impl);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enable decoding using beam scoring with a KenLM language model.
|
* Enable decoding using beam scoring with a KenLM language model.
|
||||||
*
|
*
|
||||||
|
@ -44,6 +44,15 @@ class Model(object):
|
|||||||
deepspeech.impl.FreeModel(self._impl)
|
deepspeech.impl.FreeModel(self._impl)
|
||||||
self._impl = None
|
self._impl = None
|
||||||
|
|
||||||
|
def sampleRate(self):
|
||||||
|
"""
|
||||||
|
Return the sample rate expected by the model.
|
||||||
|
|
||||||
|
:return: Sample rate.
|
||||||
|
:type: int
|
||||||
|
"""
|
||||||
|
return deepspeech.impl.GetModelSampleRate(self._impl)
|
||||||
|
|
||||||
def enableDecoderWithLM(self, *args, **kwargs):
|
def enableDecoderWithLM(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Enable decoding using beam scoring with a KenLM language model.
|
Enable decoding using beam scoring with a KenLM language model.
|
||||||
|
@ -17,9 +17,6 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from pipes import quote
|
from pipes import quote
|
||||||
|
|
||||||
# Define the sample rate for audio
|
|
||||||
|
|
||||||
SAMPLE_RATE = 16000
|
|
||||||
# These constants control the beam search decoder
|
# These constants control the beam search decoder
|
||||||
|
|
||||||
# Beam width used in the CTC decoder when building candidate transcriptions
|
# Beam width used in the CTC decoder when building candidate transcriptions
|
||||||
@ -32,16 +29,16 @@ LM_ALPHA = 0.75
|
|||||||
LM_BETA = 1.85
|
LM_BETA = 1.85
|
||||||
|
|
||||||
|
|
||||||
def convert_samplerate(audio_path):
|
def convert_samplerate(audio_path, desired_sample_rate):
|
||||||
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), SAMPLE_RATE)
|
sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
|
||||||
try:
|
try:
|
||||||
output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
|
output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
|
raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(SAMPLE_RATE, e.strerror))
|
raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))
|
||||||
|
|
||||||
return SAMPLE_RATE, np.frombuffer(output, np.int16)
|
return desired_sample_rate, np.frombuffer(output, np.int16)
|
||||||
|
|
||||||
|
|
||||||
def metadata_to_string(metadata):
|
def metadata_to_string(metadata):
|
||||||
@ -81,6 +78,8 @@ def main():
|
|||||||
model_load_end = timer() - model_load_start
|
model_load_end = timer() - model_load_start
|
||||||
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
|
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
|
||||||
|
|
||||||
|
desired_sample_rate = ds.sampleRate()
|
||||||
|
|
||||||
if args.lm and args.trie:
|
if args.lm and args.trie:
|
||||||
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
|
print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
|
||||||
lm_load_start = timer()
|
lm_load_start = timer()
|
||||||
@ -90,13 +89,13 @@ def main():
|
|||||||
|
|
||||||
fin = wave.open(args.audio, 'rb')
|
fin = wave.open(args.audio, 'rb')
|
||||||
fs = fin.getframerate()
|
fs = fin.getframerate()
|
||||||
if fs != SAMPLE_RATE:
|
if fs != desired_sample_rate:
|
||||||
print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
|
print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
|
||||||
fs, audio = convert_samplerate(args.audio)
|
fs, audio = convert_samplerate(args.audio, desired_sample_rate)
|
||||||
else:
|
else:
|
||||||
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
|
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
|
||||||
|
|
||||||
audio_length = fin.getnframes() * (1/SAMPLE_RATE)
|
audio_length = fin.getnframes() * (1/fin.getframerate())  # frame count is the original file's, so use its own rate (fs may hold the resampled rate)
|
||||||
fin.close()
|
fin.close()
|
||||||
|
|
||||||
print('Running inference.', file=sys.stderr)
|
print('Running inference.', file=sys.stderr)
|
||||||
|
Loading…
Reference in New Issue
Block a user