Remove alphabet param usage

Reuben Morais 2019-11-01 13:12:35 +01:00
parent 8c82081779
commit 3fdc7d422d
37 changed files with 86 additions and 181 deletions


@@ -7,7 +7,7 @@ extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
-ignore=CVS
+ignore=examples
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.


@@ -34,7 +34,7 @@ To install and use deepspeech all you have to do is:
tar xvf audio-0.5.1.tar.gz
# Transcribe an audio file
-deepspeech --model deepspeech-0.5.1-models/output_graph.pbmm --alphabet deepspeech-0.5.1-models/alphabet.txt --lm deepspeech-0.5.1-models/lm.binary --trie deepspeech-0.5.1-models/trie --audio audio/2830-3980-0043.wav
+deepspeech --model deepspeech-0.5.1-models/output_graph.pbmm --lm deepspeech-0.5.1-models/lm.binary --trie deepspeech-0.5.1-models/trie --audio audio/2830-3980-0043.wav
A pre-trained English model is available for use and can be downloaded using `the instructions below <USING.rst#using-a-pre-trained-model>`_. Currently, only 16-bit, 16 kHz, mono-channel WAVE audio files are supported in the Python client. A package with some example audio files is available for download in our `release notes <https://github.com/mozilla/DeepSpeech/releases/latest>`_.
@@ -50,7 +50,7 @@ Quicker inference can be performed using a supported NVIDIA GPU on Linux. See th
pip3 install deepspeech-gpu
# Transcribe an audio file.
-deepspeech --model deepspeech-0.5.1-models/output_graph.pbmm --alphabet deepspeech-0.5.1-models/alphabet.txt --lm deepspeech-0.5.1-models/lm.binary --trie deepspeech-0.5.1-models/trie --audio audio/2830-3980-0043.wav
+deepspeech --model deepspeech-0.5.1-models/output_graph.pbmm --lm deepspeech-0.5.1-models/lm.binary --trie deepspeech-0.5.1-models/trie --audio audio/2830-3980-0043.wav
Please ensure you have the required `CUDA dependencies <USING.rst#cuda-dependency>`_.


@@ -105,7 +105,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
.. code-block:: bash
-deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
The arguments ``--lm`` and ``--trie`` are optional, and represent a language model.
@@ -159,7 +159,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
.. code-block:: bash
-./deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio audio_input.wav
+./deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio audio_input.wav
See the help output with ``./deepspeech -h`` and the `native client README <native_client/README.rst>`_ for more details.


@@ -239,7 +239,7 @@ def delete_tree(dir):
except IOError:
print('No remote directory: %s' % dir)
-def setup_tempdir(dir, models, wav, alphabet, lm_binary, trie, binaries):
+def setup_tempdir(dir, models, wav, lm_binary, trie, binaries):
r'''
Copy models, libs and binary to a directory (new one if dir is None)
'''
@@ -268,7 +268,7 @@ def setup_tempdir(dir, models, wav, alphabet, lm_binary, trie, binaries):
print('Copying %s to %s' % (f, dir))
shutil.copy2(f, dir)
-for extra_file in [ wav, alphabet, lm_binary, trie ]:
+for extra_file in [ wav, lm_binary, trie ]:
if extra_file and not os.path.isfile(os.path.join(dir, os.path.basename(extra_file))):
print('Copying %s to %s' % (extra_file, dir))
shutil.copy2(extra_file, dir)
@@ -375,10 +375,10 @@ def establish_ssh(target=None, auto_trust=False, allow_agent=True, look_keys=Tru
return ssh_conn
-def run_benchmarks(dir, models, wav, alphabet, lm_binary=None, trie=None, iters=-1):
+def run_benchmarks(dir, models, wav, lm_binary=None, trie=None, iters=-1):
r'''
Core of the running of the benchmarks. We will run on all of models, against
-the WAV file provided as wav, and the provided alphabet.
+the WAV file provided as wav.
'''
assert_valid_dir(dir)
@@ -396,9 +396,9 @@ def run_benchmarks(dir, models, wav, alphabet, lm_binary=None, trie=None, iters=
}
if lm_binary and trie:
-cmdline = './deepspeech --model "%s" --alphabet "%s" --lm "%s" --trie "%s" --audio "%s" -t' % (model_filename, alphabet, lm_binary, trie, wav)
+cmdline = './deepspeech --model "%s" --lm "%s" --trie "%s" --audio "%s" -t' % (model_filename, lm_binary, trie, wav)
else:
-cmdline = './deepspeech --model "%s" --alphabet "%s" --audio "%s" -t' % (model_filename, alphabet, wav)
+cmdline = './deepspeech --model "%s" --audio "%s" -t' % (model_filename, wav)
for it in range(iters):
sys.stdout.write('\rRunning %s: %d/%d' % (os.path.basename(model), (it+1), iters))
@@ -453,8 +453,6 @@ def handle_args():
help='List of files (protocolbuffer) to work on. Might be a zip file.')
parser.add_argument('--wav', required=False,
help='WAV file to pass to native_client. Supply again in plotting mode to draw realine line.')
-parser.add_argument('--alphabet', required=False,
-help='Text file to pass to native_client for the alphabet.')
parser.add_argument('--lm_binary', required=False,
help='Path to the LM binary file used by the decoder.')
parser.add_argument('--trie', required=False,
@@ -472,8 +470,8 @@ def handle_args():
def do_main():
cli_args = handle_args()
-if not cli_args.models or not cli_args.wav or not cli_args.alphabet:
-raise AssertionError('Missing arguments (models, wav or alphabet)')
+if not cli_args.models or not cli_args.wav:
+raise AssertionError('Missing arguments (models or wav)')
if cli_args.dir is not None and not os.path.isdir(cli_args.dir):
raise AssertionError('Inexistent temp directory')
@@ -484,18 +482,17 @@ def do_main():
global ssh_conn
ssh_conn = establish_ssh(target=cli_args.target, auto_trust=cli_args.autotrust, allow_agent=cli_args.allowagent, look_keys=cli_args.lookforkeys)
-tempdir, sorted_models = setup_tempdir(dir=cli_args.dir, models=cli_args.models, wav=cli_args.wav, alphabet=cli_args.alphabet, lm_binary=cli_args.lm_binary, trie=cli_args.trie, binaries=cli_args.binaries)
+tempdir, sorted_models = setup_tempdir(dir=cli_args.dir, models=cli_args.models, wav=cli_args.wav, lm_binary=cli_args.lm_binary, trie=cli_args.trie, binaries=cli_args.binaries)
dest_sorted_models = list(map(lambda x: os.path.join(tempdir, os.path.basename(x)), sorted_models))
dest_wav = os.path.join(tempdir, os.path.basename(cli_args.wav))
-dest_alphabet = os.path.join(tempdir, os.path.basename(cli_args.alphabet))
if cli_args.lm_binary and cli_args.trie:
dest_lm_binary = os.path.join(tempdir, os.path.basename(cli_args.lm_binary))
dest_trie = os.path.join(tempdir, os.path.basename(cli_args.trie))
-inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, alphabet=dest_alphabet, lm_binary=dest_lm_binary, trie=dest_trie, iters=cli_args.iters)
+inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, lm_binary=dest_lm_binary, trie=dest_trie, iters=cli_args.iters)
else:
-inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, alphabet=dest_alphabet, iters=cli_args.iters)
+inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, iters=cli_args.iters)
if cli_args.csv:
produce_csv(input=inference_times, output=cli_args.csv)

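A minimal, hypothetical sketch of the command line that the benchmarking code above now builds for the native client; file names are illustrative, and the point is only that no `--alphabet` flag is passed any more.

```python
# Standalone sketch mirroring the cmdline construction in run_benchmarks() above.
# File names are illustrative.
model_filename = 'output_graph.pb'
wav = 'LDC93S1.wav'
lm_binary = 'lm.binary'
trie = 'trie'

if lm_binary and trie:
    cmdline = './deepspeech --model "%s" --lm "%s" --trie "%s" --audio "%s" -t' % (model_filename, lm_binary, trie, wav)
else:
    cmdline = './deepspeech --model "%s" --audio "%s" -t' % (model_filename, wav)

print(cmdline)  # note: no --alphabet flag anywhere
```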

@@ -23,16 +23,16 @@ This module should be self-contained:
- pip install native_client/python/dist/deepspeech*.whl
- pip install -r requirements_eval_tflite.txt
-Then run with a TF Lite model, alphabet, LM/trie and a CSV test file
+Then run with a TF Lite model, LM/trie and a CSV test file
'''
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
-def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
+def tflite_worker(model, lm, trie, queue_in, queue_out, gpu_mask):
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
-ds = Model(model, alphabet, BEAM_WIDTH)
+ds = Model(model, BEAM_WIDTH)
ds.enableDecoderWithLM(lm, trie, LM_ALPHA, LM_BETA)
while True:
@@ -58,8 +58,6 @@ def main():
parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
-parser.add_argument('--alphabet', required=True,
-help='Path to the configuration file specifying the alphabet used by the network')
parser.add_argument('--lm', required=True,
help='Path to the language model binary file')
parser.add_argument('--trie', required=True,
@@ -78,7 +76,7 @@ def main():
processes = []
for i in range(args.proc):
-worker_process = Process(target=tflite_worker, args=(args.model, args.alphabet, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
+worker_process = Process(target=tflite_worker, args=(args.model, args.lm, args.trie, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
worker_process.start() # Launch reader() as a separate python process
processes.append(worker_process)

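For reference, a minimal sketch of the model-construction calls used by the worker above after this change; the constants mirror the script, the paths are illustrative, and it assumes the `deepspeech` Python package built from this branch is installed.

```python
from deepspeech import Model

# Constants mirroring the evaluation script above; values are illustrative.
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85

# The alphabet is embedded in the graph now, so only the model path and the
# beam width are needed to construct the model.
ds = Model('output_graph.tflite', BEAM_WIDTH)  # illustrative path
ds.enableDecoderWithLM('lm.binary', 'trie', LM_ALPHA, LM_BETA)
```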

@@ -22,14 +22,12 @@ Here is an example for a local audio file:
```bash
node ./index.js --audio <AUDIO_FILE> \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
Here is an example for a remote RTMP-Stream:
```bash
node ./index.js --audio rtmp://<IP>:1935/live/teststream \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
## Examples
@@ -39,21 +37,18 @@ node ./index.js --audio $HOME/audio/2830-3980-0043.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
```bash
node ./index.js --audio $HOME/audio/4507-16021-0012.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
```bash
node ./index.js --audio $HOME/audio/8455-210777-0068.wav \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
Real time streaming inference in combination with a RTMP server.
```bash
@@ -61,7 +56,6 @@ node ./index.js --audio rtmp://<HOST>/<APP>/<KEY> \
--lm $HOME/models/lm.binary \
--trie $HOME/models/trie \
--model $HOME/models/output_graph.pbmm \
---alphabet $HOME/models/alphabet.txt
```
## Notes


@@ -32,7 +32,6 @@ VersionAction.prototype.call = function(parser) {
let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'});
@@ -45,7 +44,7 @@ function totalTime(hrtimeValue) {
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
-let model = new Ds.Model(args['model'], args['alphabet'], BEAM_WIDTH);
+let model = new Ds.Model(args['model'], BEAM_WIDTH);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));


@@ -13,18 +13,15 @@ pushd ${THIS}
node ./index.js --audio $HOME/DeepSpeech/audio/2830-3980-0043.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
---model $HOME/DeepSpeech/models/output_graph.pbmm \
---alphabet $HOME/DeepSpeech/models/alphabet.txt
+--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/4507-16021-0012.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
---model $HOME/DeepSpeech/models/output_graph.pbmm \
---alphabet $HOME/DeepSpeech/models/alphabet.txt
+--model $HOME/DeepSpeech/models/output_graph.pbmm
node ./index.js --audio $HOME/DeepSpeech/audio/8455-210777-0068.wav \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
---model $HOME/DeepSpeech/models/output_graph.pbmm \
---alphabet $HOME/DeepSpeech/models/alphabet.txt
+--model $HOME/DeepSpeech/models/output_graph.pbmm
popd


@@ -29,7 +29,7 @@ Usage
.. code-block::
usage: mic_vad_streaming.py [-h] [-v VAD_AGGRESSIVENESS] [--nospinner]
-[-w SAVEWAV] -m MODEL [-a ALPHABET] [-l LM]
+[-w SAVEWAV] -m MODEL [-l LM]
[-t TRIE] [-nf N_FEATURES] [-nc N_CONTEXT]
[-la LM_ALPHA] [-lb LM_BETA]
[-bw BEAM_WIDTH]
@@ -49,9 +49,6 @@ Usage
Path to the model (protocol buffer binary file, or
entire directory containing all standard-named files
for model)
--a ALPHABET, --alphabet ALPHABET
-Path to the configuration file specifying the alphabet
-used by the network. Default: alphabet.txt
-l LM, --lm LM Path to the language model binary file. Default:
lm.binary
-t TRIE, --trie TRIE Path to the language model trie file created with


@@ -156,14 +156,12 @@ def main(ARGS):
if os.path.isdir(ARGS.model):
model_dir = ARGS.model
ARGS.model = os.path.join(model_dir, 'output_graph.pb')
-ARGS.alphabet = os.path.join(model_dir, ARGS.alphabet if ARGS.alphabet else 'alphabet.txt')
ARGS.lm = os.path.join(model_dir, ARGS.lm)
ARGS.trie = os.path.join(model_dir, ARGS.trie)
print('Initializing model...')
logging.info("ARGS.model: %s", ARGS.model)
-logging.info("ARGS.alphabet: %s", ARGS.alphabet)
-model = deepspeech.Model(ARGS.model, ARGS.alphabet, ARGS.beam_width)
+model = deepspeech.Model(ARGS.model, ARGS.beam_width)
if ARGS.lm and ARGS.trie:
logging.info("ARGS.lm: %s", ARGS.lm)
logging.info("ARGS.trie: %s", ARGS.trie)
@@ -219,8 +217,6 @@ if __name__ == '__main__':
parser.add_argument('-m', '--model', required=True,
help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
-parser.add_argument('-a', '--alphabet', default='alphabet.txt',
-help="Path to the configuration file specifying the alphabet used by the network. Default: alphabet.txt")
parser.add_argument('-l', '--lm', default='lm.binary',
help="Path to the language model binary file. Default: lm.binary")
parser.add_argument('-t', '--trie', default='trie',

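A short, hypothetical sketch of the directory-based model resolution used by the streaming example after this change; the directory path, beam width and LM weights are illustrative.

```python
import os
import deepspeech

# Only the graph, LM and trie are resolved inside the model directory now;
# there is no alphabet.txt lookup. The directory path is illustrative.
model_dir = os.path.expanduser('~/deepspeech-models')
model_path = os.path.join(model_dir, 'output_graph.pb')
lm_path = os.path.join(model_dir, 'lm.binary')
trie_path = os.path.join(model_dir, 'trie')

model = deepspeech.Model(model_path, 500)                  # beam width only, no alphabet
model.enableDecoderWithLM(lm_path, trie_path, 0.75, 1.85)  # optional language model
```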

@@ -14,7 +14,6 @@ pushd ${THIS}
python mic_vad_streaming.py \
--model $HOME/DeepSpeech/models/output_graph.pbmm \
---alphabet $HOME/DeepSpeech/models/alphabet.txt \
--lm $HOME/DeepSpeech/models/lm.binary \
--trie $HOME/DeepSpeech/models/trie \
--file $HOME/DeepSpeech/audio/2830-3980-0043.wav


@@ -77,7 +77,7 @@ namespace DeepSpeechWPF
{
try
{
-_sttClient.CreateModel("output_graph.pbmm", "alphabet.txt", BEAM_WIDTH);
+_sttClient.CreateModel("output_graph.pbmm", BEAM_WIDTH);
Dispatcher.Invoke(() => { EnableControls(); });
}
catch (Exception ex)


@@ -11,7 +11,6 @@ Edit references to models path if necessary:
```
let modelPath = './models/output_graph.pbmm';
-let alphabetPath = './models/alphabet.txt';
let lmPath = './models/lm.binary';
let triePath = './models/trie';
```


@@ -7,9 +7,8 @@ const Wav = require('node-wav');
const BEAM_WIDTH = 1024;
let modelPath = './models/output_graph.pbmm';
-let alphabetPath = './models/alphabet.txt';
-let model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
+let model = new DeepSpeech.Model(modelPath, BEAM_WIDTH);
let desiredSampleRate = model.sampleRate();


@@ -18,7 +18,7 @@ def main(args):
parser.add_argument('--audio', required=False,
help='Path to the audio file to run (WAV format)')
parser.add_argument('--model', required=True,
-help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
+help='Path to directory that contains all model files (output_graph, lm and trie)')
parser.add_argument('--stream', required=False, action='store_true',
help='To use deepspeech streaming interface')
args = parser.parse_args()
@@ -34,10 +34,10 @@ def main(args):
dirName = os.path.expanduser(args.model)
# Resolve all the paths of model files
-output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)
+output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
# Load output_graph, alpahbet, lm and trie
-model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
+model_retval = wavTranscriber.load_model(output_graph, lm, trie)
if args.audio is not None:
title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']


@@ -109,7 +109,7 @@ class App(QMainWindow):
self.microphone = QRadioButton("Microphone")
self.fileUpload = QRadioButton("File Upload")
self.browseBox = QLineEdit(self, placeholderText="Wave File, Mono @ 16 kHz, 16bit Little-Endian")
-self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, alphabet, lm & trie")
+self.modelsBox = QLineEdit(self, placeholderText="Directory path for output_graph, lm & trie")
self.textboxTranscript = QPlainTextEdit(self, placeholderText="Transcription")
self.browseButton = QPushButton('Browse', self)
self.browseButton.setToolTip('Select a wav file')
@@ -238,9 +238,9 @@ class App(QMainWindow):
def modelResult(self, dirName):
# Fetch and Resolve all the paths of model files
-output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)
+output_graph, lm, trie = wavTranscriber.resolve_models(dirName)
# Load output_graph, alpahbet, lm and trie
-self.model = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
+self.model = wavTranscriber.load_model(output_graph, lm, trie)
def modelFinish(self):
# self.timer.stop()


@@ -8,20 +8,19 @@ from timeit import default_timer as timer
'''
Load the pre-trained model into the memory
@param models: Output Grapgh Protocol Buffer file
-@param alphabet: Alphabet.txt file
@param lm: Language model file
@param trie: Trie file
@Retval
Returns a list [DeepSpeech Object, Model Load Time, LM Load Time]
'''
-def load_model(models, alphabet, lm, trie):
+def load_model(models, lm, trie):
BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
model_load_start = timer()
-ds = Model(models, alphabet, BEAM_WIDTH)
+ds = Model(models, BEAM_WIDTH)
model_load_end = timer() - model_load_start
logging.debug("Loaded model in %0.3fs." % (model_load_end))
@@ -61,21 +60,18 @@ Resolve directory path for the models and fetch each of them.
@param dirName: Path to the directory containing pre-trained models
@Retval:
-Retunns a tuple containing each of the model files (pb, alphabet, lm and trie)
+Retunns a tuple containing each of the model files (pb, lm and trie)
'''
def resolve_models(dirName):
pb = glob.glob(dirName + "/*.pb")[0]
logging.debug("Found Model: %s" % pb)
-alphabet = glob.glob(dirName + "/alphabet.txt")[0]
-logging.debug("Found Alphabet: %s" % alphabet)
lm = glob.glob(dirName + "/lm.binary")[0]
trie = glob.glob(dirName + "/trie")[0]
logging.debug("Found Language Model: %s" % lm)
logging.debug("Found Trie: %s" % trie)
-return pb, alphabet, lm, trie
+return pb, lm, trie
'''
Generate VAD segments. Filters out non-voiced audio frames.

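A usage sketch for the two helpers above after this change; the module import and directory path are assumptions (the GUI example above imports this file as `wavTranscriber`), and `load_model` returns the list described in its docstring.

```python
import wavTranscriber

# resolve_models() now returns a 3-tuple (graph, LM, trie) with no alphabet path,
# and load_model() takes those same three paths. The directory is illustrative.
output_graph, lm, trie = wavTranscriber.resolve_models('/path/to/models')
ds, model_load_time, lm_load_time = wavTranscriber.load_model(output_graph, lm, trie)
print('Model loaded in %0.3fs, LM in %0.3fs' % (model_load_time, lm_load_time))
```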

@@ -12,8 +12,6 @@
char* model = NULL;
-char* alphabet = NULL;
char* lm = NULL;
char* trie = NULL;
@@ -41,12 +39,11 @@ int stream_size = 0;
void PrintHelp(const char* bin)
{
std::cout <<
-"Usage: " << bin << " --model MODEL --alphabet ALPHABET [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
+"Usage: " << bin << " --model MODEL [--lm LM --trie TRIE] --audio AUDIO [-t] [-e]\n"
"\n"
"Running DeepSpeech inference.\n"
"\n"
" --model MODEL Path to the model (protocol buffer binary file)\n"
-" --alphabet ALPHABET Path to the configuration file specifying the alphabet used by the network\n"
" --lm LM Path to the language model binary file\n"
" --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
" --audio AUDIO Path to the audio file to run (WAV format)\n"
@@ -68,7 +65,6 @@ bool ProcessArgs(int argc, char** argv)
const char* const short_opts = "m:a:l:r:w:c:d:b:tehv";
const option long_opts[] = {
{"model", required_argument, nullptr, 'm'},
-{"alphabet", required_argument, nullptr, 'a'},
{"lm", required_argument, nullptr, 'l'},
{"trie", required_argument, nullptr, 'r'},
{"audio", required_argument, nullptr, 'w'},
@@ -98,10 +94,6 @@ bool ProcessArgs(int argc, char** argv)
model = optarg;
break;
-case 'a':
-alphabet = optarg;
-break;
case 'l':
lm = optarg;
break;
@@ -163,7 +155,7 @@ bool ProcessArgs(int argc, char** argv)
return false;
}
-if (!model || !alphabet || !audio) {
+if (!model || !audio) {
PrintHelp(argv[0]);
return false;
}


@@ -29,36 +29,26 @@ namespace DeepSpeechClient
/// Create an object providing an interface to a trained DeepSpeech model.
/// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param>
-/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
public unsafe void CreateModel(string aModelPath,
-string aAlphabetConfigPath, uint aBeamWidth)
+uint aBeamWidth)
{
string exceptionMessage = null;
if (string.IsNullOrWhiteSpace(aModelPath))
{
exceptionMessage = "Model path cannot be empty.";
}
-if (string.IsNullOrWhiteSpace(aAlphabetConfigPath))
-{
-exceptionMessage = "Alphabet path cannot be empty.";
-}
if (!File.Exists(aModelPath))
{
exceptionMessage = $"Cannot find the model file: {aModelPath}";
}
-if (!File.Exists(aAlphabetConfigPath))
-{
-exceptionMessage = $"Cannot find the alphabet file: {aAlphabetConfigPath}";
-}
if (exceptionMessage != null)
{
throw new FileNotFoundException(exceptionMessage);
}
var resultCode = NativeImp.DS_CreateModel(aModelPath,
-aAlphabetConfigPath,
aBeamWidth,
ref _modelStatePP);
EvaluateResultCode(resultCode);
@@ -86,7 +76,7 @@ namespace DeepSpeechClient
case ErrorCodes.DS_ERR_NO_MODEL:
throw new ArgumentException("Missing model information.");
case ErrorCodes.DS_ERR_INVALID_ALPHABET:
-throw new ArgumentException("Invalid alphabet file or invalid alphabet size.");
+throw new ArgumentException("Invalid alphabet embedded in model. (Data corruption?)");
case ErrorCodes.DS_ERR_INVALID_SHAPE:
throw new ArgumentException("Invalid model shape.");
case ErrorCodes.DS_ERR_INVALID_LM:


@@ -17,11 +17,9 @@ namespace DeepSpeechClient.Interfaces
/// Create an object providing an interface to a trained DeepSpeech model.
/// </summary>
/// <param name="aModelPath">The path to the frozen model graph.</param>
-/// <param name="aAlphabetConfigPath">The path to the configuration file specifying the alphabet used by the network.</param>
/// <param name="aBeamWidth">The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.</param>
/// <exception cref="ArgumentException">Thrown when the native binary failed to create the model.</exception>
unsafe void CreateModel(string aModelPath,
-string aAlphabetConfigPath,
uint aBeamWidth);
/// <summary>


@@ -16,7 +16,6 @@ namespace DeepSpeechClient
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)]
internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath,
-string aAlphabetConfigPath,
uint aBeamWidth,
ref IntPtr** pint);


@@ -35,7 +35,6 @@ namespace CSharpExamples
static void Main(string[] args)
{
string model = null;
-string alphabet = null;
string lm = null;
string trie = null;
string audio = null;
@@ -43,7 +42,6 @@ namespace CSharpExamples
if (args.Length > 0)
{
model = GetArgument(args, "--model");
-alphabet = GetArgument(args, "--alphabet");
lm = GetArgument(args, "--lm");
trie = GetArgument(args, "--trie");
audio = GetArgument(args, "--audio");
@@ -64,7 +62,6 @@ namespace CSharpExamples
stopwatch.Start();
sttClient.CreateModel(
model ?? "output_graph.pbmm",
-alphabet ?? "alphabet.txt",
BEAM_WIDTH);
stopwatch.Stop();


@@ -51,7 +51,6 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including:
* ``output_graph.tflite`` which is the TF Lite model
-* ``alphabet.txt``
* ``lm.binary`` and ``trie`` files, if you want to use the language model ; please
be aware that too big language model will make the device run out of memory


@@ -23,7 +23,6 @@ public class DeepSpeechActivity extends AppCompatActivity {
DeepSpeechModel _m = null;
EditText _tfliteModel;
-EditText _alphabet;
EditText _audioFile;
TextView _decodedString;
@@ -49,10 +48,10 @@ public class DeepSpeechActivity extends AppCompatActivity {
return (int)((b1 & 0xFF) | (b2 & 0xFF) << 8 | (b3 & 0xFF) << 16 | (b4 & 0xFF) << 24);
}
-private void newModel(String tfliteModel, String alphabet) {
+private void newModel(String tfliteModel) {
this._tfliteStatus.setText("Creating model");
if (this._m == null) {
-this._m = new DeepSpeechModel(tfliteModel, alphabet, BEAM_WIDTH);
+this._m = new DeepSpeechModel(tfliteModel, BEAM_WIDTH);
}
}
@@ -61,7 +60,7 @@ public class DeepSpeechActivity extends AppCompatActivity {
this._startInference.setEnabled(false);
-this.newModel(this._tfliteModel.getText().toString(), this._alphabet.getText().toString());
+this.newModel(this._tfliteModel.getText().toString());
this._tfliteStatus.setText("Extracting audio features ...");
@@ -128,13 +127,11 @@ public class DeepSpeechActivity extends AppCompatActivity {
this._tfliteStatus = (TextView) findViewById(R.id.tfliteStatus);
this._tfliteModel = (EditText) findViewById(R.id.tfliteModel);
-this._alphabet = (EditText) findViewById(R.id.alphabet);
this._audioFile = (EditText) findViewById(R.id.audioFile);
this._tfliteModel.setText("/sdcard/deepspeech/output_graph.tflite");
this._tfliteStatus.setText("Ready, waiting ...");
-this._alphabet.setText("/sdcard/deepspeech/alphabet.txt");
this._audioFile.setText("/sdcard/deepspeech/audio.wav");
this._startInference = (Button) findViewById(R.id.btnStartInference);


@@ -97,25 +97,6 @@
android:inputType="text" />
</LinearLayout>
-<LinearLayout
-android:layout_width="match_parent"
-android:layout_height="wrap_content"
-android:orientation="horizontal">
-<TextView
-android:id="@+id/lblAlphabet"
-android:layout_width="263dp"
-android:layout_height="wrap_content"
-android:layout_weight="1"
-android:text="Alphabet" />
-<EditText
-android:id="@+id/alphabet"
-android:layout_width="wrap_content"
-android:layout_height="wrap_content"
-android:inputType="text" />
-</LinearLayout>
<LinearLayout
android:layout_width="match_parent"
android:layout_height="wrap_content"


@@ -30,7 +30,6 @@ import java.nio.ByteBuffer;
public class BasicTest {
public static final String modelFile = "/data/local/tmp/test/output_graph.tflite";
-public static final String alphabetFile = "/data/local/tmp/test/alphabet.txt";
public static final String lmFile = "/data/local/tmp/test/lm.binary";
public static final String trieFile = "/data/local/tmp/test/trie";
public static final String wavFile = "/data/local/tmp/test/LDC93S1.wav";
@@ -64,7 +63,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_basic() {
-DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
m.freeModel();
}
@@ -121,7 +120,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_stt_noLM() {
-DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
String decoded = doSTT(m, false);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -130,7 +129,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_stt_withLM() {
-DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, false);
@@ -140,7 +139,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_sttWithMetadata_noLM() {
-DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
String decoded = doSTT(m, true);
assertEquals("she had your dark suit in greasy wash water all year", decoded);
@@ -149,7 +148,7 @@ public class BasicTest {
@Test
public void loadDeepSpeech_sttWithMetadata_withLM() {
-DeepSpeechModel m = new DeepSpeechModel(modelFile, alphabetFile, BEAM_WIDTH);
+DeepSpeechModel m = new DeepSpeechModel(modelFile, BEAM_WIDTH);
m.enableDecoderWihLM(lmFile, trieFile, LM_ALPHA, LM_BETA);
String decoded = doSTT(m, true);


@@ -20,15 +20,13 @@ public class DeepSpeechModel {
* @constructor
*
* @param modelPath The path to the frozen model graph.
-* @param alphabetPath The path to the configuration file specifying
-* the alphabet used by the network. See alphabet.h.
* @param beam_width The beam width used by the decoder. A larger beam
* width generates better results at the cost of decoding
* time.
*/
-public DeepSpeechModel(String modelPath, String alphabetPath, int beam_width) {
+public DeepSpeechModel(String modelPath, int beam_width) {
this._mspp = impl.new_modelstatep();
-impl.CreateModel(modelPath, alphabetPath, beam_width, this._mspp);
+impl.CreateModel(modelPath, beam_width, this._mspp);
this._msp = impl.modelstatep_value(this._mspp);
}


@@ -17,7 +17,7 @@ Once everything is installed, you can then use the `deepspeech` binary to do spe
pip3 install deepspeech
-deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
@@ -27,7 +27,7 @@ Alternatively, quicker inference can be performed using a supported NVIDIA GPU o
pip3 install deepspeech-gpu
-deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
@@ -223,7 +223,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
```bash
-deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
+deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
@@ -290,7 +290,7 @@ Note: the following command assumes you `downloaded the pre-trained model <#gett
```bash
-./deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio audio_input.wav
+./deepspeech --model models/output_graph.pbmm --lm models/lm.binary --trie models/trie --audio audio_input.wav
```


@@ -29,7 +29,6 @@ VersionAction.prototype.call = function(parser) {
var parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'});
-parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'});
parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'});
parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'});
parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'});
@@ -55,7 +54,7 @@ function metadataToString(metadata) {
console.error('Loading model from file %s', args['model']);
const model_load_start = process.hrtime();
-var model = new Ds.Model(args['model'], args['alphabet'], args['beam_width']);
+var model = new Ds.Model(args['model'], args['beam_width']);
const model_load_end = process.hrtime(model_load_start);
console.error('Loaded model in %ds.', totalTime(model_load_end));


@@ -25,7 +25,6 @@ if (process.platform === 'win32') {
* An object providing an interface to a trained DeepSpeech model.
*
* @param {string} aModelPath The path to the frozen model graph.
-* @param {string} aAlphabetConfigPath The path to the configuration file specifying the alphabet used by the network. See alphabet.h.
* @param {number} aBeamWidth The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
*
* @throws on error


@@ -30,9 +30,6 @@ class Model(object):
:param aModelPath: Path to model file to load
:type aModelPath: str
-:param aAlphabetConfigPath: Path to alphabet file to load
-:type aAlphabetConfigPath: str
:param aBeamWidth: Decoder beam width
:type aBeamWidth: int
"""


@@ -46,8 +46,6 @@ def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
-parser.add_argument('--alphabet', required=True,
-help='Path to the configuration file specifying the alphabet used by the network')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
@@ -68,7 +66,7 @@ def main():
print('Loading model from file {}'.format(args.model), file=sys.stderr)
model_load_start = timer()
-ds = Model(args.model, args.alphabet, args.beam_width)
+ds = Model(args.model, args.beam_width)
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

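A minimal end-to-end sketch of the Python client API after this change. File paths and the beam width are illustrative, and it assumes the 0.6-era package where `stt()` takes only the audio buffer (consistent with the `sampleRate()` call in the Node example above).

```python
import wave

import numpy as np
from deepspeech import Model

# The alphabet now lives inside the model file, so Model() takes just the
# graph path and the beam width. Paths and beam width are illustrative.
ds = Model('output_graph.pbmm', 500)
ds.enableDecoderWithLM('lm.binary', 'trie', 0.75, 1.85)  # optional language model

# 16-bit, 16 kHz, mono WAV, as described in the README section above.
with wave.open('audio.wav', 'rb') as fin:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), dtype=np.int16)

print(ds.stt(audio))
```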

@@ -25,8 +25,6 @@ def main():
parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
parser.add_argument('--model', required=True,
help='Path to the model (protocol buffer binary file)')
-parser.add_argument('--alphabet', required=True,
-help='Path to the configuration file specifying the alphabet used by the network')
parser.add_argument('--lm', nargs='?',
help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
@@ -37,7 +35,7 @@ def main():
help='Second audio file to use in interleaved streams')
args = parser.parse_args()
-ds = Model(args.model, args.alphabet, BEAM_WIDTH)
+ds = Model(args.model, BEAM_WIDTH)
if args.lm and args.trie:
ds.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA)


@@ -30,7 +30,7 @@ then:
image: ${build.docker_image}
env:
-DEEPSPEECH_MODEL: "https://github.com/lissyx/DeepSpeech/releases/download/test-model-0.6.0a10/models.tar.gz"
+DEEPSPEECH_MODEL: "https://github.com/reuben/DeepSpeech/releases/download/v0.6.0-alpha.11/models.tar.gz"
DEEPSPEECH_AUDIO: "https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/audio-0.4.1.tar.gz"
PIP_DEFAULT_TIMEOUT: "60"


@@ -21,7 +21,6 @@ exec_benchmark()
--dir /tmp/bench-ds/ \
--models ${model_file} \
--wav /tmp/LDC93S1.wav \
---alphabet /tmp/alphabet.txt \
--lm_binary /tmp/lm.binary \
--trie /tmp/trie \
--csv ${csv}
@@ -30,7 +29,6 @@ exec_benchmark()
--dir /tmp/bench-ds-nolm/ \
--models ${model_file} \
--wav /tmp/LDC93S1.wav \
---alphabet /tmp/alphabet.txt \
--csv ${csv_nolm}
python ${DS_ROOT_TASK}/DeepSpeech/ds/bin/benchmark_plotter.py \


@ -309,12 +309,12 @@ check_runtime_electronjs()
run_tflite_basic_inference_tests() run_tflite_basic_inference_tests()
{ {
set +e set +e
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${DATA_TMP_DIR}/${model_name} --alphabet ${DATA_TMP_DIR}/alphabet.txt --audio ${DATA_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${DATA_TMP_DIR}/${model_name} --audio ${DATA_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${DATA_TMP_DIR}/${model_name} --alphabet ${DATA_TMP_DIR}/alphabet.txt --audio ${DATA_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(${DS_BINARY_PREFIX}deepspeech --model ${DATA_TMP_DIR}/${model_name} --audio ${DATA_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
} }
@ -322,22 +322,22 @@ run_tflite_basic_inference_tests()
run_netframework_inference_tests() run_netframework_inference_tests()
{ {
set +e set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended yes 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended yes 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(DeepSpeechConsole.exe --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
} }
@ -345,22 +345,22 @@ run_netframework_inference_tests()
run_electronjs_inference_tests() run_electronjs_inference_tests()
{ {
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?" assert_working_ldc93s1 "${phrase_pbmodel_nolm}" "$?"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
set -e set -e
assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?" assert_working_ldc93s1_lm "${phrase_pbmodel_withlm}" "$?"
} }
@@ -368,25 +368,25 @@ run_electronjs_inference_tests()
run_basic_inference_tests() run_basic_inference_tests()
{ {
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --extended 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e set +e
phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm}" "$status"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm}" "$status"
@@ -397,24 +397,24 @@ run_all_inference_tests()
run_basic_inference_tests run_basic_inference_tests
set +e set +e
phrase_pbmodel_nolm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_nolm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status" assert_correct_ldc93s1 "${phrase_pbmodel_nolm_stereo_44k}" "$status"
set +e set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_stereo_44k}" "$status"
set +e set +e
phrase_pbmodel_nolm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_nolm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_nolm_mono_8k}"
set +e set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
} }
@@ -424,7 +424,6 @@ run_prod_concurrent_stream_tests()
set +e set +e
output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \ output=$(python ${TASKCLUSTER_TMP_DIR}/test_sources/concurrent_streams.py \
--model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \ --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} \
--alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt \
--lm ${TASKCLUSTER_TMP_DIR}/lm.binary \ --lm ${TASKCLUSTER_TMP_DIR}/lm.binary \
--trie ${TASKCLUSTER_TMP_DIR}/trie \ --trie ${TASKCLUSTER_TMP_DIR}/trie \
--audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \ --audio1 ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \
@@ -442,25 +441,25 @@ run_prod_concurrent_stream_tests()
run_prod_inference_tests() run_prod_inference_tests()
{ {
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status"
set +e set +e
phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status" assert_correct_ldc93s1_prodmodel "${phrase_pbmodel_withlm}" "$status"
set +e set +e
phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr) phrase_pbmodel_withlm_stereo_44k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_2_44100.wav 2>${TASKCLUSTER_TMP_DIR}/stderr)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status" assert_correct_ldc93s1_prodmodel_stereo_44k "${phrase_pbmodel_withlm_stereo_44k}" "$status"
set +e set +e
phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null) phrase_pbmodel_withlm_mono_8k=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1_pcms16le_1_8000.wav 2>&1 1>/dev/null)
set -e set -e
assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}" assert_correct_warning_upsampling "${phrase_pbmodel_withlm_mono_8k}"
} }
@@ -468,13 +467,13 @@ run_prod_inference_tests()
run_multi_inference_tests() run_multi_inference_tests()
{ {
set +e -o pipefail set +e -o pipefail
multi_phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%') multi_phrase_pbmodel_nolm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
status=$? status=$?
set -e +o pipefail set -e +o pipefail
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status" assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_nolm}" "$status"
set +e -o pipefail set +e -o pipefail
multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%') multi_phrase_pbmodel_withlm=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/ 2>${TASKCLUSTER_TMP_DIR}/stderr | tr '\n' '%')
status=$? status=$?
set -e +o pipefail set -e +o pipefail
assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status" assert_correct_multi_ldc93s1 "${multi_phrase_pbmodel_withlm}" "$status"
@@ -483,7 +482,7 @@ run_multi_inference_tests()
run_cpp_only_inference_tests() run_cpp_only_inference_tests()
{ {
set +e set +e
phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --alphabet ${TASKCLUSTER_TMP_DIR}/alphabet.txt --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1) phrase_pbmodel_withlm_intermediate_decode=$(deepspeech --model ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} --lm ${TASKCLUSTER_TMP_DIR}/lm.binary --trie ${TASKCLUSTER_TMP_DIR}/trie --audio ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav --stream 1280 2>${TASKCLUSTER_TMP_DIR}/stderr | tail -n 1)
status=$? status=$?
set -e set -e
assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status" assert_correct_ldc93s1_lm "${phrase_pbmodel_withlm_intermediate_decode}" "$status"
@@ -566,7 +565,6 @@ download_data()
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}" ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source}"
${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}" ${WGET} -P "${TASKCLUSTER_TMP_DIR}" "${model_source_mmap}"
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/ cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/*.wav ${TASKCLUSTER_TMP_DIR}/
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/alphabet.txt ${TASKCLUSTER_TMP_DIR}/alphabet.txt
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.pruned.lm ${TASKCLUSTER_TMP_DIR}/lm.binary
cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie cp ${DS_ROOT_TASK}/DeepSpeech/ds/data/smoke_test/vocab.trie ${TASKCLUSTER_TMP_DIR}/trie
cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources cp -R ${DS_ROOT_TASK}/DeepSpeech/ds/native_client/test ${TASKCLUSTER_TMP_DIR}/test_sources
@@ -579,7 +577,7 @@ download_material()
download_native_client_files "${target_dir}" download_native_client_files "${target_dir}"
download_data download_data
ls -hal ${TASKCLUSTER_TMP_DIR}/${model_name} ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} ${TASKCLUSTER_TMP_DIR}/LDC93S1*.wav ${TASKCLUSTER_TMP_DIR}/alphabet.txt ls -hal ${TASKCLUSTER_TMP_DIR}/${model_name} ${TASKCLUSTER_TMP_DIR}/${model_name_mmap} ${TASKCLUSTER_TMP_DIR}/LDC93S1*.wav
} }
download_benchmark_model() download_benchmark_model()
@@ -1595,7 +1593,6 @@ android_setup_ndk_data()
adb push \ adb push \
${TASKCLUSTER_TMP_DIR}/${model_name} \ ${TASKCLUSTER_TMP_DIR}/${model_name} \
${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \ ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \
${TASKCLUSTER_TMP_DIR}/alphabet.txt \
${ANDROID_TMP_DIR}/ds/ ${ANDROID_TMP_DIR}/ds/
} }
@@ -1606,7 +1603,6 @@ android_setup_apk_data()
adb push \ adb push \
${TASKCLUSTER_TMP_DIR}/${model_name} \ ${TASKCLUSTER_TMP_DIR}/${model_name} \
${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \ ${TASKCLUSTER_TMP_DIR}/LDC93S1.wav \
${TASKCLUSTER_TMP_DIR}/alphabet.txt \
${TASKCLUSTER_TMP_DIR}/lm.binary \ ${TASKCLUSTER_TMP_DIR}/lm.binary \
${TASKCLUSTER_TMP_DIR}/trie \ ${TASKCLUSTER_TMP_DIR}/trie \
${ANDROID_TMP_DIR}/test/ ${ANDROID_TMP_DIR}/test/
@@ -133,7 +133,6 @@ def create_flags():
# Decoder # Decoder
f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.') f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
f.DEFINE_alias('alphabet', 'alphabet_config_path')
f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM') f.DEFINE_string('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
f.DEFINE_alias('lm', 'lm_binary_path') f.DEFINE_alias('lm', 'lm_binary_path')
f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie') f.DEFINE_string('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')