diff --git a/.circleci/config.yml b/.circleci/config.yml index 14d0fb0f..9e7f804d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,6 @@ # These environment variables must be set in CircleCI UI # -# DOCKERHUB_MOZILLA_VOICE_REPO - docker hub repo, format: / +# DOCKERHUB_REPO - docker hub repo, format: / # DOCKER_USER - login info for docker hub # DOCKER_PASS # @@ -28,8 +28,8 @@ jobs: name: Create a Dockerfile.train command: | make Dockerfile.train \ - MOZILLA_VOICE_STT_REPO="https://github.com/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME" \ - MOZILLA_VOICE_STT_SHA=$CIRCLE_SHA1 + DEEPSPEECH_REPO="https://github.com/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME" \ + DEEPSPEECH_SHA=$CIRCLE_SHA1 - run: name: Build Docker image @@ -62,14 +62,14 @@ jobs: echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin # deploy master if [ "${CIRCLE_BRANCH}" == "master" ]; then - docker tag app:build ${DOCKERHUB_MOZILLA_VOICE_REPO}:latest - docker push ${DOCKERHUB_MOZILLA_VOICE_REPO}:latest + docker tag app:build ${DOCKERHUB_REPO}:latest + docker push ${DOCKERHUB_REPO}:latest elif [ ! -z "${CIRCLE_TAG}" ]; then # deploy a release tag... - echo "${DOCKERHUB_MOZILLA_VOICE_REPO}:${CIRCLE_TAG}" - docker tag app:build "${DOCKERHUB_MOZILLA_VOICE_REPO}:${CIRCLE_TAG}" + echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}" + docker tag app:build "${DOCKERHUB_REPO}:${CIRCLE_TAG}" docker images - docker push "${DOCKERHUB_MOZILLA_VOICE_REPO}:${CIRCLE_TAG}" + docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}" fi workflows: diff --git a/.gitmodules b/.gitmodules index 61b8bdc8..70cd92a1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "doc/examples"] path = doc/examples - url = https://github.com/mozilla/STT-examples.git + url = https://github.com/mozilla/DeepSpeech-examples.git branch = master [submodule "tensorflow"] path = tensorflow diff --git a/.taskcluster.yml b/.taskcluster.yml index d761ff44..ab816850 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -23,8 +23,8 @@ tasks: scopes: [ "queue:create-task:highest:proj-deepspeech/*", - "queue:route:index.project.mozilla-voice-stt.*", - "index:insert-task:project.mozilla-voice-stt.*", + "queue:route:index.project.deepspeech.*", + "index:insert-task:project.deepspeech.*", "queue:scheduler-id:taskcluster-github", "generic-worker:cache:deepspeech-macos-pyenv", "docker-worker:capability:device:kvm" diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 0640e27d..19b14d27 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -1,5 +1,5 @@ This file contains a list of papers in chronological order that have been published -using Mozilla Voice STT. +using Mozilla's DeepSpeech. To appear ========== diff --git a/DeepSpeech.py b/DeepSpeech.py index 39740870..0fa4ae8a 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -4,7 +4,7 @@ from __future__ import absolute_import, division, print_function if __name__ == '__main__': try: - from mozilla_voice_stt_training import train as ds_train + from deepspeech_training import train as ds_train except ImportError: print('Training package is not installed. 
See training documentation.') raise diff --git a/Dockerfile.build.tmpl b/Dockerfile.build.tmpl index 12ce1989..f6af29f5 100644 --- a/Dockerfile.build.tmpl +++ b/Dockerfile.build.tmpl @@ -3,8 +3,8 @@ # Need devel version cause we need /usr/include/cudnn.h FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -ENV MOZILLA_VOICE_STT_REPO=#MOZILLA_VOICE_STT_REPO# -ENV MOZILLA_VOICE_STT_SHA=#MOZILLA_VOICE_STT_SHA# +ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# +ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# # >> START Install base software @@ -113,15 +113,15 @@ RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ WORKDIR / -RUN git clone --recursive $MOZILLA_VOICE_STT_REPO -WORKDIR /STT -RUN git checkout $MOZILLA_VOICE_STT_SHA +RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech +WORKDIR /DeepSpeech +RUN git checkout $DEEPSPEECH_SHA RUN git submodule sync tensorflow/ RUN git submodule update --init tensorflow/ # >> START Build and bind -WORKDIR /STT/tensorflow +WORKDIR /DeepSpeech/tensorflow # Fix for not found script https://github.com/tensorflow/tensorflow/issues/471 RUN ./configure @@ -132,7 +132,7 @@ RUN ./configure # passing LD_LIBRARY_PATH is required cause Bazel doesn't pickup it from environment -# Build Mozilla Voice STT +# Build DeepSpeech RUN bazel build \ --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" \ --config=monolithic \ @@ -149,22 +149,22 @@ RUN bazel build \ --copt=-msse4.2 \ --copt=-mavx \ --copt=-fvisibility=hidden \ - //native_client:libmozilla_voice_stt.so \ + //native_client:libdeepspeech.so \ --verbose_failures \ --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -# Copy built libs to /STT/native_client -RUN cp bazel-bin/native_client/libmozilla_voice_stt.so /STT/native_client/ +# Copy built libs to /DeepSpeech/native_client +RUN cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ # Build client.cc and install Python client and decoder bindings -ENV TFDIR /STT/tensorflow +ENV TFDIR /DeepSpeech/tensorflow RUN nproc -WORKDIR /STT/native_client -RUN make NUM_PROCESSES=$(nproc) mozilla_voice_stt +WORKDIR /DeepSpeech/native_client +RUN make NUM_PROCESSES=$(nproc) deepspeech -WORKDIR /STT +WORKDIR /DeepSpeech RUN cd native_client/python && make NUM_PROCESSES=$(nproc) bindings RUN pip3 install --upgrade native_client/python/dist/*.whl @@ -176,8 +176,8 @@ RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl # Allow Python printing utf-8 ENV PYTHONIOENCODING UTF-8 -# Build KenLM in /STT/native_client/kenlm folder -WORKDIR /STT/native_client +# Build KenLM in /DeepSpeech/native_client/kenlm folder +WORKDIR /DeepSpeech/native_client RUN rm -rf kenlm && \ git clone https://github.com/kpu/kenlm && \ cd kenlm && \ @@ -188,4 +188,4 @@ RUN rm -rf kenlm && \ make -j $(nproc) # Done -WORKDIR /STT +WORKDIR /DeepSpeech diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl index 790f35f4..fd0e9fa6 100644 --- a/Dockerfile.train.tmpl +++ b/Dockerfile.train.tmpl @@ -3,8 +3,8 @@ FROM tensorflow/tensorflow:1.15.2-gpu-py3 ENV DEBIAN_FRONTEND=noninteractive -ENV MOZILLA_VOICE_STT_REPO=#MOZILLA_VOICE_STT_REPO# -ENV MOZILLA_VOICE_STT_SHA=#MOZILLA_VOICE_STT_SHA# +ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# +ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# RUN apt-get update && apt-get install -y --no-install-recommends \ apt-utils \ @@ -31,12 +31,10 @@ RUN apt-get install -y --no-install-recommends libopus0 libsndfile1 RUN rm -rf /var/lib/apt/lists/* WORKDIR / -RUN echo git clone $MOZILLA_VOICE_STT_REPO -RUN git clone $MOZILLA_VOICE_STT_REPO +RUN git clone 
$DEEPSPEECH_REPO DeepSpeech -WORKDIR /STT -RUN echo git checkout $MOZILLA_VOICE_STT_SHA -RUN git checkout $MOZILLA_VOICE_STT_SHA +WORKDIR /DeepSpeech +RUN git checkout $DEEPSPEECH_SHA # Build CTC decoder first, to avoid clashes on incompatible versions upgrades RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings @@ -45,7 +43,7 @@ RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl # Prepare deps RUN pip3 install --upgrade pip==20.0.2 wheel==0.34.2 setuptools==46.1.3 -# Install Mozilla Voice STT +# Install DeepSpeech # - No need for the decoder since we did it earlier # - There is already correct TensorFlow GPU installed on the base image, # we don't want to break that @@ -56,7 +54,7 @@ RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \ --artifact convert_graphdef_memmapped_format --target . # Build KenLM to generate new scorers -WORKDIR /STT/native_client +WORKDIR /DeepSpeech/native_client RUN rm -rf kenlm && \ git clone https://github.com/kpu/kenlm && \ cd kenlm && \ @@ -65,6 +63,6 @@ RUN rm -rf kenlm && \ cd build && \ cmake .. && \ make -j $(nproc) -WORKDIR /STT +WORKDIR /DeepSpeech RUN ./bin/run-ldc93s1.sh diff --git a/GRAPH_VERSION b/GRAPH_VERSION index 20665e89..b9a65815 120000 --- a/GRAPH_VERSION +++ b/GRAPH_VERSION @@ -1 +1 @@ -training/mozilla_voice_stt_training/GRAPH_VERSION \ No newline at end of file +training/deepspeech_training/GRAPH_VERSION \ No newline at end of file diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 5b42ee74..08345c3a 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -1,4 +1,4 @@ -For support and discussions, please use our [Discourse forums](https://discourse.mozilla.org/c/mozilla-voice-stt). +For support and discussions, please use our [Discourse forums](https://discourse.mozilla.org/c/deep-speech). If you've found a bug, or have a feature request, then please create an issue with the following information: diff --git a/Makefile b/Makefile index e7f08eb8..2d28d24b 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ -MOZILLA_VOICE_STT_REPO ?= https://github.com/mozilla/STT.git -MOZILLA_VOICE_STT_SHA ?= origin/master +DEEPSPEECH_REPO ?= https://github.com/mozilla/DeepSpeech.git +DEEPSPEECH_SHA ?= origin/master Dockerfile%: Dockerfile%.tmpl sed \ - -e "s|#MOZILLA_VOICE_STT_REPO#|$(MOZILLA_VOICE_STT_REPO)|g" \ - -e "s|#MOZILLA_VOICE_STT_SHA#|$(MOZILLA_VOICE_STT_SHA)|g" \ + -e "s|#DEEPSPEECH_REPO#|$(DEEPSPEECH_REPO)|g" \ + -e "s|#DEEPSPEECH_SHA#|$(DEEPSPEECH_SHA)|g" \ < $< > $@ diff --git a/README.rst b/README.rst index e5bf7b9d..9c1b987e 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,22 @@ -Mozilla Voice STT -================= +Project DeepSpeech +================== .. image:: https://readthedocs.org/projects/deepspeech/badge/?version=latest - :target: http://mozilla-voice-stt.readthedocs.io/?badge=latest + :target: http://deepspeech.readthedocs.io/?badge=latest :alt: Documentation -.. image:: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/STT/master/badge.svg - :target: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/STT/master/latest +.. image:: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/DeepSpeech/master/badge.svg + :target: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/DeepSpeech/master/latest :alt: Task Status -Mozilla Voice STT is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. 
Mozilla Voice STT uses Google's `TensorFlow `_ to make the implementation easier. +DeepSpeech is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. Project DeepSpeech uses Google's `TensorFlow `_ to make the implementation easier. -Documentation for installation, usage, and training models are available on `mozilla-voice-stt.readthedocs.io `_. +Documentation for installation, usage, and training models are available on `deepspeech.readthedocs.io `_. -For the latest release, including pre-trained models and checkpoints, `see the latest release on GitHub `_. +For the latest release, including pre-trained models and checkpoints, `see the latest release on GitHub `_. For contribution guidelines, see `CONTRIBUTING.rst `_. diff --git a/SUPPORT.rst b/SUPPORT.rst index ea93626e..d72a7418 100644 --- a/SUPPORT.rst +++ b/SUPPORT.rst @@ -5,8 +5,8 @@ Contact/Getting Help There are several ways to contact us or to get help: -#. `Discourse Forums `_ - The `Deep Speech category on Discourse `_ is the first place to look. Search for keywords related to your question or problem to see if someone else has run into it already. If you can't find anything relevant there, search on our `issue tracker `_ to see if there is an existing issue about your problem. +#. `Discourse Forums `_ - The `Deep Speech category on Discourse `_ is the first place to look. Search for keywords related to your question or problem to see if someone else has run into it already. If you can't find anything relevant there, search on our `issue tracker `_ to see if there is an existing issue about your problem. -#. `Matrix chat `_ - If your question is not addressed by either the `FAQ `_ or `Discourse Forums `_\ , you can contact us on the ``#machinelearning`` channel on `Mozilla Matrix `_\ ; people there can try to answer/help +#. `Matrix chat `_ - If your question is not addressed by either the `FAQ `_ or `Discourse Forums `_\ , you can contact us on the ``#machinelearning`` channel on `Mozilla Matrix `_\ ; people there can try to answer/help -#. `Create a new issue `_ - Finally, if you have a bug report or a feature request that isn't already covered by an existing issue, please open an issue in our repo and fill the appropriate information on your hardware and software setup. +#. `Create a new issue `_ - Finally, if you have a bug report or a feature request that isn't already covered by an existing issue, please open an issue in our repo and fill the appropriate information on your hardware and software setup. 
diff --git a/VERSION b/VERSION index 941b2c33..8a3ed242 120000 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -training/mozilla_voice_stt_training/VERSION \ No newline at end of file +training/deepspeech_training/VERSION \ No newline at end of file diff --git a/bin/compare_samples.py b/bin/compare_samples.py index 934a26f6..94108a7a 100755 --- a/bin/compare_samples.py +++ b/bin/compare_samples.py @@ -5,8 +5,8 @@ Tool for comparing two wav samples import sys import argparse -from mozilla_voice_stt_training.util.audio import AUDIO_TYPE_NP, mean_dbfs -from mozilla_voice_stt_training.util.sample_collections import load_sample +from deepspeech_training.util.audio import AUDIO_TYPE_NP, mean_dbfs +from deepspeech_training.util.sample_collections import load_sample def fail(message): diff --git a/bin/data_set_tool.py b/bin/data_set_tool.py index 4bfe7bd8..604684b9 100755 --- a/bin/data_set_tool.py +++ b/bin/data_set_tool.py @@ -8,20 +8,20 @@ import argparse import progressbar from pathlib import Path -from mozilla_voice_stt_training.util.audio import ( +from deepspeech_training.util.audio import ( AUDIO_TYPE_PCM, AUDIO_TYPE_OPUS, AUDIO_TYPE_WAV, change_audio_types, ) -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR -from mozilla_voice_stt_training.util.sample_collections import ( +from deepspeech_training.util.downloader import SIMPLE_BAR +from deepspeech_training.util.sample_collections import ( CSVWriter, DirectSDBWriter, TarWriter, samples_from_sources, ) -from mozilla_voice_stt_training.util.augmentations import ( +from deepspeech_training.util.augmentations import ( parse_augmentations, apply_sample_augmentations, SampleAugmentation diff --git a/bin/import_aidatatang.py b/bin/import_aidatatang.py index 34769482..c53eba09 100755 --- a/bin/import_aidatatang.py +++ b/bin/import_aidatatang.py @@ -5,7 +5,7 @@ import tarfile import pandas -from mozilla_voice_stt_training.util.importers import get_importers_parser +from deepspeech_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_aishell.py b/bin/import_aishell.py index 4972afb8..341d0d88 100755 --- a/bin/import_aishell.py +++ b/bin/import_aishell.py @@ -5,7 +5,7 @@ import tarfile import pandas -from mozilla_voice_stt_training.util.importers import get_importers_parser +from deepspeech_training.util.importers import get_importers_parser COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_cv.py b/bin/import_cv.py index 4af9d9bd..e7dab564 100755 --- a/bin/import_cv.py +++ b/bin/import_cv.py @@ -9,13 +9,13 @@ from multiprocessing import Pool import progressbar import sox -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, print_import_report, ) -from mozilla_voice_stt_training.util.importers import validate_label_eng as validate_label +from deepspeech_training.util.importers import validate_label_eng as validate_label FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 diff --git a/bin/import_cv2.py b/bin/import_cv2.py index 30e02714..d6c8c192 100755 --- a/bin/import_cv2.py +++ b/bin/import_cv2.py @@ -15,15 +15,15 @@ from multiprocessing import Pool import progressbar import sox -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR -from 
mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, get_validate_label, print_import_report, ) -from mvs_ctcdecoder import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 diff --git a/bin/import_fisher.py b/bin/import_fisher.py index 89b72c75..0634c860 100755 --- a/bin/import_fisher.py +++ b/bin/import_fisher.py @@ -10,7 +10,7 @@ import librosa import pandas import soundfile # <= Has an external dependency on libsndfile -from mozilla_voice_stt_training.util.importers import validate_label_eng as validate_label +from deepspeech_training.util.importers import validate_label_eng as validate_label # Prerequisite: Having the sph2pipe tool in your PATH: # https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools diff --git a/bin/import_freestmandarin.py b/bin/import_freestmandarin.py index 10cf2e5c..55ce9128 100755 --- a/bin/import_freestmandarin.py +++ b/bin/import_freestmandarin.py @@ -6,7 +6,7 @@ import tarfile import numpy as np import pandas -from mozilla_voice_stt_training.util.importers import get_importers_parser +from deepspeech_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_gram_vaani.py b/bin/import_gram_vaani.py index 4e91a67a..71fcee08 100755 --- a/bin/import_gram_vaani.py +++ b/bin/import_gram_vaani.py @@ -12,7 +12,7 @@ import pandas as pd from sox import Transformer import swifter -from mozilla_voice_stt_training.util.importers import get_importers_parser, get_validate_label +from deepspeech_training.util.importers import get_importers_parser, get_validate_label __version__ = "0.1.0" _logger = logging.getLogger(__name__) diff --git a/bin/import_ldc93s1.py b/bin/import_ldc93s1.py index 1bac085f..86a00d74 100755 --- a/bin/import_ldc93s1.py +++ b/bin/import_ldc93s1.py @@ -4,7 +4,7 @@ import sys import pandas -from mozilla_voice_stt_training.util.downloader import maybe_download +from deepspeech_training.util.downloader import maybe_download def _download_and_preprocess_data(data_dir): diff --git a/bin/import_librivox.py b/bin/import_librivox.py index 0f77a7ab..32c1d20a 100755 --- a/bin/import_librivox.py +++ b/bin/import_librivox.py @@ -12,7 +12,7 @@ import progressbar from sox import Transformer from tensorflow.python.platform import gfile -from mozilla_voice_stt_training.util.downloader import maybe_download +from deepspeech_training.util.downloader import maybe_download SAMPLE_RATE = 16000 diff --git a/bin/import_lingua_libre.py b/bin/import_lingua_libre.py index 8c262f97..956d7a0b 100755 --- a/bin/import_lingua_libre.py +++ b/bin/import_lingua_libre.py @@ -12,15 +12,15 @@ from multiprocessing import Pool import progressbar import sox -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, get_validate_label, print_import_report, ) -from mvs_ctcdecoder import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 diff --git a/bin/import_m-ailabs.py b/bin/import_m-ailabs.py index ad637a25..bbaa744b 100755 
--- a/bin/import_m-ailabs.py +++ b/bin/import_m-ailabs.py @@ -10,15 +10,15 @@ from multiprocessing import Pool import progressbar -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, get_validate_label, print_import_report, ) -from mvs_ctcdecoder import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 diff --git a/bin/import_magicdata.py b/bin/import_magicdata.py index b7205c1f..c8502784 100755 --- a/bin/import_magicdata.py +++ b/bin/import_magicdata.py @@ -6,7 +6,7 @@ import wave import pandas -from mozilla_voice_stt_training.util.importers import get_importers_parser +from deepspeech_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_primewords.py b/bin/import_primewords.py index bedc3a85..08f3302a 100755 --- a/bin/import_primewords.py +++ b/bin/import_primewords.py @@ -7,7 +7,7 @@ import tarfile import numpy as np import pandas -from mozilla_voice_stt_training.util.importers import get_importers_parser +from deepspeech_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_slr57.py b/bin/import_slr57.py index 68e68428..57588696 100755 --- a/bin/import_slr57.py +++ b/bin/import_slr57.py @@ -9,15 +9,15 @@ from multiprocessing import Pool import progressbar -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, get_validate_label, print_import_report, ) -from mvs_ctcdecoder import Alphabet +from ds_ctcdecoder import Alphabet FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 diff --git a/bin/import_swb.py b/bin/import_swb.py index e8497114..11e43f31 100755 --- a/bin/import_swb.py +++ b/bin/import_swb.py @@ -16,7 +16,7 @@ import pandas import requests import soundfile # <= Has an external dependency on libsndfile -from mozilla_voice_stt_training.util.importers import validate_label_eng as validate_label +from deepspeech_training.util.importers import validate_label_eng as validate_label # ARCHIVE_NAME refers to ISIP alignments from 01/29/03 ARCHIVE_NAME = "switchboard_word_alignments.tar.gz" diff --git a/bin/import_swc.py b/bin/import_swc.py index 2b2ec3b8..3775de05 100755 --- a/bin/import_swc.py +++ b/bin/import_swc.py @@ -22,9 +22,9 @@ from multiprocessing.pool import ThreadPool import progressbar import sox -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import validate_label_eng as validate_label -from mvs_ctcdecoder import Alphabet +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import validate_label_eng as validate_label +from ds_ctcdecoder import Alphabet SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.tar" SWC_ARCHIVE = "SWC_{language}.tar" diff --git a/bin/import_ted.py b/bin/import_ted.py index 
0e185ac6..bad1452f 100755 --- a/bin/import_ted.py +++ b/bin/import_ted.py @@ -10,8 +10,8 @@ import pandas from sox import Transformer from tensorflow.python.platform import gfile -from mozilla_voice_stt_training.util.downloader import maybe_download -from mozilla_voice_stt_training.util.stm import parse_stm_file +from deepspeech_training.util.downloader import maybe_download +from deepspeech_training.util.stm import parse_stm_file def _download_and_preprocess_data(data_dir): diff --git a/bin/import_ts.py b/bin/import_ts.py index 373e86c7..e0130130 100755 --- a/bin/import_ts.py +++ b/bin/import_ts.py @@ -10,8 +10,8 @@ import progressbar import sox import unidecode -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, diff --git a/bin/import_tuda.py b/bin/import_tuda.py index 32e16963..da0cb42b 100755 --- a/bin/import_tuda.py +++ b/bin/import_tuda.py @@ -14,9 +14,9 @@ from collections import Counter import progressbar -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import validate_label_eng as validate_label -from mvs_ctcdecoder import Alphabet +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import validate_label_eng as validate_label +from ds_ctcdecoder import Alphabet TUDA_VERSION = "v2" TUDA_PACKAGE = "german-speechdata-package-{}".format(TUDA_VERSION) diff --git a/bin/import_vctk.py b/bin/import_vctk.py index 6d673020..f9c86799 100755 --- a/bin/import_vctk.py +++ b/bin/import_vctk.py @@ -11,8 +11,8 @@ from zipfile import ZipFile import librosa import progressbar -from mozilla_voice_stt_training.util.downloader import SIMPLE_BAR, maybe_download -from mozilla_voice_stt_training.util.importers import ( +from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download +from deepspeech_training.util.importers import ( get_counter, get_imported_samples, print_import_report, diff --git a/bin/import_voxforge.py b/bin/import_voxforge.py index 92dadc82..16195a8e 100755 --- a/bin/import_voxforge.py +++ b/bin/import_voxforge.py @@ -13,7 +13,7 @@ from os import makedirs, path import pandas from bs4 import BeautifulSoup from tensorflow.python.platform import gfile -from mozilla_voice_stt_training.util.downloader import maybe_download +from deepspeech_training.util.downloader import maybe_download """The number of jobs to run in parallel""" NUM_PARALLEL = 8 diff --git a/bin/play.py b/bin/play.py index bffc581e..1e8c59ca 100755 --- a/bin/play.py +++ b/bin/play.py @@ -9,9 +9,9 @@ import sys import random import argparse -from mozilla_voice_stt_training.util.audio import LOADABLE_AUDIO_EXTENSIONS, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV -from mozilla_voice_stt_training.util.sample_collections import SampleList, LabeledSample, samples_from_source -from mozilla_voice_stt_training.util.augmentations import parse_augmentations, apply_sample_augmentations, SampleAugmentation +from deepspeech_training.util.audio import LOADABLE_AUDIO_EXTENSIONS, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV +from deepspeech_training.util.sample_collections import SampleList, LabeledSample, samples_from_source +from deepspeech_training.util.augmentations import parse_augmentations, apply_sample_augmentations, SampleAugmentation 
def get_samples_in_play_order(): diff --git a/data/README.rst b/data/README.rst index 1d841060..f731a31c 100644 --- a/data/README.rst +++ b/data/README.rst @@ -3,9 +3,9 @@ Language-Specific Data This directory contains language-specific data files. Most importantly, you will find here: -1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt``. After installing the training code, you can check ``python -m mozilla_voice_stt_training.util.check_characters --help`` for a tool that creates an alphabet file from a list of training CSV files. +1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt``. After installing the training code, you can check ``python -m deepspeech_training.util.check_characters --help`` for a tool that creates an alphabet file from a list of training CSV files. 2. A script used to generate a binary n-gram language model: ``data/lm/generate_lm.py``. -For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `mozilla-voice-stt.readthedocs.io `_. +For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `deepspeech.readthedocs.io `_. diff --git a/doc/BUILDING.rst b/doc/BUILDING.rst index a6e09f9b..59f1a3b9 100644 --- a/doc/BUILDING.rst +++ b/doc/BUILDING.rst @@ -1,12 +1,12 @@ .. _build-native-client: -Building Mozilla Voice STT Binaries -=================================== +Building DeepSpeech Binaries +============================ This section describes how to rebuild binaries. We have already several prebuilt binaries for all the supported platform, it is highly advised to use them except if you know what you are doing. -If you'd like to build the Mozilla Voice STT binaries yourself, you'll need the following pre-requisites downloaded and installed: +If you'd like to build the DeepSpeech binaries yourself, you'll need the following pre-requisites downloaded and installed: * `Bazel 3.1.0 `_ * `General TensorFlow r2.3 requirements `_ @@ -26,18 +26,18 @@ If you'd like to build the language bindings or the decoder package, you'll also Dependencies ------------ -If you follow these instructions, you should compile your own binaries of Mozilla Voice STT (built on TensorFlow using Bazel). +If you follow these instructions, you should compile your own binaries of DeepSpeech (built on TensorFlow using Bazel). For more information on configuring TensorFlow, read the docs up to the end of `"Configure the Build" `_. Checkout source code ^^^^^^^^^^^^^^^^^^^^ -Clone Mozilla Voice STT source code (TensorFlow will come as a submdule): +Clone DeepSpeech source code (TensorFlow will come as a submdule): .. code-block:: - git clone https://github.com/mozilla/STT.git + git clone https://github.com/mozilla/DeepSpeech.git git submodule sync tensorflow/ git submodule update --init tensorflow/ @@ -56,24 +56,24 @@ After you have installed the correct version of Bazel, configure TensorFlow: cd tensorflow ./configure -Compile Mozilla Voice STT -------------------------- +Compile DeepSpeech +------------------ -Compile ``libmozilla_voice_stt.so`` +Compile ``libdeepspeech.so`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Within your TensorFlow directory, there should be a symbolic link to the Mozilla Voice STT ``native_client`` directory. If it is not present, create it with the follow command: +Within your TensorFlow directory, there should be a symbolic link to the DeepSpeech ``native_client`` directory. 
If it is not present, create it with the follow command: .. code-block:: cd tensorflow ln -s ../native_client -You can now use Bazel to build the main Mozilla Voice STT library, ``libmozilla_voice_stt.so``. Add ``--config=cuda`` if you want a CUDA build. +You can now use Bazel to build the main DeepSpeech library, ``libdeepspeech.so``. Add ``--config=cuda`` if you want a CUDA build. .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so The generated binaries will be saved to ``bazel-bin/native_client/``. @@ -82,12 +82,12 @@ The generated binaries will be saved to ``bazel-bin/native_client/``. Compile ``generate_scorer_package`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Following the same setup as for ``libmozilla_voice_stt.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``. +Following the same setup as for ``libdeepspeech.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``. Using the example from above you can build the library and that binary at the same time: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libmozilla_voice_stt.so //native_client:generate_scorer_package + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_scorer_package The generated binaries will be saved to ``bazel-bin/native_client/``. @@ -99,7 +99,7 @@ Now, ``cd`` into the ``DeepSpeech/native_client`` directory and use the ``Makefi .. code-block:: cd ../DeepSpeech/native_client - make mozilla_voice_stt + make deepspeech Installing your own Binaries ---------------------------- @@ -121,9 +121,9 @@ Included are a set of generated Python bindings. After following the above build cd native_client/python make bindings - pip install dist/mozilla_voice_stt* + pip install dist/deepspeech* -The API mirrors the C++ API and is demonstrated in `client.py `_. Refer to the `C API ` for documentation. +The API mirrors the C++ API and is demonstrated in `client.py `_. Refer to `deepspeech.h `_ for documentation. Install NodeJS / ElectronJS bindings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -136,12 +136,12 @@ After following the above build and installation instructions, the Node.JS bindi make build make npm-pack -This will create the package ``mozilla_voice_stt-VERSION.tgz`` in ``native_client/javascript``. +This will create the package ``deepspeech-VERSION.tgz`` in ``native_client/javascript``. Install the CTC decoder package ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To build the ``mvs_ctcdecoder`` package, you'll need the general requirements listed above (in particular SWIG). The command below builds the bindings using eight (8) processes for compilation. 
Adjust the parameter accordingly for more or less parallelism. +To build the ``ds_ctcdecoder`` package, you'll need the general requirements listed above (in particular SWIG). The command below builds the bindings using eight (8) processes for compilation. Adjust the parameter accordingly for more or less parallelism. .. code-block:: @@ -165,23 +165,23 @@ So your command line for ``RPi3`` and ``ARMv7`` should look like: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3 --config=rpi3_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3 --config=rpi3_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so And your command line for ``LePotato`` and ``ARM64`` should look like: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3-armv8 --config=rpi3-armv8_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3-armv8 --config=rpi3-armv8_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so While we test only on RPi3 Raspbian Buster and LePotato ARMBian Buster, anything compatible with ``armv7-a cortex-a53`` or ``armv8-a cortex-a53`` should be fine. -The ``mozilla_voice_stt`` binary can also be cross-built, with ``TARGET=rpi3`` or ``TARGET=rpi3-armv8``. This might require you to setup a system tree using the tool ``multistrap`` and the multitrap configuration files: ``native_client/multistrap_armbian64_buster.conf`` and ``native_client/multistrap_raspbian_buster.conf``. +The ``deepspeech`` binary can also be cross-built, with ``TARGET=rpi3`` or ``TARGET=rpi3-armv8``. This might require you to setup a system tree using the tool ``multistrap`` and the multitrap configuration files: ``native_client/multistrap_armbian64_buster.conf`` and ``native_client/multistrap_raspbian_buster.conf``. The path of the system tree can be overridden from the default values defined in ``definitions.mk`` through the ``RASPBIAN`` ``make`` variable. .. code-block:: cd ../DeepSpeech/native_client - make TARGET= mozilla_voice_stt + make TARGET= deepspeech Android devices support ----------------------- @@ -193,9 +193,9 @@ Please refer to TensorFlow documentation on how to setup the environment to buil Using the library from Android project ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We provide up-to-date and tested STT usable as an ``AAR`` package, +We provide uptodate and tested ``libdeepspeech`` usable as an ``AAR`` package, for Android versions starting with 7.0 to 11.0. The package is published on -`JCenter `_, +`JCenter `_, and the ``JCenter`` repository should be available by default in any Android project. Please make sure your project is setup to pull from this repository. You can then include the library by just adding this line to your @@ -203,43 +203,43 @@ You can then include the library by just adding this line to your .. 
code-block:: - implementation 'voice.mozilla.org:stt:VERSION@aar' + implementation 'deepspeech.mozilla.org:libdeepspeech:VERSION@aar' -Building ``libmozilla_voice_stt.so`` +Building ``libdeepspeech.so`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You can build the ``libmozilla_voice_stt.so`` using (ARMv7): +You can build the ``libdeepspeech.so`` using (ARMv7): .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so Or (ARM64): .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm64 --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm64 --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so -Building ``libmozillavoicestt.aar`` +Building ``libdeepspeech.aar`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In the unlikely event you have to rebuild the JNI bindings, source code is -available under the ``libmozillavoicestt`` subdirectory. Building depends on shared -object: please ensure to place ``libmozilla_voice_stt.so`` into the -``libmozillavoicestt/libs/{arm64-v8a,armeabi-v7a,x86_64}/`` matching subdirectories. +available under the ``libdeepspeech`` subdirectory. Building depends on shared +object: please ensure to place ``libdeepspeech.so`` into the +``libdeepspeech/libs/{arm64-v8a,armeabi-v7a,x86_64}/`` matching subdirectories. Building the bindings is managed by ``gradle`` and should be limited to issuing -``./gradlew libmozillavoicestt:build``, producing an ``AAR`` package in -``./libmozillavoicestt/build/outputs/aar/``. +``./gradlew libdeepspeech:build``, producing an ``AAR`` package in +``./libdeepspeech/build/outputs/aar/``. Please note that you might have to copy the file to a local Maven repository and adapt file naming (when missing, the error message should states what filename it expects and where). -Building C++ ``mozilla_voice_stt`` binary -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Building C++ ``deepspeech`` binary +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Building the ``mozilla_voice_stt`` binary will happen through ``ndk-build`` (ARMv7): +Building the ``deepspeech`` binary will happen through ``ndk-build`` (ARMv7): .. code-block:: @@ -272,13 +272,13 @@ demo of one usage of the application. For example, it's only able to read PCM mono 16kHz 16-bits file and it might fail on some WAVE file that are not following exactly the specification. -Running ``mozilla_voice_stt`` via adb -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Running ``deepspeech`` via adb +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You should use ``adb push`` to send data to device, please refer to Android documentation on how to use that. 
-Please push Mozilla Voice STT data to ``/sdcard/mozilla_voice_stt/``\ , including: +Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including: * ``output_graph.tflite`` which is the TF Lite model @@ -286,18 +286,18 @@ Please push Mozilla Voice STT data to ``/sdcard/mozilla_voice_stt/``\ , includin the scorer; please be aware that too big scorer will make the device run out of memory -Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/stt``\ : +Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ : -* ``mozilla_voice_stt`` -* ``libmozilla_voice_stt.so`` +* ``deepspeech`` +* ``libdeepspeech.so`` * ``libc++_shared.so`` You should then be able to run as usual, using a shell from ``adb shell``\ : .. code-block:: - user@device$ cd /data/local/tmp/stt/ - user@device$ LD_LIBRARY_PATH=$(pwd)/ ./mozilla_voice_stt [...] + user@device$ cd /data/local/tmp/ds/ + user@device$ LD_LIBRARY_PATH=$(pwd)/ ./deepspeech [...] Please note that Android linker does not support ``rpath`` so you have to set ``LD_LIBRARY_PATH``. Properly wrapped / packaged bindings does embed the library diff --git a/doc/C-API.rst b/doc/C-API.rst index bddc7d49..e96f3e12 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -10,59 +10,56 @@ C API See also the list of error codes including descriptions for each error in :ref:`error-codes`. -.. doxygenfunction:: STT_CreateModel +.. doxygenfunction:: DS_CreateModel :project: deepspeech-c -.. doxygenfunction:: STT_FreeModel +.. doxygenfunction:: DS_FreeModel :project: deepspeech-c -.. doxygenfunction:: STT_EnableExternalScorer +.. doxygenfunction:: DS_EnableExternalScorer :project: deepspeech-c -.. doxygenfunction:: STT_DisableExternalScorer +.. doxygenfunction:: DS_DisableExternalScorer :project: deepspeech-c -.. doxygenfunction:: STT_SetScorerAlphaBeta +.. doxygenfunction:: DS_SetScorerAlphaBeta :project: deepspeech-c -.. doxygenfunction:: STT_GetModelSampleRate +.. doxygenfunction:: DS_GetModelSampleRate :project: deepspeech-c -.. doxygenfunction:: STT_SpeechToText +.. doxygenfunction:: DS_SpeechToText :project: deepspeech-c -.. doxygenfunction:: STT_SpeechToTextWithMetadata +.. doxygenfunction:: DS_SpeechToTextWithMetadata :project: deepspeech-c -.. doxygenfunction:: STT_CreateStream +.. doxygenfunction:: DS_CreateStream :project: deepspeech-c -.. doxygenfunction:: STT_FeedAudioContent +.. doxygenfunction:: DS_FeedAudioContent :project: deepspeech-c -.. doxygenfunction:: STT_IntermediateDecode +.. doxygenfunction:: DS_IntermediateDecode :project: deepspeech-c -.. doxygenfunction:: STT_IntermediateDecodeWithMetadata +.. doxygenfunction:: DS_IntermediateDecodeWithMetadata :project: deepspeech-c -.. doxygenfunction:: STT_FinishStream +.. doxygenfunction:: DS_FinishStream :project: deepspeech-c -.. doxygenfunction:: STT_FinishStreamWithMetadata +.. doxygenfunction:: DS_FinishStreamWithMetadata :project: deepspeech-c -.. doxygenfunction:: STT_FreeStream +.. doxygenfunction:: DS_FreeStream :project: deepspeech-c -.. doxygenfunction:: STT_FreeMetadata +.. doxygenfunction:: DS_FreeMetadata :project: deepspeech-c -.. doxygenfunction:: STT_FreeString +.. doxygenfunction:: DS_FreeString :project: deepspeech-c -.. doxygenfunction:: STT_Version - :project: deepspeech-c - -.. doxygenfunction:: STT_ErrorCodeToErrorMessage +.. 
doxygenfunction:: DS_Version :project: deepspeech-c diff --git a/doc/Contributed-Examples.rst b/doc/Contributed-Examples.rst index 15221cc0..7eaba452 100644 --- a/doc/Contributed-Examples.rst +++ b/doc/Contributed-Examples.rst @@ -1,4 +1,4 @@ User contributed examples ========================= -There are also several user contributed examples available on a separate examples repository: `https://github.com/mozilla/STT-examples `_. +There are also several user contributed examples available on a separate examples repository: `https://github.com/mozilla/DeepSpeech-examples `_. diff --git a/doc/Decoder.rst b/doc/Decoder.rst index 1c145e93..c335c317 100644 --- a/doc/Decoder.rst +++ b/doc/Decoder.rst @@ -6,7 +6,7 @@ CTC beam search decoder Introduction ^^^^^^^^^^^^ -Mozilla Voice STT uses the `Connectionist Temporal Classification `_ loss function. For an excellent explanation of CTC and its usage, see this Distill article: `Sequence Modeling with CTC `_. This document assumes the reader is familiar with the concepts described in that article, and describes Mozilla Voice STT specific behaviors that developers building systems with Mozilla Voice STT should know to avoid problems. +DeepSpeech uses the `Connectionist Temporal Classification `_ loss function. For an excellent explanation of CTC and its usage, see this Distill article: `Sequence Modeling with CTC `_. This document assumes the reader is familiar with the concepts described in that article, and describes DeepSpeech specific behaviors that developers building systems with DeepSpeech should know to avoid problems. Note: Documentation for the tooling for creating custom scorer packages is available in :ref:`scorer-scripts`. @@ -16,19 +16,19 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S External scorer ^^^^^^^^^^^^^^^ -Mozilla Voice STT clients support OPTIONAL use of an external language model to improve the accuracy of the predicted transcripts. In the code, command line parameters, and documentation, this is referred to as a "scorer". The scorer is used to compute the likelihood (also called a score, hence the name "scorer") of sequences of words or characters in the output, to guide the decoder towards more likely results. This improves accuracy significantly. +DeepSpeech clients support OPTIONAL use of an external language model to improve the accuracy of the predicted transcripts. In the code, command line parameters, and documentation, this is referred to as a "scorer". The scorer is used to compute the likelihood (also called a score, hence the name "scorer") of sequences of words or characters in the output, to guide the decoder towards more likely results. This improves accuracy significantly. -The use of an external scorer is fully optional. When an external scorer is not specified, Mozilla Voice STT still uses a beam search decoding algorithm, but without any outside scoring. +The use of an external scorer is fully optional. When an external scorer is not specified, DeepSpeech still uses a beam search decoding algorithm, but without any outside scoring. -Currently, the Mozilla Voice STT external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. 
See :ref:`scorer-scripts` for more details on how to reproduce our scorer file as well as create your own. +Currently, the DeepSpeech external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. See :ref:`scorer-scripts` for more details on how to reproduce our scorer file as well as create your own. -The scripts are geared towards replicating the language model files we release as part of `Mozilla Voice STT model releases `_, but modifying them to use different datasets or language model construction parameters should be simple. +The scripts are geared towards replicating the language model files we release as part of `DeepSpeech model releases `_, but modifying them to use different datasets or language model construction parameters should be simple. Decoding modes ^^^^^^^^^^^^^^ -Mozilla Voice STT currently supports two modes of operation with significant differences at both training and decoding time. Note that Bytes output mode is experimental and has not been tested for languages other than Chinese Mandarin. +DeepSpeech currently supports two modes of operation with significant differences at both training and decoding time. Note that Bytes output mode is experimental and has not been tested for languages other than Chinese Mandarin. Default mode (alphabet based) diff --git a/doc/AcousticModel.rst b/doc/DeepSpeech.rst similarity index 88% rename from doc/AcousticModel.rst rename to doc/DeepSpeech.rst index cf70af2e..3d74d22e 100644 --- a/doc/AcousticModel.rst +++ b/doc/DeepSpeech.rst @@ -1,5 +1,11 @@ -Mozilla Voice STT Acoustic Model -================================ +DeepSpeech Model +================ + +The aim of this project is to create a simple, open, and ubiquitous speech +recognition engine. Simple, in that the engine should not require server-class +hardware to execute. Open, in that the code and models are released under the +Mozilla Public License. Ubiquitous, in that the engine should run on many +platforms and have bindings to many different languages. The architecture of the engine was originally motivated by that presented in `Deep Speech: Scaling up end-to-end speech recognition `_. @@ -71,7 +77,7 @@ with respect to all of the model parameters may be done via back-propagation through the rest of the network. We use the Adam method for training `[3] `_. -The complete LSTM model is illustrated in the figure below. +The complete RNN model is illustrated in the figure below. .. image:: ../images/rnn_fig-624x598.png - :alt: Mozilla Voice STT LSTM + :alt: DeepSpeech BRNN diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 7ec4e18d..92342ded 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -2,17 +2,17 @@ ============== -MozillaVoiceSttModel Class --------------------------- +DeepSpeech Class +---------------- -.. doxygenclass:: MozillaVoiceSttClient::MozillaVoiceSttModel +.. doxygenclass:: DeepSpeechClient::DeepSpeech :project: deepspeech-dotnet :members: -MozillaVoiceSttStream Class ---------------------------- +DeepSpeechStream Class +---------------------- -.. doxygenclass:: MozillaVoiceSttClient::Models::MozillaVoiceSttStream +.. 
doxygenclass:: DeepSpeechClient::Models::DeepSpeechStream :project: deepspeech-dotnet :members: @@ -21,33 +21,33 @@ ErrorCodes See also the main definition including descriptions for each error in :ref:`error-codes`. -.. doxygenenum:: MozillaVoiceSttClient::Enums::ErrorCodes +.. doxygenenum:: DeepSpeechClient::Enums::ErrorCodes :project: deepspeech-dotnet Metadata -------- -.. doxygenclass:: MozillaVoiceSttClient::Models::Metadata +.. doxygenclass:: DeepSpeechClient::Models::Metadata :project: deepspeech-dotnet :members: Transcripts CandidateTranscript ------------------- -.. doxygenclass:: MozillaVoiceSttClient::Models::CandidateTranscript +.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript :project: deepspeech-dotnet :members: Tokens, Confidence TokenMetadata ------------- -.. doxygenclass:: MozillaVoiceSttClient::Models::TokenMetadata +.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata :project: deepspeech-dotnet :members: Text, Timestep, StartTime -IMozillaVoiceSttModel Interface -------------------------------- +DeepSpeech Interface +-------------------- -.. doxygeninterface:: MozillaVoiceSttClient::Interfaces::IMozillaVoiceSttModel +.. doxygeninterface:: DeepSpeechClient::Interfaces::IDeepSpeech :project: deepspeech-dotnet :members: diff --git a/doc/DotNet-Examples.rst b/doc/DotNet-Examples.rst index 749250ba..a00ee833 100644 --- a/doc/DotNet-Examples.rst +++ b/doc/DotNet-Examples.rst @@ -1,12 +1,12 @@ .NET API Usage example ====================== -Examples are from `native_client/dotnet/MozillaVoiceSttConsole/Program.cs`. +Examples are from `native_client/dotnet/DeepSpeechConsole/Program.cs`. Creating a model instance and loading model ------------------------------------------- -.. literalinclude:: ../native_client/dotnet/MozillaVoiceSttConsole/Program.cs +.. literalinclude:: ../native_client/dotnet/DeepSpeechConsole/Program.cs :language: csharp :linenos: :lineno-match: @@ -16,7 +16,7 @@ Creating a model instance and loading model Performing inference -------------------- -.. literalinclude:: ../native_client/dotnet/MozillaVoiceSttConsole/Program.cs +.. literalinclude:: ../native_client/dotnet/DeepSpeechConsole/Program.cs :language: csharp :linenos: :lineno-match: @@ -26,4 +26,4 @@ Performing inference Full source code ---------------- -See :download:`Full source code<../native_client/dotnet/MozillaVoiceSttConsole/Program.cs>`. +See :download:`Full source code<../native_client/dotnet/DeepSpeechConsole/Program.cs>`. diff --git a/doc/Error-Codes.rst b/doc/Error-Codes.rst index 60090c9d..361ca025 100644 --- a/doc/Error-Codes.rst +++ b/doc/Error-Codes.rst @@ -5,7 +5,7 @@ Error codes Below is the definition for all error codes used in the API, their numerical values, and a human readable description. -.. literalinclude:: ../native_client/mozilla_voice_stt.h +.. literalinclude:: ../native_client/deepspeech.h :language: c :start-after: sphinx-doc: error_code_listing_start :end-before: sphinx-doc: error_code_listing_end diff --git a/doc/Flags.rst b/doc/Flags.rst index 05c9ce4d..66b26f0c 100644 --- a/doc/Flags.rst +++ b/doc/Flags.rst @@ -8,7 +8,7 @@ Below you can find the definition of all command-line flags supported by the tra Flags ----- -.. literalinclude:: ../training/mozilla_voice_stt_training/util/flags.py +.. 
literalinclude:: ../training/deepspeech_training/util/flags.py :language: python :linenos: :lineno-match: diff --git a/doc/Java-API.rst b/doc/Java-API.rst index f75297f1..e0c6a7dd 100644 --- a/doc/Java-API.rst +++ b/doc/Java-API.rst @@ -1,29 +1,29 @@ Java ==== -MozillaVoiceSttModel --------------------- +DeepSpeechModel +--------------- -.. doxygenclass:: org::mozilla::voice::stt::MozillaVoiceSttModel +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::DeepSpeechModel :project: deepspeech-java :members: Metadata -------- -.. doxygenclass:: org::mozilla::voice::stt::Metadata +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata :project: deepspeech-java :members: getNumTranscripts, getTranscript CandidateTranscript ------------------- -.. doxygenclass:: org::mozilla::voice::stt::CandidateTranscript +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::CandidateTranscript :project: deepspeech-java :members: getNumTokens, getConfidence, getToken TokenMetadata ------------- -.. doxygenclass:: org::mozilla::voice::stt::TokenMetadata +.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::TokenMetadata :project: deepspeech-java :members: getText, getTimestep, getStartTime diff --git a/doc/Java-Examples.rst b/doc/Java-Examples.rst index a1e1a7dc..46ffa175 100644 --- a/doc/Java-Examples.rst +++ b/doc/Java-Examples.rst @@ -1,12 +1,12 @@ Java API Usage example ====================== -Examples are from `native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java`. +Examples are from `native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java`. Creating a model instance and loading model ------------------------------------------- -.. literalinclude:: ../native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java +.. literalinclude:: ../native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java :language: java :linenos: :lineno-match: @@ -16,7 +16,7 @@ Creating a model instance and loading model Performing inference -------------------- -.. literalinclude:: ../native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java +.. literalinclude:: ../native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java :language: java :linenos: :lineno-match: @@ -26,4 +26,4 @@ Performing inference Full source code ---------------- -See :download:`Full source code<../native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java>`. +See :download:`Full source code<../native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java>`. diff --git a/doc/Makefile b/doc/Makefile index 1b8aa39c..0980ab24 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -4,7 +4,7 @@ # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build -SPHINXPROJ = Mozilla Voice STT +SPHINXPROJ = DeepSpeech SOURCEDIR = . BUILDDIR = .build diff --git a/doc/ParallelOptimization.rst b/doc/ParallelOptimization.rst index 0da5954e..e0d3734c 100644 --- a/doc/ParallelOptimization.rst +++ b/doc/ParallelOptimization.rst @@ -1,8 +1,8 @@ Parallel Optimization ===================== -This is how we implement optimization of the Mozilla Voice STT model across GPUs -on a single host. Parallel optimization can take on various forms. For example +This is how we implement optimization of the DeepSpeech model across GPUs on a +single host. Parallel optimization can take on various forms. 
For example one can use asynchronous updates of the model, synchronous updates of the model, or some combination of the two. diff --git a/doc/SUPPORTED_PLATFORMS.rst b/doc/SUPPORTED_PLATFORMS.rst index eeea28da..1ccfb7e3 100644 --- a/doc/SUPPORTED_PLATFORMS.rst +++ b/doc/SUPPORTED_PLATFORMS.rst @@ -9,61 +9,61 @@ Linux / AMD64 without GPU ^^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Ubuntu 14.04+ (glibc >= 2.19, libstdc++6 >= 4.8) -* Full TensorFlow runtime (``mozilla_voice_stt`` packages) -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* Full TensorFlow runtime (``deepspeech`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Linux / AMD64 with GPU ^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Ubuntu 14.04+ (glibc >= 2.19, libstdc++6 >= 4.8) * CUDA 10.0 (and capable GPU) -* Full TensorFlow runtime (``mozilla_voice_stt`` packages) -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* Full TensorFlow runtime (``deepspeech`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Linux / ARMv7 ^^^^^^^^^^^^^ * Cortex-A53 compatible ARMv7 SoC with Neon support * Raspbian Buster-compatible distribution -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Linux / Aarch64 ^^^^^^^^^^^^^^^ * Cortex-A72 compatible Aarch64 SoC * ARMbian Buster-compatible distribution -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Android / ARMv7 ^^^^^^^^^^^^^^^ * ARMv7 SoC with Neon support * Android 7.0-10.0 * NDK API level >= 21 -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Android / Aarch64 ^^^^^^^^^^^^^^^^^ * Aarch64 SoC * Android 7.0-10.0 * NDK API level >= 21 -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) macOS / AMD64 ^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * macOS >= 10.10 -* Full TensorFlow runtime (``mozilla_voice_stt`` packages) -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* Full TensorFlow runtime (``deepspeech`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Windows / AMD64 without GPU ^^^^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Windows Server >= 2012 R2 ; Windows >= 8.1 -* Full TensorFlow runtime (``mozilla_voice_stt`` packages) -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* Full TensorFlow runtime (``deepspeech`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) Windows / AMD64 with GPU ^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Windows Server >= 2012 R2 ; Windows >= 8.1 * CUDA 10.0 (and capable GPU) -* Full TensorFlow runtime (``mozilla_voice_stt`` packages) -* TensorFlow Lite runtime (``mozilla_voice_stt_tflite`` packages) +* Full TensorFlow runtime (``deepspeech`` packages) +* TensorFlow Lite runtime (``deepspeech-tflite`` packages) diff --git a/doc/Scorer.rst b/doc/Scorer.rst index 50a8db60..1f374604 100644 --- a/doc/Scorer.rst +++ b/doc/Scorer.rst @@ -3,11 +3,11 @@ 
External scorer scripts ======================= -Mozilla Voice STT pre-trained models include an external scorer. This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. +DeepSpeech pre-trained models include an external scorer. This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. The scorer is composed of two sub-components, a KenLM language model and a trie data structure containing all words in the vocabulary. In order to create the scorer package, first we must create a KenLM language model (using ``data/lm/generate_lm.py``, and then use ``generate_scorer_package`` to create the final package file including the trie data structure. -The ``generate_scorer_package`` binary is part of the native client package that is included with official releases. You can find the appropriate archive for your platform in the `GitHub release downloads `_. The native client package is named ``native_client.{arch}.{config}.{plat}.tar.xz``, where ``{arch}`` is the architecture the binary was built for, for example ``amd64`` or ``arm64``, ``config`` is the build configuration, which for building decoder packages does not matter, and ``{plat}`` is the platform the binary was built-for, for example ``linux`` or ``osx``. If you wanted to run the ``generate_scorer_package`` binary on a Linux desktop, you would download ``native_client.amd64.cpu.linux.tar.xz``. +The ``generate_scorer_package`` binary is part of the native client package that is included with official releases. You can find the appropriate archive for your platform in the `GitHub release downloads `_. The native client package is named ``native_client.{arch}.{config}.{plat}.tar.xz``, where ``{arch}`` is the architecture the binary was built for, for example ``amd64`` or ``arm64``, ``config`` is the build configuration, which for building decoder packages does not matter, and ``{plat}`` is the platform the binary was built-for, for example ``linux`` or ``osx``. If you wanted to run the ``generate_scorer_package`` binary on a Linux desktop, you would download ``native_client.amd64.cpu.linux.tar.xz``. Reproducing our external scorer ------------------------------- @@ -44,7 +44,7 @@ Afterwards you can use ``generate_scorer_package`` to generate the scorer packag cd data/lm # Download and extract appropriate native_client package: - curl -LO http://github.com/mozilla/STT/releases/... + curl -LO http://github.com/mozilla/DeepSpeech/releases/... tar xvf native_client.*.tar.xz ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 @@ -59,6 +59,6 @@ Building your own scorer can be useful if you're using models in a narrow usage The LibriSpeech LM training text used by our scorer is around 4GB uncompressed, which should give an idea of the size of a corpus needed for a reasonable language model for general speech recognition. For more constrained use cases with smaller vocabularies, you don't need as much data, but you should still try to gather as much as you can. -With a text corpus in hand, you can then re-use ``generate_lm.py`` and ``generate_scorer_package`` to create your own scorer that is compatible with Mozilla Voice STT clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. 
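As a rough sketch of that workflow, the commands below combine ``generate_lm.py`` and ``generate_scorer_package`` for a custom corpus. The ``generate_lm.py`` flag names and the corpus/output paths are illustrative assumptions (check the script's ``--help`` for the real options); the ``generate_scorer_package`` flags are the same ones shown in the packaging example above.

.. code-block:: bash

   cd data/lm
   # Build a KenLM language model and vocabulary from your own text corpus
   # (--input_txt / --output_dir are assumed flag names, verify with --help)
   python3 generate_lm.py --input_txt my_corpus.txt.gz --output_dir .
   # Package the language model and trie into a scorer file
   ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
       --package my-domain.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284

The default alpha and beta values can later be re-tuned with ``lm_optimizer.py`` as described below.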
Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. +With a text corpus in hand, you can then re-use ``generate_lm.py`` and ``generate_scorer_package`` to create your own scorer that is compatible with DeepSpeech clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. After using ``generate_lm.py`` to create a KenLM language model binary file, you can use ``generate_scorer_package`` to create a scorer package as described in the previous section. Note that we have a :github:`lm_optimizer.py script ` which can be used to find good default values for alpha and beta. To use it, you must first generate a package with any value set for default alpha and beta flags. For this step, it doesn't matter what values you use, as they'll be overridden by ``lm_optimizer.py`` later. Then, use ``lm_optimizer.py`` with this scorer file to find good alpha and beta values. Finally, use ``generate_scorer_package`` again, this time with the new values. diff --git a/doc/TRAINING.rst b/doc/TRAINING.rst index 034be8ba..9cc6d951 100644 --- a/doc/TRAINING.rst +++ b/doc/TRAINING.rst @@ -12,34 +12,34 @@ Prerequisites for training a model Getting the training code ^^^^^^^^^^^^^^^^^^^^^^^^^ -Clone the Mozilla Voice STT repository: +Clone the DeepSpeech repository: .. code-block:: bash - git clone https://github.com/mozilla/STT + git clone https://github.com/mozilla/DeepSpeech Creating a virtual environment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run Mozilla Voice STT. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/stt-train-venv``. You can create it using this command: +In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run deepspeech. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/deepspeech-train-venv``. You can create it using this command: .. code-block:: - $ python3 -m venv $HOME/tmp/stt-train-venv/ + $ python3 -m venv $HOME/tmp/deepspeech-train-venv/ Once this command completes successfully, the environment will be ready to be activated. Activating the environment ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Each time you need to work with Mozilla Voice STT, you have to *activate* this virtual environment. This is done with this simple command: +Each time you need to work with DeepSpeech, you have to *activate* this virtual environment. This is done with this simple command: .. 
code-block:: - $ source $HOME/tmp/stt-train-venv/bin/activate + $ source $HOME/tmp/deepspeech-train-venv/bin/activate -Installing Mozilla Voice STT Training Code and its dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Installing DeepSpeech Training Code and its dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Install the required dependencies using ``pip3``\ : @@ -88,11 +88,11 @@ This should ensure that you'll re-use the upstream Python 3 TensorFlow GPU-enabl make Dockerfile.train -If you want to specify a different Mozilla Voice STT repository / branch, you can pass ``MOZILLA_VOICE_STT_REPO`` or ``MOZILLA_VOICE_STT_SHA`` parameters: +If you want to specify a different DeepSpeech repository / branch, you can pass ``DEEPSPEECH_REPO`` or ``DEEPSPEECH_SHA`` parameters: .. code-block:: bash - make Dockerfile.train MOZILLA_VOICE_STT_REPO=git://your/fork MOZILLA_VOICE_STT_SHA=origin/your-branch + make Dockerfile.train DEEPSPEECH_REPO=git://your/fork DEEPSPEECH_SHA=origin/your-branch Common Voice training data ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -105,7 +105,7 @@ After extraction of such a data set, you'll find the following contents: * the ``*.tsv`` files output by CorporaCreator for the downloaded language * the mp3 audio files they reference in a ``clips`` sub-directory. -For bringing this data into a form that Mozilla Voice STT understands, you have to run the CommonVoice v2.0 importer (\ ``bin/import_cv2.py``\ ): +For bringing this data into a form that DeepSpeech understands, you have to run the CommonVoice v2.0 importer (\ ``bin/import_cv2.py``\ ): .. code-block:: bash @@ -150,7 +150,7 @@ For executing pre-configured training scenarios, there is a collection of conven **If you experience GPU OOM errors while training, try reducing the batch size with the ``--train_batch_size``\ , ``--dev_batch_size`` and ``--test_batch_size`` parameters.** -As a simple first example you can open a terminal, change to the directory of the Mozilla Voice STT checkout, activate the virtualenv created above, and run: +As a simple first example you can open a terminal, change to the directory of the DeepSpeech checkout, activate the virtualenv created above, and run: .. code-block:: bash @@ -160,7 +160,7 @@ This script will train on a small sample dataset composed of just a single audio Feel also free to pass additional (or overriding) ``DeepSpeech.py`` parameters to these scripts. Then, just run the script to train the modified network. -Each dataset has a corresponding importer script in ``bin/`` that can be used to download (if it's freely available) and preprocess the dataset. See ``bin/import_librivox.py`` for an example of how to import and preprocess a large dataset for training with Mozilla Voice STT. +Each dataset has a corresponding importer script in ``bin/`` that can be used to download (if it's freely available) and preprocess the dataset. See ``bin/import_librivox.py`` for an example of how to import and preprocess a large dataset for training with DeepSpeech. Some importers might require additional code to properly handled your locale-specific requirements. Such handling is dealt with ``--validate_label_locale`` flag that allows you to source out-of-tree Python script that defines a ``validate_label`` function. Please refer to ``util/importers.py`` for implementation example of that function. If you don't provide this argument, the default ``validate_label`` function will be used. 
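As a sketch of how that flag is passed, the command below runs a Common Voice import while sourcing an out-of-tree validation script; the importer arguments, script path, and dataset path are placeholders, so check the importer's ``--help`` for its exact interface.

.. code-block:: bash

   # Import Common Voice data while sourcing a custom validate_label function
   # (all paths below are placeholders)
   python3 bin/import_cv2.py \
       --validate_label_locale /path/to/validate_label_fr.py \
       /path/to/extracted/cv-corpus/fr

Without the flag, importers fall back to the default ``validate_label`` from ``util/importers.py``.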
This one is only intended for English language, so you might have consistency issues in your data for other languages. @@ -187,7 +187,7 @@ Mixed precision training makes use of both FP32 and FP16 precisions where approp python3 DeepSpeech.py --train_files ./train.csv --dev_files ./dev.csv --test_files ./test.csv --automatic_mixed_precision ``` -On a Volta generation V100 GPU, automatic mixed precision speeds up Mozilla Voice STT training and evaluation by ~30%-40%. +On a Volta generation V100 GPU, automatic mixed precision speeds up DeepSpeech training and evaluation by ~30%-40%. Checkpointing ^^^^^^^^^^^^^ @@ -229,9 +229,9 @@ Upon sucessfull run, it should report about conversion of a non-zero number of n Continuing training from a release model ---------------------------------------- -There are currently two supported approaches to make use of a pre-trained Mozilla Voice STT model: fine-tuning or transfer-learning. Choosing which one to use is a simple decision, and it depends on your target dataset. Does your data use the same alphabet as the release model? If "Yes": fine-tune. If "No" use transfer-learning. +There are currently two supported approaches to make use of a pre-trained DeepSpeech model: fine-tuning or transfer-learning. Choosing which one to use is a simple decision, and it depends on your target dataset. Does your data use the same alphabet as the release model? If "Yes": fine-tune. If "No" use transfer-learning. -If your own data uses the *extact* same alphabet as the English release model (i.e. `a-z` plus `'`) then the release model's output layer will match your data, and you can just fine-tune the existing parameters. However, if you want to use a new alphabet (e.g. Cyrillic `а`, `б`, `д`), the output layer of a release Mozilla Voice STT model will *not* match your data. In this case, you should use transfer-learning (i.e. remove the trained model's output layer, and reinitialize a new output layer that matches your target character set. +If your own data uses the *extact* same alphabet as the English release model (i.e. `a-z` plus `'`) then the release model's output layer will match your data, and you can just fine-tune the existing parameters. However, if you want to use a new alphabet (e.g. Cyrillic `а`, `б`, `д`), the output layer of a release DeepSpeech model will *not* match your data. In this case, you should use transfer-learning (i.e. remove the trained model's output layer, and reinitialize a new output layer that matches your target character set. N.B. - If you have access to a pre-trained model which uses UTF-8 bytes at the output layer you can always fine-tune, because any alphabet should be encodable as UTF-8. @@ -263,11 +263,11 @@ If you try to load a release model without following these steps, you'll get an Transfer-Learning (new alphabet) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to continue training an alphabet-based Mozilla Voice STT model (i.e. not a UTF-8 model) on a new language, or if you just want to add new characters to your custom alphabet, you will probably want to use transfer-learning instead of fine-tuning. If you're starting with a pre-trained UTF-8 model -- even if your data comes from a different language or uses a different alphabet -- the model will be able to predict your new transcripts, and you should use fine-tuning instead. +If you want to continue training an alphabet-based DeepSpeech model (i.e. 
not a UTF-8 model) on a new language, or if you just want to add new characters to your custom alphabet, you will probably want to use transfer-learning instead of fine-tuning. If you're starting with a pre-trained UTF-8 model -- even if your data comes from a different language or uses a different alphabet -- the model will be able to predict your new transcripts, and you should use fine-tuning instead. -In a nutshell, Mozilla Voice STT's transfer-learning allows you to remove certain layers from a pre-trained model, initialize new layers for your target data, stitch together the old and new layers, and update all layers via gradient descent. You will remove the pre-trained output layer (and optionally more layers) and reinitialize parameters to fit your target alphabet. The simplest case of transfer-learning is when you remove just the output layer. +In a nutshell, DeepSpeech's transfer-learning allows you to remove certain layers from a pre-trained model, initialize new layers for your target data, stitch together the old and new layers, and update all layers via gradient descent. You will remove the pre-trained output layer (and optionally more layers) and reinitialize parameters to fit your target alphabet. The simplest case of transfer-learning is when you remove just the output layer. -In Mozilla Voice STT's implementation of transfer-learning, all removed layers will be contiguous, starting from the output layer. The key flag you will want to experiment with is ``--drop_source_layers``. This flag accepts an integer from ``1`` to ``5`` and allows you to specify how many layers you want to remove from the pre-trained model. For example, if you supplied ``--drop_source_layers 3``, you will drop the last three layers of the pre-trained model: the output layer, penultimate layer, and LSTM layer. All dropped layers will be reinintialized, and (crucially) the output layer will be defined to match your supplied target alphabet. +In DeepSpeech's implementation of transfer-learning, all removed layers will be contiguous, starting from the output layer. The key flag you will want to experiment with is ``--drop_source_layers``. This flag accepts an integer from ``1`` to ``5`` and allows you to specify how many layers you want to remove from the pre-trained model. For example, if you supplied ``--drop_source_layers 3``, you will drop the last three layers of the pre-trained model: the output layer, penultimate layer, and LSTM layer. All dropped layers will be reinintialized, and (crucially) the output layer will be defined to match your supplied target alphabet. You need to specify the location of the pre-trained model with ``--load_checkpoint_dir`` and define where your new model checkpoints will be saved with ``--save_checkpoint_dir``. You need to specify how many layers to remove (aka "drop") from the pre-trained model: ``--drop_source_layers``. You also need to supply your new alphabet file using the standard ``--alphabet_config_path`` (remember, using a new alphabet is the whole reason you want to use transfer-learning). @@ -285,7 +285,8 @@ You need to specify the location of the pre-trained model with ``--load_checkpoi UTF-8 mode ^^^^^^^^^^ -Mozilla Voice STT includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`. +DeepSpeech includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. 
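Putting the transfer-learning flags above together, a run could look like the following sketch; the checkpoint directories, CSV files, alphabet file, and the number of dropped layers are placeholders to adapt to your setup.

.. code-block:: bash

   # Drop the output layer of a release checkpoint and train against a new alphabet
   python3 DeepSpeech.py \
       --drop_source_layers 1 \
       --alphabet_config_path my-alphabet.txt \
       --load_checkpoint_dir path/to/release/checkpoint/dir \
       --save_checkpoint_dir path/to/new/checkpoint/dir \
       --train_files my-train.csv --dev_files my-dev.csv --test_files my-test.csv

As noted above, the UTF-8 operating mode is an alternative for languages with very large alphabets.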
For details on how it works and how to use it, see :ref:`decoder-docs`. + .. _training-data-augmentation: diff --git a/doc/USING.rst b/doc/USING.rst index 095a61d1..12519980 100644 --- a/doc/USING.rst +++ b/doc/USING.rst @@ -3,7 +3,7 @@ Using a Pre-trained Model ========================= -Inference using a Mozilla Voice STT pre-trained model can be done with a client/language binding package. We have four clients/language bindings in this repository, listed below, and also a few community-maintained clients/language bindings in other repositories, listed `further down in this README <#third-party-bindings>`_. +Inference using a DeepSpeech pre-trained model can be done with a client/language binding package. We have four clients/language bindings in this repository, listed below, and also a few community-maintained clients/language bindings in other repositories, listed `further down in this README <#third-party-bindings>`_. * :ref:`The C API `. * :ref:`The Python package/language binding ` @@ -13,7 +13,7 @@ Inference using a Mozilla Voice STT pre-trained model can be done with a client/ .. _runtime-deps: -Running ``mozilla_voice_stt`` might, see below, require some runtime dependencies to be already installed on your system: +Running ``deepspeech`` might, see below, require some runtime dependencies to be already installed on your system: * ``sox`` - The Python and Node.JS clients use SoX to resample files to 16kHz. * ``libgomp1`` - libsox (statically linked into the clients) depends on OpenMP. Some people have had to install this manually. @@ -28,29 +28,29 @@ Please refer to your system's documentation on how to install these dependencies CUDA dependency ^^^^^^^^^^^^^^^ -The CUDA capable builds (Python, NodeJS, C++, etc) depend on CUDA 10.1 and CuDNN v7.6. +The GPU capable builds (Python, NodeJS, C++, etc) depend on CUDA 10.1 and CuDNN v7.6. Getting the pre-trained model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to use the pre-trained English model for performing speech-to-text, you can download it (along with other important inference material) from the Mozilla Voice STT `releases page `_. Alternatively, you can run the following command to download the model files in your current directory: +If you want to use the pre-trained English model for performing speech-to-text, you can download it (along with other important inference material) from the DeepSpeech `releases page `_. Alternatively, you can run the following command to download the model files in your current directory: .. code-block:: bash - wget https://github.com/mozilla/STT/releases/download/v0.8.1/deepspeech-0.8.1-models.pbmm - wget https://github.com/mozilla/STT/releases/download/v0.8.1/deepspeech-0.8.1-models.scorer + wget https://github.com/mozilla/DeepSpeech/releases/download/v0.7.4/deepspeech-0.7.4-models.pbmm + wget https://github.com/mozilla/DeepSpeech/releases/download/v0.7.4/deepspeech-0.7.4-models.scorer -There are several pre-trained model files available in official releases. Files ending in ``.pbmm`` are compatible with clients and language bindings built against the standard TensorFlow runtime. Usually these packages are simply called ``mozilla_voice_stt``. These files are also compatible with CUDA enabled clients and language bindings. These packages are usually called ``mozilla_voice_stt_cuda``. Files ending in ``.tflite`` are compatible with clients and language bindings built against the `TensorFlow Lite runtime `_. These models are optimized for size and performance in low power devices. 
On desktop platforms, the compatible packages are called ``mozilla_voice_stt_tflite``. On Android and Raspberry Pi, we only publish TensorFlow Lite enabled packages, and they are simply called ``mozilla_voice_stt``. You can see a full list of supported platforms and which TensorFlow runtime is supported at :ref:`supported-platforms-inference`. +There are several pre-trained model files available in official releases. Files ending in ``.pbmm`` are compatible with clients and language bindings built against the standard TensorFlow runtime. Usually these packages are simply called ``deepspeech``. These files are also compatible with CUDA enabled clients and language bindings. These packages are usually called ``deepspeech-gpu``. Files ending in ``.tflite`` are compatible with clients and language bindings built against the `TensorFlow Lite runtime `_. These models are optimized for size and performance in low power devices. On desktop platforms, the compatible packages are called ``deepspeech-tflite``. On Android and Raspberry Pi, we only publish TensorFlow Lite enabled packages, and they are simply called ``deepspeech``. You can see a full list of supported platforms and which TensorFlow runtime is supported at :ref:`supported-platforms-inference`. -+--------------------------+---------------------+---------------------+ -| Package/Model type | .pbmm | .tflite | -+==========================+=====================+=====================+ -| mozilla_voice_stt | Depends on platform | Depends on platform | -+--------------------------+---------------------+---------------------+ -| mozilla_voice_stt_cuda | ✅ | ❌ | -+--------------------------+---------------------+---------------------+ -| mozilla_voice_stt_tflite | ❌ | ✅ | -+--------------------------+---------------------+---------------------+ ++--------------------+---------------------+---------------------+ +| Package/Model type | .pbmm | .tflite | ++====================+=====================+=====================+ +| deepspeech | Depends on platform | Depends on platform | ++--------------------+---------------------+---------------------+ +| deepspeech-gpu | ✅ | ❌ | ++--------------------+---------------------+---------------------+ +| deepspeech-tflite | ❌ | ✅ | ++--------------------+---------------------+---------------------+ Finally, the pre-trained model files also include files ending in ``.scorer``. These are external scorers (language models) that are used at inference time in conjunction with an acoustic model (``.pbmm`` or ``.tflite`` file) to produce transcriptions. We also provide further documentation on :ref:`the decoding process ` and :ref:`how scorers are generated `. @@ -61,82 +61,82 @@ The release notes include detailed information on how the released models were t The process for training an acoustic model is described in :ref:`training-docs`. In particular, fine tuning a release model using your own data can be a good way to leverage relatively smaller amounts of data that would not be sufficient for training a new model from scratch. See the :ref:`fine tuning and transfer learning sections ` for more information. :ref:`Data augmentation ` can also be a good way to increase the value of smaller training sets. -Creating your own external scorer from text data is another way that you can adapt the model to your specific needs. 
The process and tools used to generate an external scorer package are described in :ref:`scorer-scripts` and an overview of how the external scorer is used by Mozilla Voice STT to perform inference is available in :ref:`decoder-docs`. Generating a smaller scorer from a single purpose text dataset is a quick process and can bring significant accuracy improvements, specially for more constrained, limited vocabulary applications. +Creating your own external scorer from text data is another way that you can adapt the model to your specific needs. The process and tools used to generate an external scorer package are described in :ref:`scorer-scripts` and an overview of how the external scorer is used by DeepSpeech to perform inference is available in :ref:`decoder-docs`. Generating a smaller scorer from a single purpose text dataset is a quick process and can bring significant accuracy improvements, specially for more constrained, limited vocabulary applications. Model compatibility ^^^^^^^^^^^^^^^^^^^ -Mozilla Voice STT models are versioned to keep you from trying to use an incompatible graph with a newer client after a breaking change was made to the code. If you get an error saying your model file version is too old for the client, you should either upgrade to a newer model release, re-export your model from the checkpoint using a newer version of the code, or downgrade your client if you need to use the old model and can't re-export it. +DeepSpeech models are versioned to keep you from trying to use an incompatible graph with a newer client after a breaking change was made to the code. If you get an error saying your model file version is too old for the client, you should either upgrade to a newer model release, re-export your model from the checkpoint using a newer version of the code, or downgrade your client if you need to use the old model and can't re-export it. .. _py-usage: Using the Python package ^^^^^^^^^^^^^^^^^^^^^^^^ -Pre-built binaries which can be used for performing inference with a trained model can be installed with ``pip3``. You can then use the ``mozilla_voice_stt`` binary to do speech-to-text on an audio file: +Pre-built binaries which can be used for performing inference with a trained model can be installed with ``pip3``. You can then use the ``deepspeech`` binary to do speech-to-text on an audio file: For the Python bindings, it is highly recommended that you perform the installation within a Python 3.5 or later virtual environment. You can find more information about those in `this documentation `_. We will continue under the assumption that you already have your system properly setup to create new virtual environments. -Create a Mozilla Voice STT virtual environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Create a DeepSpeech virtual environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run Mozilla Voice STT. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/stt-venv``. You can create it using this command: +In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run deepspeech. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/deepspeech-venv``. You can create it using this command: .. 
code-block:: - $ virtualenv -p python3 $HOME/tmp/stt-venv/ + $ virtualenv -p python3 $HOME/tmp/deepspeech-venv/ Once this command completes successfully, the environment will be ready to be activated. Activating the environment ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each time you need to work with Mozilla Voice STT, you have to *activate* this virtual environment. This is done with this simple command: +Each time you need to work with DeepSpeech, you have to *activate* this virtual environment. This is done with this simple command: .. code-block:: - $ source $HOME/tmp/stt-venv/bin/activate + $ source $HOME/tmp/deepspeech-venv/bin/activate -Installing Mozilla Voice STT Python bindings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing DeepSpeech Python bindings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once your environment has been set-up and loaded, you can use ``pip3`` to manage packages locally. On a fresh setup of the ``virtualenv``\ , you will have to install the Mozilla Voice STT wheel. You can check if ``mozilla_voice_stt`` is already installed with ``pip3 list``. +Once your environment has been set-up and loaded, you can use ``pip3`` to manage packages locally. On a fresh setup of the ``virtualenv``\ , you will have to install the DeepSpeech wheel. You can check if ``deepspeech`` is already installed with ``pip3 list``. To perform the installation, just use ``pip3`` as such: .. code-block:: - $ pip3 install mozilla_voice_stt + $ pip3 install deepspeech -If ``mozilla_voice_stt`` is already installed, you can update it as such: +If ``deepspeech`` is already installed, you can update it as such: .. code-block:: - $ pip3 install --upgrade mozilla_voice_stt + $ pip3 install --upgrade deepspeech -Alternatively, if you have a supported NVIDIA GPU on Linux, you can install the CUDA specific package as follows: +Alternatively, if you have a supported NVIDIA GPU on Linux, you can install the GPU specific package as follows: .. code-block:: - $ pip3 install mozilla_voice_stt_cuda + $ pip3 install deepspeech-gpu -See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. +See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. -You can update ``mozilla_voice_stt_cuda`` as follows: +You can update ``deepspeech-gpu`` as follows: .. code-block:: - $ pip3 install --upgrade mozilla_voice_stt_cuda + $ pip3 install --upgrade deepspeech-gpu -In both cases, ``pip3`` should take care of installing all the required dependencies. After installation has finished, you should be able to call ``mozilla_voice_stt`` from the command-line. +In both cases, ``pip3`` should take care of installing all the required dependencies. After installation has finished, you should be able to call ``deepspeech`` from the command-line. Note: the following command assumes you `downloaded the pre-trained model <#getting-the-pre-trained-model>`_. .. code-block:: bash - mozilla_voice_stt --model deepspeech-0.8.1-models.pbmm --scorer deepspeech-0.8.1-models.scorer --audio my_audio_file.wav + deepspeech --model deepspeech-0.7.4-models.pbmm --scorer deepspeech-0.7.4-models.scorer --audio my_audio_file.wav The ``--scorer`` argument is optional, and represents an external language model to be used when transcribing the audio. @@ -151,7 +151,7 @@ You can download the JS bindings using ``npm``\ : .. 
code-block:: bash - npm install @mozilla-voice/stt + npm install deepspeech Please note that as of now, we support: - Node.JS versions 4 to 13. @@ -159,13 +159,13 @@ Please note that as of now, we support: TypeScript support is also provided. -Alternatively, if you're using Linux and have a supported NVIDIA GPU, you can install the CUDA specific package as follows: +Alternatively, if you're using Linux and have a supported NVIDIA GPU, you can install the GPU specific package as follows: .. code-block:: bash - npm install @mozilla-voice/stt-cuda + npm install deepspeech-gpu -See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. +See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. See the :ref:`TypeScript client ` for an example of how to use the bindings programatically. @@ -174,7 +174,7 @@ See the :ref:`TypeScript client ` for an example of how to use t Using the command-line client ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To download the pre-built binaries for the ``mozilla_voice_stt`` command-line (compiled C++) client, use ``util/taskcluster.py``\ : +To download the pre-built binaries for the ``deepspeech`` command-line (compiled C++) client, use ``util/taskcluster.py``\ : .. code-block:: bash @@ -192,17 +192,17 @@ also, if you need some binaries different than current master, like ``v0.2.0-alp python3 util/taskcluster.py --branch "v0.2.0-alpha.6" --target "." -The script ``taskcluster.py`` will download ``native_client.tar.xz`` (which includes the ``mozilla_voice_stt`` binary and associated libraries) and extract it into the current folder. Also, ``taskcluster.py`` will download binaries for Linux/x86_64 by default, but you can override that behavior with the ``--arch`` parameter. See the help info with ``python util/taskcluster.py -h`` for more details. Specific branches of Mozilla Voice STT or TensorFlow can be specified as well. +The script ``taskcluster.py`` will download ``native_client.tar.xz`` (which includes the ``deepspeech`` binary and associated libraries) and extract it into the current folder. Also, ``taskcluster.py`` will download binaries for Linux/x86_64 by default, but you can override that behavior with the ``--arch`` parameter. See the help info with ``python util/taskcluster.py -h`` for more details. Specific branches of DeepSpeech or TensorFlow can be specified as well. -Alternatively you may manually download the ``native_client.tar.xz`` from the [releases](https://github.com/mozilla/STT/releases). +Alternatively you may manually download the ``native_client.tar.xz`` from the [releases](https://github.com/mozilla/DeepSpeech/releases). Note: the following command assumes you `downloaded the pre-trained model <#getting-the-pre-trained-model>`_. .. code-block:: bash - ./mozilla_voice_stt --model deepspeech-0.8.1-models.pbmm --scorer deepspeech-0.8.1-models.scorer --audio audio_input.wav + ./deepspeech --model deepspeech-0.7.4-models.pbmm --scorer deepspeech-0.7.4-models.scorer --audio audio_input.wav -See the help output with ``./mozilla_voice_stt -h`` for more details. +See the help output with ``./deepspeech -h`` for more details. 
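As a sketch of the ``--arch`` override mentioned above, the invocation below requests binaries for a different architecture; the architecture value shown is an assumption, so list the accepted values with ``python3 util/taskcluster.py -h`` first.

.. code-block:: bash

   # Download native_client binaries for another architecture into the current directory
   python3 util/taskcluster.py --arch arm64 --target .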
Installing bindings from source ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -212,18 +212,18 @@ If pre-built binaries aren't available for your system, you'll need to install t Dockerfile for building from source ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We provide ``Dockerfile.build`` to automatically build ``libmozilla_voice_stt.so``, the C++ native client, Python bindings, and KenLM. +We provide ``Dockerfile.build`` to automatically build ``libdeepspeech.so``, the C++ native client, Python bindings, and KenLM. You need to generate the Dockerfile from the template using: .. code-block:: bash make Dockerfile.build -If you want to specify a different Mozilla Voice STT repository / branch, you can pass ``MOZILLA_VOICE_STT_REPO`` or ``MOZILLA_VOICE_STT_SHA`` parameters: +If you want to specify a different DeepSpeech repository / branch, you can pass ``DEEPSPEECH_REPO`` or ``DEEPSPEECH_SHA`` parameters: .. code-block:: bash - make Dockerfile.build MOZILLA_VOICE_STT_REPO=git://your/fork MOZILLA_VOICE_STT_SHA=origin/your-branch + make Dockerfile.build DEEPSPEECH_REPO=git://your/fork DEEPSPEECH_SHA=origin/your-branch Third party bindings ^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/conf.py b/doc/conf.py index 2a62a07c..bb64d77e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Mozilla Voice STT documentation build configuration file, created by +# DeepSpeech documentation build configuration file, created by # sphinx-quickstart on Thu Feb 2 21:20:39 2017. # # This file is execfile()d with the current directory set to its @@ -24,7 +24,7 @@ import sys sys.path.insert(0, os.path.abspath('../')) -autodoc_mock_imports = ['mozilla_voice_stt'] +autodoc_mock_imports = ['deepspeech'] # This is in fact only relevant on ReadTheDocs, but we want to run the same way # on our CI as in RTD to avoid regressions on RTD that we would not catch on @@ -41,7 +41,7 @@ import semver # -- Project information ----------------------------------------------------- -project = u'Mozilla Voice STT' +project = u'DeepSpeech' copyright = '2019-2020, Mozilla Corporation' author = 'Mozilla Corporation' @@ -143,7 +143,7 @@ html_static_path = ['.static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'sttdoc' +htmlhelp_basename = 'DeepSpeechdoc' # -- Options for LaTeX output --------------------------------------------- @@ -170,7 +170,7 @@ latex_elements = { # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'Mozilla_Voice_STT.tex', u'Mozilla Voice STT Documentation', + (master_doc, 'DeepSpeech.tex', u'DeepSpeech Documentation', u'Mozilla Research', 'manual'), ] @@ -180,7 +180,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ - (master_doc, 'mozilla_voice_stt', u'Mozilla Voice STT Documentation', + (master_doc, 'deepspeech', u'DeepSpeech Documentation', [author], 1) ] @@ -191,8 +191,8 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'Mozilla Voice STT', u'Mozilla Voice STT Documentation', - author, 'Mozilla Voice STT', 'One line description of project.', + (master_doc, 'DeepSpeech', u'DeepSpeech Documentation', + author, 'DeepSpeech', 'One line description of project.', 'Miscellaneous'), ] @@ -202,5 +202,5 @@ texinfo_documents = [ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'https://docs.python.org/': None} -extlinks = {'github': ('https://github.com/mozilla/STT/blob/v{}/%s'.format(release), +extlinks = {'github': ('https://github.com/mozilla/DeepSpeech/blob/v{}/%s'.format(release), '%s')} diff --git a/doc/doxygen-c.conf b/doc/doxygen-c.conf index daecb5f4..f36f57b2 100644 --- a/doc/doxygen-c.conf +++ b/doc/doxygen-c.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/mozilla_voice_stt.h +INPUT = native_client/deepspeech.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf index 6481a9c1..74c2c5bb 100644 --- a/doc/doxygen-dotnet.conf +++ b/doc/doxygen-dotnet.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/dotnet/MozillaVoiceSttClient/ native_client/dotnet/MozillaVoiceSttClient/Interfaces/ native_client/dotnet/MozillaVoiceSttClient/Enums/ native_client/dotnet/MozillaVoiceSttClient/Models/ +INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/doxygen-java.conf b/doc/doxygen-java.conf index cf193fed..a8d65c69 100644 --- a/doc/doxygen-java.conf +++ b/doc/doxygen-java.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/java/libmozillavoicestt/src/main/java/org/mozilla/voice/stt/ native_client/java/libmozillavoicestt/src/main/java/org/mozilla/voice/stt_doc/ +INPUT = native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/ native_client/java/libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/examples b/doc/examples index adde02be..6f5f501f 160000 --- a/doc/examples +++ b/doc/examples @@ -1 +1 @@ -Subproject commit adde02be1676d3ec4a2c18008b4871489f3bb42a +Subproject commit 6f5f501fa62743f1b78fe162eb1a579a450bd38f diff --git a/doc/index.rst b/doc/index.rst index 5b76358f..e8991d3f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,54 +1,54 @@ -.. 
Mozilla Voice STT documentation master file, created by +.. DeepSpeech documentation master file, created by sphinx-quickstart on Thu Feb 2 21:20:39 2017. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to Mozilla Voice STT's documentation! +Welcome to DeepSpeech's documentation! ====================================== -Mozilla Voice STT is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. Project Mozilla Voice STT uses Google's `TensorFlow `_ to make the implementation easier. +DeepSpeech is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. Project DeepSpeech uses Google's `TensorFlow `_ to make the implementation easier. -To install and use Mozilla Voice STT all you have to do is: +To install and use DeepSpeech all you have to do is: .. code-block:: bash # Create and activate a virtualenv - virtualenv -p python3 $HOME/tmp/stt-venv/ - source $HOME/tmp/stt-venv/bin/activate + virtualenv -p python3 $HOME/tmp/deepspeech-venv/ + source $HOME/tmp/deepspeech-venv/bin/activate - # Install Mozilla Voice STT - pip3 install mozilla_voice_stt + # Install DeepSpeech + pip3 install deepspeech # Download pre-trained English model files - curl -LO https://github.com/mozilla/STT/releases/download/v0.8.1/deepspeech-0.8.1-models.pbmm - curl -LO https://github.com/mozilla/STT/releases/download/v0.8.1/deepspeech-0.8.1-models.scorer + curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.4/deepspeech-0.7.4-models.pbmm + curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.4/deepspeech-0.7.4-models.scorer # Download example audio files - curl -LO https://github.com/mozilla/STT/releases/download/v0.8.1/audio-0.8.1.tar.gz - tar xvf audio-0.8.1.tar.gz + curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.4/audio-0.7.4.tar.gz + tar xvf audio-0.7.4.tar.gz # Transcribe an audio file - mozilla_voice_stt --model deepspeech-0.8.1-models.pbmm --scorer deepspeech-0.8.1-models.scorer --audio audio/2830-3980-0043.wav + deepspeech --model deepspeech-0.7.4-models.pbmm --scorer deepspeech-0.7.4-models.scorer --audio audio/2830-3980-0043.wav -A pre-trained English model is available for use and can be downloaded following the instructions in :ref:`the usage docs `. For the latest release, including pre-trained models and checkpoints, `see the GitHub releases page `_. +A pre-trained English model is available for use and can be downloaded following the instructions in :ref:`the usage docs `. For the latest release, including pre-trained models and checkpoints, `see the GitHub releases page `_. -Quicker inference can be performed using a supported NVIDIA GPU on Linux. See the `release notes `_ to find which GPUs are supported. To run ``mozilla_voice_stt`` on a GPU, install the GPU specific package: +Quicker inference can be performed using a supported NVIDIA GPU on Linux. See the `release notes `_ to find which GPUs are supported. To run ``deepspeech`` on a GPU, install the GPU specific package: .. 
code-block:: bash # Create and activate a virtualenv - virtualenv -p python3 $HOME/tmp/stt-gpu-venv/ - source $HOME/tmp/stt-gpu-venv/bin/activate + virtualenv -p python3 $HOME/tmp/deepspeech-gpu-venv/ + source $HOME/tmp/deepspeech-gpu-venv/bin/activate - # Install Mozilla Voice STT CUDA enabled package - pip3 install mozilla_voice_stt_cuda + # Install DeepSpeech CUDA enabled package + pip3 install deepspeech-gpu # Transcribe an audio file. - mozilla_voice_stt --model deepspeech-0.8.1-models.pbmm --scorer deepspeech-0.8.1-models.scorer --audio audio/2830-3980-0043.wav + deepspeech --model deepspeech-0.7.4-models.pbmm --scorer deepspeech-0.7.4-models.scorer --audio audio/2830-3980-0043.wav Please ensure you have the required :ref:`CUDA dependencies `. -See the output of ``mozilla_voice_stt -h`` for more information on the use of ``mozilla_voice_stt``. (If you experience problems running ``mozilla_voice_stt``, please check :ref:`required runtime dependencies `). +See the output of ``deepspeech -h`` for more information on the use of ``deepspeech``. (If you experience problems running ``deepspeech``, please check :ref:`required runtime dependencies `). .. toctree:: :maxdepth: 2 @@ -76,7 +76,7 @@ See the output of ``mozilla_voice_stt -h`` for more information on the use of `` :maxdepth: 2 :caption: Architecture and training - AcousticModel + DeepSpeech Geometry diff --git a/evaluate.py b/evaluate.py index f8ff3815..dc502542 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,7 +4,7 @@ from __future__ import absolute_import, division, print_function if __name__ == '__main__': try: - from mozilla_voice_stt_training import evaluate as ds_evaluate + from deepspeech_training import evaluate as ds_evaluate except ImportError: print('Training package is not installed. See training documentation.') raise diff --git a/evaluate_tflite.py b/evaluate_tflite.py index 09e24568..0d462615 100644 --- a/evaluate_tflite.py +++ b/evaluate_tflite.py @@ -10,17 +10,20 @@ import csv import os import sys -from mozilla_voice_stt import Model -from mozilla_voice_stt_training.util.evaluate_tools import calculate_and_print_report -from mozilla_voice_stt_training.util.flags import create_flags +from deepspeech import Model +from deepspeech_training.util.evaluate_tools import calculate_and_print_report +from deepspeech_training.util.flags import create_flags from functools import partial from multiprocessing import JoinableQueue, Process, cpu_count, Manager from six.moves import zip, range r''' This module should be self-contained: + - build libdeepspeech.so with TFLite: + - bazel build [...] --define=runtime=tflite [...] //native_client:libdeepspeech.so + - make -C native_client/python/ TFDIR=... bindings - setup a virtualenv - - pip install mozilla_voice_stt_tflite + - pip install native_client/python/dist/deepspeech*.whl - pip install -r requirements_eval_tflite.txt Then run with a TF Lite model, a scorer and a CSV test file diff --git a/examples/README.rst b/examples/README.rst index 1c3e32e6..f5ebb1bd 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -1,6 +1,6 @@ Examples ======== -Mozilla Voice STT examples were moved to a separate repository. +DeepSpeech examples were moved to a separate repository. 
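Assembled into a single sequence, the self-contained setup sketched in the ``evaluate_tflite.py`` docstring above might look like the following; the full set of bazel options is elided as in the docstring, and the ``TFDIR`` value is a placeholder for your TensorFlow checkout.

.. code-block:: bash

   # Build libdeepspeech.so against the TFLite runtime (other bazel options elided)
   bazel build --define=runtime=tflite //native_client:libdeepspeech.so
   # Build and install the Python bindings from this checkout
   make -C native_client/python/ TFDIR=/path/to/tensorflow bindings
   pip install native_client/python/dist/deepspeech*.whl
   pip install -r requirements_eval_tflite.txt

After that, ``evaluate_tflite.py`` can be run with a TF Lite model, a scorer and a CSV test file as described in the docstring.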
-New location: https://github.com/mozilla/STT-examples +New location: https://github.com/mozilla/DeepSpeech-examples diff --git a/lm_optimizer.py b/lm_optimizer.py index 106a9e58..25d8a05e 100644 --- a/lm_optimizer.py +++ b/lm_optimizer.py @@ -7,13 +7,13 @@ import optuna import sys import tensorflow.compat.v1 as tfv1 -from mozilla_voice_stt_training.evaluate import evaluate -from mozilla_voice_stt_training.train import create_model -from mozilla_voice_stt_training.util.config import Config, initialize_globals -from mozilla_voice_stt_training.util.flags import create_flags, FLAGS -from mozilla_voice_stt_training.util.logging import log_error -from mozilla_voice_stt_training.util.evaluate_tools import wer_cer_batch -from mvs_ctcdecoder import Scorer +from deepspeech_training.evaluate import evaluate +from deepspeech_training.train import create_model +from deepspeech_training.util.config import Config, initialize_globals +from deepspeech_training.util.flags import create_flags, FLAGS +from deepspeech_training.util.logging import log_error +from deepspeech_training.util.evaluate_tools import wer_cer_batch +from ds_ctcdecoder import Scorer def character_based(): diff --git a/native_client/Android.mk b/native_client/Android.mk index 9c40d585..d21551fd 100644 --- a/native_client/Android.mk +++ b/native_client/Android.mk @@ -1,14 +1,14 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) -LOCAL_MODULE := mozilla_voice_stt-prebuilt -LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libmozilla_voice_stt.so +LOCAL_MODULE := deepspeech-prebuilt +LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so include $(PREBUILT_SHARED_LIBRARY) include $(CLEAR_VARS) LOCAL_CPP_EXTENSION := .cc .cxx .cpp -LOCAL_MODULE := mozilla_voice_stt +LOCAL_MODULE := deepspeech LOCAL_SRC_FILES := client.cc -LOCAL_SHARED_LIBRARIES := mozilla_voice_stt-prebuilt +LOCAL_SHARED_LIBRARIES := deepspeech-prebuilt LOCAL_LDFLAGS := -Wl,--no-as-needed include $(BUILD_EXECUTABLE) diff --git a/native_client/BUILD b/native_client/BUILD index 61bfb19f..e061da6c 100644 --- a/native_client/BUILD +++ b/native_client/BUILD @@ -110,10 +110,10 @@ cc_library( ) tf_cc_shared_object( - name = "libmozilla_voice_stt.so", + name = "libdeepspeech.so", srcs = [ "deepspeech.cc", - "mozilla_voice_stt.h", + "deepspeech.h", "deepspeech_errors.cc", "modelstate.cc", "modelstate.h", @@ -163,7 +163,7 @@ tf_cc_shared_object( #"//tensorflow/core:all_kernels", ### => Trying to be more fine-grained ### Use bin/ops_in_graph.py to list all the ops used by a frozen graph. 
- ### CPU only build, libmozilla_voice_stt.so file size reduced by ~50% + ### CPU only build, libdeepspeech.so file size reduced by ~50% "//tensorflow/core/kernels:spectrogram_op", # AudioSpectrogram "//tensorflow/core/kernels:bias_op", # BiasAdd "//tensorflow/core/kernels:cast_op", # Cast @@ -203,11 +203,11 @@ tf_cc_shared_object( ) genrule( - name = "libmozilla_voice_stt_so_dsym", - srcs = [":libmozilla_voice_stt.so"], - outs = ["libmozilla_voice_stt.so.dSYM"], + name = "libdeepspeech_so_dsym", + srcs = [":libdeepspeech.so"], + outs = ["libdeepspeech.so.dSYM"], output_to_bindir = True, - cmd = "dsymutil $(location :libmozilla_voice_stt.so) -o $@" + cmd = "dsymutil $(location :libdeepspeech.so) -o $@" ) cc_binary( diff --git a/native_client/CODINGSTYLE.md b/native_client/CODINGSTYLE.md index 01759473..ddb8fc82 100644 --- a/native_client/CODINGSTYLE.md +++ b/native_client/CODINGSTYLE.md @@ -1,5 +1,5 @@ This file contains some notes on coding style within the C++ portion of the -Mozilla Voice STT project. It is very much a work in progress and incomplete. +DeepSpeech project. It is very much a work in progress and incomplete. General ======= diff --git a/native_client/Makefile b/native_client/Makefile index 597adc12..b645499c 100644 --- a/native_client/Makefile +++ b/native_client/Makefile @@ -16,32 +16,32 @@ include definitions.mk default: $(DEEPSPEECH_BIN) clean: - rm -f $(DEEPSPEECH_BIN) + rm -f deepspeech $(DEEPSPEECH_BIN): client.cc Makefile $(CXX) $(CFLAGS) $(CFLAGS_DEEPSPEECH) $(SOX_CFLAGS) client.cc $(LDFLAGS) $(SOX_LDFLAGS) ifeq ($(OS),Darwin) - install_name_tool -change bazel-out/local-opt/bin/native_client/libmozilla_voice_stt.so @rpath/libmozilla_voice_stt.so $(DEEPSPEECH_BIN) + install_name_tool -change bazel-out/local-opt/bin/native_client/libdeepspeech.so @rpath/libdeepspeech.so deepspeech endif run: $(DEEPSPEECH_BIN) - ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./$(DEEPSPEECH_BIN) ${ARGS} + ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS} debug: $(DEEPSPEECH_BIN) - ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} gdb --args ./$(DEEPSPEECH_BIN) ${ARGS} + ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} gdb --args ./deepspeech ${ARGS} install: $(DEEPSPEECH_BIN) install -d ${PREFIX}/lib - install -m 0644 ${TFDIR}/bazel-bin/native_client/libmozilla_voice_stt.so ${PREFIX}/lib/ + install -m 0644 ${TFDIR}/bazel-bin/native_client/libdeepspeech.so ${PREFIX}/lib/ install -d ${PREFIX}/include - install -m 0644 mozilla_voice_stt.h ${PREFIX}/include + install -m 0644 deepspeech.h ${PREFIX}/include install -d ${PREFIX}/bin - install -m 0755 $(DEEPSPEECH_BIN) ${PREFIX}/bin/ + install -m 0755 deepspeech ${PREFIX}/bin/ uninstall: - rm -f ${PREFIX}/bin/$(DEEPSPEECH_BIN) + rm -f ${PREFIX}/bin/deepspeech rmdir --ignore-fail-on-non-empty ${PREFIX}/bin - rm -f ${PREFIX}/lib/libmozilla_voice_stt.so + rm -f ${PREFIX}/lib/libdeepspeech.so rmdir --ignore-fail-on-non-empty ${PREFIX}/lib print-toolchain: diff --git a/native_client/args.h b/native_client/args.h index 0f26743c..baa9b7ff 100644 --- a/native_client/args.h +++ b/native_client/args.h @@ -8,7 +8,7 @@ #endif #include -#include "mozilla_voice_stt.h" +#include "deepspeech.h" char* model = NULL; @@ -43,7 +43,7 @@ void PrintHelp(const char* bin) std::cout << "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n" "\n" - "Running Mozilla Voice STT 
inference.\n" + "Running DeepSpeech inference.\n" "\n" "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n" "\t--scorer SCORER\t\t\tPath to the external scorer file\n" @@ -58,9 +58,9 @@ void PrintHelp(const char* bin) "\t--stream size\t\t\tRun in stream mode, output intermediate results\n" "\t--help\t\t\t\tShow help\n" "\t--version\t\t\tPrint version and exits\n"; - char* version = STT_Version(); - std::cerr << "Mozilla Voice STT " << version << "\n"; - STT_FreeString(version); + char* version = DS_Version(); + std::cerr << "DeepSpeech " << version << "\n"; + DS_FreeString(version); exit(1); } @@ -153,9 +153,9 @@ bool ProcessArgs(int argc, char** argv) } if (has_versions) { - char* version = STT_Version(); - std::cout << "Mozilla Voice STT " << version << "\n"; - STT_FreeString(version); + char* version = DS_Version(); + std::cout << "DeepSpeech " << version << "\n"; + DS_FreeString(version); return false; } diff --git a/native_client/bazel_workspace_status_cmd.sh b/native_client/bazel_workspace_status_cmd.sh index e9820a4d..a1a5a2a0 100755 --- a/native_client/bazel_workspace_status_cmd.sh +++ b/native_client/bazel_workspace_status_cmd.sh @@ -22,8 +22,8 @@ echo "STABLE_TF_GIT_VERSION ${tf_git_rev}" pushd $(dirname "$0") ds_git_rev=$(git describe --long --tags) echo "STABLE_DS_GIT_VERSION ${ds_git_rev}" -ds_version=$(cat ../training/mozilla_voice_stt_training/VERSION) +ds_version=$(cat ../training/deepspeech_training/VERSION) echo "STABLE_DS_VERSION ${ds_version}" -ds_graph_version=$(cat ../training/mozilla_voice_stt_training/GRAPH_VERSION) +ds_graph_version=$(cat ../training/deepspeech_training/GRAPH_VERSION) echo "STABLE_DS_GRAPH_VERSION ${ds_graph_version}" popd diff --git a/native_client/client.cc b/native_client/client.cc index 4fa167d2..46a16115 100644 --- a/native_client/client.cc +++ b/native_client/client.cc @@ -34,7 +34,7 @@ #endif // NO_DIR #include -#include "mozilla_voice_stt.h" +#include "deepspeech.h" #include "args.h" typedef struct { @@ -168,17 +168,17 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, // sphinx-doc: c_ref_inference_start if (extended_output) { - Metadata *result = STT_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); res.string = CandidateTranscriptToString(&result->transcripts[0]); - STT_FreeMetadata(result); + DS_FreeMetadata(result); } else if (json_output) { - Metadata *result = STT_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); + Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); res.string = MetadataToJSON(result); - STT_FreeMetadata(result); + DS_FreeMetadata(result); } else if (stream_size > 0) { StreamingState* ctx; - int status = STT_CreateStream(aCtx, &ctx); - if (status != STT_ERR_OK) { + int status = DS_CreateStream(aCtx, &ctx); + if (status != DS_ERR_OK) { res.string = strdup(""); return res; } @@ -186,22 +186,22 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, const char *last = nullptr; while (off < aBufferSize) { size_t cur = aBufferSize - off > stream_size ? 
stream_size : aBufferSize - off; - STT_FeedAudioContent(ctx, aBuffer + off, cur); + DS_FeedAudioContent(ctx, aBuffer + off, cur); off += cur; - const char* partial = STT_IntermediateDecode(ctx); + const char* partial = DS_IntermediateDecode(ctx); if (last == nullptr || strcmp(last, partial)) { printf("%s\n", partial); last = partial; } else { - STT_FreeString((char *) partial); + DS_FreeString((char *) partial); } } if (last != nullptr) { - STT_FreeString((char *) last); + DS_FreeString((char *) last); } - res.string = STT_FinishStream(ctx); + res.string = DS_FinishStream(ctx); } else { - res.string = STT_SpeechToText(aCtx, aBuffer, aBufferSize); + res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize); } // sphinx-doc: c_ref_inference_stop @@ -367,7 +367,7 @@ GetAudioBuffer(const char* path, int desired_sample_rate) void ProcessFile(ModelState* context, const char* path, bool show_times) { - ds_audio_buffer audio = GetAudioBuffer(path, STT_GetModelSampleRate(context)); + ds_audio_buffer audio = GetAudioBuffer(path, DS_GetModelSampleRate(context)); // Pass audio to DeepSpeech // We take half of buffer_size because buffer is a char* while @@ -381,7 +381,7 @@ ProcessFile(ModelState* context, const char* path, bool show_times) if (result.string) { printf("%s\n", result.string); - STT_FreeString((char*)result.string); + DS_FreeString((char*)result.string); } if (show_times) { @@ -400,16 +400,16 @@ main(int argc, char **argv) // Initialise DeepSpeech ModelState* ctx; // sphinx-doc: c_ref_model_start - int status = STT_CreateModel(model, &ctx); + int status = DS_CreateModel(model, &ctx); if (status != 0) { - char* error = STT_ErrorCodeToErrorMessage(status); + char* error = DS_ErrorCodeToErrorMessage(status); fprintf(stderr, "Could not create model: %s\n", error); free(error); return 1; } if (set_beamwidth) { - status = STT_SetModelBeamWidth(ctx, beam_width); + status = DS_SetModelBeamWidth(ctx, beam_width); if (status != 0) { fprintf(stderr, "Could not set model beam width.\n"); return 1; @@ -417,13 +417,13 @@ main(int argc, char **argv) } if (scorer) { - status = STT_EnableExternalScorer(ctx, scorer); + status = DS_EnableExternalScorer(ctx, scorer); if (status != 0) { fprintf(stderr, "Could not enable external scorer.\n"); return 1; } if (set_alphabeta) { - status = STT_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta); + status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta); if (status != 0) { fprintf(stderr, "Error setting scorer alpha and beta.\n"); return 1; @@ -485,7 +485,7 @@ main(int argc, char **argv) sox_quit(); #endif // NO_SOX - STT_FreeModel(ctx); + DS_FreeModel(ctx); return 0; } diff --git a/native_client/ctcdecode/Makefile b/native_client/ctcdecode/Makefile index c50fa49f..8bff277b 100644 --- a/native_client/ctcdecode/Makefile +++ b/native_client/ctcdecode/Makefile @@ -10,7 +10,7 @@ LDFLAGS_NEEDED += $(RASPBIAN)/lib/aarch64-linux-gnu/libm.so.6 endif ifeq ($(OS),Darwin) -GENERATE_DEBUG_SYMS := dsymutil temp_build/temp_build/mvs_ctcdecoder/_swigwrapper.*.so +GENERATE_DEBUG_SYMS := dsymutil temp_build/temp_build/ds_ctcdecoder/_swigwrapper.*.so else GENERATE_DEBUG_SYMS := endif @@ -28,7 +28,7 @@ THIRD_PARTY := third_party.$(ARCHIVE_EXT) all: bindings clean-keep-third-party: - rm -rf dist temp_build mvs_ctcdecoder.egg-info + rm -rf dist temp_build ds_ctcdecoder.egg-info rm -f swigwrapper_wrap.cpp swigwrapper.py $(FIRST_PARTY) clean: clean-keep-third-party diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index f0af046e..fd897b3b 100644 --- 
a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -10,7 +10,7 @@ __version__ = swigwrapper.__version__.decode('utf-8') # Hack: import error codes by matching on their names, as SWIG unfortunately # does not support binding enums to Python in a scoped manner yet. for symbol in dir(swigwrapper): - if symbol.startswith('STT_ERR_'): + if symbol.startswith('DS_ERR_'): globals()[symbol] = getattr(swigwrapper, symbol) class Scorer(swigwrapper.Scorer): diff --git a/native_client/ctcdecode/scorer.cpp b/native_client/ctcdecode/scorer.cpp index ad41dd8e..23982ef3 100644 --- a/native_client/ctcdecode/scorer.cpp +++ b/native_client/ctcdecode/scorer.cpp @@ -74,13 +74,13 @@ int Scorer::load_lm(const std::string& lm_path) // Check if file is readable to avoid KenLM throwing an exception const char* filename = lm_path.c_str(); if (access(filename, R_OK) != 0) { - return STT_ERR_SCORER_UNREADABLE; + return DS_ERR_SCORER_UNREADABLE; } // Check if the file format is valid to avoid KenLM throwing an exception lm::ngram::ModelType model_type; if (!lm::ngram::RecognizeBinary(filename, model_type)) { - return STT_ERR_SCORER_INVALID_LM; + return DS_ERR_SCORER_INVALID_LM; } // Load the LM @@ -97,7 +97,7 @@ int Scorer::load_lm(const std::string& lm_path) uint64_t trie_offset = language_model_->GetEndOfSearchOffset(); if (package_size <= trie_offset) { // File ends without a trie structure - return STT_ERR_SCORER_NO_TRIE; + return DS_ERR_SCORER_NO_TRIE; } // Read metadata and trie from file @@ -113,7 +113,7 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) if (magic != MAGIC) { std::cerr << "Error: Can't parse scorer file, invalid header. Try updating " "your scorer file." << std::endl; - return STT_ERR_SCORER_INVALID_TRIE; + return DS_ERR_SCORER_INVALID_TRIE; } int version; @@ -125,10 +125,10 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) if (version < FILE_VERSION) { std::cerr << "Update your scorer file."; } else { - std::cerr << "Downgrade your scorer file or update your version of Mozilla Voice STT."; + std::cerr << "Downgrade your scorer file or update your version of DeepSpeech."; } std::cerr << std::endl; - return STT_ERR_SCORER_VERSION_MISMATCH; + return DS_ERR_SCORER_VERSION_MISMATCH; } fin.read(reinterpret_cast(&is_utf8_mode_), sizeof(is_utf8_mode_)); @@ -143,7 +143,7 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) opt.mode = fst::FstReadOptions::MAP; opt.source = file_path; dictionary.reset(FstType::Read(fin, opt)); - return STT_ERR_OK; + return DS_ERR_OK; } bool Scorer::save_dictionary(const std::string& path, bool append_instead_of_overwrite) diff --git a/native_client/ctcdecode/scorer.h b/native_client/ctcdecode/scorer.h index ee361d7a..5aee1046 100644 --- a/native_client/ctcdecode/scorer.h +++ b/native_client/ctcdecode/scorer.h @@ -13,7 +13,7 @@ #include "path_trie.h" #include "alphabet.h" -#include "mozilla_voice_stt.h" +#include "deepspeech.h" const double OOV_SCORE = -1000.0; const std::string START_TOKEN = ""; diff --git a/native_client/ctcdecode/setup.py b/native_client/ctcdecode/setup.py index d9c5d707..82e702a8 100644 --- a/native_client/ctcdecode/setup.py +++ b/native_client/ctcdecode/setup.py @@ -51,7 +51,7 @@ def maybe_rebuild(srcs, out_name, build_dir): num_parallel=known_args.num_processes, debug=debug) -project_version = read('../../training/mozilla_voice_stt_training/VERSION').strip() +project_version = read('../../training/deepspeech_training/VERSION').strip() build_dir = 
'temp_build/temp_build' @@ -68,7 +68,7 @@ maybe_rebuild(KENLM_FILES, third_party_build, build_dir) maybe_rebuild(CTC_DECODER_FILES, ctc_decoder_build, build_dir) decoder_module = Extension( - name='mvs_ctcdecoder._swigwrapper', + name='ds_ctcdecoder._swigwrapper', sources=['swigwrapper.i'], swig_opts=['-c++', '-extranative'], language='c++', @@ -84,12 +84,12 @@ class BuildExtFirst(build): ('build_scripts', build.has_scripts)] setup( - name='mvs_ctcdecoder', + name='ds_ctcdecoder', version=project_version, description="""DS CTC decoder""", cmdclass = {'build': BuildExtFirst}, ext_modules=[decoder_module], - package_dir = {'mvs_ctcdecoder': '.'}, - py_modules=['mvs_ctcdecoder', 'mvs_ctcdecoder.swigwrapper'], + package_dir = {'ds_ctcdecoder': '.'}, + py_modules=['ds_ctcdecoder', 'ds_ctcdecoder.swigwrapper'], install_requires = ['numpy%s' % numpy_min_ver], ) diff --git a/native_client/ctcdecode/swigwrapper.i b/native_client/ctcdecode/swigwrapper.i index 9daf7d89..dbe67c68 100644 --- a/native_client/ctcdecode/swigwrapper.i +++ b/native_client/ctcdecode/swigwrapper.i @@ -42,14 +42,14 @@ namespace std { %constant const char* __version__ = ds_version(); %constant const char* __git_version__ = ds_git_version(); -// Import only the error code enum definitions from mozilla_voice_stt.h +// Import only the error code enum definitions from deepspeech.h // We can't just do |%ignore "";| here because it affects this file globally (even // files %include'd above). That causes SWIG to lose destructor information and // leads to leaks of the wrapper objects. // Instead we ignore functions and classes (structs), which are the only other -// things in mozilla_voice_stt.h. If we add some new construct to mozilla_voice_stt.h we need +// things in deepspeech.h. If we add some new construct to deepspeech.h we need // to update the ignore rules here to avoid exposing unwanted APIs in the decoder // package. %rename("$ignore", %$isfunction) ""; %rename("$ignore", %$isclass) ""; -%include "../mozilla_voice_stt.h" +%include "../deepspeech.h" diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index 01a9292b..38868d4b 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -9,7 +9,7 @@ #include #include -#include "mozilla_voice_stt.h" +#include "deepspeech.h" #include "alphabet.h" #include "modelstate.h" @@ -25,7 +25,7 @@ #ifdef __ANDROID__ #include -#define LOG_TAG "libmozilla_voice_stt" +#define LOG_TAG "libdeepspeech" #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__) #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__) #else @@ -263,23 +263,23 @@ StreamingState::processBatch(const vector& buf, unsigned int n_steps) } int -STT_CreateModel(const char* aModelPath, +DS_CreateModel(const char* aModelPath, ModelState** retval) { *retval = nullptr; std::cerr << "TensorFlow: " << tf_local_git_version() << std::endl; - std::cerr << "Mozilla Voice STT: " << ds_git_version() << std::endl; + std::cerr << "DeepSpeech: " << ds_git_version() << std::endl; #ifdef __ANDROID__ LOGE("TensorFlow: %s", tf_local_git_version()); LOGD("TensorFlow: %s", tf_local_git_version()); - LOGE("Mozilla Voice STT: %s", ds_git_version()); - LOGD("Mozilla Voice STT: %s", ds_git_version()); + LOGE("DeepSpeech: %s", ds_git_version()); + LOGD("DeepSpeech: %s", ds_git_version()); #endif if (!aModelPath || strlen(aModelPath) < 1) { std::cerr << "No model specified, cannot continue." 
<< std::endl; - return STT_ERR_NO_MODEL; + return DS_ERR_NO_MODEL; } std::unique_ptr model( @@ -292,79 +292,79 @@ STT_CreateModel(const char* aModelPath, if (!model) { std::cerr << "Could not allocate model state." << std::endl; - return STT_ERR_FAIL_CREATE_MODEL; + return DS_ERR_FAIL_CREATE_MODEL; } int err = model->init(aModelPath); - if (err != STT_ERR_OK) { + if (err != DS_ERR_OK) { return err; } *retval = model.release(); - return STT_ERR_OK; + return DS_ERR_OK; } unsigned int -STT_GetModelBeamWidth(const ModelState* aCtx) +DS_GetModelBeamWidth(const ModelState* aCtx) { return aCtx->beam_width_; } int -STT_SetModelBeamWidth(ModelState* aCtx, unsigned int aBeamWidth) +DS_SetModelBeamWidth(ModelState* aCtx, unsigned int aBeamWidth) { aCtx->beam_width_ = aBeamWidth; return 0; } int -STT_GetModelSampleRate(const ModelState* aCtx) +DS_GetModelSampleRate(const ModelState* aCtx) { return aCtx->sample_rate_; } void -STT_FreeModel(ModelState* ctx) +DS_FreeModel(ModelState* ctx) { delete ctx; } int -STT_EnableExternalScorer(ModelState* aCtx, +DS_EnableExternalScorer(ModelState* aCtx, const char* aScorerPath) { std::unique_ptr scorer(new Scorer()); int err = scorer->init(aScorerPath, aCtx->alphabet_); if (err != 0) { - return STT_ERR_INVALID_SCORER; + return DS_ERR_INVALID_SCORER; } aCtx->scorer_ = std::move(scorer); - return STT_ERR_OK; + return DS_ERR_OK; } int -STT_DisableExternalScorer(ModelState* aCtx) +DS_DisableExternalScorer(ModelState* aCtx) { if (aCtx->scorer_) { aCtx->scorer_.reset(); - return STT_ERR_OK; + return DS_ERR_OK; } - return STT_ERR_SCORER_NOT_ENABLED; + return DS_ERR_SCORER_NOT_ENABLED; } -int STT_SetScorerAlphaBeta(ModelState* aCtx, +int DS_SetScorerAlphaBeta(ModelState* aCtx, float aAlpha, float aBeta) { if (aCtx->scorer_) { aCtx->scorer_->reset_params(aAlpha, aBeta); - return STT_ERR_OK; + return DS_ERR_OK; } - return STT_ERR_SCORER_NOT_ENABLED; + return DS_ERR_SCORER_NOT_ENABLED; } int -STT_CreateStream(ModelState* aCtx, +DS_CreateStream(ModelState* aCtx, StreamingState** retval) { *retval = nullptr; @@ -372,7 +372,7 @@ STT_CreateStream(ModelState* aCtx, std::unique_ptr ctx(new StreamingState()); if (!ctx) { std::cerr << "Could not allocate streaming state." 
<< std::endl; - return STT_ERR_FAIL_CREATE_STREAM; + return DS_ERR_FAIL_CREATE_STREAM; } ctx->audio_buffer_.reserve(aCtx->audio_win_len_); @@ -393,11 +393,11 @@ STT_CreateStream(ModelState* aCtx, aCtx->scorer_); *retval = ctx.release(); - return STT_ERR_OK; + return DS_ERR_OK; } void -STT_FeedAudioContent(StreamingState* aSctx, +DS_FeedAudioContent(StreamingState* aSctx, const short* aBuffer, unsigned int aBufferSize) { @@ -405,32 +405,32 @@ STT_FeedAudioContent(StreamingState* aSctx, } char* -STT_IntermediateDecode(const StreamingState* aSctx) +DS_IntermediateDecode(const StreamingState* aSctx) { return aSctx->intermediateDecode(); } Metadata* -STT_IntermediateDecodeWithMetadata(const StreamingState* aSctx, +DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, unsigned int aNumResults) { return aSctx->intermediateDecodeWithMetadata(aNumResults); } char* -STT_FinishStream(StreamingState* aSctx) +DS_FinishStream(StreamingState* aSctx) { char* str = aSctx->finishStream(); - STT_FreeStream(aSctx); + DS_FreeStream(aSctx); return str; } Metadata* -STT_FinishStreamWithMetadata(StreamingState* aSctx, +DS_FinishStreamWithMetadata(StreamingState* aSctx, unsigned int aNumResults) { Metadata* result = aSctx->finishStreamWithMetadata(aNumResults); - STT_FreeStream(aSctx); + DS_FreeStream(aSctx); return result; } @@ -440,41 +440,41 @@ CreateStreamAndFeedAudioContent(ModelState* aCtx, unsigned int aBufferSize) { StreamingState* ctx; - int status = STT_CreateStream(aCtx, &ctx); - if (status != STT_ERR_OK) { + int status = DS_CreateStream(aCtx, &ctx); + if (status != DS_ERR_OK) { return nullptr; } - STT_FeedAudioContent(ctx, aBuffer, aBufferSize); + DS_FeedAudioContent(ctx, aBuffer, aBufferSize); return ctx; } char* -STT_SpeechToText(ModelState* aCtx, +DS_SpeechToText(ModelState* aCtx, const short* aBuffer, unsigned int aBufferSize) { StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize); - return STT_FinishStream(ctx); + return DS_FinishStream(ctx); } Metadata* -STT_SpeechToTextWithMetadata(ModelState* aCtx, +DS_SpeechToTextWithMetadata(ModelState* aCtx, const short* aBuffer, unsigned int aBufferSize, unsigned int aNumResults) { StreamingState* ctx = CreateStreamAndFeedAudioContent(aCtx, aBuffer, aBufferSize); - return STT_FinishStreamWithMetadata(ctx, aNumResults); + return DS_FinishStreamWithMetadata(ctx, aNumResults); } void -STT_FreeStream(StreamingState* aSctx) +DS_FreeStream(StreamingState* aSctx) { delete aSctx; } void -STT_FreeMetadata(Metadata* m) +DS_FreeMetadata(Metadata* m) { if (m) { for (int i = 0; i < m->num_transcripts; ++i) { @@ -491,13 +491,13 @@ STT_FreeMetadata(Metadata* m) } void -STT_FreeString(char* str) +DS_FreeString(char* str) { free(str); } char* -STT_Version() +DS_Version() { return strdup(ds_version()); } diff --git a/native_client/mozilla_voice_stt.h b/native_client/deepspeech.h similarity index 54% rename from native_client/mozilla_voice_stt.h rename to native_client/deepspeech.h index 52895aa4..1df3cf2e 100644 --- a/native_client/mozilla_voice_stt.h +++ b/native_client/deepspeech.h @@ -1,5 +1,5 @@ -#ifndef MOZILLA_VOICE_STT_H -#define MOZILLA_VOICE_STT_H +#ifndef DEEPSPEECH_H +#define DEEPSPEECH_H #ifdef __cplusplus extern "C" { @@ -7,12 +7,12 @@ extern "C" { #ifndef SWIG #if defined _MSC_VER - #define STT_EXPORT __declspec(dllexport) + #define DEEPSPEECH_EXPORT __declspec(dllexport) #else - #define STT_EXPORT __attribute__ ((visibility("default"))) + #define DEEPSPEECH_EXPORT __attribute__ ((visibility("default"))) #endif /*End of 
_MSC_VER*/ #else - #define STT_EXPORT + #define DEEPSPEECH_EXPORT #endif typedef struct ModelState ModelState; @@ -61,89 +61,89 @@ typedef struct Metadata { // sphinx-doc: error_code_listing_start -#define STT_FOR_EACH_ERROR(APPLY) \ - APPLY(STT_ERR_OK, 0x0000, "No error.") \ - APPLY(STT_ERR_NO_MODEL, 0x1000, "Missing model information.") \ - APPLY(STT_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. (Data corruption?)") \ - APPLY(STT_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \ - APPLY(STT_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \ - APPLY(STT_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \ - APPLY(STT_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \ - APPLY(STT_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \ - APPLY(STT_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \ - APPLY(STT_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \ - APPLY(STT_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \ - APPLY(STT_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \ - APPLY(STT_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \ - APPLY(STT_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \ - APPLY(STT_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \ - APPLY(STT_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \ - APPLY(STT_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \ - APPLY(STT_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto buffer model file.") \ - APPLY(STT_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \ - APPLY(STT_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not allocate model state.") +#define DS_FOR_EACH_ERROR(APPLY) \ + APPLY(DS_ERR_OK, 0x0000, "No error.") \ + APPLY(DS_ERR_NO_MODEL, 0x1000, "Missing model information.") \ + APPLY(DS_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. 
(Data corruption?)") \ + APPLY(DS_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \ + APPLY(DS_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \ + APPLY(DS_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \ + APPLY(DS_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \ + APPLY(DS_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \ + APPLY(DS_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \ + APPLY(DS_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \ + APPLY(DS_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \ + APPLY(DS_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \ + APPLY(DS_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \ + APPLY(DS_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \ + APPLY(DS_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \ + APPLY(DS_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \ + APPLY(DS_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \ + APPLY(DS_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto buffer model file.") \ + APPLY(DS_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \ + APPLY(DS_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not allocate model state.") // sphinx-doc: error_code_listing_end -enum STT_Error_Codes +enum DeepSpeech_Error_Codes { #define DEFINE(NAME, VALUE, DESC) NAME = VALUE, -STT_FOR_EACH_ERROR(DEFINE) +DS_FOR_EACH_ERROR(DEFINE) #undef DEFINE }; /** - * @brief An object providing an interface to a trained Mozilla Voice STT model. + * @brief An object providing an interface to a trained DeepSpeech model. * * @param aModelPath The path to the frozen model graph. * @param[out] retval a ModelState pointer * * @return Zero on success, non-zero on failure. */ -STT_EXPORT -int STT_CreateModel(const char* aModelPath, - ModelState** retval); +DEEPSPEECH_EXPORT +int DS_CreateModel(const char* aModelPath, + ModelState** retval); /** - * @brief Get beam width value used by the model. If {@link STT_SetModelBeamWidth} + * @brief Get beam width value used by the model. If {@link DS_SetModelBeamWidth} * was not called before, will return the default value loaded from the * model file. * - * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. + * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. * * @return Beam width value used by the model. */ -STT_EXPORT -unsigned int STT_GetModelBeamWidth(const ModelState* aCtx); +DEEPSPEECH_EXPORT +unsigned int DS_GetModelBeamWidth(const ModelState* aCtx); /** * @brief Set beam width value used by the model. * - * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. + * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. * @param aBeamWidth The beam width used by the model. A larger beam width value * generates better results at the cost of decoding time. * * @return Zero on success, non-zero on failure. */ -STT_EXPORT -int STT_SetModelBeamWidth(ModelState* aCtx, - unsigned int aBeamWidth); +DEEPSPEECH_EXPORT +int DS_SetModelBeamWidth(ModelState* aCtx, + unsigned int aBeamWidth); /** * @brief Return the sample rate expected by a model. * - * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. + * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. * * @return Sample rate expected by the model for its input. 
*/ -STT_EXPORT -int STT_GetModelSampleRate(const ModelState* aCtx); +DEEPSPEECH_EXPORT +int DS_GetModelSampleRate(const ModelState* aCtx); /** * @brief Frees associated resources and destroys model object. */ -STT_EXPORT -void STT_FreeModel(ModelState* ctx); +DEEPSPEECH_EXPORT +void DS_FreeModel(ModelState* ctx); /** * @brief Enable decoding using an external scorer. @@ -153,9 +153,9 @@ void STT_FreeModel(ModelState* ctx); * * @return Zero on success, non-zero on failure (invalid arguments). */ -STT_EXPORT -int STT_EnableExternalScorer(ModelState* aCtx, - const char* aScorerPath); +DEEPSPEECH_EXPORT +int DS_EnableExternalScorer(ModelState* aCtx, + const char* aScorerPath); /** * @brief Disable decoding using an external scorer. @@ -164,8 +164,8 @@ int STT_EnableExternalScorer(ModelState* aCtx, * * @return Zero on success, non-zero on failure. */ -STT_EXPORT -int STT_DisableExternalScorer(ModelState* aCtx); +DEEPSPEECH_EXPORT +int DS_DisableExternalScorer(ModelState* aCtx); /** * @brief Set hyperparameters alpha and beta of the external scorer. @@ -176,13 +176,13 @@ int STT_DisableExternalScorer(ModelState* aCtx); * * @return Zero on success, non-zero on failure. */ -STT_EXPORT -int STT_SetScorerAlphaBeta(ModelState* aCtx, - float aAlpha, - float aBeta); +DEEPSPEECH_EXPORT +int DS_SetScorerAlphaBeta(ModelState* aCtx, + float aAlpha, + float aBeta); /** - * @brief Use the Mozilla Voice STT model to convert speech to text. + * @brief Use the DeepSpeech model to convert speech to text. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate @@ -190,15 +190,15 @@ int STT_SetScorerAlphaBeta(ModelState* aCtx, * @param aBufferSize The number of samples in the audio signal. * * @return The STT result. The user is responsible for freeing the string using - * {@link STT_FreeString()}. Returns NULL on error. + * {@link DS_FreeString()}. Returns NULL on error. */ -STT_EXPORT -char* STT_SpeechToText(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize); +DEEPSPEECH_EXPORT +char* DS_SpeechToText(ModelState* aCtx, + const short* aBuffer, + unsigned int aBufferSize); /** - * @brief Use the Mozilla Voice STT model to convert speech to text and output results + * @brief Use the DeepSpeech model to convert speech to text and output results * including metadata. * * @param aCtx The ModelState pointer for the model to use. @@ -209,19 +209,19 @@ char* STT_SpeechToText(ModelState* aCtx, * * @return Metadata struct containing multiple CandidateTranscript structs. Each * transcript has per-token metadata including timing information. The - * user is responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. + * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. * Returns NULL on error. */ -STT_EXPORT -Metadata* STT_SpeechToTextWithMetadata(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize, - unsigned int aNumResults); +DEEPSPEECH_EXPORT +Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, + const short* aBuffer, + unsigned int aBufferSize, + unsigned int aNumResults); /** * @brief Create a new streaming inference state. The streaming state returned - * by this function can then be passed to {@link STT_FeedAudioContent()} - * and {@link STT_FinishStream()}. + * by this function can then be passed to {@link DS_FeedAudioContent()} + * and {@link DS_FinishStream()}. * * @param aCtx The ModelState pointer for the model to use. 
* @param[out] retval an opaque pointer that represents the streaming state. Can @@ -229,129 +229,129 @@ Metadata* STT_SpeechToTextWithMetadata(ModelState* aCtx, * * @return Zero for success, non-zero on failure. */ -STT_EXPORT -int STT_CreateStream(ModelState* aCtx, - StreamingState** retval); +DEEPSPEECH_EXPORT +int DS_CreateStream(ModelState* aCtx, + StreamingState** retval); /** * @brief Feed audio samples to an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aBuffer An array of 16-bit, mono raw audio samples at the * appropriate sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in @p aBuffer. */ -STT_EXPORT -void STT_FeedAudioContent(StreamingState* aSctx, - const short* aBuffer, - unsigned int aBufferSize); +DEEPSPEECH_EXPORT +void DS_FeedAudioContent(StreamingState* aSctx, + const short* aBuffer, + unsigned int aBufferSize); /** * @brief Compute the intermediate decoding of an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * * @return The STT intermediate result. The user is responsible for freeing the - * string using {@link STT_FreeString()}. + * string using {@link DS_FreeString()}. */ -STT_EXPORT -char* STT_IntermediateDecode(const StreamingState* aSctx); +DEEPSPEECH_EXPORT +char* DS_IntermediateDecode(const StreamingState* aSctx); /** * @brief Compute the intermediate decoding of an ongoing streaming inference, * return results including metadata. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. * Returns NULL on error. */ -STT_EXPORT -Metadata* STT_IntermediateDecodeWithMetadata(const StreamingState* aSctx, - unsigned int aNumResults); +DEEPSPEECH_EXPORT +Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, + unsigned int aNumResults); /** * @brief Compute the final decoding of an ongoing streaming inference and return * the result. Signals the end of an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * * @return The STT result. The user is responsible for freeing the string using - * {@link STT_FreeString()}. + * {@link DS_FreeString()}. * * @note This method will free the state pointer (@p aSctx). */ -STT_EXPORT -char* STT_FinishStream(StreamingState* aSctx); +DEEPSPEECH_EXPORT +char* DS_FinishStream(StreamingState* aSctx); /** * @brief Compute the final decoding of an ongoing streaming inference and return * results including metadata. Signals the end of an ongoing streaming * inference. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. 
* @param aNumResults The number of candidate transcripts to return. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. + * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. * Returns NULL on error. * * @note This method will free the state pointer (@p aSctx). */ -STT_EXPORT -Metadata* STT_FinishStreamWithMetadata(StreamingState* aSctx, - unsigned int aNumResults); +DEEPSPEECH_EXPORT +Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, + unsigned int aNumResults); /** * @brief Destroy a streaming state without decoding the computed logits. This * can be used if you no longer need the result of an ongoing streaming * inference and don't want to perform a costly decode operation. * - * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. * * @note This method will free the state pointer (@p aSctx). */ -STT_EXPORT -void STT_FreeStream(StreamingState* aSctx); +DEEPSPEECH_EXPORT +void DS_FreeStream(StreamingState* aSctx); /** * @brief Free memory allocated for metadata information. */ -STT_EXPORT -void STT_FreeMetadata(Metadata* m); +DEEPSPEECH_EXPORT +void DS_FreeMetadata(Metadata* m); /** - * @brief Free a char* string returned by the Mozilla Voice STT API. + * @brief Free a char* string returned by the DeepSpeech API. */ -STT_EXPORT -void STT_FreeString(char* str); +DEEPSPEECH_EXPORT +void DS_FreeString(char* str); /** * @brief Returns the version of this library. The returned version is a semantic - * version (SemVer 2.0.0). The string returned must be freed with {@link STT_FreeString()}. + * version (SemVer 2.0.0). The string returned must be freed with {@link DS_FreeString()}. * * @return The version string. */ -STT_EXPORT -char* STT_Version(); +DEEPSPEECH_EXPORT +char* DS_Version(); /** * @brief Returns a textual description corresponding to an error code. - * The string returned must be freed with @{link STT_FreeString()}. + * The string returned must be freed with @{link DS_FreeString()}. * * @return The error description. 
*/ -STT_EXPORT -char* STT_ErrorCodeToErrorMessage(int aErrorCode); +DEEPSPEECH_EXPORT +char* DS_ErrorCodeToErrorMessage(int aErrorCode); -#undef STT_EXPORT +#undef DEEPSPEECH_EXPORT #ifdef __cplusplus } #endif -#endif /* MOZILLA_VOICE_STT_H */ +#endif /* DEEPSPEECH_H */ diff --git a/native_client/deepspeech_errors.cc b/native_client/deepspeech_errors.cc index 69b580f6..1f1e4d8d 100644 --- a/native_client/deepspeech_errors.cc +++ b/native_client/deepspeech_errors.cc @@ -1,8 +1,8 @@ -#include "mozilla_voice_stt.h" +#include "deepspeech.h" #include char* -STT_ErrorCodeToErrorMessage(int aErrorCode) +DS_ErrorCodeToErrorMessage(int aErrorCode) { #define RETURN_MESSAGE(NAME, VALUE, DESC) \ case NAME: \ @@ -10,7 +10,7 @@ STT_ErrorCodeToErrorMessage(int aErrorCode) switch(aErrorCode) { - STT_FOR_EACH_ERROR(RETURN_MESSAGE) + DS_FOR_EACH_ERROR(RETURN_MESSAGE) default: return strdup("Unknown error, please make sure you are using the correct native binary."); } diff --git a/native_client/definitions.mk b/native_client/definitions.mk index 937aa9d4..0c8ab656 100644 --- a/native_client/definitions.mk +++ b/native_client/definitions.mk @@ -18,9 +18,9 @@ ifeq ($(findstring _NT,$(OS)),_NT) PLATFORM_EXE_SUFFIX := .exe endif -DEEPSPEECH_BIN := mozilla_voice_stt$(PLATFORM_EXE_SUFFIX) +DEEPSPEECH_BIN := deepspeech$(PLATFORM_EXE_SUFFIX) CFLAGS_DEEPSPEECH := -std=c++11 -o $(DEEPSPEECH_BIN) -LINK_DEEPSPEECH := -lmozilla_voice_stt +LINK_DEEPSPEECH := -ldeepspeech LINK_PATH_DEEPSPEECH := -L${TFDIR}/bazel-bin/native_client ifeq ($(TARGET),host) @@ -53,7 +53,7 @@ TOOL_CC := cl.exe TOOL_CXX := cl.exe TOOL_LD := link.exe TOOL_LIBEXE := lib.exe -LINK_DEEPSPEECH := $(TFDIR)\bazel-bin\native_client\libmozilla_voice_stt.so.if.lib +LINK_DEEPSPEECH := $(TFDIR)\bazel-bin\native_client\libdeepspeech.so.if.lib LINK_PATH_DEEPSPEECH := CFLAGS_DEEPSPEECH := -nologo -Fe$(DEEPSPEECH_BIN) SOX_CFLAGS := @@ -174,7 +174,7 @@ define copy_missing_libs new_missing="$$( (for f in $$(otool -L $$lib 2>/dev/null | tail -n +2 | awk '{ print $$1 }' | grep -v '$$lib'); do ls -hal $$f; done;) 2>&1 | grep 'No such' | cut -d':' -f2 | xargs basename -a)"; \ missing_libs="$$missing_libs $$new_missing"; \ elif [ "$(OS)" = "${TC_MSYS_VERSION}" ]; then \ - missing_libs="libmozilla_voice_stt.so"; \ + missing_libs="libdeepspeech.so"; \ else \ missing_libs="$$missing_libs $$($(LDD) $$lib | grep 'not found' | awk '{ print $$1 }')"; \ fi; \ @@ -200,11 +200,11 @@ endef SWIG_DIST_URL ?= ifeq ($(findstring Linux,$(OS)),Linux) -SWIG_DIST_URL := "https://community-tc.services.mozilla.com/api/index/v1/task/project.mozilla-voice-stt.swig.linux.amd64.1a4c14945012f1282c2eddc174fb7674d5295de8.0/artifacts/public/ds-swig.tar.gz" +SWIG_DIST_URL := "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.swig.linux.amd64.b5fea54d39832d1d132d7dd921b69c0c2c9d5118/artifacts/public/ds-swig.tar.gz" else ifeq ($(findstring Darwin,$(OS)),Darwin) -SWIG_DIST_URL := "https://community-tc.services.mozilla.com/api/index/v1/task/project.mozilla-voice-stt.swig.darwin.amd64.1a4c14945012f1282c2eddc174fb7674d5295de8.0/artifacts/public/ds-swig.tar.gz" +SWIG_DIST_URL := "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.swig.darwin.amd64.b5fea54d39832d1d132d7dd921b69c0c2c9d5118/artifacts/public/ds-swig.tar.gz" else ifeq ($(findstring _NT,$(OS)),_NT) -SWIG_DIST_URL := 
"https://community-tc.services.mozilla.com/api/index/v1/task/project.mozilla-voice-stt.swig.win.amd64.1a4c14945012f1282c2eddc174fb7674d5295de8.0/artifacts/public/ds-swig.tar.gz" +SWIG_DIST_URL := "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.swig.win.amd64.b5fea54d39832d1d132d7dd921b69c0c2c9d5118/artifacts/public/ds-swig.tar.gz" else $(error There is no prebuilt SWIG available for your platform. Please produce one and set SWIG_DIST_URL.) endif diff --git a/native_client/dotnet/MozillaVoiceStt.sln b/native_client/dotnet/DeepSpeech.sln similarity index 77% rename from native_client/dotnet/MozillaVoiceStt.sln rename to native_client/dotnet/DeepSpeech.sln index 0bf2b52e..78afe7db 100644 --- a/native_client/dotnet/MozillaVoiceStt.sln +++ b/native_client/dotnet/DeepSpeech.sln @@ -2,9 +2,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.30204.135 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MozillaVoiceSttClient", "MozillaVoiceSttClient\MozillaVoiceSttClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DeepSpeechClient", "DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceSttConsole", "MozillaVoiceSttConsole\MozillaVoiceSttConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/native_client/dotnet/MozillaVoiceSttClient/MozillaVoiceStt.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs similarity index 72% rename from native_client/dotnet/MozillaVoiceSttClient/MozillaVoiceStt.cs rename to native_client/dotnet/DeepSpeechClient/DeepSpeech.cs index a331e393..08a3808b 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/MozillaVoiceStt.cs +++ b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs @@ -1,34 +1,34 @@ -using MozillaVoiceSttClient.Interfaces; -using MozillaVoiceSttClient.Extensions; +using DeepSpeechClient.Interfaces; +using DeepSpeechClient.Extensions; using System; using System.IO; -using MozillaVoiceSttClient.Enums; -using MozillaVoiceSttClient.Models; +using DeepSpeechClient.Enums; +using DeepSpeechClient.Models; -namespace MozillaVoiceSttClient +namespace DeepSpeechClient { /// - /// Concrete implementation of . + /// Concrete implementation of . /// - public class MozillaVoiceSttModel : IMozillaVoiceSttModel + public class DeepSpeech : IDeepSpeech { private unsafe IntPtr** _modelStatePP; /// - /// Initializes a new instance of class and creates a new acoustic model. + /// Initializes a new instance of class and creates a new acoustic model. /// /// The path to the frozen model graph. /// Thrown when the native binary failed to create the model. - public MozillaVoiceSttModel(string aModelPath) + public DeepSpeech(string aModelPath) { CreateModel(aModelPath); } - #region IMozillaVoiceSttModel + #region IDeepSpeech /// - /// Create an object providing an interface to a trained Mozilla Voice STT model. + /// Create an object providing an interface to a trained DeepSpeech model. /// /// The path to the frozen model graph. /// Thrown when the native binary failed to create the model. 
@@ -48,7 +48,7 @@ namespace MozillaVoiceSttClient { throw new FileNotFoundException(exceptionMessage); } - var resultCode = NativeImp.STT_CreateModel(aModelPath, + var resultCode = NativeImp.DS_CreateModel(aModelPath, ref _modelStatePP); EvaluateResultCode(resultCode); } @@ -60,7 +60,7 @@ namespace MozillaVoiceSttClient /// Beam width value used by the model. public unsafe uint GetModelBeamWidth() { - return NativeImp.STT_GetModelBeamWidth(_modelStatePP); + return NativeImp.DS_GetModelBeamWidth(_modelStatePP); } /// @@ -70,7 +70,7 @@ namespace MozillaVoiceSttClient /// Thrown on failure. public unsafe void SetModelBeamWidth(uint aBeamWidth) { - var resultCode = NativeImp.STT_SetModelBeamWidth(_modelStatePP, aBeamWidth); + var resultCode = NativeImp.DS_SetModelBeamWidth(_modelStatePP, aBeamWidth); EvaluateResultCode(resultCode); } @@ -80,7 +80,7 @@ namespace MozillaVoiceSttClient /// Sample rate. public unsafe int GetModelSampleRate() { - return NativeImp.STT_GetModelSampleRate(_modelStatePP); + return NativeImp.DS_GetModelSampleRate(_modelStatePP); } /// @@ -89,9 +89,9 @@ namespace MozillaVoiceSttClient /// Native result code. private void EvaluateResultCode(ErrorCodes resultCode) { - if (resultCode != ErrorCodes.STT_ERR_OK) + if (resultCode != ErrorCodes.DS_ERR_OK) { - throw new ArgumentException(NativeImp.STT_ErrorCodeToErrorMessage((int)resultCode).PtrToString()); + throw new ArgumentException(NativeImp.DS_ErrorCodeToErrorMessage((int)resultCode).PtrToString()); } } @@ -100,7 +100,7 @@ namespace MozillaVoiceSttClient /// public unsafe void Dispose() { - NativeImp.STT_FreeModel(_modelStatePP); + NativeImp.DS_FreeModel(_modelStatePP); } /// @@ -120,7 +120,7 @@ namespace MozillaVoiceSttClient throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}"); } - var resultCode = NativeImp.STT_EnableExternalScorer(_modelStatePP, aScorerPath); + var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath); EvaluateResultCode(resultCode); } @@ -130,7 +130,7 @@ namespace MozillaVoiceSttClient /// Thrown when an external scorer is not enabled. public unsafe void DisableExternalScorer() { - var resultCode = NativeImp.STT_DisableExternalScorer(_modelStatePP); + var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP); EvaluateResultCode(resultCode); } @@ -142,7 +142,7 @@ namespace MozillaVoiceSttClient /// Thrown when an external scorer is not enabled. public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta) { - var resultCode = NativeImp.STT_SetScorerAlphaBeta(_modelStatePP, + var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP, aAlpha, aBeta); EvaluateResultCode(resultCode); @@ -153,9 +153,9 @@ namespace MozillaVoiceSttClient /// /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - public unsafe void FeedAudioContent(MozillaVoiceSttStream stream, short[] aBuffer, uint aBufferSize) + public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize) { - NativeImp.STT_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); + NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); } /// @@ -163,9 +163,9 @@ namespace MozillaVoiceSttClient /// /// Instance of the stream to finish. /// The STT result. 
- public unsafe string FinishStream(MozillaVoiceSttStream stream) + public unsafe string FinishStream(DeepSpeechStream stream) { - return NativeImp.STT_FinishStream(stream.GetNativePointer()).PtrToString(); + return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString(); } /// @@ -174,9 +174,9 @@ namespace MozillaVoiceSttClient /// Instance of the stream to finish. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. - public unsafe Metadata FinishStreamWithMetadata(MozillaVoiceSttStream stream, uint aNumResults) + public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) { - return NativeImp.STT_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -184,9 +184,9 @@ namespace MozillaVoiceSttClient /// /// Instance of the stream to decode. /// The STT intermediate result. - public unsafe string IntermediateDecode(MozillaVoiceSttStream stream) + public unsafe string IntermediateDecode(DeepSpeechStream stream) { - return NativeImp.STT_IntermediateDecode(stream.GetNativePointer()).PtrToString(); + return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString(); } /// @@ -195,9 +195,9 @@ namespace MozillaVoiceSttClient /// Instance of the stream to decode. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The STT intermediate result. - public unsafe Metadata IntermediateDecodeWithMetadata(MozillaVoiceSttStream stream, uint aNumResults) + public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) { - return NativeImp.STT_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -206,18 +206,18 @@ namespace MozillaVoiceSttClient /// public unsafe string Version() { - return NativeImp.STT_Version().PtrToString(); + return NativeImp.DS_Version().PtrToString(); } /// /// Creates a new streaming inference state. /// - public unsafe MozillaVoiceSttStream CreateStream() + public unsafe DeepSpeechStream CreateStream() { IntPtr** streamingStatePointer = null; - var resultCode = NativeImp.STT_CreateStream(_modelStatePP, ref streamingStatePointer); + var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer); EvaluateResultCode(resultCode); - return new MozillaVoiceSttStream(streamingStatePointer); + return new DeepSpeechStream(streamingStatePointer); } /// @@ -225,25 +225,25 @@ namespace MozillaVoiceSttClient /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - public unsafe void FreeStream(MozillaVoiceSttStream stream) + public unsafe void FreeStream(DeepSpeechStream stream) { - NativeImp.STT_FreeStream(stream.GetNativePointer()); + NativeImp.DS_FreeStream(stream.GetNativePointer()); stream.Dispose(); } /// - /// Use the Mozilla Voice STT model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. /// The STT result. Returns NULL on error. 
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize) { - return NativeImp.STT_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); + return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); } /// - /// Use the Mozilla Voice STT model to perform Speech-To-Text, return results including metadata. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -251,7 +251,7 @@ namespace MozillaVoiceSttClient /// The extended metadata. Returns NULL on error. public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { - return NativeImp.STT_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata(); + return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata(); } #endregion diff --git a/native_client/dotnet/MozillaVoiceSttClient/MozillaVoiceSttClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj similarity index 100% rename from native_client/dotnet/MozillaVoiceSttClient/MozillaVoiceSttClient.csproj rename to native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj diff --git a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs new file mode 100644 index 00000000..30660add --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs @@ -0,0 +1,30 @@ +namespace DeepSpeechClient.Enums +{ + /// + /// Error codes from the native DeepSpeech binary. + /// + internal enum ErrorCodes + { + // OK + DS_ERR_OK = 0x0000, + + // Missing invormations + DS_ERR_NO_MODEL = 0x1000, + + // Invalid parameters + DS_ERR_INVALID_ALPHABET = 0x2000, + DS_ERR_INVALID_SHAPE = 0x2001, + DS_ERR_INVALID_SCORER = 0x2002, + DS_ERR_MODEL_INCOMPATIBLE = 0x2003, + DS_ERR_SCORER_NOT_ENABLED = 0x2004, + + // Runtime failures + DS_ERR_FAIL_INIT_MMAP = 0x3000, + DS_ERR_FAIL_INIT_SESS = 0x3001, + DS_ERR_FAIL_INTERPRETER = 0x3002, + DS_ERR_FAIL_RUN_SESS = 0x3003, + DS_ERR_FAIL_CREATE_STREAM = 0x3004, + DS_ERR_FAIL_READ_PROTOBUF = 0x3005, + DS_ERR_FAIL_CREATE_SESS = 0x3006, + } +} diff --git a/native_client/dotnet/MozillaVoiceSttClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs similarity index 95% rename from native_client/dotnet/MozillaVoiceSttClient/Extensions/NativeExtensions.cs rename to native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs index 0d2229f9..9325f4b8 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Extensions/NativeExtensions.cs +++ b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs @@ -1,9 +1,9 @@ -using MozillaVoiceSttClient.Structs; +using DeepSpeechClient.Structs; using System; using System.Runtime.InteropServices; using System.Text; -namespace MozillaVoiceSttClient.Extensions +namespace DeepSpeechClient.Extensions { internal static class NativeExtensions { @@ -20,7 +20,7 @@ namespace MozillaVoiceSttClient.Extensions byte[] buffer = new byte[len]; Marshal.Copy(intPtr, buffer, 0, buffer.Length); if (releasePtr) - NativeImp.STT_FreeString(intPtr); + NativeImp.DS_FreeString(intPtr); string result = Encoding.UTF8.GetString(buffer); return result; } @@ -86,7 +86,7 @@ namespace MozillaVoiceSttClient.Extensions metadata.transcripts += 
sizeOfCandidateTranscript; } - NativeImp.STT_FreeMetadata(intPtr); + NativeImp.DS_FreeMetadata(intPtr); return managedMetadata; } } diff --git a/native_client/dotnet/MozillaVoiceSttClient/Interfaces/IMozillaVoiceSttModel.cs b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs similarity index 85% rename from native_client/dotnet/MozillaVoiceSttClient/Interfaces/IMozillaVoiceSttModel.cs rename to native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs index ede8b5f4..e1ed9cad 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Interfaces/IMozillaVoiceSttModel.cs +++ b/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs @@ -1,13 +1,13 @@ -using MozillaVoiceSttClient.Models; +using DeepSpeechClient.Models; using System; using System.IO; -namespace MozillaVoiceSttClient.Interfaces +namespace DeepSpeechClient.Interfaces { /// - /// Client interface of Mozilla Voice STT. + /// Client interface of Mozilla's DeepSpeech implementation. /// - public interface IMozillaVoiceSttModel : IDisposable + public interface IDeepSpeech : IDisposable { /// /// Return version of this library. The returned version is a semantic version @@ -59,7 +59,7 @@ namespace MozillaVoiceSttClient.Interfaces unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta); /// - /// Use the Mozilla Voice STT model to perform Speech-To-Text. + /// Use the DeepSpeech model to perform Speech-To-Text. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -68,7 +68,7 @@ namespace MozillaVoiceSttClient.Interfaces uint aBufferSize); /// - /// Use the Mozilla Voice STT model to perform Speech-To-Text, return results including metadata. + /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -83,26 +83,26 @@ namespace MozillaVoiceSttClient.Interfaces /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - unsafe void FreeStream(MozillaVoiceSttStream stream); + unsafe void FreeStream(DeepSpeechStream stream); /// /// Creates a new streaming inference state. /// - unsafe MozillaVoiceSttStream CreateStream(); + unsafe DeepSpeechStream CreateStream(); /// /// Feeds audio samples to an ongoing streaming inference. /// /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - unsafe void FeedAudioContent(MozillaVoiceSttStream stream, short[] aBuffer, uint aBufferSize); + unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize); /// /// Computes the intermediate decoding of an ongoing streaming inference. /// /// Instance of the stream to decode. /// The STT intermediate result. - unsafe string IntermediateDecode(MozillaVoiceSttStream stream); + unsafe string IntermediateDecode(DeepSpeechStream stream); /// /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. @@ -110,14 +110,14 @@ namespace MozillaVoiceSttClient.Interfaces /// Instance of the stream to decode. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. 
- unsafe Metadata IntermediateDecodeWithMetadata(MozillaVoiceSttStream stream, uint aNumResults); + unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. /// The STT result. - unsafe string FinishStream(MozillaVoiceSttStream stream); + unsafe string FinishStream(DeepSpeechStream stream); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. @@ -125,6 +125,6 @@ namespace MozillaVoiceSttClient.Interfaces /// Instance of the stream to finish. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. - unsafe Metadata FinishStreamWithMetadata(MozillaVoiceSttStream stream, uint aNumResults); + unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); } } diff --git a/native_client/dotnet/MozillaVoiceSttClient/Models/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs similarity index 92% rename from native_client/dotnet/MozillaVoiceSttClient/Models/CandidateTranscript.cs rename to native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs index abe1aa30..cc6b5d28 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Models/CandidateTranscript.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs @@ -1,4 +1,4 @@ -namespace MozillaVoiceSttClient.Models +namespace DeepSpeechClient.Models { /// /// Stores the entire CTC output as an array of character metadata objects. diff --git a/native_client/dotnet/MozillaVoiceSttClient/Models/DeepSpeechStream.cs b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs similarity index 80% rename from native_client/dotnet/MozillaVoiceSttClient/Models/DeepSpeechStream.cs rename to native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs index 0223a6bd..e4605f5e 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Models/DeepSpeechStream.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs @@ -1,19 +1,19 @@ using System; -namespace MozillaVoiceSttClient.Models +namespace DeepSpeechClient.Models { /// /// Wrapper of the pointer used for the decoding stream. /// - public class MozillaVoiceSttStream : IDisposable + public class DeepSpeechStream : IDisposable { private unsafe IntPtr** _streamingStatePp; /// - /// Initializes a new instance of . + /// Initializes a new instance of . /// /// Native pointer of the native stream. - public unsafe MozillaVoiceSttStream(IntPtr** streamingStatePP) + public unsafe DeepSpeechStream(IntPtr** streamingStatePP) { _streamingStatePp = streamingStatePP; } diff --git a/native_client/dotnet/MozillaVoiceSttClient/Models/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs similarity index 88% rename from native_client/dotnet/MozillaVoiceSttClient/Models/Metadata.cs rename to native_client/dotnet/DeepSpeechClient/Models/Metadata.cs index ea0666bf..fb6c613d 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Models/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs @@ -1,4 +1,4 @@ -namespace MozillaVoiceSttClient.Models +namespace DeepSpeechClient.Models { /// /// Stores the entire CTC output as an array of character metadata objects. 
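Note on the renamed .NET client: taken together, the DeepSpeechClient changes above restore the DeepSpeech names for the managed wrapper (`DeepSpeech`, `DeepSpeechStream`, `IDeepSpeech`) over the `DS_*` exports of `libdeepspeech.so`. The sketch below shows how those pieces fit together after this patch. It is an illustration only, not part of the patch: the model and scorer paths are placeholders, and `LoadSamples()` stands in for however you obtain 16-bit mono PCM at the model sample rate (the console project in this repo uses NAudio for that).

```csharp
using System;
using DeepSpeechClient;            // DeepSpeech wrapper class after the rename
using DeepSpeechClient.Interfaces; // IDeepSpeech
using DeepSpeechClient.Models;     // DeepSpeechStream

class Example
{
    static void Main()
    {
        // Placeholder paths -- substitute your own acoustic model and scorer files.
        using (IDeepSpeech stt = new DeepSpeech("output_graph.pbmm"))
        {
            stt.EnableExternalScorer("kenlm.scorer");

            short[] audio = LoadSamples(); // 16-bit mono PCM at stt.GetModelSampleRate()

            // One-shot decoding over the whole buffer.
            Console.WriteLine(stt.SpeechToText(audio, (uint)audio.Length));

            // Streaming decoding with an intermediate result.
            DeepSpeechStream stream = stt.CreateStream();
            stt.FeedAudioContent(stream, audio, (uint)audio.Length);
            Console.WriteLine(stt.IntermediateDecode(stream)); // partial transcript
            Console.WriteLine(stt.FinishStream(stream));       // final transcript; releases the native stream state
        }
    }

    // Hypothetical helper: decode a WAV/Opus source into raw 16-bit samples.
    static short[] LoadSamples() => new short[0];
}
```

Failures surface as ArgumentException thrown by the wrapper's EvaluateResultCode, which maps the native result code through DS_ErrorCodeToErrorMessage, as shown in the DeepSpeech.cs hunks above.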
diff --git a/native_client/dotnet/MozillaVoiceSttClient/Models/TokenMetadata.cs b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs similarity index 92% rename from native_client/dotnet/MozillaVoiceSttClient/Models/TokenMetadata.cs rename to native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs index 86e8bdda..5f2dea56 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Models/TokenMetadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs @@ -1,4 +1,4 @@ -namespace MozillaVoiceSttClient.Models +namespace DeepSpeechClient.Models { /// /// Stores each individual character, along with its timing information. diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs new file mode 100644 index 00000000..bc77cf1b --- /dev/null +++ b/native_client/dotnet/DeepSpeechClient/NativeImp.cs @@ -0,0 +1,102 @@ +using DeepSpeechClient.Enums; + +using System; +using System.Runtime.InteropServices; + +namespace DeepSpeechClient +{ + /// + /// Wrapper for the native implementation of "libdeepspeech.so" + /// + internal static class NativeImp + { + #region Native Implementation + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static extern IntPtr DS_Version(); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, + ref IntPtr** pint); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern IntPtr DS_ErrorCodeToErrorMessage(int aErrorCode); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx, + uint aBeamWidth); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, + uint aBeamWidth, + ref IntPtr** pint); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, + string aScorerPath); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, + float aAlpha, + float aBeta); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx, + short[] aBuffer, + uint aBufferSize); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] + internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx, + short[] aBuffer, + uint aBufferSize, + uint aNumResults); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void DS_FreeModel(IntPtr** 
aCtx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx, + ref IntPtr** retval); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void DS_FreeStream(IntPtr** aSctx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void DS_FreeString(IntPtr str); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx, + short[] aBuffer, + uint aBufferSize); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx, + uint aNumResults); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx); + + [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx, + uint aNumResults); + #endregion + } +} diff --git a/native_client/dotnet/MozillaVoiceSttClient/Structs/CandidateTranscript.cs b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs similarity index 93% rename from native_client/dotnet/MozillaVoiceSttClient/Structs/CandidateTranscript.cs rename to native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs index 9029d0f5..54581f6f 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Structs/CandidateTranscript.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace MozillaVoiceSttClient.Structs +namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct CandidateTranscript diff --git a/native_client/dotnet/MozillaVoiceSttClient/Structs/Metadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs similarity index 91% rename from native_client/dotnet/MozillaVoiceSttClient/Structs/Metadata.cs rename to native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs index a354759a..0a9beddc 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Structs/Metadata.cs +++ b/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace MozillaVoiceSttClient.Structs +namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct Metadata diff --git a/native_client/dotnet/MozillaVoiceSttClient/Structs/TokenMetadata.cs b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs similarity index 93% rename from native_client/dotnet/MozillaVoiceSttClient/Structs/TokenMetadata.cs rename to native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs index 1f54e5d4..1c660c71 100644 --- a/native_client/dotnet/MozillaVoiceSttClient/Structs/TokenMetadata.cs +++ 
b/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace MozillaVoiceSttClient.Structs +namespace DeepSpeechClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct TokenMetadata diff --git a/native_client/dotnet/MozillaVoiceSttConsole/App.config b/native_client/dotnet/DeepSpeechConsole/App.config similarity index 100% rename from native_client/dotnet/MozillaVoiceSttConsole/App.config rename to native_client/dotnet/DeepSpeechConsole/App.config diff --git a/native_client/dotnet/MozillaVoiceSttConsole/MozillaVoiceSttConsole.csproj b/native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj similarity index 92% rename from native_client/dotnet/MozillaVoiceSttConsole/MozillaVoiceSttConsole.csproj rename to native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj index 13a8b355..a05fca61 100644 --- a/native_client/dotnet/MozillaVoiceSttConsole/MozillaVoiceSttConsole.csproj +++ b/native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj @@ -6,8 +6,8 @@ AnyCPU {312965E5-C4F6-4D95-BA64-79906B8BC7AC} Exe - MozillaVoiceSttConsole - MozillaVoiceSttConsole + DeepSpeechConsole + DeepSpeechConsole v4.6.2 512 true @@ -56,9 +56,9 @@ - + {56DE4091-BBBE-47E4-852D-7268B33B971F} - MozillaVoiceSttClient + DeepSpeechClient diff --git a/native_client/dotnet/MozillaVoiceSttConsole/Program.cs b/native_client/dotnet/DeepSpeechConsole/Program.cs similarity index 94% rename from native_client/dotnet/MozillaVoiceSttConsole/Program.cs rename to native_client/dotnet/DeepSpeechConsole/Program.cs index f94f5de1..68f3fc54 100644 --- a/native_client/dotnet/MozillaVoiceSttConsole/Program.cs +++ b/native_client/dotnet/DeepSpeechConsole/Program.cs @@ -1,6 +1,6 @@ -using MozillaVoiceSttClient; -using MozillaVoiceSttClient.Interfaces; -using MozillaVoiceSttClient.Models; +using DeepSpeechClient; +using DeepSpeechClient.Interfaces; +using DeepSpeechClient.Models; using NAudio.Wave; using System; using System.Collections.Generic; @@ -52,7 +52,7 @@ namespace CSharpExamples Console.WriteLine("Loading model..."); stopwatch.Start(); // sphinx-doc: csharp_ref_model_start - using (IMozillaVoiceSttModel sttClient = new MozillaVoiceSttModel(model ?? "output_graph.pbmm")) + using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm")) { // sphinx-doc: csharp_ref_model_stop stopwatch.Stop(); diff --git a/native_client/dotnet/MozillaVoiceSttConsole/Properties/AssemblyInfo.cs b/native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs similarity index 96% rename from native_client/dotnet/MozillaVoiceSttConsole/Properties/AssemblyInfo.cs rename to native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs index f3257c64..845851a1 100644 --- a/native_client/dotnet/MozillaVoiceSttConsole/Properties/AssemblyInfo.cs +++ b/native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs @@ -5,7 +5,7 @@ using System.Runtime.InteropServices; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. 
-[assembly: AssemblyTitle("MozillaVoiceSttConsole")] +[assembly: AssemblyTitle("DeepSpeechConsole")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] [assembly: AssemblyCompany("")] diff --git a/native_client/dotnet/MozillaVoiceSttConsole/arctic_a0024.wav b/native_client/dotnet/DeepSpeechConsole/arctic_a0024.wav similarity index 100% rename from native_client/dotnet/MozillaVoiceSttConsole/arctic_a0024.wav rename to native_client/dotnet/DeepSpeechConsole/arctic_a0024.wav diff --git a/native_client/dotnet/MozillaVoiceSttConsole/packages.config b/native_client/dotnet/DeepSpeechConsole/packages.config similarity index 100% rename from native_client/dotnet/MozillaVoiceSttConsole/packages.config rename to native_client/dotnet/DeepSpeechConsole/packages.config diff --git a/native_client/dotnet/MozillaVoiceSttWPF/.gitignore b/native_client/dotnet/DeepSpeechWPF/.gitignore similarity index 100% rename from native_client/dotnet/MozillaVoiceSttWPF/.gitignore rename to native_client/dotnet/DeepSpeechWPF/.gitignore diff --git a/native_client/dotnet/MozillaVoiceSttWPF/App.config b/native_client/dotnet/DeepSpeechWPF/App.config similarity index 100% rename from native_client/dotnet/MozillaVoiceSttWPF/App.config rename to native_client/dotnet/DeepSpeechWPF/App.config diff --git a/native_client/dotnet/MozillaVoiceSttWPF/App.xaml b/native_client/dotnet/DeepSpeechWPF/App.xaml similarity index 71% rename from native_client/dotnet/MozillaVoiceSttWPF/App.xaml rename to native_client/dotnet/DeepSpeechWPF/App.xaml index ca6a0f13..16ebb0d4 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/App.xaml +++ b/native_client/dotnet/DeepSpeechWPF/App.xaml @@ -1,8 +1,8 @@  diff --git a/native_client/dotnet/MozillaVoiceSttWPF/App.xaml.cs b/native_client/dotnet/DeepSpeechWPF/App.xaml.cs similarity index 58% rename from native_client/dotnet/MozillaVoiceSttWPF/App.xaml.cs rename to native_client/dotnet/DeepSpeechWPF/App.xaml.cs index 6404f50b..d4b87d6e 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/App.xaml.cs +++ b/native_client/dotnet/DeepSpeechWPF/App.xaml.cs @@ -1,10 +1,10 @@ using CommonServiceLocator; -using MozillaVoiceStt.WPF.ViewModels; -using MozillaVoiceSttClient.Interfaces; +using DeepSpeech.WPF.ViewModels; +using DeepSpeechClient.Interfaces; using GalaSoft.MvvmLight.Ioc; using System.Windows; -namespace MozillaVoiceSttWPF +namespace DeepSpeechWPF { /// /// Interaction logic for App.xaml @@ -18,11 +18,11 @@ namespace MozillaVoiceSttWPF try { - //Register instance of Mozilla Voice STT - MozillaVoiceSttClient.MozillaVoiceSttModel client = - new MozillaVoiceSttClient.MozillaVoiceSttModel("deepspeech-0.8.0-models.pbmm"); + //Register instance of DeepSpeech + DeepSpeechClient.DeepSpeech deepSpeechClient = + new DeepSpeechClient.DeepSpeech("deepspeech-0.8.0-models.pbmm"); - SimpleIoc.Default.Register(() => client); + SimpleIoc.Default.Register(() => deepSpeechClient); SimpleIoc.Default.Register(); } catch (System.Exception ex) @@ -35,8 +35,8 @@ namespace MozillaVoiceSttWPF protected override void OnExit(ExitEventArgs e) { base.OnExit(e); - //Dispose instance of Mozilla Voice STT - ServiceLocator.Current.GetInstance()?.Dispose(); + //Dispose instance of DeepSpeech + ServiceLocator.Current.GetInstance()?.Dispose(); } } } diff --git a/native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.csproj b/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj similarity index 94% rename from native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.csproj rename to 
native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj index d14a02b7..7f46a31e 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.csproj +++ b/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj @@ -6,8 +6,8 @@ AnyCPU {54BFD766-4305-4F4C-BA59-AF45505DF3C1} WinExe - MozillaVoiceStt.WPF - MozillaVoiceStt.WPF + DeepSpeech.WPF + DeepSpeech.WPF v4.6.2 512 {60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} @@ -131,9 +131,9 @@ - + {56de4091-bbbe-47e4-852d-7268b33b971f} - MozillaVoiceSttClient + DeepSpeechClient diff --git a/native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.sln b/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln similarity index 79% rename from native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.sln rename to native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln index 003c6d8e..cd29025e 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/MozillaVoiceStt.WPF.sln +++ b/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln @@ -3,9 +3,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.28307.421 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceStt.WPF", "MozillaVoiceStt.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MozillaVoiceSttClient", "..\MozillaVoiceSttClient\MozillaVoiceSttClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/native_client/dotnet/MozillaVoiceSttWPF/MainWindow.xaml b/native_client/dotnet/DeepSpeechWPF/MainWindow.xaml similarity index 97% rename from native_client/dotnet/MozillaVoiceSttWPF/MainWindow.xaml rename to native_client/dotnet/DeepSpeechWPF/MainWindow.xaml index 5894fae3..4fbe5e72 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/MainWindow.xaml +++ b/native_client/dotnet/DeepSpeechWPF/MainWindow.xaml @@ -1,10 +1,10 @@  /// Interaction logic for MainWindow.xaml diff --git a/native_client/dotnet/MozillaVoiceSttWPF/Properties/AssemblyInfo.cs b/native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs similarity index 95% rename from native_client/dotnet/MozillaVoiceSttWPF/Properties/AssemblyInfo.cs rename to native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs index 034ac3d6..f9ae7d76 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/Properties/AssemblyInfo.cs +++ b/native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs @@ -7,11 +7,11 @@ using System.Windows; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. 
-[assembly: AssemblyTitle("MozillaVoiceStt.WPF")] +[assembly: AssemblyTitle("DeepSpeech.WPF")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] [assembly: AssemblyCompany("")] -[assembly: AssemblyProduct("MozillaVoiceStt.WPF.SingleFiles")] +[assembly: AssemblyProduct("DeepSpeech.WPF.SingleFiles")] [assembly: AssemblyCopyright("Copyright © 2018")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] diff --git a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Resources.Designer.cs b/native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs similarity index 94% rename from native_client/dotnet/MozillaVoiceSttWPF/Properties/Resources.Designer.cs rename to native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs index b470f9ae..2da2b4b2 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Resources.Designer.cs +++ b/native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs @@ -8,7 +8,7 @@ // //------------------------------------------------------------------------------ -namespace MozillaVoiceStt.WPF.Properties { +namespace DeepSpeech.WPF.Properties { using System; @@ -39,7 +39,7 @@ namespace MozillaVoiceStt.WPF.Properties { internal static global::System.Resources.ResourceManager ResourceManager { get { if (object.ReferenceEquals(resourceMan, null)) { - global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("MozillaVoiceStt.WPF.Properties.Resources", typeof(Resources).Assembly); + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("DeepSpeech.WPF.Properties.Resources", typeof(Resources).Assembly); resourceMan = temp; } return resourceMan; diff --git a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Resources.resx b/native_client/dotnet/DeepSpeechWPF/Properties/Resources.resx similarity index 100% rename from native_client/dotnet/MozillaVoiceSttWPF/Properties/Resources.resx rename to native_client/dotnet/DeepSpeechWPF/Properties/Resources.resx diff --git a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Settings.Designer.cs b/native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs similarity index 96% rename from native_client/dotnet/MozillaVoiceSttWPF/Properties/Settings.Designer.cs rename to native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs index a7218694..0f464bc4 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Settings.Designer.cs +++ b/native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs @@ -8,7 +8,7 @@ // //------------------------------------------------------------------------------ -namespace MozillaVoiceStt.WPF.Properties { +namespace DeepSpeech.WPF.Properties { [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] diff --git a/native_client/dotnet/MozillaVoiceSttWPF/Properties/Settings.settings b/native_client/dotnet/DeepSpeechWPF/Properties/Settings.settings similarity index 100% rename from native_client/dotnet/MozillaVoiceSttWPF/Properties/Settings.settings rename to native_client/dotnet/DeepSpeechWPF/Properties/Settings.settings diff --git a/native_client/dotnet/MozillaVoiceSttWPF/ViewModels/BindableBase.cs b/native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs similarity index 98% rename from native_client/dotnet/MozillaVoiceSttWPF/ViewModels/BindableBase.cs rename to native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs index 92fd2f57..909327ee 100644 --- 
a/native_client/dotnet/MozillaVoiceSttWPF/ViewModels/BindableBase.cs +++ b/native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs @@ -3,7 +3,7 @@ using System.Collections.Generic; using System.ComponentModel; using System.Runtime.CompilerServices; -namespace MozillaVoiceStt.WPF.ViewModels +namespace DeepSpeech.WPF.ViewModels { /// /// Implementation of to simplify models. diff --git a/native_client/dotnet/MozillaVoiceSttWPF/ViewModels/MainWindowViewModel.cs b/native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs similarity index 96% rename from native_client/dotnet/MozillaVoiceSttWPF/ViewModels/MainWindowViewModel.cs rename to native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs index 0d81c2f0..230fd42a 100644 --- a/native_client/dotnet/MozillaVoiceSttWPF/ViewModels/MainWindowViewModel.cs +++ b/native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs @@ -3,8 +3,8 @@ using CSCore; using CSCore.CoreAudioAPI; using CSCore.SoundIn; using CSCore.Streams; -using MozillaVoiceSttClient.Interfaces; -using MozillaVoiceSttClient.Models; +using DeepSpeechClient.Interfaces; +using DeepSpeechClient.Models; using GalaSoft.MvvmLight.CommandWpf; using Microsoft.Win32; using System; @@ -15,7 +15,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; -namespace MozillaVoiceStt.WPF.ViewModels +namespace DeepSpeech.WPF.ViewModels { /// /// View model of the MainWindow View. @@ -27,7 +27,7 @@ namespace MozillaVoiceStt.WPF.ViewModels private const string ScorerPath = "kenlm.scorer"; #endregion - private readonly IMozillaVoiceSttModel _sttClient; + private readonly IDeepSpeech _sttClient; #region Commands /// @@ -62,7 +62,7 @@ namespace MozillaVoiceStt.WPF.ViewModels /// /// Stream used to feed data into the acoustic model. /// - private MozillaVoiceSttStream _sttStream; + private DeepSpeechStream _sttStream; /// /// Records the audio of the selected device. @@ -75,7 +75,7 @@ namespace MozillaVoiceStt.WPF.ViewModels private SoundInSource _soundInSource; /// - /// Target wave source.(16KHz Mono 16bit for Mozilla Voice STT) + /// Target wave source.(16KHz Mono 16bit for DeepSpeech) /// private IWaveSource _convertedSource; @@ -200,7 +200,7 @@ namespace MozillaVoiceStt.WPF.ViewModels #endregion #region Ctors - public MainWindowViewModel(IMozillaVoiceSttModel sttClient) + public MainWindowViewModel(IDeepSpeech sttClient) { _sttClient = sttClient; @@ -290,8 +290,7 @@ namespace MozillaVoiceStt.WPF.ViewModels //read data from the converedSource //important: don't use the e.Data here //the e.Data contains the raw data provided by the - //soundInSource which won't have the Mozilla Voice STT required - // audio format + //soundInSource which won't have the deepspeech required audio format byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2]; int read; diff --git a/native_client/dotnet/MozillaVoiceSttWPF/packages.config b/native_client/dotnet/DeepSpeechWPF/packages.config similarity index 100% rename from native_client/dotnet/MozillaVoiceSttWPF/packages.config rename to native_client/dotnet/DeepSpeechWPF/packages.config diff --git a/native_client/dotnet/MozillaVoiceSttClient/Enums/ErrorCodes.cs b/native_client/dotnet/MozillaVoiceSttClient/Enums/ErrorCodes.cs deleted file mode 100644 index aa816f8d..00000000 --- a/native_client/dotnet/MozillaVoiceSttClient/Enums/ErrorCodes.cs +++ /dev/null @@ -1,29 +0,0 @@ -namespace MozillaVoiceSttClient.Enums -{ - /// - /// Error codes from the native Mozilla Voice STT binary. 
- /// - internal enum ErrorCodes - { - STT_ERR_OK = 0x0000, - STT_ERR_NO_MODEL = 0x1000, - STT_ERR_INVALID_ALPHABET = 0x2000, - STT_ERR_INVALID_SHAPE = 0x2001, - STT_ERR_INVALID_SCORER = 0x2002, - STT_ERR_MODEL_INCOMPATIBLE = 0x2003, - STT_ERR_SCORER_NOT_ENABLED = 0x2004, - STT_ERR_SCORER_UNREADABLE = 0x2005, - STT_ERR_SCORER_INVALID_LM = 0x2006, - STT_ERR_SCORER_NO_TRIE = 0x2007, - STT_ERR_SCORER_INVALID_TRIE = 0x2008, - STT_ERR_SCORER_VERSION_MISMATCH = 0x2009, - STT_ERR_FAIL_INIT_MMAP = 0x3000, - STT_ERR_FAIL_INIT_SESS = 0x3001, - STT_ERR_FAIL_INTERPRETER = 0x3002, - STT_ERR_FAIL_RUN_SESS = 0x3003, - STT_ERR_FAIL_CREATE_STREAM = 0x3004, - STT_ERR_FAIL_READ_PROTOBUF = 0x3005, - STT_ERR_FAIL_CREATE_SESS = 0x3006, - STT_ERR_FAIL_CREATE_MODEL = 0x3007, - } -} diff --git a/native_client/dotnet/MozillaVoiceSttClient/NativeImp.cs b/native_client/dotnet/MozillaVoiceSttClient/NativeImp.cs deleted file mode 100644 index daad79ac..00000000 --- a/native_client/dotnet/MozillaVoiceSttClient/NativeImp.cs +++ /dev/null @@ -1,102 +0,0 @@ -using MozillaVoiceSttClient.Enums; - -using System; -using System.Runtime.InteropServices; - -namespace MozillaVoiceSttClient -{ - /// - /// Wrapper for the native implementation of "libmozilla_voice_stt.so" - /// - internal static class NativeImp - { - #region Native Implementation - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static extern IntPtr STT_Version(); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern ErrorCodes STT_CreateModel(string aModelPath, - ref IntPtr** pint); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern IntPtr STT_ErrorCodeToErrorMessage(int aErrorCode); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern uint STT_GetModelBeamWidth(IntPtr** aCtx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern ErrorCodes STT_SetModelBeamWidth(IntPtr** aCtx, - uint aBeamWidth); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern ErrorCodes STT_CreateModel(string aModelPath, - uint aBeamWidth, - ref IntPtr** pint); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern int STT_GetModelSampleRate(IntPtr** aCtx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes STT_EnableExternalScorer(IntPtr** aCtx, - string aScorerPath); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes STT_DisableExternalScorer(IntPtr** aCtx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes STT_SetScorerAlphaBeta(IntPtr** aCtx, - float aAlpha, - float aBeta); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern IntPtr STT_SpeechToText(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] - internal static unsafe extern IntPtr 
STT_SpeechToTextWithMetadata(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize, - uint aNumResults); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void STT_FreeModel(IntPtr** aCtx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes STT_CreateStream(IntPtr** aCtx, - ref IntPtr** retval); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void STT_FreeStream(IntPtr** aSctx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void STT_FreeMetadata(IntPtr metadata); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void STT_FreeString(IntPtr str); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern void STT_FeedAudioContent(IntPtr** aSctx, - short[] aBuffer, - uint aBufferSize); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr STT_IntermediateDecode(IntPtr** aSctx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr STT_IntermediateDecodeWithMetadata(IntPtr** aSctx, - uint aNumResults); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern IntPtr STT_FinishStream(IntPtr** aSctx); - - [DllImport("libmozilla_voice_stt.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr STT_FinishStreamWithMetadata(IntPtr** aSctx, - uint aNumResults); - #endregion - } -} diff --git a/native_client/dotnet/README.rst b/native_client/dotnet/README.rst index 5db9eb28..b1025573 100644 --- a/native_client/dotnet/README.rst +++ b/native_client/dotnet/README.rst @@ -1,8 +1,8 @@ -Building Mozilla Voice STT native client for Windows +Building DeepSpeech native client for Windows ============================================= -Now we can build the native client of Mozilla Voice STT and run inference on Windows using the C# client, to do that we need to compile the ``native_client``. +Now we can build the native client of DeepSpeech and run inference on Windows using the C# client, to do that we need to compile the ``native_client``. **Table of Contents** @@ -42,11 +42,11 @@ We highly recommend sticking to the recommended versions of CUDA/cuDNN in order Getting the code ---------------- -We need to clone ``mozilla/STT``. +We need to clone ``mozilla/DeepSpeech``. .. code-block:: bash - git clone https://github.com/mozilla/STT + git clone https://github.com/mozilla/DeepSpeech git submodule sync tensorflow/ git submodule update --init tensorflow/ @@ -59,8 +59,8 @@ There should already be a symbolic link, for this example let's suppose that we . ├── D:\ - │ ├── cloned # Contains Mozilla Voice STT and tensorflow side by side - │ │ └── DeepSpeech # Root of the cloned Mozilla Voice STT + │ ├── cloned # Contains DeepSpeech and tensorflow side by side + │ │ └── DeepSpeech # Root of the cloned DeepSpeech │ │ ├── tensorflow # Root of the cloned Mozilla's tensorflow └── ... 
@@ -126,7 +126,7 @@ We will add AVX/AVX2 support in the command, please make sure that your CPU supp .. code-block:: bash - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" -c opt --copt=/arch:AVX --copt=/arch:AVX2 //native_client:libmozilla_voice_stt.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" -c opt --copt=/arch:AVX --copt=/arch:AVX2 //native_client:libdeepspeech.so GPU with CUDA ~~~~~~~~~~~~~ @@ -135,11 +135,11 @@ If you enabled CUDA in `configure.py `_ in your Mozilla Voice STT directory and open the Visual Studio solution, then we need to build in debug or release mode, finally we just need to copy ``libmozilla_voice_stt.so`` to the generated ``x64/Debug`` or ``x64/Release`` directory. +As for now we can only use the generated ``libdeepspeech.so`` with the C# clients, go to `native_client/dotnet/ `_ in your DeepSpeech directory and open the Visual Studio solution, then we need to build in debug or release mode, finally we just need to copy ``libdeepspeech.so`` to the generated ``x64/Debug`` or ``x64/Release`` directory. diff --git a/native_client/dotnet/nupkg/deepspeech.nuspec.in b/native_client/dotnet/nupkg/deepspeech.nuspec.in index 51925787..a4797177 100644 --- a/native_client/dotnet/nupkg/deepspeech.nuspec.in +++ b/native_client/dotnet/nupkg/deepspeech.nuspec.in @@ -3,13 +3,13 @@ $NUPKG_ID $NUPKG_VERSION - Mozilla.Voice.STT + DeepSpeech Mozilla Mozilla MPL-2.0 - http://github.com/mozilla/STT + http://github.com/mozilla/DeepSpeech false - A library for running inference with a Mozilla Voice STT model + A library for running inference with a DeepSpeech model Copyright (c) 2019 Mozilla Corporation native speech speech_recognition diff --git a/native_client/generate_scorer_package.cpp b/native_client/generate_scorer_package.cpp index 63d294bf..4486b42c 100644 --- a/native_client/generate_scorer_package.cpp +++ b/native_client/generate_scorer_package.cpp @@ -11,7 +11,7 @@ using namespace std; #include "ctcdecode/decoder_utils.h" #include "ctcdecode/scorer.h" #include "alphabet.h" -#include "mozilla_voice_stt.h" +#include "deepspeech.h" namespace po = boost::program_options; @@ -66,9 +66,9 @@ create_package(absl::optional alphabet_path, scorer.set_utf8_mode(force_utf8.value()); scorer.reset_params(default_alpha, default_beta); int err = scorer.load_lm(lm_path); - if (err != STT_ERR_SCORER_NO_TRIE) { + if (err != DS_ERR_SCORER_NO_TRIE) { cerr << "Error loading language model file: " - << STT_ErrorCodeToErrorMessage(err) << "\n"; + << DS_ErrorCodeToErrorMessage(err) << "\n"; return 1; } scorer.fill_dictionary(words); @@ -102,7 +102,7 @@ main(int argc, char** argv) ("package", po::value(), "Path to save scorer package.") ("default_alpha", po::value(), "Default value of alpha hyperparameter (float).") ("default_beta", po::value(), "Default value of beta hyperparameter (float).") - ("force_utf8", po::value(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See for further explanation.") + ("force_utf8", po::value(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. 
See for further explanation.") ; po::variables_map vm; diff --git a/native_client/java/Makefile b/native_client/java/Makefile index 22694841..191b1013 100644 --- a/native_client/java/Makefile +++ b/native_client/java/Makefile @@ -2,7 +2,7 @@ include ../definitions.mk -ARCHS := $(shell grep 'ABI_FILTERS' libmozillavoicestt/gradle.properties | cut -d'=' -f2 | sed -e 's/;/ /g') +ARCHS := $(shell grep 'ABI_FILTERS' libdeepspeech/gradle.properties | cut -d'=' -f2 | sed -e 's/;/ /g') GRADLE ?= ./gradlew all: apk @@ -14,13 +14,13 @@ apk-clean: $(GRADLE) clean libs-clean: - rm -fr libmozillavoicestt/libs/*/libmozilla_voice_stt.so + rm -fr libdeepspeech/libs/*/libdeepspeech.so -libmozillavoicestt/libs/%/libmozilla_voice_stt.so: - -mkdir libmozillavoicestt/libs/$*/ - cp ${TFDIR}/bazel-out/$*-*/bin/native_client/libmozilla_voice_stt.so libmozillavoicestt/libs/$*/ +libdeepspeech/libs/%/libdeepspeech.so: + -mkdir libdeepspeech/libs/$*/ + cp ${TFDIR}/bazel-out/$*-*/bin/native_client/libdeepspeech.so libdeepspeech/libs/$*/ -apk: apk-clean bindings $(patsubst %,libmozillavoicestt/libs/%/libmozilla_voice_stt.so,$(ARCHS)) +apk: apk-clean bindings $(patsubst %,libdeepspeech/libs/%/libdeepspeech.so,$(ARCHS)) $(GRADLE) build maven-bundle: apk @@ -28,4 +28,4 @@ maven-bundle: apk $(GRADLE) zipMavenArtifacts bindings: clean ds-swig - $(DS_SWIG_ENV) swig -c++ -java -package org.mozilla.voice.stt -outdir libmozillavoicestt/src/main/java/org/mozilla/voice/stt/ -o jni/deepspeech_wrap.cpp jni/deepspeech.i + $(DS_SWIG_ENV) swig -c++ -java -package org.mozilla.deepspeech.libdeepspeech -outdir libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech/ -o jni/deepspeech_wrap.cpp jni/deepspeech.i diff --git a/native_client/java/README.md b/native_client/java/README.md index 8554ff50..89ebc594 100644 --- a/native_client/java/README.md +++ b/native_client/java/README.md @@ -1 +1 @@ -Full project description and documentation on GitHub: [https://github.com/mozilla/STT](https://github.com/mozilla/STT). +Full project description and documentation on GitHub: [https://github.com/mozilla/DeepSpeech](https://github.com/mozilla/DeepSpeech). 
diff --git a/native_client/java/app/build.gradle b/native_client/java/app/build.gradle index abf1fd62..c1aed496 100644 --- a/native_client/java/app/build.gradle +++ b/native_client/java/app/build.gradle @@ -4,7 +4,7 @@ android { compileSdkVersion 27 defaultConfig { - applicationId "org.mozilla.voice.sttapp" + applicationId "org.mozilla.deepspeech" minSdkVersion 21 targetSdkVersion 27 versionName androidGitVersion.name() @@ -28,7 +28,7 @@ android { dependencies { implementation fileTree(dir: 'libs', include: ['*.jar']) - implementation project(':libmozillavoicestt') + implementation project(':libdeepspeech') implementation 'com.android.support:appcompat-v7:27.1.1' implementation 'com.android.support.constraint:constraint-layout:1.1.3' testImplementation 'junit:junit:4.12' diff --git a/native_client/java/app/src/androidTest/java/org/mozilla/voice/sttapp/ExampleInstrumentedTest.java b/native_client/java/app/src/androidTest/java/org/mozilla/deepspeech/ExampleInstrumentedTest.java similarity index 84% rename from native_client/java/app/src/androidTest/java/org/mozilla/voice/sttapp/ExampleInstrumentedTest.java rename to native_client/java/app/src/androidTest/java/org/mozilla/deepspeech/ExampleInstrumentedTest.java index 01ddafb9..6c3e7f91 100644 --- a/native_client/java/app/src/androidTest/java/org/mozilla/voice/sttapp/ExampleInstrumentedTest.java +++ b/native_client/java/app/src/androidTest/java/org/mozilla/deepspeech/ExampleInstrumentedTest.java @@ -1,4 +1,4 @@ -package org.mozilla.voice.sttapp; +package org.mozilla.deepspeech; import android.content.Context; import android.support.test.InstrumentationRegistry; @@ -21,6 +21,6 @@ public class ExampleInstrumentedTest { // Context of the app under test. Context appContext = InstrumentationRegistry.getTargetContext(); - assertEquals("org.mozilla.voice.sttapp", appContext.getPackageName()); + assertEquals("org.mozilla.deepspeech", appContext.getPackageName()); } } diff --git a/native_client/java/app/src/main/AndroidManifest.xml b/native_client/java/app/src/main/AndroidManifest.xml index 1ef6e3a2..0702cc10 100644 --- a/native_client/java/app/src/main/AndroidManifest.xml +++ b/native_client/java/app/src/main/AndroidManifest.xml @@ -1,6 +1,6 @@ + package="org.mozilla.deepspeech"> - + diff --git a/native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java similarity index 95% rename from native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java rename to native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java index 7f24e9f6..d82de3a1 100644 --- a/native_client/java/app/src/main/java/org/mozilla/voice/sttapp/MozillaVoiceSttActivity.java +++ b/native_client/java/app/src/main/java/org/mozilla/deepspeech/DeepSpeechActivity.java @@ -1,4 +1,4 @@ -package org.mozilla.voice.sttapp; +package org.mozilla.deepspeech; import android.support.v7.app.AppCompatActivity; import android.os.Bundle; @@ -16,11 +16,11 @@ import java.io.IOException; import java.nio.ByteOrder; import java.nio.ByteBuffer; -import org.mozilla.voice.stt.MozillaVoiceSttModel; +import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel; -public class MozillaVoiceSttActivity extends AppCompatActivity { +public class DeepSpeechActivity extends AppCompatActivity { - MozillaVoiceSttModel _m = null; + DeepSpeechModel _m = null; EditText _tfliteModel; EditText _audioFile; @@ -50,7 +50,7 @@ public class MozillaVoiceSttActivity 
extends AppCompatActivity { this._tfliteStatus.setText("Creating model"); if (this._m == null) { // sphinx-doc: java_ref_model_start - this._m = new MozillaVoiceSttModel(tfliteModel); + this._m = new DeepSpeechModel(tfliteModel); this._m.setBeamWidth(BEAM_WIDTH); // sphinx-doc: java_ref_model_stop } diff --git a/native_client/java/app/src/main/res/layout/activity_deep_speech.xml b/native_client/java/app/src/main/res/layout/activity_deep_speech.xml index ffbee619..02c383d4 100644 --- a/native_client/java/app/src/main/res/layout/activity_deep_speech.xml +++ b/native_client/java/app/src/main/res/layout/activity_deep_speech.xml @@ -4,7 +4,7 @@ xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent" android:layout_height="match_parent" - tools:context=".MozillaVoiceSttActivity"> + tools:context=".DeepSpeechActivity">