diff --git a/.gitmodules b/.gitmodules index 3ec0cd1b..fdbe1f3c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,10 +1,10 @@ [submodule "doc/examples"] path = doc/examples - url = https://github.com/mozilla/DeepSpeech-examples.git + url = https://github.com/coqui-ai/STT-examples.git branch = master [submodule "tensorflow"] path = tensorflow - url = https://github.com/mozilla/tensorflow.git + url = https://github.com/coqui-ai/tensorflow.git [submodule "kenlm"] path = kenlm url = https://github.com/kpu/kenlm diff --git a/.taskcluster.yml b/.taskcluster.yml.disabled similarity index 100% rename from .taskcluster.yml rename to .taskcluster.yml.disabled diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 1d392a66..f675f38b 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -1,5 +1,4 @@ -This file contains a list of papers in chronological order that have been published -using DeepSpeech. +This file contains a list of papers in chronological order that have been published using 🐸STT. To appear ========== diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 498baa3f..bdb48cd1 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,15 +1,132 @@ -# Community Participation Guidelines +# Contributor Covenant Code of Conduct -This repository is governed by Mozilla's code of conduct and etiquette guidelines. -For more details, please read the -[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). +## Our Pledge -## How to Report -For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 
+We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual identity +and orientation. - +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement by emailing +[coc-report@coqui.ai](mailto:coc-report@coqui.ai). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available +at [https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/CODE_OWNERS.rst b/CODE_OWNERS.rst index 0ae6659a..92150211 100644 --- a/CODE_OWNERS.rst +++ b/CODE_OWNERS.rst @@ -1,7 +1,7 @@ -DeepSpeech code owners / governance system -========================================== +Coqui STT code owners / governance system +========================================= -DeepSpeech is run under a governance system inspired (and partially copied from) by the `Mozilla module ownership system `_. 
The project is roughly divided into modules, and each module has its own owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project. +🐸STT is run under a governance system inspired (and partially copied from) by the `Mozilla module ownership system `_. The project is roughly divided into modules, and each module has its own owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project. Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their own owners. @@ -46,7 +46,7 @@ Testing & CI Native inference client ----------------------- -Everything that goes into libdeepspeech.so and is not specifically covered in another area fits here. +Everything that goes into libstt.so and is not specifically covered in another area fits here. - Alexandre Lissy (@lissyx) - Reuben Morais (@reuben) @@ -110,7 +110,7 @@ Documentation - Alexandre Lissy (@lissyx) - Reuben Morais (@reuben) -Third party bindings --------------------- - -Hosted externally and owned by the individual authors. See the `list of third-party bindings `_ for more info. +.. Third party bindings + -------------------- + + Hosted externally and owned by the individual authors. See the `list of third-party bindings `_ for more info. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index c7970a34..3e8b7ebf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -1,37 +1,32 @@ Contribution guidelines ======================= -Welcome to the DeepSpeech project! We are excited to see your interest, and appreciate your support! 
+Welcome to the 🐸STT project! We are excited to see your interest, and appreciate your support! -This repository is governed by Mozilla's code of conduct and etiquette guidelines. For more details, please read the `Mozilla Community Participation Guidelines `_. +This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the `CODE_OF_CONDUCT.md `_. How to Make a Good Pull Request ------------------------------- -Here's some guidelines on how to make a good PR to DeepSpeech. +Here are some guidelines on how to make a good PR to 🐸STT. Bug-fix PR ^^^^^^^^^^ You've found a bug and you were able to squash it! Great job! Please write a short but clear commit message describing the bug, and how you fixed it. This makes review much easier. Also, please name your branch something related to the bug-fix. -Documentation PR -^^^^^^^^^^^^^^^^ - -If you're just making updates or changes to the documentation, there's no need to run all of DeepSpeech's tests for Continuous Integration (i.e. Taskcluster tests). In this case, at the end of your short but clear commit message, you should add **X-DeepSpeech: NOBUILD**. This will trigger the CI tests to skip your PR, saving both time and compute. - New Feature PR ^^^^^^^^^^^^^^ -You've made some core changes to DeepSpeech, and you would like to share them back with the community -- great! First things first: if you're planning to add a feature (not just fix a bug or docs) let the DeepSpeech team know ahead of time and get some feedback early. A quick check-in with the team can save time during code-review, and also ensure that your new feature fits into the project. +You've made some core changes to 🐸STT, and you would like to share them back with the community -- great! First things first: if you're planning to add a feature (not just fix a bug or docs) let the 🐸STT team know ahead of time and get some feedback early.
A quick check-in with the team can save time during code-review, and also ensure that your new feature fits into the project. -The DeepSpeech codebase is made of many connected parts. There is Python code for training DeepSpeech, core C++ code for running inference on trained models, and multiple language bindings to the C++ core so you can use DeepSpeech in your favorite language. +The 🐸STT codebase is made of many connected parts. There is Python code for training 🐸STT, core C++ code for running inference on trained models, and multiple language bindings to the C++ core so you can use 🐸STT in your favorite language. -Whenever you add a new feature to DeepSpeech and what to contribute that feature back to the project, here are some things to keep in mind: +Whenever you add a new feature to 🐸STT and want to contribute that feature back to the project, here are some things to keep in mind: -1. You've made changes to the core C++ code.
Core changes can have downstream effects on all parts of the 🐸STT project, so keep that in mind. You should minimally also make necessary changes to the C client (i.e. **args.h** and **client.cc**). The bindings for Python, Java, and Javascript are SWIG generated, and in the best-case scenario you won't have to worry about them. However, if you've added a whole new feature, you may need to make custom tweaks to those bindings, because SWIG may not automagically work with your new feature, especially if you've exposed new arguments. The bindings for .NET and Swift are not generated automatically. It would be best if you also made the necessary manual changes to these bindings as well. It is best to communicate with the core 🐸STT team and come to an understanding of where you will likely need to work with the bindings. They can't predict all the bugs you will run into, but they will have a good idea of how to plan for some obvious challenges. 2. You've made changes to the Python code. Make sure you run a linter (described below). -3. Make sure your new feature doesn't regress the project. If you've added a significant feature or amount of code, you want to be sure your new feature doesn't create performance issues. For example, if you've made a change to the DeepSpeech decoder, you should know that inference performance doesn't drop in terms of latency, accuracy, or memory usage. Unless you're proposing a new decoding algorithm, you probably don't have to worry about affecting accuracy. However, it's very possible you've affected latency or memory usage. You should run local performance tests to make sure no bugs have crept in. There are lots of tools to check latency and memory usage, and you should use what is most comfortable for you and gets the job done. If you're on Linux, you might find [[perf](https://perf.wiki.kernel.org/index.php/Main_Page)] to be a useful tool. You can use sample WAV files for testing which are provided in the `DeepSpeech/data/` directory. 
+3. Make sure your new feature doesn't regress the project. If you've added a significant feature or amount of code, you want to be sure your new feature doesn't create performance issues. For example, if you've made a change to the 🐸STT decoder, you should know that inference performance doesn't drop in terms of latency, accuracy, or memory usage. Unless you're proposing a new decoding algorithm, you probably don't have to worry about affecting accuracy. However, it's very possible you've affected latency or memory usage. You should run local performance tests to make sure no bugs have crept in. There are lots of tools to check latency and memory usage, and you should use what is most comfortable for you and gets the job done. If you're on Linux, you might find `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_ to be a useful tool. You can use sample WAV files for testing which are provided in the `STT/data/` directory. Requesting review on your PR ---------------------------- @@ -47,9 +42,9 @@ Before making a Pull Request for Python code changes, check your changes for bas .. code-block:: bash pip install pylint cardboardlint - cardboardlinter --refspec master + cardboardlinter --refspec main -This will compare the code against master and run the linter on all the changes. We plan to introduce more linter checks (e.g. for C++) in the future. To run it automatically as a git pre-commit hook, do the following: +This will compare the code against the main branch and run the linter on all the changes. We plan to introduce more linter checks (e.g. for C++) in the future. To run it automatically as a git pre-commit hook, do the following: ..
code-block:: bash diff --git a/Dockerfile.build.tmpl b/Dockerfile.build.tmpl index e6648102..cb88d80c 100644 --- a/Dockerfile.build.tmpl +++ b/Dockerfile.build.tmpl @@ -3,8 +3,8 @@ # Need devel version cause we need /usr/include/cudnn.h FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# -ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# +ENV STT_REPO=#STT_REPO# +ENV STT_SHA=#STT_SHA# # >> START Install base software @@ -113,15 +113,15 @@ RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ WORKDIR / -RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech -WORKDIR /DeepSpeech -RUN git checkout $DEEPSPEECH_SHA +RUN git clone --recursive $STT_REPO STT +WORKDIR /STT +RUN git checkout $STT_SHA RUN git submodule sync tensorflow/ RUN git submodule update --init tensorflow/ # >> START Build and bind -WORKDIR /DeepSpeech/tensorflow +WORKDIR /STT/tensorflow # Fix for not found script https://github.com/tensorflow/tensorflow/issues/471 RUN ./configure @@ -132,7 +132,7 @@ RUN ./configure # passing LD_LIBRARY_PATH is required cause Bazel doesn't pickup it from environment -# Build DeepSpeech +# Build STT RUN bazel build \ --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" \ --config=monolithic \ @@ -149,22 +149,22 @@ RUN bazel build \ --copt=-msse4.2 \ --copt=-mavx \ --copt=-fvisibility=hidden \ - //native_client:libdeepspeech.so \ + //native_client:libstt.so \ --verbose_failures \ --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -# Copy built libs to /DeepSpeech/native_client -RUN cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ +# Copy built libs to /STT/native_client +RUN cp bazel-bin/native_client/libstt.so /STT/native_client/ # Build client.cc and install Python client and decoder bindings -ENV TFDIR /DeepSpeech/tensorflow +ENV TFDIR /STT/tensorflow RUN nproc -WORKDIR /DeepSpeech/native_client -RUN make NUM_PROCESSES=$(nproc) deepspeech +WORKDIR /STT/native_client +RUN make 
NUM_PROCESSES=$(nproc) stt -WORKDIR /DeepSpeech +WORKDIR /STT RUN cd native_client/python && make NUM_PROCESSES=$(nproc) bindings RUN pip3 install --upgrade native_client/python/dist/*.whl @@ -176,8 +176,8 @@ RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl # Allow Python printing utf-8 ENV PYTHONIOENCODING UTF-8 -# Build KenLM in /DeepSpeech/native_client/kenlm folder -WORKDIR /DeepSpeech/native_client +# Build KenLM in /STT/native_client/kenlm folder +WORKDIR /STT/native_client RUN rm -rf kenlm && \ git clone https://github.com/kpu/kenlm && \ cd kenlm && \ @@ -188,4 +188,4 @@ RUN rm -rf kenlm && \ make -j $(nproc) # Done -WORKDIR /DeepSpeech +WORKDIR /STT diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl index 9baa76d9..d306dcaf 100644 --- a/Dockerfile.train.tmpl +++ b/Dockerfile.train.tmpl @@ -3,8 +3,8 @@ FROM tensorflow/tensorflow:1.15.4-gpu-py3 ENV DEBIAN_FRONTEND=noninteractive -ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# -ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# +ENV STT_REPO=#STT_REPO# +ENV STT_SHA=#STT_SHA# RUN apt-get update && apt-get install -y --no-install-recommends \ apt-utils \ @@ -20,7 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ unzip \ wget -# We need to remove it because it's breaking deepspeech install later with +# We need to remove it because it's breaking STT install later with # weird errors about setuptools RUN apt-get purge -y python3-xdg @@ -31,10 +31,10 @@ RUN apt-get install -y --no-install-recommends libopus0 libsndfile1 RUN rm -rf /var/lib/apt/lists/* WORKDIR / -RUN git clone $DEEPSPEECH_REPO DeepSpeech +RUN git clone $STT_REPO STT -WORKDIR /DeepSpeech -RUN git checkout $DEEPSPEECH_SHA +WORKDIR /STT +RUN git checkout $STT_SHA # Build CTC decoder first, to avoid clashes on incompatible versions upgrades RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings @@ -43,7 +43,7 @@ RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl # Prepare deps RUN pip3 install --upgrade 
pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 -# Install DeepSpeech +# Install STT # - No need for the decoder since we did it earlier # - There is already correct TensorFlow GPU installed on the base image, # we don't want to break that @@ -54,7 +54,7 @@ RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \ --artifact convert_graphdef_memmapped_format --target . # Build KenLM to generate new scorers -WORKDIR /DeepSpeech/native_client +WORKDIR /STT/native_client RUN rm -rf kenlm && \ git clone https://github.com/kpu/kenlm && \ cd kenlm && \ @@ -63,6 +63,6 @@ RUN rm -rf kenlm && \ cd build && \ cmake .. && \ make -j $(nproc) -WORKDIR /DeepSpeech +WORKDIR /STT RUN ./bin/run-ldc93s1.sh diff --git a/GRAPH_VERSION b/GRAPH_VERSION index b9a65815..06f18ad2 120000 --- a/GRAPH_VERSION +++ b/GRAPH_VERSION @@ -1 +1 @@ -training/deepspeech_training/GRAPH_VERSION \ No newline at end of file +training/coqui_stt_training/GRAPH_VERSION \ No newline at end of file diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 08345c3a..2d72979b 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -1,4 +1,4 @@ -For support and discussions, please use our [Discourse forums](https://discourse.mozilla.org/c/deep-speech). +For support and discussions, please use [GitHub Discussions](https://github.com/coqui-ai/STT/discussions). 
If you've found a bug, or have a feature request, then please create an issue with the following information: diff --git a/Makefile b/Makefile index 2d28d24b..6953c437 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ -DEEPSPEECH_REPO ?= https://github.com/mozilla/DeepSpeech.git -DEEPSPEECH_SHA ?= origin/master +STT_REPO ?= https://github.com/coqui-ai/STT.git +STT_SHA ?= origin/main Dockerfile%: Dockerfile%.tmpl sed \ - -e "s|#DEEPSPEECH_REPO#|$(DEEPSPEECH_REPO)|g" \ - -e "s|#DEEPSPEECH_SHA#|$(DEEPSPEECH_SHA)|g" \ + -e "s|#STT_REPO#|$(STT_REPO)|g" \ + -e "s|#STT_SHA#|$(STT_SHA)|g" \ < $< > $@ diff --git a/README.rst b/README.rst index 26a48afc..1ef9d14e 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,20 @@ -Project DeepSpeech -================== +.. image:: images/coqui-STT-logo-green.png + :alt: Coqui STT logo -.. image:: https://readthedocs.org/projects/deepspeech/badge/?version=latest - :target: https://deepspeech.readthedocs.io/?badge=latest +.. image:: https://readthedocs.org/projects/stt/badge/?version=latest + :target: https://stt.readthedocs.io/?badge=latest :alt: Documentation +.. image:: https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg + :target: CODE_OF_CONDUCT.md + :alt: Contributor Covenant -.. image:: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/DeepSpeech/master/badge.svg - :target: https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/DeepSpeech/master/latest - :alt: Task Status +**Coqui STT** is an open-source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. 🐸STT uses Google's `TensorFlow `_ to make the implementation easier. +**Documentation** for installation, usage, and training models are available on `stt.readthedocs.io `_. -DeepSpeech is an open-source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. 
Project DeepSpeech uses Google's `TensorFlow `_ to make the implementation easier. - -Documentation for installation, usage, and training models are available on `deepspeech.readthedocs.io `_. - -For the latest release, including pre-trained models and checkpoints, `see the latest release on GitHub `_. +For the **latest release**, including pre-trained models and checkpoints, `see the latest release on GitHub `_. For contribution guidelines, see `CONTRIBUTING.rst `_. diff --git a/RELEASE.rst b/RELEASE.rst deleted file mode 100644 index 4e9143c0..00000000 --- a/RELEASE.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Making a (new) release of the codebase -====================================== - - -* Update version in VERSION file, commit -* Open PR, ensure all tests are passing properly -* Merge the PR -* Fetch the new master, tag it with (hopefully) the same version as in VERSION -* Push that to Github -* New build should be triggered and new packages should be made -* TaskCluster should schedule a merge build **including** a "DeepSpeech Packages" task diff --git a/SUPPORT.rst b/SUPPORT.rst index d72a7418..ad8e64d5 100644 --- a/SUPPORT.rst +++ b/SUPPORT.rst @@ -5,8 +5,8 @@ Contact/Getting Help There are several ways to contact us or to get help: -#. `Discourse Forums `_ - The `Deep Speech category on Discourse `_ is the first place to look. Search for keywords related to your question or problem to see if someone else has run into it already. If you can't find anything relevant there, search on our `issue tracker `_ to see if there is an existing issue about your problem. +#. `GitHub Discussions `_ - `GitHub Discussions `_ is the first place to look. Search for keywords related to your question or problem to see if someone else has run into it already. If you can't find anything relevant there, search on our `issue tracker `_ to see if there is an existing issue about your problem. -#. 
`Matrix chat `_ - If your question is not addressed by either the `FAQ `_ or `Discourse Forums `_\ , you can contact us on the ``#machinelearning`` channel on `Mozilla Matrix `_\ ; people there can try to answer/help +#. `Matrix chat `_ - If your question is not addressed on `GitHub Discussions `_\ , you can contact us on the ``#stt:matrix.org`` `channel on Matrix `_. -#. `Create a new issue `_ - Finally, if you have a bug report or a feature request that isn't already covered by an existing issue, please open an issue in our repo and fill the appropriate information on your hardware and software setup. +#. `Create a new issue `_ - Finally, if you have a bug report or a feature request that isn't already covered by an existing issue, please open an issue in our repo and fill the appropriate information on your hardware and software setup. diff --git a/VERSION b/VERSION index 8a3ed242..9b8b7c93 120000 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -training/deepspeech_training/VERSION \ No newline at end of file +training/coqui_stt_training/VERSION \ No newline at end of file diff --git a/bin/compare_samples.py b/bin/compare_samples.py index 19a60575..3bef72ca 100755 --- a/bin/compare_samples.py +++ b/bin/compare_samples.py @@ -6,8 +6,8 @@ import sys import argparse import numpy as np -from deepspeech_training.util.audio import AUDIO_TYPE_NP, mean_dbfs -from deepspeech_training.util.sample_collections import load_sample +from coqui_stt_training.util.audio import AUDIO_TYPE_NP, mean_dbfs +from coqui_stt_training.util.sample_collections import load_sample def fail(message): diff --git a/bin/data_set_tool.py b/bin/data_set_tool.py index 604684b9..521dda21 100755 --- a/bin/data_set_tool.py +++ b/bin/data_set_tool.py @@ -8,20 +8,20 @@ import argparse import progressbar from pathlib import Path -from deepspeech_training.util.audio import ( +from coqui_stt_training.util.audio import ( AUDIO_TYPE_PCM, AUDIO_TYPE_OPUS, AUDIO_TYPE_WAV, change_audio_types, ) -from 
deepspeech_training.util.downloader import SIMPLE_BAR -from deepspeech_training.util.sample_collections import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR +from coqui_stt_training.util.sample_collections import ( CSVWriter, DirectSDBWriter, TarWriter, samples_from_sources, ) -from deepspeech_training.util.augmentations import ( +from coqui_stt_training.util.augmentations import ( parse_augmentations, apply_sample_augmentations, SampleAugmentation diff --git a/bin/import_aidatatang.py b/bin/import_aidatatang.py index c53eba09..8eac7de6 100755 --- a/bin/import_aidatatang.py +++ b/bin/import_aidatatang.py @@ -5,7 +5,7 @@ import tarfile import pandas -from deepspeech_training.util.importers import get_importers_parser +from coqui_stt_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_aishell.py b/bin/import_aishell.py index 341d0d88..3ca71f02 100755 --- a/bin/import_aishell.py +++ b/bin/import_aishell.py @@ -5,7 +5,7 @@ import tarfile import pandas -from deepspeech_training.util.importers import get_importers_parser +from coqui_stt_training.util.importers import get_importers_parser COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_ccpmf.py b/bin/import_ccpmf.py index 0895b144..0d670a43 100755 --- a/bin/import_ccpmf.py +++ b/bin/import_ccpmf.py @@ -30,9 +30,9 @@ except ImportError as ex: import requests import json -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.helpers import secs_to_hours -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.helpers import secs_to_hours +from coqui_stt_training.util.importers import ( get_counter, get_importers_parser, get_imported_samples, diff --git a/bin/import_cv.py b/bin/import_cv.py index 392a1301..a59c9a25 100755 --- 
a/bin/import_cv.py +++ b/bin/import_cv.py @@ -10,13 +10,13 @@ from multiprocessing import Pool import progressbar import sox -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, print_import_report, ) -from deepspeech_training.util.importers import validate_label_eng as validate_label +from coqui_stt_training.util.importers import validate_label_eng as validate_label FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"] SAMPLE_RATE = 16000 @@ -35,7 +35,7 @@ def _download_and_preprocess_data(target_dir): archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL) # Conditionally extract common voice data _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path) - # Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav + # Conditionally convert common voice CSV files and mp3 data to Coqui STT CSVs and wav _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME) diff --git a/bin/import_cv2.py b/bin/import_cv2.py index 19a5741c..fcc3635a 100755 --- a/bin/import_cv2.py +++ b/bin/import_cv2.py @@ -3,7 +3,7 @@ Broadly speaking, this script takes the audio downloaded from Common Voice for a certain language, in addition to the *.tsv files output by CorporaCreator, and the script formats the data and transcripts to be in a state usable by -DeepSpeech.py +train.py Use "python3 import_cv2.py -h" for help """ import csv @@ -15,8 +15,8 @@ from multiprocessing import Pool import progressbar import sox -from deepspeech_training.util.downloader import SIMPLE_BAR -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, @@ -138,9 +138,9 @@ def 
_maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever print_import_report(counter, SAMPLE_RATE, MAX_SECS) output_csv = os.path.join(os.path.abspath(audio_dir), dataset + ".csv") - print("Saving new DeepSpeech-formatted CSV file to: ", output_csv) + print("Saving new Coqui STT-formatted CSV file to: ", output_csv) with open(output_csv, "w", encoding="utf-8", newline="") as output_csv_file: - print("Writing CSV file for DeepSpeech.py as: ", output_csv) + print("Writing CSV file for train.py as: ", output_csv) writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES) writer.writeheader() bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR) diff --git a/bin/import_fisher.py b/bin/import_fisher.py index 1b5a495c..9c6f8a7b 100755 --- a/bin/import_fisher.py +++ b/bin/import_fisher.py @@ -11,7 +11,7 @@ import librosa import pandas import soundfile # <= Has an external dependency on libsndfile -from deepspeech_training.util.importers import validate_label_eng as validate_label +from coqui_stt_training.util.importers import validate_label_eng as validate_label # Prerequisite: Having the sph2pipe tool in your PATH: # https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools diff --git a/bin/import_freestmandarin.py b/bin/import_freestmandarin.py index 55ce9128..f1838d91 100755 --- a/bin/import_freestmandarin.py +++ b/bin/import_freestmandarin.py @@ -6,7 +6,7 @@ import tarfile import numpy as np import pandas -from deepspeech_training.util.importers import get_importers_parser +from coqui_stt_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_gram_vaani.py b/bin/import_gram_vaani.py index 71fcee08..80bf0241 100755 --- a/bin/import_gram_vaani.py +++ b/bin/import_gram_vaani.py @@ -12,7 +12,7 @@ import pandas as pd from sox import Transformer import swifter -from deepspeech_training.util.importers import get_importers_parser, 
get_validate_label +from coqui_stt_training.util.importers import get_importers_parser, get_validate_label __version__ = "0.1.0" _logger = logging.getLogger(__name__) diff --git a/bin/import_ldc93s1.py b/bin/import_ldc93s1.py index 86a00d74..85088b93 100755 --- a/bin/import_ldc93s1.py +++ b/bin/import_ldc93s1.py @@ -4,7 +4,7 @@ import sys import pandas -from deepspeech_training.util.downloader import maybe_download +from coqui_stt_training.util.downloader import maybe_download def _download_and_preprocess_data(data_dir): diff --git a/bin/import_librivox.py b/bin/import_librivox.py index 32c1d20a..491488fa 100755 --- a/bin/import_librivox.py +++ b/bin/import_librivox.py @@ -12,7 +12,7 @@ import progressbar from sox import Transformer from tensorflow.python.platform import gfile -from deepspeech_training.util.downloader import maybe_download +from coqui_stt_training.util.downloader import maybe_download SAMPLE_RATE = 16000 diff --git a/bin/import_lingua_libre.py b/bin/import_lingua_libre.py index 956d7a0b..1c8f31ae 100755 --- a/bin/import_lingua_libre.py +++ b/bin/import_lingua_libre.py @@ -12,8 +12,8 @@ from multiprocessing import Pool import progressbar import sox -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, diff --git a/bin/import_m-ailabs.py b/bin/import_m-ailabs.py index bbaa744b..0e655612 100755 --- a/bin/import_m-ailabs.py +++ b/bin/import_m-ailabs.py @@ -10,8 +10,8 @@ from multiprocessing import Pool import progressbar -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, 
get_importers_parser, diff --git a/bin/import_magicdata.py b/bin/import_magicdata.py index c8502784..8b289804 100755 --- a/bin/import_magicdata.py +++ b/bin/import_magicdata.py @@ -6,7 +6,7 @@ import wave import pandas -from deepspeech_training.util.importers import get_importers_parser +from coqui_stt_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_primewords.py b/bin/import_primewords.py index 08f3302a..4643bd39 100755 --- a/bin/import_primewords.py +++ b/bin/import_primewords.py @@ -7,7 +7,7 @@ import tarfile import numpy as np import pandas -from deepspeech_training.util.importers import get_importers_parser +from coqui_stt_training.util.importers import get_importers_parser COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"] diff --git a/bin/import_slr57.py b/bin/import_slr57.py index 57588696..94869c21 100755 --- a/bin/import_slr57.py +++ b/bin/import_slr57.py @@ -9,8 +9,8 @@ from multiprocessing import Pool import progressbar -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, diff --git a/bin/import_swb.py b/bin/import_swb.py index c55ce298..b192d9f8 100755 --- a/bin/import_swb.py +++ b/bin/import_swb.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # ensure that you have downloaded the LDC dataset LDC97S62 and tar exists in a folder e.g. 
# ./data/swb/swb1_LDC97S62.tgz -# from the deepspeech directory run with: ./bin/import_swb.py ./data/swb/ +# from the Coqui STT directory run with: ./bin/import_swb.py ./data/swb/ import codecs import fnmatch import os @@ -17,7 +17,7 @@ import pandas import requests import soundfile # <= Has an external dependency on libsndfile -from deepspeech_training.util.importers import validate_label_eng as validate_label +from coqui_stt_training.util.importers import validate_label_eng as validate_label # ARCHIVE_NAME refers to ISIP alignments from 01/29/03 ARCHIVE_NAME = "switchboard_word_alignments.tar.gz" diff --git a/bin/import_swc.py b/bin/import_swc.py index 3775de05..d660b300 100755 --- a/bin/import_swc.py +++ b/bin/import_swc.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Downloads and prepares (parts of) the "Spoken Wikipedia Corpora" for DeepSpeech.py +Downloads and prepares (parts of) the "Spoken Wikipedia Corpora" for train.py Use "python3 import_swc.py -h" for help """ @@ -22,8 +22,8 @@ from multiprocessing.pool import ThreadPool import progressbar import sox -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import validate_label_eng as validate_label +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import validate_label_eng as validate_label from ds_ctcdecoder import Alphabet SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.tar" diff --git a/bin/import_ted.py b/bin/import_ted.py index bad1452f..f88a248f 100755 --- a/bin/import_ted.py +++ b/bin/import_ted.py @@ -10,8 +10,8 @@ import pandas from sox import Transformer from tensorflow.python.platform import gfile -from deepspeech_training.util.downloader import maybe_download -from deepspeech_training.util.stm import parse_stm_file +from coqui_stt_training.util.downloader import maybe_download +from coqui_stt_training.util.stm import parse_stm_file def 
_download_and_preprocess_data(data_dir): diff --git a/bin/import_ts.py b/bin/import_ts.py index e0130130..0ce3fdf2 100755 --- a/bin/import_ts.py +++ b/bin/import_ts.py @@ -10,8 +10,8 @@ import progressbar import sox import unidecode -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, get_importers_parser, @@ -25,7 +25,7 @@ MAX_SECS = 15 ARCHIVE_NAME = "2019-04-11_fr_FR" ARCHIVE_DIR_NAME = "ts_" + ARCHIVE_NAME ARCHIVE_URL = ( - "https://deepspeech-storage-mirror.s3.fr-par.scw.cloud/" + ARCHIVE_NAME + ".zip" + "https://deepspeech-storage-mirror.s3.fr-par.scw.cloud/" + ARCHIVE_NAME + ".zip" ) @@ -38,7 +38,7 @@ def _download_and_preprocess_data(target_dir, english_compatible=False): ) # Conditionally extract archive data _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path) - # Conditionally convert TrainingSpeech data to DeepSpeech CSVs and wav + # Conditionally convert TrainingSpeech data to Coqui STT CSVs and wav _maybe_convert_sets( target_dir, ARCHIVE_DIR_NAME, english_compatible=english_compatible ) diff --git a/bin/import_tuda.py b/bin/import_tuda.py index da0cb42b..16f4dcc8 100755 --- a/bin/import_tuda.py +++ b/bin/import_tuda.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Downloads and prepares (parts of) the "German Distant Speech" corpus (TUDA) for DeepSpeech.py +Downloads and prepares (parts of) the "German Distant Speech" corpus (TUDA) for train.py Use "python3 import_tuda.py -h" for help """ import argparse @@ -14,8 +14,8 @@ from collections import Counter import progressbar -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import validate_label_eng as validate_label +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from
coqui_stt_training.util.importers import validate_label_eng as validate_label from ds_ctcdecoder import Alphabet TUDA_VERSION = "v2" diff --git a/bin/import_vctk.py b/bin/import_vctk.py index f9c86799..b2b85b6d 100755 --- a/bin/import_vctk.py +++ b/bin/import_vctk.py @@ -11,8 +11,8 @@ from zipfile import ZipFile import librosa import progressbar -from deepspeech_training.util.downloader import SIMPLE_BAR, maybe_download -from deepspeech_training.util.importers import ( +from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download +from coqui_stt_training.util.importers import ( get_counter, get_imported_samples, print_import_report, @@ -35,7 +35,7 @@ def _download_and_preprocess_data(target_dir): archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL) # Conditionally extract common voice data _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path) - # Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav + # Conditionally convert common voice CSV files and mp3 data to Coqui STT CSVs and wav _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME) diff --git a/bin/import_voxforge.py b/bin/import_voxforge.py index cae5f744..b01dca72 100755 --- a/bin/import_voxforge.py +++ b/bin/import_voxforge.py @@ -14,7 +14,7 @@ from os import makedirs, path import pandas from bs4 import BeautifulSoup from tensorflow.python.platform import gfile -from deepspeech_training.util.downloader import maybe_download +from coqui_stt_training.util.downloader import maybe_download """The number of jobs to run in parallel""" NUM_PARALLEL = 8 diff --git a/bin/play.py b/bin/play.py index 60383344..59433e18 100755 --- a/bin/play.py +++ b/bin/play.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files +Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and 🐸STT CSV files Use "python3 play.py -h" 
for help """ @@ -9,9 +9,9 @@ import sys import random import argparse -from deepspeech_training.util.audio import get_loadable_audio_type_from_extension, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV -from deepspeech_training.util.sample_collections import SampleList, LabeledSample, samples_from_source -from deepspeech_training.util.augmentations import parse_augmentations, apply_sample_augmentations, SampleAugmentation +from coqui_stt_training.util.audio import get_loadable_audio_type_from_extension, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV +from coqui_stt_training.util.sample_collections import SampleList, LabeledSample, samples_from_source +from coqui_stt_training.util.augmentations import parse_augmentations, apply_sample_augmentations, SampleAugmentation def get_samples_in_play_order(): @@ -68,7 +68,7 @@ def play_collection(): def handle_args(): parser = argparse.ArgumentParser( description="Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) " - "and DeepSpeech CSV files" + "and Coqui STT CSV files" ) parser.add_argument("source", help="Sample DB, CSV or WAV file to play samples from") parser.add_argument( diff --git a/bin/run-ldc93s1.sh b/bin/run-ldc93s1.sh index 4bee5c70..d19722b0 100755 --- a/bin/run-ldc93s1.sh +++ b/bin/run-ldc93s1.sh @@ -1,7 +1,7 @@ #!/bin/sh set -xe -if [ ! -f DeepSpeech.py ]; then - echo "Please make sure you run this from DeepSpeech's top level directory." +if [ ! -f train.py ]; then + echo "Please make sure you run this from STT's top level directory." 
exit 1 fi; @@ -13,14 +13,14 @@ fi; if [ -d "${COMPUTE_KEEP_DIR}" ]; then checkpoint_dir=$COMPUTE_KEEP_DIR else - checkpoint_dir=$(python -c 'from xdg import BaseDirectory as xdg; print(xdg.save_data_path("deepspeech/ldc93s1"))') + checkpoint_dir=$(python -c 'from xdg import BaseDirectory as xdg; print(xdg.save_data_path("stt/ldc93s1"))') fi # Force only one visible device because we have a single-sample dataset # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar \ +python -u train.py --noshow_progressbar \ --train_files data/ldc93s1/ldc93s1.csv \ --test_files data/ldc93s1/ldc93s1.csv \ --train_batch_size 1 \ diff --git a/bin/run-tc-graph_augmentations.sh b/bin/run-tc-graph_augmentations.sh index 9b6181ae..c958ef2e 100755 --- a/bin/run-tc-graph_augmentations.sh +++ b/bin/run-tc-graph_augmentations.sh @@ -14,7 +14,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --scorer "" \ --augment dropout \ diff --git a/bin/run-tc-ldc93s1_checkpoint.sh b/bin/run-tc-ldc93s1_checkpoint.sh index 9dc4e84e..c499c5a8 100755 --- a/bin/run-tc-ldc93s1_checkpoint.sh +++ b/bin/run-tc-ldc93s1_checkpoint.sh @@ -14,7 +14,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ --test_files ${ldc93s1_csv} --test_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_checkpoint_bytes.sh b/bin/run-tc-ldc93s1_checkpoint_bytes.sh index d6fe98e9..8af85a44 100755 --- a/bin/run-tc-ldc93s1_checkpoint_bytes.sh 
+++ b/bin/run-tc-ldc93s1_checkpoint_bytes.sh @@ -14,7 +14,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ --test_files ${ldc93s1_csv} --test_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_checkpoint_sdb.sh b/bin/run-tc-ldc93s1_checkpoint_sdb.sh index c811f984..d3006f30 100755 --- a/bin/run-tc-ldc93s1_checkpoint_sdb.sh +++ b/bin/run-tc-ldc93s1_checkpoint_sdb.sh @@ -20,7 +20,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_sdb} --train_batch_size 1 \ --dev_files ${ldc93s1_sdb} --dev_batch_size 1 \ --test_files ${ldc93s1_sdb} --test_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_new.sh b/bin/run-tc-ldc93s1_new.sh index 8e9cf4d4..fb1d48ca 100755 --- a/bin/run-tc-ldc93s1_new.sh +++ b/bin/run-tc-ldc93s1_new.sh @@ -17,7 +17,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --feature_cache '/tmp/ldc93s1_cache' \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_new_bytes.sh b/bin/run-tc-ldc93s1_new_bytes.sh index 5ce787d3..2296ed1f 100755 --- a/bin/run-tc-ldc93s1_new_bytes.sh +++ b/bin/run-tc-ldc93s1_new_bytes.sh @@ -17,7 +17,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py 
--noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --feature_cache '/tmp/ldc93s1_cache' \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_new_bytes_tflite.sh b/bin/run-tc-ldc93s1_new_bytes_tflite.sh index f1a79f12..3cb8da59 100755 --- a/bin/run-tc-ldc93s1_new_bytes_tflite.sh +++ b/bin/run-tc-ldc93s1_new_bytes_tflite.sh @@ -16,7 +16,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar \ +python -u train.py --noshow_progressbar \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt_bytes' \ --export_dir '/tmp/train_bytes_tflite' \ diff --git a/bin/run-tc-ldc93s1_new_metrics.sh b/bin/run-tc-ldc93s1_new_metrics.sh index 01403bf1..6077cb41 100755 --- a/bin/run-tc-ldc93s1_new_metrics.sh +++ b/bin/run-tc-ldc93s1_new_metrics.sh @@ -17,7 +17,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ --test_files ${ldc93s1_csv} --test_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_new_sdb.sh b/bin/run-tc-ldc93s1_new_sdb.sh index 6cd4a450..47c8eefc 100755 --- a/bin/run-tc-ldc93s1_new_sdb.sh +++ b/bin/run-tc-ldc93s1_new_sdb.sh @@ -23,7 +23,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_sdb} --train_batch_size 1 \ --dev_files ${ldc93s1_sdb} --dev_batch_size 1 \ --test_files ${ldc93s1_sdb} --test_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_new_sdb_csv.sh b/bin/run-tc-ldc93s1_new_sdb_csv.sh index ec3e7774..3a1f61ef 100755 --- 
a/bin/run-tc-ldc93s1_new_sdb_csv.sh +++ b/bin/run-tc-ldc93s1_new_sdb_csv.sh @@ -23,7 +23,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_sdb},${ldc93s1_csv} --train_batch_size 1 \ --feature_cache '/tmp/ldc93s1_cache_sdb_csv' \ --dev_files ${ldc93s1_sdb},${ldc93s1_csv} --dev_batch_size 1 \ diff --git a/bin/run-tc-ldc93s1_singleshotinference.sh b/bin/run-tc-ldc93s1_singleshotinference.sh index 997bf08f..cf5e4abb 100755 --- a/bin/run-tc-ldc93s1_singleshotinference.sh +++ b/bin/run-tc-ldc93s1_singleshotinference.sh @@ -14,7 +14,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ +python -u train.py --noshow_progressbar --noearly_stop \ --train_files ${ldc93s1_csv} --train_batch_size 1 \ --dev_files ${ldc93s1_csv} --dev_batch_size 1 \ --test_files ${ldc93s1_csv} --test_batch_size 1 \ @@ -23,7 +23,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ --learning_rate 0.001 --dropout_rate 0.05 \ --scorer_path 'data/smoke_test/pruned_lm.scorer' -python -u DeepSpeech.py \ +python -u train.py \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ --scorer_path 'data/smoke_test/pruned_lm.scorer' \ diff --git a/bin/run-tc-ldc93s1_tflite.sh b/bin/run-tc-ldc93s1_tflite.sh index f7daca21..ca9fd976 100755 --- a/bin/run-tc-ldc93s1_tflite.sh +++ b/bin/run-tc-ldc93s1_tflite.sh @@ -16,7 +16,7 @@ fi; # and when trying to run on multiple devices (like GPUs), this will break export CUDA_VISIBLE_DEVICES=0 -python -u DeepSpeech.py --noshow_progressbar \ +python -u train.py --noshow_progressbar \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ --export_dir '/tmp/train_tflite' \ @@ -26,7 +26,7 @@ python -u DeepSpeech.py --noshow_progressbar \ mkdir 
/tmp/train_tflite/en-us -python -u DeepSpeech.py --noshow_progressbar \ +python -u train.py --noshow_progressbar \ --n_hidden 100 \ --checkpoint_dir '/tmp/ckpt' \ --export_dir '/tmp/train_tflite/en-us' \ diff --git a/bin/run-tc-transfer.sh b/bin/run-tc-transfer.sh index aae6d71a..4a0edeab 100755 --- a/bin/run-tc-transfer.sh +++ b/bin/run-tc-transfer.sh @@ -29,7 +29,7 @@ for LOAD in 'init' 'last' 'auto'; do echo "########################################################" echo "#### Train ENGLISH model with just --checkpoint_dir ####" echo "########################################################" - python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ + python -u train.py --noshow_progressbar --noearly_stop \ --alphabet_config_path "./data/alphabet.txt" \ --load_train "$LOAD" \ --train_files "${ldc93s1_csv}" --train_batch_size 1 \ @@ -43,7 +43,7 @@ for LOAD in 'init' 'last' 'auto'; do echo "##############################################################################" echo "#### Train ENGLISH model with --save_checkpoint_dir --load_checkpoint_dir ####" echo "##############################################################################" - python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ + python -u train.py --noshow_progressbar --noearly_stop \ --alphabet_config_path "./data/alphabet.txt" \ --load_train "$LOAD" \ --train_files "${ldc93s1_csv}" --train_batch_size 1 \ @@ -58,7 +58,7 @@ for LOAD in 'init' 'last' 'auto'; do echo "####################################################################################" echo "#### Transfer to RUSSIAN model with --save_checkpoint_dir --load_checkpoint_dir ####" echo "####################################################################################" - python -u DeepSpeech.py --noshow_progressbar --noearly_stop \ + python -u train.py --noshow_progressbar --noearly_stop \ --drop_source_layers 1 \ --alphabet_config_path "${ru_dir}/alphabet.ru" \ --load_train 'last' \ diff --git a/data/README.rst 
b/data/README.rst index f731a31c..289146c9 100644 --- a/data/README.rst +++ b/data/README.rst @@ -3,9 +3,9 @@ Language-Specific Data This directory contains language-specific data files. Most importantly, you will find here: -1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt``. After installing the training code, you can check ``python -m deepspeech_training.util.check_characters --help`` for a tool that creates an alphabet file from a list of training CSV files. +1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt``. After installing the training code, you can check ``python -m coqui_stt_training.util.check_characters --help`` for a tool that creates an alphabet file from a list of training CSV files. 2. A script used to generate a binary n-gram language model: ``data/lm/generate_lm.py``. -For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `deepspeech.readthedocs.io `_. +For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `stt.readthedocs.io `_. diff --git a/data/lm/generate_lm.py b/data/lm/generate_lm.py index 659d5077..47941437 100644 --- a/data/lm/generate_lm.py +++ b/data/lm/generate_lm.py @@ -130,7 +130,7 @@ def build_lm(args, data_lower, vocab_str): def main(): parser = argparse.ArgumentParser( - description="Generate lm.binary and top-k vocab for DeepSpeech." + description="Generate lm.binary and top-k vocab for Coqui STT." ) parser.add_argument( "--input_txt", diff --git a/doc/DeepSpeech.rst b/doc/Architecture.rst similarity index 99% rename from doc/DeepSpeech.rst rename to doc/Architecture.rst index 3d74d22e..a701a888 100644 --- a/doc/DeepSpeech.rst +++ b/doc/Architecture.rst @@ -1,5 +1,5 @@ -DeepSpeech Model -================ +STT Model +========= The aim of this project is to create a simple, open, and ubiquitous speech recognition engine. 
Simple, in that the engine should not require server-class diff --git a/doc/BUILDING.rst b/doc/BUILDING.rst index 56484205..3f0457d4 100644 --- a/doc/BUILDING.rst +++ b/doc/BUILDING.rst @@ -1,12 +1,12 @@ .. _build-native-client: -Building DeepSpeech Binaries -============================ +Building Coqui STT Binaries +=========================== This section describes how to rebuild binaries. We have already several prebuilt binaries for all the supported platform, it is highly advised to use them except if you know what you are doing. -If you'd like to build the DeepSpeech binaries yourself, you'll need the following pre-requisites downloaded and installed: +If you'd like to build the 🐸STT binaries yourself, you'll need the following pre-requisites downloaded and installed: * `Bazel 3.1.0 `_ * `General TensorFlow r2.3 requirements `_ @@ -26,18 +26,18 @@ If you'd like to build the language bindings or the decoder package, you'll also Dependencies ------------ -If you follow these instructions, you should compile your own binaries of DeepSpeech (built on TensorFlow using Bazel). +If you follow these instructions, you should compile your own binaries of 🐸STT (built on TensorFlow using Bazel). For more information on configuring TensorFlow, read the docs up to the end of `"Configure the Build" `_. Checkout source code ^^^^^^^^^^^^^^^^^^^^ -Clone DeepSpeech source code (TensorFlow will come as a submdule): +Clone 🐸STT source code (TensorFlow will come as a submodule): ..
code-block:: - git clone https://github.com/mozilla/DeepSpeech.git + git clone https://github.com/coqui-ai/STT.git git submodule sync tensorflow/ git submodule update --init tensorflow/ @@ -56,24 +56,24 @@ After you have installed the correct version of Bazel, configure TensorFlow: cd tensorflow ./configure -Compile DeepSpeech ------------------- +Compile Coqui STT +----------------- -Compile ``libdeepspeech.so`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Compile ``libstt.so`` +^^^^^^^^^^^^^^^^^^^^^ -Within your TensorFlow directory, there should be a symbolic link to the DeepSpeech ``native_client`` directory. If it is not present, create it with the follow command: +Within your TensorFlow directory, there should be a symbolic link to the 🐸STT ``native_client`` directory. If it is not present, create it with the follow command: .. code-block:: cd tensorflow ln -s ../native_client -You can now use Bazel to build the main DeepSpeech library, ``libdeepspeech.so``. Add ``--config=cuda`` if you want a CUDA build. +You can now use Bazel to build the main 🐸STT library, ``libstt.so``. Add ``--config=cuda`` if you want a CUDA build. .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libstt.so The generated binaries will be saved to ``bazel-bin/native_client/``. @@ -82,24 +82,24 @@ The generated binaries will be saved to ``bazel-bin/native_client/``. 
Compile ``generate_scorer_package`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Following the same setup as for ``libdeepspeech.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``. +Following the same setup as for ``libstt.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``. Using the example from above you can build the library and that binary at the same time: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_scorer_package + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libstt.so //native_client:generate_scorer_package The generated binaries will be saved to ``bazel-bin/native_client/``. Compile Language Bindings ^^^^^^^^^^^^^^^^^^^^^^^^^ -Now, ``cd`` into the ``DeepSpeech/native_client`` directory and use the ``Makefile`` to build all the language bindings (C++ client, Python package, Nodejs package, etc.). +Now, ``cd`` into the ``STT/native_client`` directory and use the ``Makefile`` to build all the language bindings (C++ client, Python package, Nodejs package, etc.). .. code-block:: - cd ../DeepSpeech/native_client - make deepspeech + cd ../STT/native_client + make stt Installing your own Binaries ---------------------------- @@ -121,9 +121,9 @@ Included are a set of generated Python bindings. After following the above build cd native_client/python make bindings - pip install dist/deepspeech* + pip install dist/stt-* -The API mirrors the C++ API and is demonstrated in `client.py `_. 
Refer to `deepspeech.h `_ for documentation. +The API mirrors the C++ API and is demonstrated in `client.py `_. Refer to `coqui-stt.h `_ for documentation. Install NodeJS / ElectronJS bindings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -136,7 +136,7 @@ After following the above build and installation instructions, the Node.JS bindi make build make npm-pack -This will create the package ``deepspeech-VERSION.tgz`` in ``native_client/javascript``. +This will create the package ``stt-VERSION.tgz`` in ``native_client/javascript``. Install the CTC decoder package ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -196,23 +196,23 @@ So your command line for ``RPi3`` and ``ARMv7`` should look like: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3 --config=rpi3_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3 --config=rpi3_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libstt.so And your command line for ``LePotato`` and ``ARM64`` should look like: .. code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3-armv8 --config=rpi3-armv8_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=rpi3-armv8 --config=rpi3-armv8_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libstt.so While we test only on RPi3 Raspbian Buster and LePotato ARMBian Buster, anything compatible with ``armv7-a cortex-a53`` or ``armv8-a cortex-a53`` should be fine. -The ``deepspeech`` binary can also be cross-built, with ``TARGET=rpi3`` or ``TARGET=rpi3-armv8``. 
This might require you to setup a system tree using the tool ``multistrap`` and the multitrap configuration files: ``native_client/multistrap_armbian64_buster.conf`` and ``native_client/multistrap_raspbian_buster.conf``. +The ``stt`` binary can also be cross-built, with ``TARGET=rpi3`` or ``TARGET=rpi3-armv8``. This might require you to set up a system tree using the tool ``multistrap`` and the multistrap configuration files: ``native_client/multistrap_armbian64_buster.conf`` and ``native_client/multistrap_raspbian_buster.conf``. The path of the system tree can be overridden from the default values defined in ``definitions.mk`` through the ``RASPBIAN`` ``make`` variable. .. code-block:: - cd ../DeepSpeech/native_client - make TARGET= deepspeech + cd ../STT/native_client + make TARGET= stt Android devices support ----------------------- @@ -224,64 +224,66 @@ Please refer to TensorFlow documentation on how to setup the environment to buil Using the library from Android project ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We provide uptodate and tested ``libdeepspeech`` usable as an ``AAR`` package, -for Android versions starting with 7.0 to 11.0. The package is published on -`JCenter `_, -and the ``JCenter`` repository should be available by default in any Android -project. Please make sure your project is setup to pull from this repository. -You can then include the library by just adding this line to your -``gradle.build``, adjusting ``VERSION`` to the version you need: +Due to the discontinuation of Bintray JCenter we do not have pre-built Android packages published for now. We are working to move to Maven Central and will update this section when it's available. + +.. We provide uptodate and tested ``libstt`` usable as an ``AAR`` package, + for Android versions starting with 7.0 to 11.0. The package is published on + `JCenter `_, + and the ``JCenter`` repository should be available by default in any Android + project.
Please make sure your project is setup to pull from this repository. + You can then include the library by just adding this line to your + ``gradle.build``, adjusting ``VERSION`` to the version you need: + + .. code-block:: + + implementation 'stt.coqui.ai:libstt:VERSION@aar' + +Building ``libstt.so`` for Android +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can build the ``libstt.so`` using (ARMv7): .. code-block:: - implementation 'deepspeech.mozilla.org:libdeepspeech:VERSION@aar' - -Building ``libdeepspeech.so`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can build the ``libdeepspeech.so`` using (ARMv7): - -.. code-block:: - - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libstt.so Or (ARM64): .. 
code-block:: - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm64 --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic --config=android --config=android_arm64 --define=runtime=tflite --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++14 --copt=-D_GLIBCXX_USE_C99 //native_client:libstt.so -Building ``libdeepspeech.aar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Building ``libstt.aar`` +^^^^^^^^^^^^^^^^^^^^^^^ In the unlikely event you have to rebuild the JNI bindings, source code is -available under the ``libdeepspeech`` subdirectory. Building depends on shared -object: please ensure to place ``libdeepspeech.so`` into the -``libdeepspeech/libs/{arm64-v8a,armeabi-v7a,x86_64}/`` matching subdirectories. +available under the ``libstt`` subdirectory. Building depends on shared +object: please ensure to place ``libstt.so`` into the +``libstt/libs/{arm64-v8a,armeabi-v7a,x86_64}/`` matching subdirectories. Building the bindings is managed by ``gradle`` and should be limited to issuing -``./gradlew libdeepspeech:build``, producing an ``AAR`` package in -``./libdeepspeech/build/outputs/aar/``. +``./gradlew libstt:build``, producing an ``AAR`` package in +``./libstt/build/outputs/aar/``. Please note that you might have to copy the file to a local Maven repository and adapt file naming (when missing, the error message should states what filename it expects and where). -Building C++ ``deepspeech`` binary -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Building C++ ``stt`` binary +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Building the ``deepspeech`` binary will happen through ``ndk-build`` (ARMv7): +Building the ``stt`` binary will happen through ``ndk-build`` (ARMv7): .. 
code-block:: - cd ../DeepSpeech/native_client + cd ../STT/native_client $ANDROID_NDK_HOME/ndk-build APP_PLATFORM=android-21 APP_BUILD_SCRIPT=$(pwd)/Android.mk NDK_PROJECT_PATH=$(pwd) APP_STL=c++_shared TFDIR=$(pwd)/../tensorflow/ TARGET_ARCH_ABI=armeabi-v7a And (ARM64): .. code-block:: - cd ../DeepSpeech/native_client + cd ../STT/native_client $ANDROID_NDK_HOME/ndk-build APP_PLATFORM=android-21 APP_BUILD_SCRIPT=$(pwd)/Android.mk NDK_PROJECT_PATH=$(pwd) APP_STL=c++_shared TFDIR=$(pwd)/../tensorflow/ TARGET_ARCH_ABI=arm64-v8a Android demo APK @@ -303,13 +305,13 @@ demo of one usage of the application. For example, it's only able to read PCM mono 16kHz 16-bits file and it might fail on some WAVE file that are not following exactly the specification. -Running ``deepspeech`` via adb -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Running ``stt`` via adb +^^^^^^^^^^^^^^^^^^^^^^^ You should use ``adb push`` to send data to device, please refer to Android documentation on how to use that. -Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including: +Please push 🐸STT data to ``/sdcard/STT/``\ , including: * ``output_graph.tflite`` which is the TF Lite model @@ -319,8 +321,8 @@ Please push DeepSpeech data to ``/sdcard/deepspeech/``\ , including: Then, push binaries from ``native_client.tar.xz`` to ``/data/local/tmp/ds``\ : -* ``deepspeech`` -* ``libdeepspeech.so`` +* ``stt`` +* ``libstt.so`` * ``libc++_shared.so`` You should then be able to run as usual, using a shell from ``adb shell``\ : @@ -328,7 +330,7 @@ You should then be able to run as usual, using a shell from ``adb shell``\ : .. code-block:: user@device$ cd /data/local/tmp/ds/ - user@device$ LD_LIBRARY_PATH=$(pwd)/ ./deepspeech [...] + user@device$ LD_LIBRARY_PATH=$(pwd)/ ./stt [...] Please note that Android linker does not support ``rpath`` so you have to set ``LD_LIBRARY_PATH``. 
Properly wrapped / packaged bindings does embed the library @@ -347,7 +349,7 @@ to leverage GPU / DSP / NPU * Hexagon, the Qualcomm-specific DSP This is highly experimental: -* Requires passing environment variable ``DS_TFLITE_DELEGATE`` with values of +* Requires passing environment variable ``STT_TFLITE_DELEGATE`` with values of ``gpu``, ``nnapi`` or ``hexagon`` (only one at a time) * Might require exported model changes (some Op might not be supported) * We can't guarantee it will work, nor it will be faster than default diff --git a/doc/BUILDING_DotNet.rst b/doc/BUILDING_DotNet.rst index a85598e0..5bc73175 100644 --- a/doc/BUILDING_DotNet.rst +++ b/doc/BUILDING_DotNet.rst @@ -1,9 +1,9 @@ .. _build-native-client-dotnet: -Building DeepSpeech native client for Windows -============================================= +Building Coqui STT native client for Windows +============================================ -Now we can build the native client of DeepSpeech and run inference on Windows using the C# client, to do that we need to compile the ``native_client``. +Now we can build the native client of 🐸STT and run inference on Windows using the C# client, to do that we need to compile the ``native_client``. **Table of Contents** @@ -44,11 +44,11 @@ We highly recommend sticking to the recommended versions of CUDA/cuDNN in order Getting the code ---------------- -We need to clone ``mozilla/DeepSpeech``. +We need to clone ``coqui-ai/STT``. .. code-block:: bash - git clone https://github.com/mozilla/DeepSpeech + git clone https://github.com/coqui-ai/STT git submodule sync tensorflow/ git submodule update --init tensorflow/ @@ -61,8 +61,8 @@ There should already be a symbolic link, for this example let's suppose that we . 
├── D:\ - │ ├── cloned # Contains DeepSpeech and tensorflow side by side - │ │ └── DeepSpeech # Root of the cloned DeepSpeech + │ ├── cloned # Contains 🐸STT and tensorflow side by side + │ │ └── STT # Root of the cloned 🐸STT │ │ ├── tensorflow # Root of the cloned mozilla/tensorflow └── ... @@ -71,7 +71,7 @@ Change your path accordingly to your path structure, for the structure above we .. code-block:: bash - mklink /d "D:\cloned\DeepSpeech\tensorflow\native_client" "D:\cloned\DeepSpeech\native_client" + mklink /d "D:\cloned\STT\tensorflow\native_client" "D:\cloned\STT\native_client" Adding environment variables ---------------------------- @@ -119,7 +119,7 @@ Building the native_client There's one last command to run before building, you need to run the `configure.py `_ inside ``tensorflow`` cloned directory. -At this point we are ready to start building the ``native_client``, go to ``tensorflow`` sub-directory, following our examples should be ``D:\cloned\DeepSpeech\tensorflow``. +At this point we are ready to start building the ``native_client``, go to ``tensorflow`` sub-directory, following our examples should be ``D:\cloned\STT\tensorflow``. CPU ~~~ @@ -128,7 +128,7 @@ We will add AVX/AVX2 support in the command, please make sure that your CPU supp .. code-block:: bash - bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" -c opt --copt=/arch:AVX --copt=/arch:AVX2 //native_client:libdeepspeech.so + bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" -c opt --copt=/arch:AVX --copt=/arch:AVX2 //native_client:libstt.so GPU with CUDA ~~~~~~~~~~~~~ @@ -137,11 +137,11 @@ If you enabled CUDA in `configure.py `_ in your DeepSpeech directory and open the Visual Studio solution, then we need to build in debug or release mode, finally we just need to copy ``libdeepspeech.so`` to the generated ``x64/Debug`` or ``x64/Release`` directory. 
+As for now we can only use the generated ``libstt.so`` with the C# clients, go to `native_client/dotnet/ `_ in your STT directory and open the Visual Studio solution, then we need to build in debug or release mode, finally we just need to copy ``libstt.so`` to the generated ``x64/Debug`` or ``x64/Release`` directory. diff --git a/doc/C-API.rst b/doc/C-API.rst index d9c2da1d..b76c06b8 100644 --- a/doc/C-API.rst +++ b/doc/C-API.rst @@ -10,65 +10,65 @@ C API See also the list of error codes including descriptions for each error in :ref:`error-codes`. -.. doxygenfunction:: DS_CreateModel - :project: deepspeech-c +.. doxygenfunction:: STT_CreateModel + :project: stt-c -.. doxygenfunction:: DS_FreeModel - :project: deepspeech-c +.. doxygenfunction:: STT_FreeModel + :project: stt-c -.. doxygenfunction:: DS_EnableExternalScorer - :project: deepspeech-c +.. doxygenfunction:: STT_EnableExternalScorer + :project: stt-c -.. doxygenfunction:: DS_DisableExternalScorer - :project: deepspeech-c +.. doxygenfunction:: STT_DisableExternalScorer + :project: stt-c -.. doxygenfunction:: DS_AddHotWord - :project: deepspeech-c +.. doxygenfunction:: STT_AddHotWord + :project: stt-c -.. doxygenfunction:: DS_EraseHotWord - :project: deepspeech-c +.. doxygenfunction:: STT_EraseHotWord + :project: stt-c -.. doxygenfunction:: DS_ClearHotWords - :project: deepspeech-c +.. doxygenfunction:: STT_ClearHotWords + :project: stt-c -.. doxygenfunction:: DS_SetScorerAlphaBeta - :project: deepspeech-c +.. doxygenfunction:: STT_SetScorerAlphaBeta + :project: stt-c -.. doxygenfunction:: DS_GetModelSampleRate - :project: deepspeech-c +.. doxygenfunction:: STT_GetModelSampleRate + :project: stt-c -.. doxygenfunction:: DS_SpeechToText - :project: deepspeech-c +.. doxygenfunction:: STT_SpeechToText + :project: stt-c -.. doxygenfunction:: DS_SpeechToTextWithMetadata - :project: deepspeech-c +.. doxygenfunction:: STT_SpeechToTextWithMetadata + :project: stt-c -.. 
doxygenfunction:: DS_CreateStream - :project: deepspeech-c +.. doxygenfunction:: STT_CreateStream + :project: stt-c -.. doxygenfunction:: DS_FeedAudioContent - :project: deepspeech-c +.. doxygenfunction:: STT_FeedAudioContent + :project: stt-c -.. doxygenfunction:: DS_IntermediateDecode - :project: deepspeech-c +.. doxygenfunction:: STT_IntermediateDecode + :project: stt-c -.. doxygenfunction:: DS_IntermediateDecodeWithMetadata - :project: deepspeech-c +.. doxygenfunction:: STT_IntermediateDecodeWithMetadata + :project: stt-c -.. doxygenfunction:: DS_FinishStream - :project: deepspeech-c +.. doxygenfunction:: STT_FinishStream + :project: stt-c -.. doxygenfunction:: DS_FinishStreamWithMetadata - :project: deepspeech-c +.. doxygenfunction:: STT_FinishStreamWithMetadata + :project: stt-c -.. doxygenfunction:: DS_FreeStream - :project: deepspeech-c +.. doxygenfunction:: STT_FreeStream + :project: stt-c -.. doxygenfunction:: DS_FreeMetadata - :project: deepspeech-c +.. doxygenfunction:: STT_FreeMetadata + :project: stt-c -.. doxygenfunction:: DS_FreeString - :project: deepspeech-c +.. doxygenfunction:: STT_FreeString + :project: stt-c -.. doxygenfunction:: DS_Version - :project: deepspeech-c +.. doxygenfunction:: STT_Version + :project: stt-c diff --git a/doc/Contributed-Examples.rst b/doc/Contributed-Examples.rst index 7eaba452..a4c08a86 100644 --- a/doc/Contributed-Examples.rst +++ b/doc/Contributed-Examples.rst @@ -1,4 +1,4 @@ User contributed examples ========================= -There are also several user contributed examples available on a separate examples repository: `https://github.com/mozilla/DeepSpeech-examples `_. +There are also several user contributed examples available on a separate examples repository: `https://github.com/coqui-ai/STT-examples `_. 
diff --git a/doc/Decoder.rst b/doc/Decoder.rst index da974bc4..471f1234 100644 --- a/doc/Decoder.rst +++ b/doc/Decoder.rst @@ -6,7 +6,7 @@ CTC beam search decoder Introduction ^^^^^^^^^^^^ -DeepSpeech uses the `Connectionist Temporal Classification `_ loss function. For an excellent explanation of CTC and its usage, see this Distill article: `Sequence Modeling with CTC `_. This document assumes the reader is familiar with the concepts described in that article, and describes DeepSpeech specific behaviors that developers building systems with DeepSpeech should know to avoid problems. +🐸STT uses the `Connectionist Temporal Classification `_ loss function. For an excellent explanation of CTC and its usage, see this Distill article: `Sequence Modeling with CTC `_. This document assumes the reader is familiar with the concepts described in that article, and describes 🐸STT specific behaviors that developers building systems with 🐸STT should know to avoid problems. Note: Documentation for the tooling for creating custom scorer packages is available in :ref:`scorer-scripts`. @@ -16,19 +16,19 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S External scorer ^^^^^^^^^^^^^^^ -DeepSpeech clients support OPTIONAL use of an external language model to improve the accuracy of the predicted transcripts. In the code, command line parameters, and documentation, this is referred to as a "scorer". The scorer is used to compute the likelihood (also called a score, hence the name "scorer") of sequences of words or characters in the output, to guide the decoder towards more likely results. This improves accuracy significantly. +🐸STT clients support OPTIONAL use of an external language model to improve the accuracy of the predicted transcripts. In the code, command line parameters, and documentation, this is referred to as a "scorer". 
The scorer is used to compute the likelihood (also called a score, hence the name "scorer") of sequences of words or characters in the output, to guide the decoder towards more likely results. This improves accuracy significantly. -The use of an external scorer is fully optional. When an external scorer is not specified, DeepSpeech still uses a beam search decoding algorithm, but without any outside scoring. +The use of an external scorer is fully optional. When an external scorer is not specified, 🐸STT still uses a beam search decoding algorithm, but without any outside scoring. -Currently, the DeepSpeech external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. See :ref:`scorer-scripts` for more details on how to reproduce our scorer file as well as create your own. +Currently, the 🐸STT external scorer is implemented with `KenLM `_, plus some tooling to package the necessary files and metadata into a single ``.scorer`` package. The tooling lives in ``data/lm/``. The scripts included in ``data/lm/`` can be used and modified to build your own language model based on your particular use case or language. See :ref:`scorer-scripts` for more details on how to reproduce our scorer file as well as create your own. -The scripts are geared towards replicating the language model files we release as part of `DeepSpeech model releases `_, but modifying them to use different datasets or language model construction parameters should be simple. +The scripts are geared towards replicating the language model files we release as part of `STT model releases `_, but modifying them to use different datasets or language model construction parameters should be simple. 
Decoding modes ^^^^^^^^^^^^^^ -DeepSpeech currently supports two modes of operation with significant differences at both training and decoding time. Note that Bytes output mode is experimental and has not been tested for languages other than Chinese Mandarin. +🐸STT currently supports two modes of operation with significant differences at both training and decoding time. Note that Bytes output mode is experimental and has not been tested for languages other than Chinese Mandarin. Default mode (alphabet based) diff --git a/doc/DotNet-API.rst b/doc/DotNet-API.rst index 92342ded..bba28896 100644 --- a/doc/DotNet-API.rst +++ b/doc/DotNet-API.rst @@ -2,18 +2,18 @@ ============== -DeepSpeech Class +STT Class ---------------- -.. doxygenclass:: DeepSpeechClient::DeepSpeech - :project: deepspeech-dotnet +.. doxygenclass:: STTClient::STT + :project: stt-dotnet :members: -DeepSpeechStream Class +Stream Class ---------------------- -.. doxygenclass:: DeepSpeechClient::Models::DeepSpeechStream - :project: deepspeech-dotnet +.. doxygenclass:: STTClient::Models::Stream + :project: stt-dotnet :members: ErrorCodes @@ -21,33 +21,33 @@ ErrorCodes See also the main definition including descriptions for each error in :ref:`error-codes`. -.. doxygenenum:: DeepSpeechClient::Enums::ErrorCodes - :project: deepspeech-dotnet +.. doxygenenum:: STTClient::Enums::ErrorCodes + :project: stt-dotnet Metadata -------- -.. doxygenclass:: DeepSpeechClient::Models::Metadata - :project: deepspeech-dotnet +.. doxygenclass:: STTClient::Models::Metadata + :project: stt-dotnet :members: Transcripts CandidateTranscript ------------------- -.. doxygenclass:: DeepSpeechClient::Models::CandidateTranscript - :project: deepspeech-dotnet +.. doxygenclass:: STTClient::Models::CandidateTranscript + :project: stt-dotnet :members: Tokens, Confidence TokenMetadata ------------- -.. doxygenclass:: DeepSpeechClient::Models::TokenMetadata - :project: deepspeech-dotnet +.. 
doxygenclass:: STTClient::Models::TokenMetadata + :project: stt-dotnet :members: Text, Timestep, StartTime -DeepSpeech Interface +STT Interface -------------------- -.. doxygeninterface:: DeepSpeechClient::Interfaces::IDeepSpeech - :project: deepspeech-dotnet +.. doxygeninterface:: STTClient::Interfaces::ISTT + :project: stt-dotnet :members: diff --git a/doc/DotNet-Examples.rst b/doc/DotNet-Examples.rst index a00ee833..beec6243 100644 --- a/doc/DotNet-Examples.rst +++ b/doc/DotNet-Examples.rst @@ -1,12 +1,12 @@ .NET API Usage example ====================== -Examples are from `native_client/dotnet/DeepSpeechConsole/Program.cs`. +Examples are from `native_client/dotnet/STTConsole/Program.cs`. Creating a model instance and loading model ------------------------------------------- -.. literalinclude:: ../native_client/dotnet/DeepSpeechConsole/Program.cs +.. literalinclude:: ../native_client/dotnet/STTConsole/Program.cs :language: csharp :linenos: :lineno-match: @@ -16,7 +16,7 @@ Creating a model instance and loading model Performing inference -------------------- -.. literalinclude:: ../native_client/dotnet/DeepSpeechConsole/Program.cs +.. literalinclude:: ../native_client/dotnet/STTConsole/Program.cs :language: csharp :linenos: :lineno-match: @@ -26,4 +26,4 @@ Performing inference Full source code ---------------- -See :download:`Full source code<../native_client/dotnet/DeepSpeechConsole/Program.cs>`. +See :download:`Full source code<../native_client/dotnet/STTConsole/Program.cs>`. diff --git a/doc/Error-Codes.rst b/doc/Error-Codes.rst index 361ca025..932d6b46 100644 --- a/doc/Error-Codes.rst +++ b/doc/Error-Codes.rst @@ -5,7 +5,7 @@ Error codes Below is the definition for all error codes used in the API, their numerical values, and a human readable description. -.. literalinclude:: ../native_client/deepspeech.h +.. 
literalinclude:: ../native_client/coqui-stt.h :language: c :start-after: sphinx-doc: error_code_listing_start :end-before: sphinx-doc: error_code_listing_end diff --git a/doc/Flags.rst b/doc/Flags.rst index 66b26f0c..deb3e65a 100644 --- a/doc/Flags.rst +++ b/doc/Flags.rst @@ -3,12 +3,12 @@ Command-line flags for the training scripts =========================================== -Below you can find the definition of all command-line flags supported by the training scripts. This includes ``DeepSpeech.py``, ``evaluate.py``, ``evaluate_tflite.py``, ``transcribe.py`` and ``lm_optimizer.py``. +Below you can find the definition of all command-line flags supported by the training scripts. This includes ``train.py``, ``evaluate.py``, ``evaluate_tflite.py``, ``transcribe.py`` and ``lm_optimizer.py``. Flags ----- -.. literalinclude:: ../training/deepspeech_training/util/flags.py +.. literalinclude:: ../training/coqui_stt_training/util/flags.py :language: python :linenos: :lineno-match: diff --git a/doc/HotWordBoosting-Examples.rst b/doc/HotWordBoosting-Examples.rst index a234c89c..deb32ddd 100644 --- a/doc/HotWordBoosting-Examples.rst +++ b/doc/HotWordBoosting-Examples.rst @@ -1,7 +1,7 @@ Hot-word boosting API Usage example =================================== -With DeepSpeech 0.9 release a new API feature was introduced that allows boosting probability from the scorer of given words. It is exposed in all bindings (C, Python, JS, Java and .Net). +With the 🐸STT 0.9 release a new API feature was introduced that allows boosting probability from the scorer of given words. It is exposed in all bindings (C, Python, JS, Java and .Net). Currently, it provides three methods for the Model class: @@ -19,11 +19,11 @@ It is worth noting that boosting non-existent words in scorer (mostly proper nou Adjusting the boosting value ---------------------------- -For hot-word boosting it is hard to determine what the optimal value that one might be searching for is. 
Additionally, this is dependant on the input audio file. In practice, as it was reported by DeepSpeech users, the value should be not bigger than 20.0 for positive value boosting. Nevertheless, each usecase is different and you might need to adjust values on your own. +For hot-word boosting it is hard to determine the optimal value. Additionally, this is dependent on the input audio file. In practice, as reported by 🐸STT users, the value should be no greater than 20.0 for positive value boosting. Nevertheless, each use case is different and you might need to adjust values on your own. -There is a user contributed script available on ``DeepSpeech-examples`` repository for adjusting boost values: +There is a user-contributed script available in the ``STT-examples`` repository for adjusting boost values: -`https://github.com/mozilla/DeepSpeech-examples/tree/master/hotword_adjusting `_. +`https://github.com/coqui-ai/STT-examples/tree/master/hotword_adjusting `_. Positive value boosting diff --git a/doc/Java-API.rst b/doc/Java-API.rst index a61bd1b1..69603141 100644 --- a/doc/Java-API.rst +++ b/doc/Java-API.rst @@ -1,29 +1,29 @@ Java ==== -DeepSpeechModel +STTModel --------------- -.. doxygenclass:: org::deepspeech::libdeepspeech::DeepSpeechModel - :project: deepspeech-java +.. doxygenclass:: ai::coqui::libstt::STTModel + :project: stt-java :members: Metadata -------- -.. doxygenclass:: org::deepspeech::libdeepspeech::Metadata - :project: deepspeech-java +.. doxygenclass:: ai::coqui::libstt::Metadata + :project: stt-java :members: getNumTranscripts, getTranscript CandidateTranscript ------------------- -.. doxygenclass:: org::deepspeech::libdeepspeech::CandidateTranscript - :project: deepspeech-java +.. doxygenclass:: ai::coqui::libstt::CandidateTranscript + :project: stt-java :members: getNumTokens, getConfidence, getToken TokenMetadata ------------- -..
doxygenclass:: org::deepspeech::libdeepspeech::TokenMetadata - :project: deepspeech-java +.. doxygenclass:: ai::coqui::libstt::TokenMetadata + :project: stt-java :members: getText, getTimestep, getStartTime diff --git a/doc/Java-Examples.rst b/doc/Java-Examples.rst index 04836ed5..834354df 100644 --- a/doc/Java-Examples.rst +++ b/doc/Java-Examples.rst @@ -1,12 +1,12 @@ Java API Usage example ====================== -Examples are from `native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java`. +Examples are from `native_client/java/app/src/main/java/ai/coqui/STTActivity.java`. Creating a model instance and loading model ------------------------------------------- -.. literalinclude:: ../native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java +.. literalinclude:: ../native_client/java/app/src/main/java/ai/coqui/STTActivity.java :language: java :linenos: :lineno-match: @@ -16,7 +16,7 @@ Creating a model instance and loading model Performing inference -------------------- -.. literalinclude:: ../native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java +.. literalinclude:: ../native_client/java/app/src/main/java/ai/coqui/STTActivity.java :language: java :linenos: :lineno-match: @@ -26,4 +26,4 @@ Performing inference Full source code ---------------- -See :download:`Full source code<../native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java>`. +See :download:`Full source code<../native_client/java/app/src/main/java/ai/coqui/STTActivity.java>`. diff --git a/doc/Makefile b/doc/Makefile index 0980ab24..2eb83d83 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -4,7 +4,7 @@ # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build -SPHINXPROJ = DeepSpeech +SPHINXPROJ = "Coqui STT" SOURCEDIR = . 
BUILDDIR = .build diff --git a/doc/ParallelOptimization.rst b/doc/ParallelOptimization.rst index e0d3734c..9f6dca36 100644 --- a/doc/ParallelOptimization.rst +++ b/doc/ParallelOptimization.rst @@ -1,7 +1,7 @@ Parallel Optimization ===================== -This is how we implement optimization of the DeepSpeech model across GPUs on a +This is how we implement optimization of the 🐸STT model across GPUs on a single host. Parallel optimization can take on various forms. For example one can use asynchronous updates of the model, synchronous updates of the model, or some combination of the two. diff --git a/doc/SUPPORTED_PLATFORMS.rst b/doc/SUPPORTED_PLATFORMS.rst index 1ccfb7e3..800d92f2 100644 --- a/doc/SUPPORTED_PLATFORMS.rst +++ b/doc/SUPPORTED_PLATFORMS.rst @@ -9,61 +9,61 @@ Linux / AMD64 without GPU ^^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Ubuntu 14.04+ (glibc >= 2.19, libstdc++6 >= 4.8) -* Full TensorFlow runtime (``deepspeech`` packages) -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* Full TensorFlow runtime (``stt`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Linux / AMD64 with GPU ^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Ubuntu 14.04+ (glibc >= 2.19, libstdc++6 >= 4.8) * CUDA 10.0 (and capable GPU) -* Full TensorFlow runtime (``deepspeech`` packages) -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* Full TensorFlow runtime (``stt`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Linux / ARMv7 ^^^^^^^^^^^^^ * Cortex-A53 compatible ARMv7 SoC with Neon support * Raspbian Buster-compatible distribution -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Linux / Aarch64 ^^^^^^^^^^^^^^^ * Cortex-A72 compatible Aarch64 SoC * ARMbian Buster-compatible distribution -* TensorFlow Lite runtime 
(``deepspeech-tflite`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Android / ARMv7 ^^^^^^^^^^^^^^^ * ARMv7 SoC with Neon support * Android 7.0-10.0 * NDK API level >= 21 -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Android / Aarch64 ^^^^^^^^^^^^^^^^^ * Aarch64 SoC * Android 7.0-10.0 * NDK API level >= 21 -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) macOS / AMD64 ^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * macOS >= 10.10 -* Full TensorFlow runtime (``deepspeech`` packages) -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* Full TensorFlow runtime (``stt`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Windows / AMD64 without GPU ^^^^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Windows Server >= 2012 R2 ; Windows >= 8.1 -* Full TensorFlow runtime (``deepspeech`` packages) -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* Full TensorFlow runtime (``stt`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) Windows / AMD64 with GPU ^^^^^^^^^^^^^^^^^^^^^^^^ * x86-64 CPU with AVX/FMA (one can rebuild without AVX/FMA, but it might slow down inference) * Windows Server >= 2012 R2 ; Windows >= 8.1 * CUDA 10.0 (and capable GPU) -* Full TensorFlow runtime (``deepspeech`` packages) -* TensorFlow Lite runtime (``deepspeech-tflite`` packages) +* Full TensorFlow runtime (``stt`` packages) +* TensorFlow Lite runtime (``stt-tflite`` packages) diff --git a/doc/Scorer.rst b/doc/Scorer.rst index 1f374604..881a3f91 100644 --- a/doc/Scorer.rst +++ b/doc/Scorer.rst @@ -3,11 +3,11 @@ External scorer scripts ======================= -DeepSpeech pre-trained models include an external scorer. 
This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. +🐸STT pre-trained models include an external scorer. This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. The scorer is composed of two sub-components, a KenLM language model and a trie data structure containing all words in the vocabulary. In order to create the scorer package, first we must create a KenLM language model (using ``data/lm/generate_lm.py``, and then use ``generate_scorer_package`` to create the final package file including the trie data structure. -The ``generate_scorer_package`` binary is part of the native client package that is included with official releases. You can find the appropriate archive for your platform in the `GitHub release downloads `_. The native client package is named ``native_client.{arch}.{config}.{plat}.tar.xz``, where ``{arch}`` is the architecture the binary was built for, for example ``amd64`` or ``arm64``, ``config`` is the build configuration, which for building decoder packages does not matter, and ``{plat}`` is the platform the binary was built-for, for example ``linux`` or ``osx``. If you wanted to run the ``generate_scorer_package`` binary on a Linux desktop, you would download ``native_client.amd64.cpu.linux.tar.xz``. +The ``generate_scorer_package`` binary is part of the native client package that is included with official releases. You can find the appropriate archive for your platform in the `GitHub release downloads `_. The native client package is named ``native_client.{arch}.{config}.{plat}.tar.xz``, where ``{arch}`` is the architecture the binary was built for, for example ``amd64`` or ``arm64``, ``config`` is the build configuration, which for building decoder packages does not matter, and ``{plat}`` is the platform the binary was built-for, for example ``linux`` or ``osx``. 
If you wanted to run the ``generate_scorer_package`` binary on a Linux desktop, you would download ``native_client.amd64.cpu.linux.tar.xz``. Reproducing our external scorer ------------------------------- @@ -26,7 +26,7 @@ Then use the ``generate_lm.py`` script to generate ``lm.binary`` and ``vocab-500 As input you can use a plain text (e.g. ``file.txt``) or gzipped (e.g. ``file.txt.gz``) text file with one sentence in each line. -If you are using a container created from ``Dockerfile.build``, you can use ``--kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/``. +If you are using a container created from ``Dockerfile.build``, you can use ``--kenlm_bins /STT/native_client/kenlm/build/bin/``. Else you have to build `KenLM `_ first and then pass the build directory to the script. .. code-block:: bash @@ -44,7 +44,7 @@ Afterwards you can use ``generate_scorer_package`` to generate the scorer packag cd data/lm # Download and extract appropriate native_client package: - curl -LO http://github.com/mozilla/DeepSpeech/releases/... + curl -LO http://github.com/coqui-ai/STT/releases/... tar xvf native_client.*.tar.xz ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 @@ -59,6 +59,6 @@ Building your own scorer can be useful if you're using models in a narrow usage The LibriSpeech LM training text used by our scorer is around 4GB uncompressed, which should give an idea of the size of a corpus needed for a reasonable language model for general speech recognition. For more constrained use cases with smaller vocabularies, you don't need as much data, but you should still try to gather as much as you can. -With a text corpus in hand, you can then re-use ``generate_lm.py`` and ``generate_scorer_package`` to create your own scorer that is compatible with DeepSpeech clients and language bindings. 
Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. +With a text corpus in hand, you can then re-use ``generate_lm.py`` and ``generate_scorer_package`` to create your own scorer that is compatible with 🐸STT clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. After using ``generate_lm.py`` to create a KenLM language model binary file, you can use ``generate_scorer_package`` to create a scorer package as described in the previous section. Note that we have a :github:`lm_optimizer.py script ` which can be used to find good default values for alpha and beta. To use it, you must first generate a package with any value set for default alpha and beta flags. For this step, it doesn't matter what values you use, as they'll be overridden by ``lm_optimizer.py`` later. Then, use ``lm_optimizer.py`` with this scorer file to find good alpha and beta values. Finally, use ``generate_scorer_package`` again, this time with the new values. diff --git a/doc/Structs.rst b/doc/Structs.rst index 5d532277..14869dd2 100644 --- a/doc/Structs.rst +++ b/doc/Structs.rst @@ -5,19 +5,19 @@ Metadata -------- .. doxygenstruct:: Metadata - :project: deepspeech-c + :project: stt-c :members: CandidateTranscript ------------------- .. doxygenstruct:: CandidateTranscript - :project: deepspeech-c + :project: stt-c :members: TokenMetadata ------------- .. 
doxygenstruct:: TokenMetadata - :project: deepspeech-c + :project: stt-c :members: diff --git a/doc/TRAINING.rst b/doc/TRAINING.rst index a5a08e24..97703f4a 100644 --- a/doc/TRAINING.rst +++ b/doc/TRAINING.rst @@ -15,11 +15,11 @@ Prerequisites for training a model Getting the training code ^^^^^^^^^^^^^^^^^^^^^^^^^ -Clone the latest released stable branch from Github (e.g. 0.9.3, check `here `_): +Clone the latest released stable branch from Github (e.g. 0.9.3, check `here `_): .. code-block:: bash - git clone --branch v0.9.3 https://github.com/mozilla/DeepSpeech + git clone --branch v0.9.3 https://github.com/coqui-ai/STT If you plan on committing code or you want to report bugs, please use the master branch. @@ -28,31 +28,31 @@ Creating a virtual environment Throughout the documentation we assume you are using **virtualenv** to manage your Python environments. This setup is the one used and recommended by the project authors and is the easiest way to make sure you won't run into environment issues. If you're using **Anaconda, Miniconda or Mamba**, first read the instructions at :ref:`training-with-conda` and then continue from the installation step below. -In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run deepspeech. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/deepspeech-train-venv``. You can create it using this command: +In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run 🐸STT. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/coqui-stt-train-venv``. You can create it using this command: .. code-block:: - $ python3 -m venv $HOME/tmp/deepspeech-train-venv/ + $ python3 -m venv $HOME/tmp/coqui-stt-train-venv/ Once this command completes successfully, the environment will be ready to be activated. 
Activating the environment ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Each time you need to work with DeepSpeech, you have to *activate* this virtual environment. This is done with this simple command: +Each time you need to work with 🐸STT, you have to *activate* this virtual environment. This is done with this simple command: .. code-block:: - $ source $HOME/tmp/deepspeech-train-venv/bin/activate + $ source $HOME/tmp/coqui-stt-train-venv/bin/activate -Installing DeepSpeech Training Code and its dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Installing Coqui STT Training Code and its dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Install the required dependencies using ``pip3``\ : .. code-block:: bash - cd DeepSpeech + cd STT pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 pip3 install --upgrade -e . @@ -95,11 +95,11 @@ This should ensure that you'll re-use the upstream Python 3 TensorFlow GPU-enabl make Dockerfile.train -If you want to specify a different DeepSpeech repository / branch, you can pass ``DEEPSPEECH_REPO`` or ``DEEPSPEECH_SHA`` parameters: +If you want to specify a different 🐸STT repository / branch, you can pass ``STT_REPO`` or ``STT_SHA`` parameters: .. code-block:: bash - make Dockerfile.train DEEPSPEECH_REPO=git://your/fork DEEPSPEECH_SHA=origin/your-branch + make Dockerfile.train STT_REPO=git://your/fork STT_SHA=origin/your-branch Common Voice training data ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -112,7 +112,7 @@ After extraction of such a data set, you'll find the following contents: * the ``*.tsv`` files output by CorporaCreator for the downloaded language * the mp3 audio files they reference in a ``clips`` sub-directory. -For bringing this data into a form that DeepSpeech understands, you have to run the CommonVoice v2.0 importer (\ ``bin/import_cv2.py``\ ): +For bringing this data into a form that 🐸STT understands, you have to run the CommonVoice v2.0 importer (\ ``bin/import_cv2.py``\ ): .. 
code-block:: bash @@ -134,22 +134,22 @@ The CSV files comprise of the following fields: * ``wav_filesize`` - samples size given in bytes, used for sorting the data before training. Expects integer. * ``transcript`` - transcription target for the sample. -To use Common Voice data during training, validation and testing, you pass (comma separated combinations of) their filenames into ``--train_files``\ , ``--dev_files``\ , ``--test_files`` parameters of ``DeepSpeech.py``. +To use Common Voice data during training, validation and testing, you pass (comma separated combinations of) their filenames into ``--train_files``\ , ``--dev_files``\ , ``--test_files`` parameters of ``train.py``. -If, for example, Common Voice language ``en`` was extracted to ``../data/CV/en/``\ , ``DeepSpeech.py`` could be called like this: +If, for example, Common Voice language ``en`` was extracted to ``../data/CV/en/``\ , ``train.py`` could be called like this: .. code-block:: bash - python3 DeepSpeech.py --train_files ../data/CV/en/clips/train.csv --dev_files ../data/CV/en/clips/dev.csv --test_files ../data/CV/en/clips/test.csv + python3 train.py --train_files ../data/CV/en/clips/train.csv --dev_files ../data/CV/en/clips/dev.csv --test_files ../data/CV/en/clips/test.csv Training a model ^^^^^^^^^^^^^^^^ -The central (Python) script is ``DeepSpeech.py`` in the project's root directory. For its list of command line options, you can call: +The central (Python) script is ``train.py`` in the project's root directory. For its list of command line options, you can call: .. code-block:: bash - python3 DeepSpeech.py --helpfull + python3 train.py --helpfull To get the output of this in a slightly better-formatted way, you can also look at the flag definitions in :ref:`training-flags`. 
@@ -157,7 +157,7 @@ For executing pre-configured training scenarios, there is a collection of conven **If you experience GPU OOM errors while training, try reducing the batch size with the ``--train_batch_size``\ , ``--dev_batch_size`` and ``--test_batch_size`` parameters.** -As a simple first example you can open a terminal, change to the directory of the DeepSpeech checkout, activate the virtualenv created above, and run: +As a simple first example you can open a terminal, change to the directory of the 🐸STT checkout, activate the virtualenv created above, and run: .. code-block:: bash @@ -165,9 +165,9 @@ As a simple first example you can open a terminal, change to the directory of th This script will train on a small sample dataset composed of just a single audio file, the sample file for the `TIMIT Acoustic-Phonetic Continuous Speech Corpus `_, which can be overfitted on a GPU in a few minutes for demonstration purposes. From here, you can alter any variables with regards to what dataset is used, how many training iterations are run and the default values of the network parameters. -Feel also free to pass additional (or overriding) ``DeepSpeech.py`` parameters to these scripts. Then, just run the script to train the modified network. +Feel also free to pass additional (or overriding) ``train.py`` parameters to these scripts. Then, just run the script to train the modified network. -Each dataset has a corresponding importer script in ``bin/`` that can be used to download (if it's freely available) and preprocess the dataset. See ``bin/import_librivox.py`` for an example of how to import and preprocess a large dataset for training with DeepSpeech. +Each dataset has a corresponding importer script in ``bin/`` that can be used to download (if it's freely available) and preprocess the dataset. See ``bin/import_librivox.py`` for an example of how to import and preprocess a large dataset for training with 🐸STT. 
Some importers might require additional code to properly handled your locale-specific requirements. Such handling is dealt with ``--validate_label_locale`` flag that allows you to source out-of-tree Python script that defines a ``validate_label`` function. Please refer to ``util/importers.py`` for implementation example of that function. If you don't provide this argument, the default ``validate_label`` function will be used. This one is only intended for English language, so you might have consistency issues in your data for other languages. @@ -191,10 +191,10 @@ Automatic Mixed Precision (AMP) training on GPU for TensorFlow has been recently Mixed precision training makes use of both FP32 and FP16 precisions where appropriate. FP16 operations can leverage the Tensor cores on NVIDIA GPUs (Volta, Turing or newer architectures) for improved throughput. Mixed precision training also often allows larger batch sizes. Automatic mixed precision training can be enabled by including the flag `--automatic_mixed_precision` at training time: ``` -python3 DeepSpeech.py --train_files ./train.csv --dev_files ./dev.csv --test_files ./test.csv --automatic_mixed_precision +python3 train.py --train_files ./train.csv --dev_files ./dev.csv --test_files ./test.csv --automatic_mixed_precision ``` -On a Volta generation V100 GPU, automatic mixed precision speeds up DeepSpeech training and evaluation by ~30%-40%. +On a Volta generation V100 GPU, automatic mixed precision speeds up 🐸STT training and evaluation by ~30%-40%. Checkpointing ^^^^^^^^^^^^^ @@ -212,7 +212,7 @@ Refer to the :ref:`usage instructions ` for information on running a Exporting a model for TFLite ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to experiment with the TF Lite engine, you need to export a model that is compatible with it, then use the ``--export_tflite`` flags. 
If you already have a trained model, you can re-export it for TFLite by running ``DeepSpeech.py`` again and specifying the same ``checkpoint_dir`` that you used for training, as well as passing ``--export_tflite --export_dir /model/export/destination``. If you changed the alphabet you also need to add the ``--alphabet_config_path my-new-language-alphabet.txt`` flag. +If you want to experiment with the TF Lite engine, you need to export a model that is compatible with it, then use the ``--export_tflite`` flags. If you already have a trained model, you can re-export it for TFLite by running ``train.py`` again and specifying the same ``checkpoint_dir`` that you used for training, as well as passing ``--export_tflite --export_dir /model/export/destination``. If you changed the alphabet you also need to add the ``--alphabet_config_path my-new-language-alphabet.txt`` flag. Making a mmap-able model for inference ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -236,9 +236,9 @@ Upon sucessfull run, it should report about conversion of a non-zero number of n Continuing training from a release model ---------------------------------------- -There are currently two supported approaches to make use of a pre-trained DeepSpeech model: fine-tuning or transfer-learning. Choosing which one to use is a simple decision, and it depends on your target dataset. Does your data use the same alphabet as the release model? If "Yes": fine-tune. If "No" use transfer-learning. +There are currently two supported approaches to make use of a pre-trained 🐸STT model: fine-tuning or transfer-learning. Choosing which one to use is a simple decision, and it depends on your target dataset. Does your data use the same alphabet as the release model? If "Yes": fine-tune. If "No" use transfer-learning. -If your own data uses the *extact* same alphabet as the English release model (i.e. `a-z` plus `'`) then the release model's output layer will match your data, and you can just fine-tune the existing parameters. 
However, if you want to use a new alphabet (e.g. Cyrillic `а`, `б`, `д`), the output layer of a release DeepSpeech model will *not* match your data. In this case, you should use transfer-learning (i.e. remove the trained model's output layer, and reinitialize a new output layer that matches your target character set. +If your own data uses the *exact* same alphabet as the English release model (i.e. `a-z` plus `'`) then the release model's output layer will match your data, and you can just fine-tune the existing parameters. However, if you want to use a new alphabet (e.g. Cyrillic `а`, `б`, `д`), the output layer of a release 🐸STT model will *not* match your data. In this case, you should use transfer-learning (i.e. remove the trained model's output layer, and reinitialize a new output layer that matches your target character set). N.B. - If you have access to a pre-trained model which uses UTF-8 bytes at the output layer you can always fine-tune, because any alphabet should be encodable as UTF-8. @@ -247,14 +247,14 @@ N.B. - If you have access to a pre-trained model which uses UTF-8 bytes at the o Fine-Tuning (same alphabet) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you'd like to use one of the pre-trained models to bootstrap your training process (fine tuning), you can do so by using the ``--checkpoint_dir`` flag in ``DeepSpeech.py``. Specify the path where you downloaded the checkpoint from the release, and training will resume from the pre-trained model. +If you'd like to use one of the pre-trained models to bootstrap your training process (fine tuning), you can do so by using the ``--checkpoint_dir`` flag in ``train.py``. Specify the path where you downloaded the checkpoint from the release, and training will resume from the pre-trained model.
For example, if you want to fine tune the entire graph using your own data in ``my-train.csv``\ , ``my-dev.csv`` and ``my-test.csv``\ , for three epochs, you can something like the following, tuning the hyperparameters as needed: .. code-block:: bash mkdir fine_tuning_checkpoints - python3 DeepSpeech.py --n_hidden 2048 --checkpoint_dir path/to/checkpoint/folder --epochs 3 --train_files my-train.csv --dev_files my-dev.csv --test_files my_dev.csv --learning_rate 0.0001 + python3 train.py --n_hidden 2048 --checkpoint_dir path/to/checkpoint/folder --epochs 3 --train_files my-train.csv --dev_files my-dev.csv --test_files my-test.csv --learning_rate 0.0001 Notes about the release checkpoints: the released models were trained with ``--n_hidden 2048``\ , so you need to use that same value when initializing from the release models. Since v0.6.0, the release models are also trained with ``--train_cudnn``\ , so you'll need to specify that as well. If you don't have a CUDA compatible GPU, then you can workaround it by using the ``--load_cudnn`` flag. Use ``--helpfull`` to get more information on how the flags work. @@ -270,17 +270,17 @@ If you try to load a release model without following these steps, you'll get an Transfer-Learning (new alphabet) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to continue training an alphabet-based DeepSpeech model (i.e. not a UTF-8 model) on a new language, or if you just want to add new characters to your custom alphabet, you will probably want to use transfer-learning instead of fine-tuning. If you're starting with a pre-trained UTF-8 model -- even if your data comes from a different language or uses a different alphabet -- the model will be able to predict your new transcripts, and you should use fine-tuning instead. +If you want to continue training an alphabet-based 🐸STT model (i.e.
not a UTF-8 model) on a new language, or if you just want to add new characters to your custom alphabet, you will probably want to use transfer-learning instead of fine-tuning. If you're starting with a pre-trained UTF-8 model -- even if your data comes from a different language or uses a different alphabet -- the model will be able to predict your new transcripts, and you should use fine-tuning instead. -In a nutshell, DeepSpeech's transfer-learning allows you to remove certain layers from a pre-trained model, initialize new layers for your target data, stitch together the old and new layers, and update all layers via gradient descent. You will remove the pre-trained output layer (and optionally more layers) and reinitialize parameters to fit your target alphabet. The simplest case of transfer-learning is when you remove just the output layer. +In a nutshell, 🐸STT's transfer-learning allows you to remove certain layers from a pre-trained model, initialize new layers for your target data, stitch together the old and new layers, and update all layers via gradient descent. You will remove the pre-trained output layer (and optionally more layers) and reinitialize parameters to fit your target alphabet. The simplest case of transfer-learning is when you remove just the output layer. -In DeepSpeech's implementation of transfer-learning, all removed layers will be contiguous, starting from the output layer. The key flag you will want to experiment with is ``--drop_source_layers``. This flag accepts an integer from ``1`` to ``5`` and allows you to specify how many layers you want to remove from the pre-trained model. For example, if you supplied ``--drop_source_layers 3``, you will drop the last three layers of the pre-trained model: the output layer, penultimate layer, and LSTM layer. All dropped layers will be reinintialized, and (crucially) the output layer will be defined to match your supplied target alphabet. 
+In 🐸STT's implementation of transfer-learning, all removed layers will be contiguous, starting from the output layer. The key flag you will want to experiment with is ``--drop_source_layers``. This flag accepts an integer from ``1`` to ``5`` and allows you to specify how many layers you want to remove from the pre-trained model. For example, if you supplied ``--drop_source_layers 3``, you will drop the last three layers of the pre-trained model: the output layer, penultimate layer, and LSTM layer. All dropped layers will be reinitialized, and (crucially) the output layer will be defined to match your supplied target alphabet. You need to specify the location of the pre-trained model with ``--load_checkpoint_dir`` and define where your new model checkpoints will be saved with ``--save_checkpoint_dir``. You need to specify how many layers to remove (aka "drop") from the pre-trained model: ``--drop_source_layers``. You also need to supply your new alphabet file using the standard ``--alphabet_config_path`` (remember, using a new alphabet is the whole reason you want to use transfer-learning). .. code-block:: bash - python3 DeepSpeech.py \ + python3 train.py \ --drop_source_layers 1 \ --alphabet_config_path my-new-language-alphabet.txt \ --save_checkpoint_dir path/to/output-checkpoint/folder \ @@ -292,7 +292,7 @@ You need to specify the location of the pre-trained model with ``--load_checkpoi UTF-8 mode ^^^^^^^^^^ -DeepSpeech includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`. +🐸STT includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`. .. _training-data-augmentation: @@ -314,7 +314,7 @@ For example, for the ``overlay`` augmentation: ..
code-block:: - python3 DeepSpeech.py --augment overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0] ... + python3 train.py --augment overlay[p=0.1,source=/path/to/audio.sdb,snr=20.0] ... In the documentation below, whenever a value is specified as ```` or ````, it supports one of the follow formats: @@ -485,7 +485,7 @@ Example training with all augmentations: .. code-block:: bash - python -u DeepSpeech.py \ + python -u train.py \ --train_files "train.sdb" \ --feature_cache ./feature.cache \ --cache_for_epochs 10 \ @@ -541,5 +541,5 @@ To prevent common problems, make sure you **always use a separate environment wh .. code-block:: bash - (base) $ conda create -n deepspeech python=3.7 - (base) $ conda activate deepspeech + (base) $ conda create -n coqui-stt python=3.7 + (base) $ conda activate coqui-stt diff --git a/doc/USING.rst b/doc/USING.rst index 3380144a..9ca9c4d4 100644 --- a/doc/USING.rst +++ b/doc/USING.rst @@ -3,7 +3,7 @@ Using a Pre-trained Model ========================= -Inference using a DeepSpeech pre-trained model can be done with a client/language binding package. We have four clients/language bindings in this repository, listed below, and also a few community-maintained clients/language bindings in other repositories, listed `further down in this README <#third-party-bindings>`_. +Inference using a 🐸STT pre-trained model can be done with a client/language binding package. We have four clients/language bindings in this repository, listed below, and also a few community-maintained clients/language bindings in other repositories, listed `further down in this README <#third-party-bindings>`_. * :ref:`The C API `. * :ref:`The Python package/language binding ` @@ -13,7 +13,7 @@ Inference using a DeepSpeech pre-trained model can be done with a client/languag .. 
_runtime-deps: -Running ``deepspeech`` might, see below, require some runtime dependencies to be already installed on your system: +Running ``stt`` might, see below, require some runtime dependencies to be already installed on your system: * ``sox`` - The Python and Node.JS clients use SoX to resample files to 16kHz. * ``libgomp1`` - libsox (statically linked into the clients) depends on OpenMP. Some people have had to install this manually. @@ -33,23 +33,23 @@ The GPU capable builds (Python, NodeJS, C++, etc) depend on CUDA 10.1 and CuDNN Getting the pre-trained model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you want to use the pre-trained English model for performing speech-to-text, you can download it (along with other important inference material) from the DeepSpeech `releases page `_. Alternatively, you can run the following command to download the model files in your current directory: +If you want to use the pre-trained English model for performing speech-to-text, you can download it (along with other important inference material) from the 🐸STT `releases page `_. Alternatively, you can run the following command to download the model files in your current directory: .. code-block:: bash - wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm - wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer + wget https://github.com/coqui-ai/STT/releases/download/v0.9.3/coqui-stt-0.9.3-models.pbmm + wget https://github.com/coqui-ai/STT/releases/download/v0.9.3/coqui-stt-0.9.3-models.scorer -There are several pre-trained model files available in official releases. Files ending in ``.pbmm`` are compatible with clients and language bindings built against the standard TensorFlow runtime. Usually these packages are simply called ``deepspeech``. These files are also compatible with CUDA enabled clients and language bindings. These packages are usually called ``deepspeech-gpu``. 
Files ending in ``.tflite`` are compatible with clients and language bindings built against the `TensorFlow Lite runtime `_. These models are optimized for size and performance in low power devices. On desktop platforms, the compatible packages are called ``deepspeech-tflite``. On Android and Raspberry Pi, we only publish TensorFlow Lite enabled packages, and they are simply called ``deepspeech``. You can see a full list of supported platforms and which TensorFlow runtime is supported at :ref:`supported-platforms-inference`. +There are several pre-trained model files available in official releases. Files ending in ``.pbmm`` are compatible with clients and language bindings built against the standard TensorFlow runtime. Usually these packages are simply called ``stt``. These files are also compatible with CUDA enabled clients and language bindings. These packages are usually called ``stt-gpu``. Files ending in ``.tflite`` are compatible with clients and language bindings built against the `TensorFlow Lite runtime `_. These models are optimized for size and performance in low power devices. On desktop platforms, the compatible packages are called ``stt-tflite``. On Android and Raspberry Pi, we only publish TensorFlow Lite enabled packages, and they are simply called ``stt``. You can see a full list of supported platforms and which TensorFlow runtime is supported at :ref:`supported-platforms-inference`. 
+--------------------+---------------------+---------------------+ | Package/Model type | .pbmm | .tflite | +====================+=====================+=====================+ -| deepspeech | Depends on platform | Depends on platform | +| stt | Depends on platform | Depends on platform | +--------------------+---------------------+---------------------+ -| deepspeech-gpu | ✅ | ❌ | +| stt-gpu | ✅ | ❌ | +--------------------+---------------------+---------------------+ -| deepspeech-tflite | ❌ | ✅ | +| stt-tflite | ❌ | ✅ | +--------------------+---------------------+---------------------+ Finally, the pre-trained model files also include files ending in ``.scorer``. These are external scorers (language models) that are used at inference time in conjunction with an acoustic model (``.pbmm`` or ``.tflite`` file) to produce transcriptions. We also provide further documentation on :ref:`the decoding process ` and :ref:`how scorers are generated `. @@ -61,82 +61,82 @@ The release notes include detailed information on how the released models were t The process for training an acoustic model is described in :ref:`training-docs`. In particular, fine tuning a release model using your own data can be a good way to leverage relatively smaller amounts of data that would not be sufficient for training a new model from scratch. See the :ref:`fine tuning and transfer learning sections ` for more information. :ref:`Data augmentation ` can also be a good way to increase the value of smaller training sets. -Creating your own external scorer from text data is another way that you can adapt the model to your specific needs. The process and tools used to generate an external scorer package are described in :ref:`scorer-scripts` and an overview of how the external scorer is used by DeepSpeech to perform inference is available in :ref:`decoder-docs`. 
Generating a smaller scorer from a single purpose text dataset is a quick process and can bring significant accuracy improvements, specially for more constrained, limited vocabulary applications. +Creating your own external scorer from text data is another way that you can adapt the model to your specific needs. The process and tools used to generate an external scorer package are described in :ref:`scorer-scripts` and an overview of how the external scorer is used by 🐸STT to perform inference is available in :ref:`decoder-docs`. Generating a smaller scorer from a single purpose text dataset is a quick process and can bring significant accuracy improvements, specially for more constrained, limited vocabulary applications. Model compatibility ^^^^^^^^^^^^^^^^^^^ -DeepSpeech models are versioned to keep you from trying to use an incompatible graph with a newer client after a breaking change was made to the code. If you get an error saying your model file version is too old for the client, you should either upgrade to a newer model release, re-export your model from the checkpoint using a newer version of the code, or downgrade your client if you need to use the old model and can't re-export it. +🐸STT models are versioned to keep you from trying to use an incompatible graph with a newer client after a breaking change was made to the code. If you get an error saying your model file version is too old for the client, you should either upgrade to a newer model release, re-export your model from the checkpoint using a newer version of the code, or downgrade your client if you need to use the old model and can't re-export it. .. _py-usage: Using the Python package ^^^^^^^^^^^^^^^^^^^^^^^^ -Pre-built binaries which can be used for performing inference with a trained model can be installed with ``pip3``. 
You can then use the ``deepspeech`` binary to do speech-to-text on an audio file: +Pre-built binaries which can be used for performing inference with a trained model can be installed with ``pip3``. You can then use the ``stt`` binary to do speech-to-text on an audio file: For the Python bindings, it is highly recommended that you perform the installation within a Python 3.5 or later virtual environment. You can find more information about those in `this documentation `_. We will continue under the assumption that you already have your system properly setup to create new virtual environments. -Create a DeepSpeech virtual environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Create a Coqui STT virtual environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run deepspeech. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/deepspeech-venv``. You can create it using this command: +In creating a virtual environment you will create a directory containing a ``python3`` binary and everything needed to run 🐸STT. You can use whatever directory you want. For the purpose of the documentation, we will rely on ``$HOME/tmp/coqui-stt-venv``. You can create it using this command: .. code-block:: - $ virtualenv -p python3 $HOME/tmp/deepspeech-venv/ + $ virtualenv -p python3 $HOME/tmp/coqui-stt-venv/ Once this command completes successfully, the environment will be ready to be activated. Activating the environment ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each time you need to work with DeepSpeech, you have to *activate* this virtual environment. This is done with this simple command: +Each time you need to work with 🐸STT, you have to *activate* this virtual environment. This is done with this simple command: .. 
code-block:: - $ source $HOME/tmp/deepspeech-venv/bin/activate + $ source $HOME/tmp/coqui-stt-venv/bin/activate -Installing DeepSpeech Python bindings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Installing Coqui STT Python bindings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once your environment has been set-up and loaded, you can use ``pip3`` to manage packages locally. On a fresh setup of the ``virtualenv``\ , you will have to install the DeepSpeech wheel. You can check if ``deepspeech`` is already installed with ``pip3 list``. +Once your environment has been set-up and loaded, you can use ``pip3`` to manage packages locally. On a fresh setup of the ``virtualenv``\ , you will have to install the 🐸STT wheel. You can check if ``stt`` is already installed with ``pip3 list``. To perform the installation, just use ``pip3`` as such: .. code-block:: - $ pip3 install deepspeech + $ pip3 install stt -If ``deepspeech`` is already installed, you can update it as such: +If ``stt`` is already installed, you can update it as such: .. code-block:: - $ pip3 install --upgrade deepspeech + $ pip3 install --upgrade stt Alternatively, if you have a supported NVIDIA GPU on Linux, you can install the GPU specific package as follows: .. code-block:: - $ pip3 install deepspeech-gpu + $ pip3 install stt-gpu -See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. +See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. -You can update ``deepspeech-gpu`` as follows: +You can update ``stt-gpu`` as follows: .. code-block:: - $ pip3 install --upgrade deepspeech-gpu + $ pip3 install --upgrade stt-gpu -In both cases, ``pip3`` should take care of installing all the required dependencies. After installation has finished, you should be able to call ``deepspeech`` from the command-line. 
+In both cases, ``pip3`` should take care of installing all the required dependencies. After installation has finished, you should be able to call ``stt`` from the command-line. Note: the following command assumes you `downloaded the pre-trained model <#getting-the-pre-trained-model>`_. .. code-block:: bash - deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio my_audio_file.wav + stt --model coqui-stt-0.9.3-models.pbmm --scorer coqui-stt-0.9.3-models.scorer --audio my_audio_file.wav The ``--scorer`` argument is optional, and represents an external language model to be used when transcribing the audio. @@ -151,7 +151,9 @@ You can download the JS bindings using ``npm``\ : .. code-block:: bash - npm install deepspeech + npm install stt + +Special thanks to `Huan - Google Developers Experts in Machine Learning (ML GDE) `_ for providing the STT project name on npmjs.org Please note that as of now, we support: - Node.JS versions 4 to 13. @@ -163,9 +165,9 @@ Alternatively, if you're using Linux and have a supported NVIDIA GPU, you can in .. code-block:: bash - npm install deepspeech-gpu + npm install stt-gpu -See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. +See the `release notes `_ to find which GPUs are supported. Please ensure you have the required `CUDA dependency <#cuda-dependency>`_. See the :ref:`TypeScript client ` for an example of how to use the bindings programatically. @@ -174,7 +176,7 @@ See the :ref:`TypeScript client ` for an example of how to use t Using the command-line client ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To download the pre-built binaries for the ``deepspeech`` command-line (compiled C++) client, use ``util/taskcluster.py``\ : +To download the pre-built binaries for the ``stt`` command-line (compiled C++) client, use ``util/taskcluster.py``\ : .. 
code-block:: bash @@ -186,23 +188,23 @@ or if you're on macOS: python3 util/taskcluster.py --arch osx --target . -also, if you need some binaries different than current master, like ``v0.2.0-alpha.6``\ , you can use ``--branch``\ : +also, if you need some binaries different than current main branch, like ``v0.2.0-alpha.6``\ , you can use ``--branch``\ : .. code-block:: bash python3 util/taskcluster.py --branch "v0.2.0-alpha.6" --target "." -The script ``taskcluster.py`` will download ``native_client.tar.xz`` (which includes the ``deepspeech`` binary and associated libraries) and extract it into the current folder. Also, ``taskcluster.py`` will download binaries for Linux/x86_64 by default, but you can override that behavior with the ``--arch`` parameter. See the help info with ``python util/taskcluster.py -h`` for more details. Specific branches of DeepSpeech or TensorFlow can be specified as well. +The script ``taskcluster.py`` will download ``native_client.tar.xz`` (which includes the ``stt`` binary and associated libraries) and extract it into the current folder. Also, ``taskcluster.py`` will download binaries for Linux/x86_64 by default, but you can override that behavior with the ``--arch`` parameter. See the help info with ``python util/taskcluster.py -h`` for more details. Specific branches of 🐸STT or TensorFlow can be specified as well. -Alternatively you may manually download the ``native_client.tar.xz`` from the [releases](https://github.com/mozilla/DeepSpeech/releases). +Alternatively you may manually download the ``native_client.tar.xz`` from the `releases <https://github.com/coqui-ai/STT/releases>`_. Note: the following command assumes you `downloaded the pre-trained model <#getting-the-pre-trained-model>`_. .. 
code-block:: bash - ./deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio_input.wav + ./stt --model coqui-stt-0.9.3-models.pbmm --scorer coqui-stt-0.9.3-models.scorer --audio audio_input.wav -See the help output with ``./deepspeech -h`` for more details. +See the help output with ``./stt -h`` for more details. Installing bindings from source ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -212,28 +214,27 @@ If pre-built binaries aren't available for your system, you'll need to install t Dockerfile for building from source ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We provide ``Dockerfile.build`` to automatically build ``libdeepspeech.so``, the C++ native client, Python bindings, and KenLM. +We provide ``Dockerfile.build`` to automatically build ``libstt.so``, the C++ native client, Python bindings, and KenLM. You need to generate the Dockerfile from the template using: .. code-block:: bash make Dockerfile.build -If you want to specify a different DeepSpeech repository / branch, you can pass ``DEEPSPEECH_REPO`` or ``DEEPSPEECH_SHA`` parameters: +If you want to specify a different repository / branch, you can pass ``STT_REPO`` or ``STT_SHA`` parameters: .. code-block:: bash - make Dockerfile.build DEEPSPEECH_REPO=git://your/fork DEEPSPEECH_SHA=origin/your-branch + make Dockerfile.build STT_REPO=git://your/fork STT_SHA=origin/your-branch -Third party bindings -^^^^^^^^^^^^^^^^^^^^ +.. Third party bindings + ^^^^^^^^^^^^^^^^^^^^ -In addition to the bindings above, third party developers have started to provide bindings to other languages: + In addition to the bindings above, third party developers have started to provide bindings to other languages: - -* `Asticode `_ provides `Golang `_ bindings in its `go-astideepspeech `_ repo. -* `RustAudio `_ provide a `Rust `_ binding, the installation and use of which is described in their `deepspeech-rs `_ repo. 
-* `stes `_ provides preliminary `PKGBUILDs `_ to install the client and python bindings on `Arch Linux `_ in the `arch-deepspeech `_ repo. -* `gst-deepspeech `_ provides a `GStreamer `_ plugin which can be used from any language with GStreamer bindings. -* `thecodrr `_ provides `Vlang `_ bindings. The installation and use of which is described in their `vspeech `_ repo. -* `eagledot `_ provides `NIM-lang `_ bindings. The installation and use of which is described in their `nim-deepspeech `_ repo. + * `Asticode `_ provides `Golang `_ bindings in its `go-astideepspeech `_ repo. + * `RustAudio `_ provide a `Rust `_ binding, the installation and use of which is described in their `deepspeech-rs `_ repo. + * `stes `_ provides preliminary `PKGBUILDs `_ to install the client and python bindings on `Arch Linux `_ in the `arch-deepspeech `_ repo. + * `gst-deepspeech `_ provides a `GStreamer `_ plugin which can be used from any language with GStreamer bindings. + * `thecodrr `_ provides `Vlang `_ bindings. The installation and use of which is described in their `vspeech `_ repo. + * `eagledot `_ provides `NIM-lang `_ bindings. The installation and use of which is described in their `nim-deepspeech `_ repo. diff --git a/doc/conf.py b/doc/conf.py index 401ba08b..92b315e9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# DeepSpeech documentation build configuration file, created by +# Coqui STT documentation build configuration file, created by # sphinx-quickstart on Thu Feb 2 21:20:39 2017. 
# # This file is execfile()d with the current directory set to its @@ -24,7 +24,7 @@ import sys sys.path.insert(0, os.path.abspath('../')) -autodoc_mock_imports = ['deepspeech'] +autodoc_mock_imports = ['stt'] # This is in fact only relevant on ReadTheDocs, but we want to run the same way # on our CI as in RTD to avoid regressions on RTD that we would not catch on @@ -45,9 +45,9 @@ import semver # -- Project information ----------------------------------------------------- -project = u'DeepSpeech' -copyright = '2019-2020 Mozilla Corporation, 2020 DeepSpeech authors' -author = 'DeepSpeech authors' +project = u'Coqui STT' +copyright = '2019-2020 Mozilla Corporation, 2020 DeepSpeech authors, 2021 Coqui GmbH' +author = 'Coqui GmbH' with open('../VERSION', 'r') as ver: v = ver.read().strip() @@ -81,9 +81,9 @@ extensions = [ breathe_projects = { - "deepspeech-c": "xml-c/", - "deepspeech-java": "xml-java/", - "deepspeech-dotnet": "xml-dotnet/", + "stt-c": "xml-c/", + "stt-java": "xml-java/", + "stt-dotnet": "xml-dotnet/", } js_source_path = "../native_client/javascript/index.ts" @@ -99,7 +99,7 @@ templates_path = ['.templates'] # source_suffix = ['.rst', '.md'] source_suffix = '.rst' -# The master toctree document. +# The main toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation @@ -147,7 +147,7 @@ html_static_path = ['.static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'DeepSpeechdoc' +htmlhelp_basename = 'STTdoc' # -- Options for LaTeX output --------------------------------------------- @@ -174,8 +174,8 @@ latex_elements = { # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ - (master_doc, 'DeepSpeech.tex', u'DeepSpeech Documentation', - u'DeepSpeech authors', 'manual'), + (master_doc, 'STT.tex', u'Coqui STT Documentation', + u'Coqui GmbH', 'manual'), ] @@ -184,7 +184,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'deepspeech', u'DeepSpeech Documentation', + (master_doc, 'stt', u'Coqui STT Documentation', [author], 1) ] @@ -195,8 +195,8 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'DeepSpeech', u'DeepSpeech Documentation', - author, 'DeepSpeech', 'One line description of project.', + (master_doc, 'STT', u'Coqui STT Documentation', + author, 'STT', 'One line description of project.', 'Miscellaneous'), ] @@ -206,5 +206,5 @@ texinfo_documents = [ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'https://docs.python.org/': None} -extlinks = {'github': ('https://github.com/mozilla/DeepSpeech/blob/v{}/%s'.format(release), +extlinks = {'github': ('https://github.com/coqui-ai/STT/blob/v{}/%s'.format(release), '%s')} diff --git a/doc/doxygen-c.conf b/doc/doxygen-c.conf index f36f57b2..ec2ac239 100644 --- a/doc/doxygen-c.conf +++ b/doc/doxygen-c.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/deepspeech.h +INPUT = native_client/coqui-stt.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/doxygen-dotnet.conf b/doc/doxygen-dotnet.conf index 74c2c5bb..04047fc0 100644 --- a/doc/doxygen-dotnet.conf +++ b/doc/doxygen-dotnet.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. 
See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/dotnet/DeepSpeechClient/ native_client/dotnet/DeepSpeechClient/Interfaces/ native_client/dotnet/DeepSpeechClient/Enums/ native_client/dotnet/DeepSpeechClient/Models/ +INPUT = native_client/dotnet/STTClient/ native_client/dotnet/STTClient/Interfaces/ native_client/dotnet/STTClient/Enums/ native_client/dotnet/STTClient/Models/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/doxygen-java.conf b/doc/doxygen-java.conf index 9516d6ec..c6474d7a 100644 --- a/doc/doxygen-java.conf +++ b/doc/doxygen-java.conf @@ -790,7 +790,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = native_client/java/libdeepspeech/src/main/java/org/deepspeech/libdeepspeech/ native_client/java/libdeepspeech/src/main/java/org/deepspeech/libdeepspeech_doc/ +INPUT = native_client/java/libstt/src/main/java/ai/coqui/libstt/ native_client/java/libstt/src/main/java/ai/coqui/libstt_doc/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/index.rst b/doc/index.rst index 33285c67..edb048f8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,54 +1,54 @@ -.. DeepSpeech documentation master file, created by +.. Coqui STT documentation main file, created by sphinx-quickstart on Thu Feb 2 21:20:39 2017. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to DeepSpeech's documentation! 
-====================================== +Coqui STT +========= -DeepSpeech is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. Project DeepSpeech uses Google's `TensorFlow `_ to make the implementation easier. +Coqui STT (🐸STT) is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on `Baidu's Deep Speech research paper `_. 🐸STT uses Google's `TensorFlow `_ to make the implementation easier. -To install and use DeepSpeech all you have to do is: +To install and use 🐸STT all you have to do is: .. code-block:: bash # Create and activate a virtualenv - virtualenv -p python3 $HOME/tmp/deepspeech-venv/ - source $HOME/tmp/deepspeech-venv/bin/activate + virtualenv -p python3 $HOME/tmp/stt/ + source $HOME/tmp/stt/bin/activate - # Install DeepSpeech - pip3 install deepspeech + # Install 🐸STT + pip3 install stt # Download pre-trained English model files - curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm - curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer + curl -LO https://github.com/coqui-ai/STT/releases/download/v0.9.3/coqui-stt-0.9.3-models.pbmm + curl -LO https://github.com/coqui-ai/STT/releases/download/v0.9.3/coqui-stt-0.9.3-models.scorer # Download example audio files - curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/audio-0.9.3.tar.gz + curl -LO https://github.com/coqui-ai/STT/releases/download/v0.9.3/audio-0.9.3.tar.gz tar xvf audio-0.9.3.tar.gz # Transcribe an audio file - deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/2830-3980-0043.wav + stt --model coqui-stt-0.9.3-models.pbmm --scorer coqui-stt-0.9.3-models.scorer --audio audio/2830-3980-0043.wav -A pre-trained English model is available for use and can be downloaded following the instructions in 
:ref:`the usage docs `. For the latest release, including pre-trained models and checkpoints, `see the GitHub releases page `_. +A pre-trained English model is available for use and can be downloaded following the instructions in :ref:`the usage docs `. For the latest release, including pre-trained models and checkpoints, `see the GitHub releases page `_. -Quicker inference can be performed using a supported NVIDIA GPU on Linux. See the `release notes `_ to find which GPUs are supported. To run ``deepspeech`` on a GPU, install the GPU specific package: +Quicker inference can be performed using a supported NVIDIA GPU on Linux. See the `release notes `_ to find which GPUs are supported. To run ``stt`` on a GPU, install the GPU specific package: .. code-block:: bash # Create and activate a virtualenv - virtualenv -p python3 $HOME/tmp/deepspeech-gpu-venv/ - source $HOME/tmp/deepspeech-gpu-venv/bin/activate + virtualenv -p python3 $HOME/tmp/coqui-stt-gpu-venv/ + source $HOME/tmp/coqui-stt-gpu-venv/bin/activate - # Install DeepSpeech CUDA enabled package - pip3 install deepspeech-gpu + # Install 🐸STT CUDA enabled package + pip3 install stt-gpu # Transcribe an audio file. - deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/2830-3980-0043.wav + stt --model coqui-stt-0.9.3-models.pbmm --scorer coqui-stt-0.9.3-models.scorer --audio audio/2830-3980-0043.wav Please ensure you have the required :ref:`CUDA dependencies `. -See the output of ``deepspeech -h`` for more information on the use of ``deepspeech``. (If you experience problems running ``deepspeech``, please check :ref:`required runtime dependencies `). +See the output of ``stt -h`` for more information on the use of ``stt``. (If you experience problems running ``stt``, please check :ref:`required runtime dependencies `). .. 
toctree:: :maxdepth: 2 @@ -78,7 +78,7 @@ See the output of ``deepspeech -h`` for more information on the use of ``deepspe :maxdepth: 2 :caption: Architecture and training - DeepSpeech + Architecture Geometry diff --git a/doc/make.bat b/doc/make.bat index cfcbc831..277fcf31 100644 --- a/doc/make.bat +++ b/doc/make.bat @@ -9,7 +9,7 @@ if "%SPHINXBUILD%" == "" ( ) set SOURCEDIR=. set BUILDDIR=.build -set SPHINXPROJ=DeepSpeech +set SPHINXPROJ="Coqui STT" if "%1" == "" goto help diff --git a/ds_lib.supp b/ds_lib.supp index d7748e34..9fae3aef 100644 --- a/ds_lib.supp +++ b/ds_lib.supp @@ -1,10 +1,10 @@ { - deepspeech_tflite_error_reporter + stt_tflite_error_reporter Memcheck:Leak match-leak-kinds: reachable fun:_Znwm fun:_ZN6tflite20DefaultErrorReporterEv fun:_ZN16TFLiteModelState4initEPKc - fun:DS_CreateModel + fun:STT_CreateModel fun:main } diff --git a/ds_openfst.supp b/ds_openfst.supp index 378659db..8cb96016 100644 --- a/ds_openfst.supp +++ b/ds_openfst.supp @@ -815,7 +815,7 @@ fun:_ZN6Scorer9load_trieERSt14basic_ifstreamIcSt11char_traitsIcEERKNSt7__cxx1112basic_stringIcS2_SaIcEEE fun:_ZN6Scorer7load_lmERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE fun:_ZN6Scorer4initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERK8Alphabet - fun:DS_EnableExternalScorer + fun:STT_EnableExternalScorer fun:main } { @@ -831,7 +831,7 @@ fun:_ZN6Scorer9load_trieERSt14basic_ifstreamIcSt11char_traitsIcEERKNSt7__cxx1112basic_stringIcS2_SaIcEEE fun:_ZN6Scorer7load_lmERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE fun:_ZN6Scorer4initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERK8Alphabet - fun:DS_EnableExternalScorer + fun:STT_EnableExternalScorer fun:main } { diff --git a/evaluate.py b/evaluate.py index dc502542..eca856b2 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,7 +4,7 @@ from __future__ import absolute_import, division, print_function if __name__ == '__main__': try: - from deepspeech_training import evaluate as ds_evaluate + from 
coqui_stt_training import evaluate as ds_evaluate except ImportError: print('Training package is not installed. See training documentation.') raise diff --git a/evaluate_tflite.py b/evaluate_tflite.py index 0d462615..d8cff40f 100644 --- a/evaluate_tflite.py +++ b/evaluate_tflite.py @@ -10,23 +10,23 @@ import csv import os import sys -from deepspeech import Model -from deepspeech_training.util.evaluate_tools import calculate_and_print_report -from deepspeech_training.util.flags import create_flags +from stt import Model +from coqui_stt_training.util.evaluate_tools import calculate_and_print_report +from coqui_stt_training.util.flags import create_flags from functools import partial from multiprocessing import JoinableQueue, Process, cpu_count, Manager from six.moves import zip, range r''' This module should be self-contained: - - build libdeepspeech.so with TFLite: - - bazel build [...] --define=runtime=tflite [...] //native_client:libdeepspeech.so + - build libstt.so with TFLite: + - bazel build [...] --define=runtime=tflite [...] //native_client:libstt.so - make -C native_client/python/ TFDIR=... bindings - setup a virtualenv - - pip install native_client/python/dist/deepspeech*.whl + - pip install native_client/python/dist/*.whl - pip install -r requirements_eval_tflite.txt -Then run with a TF Lite model, a scorer and a CSV test file +Then run with a TFLite model, a scorer and a CSV test file ''' def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask): diff --git a/examples/README.rst b/examples/README.rst index f5ebb1bd..2d71bc17 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -1,6 +1,6 @@ Examples ======== -DeepSpeech examples were moved to a separate repository. +🐸STT examples were moved to a separate repository. 
-New location: https://github.com/mozilla/DeepSpeech-examples +New location: https://github.com/coqui-ai/STT-examples diff --git a/images/coqui-STT-logo-green.png b/images/coqui-STT-logo-green.png new file mode 100644 index 00000000..2afec422 Binary files /dev/null and b/images/coqui-STT-logo-green.png differ diff --git a/lm_optimizer.py b/lm_optimizer.py index 25d8a05e..74a02dc7 100644 --- a/lm_optimizer.py +++ b/lm_optimizer.py @@ -7,12 +7,12 @@ import optuna import sys import tensorflow.compat.v1 as tfv1 -from deepspeech_training.evaluate import evaluate -from deepspeech_training.train import create_model -from deepspeech_training.util.config import Config, initialize_globals -from deepspeech_training.util.flags import create_flags, FLAGS -from deepspeech_training.util.logging import log_error -from deepspeech_training.util.evaluate_tools import wer_cer_batch +from coqui_stt_training.evaluate import evaluate +from coqui_stt_training.train import create_model +from coqui_stt_training.util.config import Config, initialize_globals +from coqui_stt_training.util.flags import create_flags, FLAGS +from coqui_stt_training.util.logging import log_error +from coqui_stt_training.util.evaluate_tools import wer_cer_batch from ds_ctcdecoder import Scorer diff --git a/native_client/Android.mk b/native_client/Android.mk index d21551fd..49bf8f93 100644 --- a/native_client/Android.mk +++ b/native_client/Android.mk @@ -1,14 +1,14 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) -LOCAL_MODULE := deepspeech-prebuilt -LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so +LOCAL_MODULE := stt-prebuilt +LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libstt.so include $(PREBUILT_SHARED_LIBRARY) include $(CLEAR_VARS) LOCAL_CPP_EXTENSION := .cc .cxx .cpp -LOCAL_MODULE := deepspeech +LOCAL_MODULE := stt LOCAL_SRC_FILES := client.cc -LOCAL_SHARED_LIBRARIES := deepspeech-prebuilt +LOCAL_SHARED_LIBRARIES := stt-prebuilt LOCAL_LDFLAGS := -Wl,--no-as-needed include 
$(BUILD_EXECUTABLE) diff --git a/native_client/BUILD b/native_client/BUILD index d25454a1..e905fb94 100644 --- a/native_client/BUILD +++ b/native_client/BUILD @@ -1,4 +1,4 @@ -# Description: Deepspeech native client library. +# Description: Coqui STT native client library. load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cc_shared_object", "tf_copts", "lrt_if_needed") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") @@ -112,11 +112,11 @@ cc_library( ) cc_library( - name = "deepspeech_bundle", + name = "coqui_stt_bundle", srcs = [ - "deepspeech.cc", - "deepspeech.h", - "deepspeech_errors.cc", + "stt.cc", + "coqui-stt.h", + "stt_errors.cc", "modelstate.cc", "modelstate.h", "workspace_status.cc", @@ -165,7 +165,7 @@ cc_library( #"//tensorflow/core:all_kernels", ### => Trying to be more fine-grained ### Use bin/ops_in_graph.py to list all the ops used by a frozen graph. - ### CPU only build, libdeepspeech.so file size reduced by ~50% + ### CPU only build, libstt.so file size reduced by ~50% "//tensorflow/core/kernels:spectrogram_op", # AudioSpectrogram "//tensorflow/core/kernels:bias_op", # BiasAdd "//tensorflow/core/kernels:cast_op", # Cast @@ -205,31 +205,31 @@ cc_library( ) tf_cc_shared_object( - name = "libdeepspeech.so", - deps = [":deepspeech_bundle"], + name = "libstt.so", + deps = [":coqui_stt_bundle"], ) ios_static_framework( - name = "deepspeech_ios", - deps = [":deepspeech_bundle"], + name = "coqui_stt_ios", + deps = [":coqui_stt_bundle"], families = ["iphone", "ipad"], minimum_os_version = "9.0", linkopts = ["-lstdc++"], ) genrule( - name = "libdeepspeech_so_dsym", - srcs = [":libdeepspeech.so"], - outs = ["libdeepspeech.so.dSYM"], + name = "libstt_so_dsym", + srcs = [":libstt.so"], + outs = ["libstt.so.dSYM"], output_to_bindir = True, - cmd = "dsymutil $(location :libdeepspeech.so) -o $@" + cmd = "dsymutil $(location :libstt.so) -o $@" ) cc_binary( name = "generate_scorer_package", srcs = [ "generate_scorer_package.cpp", - 
"deepspeech_errors.cc", + "stt_errors.cc", ], copts = ["-std=c++11"], deps = [ diff --git a/native_client/CODINGSTYLE.md b/native_client/CODINGSTYLE.md index ddb8fc82..f0e4ec48 100644 --- a/native_client/CODINGSTYLE.md +++ b/native_client/CODINGSTYLE.md @@ -1,5 +1,5 @@ This file contains some notes on coding style within the C++ portion of the -DeepSpeech project. It is very much a work in progress and incomplete. +🐸STT project. It is very much a work in progress and incomplete. General ======= @@ -25,4 +25,4 @@ File naming Doubts ====== -If in doubt, please ask on our Matrix chat channel: https://chat.mozilla.org/#/room/#machinelearning:mozilla.org +If in doubt, please ask on our Matrix chat channel: https://matrix.to/#/#stt:matrix.org?via=matrix.org diff --git a/native_client/Makefile b/native_client/Makefile index b645499c..540eb49d 100644 --- a/native_client/Makefile +++ b/native_client/Makefile @@ -13,35 +13,35 @@ include definitions.mk -default: $(DEEPSPEECH_BIN) +default: $(STT_BIN) clean: - rm -f deepspeech + rm -f stt -$(DEEPSPEECH_BIN): client.cc Makefile - $(CXX) $(CFLAGS) $(CFLAGS_DEEPSPEECH) $(SOX_CFLAGS) client.cc $(LDFLAGS) $(SOX_LDFLAGS) +$(STT_BIN): client.cc Makefile + $(CXX) $(CFLAGS) $(CFLAGS_STT) $(SOX_CFLAGS) client.cc $(LDFLAGS) $(SOX_LDFLAGS) ifeq ($(OS),Darwin) - install_name_tool -change bazel-out/local-opt/bin/native_client/libdeepspeech.so @rpath/libdeepspeech.so deepspeech + install_name_tool -change bazel-out/local-opt/bin/native_client/libstt.so @rpath/libstt.so stt endif -run: $(DEEPSPEECH_BIN) - ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./deepspeech ${ARGS} +run: $(STT_BIN) + ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} ./stt ${ARGS} -debug: $(DEEPSPEECH_BIN) - ${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} gdb --args ./deepspeech ${ARGS} +debug: $(STT_BIN) + 
${META_LD_LIBRARY_PATH}=${TFDIR}/bazel-bin/native_client:${${META_LD_LIBRARY_PATH}} gdb --args ./stt ${ARGS} -install: $(DEEPSPEECH_BIN) +install: $(STT_BIN) install -d ${PREFIX}/lib - install -m 0644 ${TFDIR}/bazel-bin/native_client/libdeepspeech.so ${PREFIX}/lib/ + install -m 0644 ${TFDIR}/bazel-bin/native_client/libstt.so ${PREFIX}/lib/ install -d ${PREFIX}/include - install -m 0644 deepspeech.h ${PREFIX}/include + install -m 0644 coqui-stt.h ${PREFIX}/include install -d ${PREFIX}/bin - install -m 0755 deepspeech ${PREFIX}/bin/ + install -m 0755 stt ${PREFIX}/bin/ uninstall: - rm -f ${PREFIX}/bin/deepspeech + rm -f ${PREFIX}/bin/stt rmdir --ignore-fail-on-non-empty ${PREFIX}/bin - rm -f ${PREFIX}/lib/libdeepspeech.so + rm -f ${PREFIX}/lib/libstt.so rmdir --ignore-fail-on-non-empty ${PREFIX}/lib print-toolchain: diff --git a/native_client/args.h b/native_client/args.h index 069347e0..30ed3181 100644 --- a/native_client/args.h +++ b/native_client/args.h @@ -8,7 +8,7 @@ #endif #include -#include "deepspeech.h" +#include "coqui-stt.h" char* model = NULL; @@ -47,7 +47,7 @@ void PrintHelp(const char* bin) std::cout << "Usage: " << bin << " --model MODEL [--scorer SCORER] --audio AUDIO [-t] [-e]\n" "\n" - "Running DeepSpeech inference.\n" + "Running Coqui STT inference.\n" "\n" "\t--model MODEL\t\t\tPath to the model (protocol buffer binary file)\n" "\t--scorer SCORER\t\t\tPath to the external scorer file\n" @@ -64,9 +64,9 @@ void PrintHelp(const char* bin) "\t--hot_words\t\t\tHot-words and their boosts. 
Word:Boost pairs are comma-separated\n" "\t--help\t\t\t\tShow help\n" "\t--version\t\t\tPrint version and exits\n"; - char* version = DS_Version(); - std::cerr << "DeepSpeech " << version << "\n"; - DS_FreeString(version); + char* version = STT_Version(); + std::cerr << "Coqui STT " << version << "\n"; + STT_FreeString(version); exit(1); } @@ -169,9 +169,9 @@ bool ProcessArgs(int argc, char** argv) } if (has_versions) { - char* version = DS_Version(); - std::cout << "DeepSpeech " << version << "\n"; - DS_FreeString(version); + char* version = STT_Version(); + std::cout << "Coqui " << version << "\n"; + STT_FreeString(version); return false; } diff --git a/native_client/bazel_workspace_status_cmd.sh b/native_client/bazel_workspace_status_cmd.sh index a1a5a2a0..1af17e26 100755 --- a/native_client/bazel_workspace_status_cmd.sh +++ b/native_client/bazel_workspace_status_cmd.sh @@ -22,8 +22,8 @@ echo "STABLE_TF_GIT_VERSION ${tf_git_rev}" pushd $(dirname "$0") ds_git_rev=$(git describe --long --tags) echo "STABLE_DS_GIT_VERSION ${ds_git_rev}" -ds_version=$(cat ../training/deepspeech_training/VERSION) +ds_version=$(cat ../training/coqui_stt_training/VERSION) echo "STABLE_DS_VERSION ${ds_version}" -ds_graph_version=$(cat ../training/deepspeech_training/GRAPH_VERSION) +ds_graph_version=$(cat ../training/coqui_stt_training/GRAPH_VERSION) echo "STABLE_DS_GRAPH_VERSION ${ds_graph_version}" popd diff --git a/native_client/client.cc b/native_client/client.cc index 7d88b4d6..93afa555 100644 --- a/native_client/client.cc +++ b/native_client/client.cc @@ -34,7 +34,7 @@ #endif // NO_DIR #include -#include "deepspeech.h" +#include "coqui-stt.h" #include "args.h" typedef struct { @@ -168,17 +168,17 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, // sphinx-doc: c_ref_inference_start if (extended_output) { - Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, 1); + Metadata *result = STT_SpeechToTextWithMetadata(aCtx, aBuffer, 
aBufferSize, 1); res.string = CandidateTranscriptToString(&result->transcripts[0]); - DS_FreeMetadata(result); + STT_FreeMetadata(result); } else if (json_output) { - Metadata *result = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); + Metadata *result = STT_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, json_candidate_transcripts); res.string = MetadataToJSON(result); - DS_FreeMetadata(result); + STT_FreeMetadata(result); } else if (stream_size > 0) { StreamingState* ctx; - int status = DS_CreateStream(aCtx, &ctx); - if (status != DS_ERR_OK) { + int status = STT_CreateStream(aCtx, &ctx); + if (status != STT_ERR_OK) { res.string = strdup(""); return res; } @@ -187,28 +187,28 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, const char *prev = nullptr; while (off < aBufferSize) { size_t cur = aBufferSize - off > stream_size ? stream_size : aBufferSize - off; - DS_FeedAudioContent(ctx, aBuffer + off, cur); + STT_FeedAudioContent(ctx, aBuffer + off, cur); off += cur; prev = last; - const char* partial = DS_IntermediateDecode(ctx); + const char* partial = STT_IntermediateDecode(ctx); if (last == nullptr || strcmp(last, partial)) { printf("%s\n", partial); last = partial; } else { - DS_FreeString((char *) partial); + STT_FreeString((char *) partial); } if (prev != nullptr && prev != last) { - DS_FreeString((char *) prev); + STT_FreeString((char *) prev); } } if (last != nullptr) { - DS_FreeString((char *) last); + STT_FreeString((char *) last); } - res.string = DS_FinishStream(ctx); + res.string = STT_FinishStream(ctx); } else if (extended_stream_size > 0) { StreamingState* ctx; - int status = DS_CreateStream(aCtx, &ctx); - if (status != DS_ERR_OK) { + int status = STT_CreateStream(aCtx, &ctx); + if (status != STT_ERR_OK) { res.string = strdup(""); return res; } @@ -217,10 +217,10 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, const char *prev = nullptr; while (off < 
aBufferSize) { size_t cur = aBufferSize - off > extended_stream_size ? extended_stream_size : aBufferSize - off; - DS_FeedAudioContent(ctx, aBuffer + off, cur); + STT_FeedAudioContent(ctx, aBuffer + off, cur); off += cur; prev = last; - const Metadata* result = DS_IntermediateDecodeWithMetadata(ctx, 1); + const Metadata* result = STT_IntermediateDecodeWithMetadata(ctx, 1); const char* partial = CandidateTranscriptToString(&result->transcripts[0]); if (last == nullptr || strcmp(last, partial)) { printf("%s\n", partial); @@ -231,14 +231,14 @@ LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize, if (prev != nullptr && prev != last) { free((char *) prev); } - DS_FreeMetadata((Metadata *)result); + STT_FreeMetadata((Metadata *)result); } - const Metadata* result = DS_FinishStreamWithMetadata(ctx, 1); + const Metadata* result = STT_FinishStreamWithMetadata(ctx, 1); res.string = CandidateTranscriptToString(&result->transcripts[0]); - DS_FreeMetadata((Metadata *)result); + STT_FreeMetadata((Metadata *)result); free((char *) last); } else { - res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize); + res.string = STT_SpeechToText(aCtx, aBuffer, aBufferSize); } // sphinx-doc: c_ref_inference_stop @@ -404,9 +404,9 @@ GetAudioBuffer(const char* path, int desired_sample_rate) void ProcessFile(ModelState* context, const char* path, bool show_times) { - ds_audio_buffer audio = GetAudioBuffer(path, DS_GetModelSampleRate(context)); + ds_audio_buffer audio = GetAudioBuffer(path, STT_GetModelSampleRate(context)); - // Pass audio to DeepSpeech + // Pass audio to STT // We take half of buffer_size because buffer is a char* while // LocalDsSTT() expected a short* ds_result result = LocalDsSTT(context, @@ -418,7 +418,7 @@ ProcessFile(ModelState* context, const char* path, bool show_times) if (result.string) { printf("%s\n", result.string); - DS_FreeString((char*)result.string); + STT_FreeString((char*)result.string); } if (show_times) { @@ -450,19 +450,19 @@ main(int 
argc, char **argv) return 1; } - // Initialise DeepSpeech + // Initialise STT ModelState* ctx; // sphinx-doc: c_ref_model_start - int status = DS_CreateModel(model, &ctx); + int status = STT_CreateModel(model, &ctx); if (status != 0) { - char* error = DS_ErrorCodeToErrorMessage(status); + char* error = STT_ErrorCodeToErrorMessage(status); fprintf(stderr, "Could not create model: %s\n", error); free(error); return 1; } if (set_beamwidth) { - status = DS_SetModelBeamWidth(ctx, beam_width); + status = STT_SetModelBeamWidth(ctx, beam_width); if (status != 0) { fprintf(stderr, "Could not set model beam width.\n"); return 1; @@ -470,13 +470,13 @@ main(int argc, char **argv) } if (scorer) { - status = DS_EnableExternalScorer(ctx, scorer); + status = STT_EnableExternalScorer(ctx, scorer); if (status != 0) { fprintf(stderr, "Could not enable external scorer.\n"); return 1; } if (set_alphabeta) { - status = DS_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta); + status = STT_SetScorerAlphaBeta(ctx, lm_alpha, lm_beta); if (status != 0) { fprintf(stderr, "Error setting scorer alpha and beta.\n"); return 1; @@ -494,7 +494,7 @@ main(int argc, char **argv) // so, check the boost string before we turn it into a float bool boost_is_valid = (pair_[1].find_first_not_of("-.0123456789") == std::string::npos); float boost = strtof((pair_[1]).c_str(),0); - status = DS_AddHotWord(ctx, word, boost); + status = STT_AddHotWord(ctx, word, boost); if (status != 0 || !boost_is_valid) { fprintf(stderr, "Could not enable hot-word.\n"); return 1; @@ -555,7 +555,7 @@ main(int argc, char **argv) sox_quit(); #endif // NO_SOX - DS_FreeModel(ctx); + STT_FreeModel(ctx); return 0; } diff --git a/native_client/deepspeech.h b/native_client/coqui-stt.h similarity index 62% rename from native_client/deepspeech.h rename to native_client/coqui-stt.h index fbec4721..7794bc79 100644 --- a/native_client/deepspeech.h +++ b/native_client/coqui-stt.h @@ -1,5 +1,5 @@ -#ifndef DEEPSPEECH_H -#define DEEPSPEECH_H +#ifndef 
COQUI_STT_H +#define COQUI_STT_H #ifdef __cplusplus extern "C" { @@ -7,12 +7,12 @@ extern "C" { #ifndef SWIG #if defined _MSC_VER - #define DEEPSPEECH_EXPORT __declspec(dllexport) + #define STT_EXPORT __declspec(dllexport) #else - #define DEEPSPEECH_EXPORT __attribute__ ((visibility("default"))) + #define STT_EXPORT __attribute__ ((visibility("default"))) #endif /*End of _MSC_VER*/ #else - #define DEEPSPEECH_EXPORT + #define STT_EXPORT #endif typedef struct ModelState ModelState; @@ -61,92 +61,92 @@ typedef struct Metadata { // sphinx-doc: error_code_listing_start -#define DS_FOR_EACH_ERROR(APPLY) \ - APPLY(DS_ERR_OK, 0x0000, "No error.") \ - APPLY(DS_ERR_NO_MODEL, 0x1000, "Missing model information.") \ - APPLY(DS_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. (Data corruption?)") \ - APPLY(DS_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \ - APPLY(DS_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \ - APPLY(DS_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \ - APPLY(DS_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \ - APPLY(DS_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \ - APPLY(DS_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \ - APPLY(DS_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \ - APPLY(DS_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \ - APPLY(DS_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \ - APPLY(DS_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \ - APPLY(DS_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \ - APPLY(DS_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \ - APPLY(DS_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \ - APPLY(DS_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \ - APPLY(DS_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto 
buffer model file.") \ - APPLY(DS_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \ - APPLY(DS_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not allocate model state.") \ - APPLY(DS_ERR_FAIL_INSERT_HOTWORD, 0x3008, "Could not insert hot-word.") \ - APPLY(DS_ERR_FAIL_CLEAR_HOTWORD, 0x3009, "Could not clear hot-words.") \ - APPLY(DS_ERR_FAIL_ERASE_HOTWORD, 0x3010, "Could not erase hot-word.") +#define STT_FOR_EACH_ERROR(APPLY) \ + APPLY(STT_ERR_OK, 0x0000, "No error.") \ + APPLY(STT_ERR_NO_MODEL, 0x1000, "Missing model information.") \ + APPLY(STT_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. (Data corruption?)") \ + APPLY(STT_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \ + APPLY(STT_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \ + APPLY(STT_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \ + APPLY(STT_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \ + APPLY(STT_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \ + APPLY(STT_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \ + APPLY(STT_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \ + APPLY(STT_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \ + APPLY(STT_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \ + APPLY(STT_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \ + APPLY(STT_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \ + APPLY(STT_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \ + APPLY(STT_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \ + APPLY(STT_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \ + APPLY(STT_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto buffer model file.") \ + APPLY(STT_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \ + APPLY(STT_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not 
allocate model state.") \ + APPLY(STT_ERR_FAIL_INSERT_HOTWORD, 0x3008, "Could not insert hot-word.") \ + APPLY(STT_ERR_FAIL_CLEAR_HOTWORD, 0x3009, "Could not clear hot-words.") \ + APPLY(STT_ERR_FAIL_ERASE_HOTWORD, 0x3010, "Could not erase hot-word.") // sphinx-doc: error_code_listing_end -enum DeepSpeech_Error_Codes +enum STT_Error_Codes { #define DEFINE(NAME, VALUE, DESC) NAME = VALUE, -DS_FOR_EACH_ERROR(DEFINE) +STT_FOR_EACH_ERROR(DEFINE) #undef DEFINE }; /** - * @brief An object providing an interface to a trained DeepSpeech model. + * @brief An object providing an interface to a trained Coqui STT model. * * @param aModelPath The path to the frozen model graph. * @param[out] retval a ModelState pointer * * @return Zero on success, non-zero on failure. */ -DEEPSPEECH_EXPORT -int DS_CreateModel(const char* aModelPath, +STT_EXPORT +int STT_CreateModel(const char* aModelPath, ModelState** retval); /** - * @brief Get beam width value used by the model. If {@link DS_SetModelBeamWidth} + * @brief Get beam width value used by the model. If {@link STT_SetModelBeamWidth} * was not called before, will return the default value loaded from the * model file. * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. + * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. * * @return Beam width value used by the model. */ -DEEPSPEECH_EXPORT -unsigned int DS_GetModelBeamWidth(const ModelState* aCtx); +STT_EXPORT +unsigned int STT_GetModelBeamWidth(const ModelState* aCtx); /** * @brief Set beam width value used by the model. * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. + * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. * @param aBeamWidth The beam width used by the model. A larger beam width value * generates better results at the cost of decoding time. * * @return Zero on success, non-zero on failure. 
*/ -DEEPSPEECH_EXPORT -int DS_SetModelBeamWidth(ModelState* aCtx, +STT_EXPORT +int STT_SetModelBeamWidth(ModelState* aCtx, unsigned int aBeamWidth); /** * @brief Return the sample rate expected by a model. * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. + * @param aCtx A ModelState pointer created with {@link STT_CreateModel}. * * @return Sample rate expected by the model for its input. */ -DEEPSPEECH_EXPORT -int DS_GetModelSampleRate(const ModelState* aCtx); +STT_EXPORT +int STT_GetModelSampleRate(const ModelState* aCtx); /** * @brief Frees associated resources and destroys model object. */ -DEEPSPEECH_EXPORT -void DS_FreeModel(ModelState* ctx); +STT_EXPORT +void STT_FreeModel(ModelState* ctx); /** * @brief Enable decoding using an external scorer. @@ -156,8 +156,8 @@ void DS_FreeModel(ModelState* ctx); * * @return Zero on success, non-zero on failure (invalid arguments). */ -DEEPSPEECH_EXPORT -int DS_EnableExternalScorer(ModelState* aCtx, +STT_EXPORT +int STT_EnableExternalScorer(ModelState* aCtx, const char* aScorerPath); /** @@ -171,8 +171,8 @@ int DS_EnableExternalScorer(ModelState* aCtx, * * @return Zero on success, non-zero on failure (invalid arguments). */ -DEEPSPEECH_EXPORT -int DS_AddHotWord(ModelState* aCtx, +STT_EXPORT +int STT_AddHotWord(ModelState* aCtx, const char* word, float boost); @@ -184,8 +184,8 @@ int DS_AddHotWord(ModelState* aCtx, * * @return Zero on success, non-zero on failure (invalid arguments). */ -DEEPSPEECH_EXPORT -int DS_EraseHotWord(ModelState* aCtx, +STT_EXPORT +int STT_EraseHotWord(ModelState* aCtx, const char* word); /** @@ -195,8 +195,8 @@ int DS_EraseHotWord(ModelState* aCtx, * * @return Zero on success, non-zero on failure (invalid arguments). */ -DEEPSPEECH_EXPORT -int DS_ClearHotWords(ModelState* aCtx); +STT_EXPORT +int STT_ClearHotWords(ModelState* aCtx); /** * @brief Disable decoding using an external scorer. 
@@ -205,8 +205,8 @@ int DS_ClearHotWords(ModelState* aCtx); * * @return Zero on success, non-zero on failure. */ -DEEPSPEECH_EXPORT -int DS_DisableExternalScorer(ModelState* aCtx); +STT_EXPORT +int STT_DisableExternalScorer(ModelState* aCtx); /** * @brief Set hyperparameters alpha and beta of the external scorer. @@ -217,13 +217,13 @@ int DS_DisableExternalScorer(ModelState* aCtx); * * @return Zero on success, non-zero on failure. */ -DEEPSPEECH_EXPORT -int DS_SetScorerAlphaBeta(ModelState* aCtx, +STT_EXPORT +int STT_SetScorerAlphaBeta(ModelState* aCtx, float aAlpha, float aBeta); /** - * @brief Use the DeepSpeech model to convert speech to text. + * @brief Use the Coqui STT model to convert speech to text. * * @param aCtx The ModelState pointer for the model to use. * @param aBuffer A 16-bit, mono raw audio signal at the appropriate @@ -231,15 +231,15 @@ int DS_SetScorerAlphaBeta(ModelState* aCtx, * @param aBufferSize The number of samples in the audio signal. * * @return The STT result. The user is responsible for freeing the string using - * {@link DS_FreeString()}. Returns NULL on error. + * {@link STT_FreeString()}. Returns NULL on error. */ -DEEPSPEECH_EXPORT -char* DS_SpeechToText(ModelState* aCtx, +STT_EXPORT +char* STT_SpeechToText(ModelState* aCtx, const short* aBuffer, unsigned int aBufferSize); /** - * @brief Use the DeepSpeech model to convert speech to text and output results + * @brief Use the Coqui STT model to convert speech to text and output results * including metadata. * * @param aCtx The ModelState pointer for the model to use. @@ -250,19 +250,19 @@ char* DS_SpeechToText(ModelState* aCtx, * * @return Metadata struct containing multiple CandidateTranscript structs. Each * transcript has per-token metadata including timing information. The - * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * user is responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. * Returns NULL on error. 
*/ -DEEPSPEECH_EXPORT -Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, +STT_EXPORT +Metadata* STT_SpeechToTextWithMetadata(ModelState* aCtx, const short* aBuffer, unsigned int aBufferSize, unsigned int aNumResults); /** * @brief Create a new streaming inference state. The streaming state returned - * by this function can then be passed to {@link DS_FeedAudioContent()} - * and {@link DS_FinishStream()}. + * by this function can then be passed to {@link STT_FeedAudioContent()} + * and {@link STT_FinishStream()}. * * @param aCtx The ModelState pointer for the model to use. * @param[out] retval an opaque pointer that represents the streaming state. Can @@ -270,81 +270,81 @@ Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, * * @return Zero for success, non-zero on failure. */ -DEEPSPEECH_EXPORT -int DS_CreateStream(ModelState* aCtx, +STT_EXPORT +int STT_CreateStream(ModelState* aCtx, StreamingState** retval); /** * @brief Feed audio samples to an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * @param aBuffer An array of 16-bit, mono raw audio samples at the * appropriate sample rate (matching what the model was trained on). * @param aBufferSize The number of samples in @p aBuffer. */ -DEEPSPEECH_EXPORT -void DS_FeedAudioContent(StreamingState* aSctx, +STT_EXPORT +void STT_FeedAudioContent(StreamingState* aSctx, const short* aBuffer, unsigned int aBufferSize); /** * @brief Compute the intermediate decoding of an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * * @return The STT intermediate result. The user is responsible for freeing the - * string using {@link DS_FreeString()}. + * string using {@link STT_FreeString()}. 
*/ -DEEPSPEECH_EXPORT -char* DS_IntermediateDecode(const StreamingState* aSctx); +STT_EXPORT +char* STT_IntermediateDecode(const StreamingState* aSctx); /** * @brief Compute the intermediate decoding of an ongoing streaming inference, * return results including metadata. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. * Returns NULL on error. */ -DEEPSPEECH_EXPORT -Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, +STT_EXPORT +Metadata* STT_IntermediateDecodeWithMetadata(const StreamingState* aSctx, unsigned int aNumResults); /** * @brief Compute the final decoding of an ongoing streaming inference and return * the result. Signals the end of an ongoing streaming inference. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * * @return The STT result. The user is responsible for freeing the string using - * {@link DS_FreeString()}. + * {@link STT_FreeString()}. * * @note This method will free the state pointer (@p aSctx). */ -DEEPSPEECH_EXPORT -char* DS_FinishStream(StreamingState* aSctx); +STT_EXPORT +char* STT_FinishStream(StreamingState* aSctx); /** * @brief Compute the final decoding of an ongoing streaming inference and return * results including metadata. Signals the end of an ongoing streaming * inference. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. 
+ * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * @param aNumResults The number of candidate transcripts to return. * * @return Metadata struct containing multiple candidate transcripts. Each transcript * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. + * responsible for freeing Metadata by calling {@link STT_FreeMetadata()}. * Returns NULL on error. * * @note This method will free the state pointer (@p aSctx). */ -DEEPSPEECH_EXPORT -Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, +STT_EXPORT +Metadata* STT_FinishStreamWithMetadata(StreamingState* aSctx, unsigned int aNumResults); /** @@ -352,47 +352,47 @@ Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, * can be used if you no longer need the result of an ongoing streaming * inference and don't want to perform a costly decode operation. * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. + * @param aSctx A streaming state pointer returned by {@link STT_CreateStream()}. * * @note This method will free the state pointer (@p aSctx). */ -DEEPSPEECH_EXPORT -void DS_FreeStream(StreamingState* aSctx); +STT_EXPORT +void STT_FreeStream(StreamingState* aSctx); /** * @brief Free memory allocated for metadata information. */ -DEEPSPEECH_EXPORT -void DS_FreeMetadata(Metadata* m); +STT_EXPORT +void STT_FreeMetadata(Metadata* m); /** - * @brief Free a char* string returned by the DeepSpeech API. + * @brief Free a char* string returned by the Coqui STT API. */ -DEEPSPEECH_EXPORT -void DS_FreeString(char* str); +STT_EXPORT +void STT_FreeString(char* str); /** * @brief Returns the version of this library. The returned version is a semantic - * version (SemVer 2.0.0). The string returned must be freed with {@link DS_FreeString()}. + * version (SemVer 2.0.0). The string returned must be freed with {@link STT_FreeString()}. 
* * @return The version string. */ -DEEPSPEECH_EXPORT -char* DS_Version(); +STT_EXPORT +char* STT_Version(); /** * @brief Returns a textual description corresponding to an error code. - * The string returned must be freed with @{link DS_FreeString()}. + * The string returned must be freed with {@link STT_FreeString()}. * * @return The error description. */ -DEEPSPEECH_EXPORT -char* DS_ErrorCodeToErrorMessage(int aErrorCode); +STT_EXPORT +char* STT_ErrorCodeToErrorMessage(int aErrorCode); -#undef DEEPSPEECH_EXPORT +#undef STT_EXPORT #ifdef __cplusplus } #endif -#endif /* DEEPSPEECH_H */ +#endif /* COQUI_STT_H */ diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index 80edc51d..fc8f3255 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -9,7 +9,7 @@ __version__ = swigwrapper.__version__.decode('utf-8') # Hack: import error codes by matching on their names, as SWIG unfortunately # does not support binding enums to Python in a scoped manner yet.
for symbol in dir(swigwrapper): - if symbol.startswith('DS_ERR_'): + if symbol.startswith('STT_ERR_'): globals()[symbol] = getattr(swigwrapper, symbol) class Scorer(swigwrapper.Scorer): diff --git a/native_client/ctcdecode/scorer.cpp b/native_client/ctcdecode/scorer.cpp index 5f25a335..e5c6c359 100644 --- a/native_client/ctcdecode/scorer.cpp +++ b/native_client/ctcdecode/scorer.cpp @@ -74,13 +74,13 @@ int Scorer::load_lm(const std::string& lm_path) // Check if file is readable to avoid KenLM throwing an exception const char* filename = lm_path.c_str(); if (access(filename, R_OK) != 0) { - return DS_ERR_SCORER_UNREADABLE; + return STT_ERR_SCORER_UNREADABLE; } // Check if the file format is valid to avoid KenLM throwing an exception lm::ngram::ModelType model_type; if (!lm::ngram::RecognizeBinary(filename, model_type)) { - return DS_ERR_SCORER_INVALID_LM; + return STT_ERR_SCORER_INVALID_LM; } // Load the LM @@ -97,7 +97,7 @@ int Scorer::load_lm(const std::string& lm_path) uint64_t trie_offset = language_model_->GetEndOfSearchOffset(); if (package_size <= trie_offset) { // File ends without a trie structure - return DS_ERR_SCORER_NO_TRIE; + return STT_ERR_SCORER_NO_TRIE; } // Read metadata and trie from file @@ -113,7 +113,7 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) if (magic != MAGIC) { std::cerr << "Error: Can't parse scorer file, invalid header. Try updating " "your scorer file." 
<< std::endl; - return DS_ERR_SCORER_INVALID_TRIE; + return STT_ERR_SCORER_INVALID_TRIE; } int version; @@ -125,10 +125,10 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) if (version < FILE_VERSION) { std::cerr << "Update your scorer file."; } else { - std::cerr << "Downgrade your scorer file or update your version of DeepSpeech."; + std::cerr << "Downgrade your scorer file or update your version of Coqui STT."; } std::cerr << std::endl; - return DS_ERR_SCORER_VERSION_MISMATCH; + return STT_ERR_SCORER_VERSION_MISMATCH; } fin.read(reinterpret_cast(&is_utf8_mode_), sizeof(is_utf8_mode_)); @@ -143,7 +143,7 @@ int Scorer::load_trie(std::ifstream& fin, const std::string& file_path) opt.mode = fst::FstReadOptions::MAP; opt.source = file_path; dictionary.reset(FstType::Read(fin, opt)); - return DS_ERR_OK; + return STT_ERR_OK; } bool Scorer::save_dictionary(const std::string& path, bool append_instead_of_overwrite) diff --git a/native_client/ctcdecode/scorer.h b/native_client/ctcdecode/scorer.h index 5aee1046..67ea96d3 100644 --- a/native_client/ctcdecode/scorer.h +++ b/native_client/ctcdecode/scorer.h @@ -13,7 +13,7 @@ #include "path_trie.h" #include "alphabet.h" -#include "deepspeech.h" +#include "coqui-stt.h" const double OOV_SCORE = -1000.0; const std::string START_TOKEN = ""; diff --git a/native_client/ctcdecode/setup.py b/native_client/ctcdecode/setup.py index 82e702a8..e18578af 100644 --- a/native_client/ctcdecode/setup.py +++ b/native_client/ctcdecode/setup.py @@ -51,7 +51,7 @@ def maybe_rebuild(srcs, out_name, build_dir): num_parallel=known_args.num_processes, debug=debug) -project_version = read('../../training/deepspeech_training/VERSION').strip() +project_version = read('../../training/coqui_stt_training/VERSION').strip() build_dir = 'temp_build/temp_build' diff --git a/native_client/ctcdecode/swigwrapper.i b/native_client/ctcdecode/swigwrapper.i index 683a3426..facc83eb 100644 --- a/native_client/ctcdecode/swigwrapper.i +++ 
b/native_client/ctcdecode/swigwrapper.i @@ -44,14 +44,14 @@ namespace std { %constant const char* __version__ = ds_version(); %constant const char* __git_version__ = ds_git_version(); -// Import only the error code enum definitions from deepspeech.h +// Import only the error code enum definitions from coqui-stt.h // We can't just do |%ignore "";| here because it affects this file globally (even // files %include'd above). That causes SWIG to lose destructor information and // leads to leaks of the wrapper objects. // Instead we ignore functions and classes (structs), which are the only other -// things in deepspeech.h. If we add some new construct to deepspeech.h we need +// things in coqui-stt.h. If we add some new construct to coqui-stt.h we need // to update the ignore rules here to avoid exposing unwanted APIs in the decoder // package. %rename("$ignore", %$isfunction) ""; %rename("$ignore", %$isclass) ""; -%include "../deepspeech.h" +%include "../coqui-stt.h" diff --git a/native_client/definitions.mk b/native_client/definitions.mk index 737ec8f9..d5aa2fea 100644 --- a/native_client/definitions.mk +++ b/native_client/definitions.mk @@ -18,10 +18,10 @@ ifeq ($(findstring _NT,$(OS)),_NT) PLATFORM_EXE_SUFFIX := .exe endif -DEEPSPEECH_BIN := deepspeech$(PLATFORM_EXE_SUFFIX) -CFLAGS_DEEPSPEECH := -std=c++11 -o $(DEEPSPEECH_BIN) -LINK_DEEPSPEECH := -ldeepspeech -LINK_PATH_DEEPSPEECH := -L${TFDIR}/bazel-bin/native_client +STT_BIN := stt$(PLATFORM_EXE_SUFFIX) +CFLAGS_STT := -std=c++11 -o $(STT_BIN) +LINK_STT := -lstt +LINK_PATH_STT := -L${TFDIR}/bazel-bin/native_client ifeq ($(TARGET),host) TOOLCHAIN := @@ -61,9 +61,9 @@ TOOL_CC := cl.exe TOOL_CXX := cl.exe TOOL_LD := link.exe TOOL_LIBEXE := lib.exe -LINK_DEEPSPEECH := $(TFDIR)\bazel-bin\native_client\libdeepspeech.so.if.lib -LINK_PATH_DEEPSPEECH := -CFLAGS_DEEPSPEECH := -nologo -Fe$(DEEPSPEECH_BIN) +LINK_STT := $(TFDIR)\bazel-bin\native_client\libstt.so.if.lib +LINK_PATH_STT := +CFLAGS_STT := -nologo -Fe$(STT_BIN) 
SOX_CFLAGS := SOX_LDFLAGS := PYTHON_PACKAGES := numpy${NUMPY_BUILD_VERSION} @@ -141,8 +141,8 @@ endif CFLAGS += $(EXTRA_CFLAGS) CXXFLAGS += $(EXTRA_CXXFLAGS) -LIBS := $(LINK_DEEPSPEECH) $(EXTRA_LIBS) -LDFLAGS_DIRS := $(LINK_PATH_DEEPSPEECH) $(EXTRA_LDFLAGS) +LIBS := $(LINK_STT) $(EXTRA_LIBS) +LDFLAGS_DIRS := $(LINK_PATH_STT) $(EXTRA_LDFLAGS) LDFLAGS += $(LDFLAGS_NEEDED) $(LDFLAGS_RPATH) $(LDFLAGS_DIRS) $(LIBS) AS := $(TOOLCHAIN)$(TOOL_AS) @@ -182,7 +182,7 @@ define copy_missing_libs new_missing="$$( (for f in $$(otool -L $$lib 2>/dev/null | tail -n +2 | awk '{ print $$1 }' | grep -v '$$lib'); do ls -hal $$f; done;) 2>&1 | grep 'No such' | cut -d':' -f2 | xargs basename -a)"; \ missing_libs="$$missing_libs $$new_missing"; \ elif [ "$(OS)" = "${TC_MSYS_VERSION}" ]; then \ - missing_libs="libdeepspeech.so"; \ + missing_libs="libstt.so"; \ else \ missing_libs="$$missing_libs $$($(LDD) $$lib | grep 'not found' | awk '{ print $$1 }')"; \ fi; \ diff --git a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs b/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs deleted file mode 100644 index cbcb8f43..00000000 --- a/native_client/dotnet/DeepSpeechClient/Enums/ErrorCodes.cs +++ /dev/null @@ -1,33 +0,0 @@ -namespace DeepSpeechClient.Enums -{ - /// - /// Error codes from the native DeepSpeech binary. 
- /// - internal enum ErrorCodes - { - // OK - DS_ERR_OK = 0x0000, - - // Missing invormations - DS_ERR_NO_MODEL = 0x1000, - - // Invalid parameters - DS_ERR_INVALID_ALPHABET = 0x2000, - DS_ERR_INVALID_SHAPE = 0x2001, - DS_ERR_INVALID_SCORER = 0x2002, - DS_ERR_MODEL_INCOMPATIBLE = 0x2003, - DS_ERR_SCORER_NOT_ENABLED = 0x2004, - - // Runtime failures - DS_ERR_FAIL_INIT_MMAP = 0x3000, - DS_ERR_FAIL_INIT_SESS = 0x3001, - DS_ERR_FAIL_INTERPRETER = 0x3002, - DS_ERR_FAIL_RUN_SESS = 0x3003, - DS_ERR_FAIL_CREATE_STREAM = 0x3004, - DS_ERR_FAIL_READ_PROTOBUF = 0x3005, - DS_ERR_FAIL_CREATE_SESS = 0x3006, - DS_ERR_FAIL_INSERT_HOTWORD = 0x3008, - DS_ERR_FAIL_CLEAR_HOTWORD = 0x3009, - DS_ERR_FAIL_ERASE_HOTWORD = 0x3010 - } -} diff --git a/native_client/dotnet/DeepSpeechClient/NativeImp.cs b/native_client/dotnet/DeepSpeechClient/NativeImp.cs deleted file mode 100644 index 1a7dacac..00000000 --- a/native_client/dotnet/DeepSpeechClient/NativeImp.cs +++ /dev/null @@ -1,114 +0,0 @@ -using DeepSpeechClient.Enums; - -using System; -using System.Runtime.InteropServices; - -namespace DeepSpeechClient -{ - /// - /// Wrapper for the native implementation of "libdeepspeech.so" - /// - internal static class NativeImp - { - #region Native Implementation - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static extern IntPtr DS_Version(); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - ref IntPtr** pint); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern IntPtr DS_ErrorCodeToErrorMessage(int aErrorCode); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern uint DS_GetModelBeamWidth(IntPtr** aCtx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - 
internal unsafe static extern ErrorCodes DS_SetModelBeamWidth(IntPtr** aCtx, - uint aBeamWidth); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern ErrorCodes DS_CreateModel(string aModelPath, - uint aBeamWidth, - ref IntPtr** pint); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal unsafe static extern int DS_GetModelSampleRate(IntPtr** aCtx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_EnableExternalScorer(IntPtr** aCtx, - string aScorerPath); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_AddHotWord(IntPtr** aCtx, - string aWord, - float aBoost); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_EraseHotWord(IntPtr** aCtx, - string aWord); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_ClearHotWords(IntPtr** aCtx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_DisableExternalScorer(IntPtr** aCtx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_SetScorerAlphaBeta(IntPtr** aCtx, - float aAlpha, - float aBeta); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern IntPtr DS_SpeechToText(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] - internal static unsafe extern IntPtr DS_SpeechToTextWithMetadata(IntPtr** aCtx, - short[] aBuffer, - uint aBufferSize, - uint aNumResults); - - 
[DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeModel(IntPtr** aCtx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern ErrorCodes DS_CreateStream(IntPtr** aCtx, - ref IntPtr** retval); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeStream(IntPtr** aSctx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeMetadata(IntPtr metadata); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern void DS_FreeString(IntPtr str); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern void DS_FeedAudioContent(IntPtr** aSctx, - short[] aBuffer, - uint aBufferSize); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_IntermediateDecode(IntPtr** aSctx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_IntermediateDecodeWithMetadata(IntPtr** aSctx, - uint aNumResults); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl, - CharSet = CharSet.Ansi, SetLastError = true)] - internal static unsafe extern IntPtr DS_FinishStream(IntPtr** aSctx); - - [DllImport("libdeepspeech.so", CallingConvention = CallingConvention.Cdecl)] - internal static unsafe extern IntPtr DS_FinishStreamWithMetadata(IntPtr** aSctx, - uint aNumResults); - #endregion - } -} diff --git a/native_client/dotnet/DeepSpeech.sln b/native_client/dotnet/STT.sln similarity index 79% rename from native_client/dotnet/DeepSpeech.sln rename to native_client/dotnet/STT.sln index 78afe7db..58fd6c8e 100644 --- 
a/native_client/dotnet/DeepSpeech.sln +++ b/native_client/dotnet/STT.sln @@ -2,9 +2,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.30204.135 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DeepSpeechClient", "DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "STTClient", "STTClient\STTClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "STTConsole", "STTConsole\STTConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/native_client/dotnet/STTClient/Enums/ErrorCodes.cs b/native_client/dotnet/STTClient/Enums/ErrorCodes.cs new file mode 100644 index 00000000..b3e76456 --- /dev/null +++ b/native_client/dotnet/STTClient/Enums/ErrorCodes.cs @@ -0,0 +1,33 @@ +namespace STTClient.Enums +{ + /// + /// Error codes from the native Coqui STT binary. 
+ /// + internal enum ErrorCodes + { + // OK + STT_ERR_OK = 0x0000, + + // Missing information + STT_ERR_NO_MODEL = 0x1000, + + // Invalid parameters + STT_ERR_INVALID_ALPHABET = 0x2000, + STT_ERR_INVALID_SHAPE = 0x2001, + STT_ERR_INVALID_SCORER = 0x2002, + STT_ERR_MODEL_INCOMPATIBLE = 0x2003, + STT_ERR_SCORER_NOT_ENABLED = 0x2004, + + // Runtime failures + STT_ERR_FAIL_INIT_MMAP = 0x3000, + STT_ERR_FAIL_INIT_SESS = 0x3001, + STT_ERR_FAIL_INTERPRETER = 0x3002, + STT_ERR_FAIL_RUN_SESS = 0x3003, + STT_ERR_FAIL_CREATE_STREAM = 0x3004, + STT_ERR_FAIL_READ_PROTOBUF = 0x3005, + STT_ERR_FAIL_CREATE_SESS = 0x3006, + STT_ERR_FAIL_INSERT_HOTWORD = 0x3008, + STT_ERR_FAIL_CLEAR_HOTWORD = 0x3009, + STT_ERR_FAIL_ERASE_HOTWORD = 0x3010 + } +} diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/STTClient/Extensions/NativeExtensions.cs similarity index 95% rename from native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs rename to native_client/dotnet/STTClient/Extensions/NativeExtensions.cs index 9325f4b8..297a311d 100644 --- a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs +++ b/native_client/dotnet/STTClient/Extensions/NativeExtensions.cs @@ -1,9 +1,9 @@ -using DeepSpeechClient.Structs; +using STTClient.Structs; using System; using System.Runtime.InteropServices; using System.Text; -namespace DeepSpeechClient.Extensions +namespace STTClient.Extensions { internal static class NativeExtensions { @@ -20,7 +20,7 @@ namespace DeepSpeechClient.Extensions byte[] buffer = new byte[len]; Marshal.Copy(intPtr, buffer, 0, buffer.Length); if (releasePtr) - NativeImp.DS_FreeString(intPtr); + NativeImp.STT_FreeString(intPtr); string result = Encoding.UTF8.GetString(buffer); return result; } @@ -86,7 +86,7 @@ namespace DeepSpeechClient.Extensions metadata.transcripts += sizeOfCandidateTranscript; } - NativeImp.DS_FreeMetadata(intPtr); + NativeImp.STT_FreeMetadata(intPtr); return managedMetadata; } } diff
--git a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs b/native_client/dotnet/STTClient/Interfaces/ISTT.cs similarity index 88% rename from native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs rename to native_client/dotnet/STTClient/Interfaces/ISTT.cs index fca21a57..7486796d 100644 --- a/native_client/dotnet/DeepSpeechClient/Interfaces/IDeepSpeech.cs +++ b/native_client/dotnet/STTClient/Interfaces/ISTT.cs @@ -1,13 +1,13 @@ -using DeepSpeechClient.Models; +using STTClient.Models; using System; using System.IO; -namespace DeepSpeechClient.Interfaces +namespace STTClient.Interfaces { /// - /// Client interface for DeepSpeech + /// Client interface for Coqui STT /// - public interface IDeepSpeech : IDisposable + public interface ISTT : IDisposable { /// /// Return version of this library. The returned version is a semantic version @@ -80,7 +80,7 @@ namespace DeepSpeechClient.Interfaces unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta); /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the STT model to perform Speech-To-Text. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -89,7 +89,7 @@ namespace DeepSpeechClient.Interfaces uint aBufferSize); /// - /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. + /// Use the STT model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -104,26 +104,26 @@ namespace DeepSpeechClient.Interfaces /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. 
/// - unsafe void FreeStream(DeepSpeechStream stream); + unsafe void FreeStream(Stream stream); /// /// Creates a new streaming inference state. /// - unsafe DeepSpeechStream CreateStream(); + unsafe Stream CreateStream(); /// /// Feeds audio samples to an ongoing streaming inference. /// /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize); + unsafe void FeedAudioContent(Stream stream, short[] aBuffer, uint aBufferSize); /// /// Computes the intermediate decoding of an ongoing streaming inference. /// /// Instance of the stream to decode. /// The STT intermediate result. - unsafe string IntermediateDecode(DeepSpeechStream stream); + unsafe string IntermediateDecode(Stream stream); /// /// Computes the intermediate decoding of an ongoing streaming inference, including metadata. @@ -131,14 +131,14 @@ namespace DeepSpeechClient.Interfaces /// Instance of the stream to decode. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. - unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults); + unsafe Metadata IntermediateDecodeWithMetadata(Stream stream, uint aNumResults); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal. /// /// Instance of the stream to finish. /// The STT result. - unsafe string FinishStream(DeepSpeechStream stream); + unsafe string FinishStream(Stream stream); /// /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal, including metadata. @@ -146,6 +146,6 @@ namespace DeepSpeechClient.Interfaces /// Instance of the stream to finish. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. 
/// The extended metadata result. - unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults); + unsafe Metadata FinishStreamWithMetadata(Stream stream, uint aNumResults); } } diff --git a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs b/native_client/dotnet/STTClient/Models/CandidateTranscript.cs similarity index 93% rename from native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs rename to native_client/dotnet/STTClient/Models/CandidateTranscript.cs index cc6b5d28..f158e2c2 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/CandidateTranscript.cs +++ b/native_client/dotnet/STTClient/Models/CandidateTranscript.cs @@ -1,4 +1,4 @@ -namespace DeepSpeechClient.Models +namespace STTClient.Models { /// /// Stores the entire CTC output as an array of character metadata objects. diff --git a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs b/native_client/dotnet/STTClient/Models/Metadata.cs similarity index 89% rename from native_client/dotnet/DeepSpeechClient/Models/Metadata.cs rename to native_client/dotnet/STTClient/Models/Metadata.cs index fb6c613d..537a22e8 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/Metadata.cs +++ b/native_client/dotnet/STTClient/Models/Metadata.cs @@ -1,4 +1,4 @@ -namespace DeepSpeechClient.Models +namespace STTClient.Models { /// /// Stores the entire CTC output as an array of character metadata objects. 
diff --git a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs b/native_client/dotnet/STTClient/Models/Stream.cs similarity index 81% rename from native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs rename to native_client/dotnet/STTClient/Models/Stream.cs index e4605f5e..49f92dfa 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/DeepSpeechStream.cs +++ b/native_client/dotnet/STTClient/Models/Stream.cs @@ -1,19 +1,19 @@ using System; -namespace DeepSpeechClient.Models +namespace STTClient.Models { /// /// Wrapper of the pointer used for the decoding stream. /// - public class DeepSpeechStream : IDisposable + public class Stream : IDisposable { private unsafe IntPtr** _streamingStatePp; /// - /// Initializes a new instance of . + /// Initializes a new instance of . /// /// Native pointer of the native stream. - public unsafe DeepSpeechStream(IntPtr** streamingStatePP) + public unsafe Stream(IntPtr** streamingStatePP) { _streamingStatePp = streamingStatePP; } diff --git a/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs b/native_client/dotnet/STTClient/Models/TokenMetadata.cs similarity index 93% rename from native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs rename to native_client/dotnet/STTClient/Models/TokenMetadata.cs index 5f2dea56..c5ef94d8 100644 --- a/native_client/dotnet/DeepSpeechClient/Models/TokenMetadata.cs +++ b/native_client/dotnet/STTClient/Models/TokenMetadata.cs @@ -1,4 +1,4 @@ -namespace DeepSpeechClient.Models +namespace STTClient.Models { /// /// Stores each individual character, along with its timing information. 
diff --git a/native_client/dotnet/STTClient/NativeImp.cs b/native_client/dotnet/STTClient/NativeImp.cs new file mode 100644 index 00000000..a3491171 --- /dev/null +++ b/native_client/dotnet/STTClient/NativeImp.cs @@ -0,0 +1,114 @@ +using STTClient.Enums; + +using System; +using System.Runtime.InteropServices; + +namespace STTClient +{ + /// + /// Wrapper for the native implementation of "libstt.so" + /// + internal static class NativeImp + { + #region Native Implementation + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static extern IntPtr STT_Version(); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes STT_CreateModel(string aModelPath, + ref IntPtr** pint); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern IntPtr STT_ErrorCodeToErrorMessage(int aErrorCode); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern uint STT_GetModelBeamWidth(IntPtr** aCtx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes STT_SetModelBeamWidth(IntPtr** aCtx, + uint aBeamWidth); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern ErrorCodes STT_CreateModel(string aModelPath, + uint aBeamWidth, + ref IntPtr** pint); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal unsafe static extern int STT_GetModelSampleRate(IntPtr** aCtx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_EnableExternalScorer(IntPtr** aCtx, + string aScorerPath); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_AddHotWord(IntPtr** aCtx, + string aWord, + float 
aBoost); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_EraseHotWord(IntPtr** aCtx, + string aWord); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_ClearHotWords(IntPtr** aCtx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_DisableExternalScorer(IntPtr** aCtx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_SetScorerAlphaBeta(IntPtr** aCtx, + float aAlpha, + float aBeta); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static unsafe extern IntPtr STT_SpeechToText(IntPtr** aCtx, + short[] aBuffer, + uint aBufferSize); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl, SetLastError = true)] + internal static unsafe extern IntPtr STT_SpeechToTextWithMetadata(IntPtr** aCtx, + short[] aBuffer, + uint aBufferSize, + uint aNumResults); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void STT_FreeModel(IntPtr** aCtx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern ErrorCodes STT_CreateStream(IntPtr** aCtx, + ref IntPtr** retval); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void STT_FreeStream(IntPtr** aSctx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void STT_FreeMetadata(IntPtr metadata); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern void STT_FreeString(IntPtr str); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = 
true)] + internal static unsafe extern void STT_FeedAudioContent(IntPtr** aSctx, + short[] aBuffer, + uint aBufferSize); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr STT_IntermediateDecode(IntPtr** aSctx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr STT_IntermediateDecodeWithMetadata(IntPtr** aSctx, + uint aNumResults); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl, + CharSet = CharSet.Ansi, SetLastError = true)] + internal static unsafe extern IntPtr STT_FinishStream(IntPtr** aSctx); + + [DllImport("libstt.so", CallingConvention = CallingConvention.Cdecl)] + internal static unsafe extern IntPtr STT_FinishStreamWithMetadata(IntPtr** aSctx, + uint aNumResults); + #endregion + } +} diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/STTClient/STT.cs similarity index 75% rename from native_client/dotnet/DeepSpeechClient/DeepSpeech.cs rename to native_client/dotnet/STTClient/STT.cs index 79b276c2..60eeda9f 100644 --- a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs +++ b/native_client/dotnet/STTClient/STT.cs @@ -1,34 +1,34 @@ -using DeepSpeechClient.Interfaces; -using DeepSpeechClient.Extensions; +using STTClient.Interfaces; +using STTClient.Extensions; using System; using System.IO; -using DeepSpeechClient.Enums; -using DeepSpeechClient.Models; +using STTClient.Enums; +using STTClient.Models; -namespace DeepSpeechClient +namespace STTClient { /// - /// Concrete implementation of . + /// Concrete implementation of . /// - public class DeepSpeech : IDeepSpeech + public class STT : ISTT { private unsafe IntPtr** _modelStatePP; - + /// - /// Initializes a new instance of class and creates a new acoustic model. + /// Initializes a new instance of class and creates a new acoustic model. /// /// The path to the frozen model graph. 
/// Thrown when the native binary failed to create the model. - public DeepSpeech(string aModelPath) + public STT(string aModelPath) { CreateModel(aModelPath); } - #region IDeepSpeech + #region ISTT /// - /// Create an object providing an interface to a trained DeepSpeech model. + /// Create an object providing an interface to a trained STT model. /// /// The path to the frozen model graph. /// Thrown when the native binary failed to create the model. @@ -48,7 +48,7 @@ namespace DeepSpeechClient { throw new FileNotFoundException(exceptionMessage); } - var resultCode = NativeImp.DS_CreateModel(aModelPath, + var resultCode = NativeImp.STT_CreateModel(aModelPath, ref _modelStatePP); EvaluateResultCode(resultCode); } @@ -60,7 +60,7 @@ namespace DeepSpeechClient /// Beam width value used by the model. public unsafe uint GetModelBeamWidth() { - return NativeImp.DS_GetModelBeamWidth(_modelStatePP); + return NativeImp.STT_GetModelBeamWidth(_modelStatePP); } /// @@ -70,13 +70,13 @@ namespace DeepSpeechClient /// Thrown on failure. public unsafe void SetModelBeamWidth(uint aBeamWidth) { - var resultCode = NativeImp.DS_SetModelBeamWidth(_modelStatePP, aBeamWidth); + var resultCode = NativeImp.STT_SetModelBeamWidth(_modelStatePP, aBeamWidth); EvaluateResultCode(resultCode); } /// /// Add a hot-word. - /// + /// /// Words that don't occur in the scorer (e.g. proper nouns) or strings that contain spaces won't be taken into account. /// /// Some word @@ -84,7 +84,7 @@ namespace DeepSpeechClient /// Thrown on failure. public unsafe void AddHotWord(string aWord, float aBoost) { - var resultCode = NativeImp.DS_AddHotWord(_modelStatePP, aWord, aBoost); + var resultCode = NativeImp.STT_AddHotWord(_modelStatePP, aWord, aBoost); EvaluateResultCode(resultCode); } @@ -95,7 +95,7 @@ namespace DeepSpeechClient /// Thrown on failure. 
public unsafe void EraseHotWord(string aWord) { - var resultCode = NativeImp.DS_EraseHotWord(_modelStatePP, aWord); + var resultCode = NativeImp.STT_EraseHotWord(_modelStatePP, aWord); EvaluateResultCode(resultCode); } @@ -105,7 +105,7 @@ namespace DeepSpeechClient /// Thrown on failure. public unsafe void ClearHotWords() { - var resultCode = NativeImp.DS_ClearHotWords(_modelStatePP); + var resultCode = NativeImp.STT_ClearHotWords(_modelStatePP); EvaluateResultCode(resultCode); } @@ -115,7 +115,7 @@ namespace DeepSpeechClient /// Sample rate. public unsafe int GetModelSampleRate() { - return NativeImp.DS_GetModelSampleRate(_modelStatePP); + return NativeImp.STT_GetModelSampleRate(_modelStatePP); } /// @@ -124,9 +124,9 @@ namespace DeepSpeechClient /// Native result code. private void EvaluateResultCode(ErrorCodes resultCode) { - if (resultCode != ErrorCodes.DS_ERR_OK) + if (resultCode != ErrorCodes.STT_ERR_OK) { - throw new ArgumentException(NativeImp.DS_ErrorCodeToErrorMessage((int)resultCode).PtrToString()); + throw new ArgumentException(NativeImp.STT_ErrorCodeToErrorMessage((int)resultCode).PtrToString()); } } @@ -135,7 +135,7 @@ namespace DeepSpeechClient /// public unsafe void Dispose() { - NativeImp.DS_FreeModel(_modelStatePP); + NativeImp.STT_FreeModel(_modelStatePP); } /// @@ -155,7 +155,7 @@ namespace DeepSpeechClient throw new FileNotFoundException($"Cannot find the scorer file: {aScorerPath}"); } - var resultCode = NativeImp.DS_EnableExternalScorer(_modelStatePP, aScorerPath); + var resultCode = NativeImp.STT_EnableExternalScorer(_modelStatePP, aScorerPath); EvaluateResultCode(resultCode); } @@ -165,7 +165,7 @@ namespace DeepSpeechClient /// Thrown when an external scorer is not enabled. 
public unsafe void DisableExternalScorer() { - var resultCode = NativeImp.DS_DisableExternalScorer(_modelStatePP); + var resultCode = NativeImp.STT_DisableExternalScorer(_modelStatePP); EvaluateResultCode(resultCode); } @@ -177,7 +177,7 @@ namespace DeepSpeechClient /// Thrown when an external scorer is not enabled. public unsafe void SetScorerAlphaBeta(float aAlpha, float aBeta) { - var resultCode = NativeImp.DS_SetScorerAlphaBeta(_modelStatePP, + var resultCode = NativeImp.STT_SetScorerAlphaBeta(_modelStatePP, aAlpha, aBeta); EvaluateResultCode(resultCode); @@ -188,9 +188,9 @@ namespace DeepSpeechClient /// /// Instance of the stream to feed the data. /// An array of 16-bit, mono raw audio samples at the appropriate sample rate (matching what the model was trained on). - public unsafe void FeedAudioContent(DeepSpeechStream stream, short[] aBuffer, uint aBufferSize) + public unsafe void FeedAudioContent(Stream stream, short[] aBuffer, uint aBufferSize) { - NativeImp.DS_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); + NativeImp.STT_FeedAudioContent(stream.GetNativePointer(), aBuffer, aBufferSize); } /// @@ -198,9 +198,9 @@ namespace DeepSpeechClient /// /// Instance of the stream to finish. /// The STT result. - public unsafe string FinishStream(DeepSpeechStream stream) + public unsafe string FinishStream(Stream stream) { - return NativeImp.DS_FinishStream(stream.GetNativePointer()).PtrToString(); + return NativeImp.STT_FinishStream(stream.GetNativePointer()).PtrToString(); } /// @@ -209,9 +209,9 @@ namespace DeepSpeechClient /// Instance of the stream to finish. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The extended metadata result. 
- public unsafe Metadata FinishStreamWithMetadata(DeepSpeechStream stream, uint aNumResults) + public unsafe Metadata FinishStreamWithMetadata(Stream stream, uint aNumResults) { - return NativeImp.DS_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + return NativeImp.STT_FinishStreamWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -219,9 +219,9 @@ namespace DeepSpeechClient /// /// Instance of the stream to decode. /// The STT intermediate result. - public unsafe string IntermediateDecode(DeepSpeechStream stream) + public unsafe string IntermediateDecode(Stream stream) { - return NativeImp.DS_IntermediateDecode(stream.GetNativePointer()).PtrToString(); + return NativeImp.STT_IntermediateDecode(stream.GetNativePointer()).PtrToString(); } /// @@ -230,9 +230,9 @@ namespace DeepSpeechClient /// Instance of the stream to decode. /// Maximum number of candidate transcripts to return. Returned list might be smaller than this. /// The STT intermediate result. - public unsafe Metadata IntermediateDecodeWithMetadata(DeepSpeechStream stream, uint aNumResults) + public unsafe Metadata IntermediateDecodeWithMetadata(Stream stream, uint aNumResults) { - return NativeImp.DS_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); + return NativeImp.STT_IntermediateDecodeWithMetadata(stream.GetNativePointer(), aNumResults).PtrToMetadata(); } /// @@ -241,18 +241,18 @@ namespace DeepSpeechClient /// public unsafe string Version() { - return NativeImp.DS_Version().PtrToString(); + return NativeImp.STT_Version().PtrToString(); } /// /// Creates a new streaming inference state. 
/// - public unsafe DeepSpeechStream CreateStream() + public unsafe Stream CreateStream() { IntPtr** streamingStatePointer = null; - var resultCode = NativeImp.DS_CreateStream(_modelStatePP, ref streamingStatePointer); + var resultCode = NativeImp.STT_CreateStream(_modelStatePP, ref streamingStatePointer); EvaluateResultCode(resultCode); - return new DeepSpeechStream(streamingStatePointer); + return new Stream(streamingStatePointer); } /// @@ -260,25 +260,25 @@ namespace DeepSpeechClient /// This can be used if you no longer need the result of an ongoing streaming /// inference and don't want to perform a costly decode operation. /// - public unsafe void FreeStream(DeepSpeechStream stream) + public unsafe void FreeStream(Stream stream) { - NativeImp.DS_FreeStream(stream.GetNativePointer()); + NativeImp.STT_FreeStream(stream.GetNativePointer()); stream.Dispose(); } /// - /// Use the DeepSpeech model to perform Speech-To-Text. + /// Use the STT model to perform Speech-To-Text. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. /// The STT result. Returns NULL on error. public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize) { - return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); + return NativeImp.STT_SpeechToText(_modelStatePP, aBuffer, aBufferSize).PtrToString(); } /// - /// Use the DeepSpeech model to perform Speech-To-Text, return results including metadata. + /// Use the STT model to perform Speech-To-Text, return results including metadata. /// /// A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on). /// The number of samples in the audio signal. @@ -286,7 +286,7 @@ namespace DeepSpeechClient /// The extended metadata. Returns NULL on error. 
public unsafe Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aNumResults) { - return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata(); + return NativeImp.STT_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aNumResults).PtrToMetadata(); } #endregion diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/STTClient/STTClient.csproj similarity index 100% rename from native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj rename to native_client/dotnet/STTClient/STTClient.csproj diff --git a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs b/native_client/dotnet/STTClient/Structs/CandidateTranscript.cs similarity index 94% rename from native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs rename to native_client/dotnet/STTClient/Structs/CandidateTranscript.cs index 54581f6f..4743810b 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/CandidateTranscript.cs +++ b/native_client/dotnet/STTClient/Structs/CandidateTranscript.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace DeepSpeechClient.Structs +namespace STTClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct CandidateTranscript diff --git a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs b/native_client/dotnet/STTClient/Structs/Metadata.cs similarity index 92% rename from native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs rename to native_client/dotnet/STTClient/Structs/Metadata.cs index 0a9beddc..f2db6bcd 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/Metadata.cs +++ b/native_client/dotnet/STTClient/Structs/Metadata.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace DeepSpeechClient.Structs +namespace STTClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct Metadata diff --git 
a/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs b/native_client/dotnet/STTClient/Structs/TokenMetadata.cs similarity index 93% rename from native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs rename to native_client/dotnet/STTClient/Structs/TokenMetadata.cs index 1c660c71..a21c1d26 100644 --- a/native_client/dotnet/DeepSpeechClient/Structs/TokenMetadata.cs +++ b/native_client/dotnet/STTClient/Structs/TokenMetadata.cs @@ -1,7 +1,7 @@ using System; using System.Runtime.InteropServices; -namespace DeepSpeechClient.Structs +namespace STTClient.Structs { [StructLayout(LayoutKind.Sequential)] internal unsafe struct TokenMetadata diff --git a/native_client/dotnet/DeepSpeechConsole/App.config b/native_client/dotnet/STTConsole/App.config similarity index 100% rename from native_client/dotnet/DeepSpeechConsole/App.config rename to native_client/dotnet/STTConsole/App.config diff --git a/native_client/dotnet/DeepSpeechConsole/Program.cs b/native_client/dotnet/STTConsole/Program.cs similarity index 96% rename from native_client/dotnet/DeepSpeechConsole/Program.cs rename to native_client/dotnet/STTConsole/Program.cs index 55bd8fd5..e09d0c1f 100644 --- a/native_client/dotnet/DeepSpeechConsole/Program.cs +++ b/native_client/dotnet/STTConsole/Program.cs @@ -1,6 +1,6 @@ -using DeepSpeechClient; -using DeepSpeechClient.Interfaces; -using DeepSpeechClient.Models; +using STTClient; +using STTClient.Interfaces; +using STTClient.Models; using NAudio.Wave; using System; using System.Collections.Generic; @@ -54,7 +54,7 @@ namespace CSharpExamples Console.WriteLine("Loading model..."); stopwatch.Start(); // sphinx-doc: csharp_ref_model_start - using (IDeepSpeech sttClient = new DeepSpeech(model ?? "output_graph.pbmm")) + using (ISTT sttClient = new STT(model ?? 
"output_graph.pbmm")) { // sphinx-doc: csharp_ref_model_stop stopwatch.Stop(); diff --git a/native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs b/native_client/dotnet/STTConsole/Properties/AssemblyInfo.cs similarity index 85% rename from native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs rename to native_client/dotnet/STTConsole/Properties/AssemblyInfo.cs index 845851a1..f7600c7c 100644 --- a/native_client/dotnet/DeepSpeechConsole/Properties/AssemblyInfo.cs +++ b/native_client/dotnet/STTConsole/Properties/AssemblyInfo.cs @@ -5,12 +5,12 @@ using System.Runtime.InteropServices; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. -[assembly: AssemblyTitle("DeepSpeechConsole")] +[assembly: AssemblyTitle("STTConsole")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] -[assembly: AssemblyCompany("")] -[assembly: AssemblyProduct("CSharpExamples")] -[assembly: AssemblyCopyright("Copyright © 2018")] +[assembly: AssemblyCompany("Coqui GmbH")] +[assembly: AssemblyProduct("STTConsole")] +[assembly: AssemblyCopyright("Copyright © 2018-2020 Mozilla, © 2021 Coqui GmbH")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] diff --git a/native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj b/native_client/dotnet/STTConsole/STTConsole.csproj similarity index 93% rename from native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj rename to native_client/dotnet/STTConsole/STTConsole.csproj index a05fca61..54e11eb0 100644 --- a/native_client/dotnet/DeepSpeechConsole/DeepSpeechConsole.csproj +++ b/native_client/dotnet/STTConsole/STTConsole.csproj @@ -6,8 +6,8 @@ AnyCPU {312965E5-C4F6-4D95-BA64-79906B8BC7AC} Exe - DeepSpeechConsole - DeepSpeechConsole + STTConsole + STTConsole v4.6.2 512 true @@ -56,9 +56,9 @@ - + {56DE4091-BBBE-47E4-852D-7268B33B971F} - DeepSpeechClient 
+ STTClient diff --git a/native_client/dotnet/DeepSpeechConsole/arctic_a0024.wav b/native_client/dotnet/STTConsole/arctic_a0024.wav similarity index 100% rename from native_client/dotnet/DeepSpeechConsole/arctic_a0024.wav rename to native_client/dotnet/STTConsole/arctic_a0024.wav diff --git a/native_client/dotnet/DeepSpeechConsole/packages.config b/native_client/dotnet/STTConsole/packages.config similarity index 100% rename from native_client/dotnet/DeepSpeechConsole/packages.config rename to native_client/dotnet/STTConsole/packages.config diff --git a/native_client/dotnet/DeepSpeechWPF/.gitignore b/native_client/dotnet/STTWPF/.gitignore similarity index 100% rename from native_client/dotnet/DeepSpeechWPF/.gitignore rename to native_client/dotnet/STTWPF/.gitignore diff --git a/native_client/dotnet/DeepSpeechWPF/App.config b/native_client/dotnet/STTWPF/App.config similarity index 100% rename from native_client/dotnet/DeepSpeechWPF/App.config rename to native_client/dotnet/STTWPF/App.config diff --git a/native_client/dotnet/DeepSpeechWPF/App.xaml b/native_client/dotnet/STTWPF/App.xaml similarity index 74% rename from native_client/dotnet/DeepSpeechWPF/App.xaml rename to native_client/dotnet/STTWPF/App.xaml index 16ebb0d4..97292db8 100644 --- a/native_client/dotnet/DeepSpeechWPF/App.xaml +++ b/native_client/dotnet/STTWPF/App.xaml @@ -1,8 +1,8 @@  diff --git a/native_client/dotnet/DeepSpeechWPF/App.xaml.cs b/native_client/dotnet/STTWPF/App.xaml.cs similarity index 61% rename from native_client/dotnet/DeepSpeechWPF/App.xaml.cs rename to native_client/dotnet/STTWPF/App.xaml.cs index d4b87d6e..80dd818a 100644 --- a/native_client/dotnet/DeepSpeechWPF/App.xaml.cs +++ b/native_client/dotnet/STTWPF/App.xaml.cs @@ -1,10 +1,10 @@ using CommonServiceLocator; -using DeepSpeech.WPF.ViewModels; -using DeepSpeechClient.Interfaces; +using STT.WPF.ViewModels; +using STTClient.Interfaces; using GalaSoft.MvvmLight.Ioc; using System.Windows; -namespace DeepSpeechWPF +namespace STTWPF { 
/// /// Interaction logic for App.xaml @@ -18,11 +18,11 @@ namespace DeepSpeechWPF try { - //Register instance of DeepSpeech - DeepSpeechClient.DeepSpeech deepSpeechClient = - new DeepSpeechClient.DeepSpeech("deepspeech-0.8.0-models.pbmm"); + //Register instance of STT + STTClient.STT client = + new STTClient.STT("coqui-stt-0.8.0-models.pbmm"); - SimpleIoc.Default.Register(() => deepSpeechClient); + SimpleIoc.Default.Register(() => client); SimpleIoc.Default.Register(); } catch (System.Exception ex) @@ -35,8 +35,8 @@ namespace DeepSpeechWPF protected override void OnExit(ExitEventArgs e) { base.OnExit(e); - //Dispose instance of DeepSpeech - ServiceLocator.Current.GetInstance()?.Dispose(); + //Dispose instance of STT + ServiceLocator.Current.GetInstance()?.Dispose(); } } } diff --git a/native_client/dotnet/DeepSpeechWPF/MainWindow.xaml b/native_client/dotnet/STTWPF/MainWindow.xaml similarity index 98% rename from native_client/dotnet/DeepSpeechWPF/MainWindow.xaml rename to native_client/dotnet/STTWPF/MainWindow.xaml index 4fbe5e72..569f6ad2 100644 --- a/native_client/dotnet/DeepSpeechWPF/MainWindow.xaml +++ b/native_client/dotnet/STTWPF/MainWindow.xaml @@ -1,10 +1,10 @@  /// Interaction logic for MainWindow.xaml diff --git a/native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs b/native_client/dotnet/STTWPF/Properties/AssemblyInfo.cs similarity index 91% rename from native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs rename to native_client/dotnet/STTWPF/Properties/AssemblyInfo.cs index f9ae7d76..f2e32102 100644 --- a/native_client/dotnet/DeepSpeechWPF/Properties/AssemblyInfo.cs +++ b/native_client/dotnet/STTWPF/Properties/AssemblyInfo.cs @@ -7,12 +7,12 @@ using System.Windows; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. 
-[assembly: AssemblyTitle("DeepSpeech.WPF")] +[assembly: AssemblyTitle("STT.WPF")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] -[assembly: AssemblyCompany("")] -[assembly: AssemblyProduct("DeepSpeech.WPF.SingleFiles")] -[assembly: AssemblyCopyright("Copyright © 2018")] +[assembly: AssemblyCompany("Coqui GmbH")] +[assembly: AssemblyProduct("STT.WPF.SingleFiles")] +[assembly: AssemblyCopyright("Copyright © 2018-2020 Mozilla, © 2021 Coqui GmbH")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] diff --git a/native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs b/native_client/dotnet/STTWPF/Properties/Resources.Designer.cs similarity index 94% rename from native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs rename to native_client/dotnet/STTWPF/Properties/Resources.Designer.cs index 2da2b4b2..2478decd 100644 --- a/native_client/dotnet/DeepSpeechWPF/Properties/Resources.Designer.cs +++ b/native_client/dotnet/STTWPF/Properties/Resources.Designer.cs @@ -8,7 +8,7 @@ // //------------------------------------------------------------------------------ -namespace DeepSpeech.WPF.Properties { +namespace STT.WPF.Properties { using System; @@ -39,7 +39,7 @@ namespace DeepSpeech.WPF.Properties { internal static global::System.Resources.ResourceManager ResourceManager { get { if (object.ReferenceEquals(resourceMan, null)) { - global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("DeepSpeech.WPF.Properties.Resources", typeof(Resources).Assembly); + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("STT.WPF.Properties.Resources", typeof(Resources).Assembly); resourceMan = temp; } return resourceMan; diff --git a/native_client/dotnet/DeepSpeechWPF/Properties/Resources.resx b/native_client/dotnet/STTWPF/Properties/Resources.resx similarity index 100% rename from native_client/dotnet/DeepSpeechWPF/Properties/Resources.resx rename to 
native_client/dotnet/STTWPF/Properties/Resources.resx diff --git a/native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs b/native_client/dotnet/STTWPF/Properties/Settings.Designer.cs similarity index 96% rename from native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs rename to native_client/dotnet/STTWPF/Properties/Settings.Designer.cs index 0f464bc4..de63d157 100644 --- a/native_client/dotnet/DeepSpeechWPF/Properties/Settings.Designer.cs +++ b/native_client/dotnet/STTWPF/Properties/Settings.Designer.cs @@ -8,7 +8,7 @@ // //------------------------------------------------------------------------------ -namespace DeepSpeech.WPF.Properties { +namespace STT.WPF.Properties { [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] diff --git a/native_client/dotnet/DeepSpeechWPF/Properties/Settings.settings b/native_client/dotnet/STTWPF/Properties/Settings.settings similarity index 100% rename from native_client/dotnet/DeepSpeechWPF/Properties/Settings.settings rename to native_client/dotnet/STTWPF/Properties/Settings.settings diff --git a/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj b/native_client/dotnet/STTWPF/STT.WPF.csproj similarity index 95% rename from native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj rename to native_client/dotnet/STTWPF/STT.WPF.csproj index 7f46a31e..160adafe 100644 --- a/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.csproj +++ b/native_client/dotnet/STTWPF/STT.WPF.csproj @@ -6,8 +6,8 @@ AnyCPU {54BFD766-4305-4F4C-BA59-AF45505DF3C1} WinExe - DeepSpeech.WPF - DeepSpeech.WPF + STT.WPF + STT.WPF v4.6.2 512 {60dc8134-eba5-43b8-bcc9-bb4bc16c2548};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} @@ -131,9 +131,9 @@ - + {56de4091-bbbe-47e4-852d-7268b33b971f} - DeepSpeechClient + STTClient diff --git a/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln b/native_client/dotnet/STTWPF/STT.WPF.sln similarity index 80% rename from native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln rename to 
native_client/dotnet/STTWPF/STT.WPF.sln index cd29025e..96c87ee5 100644 --- a/native_client/dotnet/DeepSpeechWPF/DeepSpeech.WPF.sln +++ b/native_client/dotnet/STTWPF/STT.WPF.sln @@ -3,9 +3,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.28307.421 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeech.WPF", "DeepSpeech.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "STT.WPF", "STT.WPF.csproj", "{54BFD766-4305-4F4C-BA59-AF45505DF3C1}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechClient", "..\DeepSpeechClient\DeepSpeechClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "STTClient", "..\STTClient\STTClient.csproj", "{56DE4091-BBBE-47E4-852D-7268B33B971F}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs b/native_client/dotnet/STTWPF/ViewModels/BindableBase.cs similarity index 98% rename from native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs rename to native_client/dotnet/STTWPF/ViewModels/BindableBase.cs index 909327ee..e5187cd6 100644 --- a/native_client/dotnet/DeepSpeechWPF/ViewModels/BindableBase.cs +++ b/native_client/dotnet/STTWPF/ViewModels/BindableBase.cs @@ -3,7 +3,7 @@ using System.Collections.Generic; using System.ComponentModel; using System.Runtime.CompilerServices; -namespace DeepSpeech.WPF.ViewModels +namespace STT.WPF.ViewModels { /// /// Implementation of to simplify models. 
diff --git a/native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs b/native_client/dotnet/STTWPF/ViewModels/MainWindowViewModel.cs similarity index 97% rename from native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs rename to native_client/dotnet/STTWPF/ViewModels/MainWindowViewModel.cs index 230fd42a..0ed4822b 100644 --- a/native_client/dotnet/DeepSpeechWPF/ViewModels/MainWindowViewModel.cs +++ b/native_client/dotnet/STTWPF/ViewModels/MainWindowViewModel.cs @@ -3,8 +3,8 @@ using CSCore; using CSCore.CoreAudioAPI; using CSCore.SoundIn; using CSCore.Streams; -using DeepSpeechClient.Interfaces; -using DeepSpeechClient.Models; +using STTClient.Interfaces; +using STTClient.Models; using GalaSoft.MvvmLight.CommandWpf; using Microsoft.Win32; using System; @@ -15,7 +15,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; -namespace DeepSpeech.WPF.ViewModels +namespace STT.WPF.ViewModels { /// /// View model of the MainWindow View. @@ -27,7 +27,7 @@ namespace DeepSpeech.WPF.ViewModels private const string ScorerPath = "kenlm.scorer"; #endregion - private readonly IDeepSpeech _sttClient; + private readonly ISTT _sttClient; #region Commands /// @@ -62,7 +62,7 @@ namespace DeepSpeech.WPF.ViewModels /// /// Stream used to feed data into the acoustic model. /// - private DeepSpeechStream _sttStream; + private Stream _sttStream; /// /// Records the audio of the selected device. 
@@ -75,7 +75,7 @@ namespace DeepSpeech.WPF.ViewModels private SoundInSource _soundInSource; /// - /// Target wave source.(16KHz Mono 16bit for DeepSpeech) + /// Target wave source.(16KHz Mono 16bit for STT) /// private IWaveSource _convertedSource; @@ -200,7 +200,7 @@ namespace DeepSpeech.WPF.ViewModels #endregion #region Ctors - public MainWindowViewModel(IDeepSpeech sttClient) + public MainWindowViewModel(ISTT sttClient) { _sttClient = sttClient; @@ -290,7 +290,7 @@ namespace DeepSpeech.WPF.ViewModels //read data from the converedSource //important: don't use the e.Data here //the e.Data contains the raw data provided by the - //soundInSource which won't have the deepspeech required audio format + //soundInSource which won't have the STT required audio format byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2]; int read; diff --git a/native_client/dotnet/DeepSpeechWPF/packages.config b/native_client/dotnet/STTWPF/packages.config similarity index 100% rename from native_client/dotnet/DeepSpeechWPF/packages.config rename to native_client/dotnet/STTWPF/packages.config diff --git a/native_client/dotnet/nupkg/deepspeech.nuspec.in b/native_client/dotnet/nupkg/STT.spec.in similarity index 54% rename from native_client/dotnet/nupkg/deepspeech.nuspec.in rename to native_client/dotnet/nupkg/STT.spec.in index 68e192c8..5c563bb6 100644 --- a/native_client/dotnet/nupkg/deepspeech.nuspec.in +++ b/native_client/dotnet/nupkg/STT.spec.in @@ -3,14 +3,14 @@ $NUPKG_ID $NUPKG_VERSION - DeepSpeech - DeepSpeech authors - DeepSpeech authors + Coqui STT + Coqui GmbH + Coqui GmbH MPL-2.0 - http://github.com/mozilla/DeepSpeech + http://github.com/coqui-ai/STT false - A library for running inference with a DeepSpeech model - Copyright (c) 2019-2020 Mozilla Corporation, 2020 DeepSpeech authors + A library for doing speech recognition using a Coqui STT model + Copyright (c) 2019-2020 Mozilla Corporation, (c) 2020 DeepSpeech authors, (c) 2021 Coqui GmbH native speech 
speech_recognition diff --git a/native_client/dotnet/nupkg/build/DeepSpeech.targets b/native_client/dotnet/nupkg/build/STT.targets similarity index 100% rename from native_client/dotnet/nupkg/build/DeepSpeech.targets rename to native_client/dotnet/nupkg/build/STT.targets diff --git a/native_client/generate_scorer_package.cpp b/native_client/generate_scorer_package.cpp index 0af0bfd9..dbc4bcd9 100644 --- a/native_client/generate_scorer_package.cpp +++ b/native_client/generate_scorer_package.cpp @@ -11,7 +11,7 @@ using namespace std; #include "ctcdecode/decoder_utils.h" #include "ctcdecode/scorer.h" #include "alphabet.h" -#include "deepspeech.h" +#include "coqui-stt.h" namespace po = boost::program_options; @@ -66,9 +66,9 @@ create_package(absl::optional alphabet_path, scorer.set_utf8_mode(force_bytes_output_mode.value()); scorer.reset_params(default_alpha, default_beta); int err = scorer.load_lm(lm_path); - if (err != DS_ERR_SCORER_NO_TRIE) { + if (err != STT_ERR_SCORER_NO_TRIE) { cerr << "Error loading language model file: " - << (err == DS_ERR_SCORER_UNREADABLE ? "Can't open binary LM file." : DS_ErrorCodeToErrorMessage(err)) + << (err == STT_ERR_SCORER_UNREADABLE ? "Can't open binary LM file." : STT_ErrorCodeToErrorMessage(err)) << "\n"; return 1; } @@ -103,7 +103,7 @@ main(int argc, char** argv) ("package", po::value(), "Path to save scorer package.") ("default_alpha", po::value(), "Default value of alpha hyperparameter (float).") ("default_beta", po::value(), "Default value of beta hyperparameter (float).") - ("force_bytes_output_mode", po::value(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. See for further explanation.") + ("force_bytes_output_mode", po::value(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. 
See for further explanation.") ; po::variables_map vm; diff --git a/native_client/java/Makefile b/native_client/java/Makefile index 90493621..21746f87 100644 --- a/native_client/java/Makefile +++ b/native_client/java/Makefile @@ -2,25 +2,25 @@ include ../definitions.mk -ARCHS := $(shell grep 'ABI_FILTERS' libdeepspeech/gradle.properties | cut -d'=' -f2 | sed -e 's/;/ /g') +ARCHS := $(shell grep 'ABI_FILTERS' libstt/gradle.properties | cut -d'=' -f2 | sed -e 's/;/ /g') GRADLE ?= ./gradlew all: apk clean: apk-clean - rm -rf *.java jni/deepspeech_wrap.cpp + rm -rf *.java jni/stt_wrap.cpp apk-clean: $(GRADLE) clean libs-clean: - rm -fr libdeepspeech/libs/*/libdeepspeech.so + rm -fr libstt/libs/*/libstt.so -libdeepspeech/libs/%/libdeepspeech.so: - -mkdir libdeepspeech/libs/$*/ - cp ${TFDIR}/bazel-out/$*-*/bin/native_client/libdeepspeech.so libdeepspeech/libs/$*/ +libstt/libs/%/libstt.so: + -mkdir libstt/libs/$*/ + cp ${TFDIR}/bazel-out/$*-*/bin/native_client/libstt.so libstt/libs/$*/ -apk: apk-clean bindings $(patsubst %,libdeepspeech/libs/%/libdeepspeech.so,$(ARCHS)) +apk: apk-clean bindings $(patsubst %,libstt/libs/%/libstt.so,$(ARCHS)) $(GRADLE) build maven-bundle: apk @@ -28,4 +28,4 @@ maven-bundle: apk $(GRADLE) zipMavenArtifacts bindings: clean ds-swig - $(DS_SWIG_ENV) swig -c++ -java -package org.deepspeech.libdeepspeech -outdir libdeepspeech/src/main/java/org/deepspeech/libdeepspeech/ -o jni/deepspeech_wrap.cpp jni/deepspeech.i + $(DS_SWIG_ENV) swig -c++ -java -package ai.coqui.libstt -outdir libstt/src/main/java/ai/coqui/libstt/ -o jni/stt_wrap.cpp jni/stt.i diff --git a/native_client/java/README.md b/native_client/java/README.md index 89ebc594..f21b61c4 100644 --- a/native_client/java/README.md +++ b/native_client/java/README.md @@ -1 +1 @@ -Full project description and documentation on GitHub: [https://github.com/mozilla/DeepSpeech](https://github.com/mozilla/DeepSpeech). 
+Full project description and documentation on [https://stt.readthedocs.io/](https://stt.readthedocs.io/). diff --git a/native_client/java/app/build.gradle b/native_client/java/app/build.gradle index 3b5b124a..11ef0ee5 100644 --- a/native_client/java/app/build.gradle +++ b/native_client/java/app/build.gradle @@ -4,7 +4,7 @@ android { compileSdkVersion 27 defaultConfig { - applicationId "org.deepspeech" + applicationId "ai.coqui.sttexampleapp" minSdkVersion 21 targetSdkVersion 27 versionName androidGitVersion.name() @@ -28,7 +28,7 @@ android { dependencies { implementation fileTree(dir: 'libs', include: ['*.jar']) - implementation project(':libdeepspeech') + implementation project(':libstt') implementation 'com.android.support:appcompat-v7:27.1.1' implementation 'com.android.support.constraint:constraint-layout:1.1.3' testImplementation 'junit:junit:4.12' diff --git a/native_client/java/app/src/androidTest/java/org/deepspeech/ExampleInstrumentedTest.java b/native_client/java/app/src/androidTest/java/ai/coqui/sttexampleapp/ExampleInstrumentedTest.java similarity index 85% rename from native_client/java/app/src/androidTest/java/org/deepspeech/ExampleInstrumentedTest.java rename to native_client/java/app/src/androidTest/java/ai/coqui/sttexampleapp/ExampleInstrumentedTest.java index ea6458a1..0a68a324 100644 --- a/native_client/java/app/src/androidTest/java/org/deepspeech/ExampleInstrumentedTest.java +++ b/native_client/java/app/src/androidTest/java/ai/coqui/sttexampleapp/ExampleInstrumentedTest.java @@ -1,4 +1,4 @@ -package org.deepspeech; +package ai.coqui.sttexampleapp; import android.content.Context; import android.support.test.InstrumentationRegistry; @@ -21,6 +21,6 @@ public class ExampleInstrumentedTest { // Context of the app under test.
Context appContext = InstrumentationRegistry.getTargetContext(); - assertEquals("org.deepspeech", appContext.getPackageName()); + assertEquals("ai.coqui.sttexampleapp", appContext.getPackageName()); } } diff --git a/native_client/java/app/src/main/AndroidManifest.xml b/native_client/java/app/src/main/AndroidManifest.xml index 668ef13f..dcf69307 100644 --- a/native_client/java/app/src/main/AndroidManifest.xml +++ b/native_client/java/app/src/main/AndroidManifest.xml @@ -1,6 +1,6 @@ + package="ai.coqui.sttexampleapp"> - + diff --git a/native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java b/native_client/java/app/src/main/java/ai/coqui/sttexampleapp/STTActivity.java similarity index 92% rename from native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java rename to native_client/java/app/src/main/java/ai/coqui/sttexampleapp/STTActivity.java index 22a15ba6..32395fdf 100644 --- a/native_client/java/app/src/main/java/org/deepspeech/DeepSpeechActivity.java +++ b/native_client/java/app/src/main/java/ai/coqui/sttexampleapp/STTActivity.java @@ -1,4 +1,4 @@ -package org.deepspeech; +package ai.coqui.sttexampleapp; import android.support.v7.app.AppCompatActivity; import android.os.Bundle; @@ -16,11 +16,11 @@ import java.io.IOException; import java.nio.ByteOrder; import java.nio.ByteBuffer; -import org.deepspeech.libdeepspeech.DeepSpeechModel; +import ai.coqui.libstt.STTModel; -public class DeepSpeechActivity extends AppCompatActivity { +public class STTActivity extends AppCompatActivity { - DeepSpeechModel _m = null; + STTModel _m = null; EditText _tfliteModel; EditText _audioFile; @@ -50,7 +50,7 @@ public class DeepSpeechActivity extends AppCompatActivity { this._tfliteStatus.setText("Creating model"); if (this._m == null) { // sphinx-doc: java_ref_model_start - this._m = new DeepSpeechModel(tfliteModel); + this._m = new STTModel(tfliteModel); this._m.setBeamWidth(BEAM_WIDTH); // sphinx-doc: java_ref_model_stop } @@ -124,7 +124,7 @@ 
public class DeepSpeechActivity extends AppCompatActivity { @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); - setContentView(R.layout.activity_deep_speech); + setContentView(R.layout.activity_stt); this._decodedString = (TextView) findViewById(R.id.decodedString); this._tfliteStatus = (TextView) findViewById(R.id.tfliteStatus); @@ -132,10 +132,10 @@ public class DeepSpeechActivity extends AppCompatActivity { this._tfliteModel = (EditText) findViewById(R.id.tfliteModel); this._audioFile = (EditText) findViewById(R.id.audioFile); - this._tfliteModel.setText("/sdcard/deepspeech/output_graph.tflite"); + this._tfliteModel.setText("/sdcard/stt/output_graph.tflite"); this._tfliteStatus.setText("Ready, waiting ..."); - this._audioFile.setText("/sdcard/deepspeech/audio.wav"); + this._audioFile.setText("/sdcard/stt/audio.wav"); this._startInference = (Button) findViewById(R.id.btnStartInference); } diff --git a/native_client/java/app/src/main/res/layout/activity_deep_speech.xml b/native_client/java/app/src/main/res/layout/activity_stt.xml similarity index 99% rename from native_client/java/app/src/main/res/layout/activity_deep_speech.xml rename to native_client/java/app/src/main/res/layout/activity_stt.xml index 02c383d4..849b9e8d 100644 --- a/native_client/java/app/src/main/res/layout/activity_deep_speech.xml +++ b/native_client/java/app/src/main/res/layout/activity_stt.xml @@ -4,7 +4,7 @@ xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent" android:layout_height="match_parent" - tools:context=".DeepSpeechActivity"> + tools:context=".STTActivity">