Compare commits

...

2194 Commits

Author SHA1 Message Date
003b399253 Fix up the RPi4 build 2021-12-04 16:28:48 +00:00
f008d10c49 Use tensorflow fork with rpi4ub-armv8 build target 2021-12-04 16:08:39 +00:00
0f698133aa Add an rpi4ub-armv8 build variant 2021-12-04 15:45:55 +00:00
8cea2cbfec Use my fork of the fork of tensorflow 2021-12-04 12:34:10 +00:00
Reuben Morais
dbd38c3a89
Merge pull request #2032 from coqui-ai/transcription-scripts-docs
[transcribe] Fix multiprocessing hangs, clean-up target collection, write docs
2021-12-03 16:46:48 +01:00
Reuben Morais
b43e710959 Docs for transcription with training package 2021-12-03 16:22:43 +01:00
Reuben Morais
ff24a8b917 Undo late-imports 2021-12-03 16:22:43 +01:00
Reuben Morais
479d963155 Set training pkg python-requires<3.8 (due to TF 1.15.4 limit) 2021-12-03 16:22:34 +01:00
Reuben Morais
d90bb60506 [transcribe] Fix multiprocessing hangs, clean-up target collection 2021-12-01 15:44:25 +01:00
Reuben Morais
5cefd7069c Use known paths for Scorer and Alphabet copy in export 2021-11-23 14:21:11 +01:00
Reuben Morais
154a67fb2c
Merge pull request #2026 from coqui-ai/save-scorer-alphabet-savedmodel
Save Scorer and alphabet with SavedModel exports
2021-11-19 20:07:32 +01:00
Reuben Morais
d6456ae4aa Save Scorer and alphabet with SavedModel exports 2021-11-19 19:40:00 +01:00
Reuben Morais
3020949075
Merge pull request #2025 from coqui-ai/various-fixes
Docs fixes, SavedModel export, transcribe.py revival
2021-11-19 16:10:20 +01:00
Reuben Morais
efdaa61e2c Revive transcribe.py
Update to use Coqpit based config handling, fix multiprocesing setup, and add CI coverage.
2021-11-19 13:57:44 +01:00
Reuben Morais
419b15b72a Allow exporting as SavedModel 2021-11-18 13:48:52 +01:00
Reuben Morais
6a9bd1e6b6 Add usage instructions for C API 2021-11-17 14:07:22 +01:00
Reuben Morais
922d668155
Merge pull request #2024 from juliandarley/fix-shlex-typo
Fix typo in client.py - shlex in line 17
2021-11-17 13:05:30 +01:00
Julian Darley
8ed0a827de Fix typo in client.py - shlex in line 17 2021-11-17 08:33:32 +00:00
Reuben Morais
11c2edb068
Merge pull request #2018 from coqui-ai/node-electron-version-bump
Update NodeJS and ElectronJS build/test versions to supported releases
2021-11-12 22:47:41 +01:00
Reuben Morais
e7c28ca3c9 Remove outdated comment in supported platforms doc [skip ci] 2021-11-12 22:47:15 +01:00
Reuben Morais
2af6f8da89 Explicitly name TF build cache destination file
GitHub's API has stopped sending the artifact name as the file name, so we ended up with a file matching the artifact ID.
Name the full file path explicitly so there's no room for changes.
2021-11-12 22:47:02 +01:00
Reuben Morais
a5c981bb48 Update NodeJS and ElectronJS build/test versions to supported releases 2021-11-12 22:47:02 +01:00
Reuben Morais
23af8bd095 Bump version to v1.1.0-alpha.1 2021-10-31 21:42:00 +01:00
Reuben Morais
2b955fc70f
Merge pull request #2004 from coqui-ai/flashlight-docs
Improve decoder docs and include in RTD
2021-10-31 21:40:04 +01:00
Reuben Morais
90feb63894 Improve decoder package docs and include in RTD 2021-10-31 20:12:37 +01:00
Reuben Morais
91f1307de4 Pin docutils version as 0.18 release breaks build
Build breaks when writing output for AUGMENTATION.rst with error:

AttributeError: 'Values' object has no attribute 'section_self_link'
2021-10-31 16:46:03 +01:00
Reuben Morais
3d1e3ed3ba Don't include RELEASE_NOTES for pre-releases [skip ci] 2021-10-30 17:31:04 +02:00
Reuben Morais
9a2c2028c7 Bump version to v1.1.0-alpha.0 2021-10-30 17:24:05 +02:00
Reuben Morais
6ef733be54
Merge pull request #2001 from coqui-ai/decoder-flashlight
Expose Flashlight LexiconDecoder/LexiconFreeDecoder in decoder package
2021-10-30 17:19:41 +02:00
Reuben Morais
a61180aeae Fix Flashlight multiplatform build 2021-10-30 16:23:44 +02:00
Reuben Morais
391036643c debug 2021-10-30 16:23:44 +02:00
Reuben Morais
04f62ac9f7 Exercise training graph inference/Flashlight decoder in extra training tests 2021-10-30 14:59:32 +02:00
Reuben Morais
755fb81a62 Expose Flashlight LexiconDecoder/LexiconFreeDecoder 2021-10-30 14:59:32 +02:00
Reuben Morais
5f2ff85fe8
Merge pull request #1977 from Legion2/patch-1
fixed duplicate deallocation of stream in Swift STTStream
2021-10-30 10:19:17 +02:00
Reuben Morais
489e49f698
Merge pull request #1990 from JRMeyer/evaluate_tflite
Update evaluate_tflite.py script for Coqpit
2021-10-30 10:18:56 +02:00
Reuben Morais
65e66117e2
Merge pull request #1998 from coqui-ai/aar-pack-deps
Package dynamic deps in AAR
2021-10-29 20:48:43 +02:00
Reuben Morais
a726351341 Bump Windows TF build cache due to worker upgrade 2021-10-29 20:05:22 +02:00
Reuben Morais
d753431d11 Fix build on Windows after internal GitHub Actions MSYS2 changes 2021-10-29 20:05:22 +02:00
Reuben Morais
83b40b2532 Rehost PCRE package to avoid external outages interrupting CI 2021-10-25 11:03:19 +02:00
Reuben Morais
1f7b43f94e Package libkenlm.so, libtensorflowlite.so and libtflitedelegates.so in AAR 2021-10-25 11:03:19 +02:00
Reuben Morais
5ff8d11393 Use export beam width by default in evaluation 2021-10-13 13:36:30 +02:00
Josh Meyer
157ce340b6 Update evaluate_tflite.py script for Coqpit 2021-10-07 14:46:03 -04:00
Reuben Morais
27584037f8 Bump version to v1.0.0 2021-10-04 16:30:39 +02:00
Reuben Morais
29e980473f Docs changes for 1.0.0 2021-10-04 16:30:39 +02:00
Reuben Morais
0b36745338 Bump version to v0.10.0-alpha.29 2021-10-02 14:18:39 +02:00
Reuben Morais
c6a91dad2a Fix permissions for Docker push, tagging of prereleases 2021-10-02 14:18:26 +02:00
Reuben Morais
1233fc7b71 Bump version to v0.10.0-alpha.28 2021-10-02 13:47:04 +02:00
Reuben Morais
bd45ecf56e Centralized handling of git tag/VERSION checks 2021-10-02 13:46:52 +02:00
Reuben Morais
a4faa4475a Bump version to v0.10.0-alpha.27 2021-10-02 13:40:28 +02:00
Reuben Morais
18812376dc
Merge pull request #1981 from coqui-ai/aar-publish
AAR build+publish
2021-10-02 13:39:28 +02:00
Reuben Morais
62effd9acb AAR build+publish 2021-10-02 13:38:46 +02:00
Reuben Morais
8a64ed2a1e Bump version to v0.10.0-alpha.26 2021-09-28 13:58:00 +02:00
Reuben Morais
178cdacf5e
Merge pull request #1980 from coqui-ai/docker-kenlm-base
Build KenLM in same base as final image
2021-09-28 13:57:17 +02:00
Reuben Morais
0b60e4dbbb Build KenLM in same base as final img 2021-09-28 13:38:11 +02:00
Leon Kiefer
fab1bbad73
fixed duplicate deallocation of stream
streamCtx must be unset after STT_FreeStream is called in STT_FinishStreamWithMetadata; otherwise STT_FreeStream is called again on destruction of the STTStream, resulting in EXC_BAD_ACCESS errors
2021-09-26 12:56:28 +02:00
Reuben Morais
5691d4e053
Merge pull request #1975 from coqui-ai/android-builds
Android builds
2021-09-22 13:09:26 +02:00
Reuben Morais
c536d1bd01 Add Android build tasks 2021-09-22 11:55:19 +02:00
Reuben Morais
d4091badf9 Rename host-build action to libstt-build 2021-09-21 18:09:51 +02:00
Reuben Morais
1d75af5ab4 Fix and improve build instructions for Android and RPi 2021-09-21 18:09:51 +02:00
Reuben Morais
8bd5dac837 Declare delegate dependencies on Android 2021-09-21 18:09:51 +02:00
Josh Meyer
46bae2f3fc
Fix typo in link to colab 2021-09-16 06:59:45 -04:00
Josh Meyer
df67678220
Merge pull request #1966 from JRMeyer/cv-notebook
Python notebook for training on Common Voice
2021-09-16 06:51:30 -04:00
Reuben Morais
fd719ac013
Merge pull request #1968 from coqui-ai/rehost-sox-win
Rehost SoX Windows package to avoid Sourceforge outages
2021-09-16 11:58:17 +02:00
Reuben Morais
4861557a03 Rehost SoX Windows package to avoid Sourceforge outages 2021-09-16 11:22:32 +02:00
Reuben Morais
835d657648
Merge pull request #1967 from coqui-ai/batch-shuffling
Add support for shuffling batches after N epochs (Fixes #1901)
2021-09-16 11:05:06 +02:00
Reuben Morais
72599be9d4 Add support for shuffling batches after N epochs 2021-09-16 10:40:27 +02:00
Josh Meyer
7cbe879fc6 Use python 3.7, not 3.8 2021-09-16 03:36:32 -04:00
Josh Meyer
8cfc1163e2 Add checkout action 2021-09-16 03:28:11 -04:00
Josh Meyer
c78f98a7bc Add separate job to CI for notebook tests 2021-09-16 03:19:09 -04:00
Josh Meyer
90d4e43c58
Use sudo for installing opus things 2021-09-15 12:22:05 -04:00
Josh Meyer
be7500c8b7
Fix Typo 2021-09-15 12:04:34 -04:00
Josh Meyer
1a55ce8078
Add missing opus tools to CI 2021-09-15 12:03:30 -04:00
Josh Meyer
242d2eff2c
Add missing jupyter install in CI 2021-09-15 10:09:05 -04:00
Josh Meyer
56d1282642 Merge branch 'cv-notebook' of github.com:JRMeyer/STT into cv-notebook 2021-09-15 09:20:16 -04:00
Josh Meyer
bd7809421d Add notebooks to CI workflow 2021-09-15 09:19:46 -04:00
Josh Meyer
5e1e810102
Merge branch 'coqui-ai:main' into cv-notebook 2021-09-15 09:09:00 -04:00
Josh Meyer
6405bd1758 Add CI tests for notebooks 2021-09-15 09:08:12 -04:00
Josh Meyer
8a3cea8b6d Cosmetic changes 2021-09-15 07:57:53 -04:00
Josh Meyer
f6a64e7dd8 Typo 2021-09-15 07:19:35 -04:00
Josh Meyer
cbd3db9d28 Cosmetic notebook changes 2021-09-15 07:16:08 -04:00
Josh Meyer
2729da33a8 More notebook work 2021-09-15 06:54:25 -04:00
Reuben Morais
feeb2a222d Bump version to v0.10.0-alpha.25 2021-09-15 11:09:54 +02:00
Reuben Morais
76267ebdff Rename libstt and native_client archives when publishing on GitHub Actions 2021-09-15 11:08:20 +02:00
Josh Meyer
0e8920ed63
Use table to organize notebooks 2021-09-15 04:09:00 -04:00
Josh Meyer
903c2b4aca
Install STT from pypi in notebook 2021-09-15 04:03:27 -04:00
Josh Meyer
5201c2a10c
Install STT from pypi in notebook 2021-09-15 04:02:08 -04:00
Josh Meyer
7085fd3ed3 Add notebook for CV 2021-09-14 11:50:35 -04:00
Reuben Morais
ef8825f5f6 Bump version to v0.10.0-alpha.24 2021-09-14 13:06:22 +02:00
Reuben Morais
4744d0c9e4 Separate brace expansion into two upload calls 2021-09-14 13:05:51 +02:00
Reuben Morais
93e743d171 Bump version to v0.10.0-alpha.23 2021-09-14 12:47:36 +02:00
Reuben Morais
e0e5b0391c Don't overwrite asset_name for multiple files 2021-09-14 12:47:25 +02:00
Reuben Morais
7a20c9bd90 Bump version to v0.10.0-alpha.22 2021-09-14 12:34:10 +02:00
Reuben Morais
473d1a8e4f Fix filename when uploading multiple assets, upload training package 2021-09-14 12:33:54 +02:00
Reuben Morais
92aff6a8ef Bump version to v0.10.0-alpha.21 2021-09-14 12:22:10 +02:00
Reuben Morais
220cc8ab15 Fix escaping of variable when creating new release 2021-09-14 12:21:56 +02:00
Reuben Morais
39e57b522a Bump version to v0.10.0-alpha.20 2021-09-14 12:15:29 +02:00
Reuben Morais
2e588bd0b8 Fix GitHub upload logic for multiple assets 2021-09-14 12:15:19 +02:00
Reuben Morais
abc0399fdb Bump version to v0.10.0-alpha.19 2021-09-14 11:47:34 +02:00
Reuben Morais
26b578c1c7 Checkout source for upload-release-asset action 2021-09-14 11:47:10 +02:00
Reuben Morais
6f4a3c1200 Bump version to v0.10.0-alpha.18 2021-09-14 09:48:58 +02:00
Reuben Morais
e8a5e91151 Fix syntax errors in tag scripts 2021-09-14 09:41:35 +02:00
Reuben Morais
810164d679 Bump version to v0.10.0-alpha.16 2021-09-13 18:35:03 +02:00
Reuben Morais
aed43cc988 Fix syntax error in GitHub Release asset upload task 2021-09-13 18:34:48 +02:00
Reuben Morais
d437ecc69f Bump version to v0.10.0-alpha.15 2021-09-13 17:53:35 +02:00
Josh Meyer
ba581501f4
Merge pull request #1965 from JRMeyer/notebooks
Fix notebook syntax after train.py was split into parts
2021-09-13 09:31:14 -04:00
Josh Meyer
638874e925 Fix notebook syntax after train.py was split into parts 2021-09-13 09:29:55 -04:00
Reuben Morais
b5e8ebb943 Fix quickstart docs [skip ci] 2021-09-13 13:08:14 +02:00
Reuben Morais
822019bf05
Merge pull request #1961 from coqui-ai/upload-wheels-release
Upload built artifacts to GitHub releases
2021-09-09 18:50:39 +02:00
Reuben Morais
01c992caef Upload built artifacts to GitHub releases 2021-09-09 18:13:18 +02:00
Reuben Morais
97a2cb21ee
Merge pull request #1960 from coqui-ai/fix-dockerfile-build
Fix Dockerfile.build build after TFLite changes
2021-09-08 12:17:01 +02:00
Reuben Morais
e6d5a0ca8d Fix linter error [skip ci] 2021-09-08 12:16:25 +02:00
Reuben Morais
738874fb6f Fix Dockerfile.build build after TFLite changes 2021-09-08 12:00:11 +02:00
Reuben Morais
28f107fb96
Merge pull request #1956 from jeremiahrose/build-local-source
Fix #1955 Use local source instead of redownloading in Dockerfile.build
2021-09-08 11:11:39 +02:00
Jeremiah Rose
0917206827 Update Dockerfile.build documentation in DEPLOYMENT.rst 2021-09-08 10:20:24 +10:00
Reuben Morais
909b343ce0 Fix header logo scaling 2021-09-07 22:03:09 +02:00
Reuben Morais
a51cc78a3b git add missing logo image [skip ci] 2021-09-07 18:30:48 +02:00
Reuben Morais
083a9e1ecc Add logo and wordmark to docs [skip ci] 2021-09-07 18:28:52 +02:00
Reuben Morais
6635668eb3
Merge pull request #1951 from coqui-ai/docs-pass
Documentation cleanup pass to match recent changes
2021-09-07 10:15:46 +02:00
Jeremiah Rose
d85187aa44 Use local source instead of redownloading in Dockerfile.build 2021-09-07 09:45:31 +10:00
Reuben Morais
186bb63b57 Documentation cleanup pass to match recent changes 2021-08-27 14:24:23 +02:00
Reuben Morais
6214816e26 Merge branch 'publish-training-code' (Fixes #1950) 2021-08-27 13:12:14 +02:00
Reuben Morais
eb19d271fd Publish training package on PyPI 2021-08-27 13:11:27 +02:00
Reuben Morais
f94d16bcc3
Merge pull request #1948 from coqui-ai/remove-exception-box
Remove ExceptionBox and remember_exception
2021-08-26 21:13:18 +02:00
Reuben Morais
33c2190015 Remove ExceptionBox and remember_exception
TensorFlow already handles surfacing dataset exceptions internally.
2021-08-26 19:58:17 +02:00
Reuben Morais
497c828dd7
Merge pull request #1947 from coqui-ai/dataset-split
Automatic dataset split/alphabet generation
2021-08-26 19:45:06 +02:00
Reuben Morais
412de47623 Introduce --auto_input_dataset flag for input formatting
Automatically split data into sets and generate alphabet.
2021-08-26 18:03:32 +02:00
Reuben Morais
8458352255 Disable side-effects when importing train/evaluate scripts 2021-08-26 15:24:11 +02:00
Reuben Morais
b62fa678e6 Remove dead code 2021-08-26 12:00:15 +02:00
Reuben Morais
07ed417627 Bump version to v0.10.0-alpha.14 2021-08-26 10:57:27 +02:00
Reuben Morais
66b8a56454
Merge pull request #1945 from coqui-ai/alphabet-loading-generation
Convenience features for alphabet loading/saving/generation
2021-08-25 20:35:09 +02:00
Reuben Morais
02adea2d50 Generate and save alphabet automatically if dataset is fully specified 2021-08-25 19:39:05 +02:00
Reuben Morais
2b5a844c05 Load alphabet alongside checkpoint if present, some config fixes/cleanup 2021-08-25 19:39:03 +02:00
Reuben Morais
87f0a371b1 Serialize alphabet alongside checkpoint 2021-08-25 19:38:30 +02:00
Reuben Morais
5afe3c6e59
Merge pull request #1946 from coqui-ai/training-submodules
Split train.py into separate modules
2021-08-25 19:37:53 +02:00
Reuben Morais
2fd98de56f Split train.py into separate modules
Currently train.py is overloaded with many independent features.
Understanding the code and what the result of a training call
will be requires untangling the entire script. It's also an
error-prone UX. This is a first step toward separating
independent parts into their own scripts.
2021-08-25 18:57:30 +02:00
Reuben Morais
71da178138
Merge pull request #1942 from coqui-ai/nc-api-boundary
Python training API cleanup, mark nodes known by native client
2021-08-23 12:57:08 +02:00
Reuben Morais
3dff38ab3d Point to newer native_client build with lower glibc requirement [skip ci] 2021-08-20 16:35:25 +02:00
Reuben Morais
80a109b04e Pin Advanced Training Topics to docs sidebar [skip ci] (Fixes #1893) 2021-08-19 18:51:19 +02:00
Reuben Morais
fb2691ad70 Fix link to training with CV data in Playbook [skip ci] (Fixes #1932) 2021-08-19 18:44:58 +02:00
Reuben Morais
4c3537952a Fix lm_optimizer.py to use new Config/flags/logging setup 2021-08-19 18:42:07 +02:00
Reuben Morais
f9556d2236 Add comments marking nodes with names/shapes known by native client 2021-08-19 18:33:48 +02:00
Reuben Morais
f90408d3ab Move early_training_checks to train function 2021-08-19 18:33:32 +02:00
Reuben Morais
ad7335db0e Fix docs code listing for flags [skip ci] 2021-08-19 18:25:08 +02:00
Reuben Morais
392f4dbb25 Merge branch 'downgrade-docker-train-base' (Fixes #1941) 2021-08-19 18:22:28 +02:00
Reuben Morais
3995ec62c5 Bump Windows TF build cache due to upgraded MSVC 2021-08-19 18:22:18 +02:00
Reuben Morais
2936c72c08 Build and publish Docker train image on tag 2021-08-19 18:22:18 +02:00
Reuben Morais
32b44c5447 Downgrade training Dockerfile base image to one that has TFLite support
See https://github.com/NVIDIA/tensorflow/issues/16
2021-08-19 11:27:37 +02:00
Reuben Morais
4fc60bf5e9
Merge pull request #1938 from coqui-ai/non-quantized-export
Non quantized export + Better error message on missing alphabet
2021-08-12 11:29:21 +02:00
Reuben Morais
f71e32735f Add a more explicit error message when alphabet is not specified 2021-08-06 16:55:42 +02:00
Reuben Morais
3cff3dd0de Add an --export_quantize flag to control TFLite export quantization 2021-08-06 16:52:55 +02:00
Reuben Morais
285b524299
Merge pull request #1931 from coqui-ai/pull_request_template 2021-08-03 15:28:45 +02:00
Reuben Morais
c3cc7aae2e Bump version to v0.10.0-alpha.13 2021-08-02 21:08:00 +02:00
Reuben Morais
b5db9b2f41 Merge branch 'npm-publish' (Fixes #1930) 2021-08-02 21:07:28 +02:00
Reuben Morais
f3df9b16d5 Publish Node package on npmjs.com on tags 2021-08-02 20:42:12 +02:00
kdavis-coqui
d2bcbcc6b7 Added CLA info to pull request template 2021-08-02 17:45:19 +02:00
Reuben Morais
800ddae12f Bump version to v0.10.0-alpha.12 2021-08-01 23:56:46 +02:00
Reuben Morais
5a5db45c7e
Merge pull request #1923 from coqui-ai/tf-libstt-manylinux
Build TensorFlow+libstt+Python packages in manylinux_2_24 containers
2021-08-01 23:56:23 +02:00
Reuben Morais
9d44e2f506 Disable wrapping of struct ctors to workaround NodeJS 16.6 ABI break 2021-08-01 23:25:23 +02:00
Reuben Morais
1a423a4c8d Force plat name in favor of auditwheel for Python packages
Auditwheel can't properly handle the shared libraries we bundle
and ends up copying some of them, producing a package with
duplicated images.
2021-08-01 23:00:57 +02:00
Reuben Morais
8f0b759103 Build TensorFlow+libstt+Py pkgs on manylinux_2_24 2021-08-01 23:00:57 +02:00
Josh Meyer
90a067df49
Merge pull request #1926 from JRMeyer/progressbar-to-tqdm
Change progressbar to tqdm
2021-07-30 17:18:47 -04:00
Josh Meyer
b77d33a108
Merge pull request #1927 from JRMeyer/tfv1-moving
Move tfv1 calls inside high-level functions
2021-07-30 17:18:25 -04:00
Josh Meyer
256af35a61 Move tfv1 calls inside high-level functions 2021-07-30 13:25:36 -04:00
Josh Meyer
da23122cca Add SIMPLE_BAR for other scripts 2021-07-30 13:09:14 -04:00
Josh Meyer
fb2d99e9e0 Change progressbar to tqdm 2021-07-30 12:52:09 -04:00
Josh Meyer
df26eca4d2
Merge pull request #1920 from JRMeyer/transfer-learning-notebook
Fix config checkpoint handling and add notebook
2021-07-30 10:15:12 -04:00
Josh Meyer
1e79b8703d checkpoint_dir always overrides {save,load}_checkpoint_dir 2021-07-30 07:57:11 -04:00
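A minimal sketch of the override rule this commit describes, using the field names from the commit message; the surrounding Coqpit-style config class and its defaults are assumptions, not the project's actual code:

```python
from dataclasses import dataclass


@dataclass
class CheckpointConfig:
    # Field names taken from the commit message; everything else is illustrative.
    checkpoint_dir: str = ""
    save_checkpoint_dir: str = ""
    load_checkpoint_dir: str = ""

    def __post_init__(self):
        # checkpoint_dir, when set, always overrides both specific directories.
        if self.checkpoint_dir:
            self.save_checkpoint_dir = self.checkpoint_dir
            self.load_checkpoint_dir = self.checkpoint_dir
```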
Josh Meyer
979dccdf58 Fix typo 2021-07-30 06:48:33 -04:00
Josh Meyer
aeeb2549b1 Install after git clone 2021-07-30 06:48:33 -04:00
Josh Meyer
6c26f616ba Fix typo in notebooks 2021-07-30 06:48:33 -04:00
Josh Meyer
4eb5d7814a Fix typo in notebooks 2021-07-30 06:48:33 -04:00
Josh Meyer
bc1839baf4 Add cell to install STT in notebooks 2021-07-30 06:48:33 -04:00
Josh Meyer
4f14420a25 Fix broken colab button 2021-07-30 06:48:33 -04:00
Josh Meyer
f4f0c1dba9 Add README with Colab links 2021-07-30 06:48:33 -04:00
Josh Meyer
e95c8fe0b0 Fix typo in notebook 2021-07-30 06:48:33 -04:00
Josh Meyer
73f7b765ef Fix config issue and add notebook 2021-07-30 06:48:33 -04:00
Reuben Morais
8c5c35a0ad Bump version to v0.10.0-alpha.11 2021-07-29 17:45:16 +02:00
Reuben Morais
58a8e813e4
Merge pull request #1921 from coqui-ai/tflite-only
Remove full TF backend from native client and CI
2021-07-29 17:44:38 +02:00
Reuben Morais
d119957586 Update supported architectures doc 2021-07-29 16:45:30 +02:00
Reuben Morais
c69735e3b6 Build swig and decoder on manylinux_2_24 2021-07-29 16:45:30 +02:00
Reuben Morais
42ebbf9120 Remove full TF backend 2021-07-28 17:19:27 +02:00
Reuben Morais
2020f1b15a Undo accidental auth removal in check_artifact_exists action 2021-07-28 17:19:27 +02:00
Josh Meyer
4cd1a1cec4
Merge pull request #1919 from coqui-ai/JRMeyer-linter
Exclude smoke testing data from linter
2021-07-27 10:14:40 -04:00
Josh Meyer
c9840e59b1
Exclude smoke testing data from linter 2021-07-27 08:06:02 -04:00
Josh Meyer
4b4f00da56
Merge pull request #1918 from coqui-ai/JRMeyer-alphabet-patch
Add missing space to russian sample alphabet
2021-07-27 08:01:11 -04:00
Josh Meyer
1c7539e9c9
Add missing space to russian sample alphabet 2021-07-27 05:40:20 -04:00
Reuben Morais
4b2af9ce6b
Merge pull request #1909 from coqui-ai/kenlm-dynamic
Dynamically link KenLM and distribute with packages
2021-07-27 00:36:15 +02:00
Reuben Morais
3a695f9c1c Fix packaging and linking of libkenlm on Windows 2021-07-26 19:30:25 +02:00
Reuben Morais
36923c1e93 Enable -fexceptions in decoder builds 2021-07-26 12:50:53 +02:00
Reuben Morais
8a40ff086d Force link against KenLM on Windows 2021-07-26 12:43:49 +02:00
Reuben Morais
cbbdcbf246 Revert factoring out of decoder library build definition 2021-07-26 12:43:49 +02:00
Reuben Morais
7846f4602e Export needed KenLM symbols manually for Windows 2021-07-26 12:42:59 +02:00
Reuben Morais
b7428d114e Dynamically link KenLM and distribute with packages 2021-07-26 12:41:46 +02:00
Reuben Morais
579c36c98c
Merge pull request #1912 from JRMeyer/jupyter
Add Jupyter notebook and Dockerfile with Jupyter
2021-07-24 20:33:06 +02:00
Josh Meyer
3119911657 Next core Coqui STT docker image will have notebooks dir 2021-07-23 12:17:55 -04:00
Josh Meyer
7d40d5d686 Specify latest for base Coqui STT docker image 2021-07-23 12:16:26 -04:00
Josh Meyer
ea82ab4cb8 Remove old unneeded files 2021-07-23 12:15:12 -04:00
Josh Meyer
9f7fda14cb Add first Jupyter notebook 2021-07-23 12:12:02 -04:00
Josh Meyer
d1b0aadfbc
Merge pull request #1910 from JRMeyer/main
Bump VERSION to 0.10.0-alpha.10
2021-07-22 10:31:59 -04:00
Josh Meyer
9b5176321d Bump VERSION to 0.10.0-alpha.10 2021-07-22 10:30:14 -04:00
Josh Meyer
c19faeb5d0
Merge pull request #1908 from JRMeyer/config-logic 2021-07-22 08:15:14 -04:00
Josh Meyer
b4827fa462 Formatting changes from pre-commit 2021-07-22 05:39:45 -04:00
Josh Meyer
b6d40a3451 Add CI test for in-script variable setting 2021-07-21 15:35:33 -04:00
Josh Meyer
414748f1fe Remove extra imports 2021-07-21 15:23:56 -04:00
Josh Meyer
ec37b3324a Add example python script with initialize_globals_from_args() 2021-07-21 15:13:39 -04:00
Josh Meyer
ae9280ef1a Cleaner lines for CI args 2021-07-21 12:09:43 -04:00
Josh Meyer
a050b076cb Cleaner lines for CI args 2021-07-21 12:07:24 -04:00
Josh Meyer
3438dd2beb Fix checkpoint setting logic 2021-07-21 11:53:11 -04:00
Josh Meyer
90ce16fa15 Shortening some print statements 2021-07-21 09:37:32 -04:00
Josh Meyer
6da7b5fc26 Raise error when alphabet and bytes_mode both specified 2021-07-21 08:44:57 -04:00
Josh Meyer
0389560a92 Remove alphabet.txt from CI tests with bytes_output_mode 2021-07-21 07:16:58 -04:00
Josh Meyer
4342906c50 Better file_size handling 2021-07-21 06:37:25 -04:00
Josh Meyer
f6bd7bcf7d Handle file_size passed as int 2021-07-21 06:23:19 -04:00
Josh Meyer
4dc565beca Move checking logic into __post_init__() 2021-07-21 05:00:05 -04:00
Josh Meyer
5b4fa27467 Add required alphabet path to CI tests 2021-07-20 09:50:47 -04:00
Josh Meyer
920e92d68a Remove check_values and default alphabet 2021-07-20 09:34:44 -04:00
Josh Meyer
59e32556a4 Currently working notebook 2021-07-20 09:07:54 -04:00
Josh Meyer
848a612efe Import _SttConfig 2021-07-20 08:41:22 -04:00
Josh Meyer
afbcc01369 Break out config instantiation and setting 2021-07-20 08:13:07 -04:00
Josh Meyer
a37ca2ec27 Simplify dockerfile and add notebook 2021-07-20 04:20:57 -04:00
Josh Meyer
d0f8eb96cd Take out OVH run-time params 2021-07-16 11:52:51 -04:00
Josh Meyer
649bc53536 Remove extra installs from Dockerfile 2021-07-16 11:14:00 -04:00
Josh Meyer
ef5d472b29
Merge pull request #1894 from JRMeyer/dockerfile
Use multi-stage build process for Dockerfile.train
2021-07-16 10:05:33 -04:00
Josh Meyer
2c26497d96 Cleanup for smaller containers 2021-07-16 09:32:19 -04:00
Josh Meyer
f062f75e17 working on dockerfile with jupyter support 2021-07-16 09:24:00 -04:00
Reuben Morais
ba24f010eb Bump version to v0.10.0-alpha.9 2021-07-15 20:56:48 +02:00
Reuben Morais
b7380a6928 Comment out publishing of armv7 and aarch64 wheels 2021-07-15 20:56:28 +02:00
Reuben Morais
b12aa69922 Bump version to v0.10.0-alpha.8 2021-07-15 18:32:17 +02:00
Reuben Morais
5ded871a5e Separate PyPI publish jobs per API token and publish decoder package 2021-07-15 18:31:45 +02:00
Reuben Morais
550f5368a8
Merge pull request #1902 from coqui-ai/upload-tf-cache-to-release
Save build cache as release asset instead of artifact
2021-07-15 18:20:20 +02:00
Reuben Morais
460209d209 Pick up MSVC version automatically to handle worker upgrades cleanly 2021-07-15 16:27:37 +02:00
Josh Meyer
99fa146253 Fix wording in error message 2021-07-15 09:54:54 -04:00
Josh Meyer
52ecb5dbe2 Remove unneeded assets and only copy kenlm bins 2021-07-15 09:53:04 -04:00
Reuben Morais
d6da5191f5
Merge pull request #1903 from JRMeyer/minor-branding 2021-07-15 12:30:45 +02:00
Josh Meyer
a29db11a62 Change DS to STT in top python scripts 2021-07-15 06:26:06 -04:00
Reuben Morais
f026c75dae Setup MSVC env in Win TF build job 2021-07-15 11:06:44 +02:00
Reuben Morais
ed171b2efd Save build cache as release asset instead of artifact 2021-07-15 10:10:24 +02:00
Reuben Morais
2d9cbb2f06 Fix PYTHON_BIN_PATH in Windows build 2021-07-14 14:13:15 +02:00
Reuben Morais
283379775e Bump version to v0.10.0-alpha.7 2021-07-13 18:40:12 +02:00
Reuben Morais
432ca99db1 Add job to publish Python artifacts to PyPI on tag pushes 2021-07-13 18:40:12 +02:00
Reuben Morais
f1c0559406 Exclude a few more jobs from non-PR triggers 2021-07-13 18:25:58 +02:00
Reuben Morais
7b7f52f44c Enable build-and-test workflow on tags 2021-07-13 18:25:58 +02:00
Josh Meyer
8c65cbf064
Add missing Sox library for processing MP3 data 2021-06-14 16:09:26 -04:00
Josh Meyer
0385dfb5aa Fix broken multi-line error message 2021-06-14 10:32:38 -04:00
Josh Meyer
d3b337af09 Fixed error print statement 2021-06-14 07:20:34 -04:00
Josh Meyer
75fbd0ca30 Error message when KenLM build fails 2021-06-14 05:04:01 -04:00
Josh Meyer
769b310919 Use multistage building in dockerfile 2021-06-11 14:36:23 -04:00
Josh Meyer
6f2c7a8a7b
Merge pull request #1892 from JRMeyer/update-kenlm
Update kenlm submodule
2021-06-11 08:14:27 -04:00
Josh Meyer
806a16d1c0 Update kenlm submodule 2021-06-11 08:09:53 -04:00
Reuben Morais
866e15af7f Comment out isort pre-commit hook until we can figure out discrepancies between macOS and Linux 2021-06-10 16:57:57 +02:00
Reuben Morais
f2a21b2258
Merge pull request #1890 from JRMeyer/pre-commit-hook-changes
Add changes from pre-commit hook
2021-06-10 16:56:45 +02:00
Josh Meyer
9252cef138 Add changes from pre-commit hook 2021-06-10 10:49:54 -04:00
Reuben Morais
2e5efe5e15
Merge pull request #1889 from JRMeyer/dockerfile
Use NVIDIA image in Dockerfile
2021-06-10 16:01:52 +02:00
Josh Meyer
38e06e4635 Use NVIDIA image in Dockerfile 2021-06-10 09:47:15 -04:00
Reuben Morais
bf07f35420
Merge pull request #1887 from JRMeyer/dockerfile
Add dependencies for opus training
2021-06-10 00:00:21 +02:00
Josh Meyer
eba8e1ad4a Add dependencies for opus training 2021-06-09 14:02:07 -04:00
Reuben Morais
4ebcbea8b3
Merge pull request #1874 from erksch/ios-deployment-target-9.0
Set iOS deployment target to 9.0
2021-05-25 16:31:16 +02:00
Reuben Morais
a2515397cf
Merge pull request #1876 from erksch/rename-ios-framework
Change static ios framework name to stt_ios from coqui_stt_ios
2021-05-25 16:30:43 +02:00
Reuben Morais
0a38b72e34
Merge pull request #1877 from erksch/remove-libstt-from-ios-test-project
Remove libstt.so reference from stt_ios_test project
2021-05-25 16:30:13 +02:00
Erik Ziegler
d69c15db1a Remove libstt.so reference from stt_ios_test project 2021-05-24 17:29:36 +02:00
Erik Ziegler
ced136c657 Change static ios framework name to stt_ios from coqui_stt_ios 2021-05-24 17:22:55 +02:00
Erik Ziegler
b2fee574d8
Set iOS deployment target to 9.0 2021-05-23 20:59:09 +02:00
Reuben Morais
1bf8058379
Merge pull request #1871 from coqui-ai/training-tests
Training tests
2021-05-21 14:22:12 +02:00
Reuben Morais
f9ecf8370e Training unittests and lint check 2021-05-21 13:17:05 +02:00
Reuben Morais
3f17bba229 Training tests 2021-05-21 13:17:05 +02:00
Reuben Morais
5ba1e4d969 Remove broken TrainingSpeech importer temporarily
During the fork the archive URL was broken and nobody has mentioned it since.
Additionally the dependency on Unidecode (GPL) complicates licensing.

Removing it for now until both points are fixed.
2021-05-20 17:02:39 +02:00
Reuben Morais
debd1d9495
Merge pull request #1866 from coqui-ai/coqpit-config
Switch flag/config handling to Coqpit
2021-05-20 14:38:42 +02:00
Reuben Morais
eab6d3f5d9 Break dependency cycle between augmentation and config 2021-05-19 20:19:36 +02:00
Reuben Morais
d83630fef4 Print fully parsed augmentation config 2021-05-19 20:19:36 +02:00
Reuben Morais
5114362f6d Fix regression caused by PR #1868 2021-05-19 20:19:36 +02:00
Reuben Morais
5ad6e6abbf Switch flag/config handling to Coqpit 2021-05-19 20:19:36 +02:00
Reuben Morais
fb826f714d
Merge pull request #1828 from coqui-ai/global-cleanup
Run pre-commit hooks on all files
2021-05-18 13:47:20 +02:00
Reuben Morais
43a6c3e62a Run pre-commit hooks on all files 2021-05-18 13:45:52 +02:00
Reuben Morais
14aee5d35b Reintroduce excludes to pre-commit-hook 2021-05-18 13:45:09 +02:00
Josh Meyer
ac2bbd6a79
Merge pull request #1868 from JRMeyer/data-augmentation-cleaning
Add logging and clean up some augmentation code
2021-05-18 07:05:12 -04:00
Josh Meyer
7bec52c55d More compact return statement 2021-05-18 06:47:32 -04:00
Josh Meyer
9a708328e7 Use class name and review cleanup 2021-05-18 06:12:56 -04:00
Reuben Morais
d2c5f979ce Update pre-commit setup 2021-05-18 11:46:53 +02:00
Josh Meyer
f19ecbdd93 Add logging for augmentation and more transparent syntax 2021-05-17 10:05:49 -04:00
Josh Meyer
b793aa53bb
Merge pull request #1867 from ftyers/patch-2
Fix typo in augmentations.py
2021-05-13 11:36:55 -04:00
Francis Tyers
37cc7f2312
Update augmentations.py
Looks like `clock_to` got changed to `final_clock` but this one was missed.
2021-05-13 16:11:46 +01:00
Reuben Morais
8d62a6e154 Add some clarifying comments on building SWIG from source 2021-05-05 17:04:10 +02:00
Reuben Morais
b78894d7ab
Merge pull request #1864 from NanoNabla/pr_docu_ppc64_swig
Stop pointing people to a fork for docs on building SWIG
2021-05-05 16:59:08 +02:00
NanoNabla
af35faf67e fixes docu, use official swig release on ppc64le 2021-05-05 16:46:50 +02:00
Reuben Morais
397c351fa7
Merge pull request #1863 from IlnarSelimcan/patch-1
[docs playbook] fix a typo in TRAINING.md
2021-05-04 22:47:41 +02:00
Ilnar Salimzianov
c235841871
[docs playbook] fix a typo in TRAINING.md 2021-05-04 22:50:22 +03:00
Reuben Morais
ce71ec0c89 Include missing changes in MLS English importer 2021-05-04 19:06:18 +02:00
Reuben Morais
ad4025af7d Merge branch 'build-decoder-push' (Closes #1860) 2021-05-03 16:59:19 +02:00
Reuben Morais
3dcd56145c Expand build matrix of decoder package, build on push 2021-05-03 15:00:17 +02:00
Reuben Morais
1f3d2dab4c Bump version to v0.10.0-alpha.6 2021-04-30 14:01:16 +02:00
Reuben Morais
3d2ab809ee
Merge pull request #1858 from coqui-ai/ci-updates
CI: Add ARM build and tests / Add NodeJS 16.0.0
2021-04-30 13:59:54 +02:00
Alexandre Lissy
a4d5d14304 Add NodeJS 16.0.0 2021-04-30 12:14:18 +02:00
Alexandre Lissy
1eec25a9ab CI: Linux ARMv7 / Aarch64 2021-04-30 11:05:05 +02:00
Reuben Morais
f147c78a97
Merge pull request #1856 from coqui-ai/decoder-rename
Rename decoder package to coqui_stt_ctcdecoder
2021-04-28 12:56:54 +02:00
Reuben Morais
36e0223c07 Bump version to v0.10.0-alpha.5 2021-04-27 19:46:27 +02:00
Reuben Morais
c952ee0b0d Rename decoder package to coqui_stt_ctcdecoder 2021-04-27 19:46:12 +02:00
Reuben Morais
e5aff105d4
Merge pull request #1855 from coqui-ai/linux-ci
Linux CI base
2021-04-27 19:21:51 +02:00
Reuben Morais
5ddd7e0fa2 Try to reduce API calls in check_artifact_exists 2021-04-27 14:58:14 +02:00
Reuben Morais
93128cae5f Address review comments 2021-04-27 14:58:14 +02:00
Reuben Morais
01b5a79c5c Linux CI scripts fixes 2021-04-27 14:58:14 +02:00
Reuben Morais
3f85c1d8da Linux base CI 2021-04-27 14:58:14 +02:00
Reuben Morais
a0914d8915 Improve job name 2021-04-27 14:53:07 +02:00
Reuben Morais
46dab53e11 Remove unused SWIG native build job 2021-04-27 14:52:36 +02:00
Reuben Morais
b542e9e469 Update Windows SWIG build job to use caching 2021-04-27 14:44:50 +02:00
Reuben Morais
9639a27929 Run build/test workflow on pushes to main, not master 2021-04-19 14:37:27 +02:00
Reuben Morais
9c7003d77d
Merge pull request #1843 from coqui-ai/windows-ci
Windows base CI
2021-04-19 12:44:12 +02:00
Reuben Morais
59297e526c Windows base CI 2021-04-19 10:48:14 +02:00
Alexandre Lissy
df8d17fc4e Ensure proper termination for ElectronJS and NodeJS 2021-04-18 17:03:16 +02:00
Alexandre Lissy
5558f55701 Use caching for node_modules and headers 2021-04-18 17:03:00 +02:00
Alexandre Lissy
b0c38d5aa9 Remove references to TaskCluster from ci_scripts/ 2021-04-18 17:01:47 +02:00
Alexandre Lissy
d45149b02e NodeJS repackaging 2021-04-18 16:54:00 +02:00
Reuben Morais
5d4941067f Add explicit attribution, description of changes and link to original in playbook 2021-04-15 09:10:38 +02:00
Reuben Morais
09b04a8f83
Merge pull request #1831 from coqui-ai/windows-ci
Windows CI
2021-04-12 14:56:01 +02:00
Reuben Morais
ec271453c1 Ensure upstream Python is used 2021-04-12 14:10:28 +02:00
Reuben Morais
8fe4eb8357 CI rebrand pass 2021-04-12 13:24:54 +02:00
Reuben Morais
7855f0a563 Base Windows CI setup 2021-04-12 12:54:07 +02:00
Reuben Morais
7d017df80c Bump pygments from 2.6.1 to 2.7.4 in doc/requirements.txt 2021-04-12 12:52:59 +02:00
Alexandre Lissy
f5369c8f4b Remove code refs to TaskCluster 2021-04-12 12:51:55 +02:00
Reuben Morais
5b3119ad3f Remove CircleCI setup 2021-04-12 12:44:32 +02:00
Alexandre Lissy
54f232c51a Reduce non multiplatform NodeJS/ElectronJS package tests matrices 2021-04-12 12:43:50 +02:00
Alexandre Lissy
3d96b1d4fd Fix #3549 2021-04-10 18:31:20 +02:00
Kenneth Heafield
b6b8160810 MSVC doesn't like const Proxy operator*() const.
Fixes #308
2021-04-10 18:31:13 +02:00
Alexandre Lissy
c4a4ca2bf8 Fix #3586: NumPy versions 2021-04-10 16:13:53 +02:00
Alexandre Lissy
ef31be2e32 Fix #3593: Limit tests to PR 2021-04-10 16:13:48 +02:00
Alexandre Lissy
3e66adba01 Fix #3578: Re-instate Python TF/TFLite tests on GitHub Actions / macOS 2021-04-10 16:13:42 +02:00
Alexandre Lissy
7168e83ac0 Fix #3590: Move training to macOS 2021-04-10 16:13:33 +02:00
CatalinVoss
51fd6170fa Fix documentation for check_characters.py script 2021-04-10 16:10:57 +02:00
Reuben Morais
449a723bf6 Add missing imports for sample rate normalization 2021-04-06 12:45:32 +02:00
Reuben Morais
39627b282c
Merge pull request #1827 from coqui-ai/reuben-issue-templates-1
Update issue templates
2021-04-06 12:34:54 +02:00
Reuben Morais
5933634db5 Update issue templates 2021-04-06 12:34:25 +02:00
Reuben Morais
4696c3dd0a Create PR template and update issue template 2021-04-06 12:08:40 +02:00
Reuben Morais
4d764c0559 Add importer for English subset of Multilingual LibriSpeech 2021-04-06 11:59:41 +02:00
Reuben Morais
4b9b0743a8 Replace cardboardlinter + pylint setup with pre-commit + black 2021-04-06 11:58:58 +02:00
Reuben Morais
8cdaa18533 Normalize sample rate of dev/test sets to avoid feature computation errors 2021-04-06 11:41:12 +02:00
Reuben Morais
c78af058a5 Merge branch 'playbook-into-docs' 2021-03-30 19:39:27 +02:00
Reuben Morais
c0d068702e Commit non git add'ed dockerignore files 2021-03-30 19:39:10 +02:00
Reuben Morais
0bd653a975 Merge STT playbook into docs 2021-03-30 19:38:31 +02:00
Reuben Morais
a5c950e334 Fix .readthedocs.yml to point at the correct docs requirements file 2021-03-30 18:49:31 +02:00
Josh Meyer
ce0dacd3d2
Add link to generate_scorer_package releases 2021-03-30 10:58:20 -04:00
Reuben Morais
91c5f90f3c Merge pull request #1821 from JRMeyer/docs 2021-03-29 21:34:29 +02:00
Reuben Morais
3409fde4a0 Rename model export metadata flags 2021-03-29 21:06:38 +02:00
Reuben Morais
214a150c19 Fixes for Dockerfile.{train,build} and adjust instructions for new image 2021-03-29 21:05:49 +02:00
Reuben Morais
1029d06a23 Reinstate manylinux1 hack on Python package build 2021-03-29 19:24:11 +02:00
Reuben Morais
c95b89f3c5 Remove dummy workflow 2021-03-27 11:26:42 +01:00
Alexandre Lissy
719050f204 Fix #3581: GitHub Actions test model 2021-03-27 11:24:18 +01:00
Alexandre Lissy
63aeb6a945 Introduce ci_scripts/ for GitHub Actions 2021-03-27 11:24:11 +01:00
Alexandre Lissy
cd80708546 GitHub Actions for macOS 2021-03-27 11:23:59 +01:00
Kathy Reid
654a83a294 Replace remove_remote() method with remove method
Partially resolves #3569
2021-03-27 11:16:53 +01:00
CatalinVoss
c152be2343 Handle mono conversion within pcm_to_np() 2021-03-27 11:16:35 +01:00
CatalinVoss
be5f9627da Don't throw on mono audio any more since everything should work? 2021-03-27 11:16:27 +01:00
CatalinVoss
900a01305c Expose some internal layers for downstream applications 2021-03-27 11:16:19 +01:00
Josh Meyer
653ce25a7c
Merge pull request #1807 from JRMeyer/docs
Overhaul the language model docs + include in ToC
2021-03-24 11:58:49 -04:00
Josh Meyer
04451a681c Overhaul the language model docs + include in ToC 2021-03-24 11:34:28 -04:00
Josh Meyer
cb75dcb419
Merge pull request #1808 from JRMeyer/docs-building
building docs minor changes
2021-03-24 11:10:49 -04:00
Josh Meyer
d5e000427f Reword docs for building binaries + include in ToC 2021-03-24 11:03:03 -04:00
Reuben Morais
b5f72ca4cb
Remove missing feature from list
Raised in https://github.com/coqui-ai/STT/discussions/1814
2021-03-24 10:41:44 +01:00
Eren Gölge
116029aafe
Update README.rst (#1796)
* Update README.rst

* Update README.rst

* Update README.rst

fixes
2021-03-21 14:06:00 +01:00
Reuben Morais
6c9f3a52dc Add empty workflow file to main branch 2021-03-19 13:36:43 +01:00
Reuben Morais
b4e8802aff Switch doc theme to Furo 2021-03-19 10:25:55 +01:00
Reuben Morais
6b9de13ad1 Adjust name of Python package in build system 2021-03-19 10:25:55 +01:00
Josh Meyer
629706b262
Docs welcome page and Development / Inference page overhaul (#1793)
* Docs welcome page and Development / Inference page overhaul

* Address review comments

* Fix broken refs and other small adjustments

Co-authored-by: Reuben Morais <reuben.morais@gmail.com>
2021-03-17 10:14:50 +01:00
Reuben Morais
2d654706ed
Merge pull request #1794 from coqui-ai/erogol-doi-patch
Update README with DOI from Zenodo
2021-03-16 18:56:02 +01:00
Eren Gölge
f024b0ded6
Update README.rst
DOI from ZENODO
2021-03-15 23:29:59 +01:00
Reuben Morais
e64d62631c
Merge pull request #1792 from coqui-ai/erogol-patch-2
Gitter room
2021-03-14 16:42:01 +01:00
Eren Gölge
52a709c807
Update README.rst
gitter link

Note: Without sub-def the next badge goes to the new line
2021-03-13 17:55:27 +01:00
Josh Meyer
120ff297af
🐸 instead of \:frog\: 2021-03-09 10:41:28 -05:00
Josh Meyer
89d9a53b86
readme: help + community 2021-03-08 12:47:57 -05:00
Kelly Davis
31f3a6a235 Changes for new Linux packages and bump VERSION 2021-03-08 16:55:43 +01:00
Kelly Davis
8a03f4bce5 Note on supported platforms 2021-03-07 19:50:30 +01:00
Kelly Davis
f02c12925a More updates 2021-03-07 19:25:10 +01:00
Josh Meyer
8c95f3ec20
readme 2021-03-07 13:13:41 -05:00
Kelly Davis
4c37313c3d Some leftover references 2021-03-07 14:47:47 +01:00
Kelly Davis
742b44dd2c Merge branch 'rebrand' onto main 2021-03-07 14:42:44 +01:00
Kelly Davis
57adefcc10 More rebranding, submodules, some internal names 2021-03-07 14:41:43 +01:00
Kelly Davis
6d4d1a7153 More rebranding, API names, iOS, .NET 2021-03-07 14:29:02 +01:00
Kelly Davis
136ca35ca2 Contributor covenant badge 2021-03-07 11:37:17 +01:00
Kelly Davis
95f122806e More rebranding, Java package, C++ impl 2021-03-07 11:34:01 +01:00
Kelly Davis
f33f0b382d More rebranding, Python and JS packages verified 2021-03-07 11:14:16 +01:00
Kelly Davis
99fc28a6c7 More rebranding 2021-03-05 16:46:18 +01:00
Kelly Davis
915886b3b7 Main README logo 2021-03-05 12:53:37 +01:00
Kelly Davis
d2009582e9 Rebranding WIP 2021-03-05 12:48:08 +01:00
lissyx
2bb42d4fb1
Merge pull request #3548 from lissyx/doc-net-build
Expose .Net building doc better
2021-03-03 15:45:25 +01:00
Alexandre Lissy
a087509ab7 Expose .Net building doc better 2021-03-03 15:42:31 +01:00
Reuben Morais
8c8b80dc0b
Merge pull request #3546 from dzubke/Iss-3511_split-sets
Fix #3511: split-sets on sample size
2021-03-01 18:09:38 +00:00
Dustin Zubke
6945663698 Fix #3511: split-sets on sample size 2021-02-28 16:09:37 -05:00
lissyx
385c8c769b
Merge pull request #3539 from lissyx/new-swig
Tentative merge of SWIG master
2021-02-25 18:54:58 +01:00
Alexandre Lissy
206b8355b1 Fix #3540: Force NAudio 1.10.0 2021-02-25 17:09:15 +01:00
Alexandre Lissy
fee12be4ff Update SWIG with upstream 4.1-aligned branch 2021-02-25 17:09:15 +01:00
lissyx
7b2eeb6734
Merge pull request #3524 from Ideefixze/master
Added hot-word boosting doc
2021-02-12 20:35:13 +01:00
Ideefixze
7cf257a2f5 Added hot-word boosting api example doc
Comments for API bindings
X-DeepSpeech: NOBUILD
2021-02-12 19:52:19 +01:00
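A hedged usage sketch of the hot-word boosting API this doc commit covers, based on the 0.9-era Python bindings; the model/scorer paths and boost value are placeholders:

```python
from deepspeech import Model

ds = Model("deepspeech-0.9.3-models.pbmm")
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

# Boost the likelihood of a specific word during decoding, then clear it.
ds.addHotWord("firefox", 7.5)
ds.clearHotWords()
```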
lissyx
cc038c1263
Merge pull request #3527 from zaptrem/master
Fix incompatible Swift module error
2021-02-12 09:34:48 +01:00
zaptrem
9d83e18113 Fix incompatible Swift 2021-02-12 00:08:34 -05:00
lissyx
962a117f7e
Merge pull request #3518 from lissyx/rebuild-swig
Fix #3517: Update SWIG sha1
2021-02-01 16:54:06 +01:00
Alexandre Lissy
6eca9b4e0a Fix #3517: Update SWIG sha1 2021-02-01 16:21:44 +01:00
CatalinVoss
f27908e7e3 Fix copying remote AudioFile target to local 2021-01-26 10:02:59 +00:00
Reuben Morais
efbd6be727 Merge PR #3509 (Use pyyaml.safe_load in tc-decision.py) 2021-01-25 09:53:55 +00:00
lissyx
50c7ac6cf6
Merge pull request #3514 from lissyx/fix-decision-task-master
Set base image to ubuntu 18.04
2021-01-25 10:46:52 +01:00
Anton Yaroshenko
54565a056f Set base image to ubuntu 18.04 2021-01-25 10:41:23 +01:00
Reuben Morais
d7e0e89aed
Merge pull request #3510 from zaptrem/patch-1
Swift iOS Bindings: Expose DeepSpeechTokenMetadata fields
2021-01-22 10:29:50 +00:00
zaptrem
28ddc6b0e0
Expose DeepSpeechTokenMetadata fields
Currently, attempting to access member fields of DeepSpeechTokenMetadata objects output from intermediateDecodeWithMetadata causes a crash. Changing these lines makes the object work as (I assume) intended.
2021-01-22 03:42:08 -05:00
lissyx
93c7d1d5dc
Merge pull request #3508 from tud-zih-tools/docu_unsupported_architecture
Docu building ctc decoder on unsupported architecture
2021-01-21 23:33:09 +01:00
NanoNabla
5873145c8e arm is not supported for building ctcdecoder 2021-01-21 23:31:07 +01:00
NanoNabla
334f6b1e47 improve ctcdecode docu for unsupported platforms 2021-01-21 20:59:27 +01:00
NanoNabla
aec81bc048 add hints for building ctcdecode on unsupported platforms 2021-01-21 10:52:26 +01:00
lissyx
b9aa725900
Merge pull request #3505 from tud-zih-tools/ppc64le_integration
build ctcdecode on ppc64le
2021-01-20 23:38:18 +01:00
NanoNabla
d0f0a2d6e8 applying lissyx's patch from mozilla#3379, make it possible to set PYTHON_PLATFORM_NAME in environment on target host 2021-01-20 20:18:03 +01:00
Reuben Morais
80b5fe10df
Merge pull request #3493 from mozilla/add-ogg-opus-training-support
Add ogg opus training support
2021-01-20 17:53:10 +00:00
NanoNabla
80da74c472 add build rules for ctcdecode on ppc64le 2021-01-20 17:25:29 +01:00
Reuben Morais
b2feb04763 Fix some test names/descs and drop Py3.5 training tests 2021-01-18 16:23:40 +00:00
Reuben Morais
f2e57467c6 Compare sample durations with an epsilon 2021-01-18 16:20:03 +00:00
Reuben Morais
db45057dcc Add missing metadata leak suppressions 2021-01-18 13:57:44 +00:00
Reuben Morais
64465cd93a Bump NCCL version due to NVIDIA base image update 2021-01-18 13:37:13 +00:00
Reuben Morais
79a42b345d Read audio format from data before running augmentation passes instead of assuming default 2021-01-18 12:11:31 +00:00
Reuben Morais
8c0d46cb7f Normalize sample rate of train_files by default 2021-01-18 12:11:31 +00:00
Reuben Morais
d4152f6e67 Add support for Ogg/Opus audio files for training 2021-01-18 12:11:31 +00:00
Reuben Morais
ad0f7d2ab7
Merge pull request #3486 from KathyReid/patch-3
Update refs to 0.9.3 from 0.9.2
2021-01-03 09:56:24 +00:00
Kathy Reid
bb47cf26d0
Update refs to 0.9.3 from 0.9.2
I'm using this documentation to build out a Playbook - please don't interpret this as nitpicking, saw a minor change and made it.
2021-01-03 13:53:46 +11:00
Anon-Artist
5edfcdb92e
Update tc-decision.py 2020-12-21 15:34:00 +05:30
Reuben Morais
fcbd92d0d7 Bump version to v0.10.0-alpha.3 2020-12-19 09:28:21 +00:00
Reuben Morais
81c2a33f5b Separate branch and tag 2020-12-19 09:23:32 +00:00
Reuben Morais
239656c0f9 Bump version to v0.10.0-alpha.2 2020-12-19 09:11:06 +00:00
Reuben Morais
05654ef896 Expose GITHUB_HEAD_TAG, used by package upload scriptworker 2020-12-19 09:10:40 +00:00
Reuben Morais
55751e5d70 Bump version to v0.10.0-alpha.1 2020-12-19 08:48:27 +00:00
Reuben Morais
dc16a0e7f9 Separate ref and branch/tag metadata 2020-12-18 23:49:33 +00:00
Reuben Morais
9c988c764b Fix metadata.github.ref on push and tag 2020-12-18 23:42:29 +00:00
Reuben Morais
273d461f6a Bump version to v0.10.0-alpha.0 2020-12-18 23:29:54 +00:00
Reuben Morais
caaec68f59
Merge pull request #3473 from mozilla/taskcluster-v1
Convert to .taskcluster.yml v1
2020-12-18 20:36:38 +00:00
Reuben Morais
4723de25bf Use payload.env instead of forwarding variables manually 2020-12-18 17:00:00 +00:00
Reuben Morais
bb1ad00194 Convert to .taskcluster.yml v1
forward TASK_ID, add created and deadline

more fixes

typo

try without TASK_ID

fix task templates

add missing env vars to tc decision dry runs

avoid repetition in .taskcluster and manually forward variables to tc-decision.py

url -> clone_url

simulate GITHUB_EVENT

separate ref and sha

correct pull request actions

correct pull request policy
2020-12-18 09:35:14 +00:00
Reuben Morais
07d0e93083 Add paragraph on expected behavior from module owners
X-DeepSpeech: NOBUILD
2020-12-17 08:59:36 +00:00
Reuben Morais
8a88e6e063 Fix link in RST
X-DeepSpeech: NOBUILD
2020-12-17 08:53:59 +00:00
Reuben Morais
89cae68706 Improve explanation of governance model 2020-12-17 08:51:08 +00:00
lissyx
3e10163ec8
Merge pull request #3416 from lissyx/pr-3414
.NET Client Binding Fix
2020-12-08 14:48:32 +01:00
Reuben Morais
b3b9e268a7
Merge pull request #3460 from mozilla/more-doc-fixes
More documentation fixes
2020-12-08 15:42:11 +02:00
imrahul3610
1be44c63fc Hotword support for .NET client tests 2020-12-08 13:42:53 +01:00
Reuben Morais
d422955c4a Fix doc references to renamed StreamImpl class 2020-12-08 13:52:04 +02:00
Reuben Morais
1102185abf More branding fixes for docs & Java bindings 2020-12-08 13:36:28 +02:00
Reuben Morais
857ce297f0
Merge pull request #3459 from mozilla/move-linter-circleci
Move linting job to CircleCI
2020-12-08 13:24:35 +02:00
Reuben Morais
0e2209e2b3 Remove Travis 2020-12-08 13:21:05 +02:00
Reuben Morais
25c4f97aa7 Move linting job to CircleCI 2020-12-08 13:21:05 +02:00
Sjors Holtrop
8c8387c45a
Rename Stream class to StreamImpl, export its type as Stream (#3456) 2020-12-08 12:19:21 +01:00
Reuben Morais
4e55d63351
Fix package name reference in Java API docs (#3458) 2020-12-08 10:44:31 +01:00
Catalin Voss
6640cf2341
Remote training I/O once more (#3437)
* Redo remote I/O changes once more; this time without messing with taskcluster

* Add bin changes

* Fix merge-induced issue?

* For the interleaved case with multiple collections, unpack audio on the fly

To reproduce the previous failure

rm data/smoke_test/ldc93s1.csv
rm data/smoke_test/ldc93s1.sdb
rm -rf /tmp/ldc93s1_cache_sdb_csv
rm -rf /tmp/ckpt_sdb_csv
rm -rf /tmp/train_sdb_csv

./bin/run-tc-ldc93s1_new_sdb_csv.sh 109 16000
python -u DeepSpeech.py --noshow_progressbar --noearly_stop --train_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --train_batch_size 1 --feature_cache /tmp/ldc93s1_cache_sdb_csv --dev_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --dev_batch_size 1 --test_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --test_batch_size 1 --n_hidden 100 --epochs 109 --max_to_keep 1 --checkpoint_dir /tmp/ckpt_sdb_csv --learning_rate 0.001 --dropout_rate 0.05 --export_dir /tmp/train_sdb_csv --scorer_path data/smoke_test/pruned_lm.scorer --audio_sample_rate 16000

* Attempt to preserve length information with a wrapper around `map()`… this gets pretty python-y

* Call the right `__next__()`

* Properly implement the rest of the map wrappers here……

* Fix trailing whitespace situation and other linter complaints

* Remove data accidentally checked in

* Fix overlay augmentations

* Wavs must be opened in rb mode if we're passing in an external file pointer -- this confused me

* Lint whitespace

* Revert "Fix trailing whitespace situation and other linter complaints"

This reverts commit c3c45397a2f98e9b00d00c18c4ced4fc52475032.

* Fix linter issue but without such an aggressive diff

* Move unpack_maybe into sample_collections

* Use unpack_maybe in place of duplicate lambda

* Fix confusing comment

* Add clarifying comment for on-the-fly unpacking
2020-12-07 13:07:34 +01:00
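The bullet above about preserving length information with a wrapper around `map()` can be illustrated with a short sketch; the class below is illustrative only, not the project's actual wrapper:

```python
class LenMap:
    """Apply fn lazily like map(), but keep len() of the underlying collection."""

    def __init__(self, fn, values):
        self.fn = fn
        self.values = values

    def __len__(self):
        return len(self.values)

    def __getitem__(self, i):
        return self.fn(self.values[i])

    def __iter__(self):
        return (self.fn(v) for v in self.values)
```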
Reuben Morais
18b66adf46
Merge pull request #3435 from olafthiele/scorerchange
Conditional msg for missing lm.binary added
2020-12-07 13:59:36 +02:00
Reuben Morais
a947e80f70
Merge pull request #3454 from mozilla/branding-cleanup
Branding cleanup
2020-12-07 13:59:03 +02:00
Reuben Morais
4639d57f81
Merge pull request #3455 from mozilla/conda-instructions
Add some guidelines for conda environments for training
2020-12-07 10:56:57 +02:00
Reuben Morais
f6ddc4f72c Add some guidelines for conda environments for training 2020-12-07 10:55:35 +02:00
Reuben Morais
c7ce999e02 Remove trademark from Swift binding project identifier 2020-12-07 10:20:02 +02:00
Reuben Morais
da0209de01 Remove trademark from Java binding package names 2020-12-07 10:18:56 +02:00
Reuben Morais
f822b04e1b Branding cleanup
Remove Mozilla trademarks.
2020-12-07 10:07:39 +02:00
Reuben Morais
ad7d61f837
Merge pull request #3452 from mozilla/codeowners
Add listing of code owners/reviewers and reference from CONTRIBUTING.rst
2020-12-04 15:23:28 +02:00
Reuben Morais
bc078423eb Merge branch 'pr-3436-leaks' (Fixes #3436 and #3451) 2020-12-04 15:21:17 +02:00
Reuben Morais
c6318859df Re-add missing TF flags to deepspeech_bundle library 2020-12-04 15:20:09 +02:00
CatalinVoss
32b6067a01 Enable static build of DeepSpeech iOS framework
Set up additional `deepspeech_ios` target with static build steps

Xcode config: lock swift version at 5.0, bundle framework rather than dynamic lib, never strip swift symbols, add framework search paths, and bring in lstdc++

Runtime schema config: disable the main thread checker as this causes trouble with the static build

Update model versions to 0.9.1

Remove libdeepspeech.so from example app bundling steps

Swift lib embed settings that are somehow essential

Attempt to adjust taskcluster build steps

Add a basic podspec

Add framework to gitignore

Fix podspec version code

Attempt to fix taskcluster unzip step

Switch deepspeech targets for iOS build

Try doing this unzip in one step

Remove packaging steps for unneeded stuff because libdeepspeech.so is no longer a thing here. I suppose we could add a step to package the iOS static lib instead.

Fix podspec version

Set up podspec relative assuming a clone from the repo root

Remove space in iOS package step

Fix buildfile nit

Link stdc++ in explicitly with iOS build only

Revert "Remove space in iOS package step"

This reverts commit 3e1922ea370c110f9854ae7e97101f2ea00f55c6.
2020-12-04 15:19:49 +02:00
Reuben Morais
73240a0f1d Add listing of code owners/reviewers and reference from contribution guidelines
X-DeepSpeech: NOBUILD
2020-12-04 15:17:09 +02:00
lissyx
bcfc74874f
Merge pull request #3444 from lissyx/doc-cuda
Fix #3443: Link to upstream Dockerfile for lack of correct TensorFlow…
2020-11-27 12:37:45 +01:00
Alexandre Lissy
c979e360da Fix #3443: Link to upstream Dockerfile for lack of correct TensorFlow GPU deps doc. 2020-11-27 12:36:23 +01:00
lissyx
da31812173
Merge pull request #3440 from lissyx/electronjs_11
Adding support for ElectronJS v11.0
2020-11-26 16:08:52 +01:00
Alexandre Lissy
c0c5e6ade8 Adding support for ElectronJS v11.0 2020-11-26 13:28:57 +01:00
lissyx
d217369839
Merge pull request #3428 from lissyx/import-ccef
Importer for XML file provided by Conference Centre for Economics, France
2020-11-24 09:51:36 +01:00
Alexandre Lissy
c822a6e875 Importer for dataset from Centre de Conférences Pierre Mendès-France
Released by Ministère de l'Economie, des Finances, et de la Relance
2020-11-24 09:49:39 +01:00
Olaf Thiele
3ae77ca75d Conditional msg for missing lm.binary added 2020-11-23 19:55:27 +01:00
Reuben Morais
ecc48062a7
Merge pull request #3432 from mozilla/revert-remote-io
Revert remote IO PR
2020-11-19 19:35:20 +02:00
Reuben Morais
88f7297215 Revert "Merge pull request #3420 from CatalinVoss/remote-io"
This reverts commit 08d18d7328c03eb0c65d28ffdc0d3755549585e0, reversing
changes made to 12badcce1ffc820bebc4cd2ed5d9787b248200f6.
2020-11-19 16:58:21 +02:00
Reuben Morais
f5cbda694a Revert "Merge pull request #3424 from mozilla/io-fixes"
This reverts commit ab1288ffde7118a76e5394e142b789adf3ad1bba, reversing
changes made to 08d18d7328c03eb0c65d28ffdc0d3755549585e0.
2020-11-19 16:58:01 +02:00
lissyx
ee68367580
Merge pull request #3430 from lissyx/fix-tc-gzip
Fix #3429: TaskCluster behavioral change wrt compression of artifacts
2020-11-19 14:54:36 +01:00
Alexandre Lissy
3caa474cce Fix #3429: TaskCluster behavioral change wrt compression of artifacts 2020-11-19 13:23:56 +01:00
Reuben Morais
ab1288ffde
Merge pull request #3424 from mozilla/io-fixes
Fix I/O issues introduced in #3420
2020-11-18 08:07:10 +02:00
CatalinVoss
6cb638211e Only unpack when we need to, to make things work with SDBs 2020-11-17 16:55:49 -08:00
CatalinVoss
24e9e6777c Make sure we properly unpack samples when changing audio types 2020-11-17 14:44:26 -08:00
CatalinVoss
9aaa0e406b Make sure to unpack samples now 2020-11-17 14:31:48 -08:00
CatalinVoss
8bf1e9ddb7 Fix too aggressive F&R 2020-11-17 14:21:31 -08:00
CatalinVoss
ffe2155733 Undo remote edits for taskcluster as this is all local 2020-11-17 13:47:55 -08:00
CatalinVoss
7121ca5a2b Add a dockerignore for slightly faster local docker builds 2020-11-17 13:40:35 -08:00
Reuben Morais
08d18d7328
Merge pull request #3420 from CatalinVoss/remote-io
Remote I/O Training Setup
2020-11-17 11:53:32 +02:00
CatalinVoss
d0678cd1b7 Remove unused unordered imap from LimitPool 2020-11-16 13:47:21 -08:00
CatalinVoss
611633fcf6 Remove unnecessary uses of open_remote() where we know __file__ will always be local 2020-11-16 13:47:06 -08:00
CatalinVoss
b5b3b2546c Clean up remote I/O docs 2020-11-16 13:46:34 -08:00
CatalinVoss
fb6d4ca361 Add disclaimers to CSV and Tar writers 2020-11-13 19:36:07 -08:00
CatalinVoss
8c1a183c67 Clean up print debugging statements 2020-11-13 19:24:09 -08:00
CatalinVoss
47020e4ecb Add an imap_unordered helper to LimitPool -- I might experiment with this 2020-11-13 19:20:02 -08:00
CatalinVoss
3d2b09b951 Linter seems unhappy with conditional imports. Make gfile a module-level import.
I usually do this as a conditional because tf takes a while to load and it's nice to skip it when you want to run a script that just preps data or something like that, but it doesn't seem like a big deal.
2020-11-13 10:47:06 -08:00
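As a rough illustration of the trade-off described in this commit (module-level versus conditional import), a sketch under the assumption that the helper wraps TensorFlow's gfile; `open_remote` is named elsewhere in this history, but its real signature isn't shown here:

```python
# Module-level import keeps the linter happy, at the cost of loading
# TensorFlow even for scripts that only prepare data.
import tensorflow as tf


def open_remote(path, mode="r"):
    # tf.io.gfile.GFile accepts local paths as well as remote ones (e.g. gs://).
    return tf.io.gfile.GFile(path, mode)
```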
CatalinVoss
2332e7fb76 Linter fix: define self.tmp_src_file_path in init 2020-11-13 10:45:53 -08:00
CatalinVoss
be39d3354d Perform data loading I/O within worker process rather than main process by wrapping Sample 2020-11-12 21:46:39 -08:00
CatalinVoss
fc0b495643 TODO: CSVWriter still totally breaks with remote paths 2020-11-12 16:46:59 -08:00
CatalinVoss
86cba458c5 Fix remote path handling for CSV sample reading 2020-11-12 16:40:59 -08:00
CatalinVoss
8fe972eb6f Fix wave file reading helpers 2020-11-12 16:40:40 -08:00
CatalinVoss
783cdad8db Fix downloader and taskcluster directory mgmt with remote I/O 2020-11-12 16:30:11 -08:00
CatalinVoss
64d278560d Why do we need absolute paths everywhere here? 2020-11-12 16:29:43 -08:00
CatalinVoss
0030cab220 Skip remote zipping for now 2020-11-12 16:29:23 -08:00
CatalinVoss
a6322b384e Fix remote I/O handling in train 2020-11-12 16:29:16 -08:00
CatalinVoss
8f31072998 Fix startswith check 2020-11-12 15:09:42 -08:00
CatalinVoss
90e2e1f7d2 Respect buffering, encoding, newline, closefd, and opener if we're looking at a local file 2020-11-12 14:45:05 -08:00
CatalinVoss
ad08830421 Work remote I/O into audio utils -- a bit more involved 2020-11-12 14:17:03 -08:00
CatalinVoss
3d503bd69e Add universal is_remote_path to I/O helper 2020-11-12 14:16:37 -08:00
CatalinVoss
c3dc4c0d5c Fix bad I/O helper fn replace errors 2020-11-12 14:06:22 -08:00
CatalinVoss
abe5dd2eb4 Remote I/O for taskcluster 2020-11-12 12:49:44 -08:00
CatalinVoss
296b74e01a Remote I/O for sample_collections 2020-11-12 10:54:44 -08:00
CatalinVoss
7de317cf59 Remote I/O for evaluate_tools 2020-11-12 10:49:33 -08:00
CatalinVoss
396ac7fe46 Remote I/O for downloader 2020-11-12 10:48:49 -08:00
CatalinVoss
933d96dc74 Fix relative imports 2020-11-12 10:47:26 -08:00
CatalinVoss
42170a57eb Remote I/O for config 2020-11-12 10:46:49 -08:00
CatalinVoss
83e5cf0416 Remote I/O for check_characters 2020-11-12 10:46:15 -08:00
CatalinVoss
579921cc92 Work remote I/O into train script 2020-11-12 10:45:35 -08:00
CatalinVoss
53e3f5374f Add I/O helpers for remote file access 2020-11-12 10:44:19 -08:00
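A minimal sketch, under assumed prefix conventions, of the kind of helper these remote-I/O commits describe: route remote paths through TensorFlow's gfile and keep the built-in open() (with its full keyword surface) for local files. This is not the repository's exact code.

```
# Sketch only: dispatch between tf.io.gfile for remote URLs and builtin open()
# for local paths. The prefix list is an assumption for illustration.
import tensorflow as tf

REMOTE_PREFIXES = ("gs://", "hdfs://", "http://", "https://", "s3://")


def is_remote_path(path):
    return path.startswith(REMOTE_PREFIXES)


def open_remote(path, mode="r", **kwargs):
    if is_remote_path(path):
        # GFile handles remote filesystems but not open()'s extra keyword args.
        return tf.io.gfile.GFile(path, mode)
    # Local file: respect buffering, encoding, newline, closefd, opener, etc.
    return open(path, mode, **kwargs)
```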
lissyx
12badcce1f
Merge pull request #3393 from imrahul361/master
Run test On Java Client
2020-11-05 16:30:41 +01:00
imrahul3610
3ac6b4fda6 Run test On Java Client 2020-11-05 19:10:50 +05:30
lissyx
8f9d6ad024
Merge pull request #3408 from lissyx/pr-3406
Pr 3406
2020-11-05 13:13:40 +01:00
dag7dev
3a2879933f initial commit for py39 support 2020-11-04 20:16:35 +01:00
Reuben Morais
b72e2643c4
Merge pull request #3395 from CatalinVoss/patch-1
Minor Training Variable Consistency fix
2020-11-03 21:50:59 +01:00
Catalin Voss
98e75c3c03
Call the logits probs in create_inference_graph after they go thru softmax 2020-11-03 09:49:27 -08:00
lissyx
19eeadd0f3
Merge pull request #3398 from lissyx/fix-rtd
Force npm install on RTD and set appropriate PATH value
2020-11-03 14:36:47 +01:00
Alexandre Lissy
1cd5e44a52 Force npm install on RTD and set appropriate PATH value 2020-11-03 14:33:52 +01:00
Catalin Voss
9a92fa40ca
Make variables consistent 2020-11-02 21:09:35 -08:00
lissyx
d9a35d63b0
Merge pull request #3390 from JRMeyer/contributing-docs
note about perf testing
2020-10-29 10:27:51 +01:00
Josh Meyer
b732e39567 note about perf testing
X-DeepSpeech: NOBUILD
2020-10-28 10:22:19 -04:00
lissyx
4427cf9a42
Merge pull request #3389 from suriyaa/patch-1
Use HTTPS in README.md
2020-10-27 12:32:04 +01:00
Suriyaa Sundararuban
87c44d75a3
Use HTTPS in README.md 2020-10-27 11:04:32 +01:00
lissyx
e6a281ed4f
Merge pull request #3383 from ftyers/node15
update for NodeJS 15
2020-10-26 18:22:22 +01:00
Francis Tyers
55e31c4025 update for NodeJS 15 2020-10-26 15:44:06 +00:00
lissyx
5e2a916899
Merge pull request #3385 from liezl200/sys-import-voxforge
Add missing sys import to import_voxforge.py
2020-10-23 15:07:02 +02:00
Liezl P
af7c4e90df Add missing sys import to import_voxforge.py 2020-10-22 23:09:49 -10:00
Reuben Morais
0798698e97
Merge pull request #3380 from piraka9011/patch-1
Convert channels for CV2 dataset
2020-10-17 00:43:08 +02:00
Anas Abou Allaban
521842deea
Convert channels for CV2 dataset
When running a training session on the CV2 dataset, it is possible to get the following error:

```
ValueError: Mono-channel audio required
```

This makes the [pysox Transformer](https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer.convert) also convert the channels.
2020-10-15 11:22:39 -04:00
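A short sketch of the conversion described above, using the pysox Transformer and assuming a 16 kHz / 16-bit mono target (paths and values are placeholders):

```
# Have the pysox Transformer convert channel count as well, so stereo clips
# come out mono; target rate/bit depth here are illustrative assumptions.
import sox

transformer = sox.Transformer()
transformer.convert(samplerate=16000, n_channels=1, bitdepth=16)
transformer.build("clip_stereo.mp3", "clip_mono_16k.wav")
```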
lissyx
e508cd30b7
Merge pull request #3377 from actual-kwarter/master
Minor spelling fixes to CONTRIBUTING.rst X-DeepSpeech: NOBUILD
2020-10-14 10:28:33 +02:00
THCKwarter
e9fc614d8a Minor spelling fixes to CONTRIBUTING.rst X-DeepSpeech: NOBUILD 2020-10-13 22:53:49 -05:00
Reuben Morais
51e351e895
Merge pull request #3370 from tiagomoraismorgado/patch-1
X-DeepSpeech: NOBUILD
2020-10-12 14:18:09 +02:00
tiagomoraismorgado
f753b86ca9
[docs/typos/enhance] - mozilla/deepspeech/readme.rst - update
[docs/typos/enhance] - mozilla/deepspeech/readme.rst - update
2020-10-12 12:46:32 +01:00
lissyx
435b20d530
Merge pull request #3369 from nmstoker/patch-1
Tiny fix to addHotWord doc string parameters
2020-10-12 09:03:33 +02:00
Neil Stoker
2ca91039c8
Tiny fix to addHotWord doc string parameters
Because the parameter for boost was actually written as "word" in the doc string, it replaced the previously documented type for word with the type intended for boost and showed no type for boost at all, messing up what was displayed on https://deepspeech.readthedocs.io/en/master/Python-API.html
2020-10-11 17:46:20 +01:00
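An illustrative (not verbatim) Sphinx-style docstring showing the shape of the fix: each parameter documented under its own name, so autodoc renders a type for both `word` and `boost`:

```
def addHotWord(self, word, boost):
    """Add a word with a boost to the hot-words dictionary.

    :param word: The word to boost.
    :type word: str

    :param boost: The boost value to apply to the word.
    :type boost: float
    """
```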
lissyx
7ca237d19b
Merge pull request #3361 from imrahul361/master
enable hot-words boosting for Javascript
2020-10-10 16:00:47 +02:00
imrahul3610
9df89bd945 Fix JavaScript binding calls for Hot Words 2020-10-10 11:30:27 +05:30
imrahul3610
368f76557a Run Tests on CI for JS Client 2020-10-10 11:30:27 +05:30
imrahul3610
29b39fd2d5 JS Binding Fix 2020-10-10 11:30:27 +05:30
Reuben Morais
07fcd5bcd1
Merge pull request #3360 from mozilla/utf8alphabet-python-bindings
Fix binding of UTF8Alphabet class in decoder package
2020-10-06 22:07:45 +02:00
Reuben Morais
cc2763e0b7 Add small bytes output mode scorer for tests 2020-10-06 18:19:34 +02:00
Reuben Morais
09f0aa3d75 Rename --force_utf8 flag to --force_bytes_output_mode to avoid confusion 2020-10-06 18:19:34 +02:00
Reuben Morais
83a36b7a34 Rename --utf8 flag to --bytes_output_mode to avoid confusion 2020-10-06 18:19:33 +02:00
Reuben Morais
fb4f5b6a84 Add some coverage for training and inference in bytes output mode 2020-10-06 18:19:33 +02:00
Reuben Morais
2fd11dd74a Fix binding of UTF8Alphabet class in decoder package 2020-10-06 13:13:34 +02:00
lissyx
421f44cf73
Merge pull request #3357 from JRMeyer/mono-channel-error-message
mono-channel error, not just an assertion
2020-10-03 11:08:07 +02:00
josh meyer
afee570f3c mono-channel error, not just an assertion
X-DeepSpeech: NOBUILD
2020-10-02 13:27:43 -07:00
lissyx
dd4122a04a
Merge pull request #3356 from lissyx/linux-valgrind
Linux valgrind
2020-10-01 18:49:19 +02:00
Alexandre Lissy
fdd663829a Fix #3355: Add valgrind runs 2020-10-01 15:31:21 +02:00
Alexandre Lissy
86bba80b0e Fix #3292: Linux debug builds 2020-10-01 12:40:24 +02:00
lissyx
f20f939ade
Merge pull request #3351 from lissyx/leak-intermediate-decode
Fix leak in C++ client
2020-09-29 18:30:13 +02:00
Alexandre Lissy
9a34507023 Fix leak in C++ client 2020-09-29 16:02:27 +02:00
lissyx
0c020d11bc
Merge pull request #3350 from lissyx/test-lzma-bz2
Auto-discover lzma/bz2 linkage of libmagic
2020-09-29 12:52:42 +02:00
Alexandre Lissy
9674ced520 Auto-discover lzma/bz2 linkage of libmagic 2020-09-29 10:52:37 +02:00
lissyx
c7d58d628e
Merge pull request #3343 from lissyx/docker-1.15.4
Use correct 1.15.4 docker image
2020-09-28 14:54:14 +02:00
Alexandre Lissy
02548c17de Fix #3347: Disable Git-LFS on Windows 2020-09-28 13:33:59 +02:00
Alexandre Lissy
57c26827c0 Use correct 1.15.4 docker image 2020-09-28 12:43:12 +02:00
lissyx
731dd1b6bd
Merge pull request #3338 from lissyx/tf-1.15.4
Fix #3088: Use TensorFlow 1.15.4 with CUDNN fix
2020-09-25 16:00:36 +02:00
lissyx
d7be8e2789
Fix typo on DS_ClearHotWords 2020-09-25 14:45:37 +02:00
lissyx
5a88417547
Merge pull request #3339 from lissyx/missing-hotword-c-doc
Fix missing doc for new Hot Word API
2020-09-25 14:40:27 +02:00
Alexandre Lissy
25c2965da8 Fix missing doc for new Hot Word API
X-DeepSpeech: NOBUILD
2020-09-25 14:39:42 +02:00
lissyx
0728ac259e
Merge pull request #3320 from lissyx/build-kenlm
Fix #3299: Build KenLM on CI
2020-09-25 14:36:23 +02:00
Alexandre Lissy
16165f3ddc Fix #3088: Use TensorFlow 1.15.4 with CUDNN fix 2020-09-25 14:11:06 +02:00
Alexandre Lissy
bf5ae9cf8a Fix #3299: Build KenLM on CI 2020-09-25 13:25:38 +02:00
lissyx
34a62bd1d1
Merge pull request #3337 from lissyx/bump-0.9.0a10
Bump VERSION to 0.9.0-alpha.10
2020-09-25 13:25:16 +02:00
Alexandre Lissy
445ebb233a Bump VERSION to 0.9.0-alpha.10 2020-09-25 11:05:12 +02:00
Josh Meyer
1eb155ed93
enable hot-word boosting (#3297)
* enable hot-word boosting

* more consistent ordering of CLI arguments

* progress on review

* use map instead of set for hot-words, move string logic to client.cc

* typo bug

* pointer things?

* use map for hotwords, better string splitting

* add the boost, not multiply

* cleaning up

* cleaning whitespace

* remove <set> inclusion

* change typo set-->map

* rename boost_coefficient to boost

X-DeepSpeech: NOBUILD

* add hot_words to python bindings

* missing hot_words

* include map in swigwrapper.i

* add Map template to swigwrapper.i

* emacs intermediate file

* map things

* map-->unordered_map

* typo

* typo

* use dict() not None

* error out if hot_words without scorer

* two new functions: remove hot-word and clear all hot-words

* starting to work on better error messages

X-DeepSpeech: NOBUILD

* better error handling + .Net ERR codes

* allow for negative boosts:)

* adding TC test for hot-words

* add hot-words to python client, make TC test hot-words everywhere

* only run TC tests for C++ and Python

* fully expose API in python bindings

* expose API in Java (thanks spectie!)

* expose API in dotnet (thanks spectie!)

* expose API in javascript (thanks spectie!)

* java lol

* typo in javascript

* commenting

* java error codes from swig

* java docs from SWIG

* java and dotnet issues

* add hotword test to android tests

* dotnet fixes from carlos

* add DS_BINARY_PREFIX to tc-asserts.sh for hotwords command

* make sure lm is on android for hotword test

* path to android model + nit

* path

* path
2020-09-24 14:58:41 -04:00
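A brief usage sketch of the resulting API through the Python bindings (model and scorer paths are placeholders; per this PR, hot words require an external scorer to be loaded):

```
# Usage sketch; model and scorer paths are placeholders.
from deepspeech import Model

ds = Model("deepspeech-0.9.0-models.pbmm")
ds.enableExternalScorer("deepspeech-0.9.0-models.scorer")

ds.addHotWord("firefox", 7.5)    # positive boost: make the word more likely
ds.addHotWord("activate", -5.0)  # negative boosts are allowed as well
ds.eraseHotWord("activate")      # remove a single hot word
ds.clearHotWords()               # remove all hot words
```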
Reuben Morais
d466fb09d4 Bump VERSION to 0.9.0-alpha.9 2020-09-21 12:11:53 +02:00
Reuben Morais
cc62aa2eb8
Merge pull request #3279 from godefv/decoder_timesteps
The CTC decoder timesteps now correspond to the timesteps of the most probable CTC path, instead of the earliest timesteps of all possible paths.
2020-09-17 20:31:05 +02:00
godeffroy
188501a333 PR #3279 - Reverted unrelated and unwanted change. 2020-09-17 19:10:43 +02:00
godeffroy
371ddb84e5 PR #3279 - Added README.mozilla to tell where the object pool code is from and updated the object pool code from this origin (minor update). 2020-09-17 17:55:45 +02:00
godeffroy
5bf5124366 PR #3279 - Added some comments, harmonized a few names, removed unneeded spaces 2020-09-17 14:27:33 +02:00
lissyx
014479e650
Merge pull request #3324 from gtcooke94/fix_swb_import
Added `os` import in front of `makedirs`
2020-09-16 20:44:33 +02:00
Greg Cooke
20ad86c6ab Added os import in front of makedirs 2020-09-16 14:20:59 -04:00
godeffroy
23944b97db PR #3279 - Made the timestep tree thread safe 2020-09-16 14:03:59 +02:00
godeffroy
1fa2e4ebcc PR #3279 - Fixed buggy timestep tree root 2020-09-15 21:30:45 +02:00
lissyx
1b3e97c102
Merge pull request #3322 from lissyx/fix-docker-build
Fix #3321: Update NCCL dep to 2.7 following NVIDIA update
2020-09-15 16:37:29 +02:00
Alexandre Lissy
76d5fb6389 Fix #3321: Update NCCL dep to 2.7 following NVIDIA update 2020-09-15 13:40:17 +02:00
godeffroy
14bd9033d6 Revert "PR #3279 - removed unrelated code"
This reverts commit 78c4ef17b11fe681702cb0619a0b938a0b59f5bd.
2020-09-14 22:45:42 +02:00
godeffroy
15ce05aa01 PR #3279 - Fixed spaces 2020-09-14 14:40:56 +02:00
lissyx
346b5bdbae
Merge pull request #3318 from lissyx/electron-10
Fix #3316: Add Electron 10.x
2020-09-10 16:27:15 +02:00
Alexandre Lissy
2e92f53aac Use bigger build machine to avoid recurrent breakages of Linux/CUDA builds 2020-09-10 15:13:51 +02:00
Alexandre Lissy
a4d6c672d4 Fix #3316: Add Electron 10.x 2020-09-10 12:08:17 +02:00
lissyx
16a7a27275
Merge pull request #3319 from olafthiele/master-branch-error
Simplified git clone msg to prevent error reports
2020-09-10 11:51:27 +02:00
Olaf Thiele
de1e3d7aa0 Simplified install text 2020-09-10 10:58:28 +02:00
lissyx
dda2d22310
Merge pull request #3314 from olafthiele/master-branch-errors
Trying to get fewer master branch training errors
2020-09-09 15:08:36 +02:00
godeffroy
f07c10452b PR #3279 - use unique_ptr instead of shared_ptr in the timestep tree 2020-09-09 11:04:37 +02:00
lissyx
ce95be1354
Merge pull request #3315 from lissyx/bump-v0.9.0-alpha.8
Bump VERSION to v0.9.0-alpha.8
2020-09-09 10:53:46 +02:00
Alexandre Lissy
b30e0fb815 Bump VERSION to v0.9.0-alpha.8 2020-09-09 08:49:46 +02:00
Olaf Thiele
a2e88a30de
More compact version 2020-09-08 15:32:23 +02:00
Olaf Thiele
39a963af90
Update TRAINING.rst 2020-09-08 14:54:08 +02:00
godeffroy
3a49344ccb PR #3279 - use an object pool to store timesteps tree nodes 2020-09-08 14:12:39 +02:00
lissyx
11be0a57d4
Merge pull request #3313 from mozilla/erogol-patch-1
fix missing import 'sys'
2020-09-08 10:43:02 +02:00
Eren Gölge
b2df360799
fix missing import 'sys' 2020-09-08 10:15:22 +02:00
godeffroy
ec55597412 PR #3279 - use a tree structure to store timesteps 2020-09-07 13:37:27 +02:00
lissyx
012e7bfb5e
Merge pull request #3309 from JRMeyer/docs-contributing
Docs contributing
2020-09-07 12:18:32 +02:00
Josh Meyer
ff057e86c7 bold instead of ticks 2020-09-02 10:54:13 -04:00
lissyx
91e70602ce
Merge pull request #3307 from techiaith/master
updating docs for #3295
2020-09-02 16:29:31 +02:00
Dewi Bryn Jones
8a8d140da8 updating docs for #3295 2020-09-02 15:13:57 +01:00
lissyx
b6f5ddfe54
Merge pull request #3301 from techiaith/master
Fix for setuptools._distutils issue (#3295)
2020-09-02 14:32:37 +02:00
Josh Meyer
fdf6aeb22b first stab at CONTRIBUTING.rst 2020-09-02 08:29:19 -04:00
Dewi Bryn Jones
a6dff311f6 fix for #3295 2020-09-02 13:10:02 +01:00
lissyx
9377aaf3a0
Merge pull request #3296 from lissyx/transcribe-ci
Fix #3129: Add CI coverage for transcribe.py
2020-09-01 19:13:28 +02:00
Alexandre Lissy
32ad25b088 Fix #3129: Add CI coverage for transcribe.py 2020-09-01 17:49:31 +02:00
godeffroy
1f89bef5f0 PR #3279 - avoid unnecessary copies of timesteps vectors 2020-08-31 19:01:47 +02:00
lissyx
ccb1a6b0d4
Merge pull request #3278 from DanBmh/refactor_rlrop_cond
Refactor rlrop condition
2020-08-31 16:49:01 +02:00
lissyx
26f99874a6
Merge pull request #3293 from lissyx/decouple-builds
Decouple builds
2020-08-31 14:53:05 +02:00
Daniel
c10f7f1ad6 Refactor rlrop condition. 2020-08-31 12:57:48 +02:00
Alexandre Lissy
4bc14acb12 Decouple builds
Fixes #3170
2020-08-31 12:04:04 +02:00
godeffroy
363121235e PR #3279 - revert to non RVO code (fix) 2020-08-31 10:15:34 +02:00
godeffroy
e9466160c7 PR #3279 - revert to non RVO code 2020-08-31 09:54:38 +02:00
godeffroy
59c73f1c46 PR #3279 - assert instead of reporting error to std::cerr 2020-08-31 09:38:54 +02:00
godeffroy
78c4ef17b1 PR #3279 - removed unrelated code 2020-08-31 09:33:22 +02:00
godeffroy
c3d6f8d923 PR #3279 - replaced tabulations by spaces 2020-08-31 08:53:26 +02:00
lissyx
555a265010
Merge pull request #3290 from lissyx/re-fix-swig-master
Fix SWIG prebuild URL
2020-08-28 18:27:39 +02:00
Alexandre Lissy
160fa76ddf Fix SWIG prebuild URL 2020-08-28 17:18:17 +02:00
lissyx
f554ac0b38
Merge pull request #3284 from lissyx/new-macOS-VMs
Switch to new macOS VM setup
2020-08-28 15:23:07 +02:00
Alexandre Lissy
3e6593d325 Switch to new macOS VM setup 2020-08-28 10:15:56 +02:00
Reuben Morais
1c9f3bc99d
Merge pull request #3286 from mozilla/test-pr-3268
Test PR #3268
2020-08-27 20:02:48 +02:00
Daniel
93a4de5489 Fix lr initialization on reload. 2020-08-27 15:08:32 +02:00
Reuben Morais
8965b29e81 Point back to examples master branch 2020-08-27 09:31:05 +02:00
Reuben Morais
becc3d9745
Merge pull request #3280 from mozilla/undo-renames
Undo renames
2020-08-27 09:27:45 +02:00
Reuben Morais
3aa3862fbc Fix TF cache references after rebase 2020-08-26 11:47:35 +02:00
Reuben Morais
b70db48f91 Rename new tasks 2020-08-26 11:46:09 +02:00
Reuben Morais
dc2503c5e0 Specify macOS SDK version along with minimum version in builds 2020-08-26 11:46:09 +02:00
Reuben Morais
b9e2d90a73 Point to reverted examples changes 2020-08-26 11:46:09 +02:00
Reuben Morais
81ce543670 Fix bad conflict resolution in bazel rebuild check 2020-08-26 11:46:09 +02:00
Reuben Morais
8f2c1e842a Explicitly name repository clone target in Dockerfiles 2020-08-26 11:46:09 +02:00
Reuben Morais
d1c964c5d5 Adjust TF cache indices for 2.3 + renames undone 2020-08-26 11:46:09 +02:00
Reuben Morais
ae0cf8db6a Revert "Merge branch 'rename-real'"
This reverts commit ae9fdb183ec6eb422635c0e3a44c0c2ee5732224, reversing
changes made to 2eb75b62064ac30c1c537f4174d00b6e521042c5.
2020-08-26 11:46:09 +02:00
Reuben Morais
386935e1fa Revert "Merge pull request #3230 from mozilla/rename-nuget-gpu-to-cuda"
This reverts commit 0610a7a76fba80df73a220b76b07946ba9ac4581, reversing
changes made to c31df0fd4cba77e632b1ad76c27162727a98e540.
2020-08-26 11:46:08 +02:00
Reuben Morais
01fd13b663 Revert "Merge pull request #3229 from mozilla/nodejs-scoped-name"
This reverts commit 402fc71abf01491cb6b99cc4f9cb69820c0fb842, reversing
changes made to 0610a7a76fba80df73a220b76b07946ba9ac4581.
2020-08-26 11:46:08 +02:00
Reuben Morais
da55cfae86 Revert "Merge pull request #3237 from lissyx/rename-training-package"
This reverts commit 3dcb3743acc14ed9de63110709446791892f8936, reversing
changes made to 457198c88d7ad96ee4596cb21deaeca77c277898.
2020-08-26 11:46:08 +02:00
Reuben Morais
fee45c425e Revert "Merge pull request #3233 from lissyx/examples-rename-master"
This reverts commit 86845dd022f9f77ddc4aff8023b9d5d2a663078a, reversing
changes made to 3dcb3743acc14ed9de63110709446791892f8936.
2020-08-26 11:46:08 +02:00
Reuben Morais
d000d76548 Revert "Merge pull request #3239 from lissyx/rename-circleci"
This reverts commit 08cebeda3c43b10bd8caa766ccd0feec7e305735, reversing
changes made to 86845dd022f9f77ddc4aff8023b9d5d2a663078a.
2020-08-26 11:46:08 +02:00
Reuben Morais
7f99007840 Revert "Merge pull request #3238 from lissyx/rename-index"
This reverts commit 1a7dd876017d0e7451abb1101d154b71b8d8edb5, reversing
changes made to 08cebeda3c43b10bd8caa766ccd0feec7e305735.
2020-08-26 11:46:06 +02:00
Reuben Morais
10e2fc16f2 Revert "Merge pull request #3243 from lissyx/rename-stt-master"
This reverts commit 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8, reversing
changes made to 3a8c45cb619589f5f6acf4bfb71e7d6b18e8eab5.
2020-08-26 11:45:06 +02:00
Reuben Morais
7a6508612d Revert "Merge pull request #3246 from lissyx/fix-docker"
This reverts commit c01fda56c058779cc9dba952ce940c47398c4ed3, reversing
changes made to 3e99b0d8b2b2d6e47c8ff7eb1dfd9a88eba8e6d8.
2020-08-26 11:45:06 +02:00
Reuben Morais
c62a604876 Revert "Merge pull request #3248 from lissyx/rtd-rename"
This reverts commit ce71910ab4533e84eaf7be92bc1eb447305f4bd6, reversing
changes made to 7c6108a199f1d8f892c2d52088850aaa5a8792e9.
2020-08-26 11:45:06 +02:00
Reuben Morais
9788811bc5 Revert "Merge pull request #3241 from lissyx/rename-ctcdecoder"
This reverts commit fd4185f1410a39af19742310403151646318faba, reversing
changes made to 1a7dd876017d0e7451abb1101d154b71b8d8edb5.
2020-08-26 11:45:06 +02:00
lissyx
9daa708047
Merge pull request #3276 from lissyx/pr3256
Pr3256
2020-08-25 22:39:50 +02:00
lissyx
903fec464a
Merge pull request #3272 from godefv/master
In ctc_beam_search_decoder(), added a sanity check between input class_dim and alphabet
2020-08-25 14:04:41 +02:00
Bernardo Henz
8284958f3d Updating tensorflow version in taskcluster/.build.yml 2020-08-25 13:22:35 +02:00
Bernardo Henz
9f3c40ce48 Replacing old sha with new ones
Replacing the old SHA reference ('4336a5b49fa6d650e24dbdba55bcef9581535244') with the new one ('23ad988fcde60fb01f9533e95004bbc4877a9143')
2020-08-25 13:22:35 +02:00
Bernardo Henz
b4bc6bfb8a Updating commit of submodule 2020-08-25 13:21:12 +02:00
Bernardo Henz
1f54daf007 Default for layer_norm set to False 2020-08-25 13:18:30 +02:00
Bernardo Henz
2fcba677bb Implementation of layer-norm in the training script 2020-08-25 13:18:30 +02:00
godeffroy
95b6fccaf1 In ctc_beam_search_decoder(), added a sanity check between input class_dim and alphabet 2020-08-25 12:13:28 +02:00
godeffroy
04a36fbf68 The CTC decoder timesteps now correspond to the timesteps of the most
probable CTC path, instead of the earliest timesteps of all possible paths.
2020-08-25 12:08:14 +02:00
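A rough Python illustration of the timestep-tree idea from the #3279 series above (the actual implementation is C++ and uses an object pool): each beam candidate keeps a pointer into a shared tree, so extending a candidate never copies its whole timestep prefix.

```
class TimestepNode:
    """One node of the shared timestep tree; candidates hold a leaf pointer."""

    __slots__ = ("timestep", "parent")

    def __init__(self, timestep, parent=None):
        self.timestep = timestep
        self.parent = parent

    def extend(self, timestep):
        # O(1): add a child instead of copying the candidate's timestep vector.
        return TimestepNode(timestep, parent=self)

    def to_list(self):
        # Materialize root -> leaf only when emitting a final transcription.
        steps, node = [], self
        while node is not None:
            steps.append(node.timestep)
            node = node.parent
        return list(reversed(steps))
```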
lissyx
c5db91413f
Merge pull request #3277 from lissyx/doc-r2.3
Update docs for matching r2.3
2020-08-24 21:17:11 +02:00
Alexandre Lissy
e81ee24ede Update docs for matching r2.3 2020-08-24 21:13:16 +02:00
lissyx
a54b198d1e
Merge pull request #3266 from lissyx/electronjs-9.2
Add ElectronJS v9.2
2020-08-20 12:42:59 +02:00
Alexandre Lissy
4283b7e7de Add ElectronJS v9.2 2020-08-20 11:15:40 +02:00
Reuben Morais
d14c2b2e2d
Merge pull request #3261 from mozilla/reload-weights-plateau-tests
Tests #3245 Reload weights after plateau
2020-08-20 09:48:02 +02:00
lissyx
f4f8d2d7b7
Merge pull request #3264 from ptitloup/ptitloup-patch-python-client
Update client.py
2020-08-20 09:39:54 +02:00
Ptitloup
0c3aa6f472
Update client.py
remove space in key start_time of word dict
2020-08-20 09:19:53 +02:00
Reuben Morais
567a50087d
Merge pull request #3259 from mozilla/macos-min-10.10
Explicitly set minimum macOS version in bazel flags
2020-08-20 00:11:26 +02:00
Daniel
420ba808c8 Reload graph with extra function. 2020-08-19 18:45:09 +02:00
Daniel
4cf7a012a3 Don't drop layers in rlrop reload. 2020-08-19 18:45:09 +02:00
Daniel
09e1422278 Reload weights after plateau. 2020-08-19 18:45:09 +02:00
lissyx
b5c871616c
Merge pull request #3262 from lissyx/fix-docker-build
Use more beefy builder for Docker builds
2020-08-19 17:46:17 +02:00
Alexandre Lissy
5cc1ec32bd Use more beefy builder for Docker builds 2020-08-19 17:26:59 +02:00
Reuben Morais
2bceda0c56 Explicitly set minimum macOS version in bazel flags 2020-08-19 14:17:42 +02:00
lissyx
eb23728538
Merge pull request #3258 from Jendker/docu_filesize
Extend docs about the CSV files
2020-08-19 12:08:23 +02:00
Jedrzej Beniamin Orbik
9a6a1c7f3a Extend docs about the CSV files 2020-08-19 11:32:52 +02:00
lissyx
02afc2ac7e
Merge pull request #3254 from lissyx/bump-v0.9.0a7
Bump VERSION to 0.9.0-alpha.7
2020-08-18 15:14:13 +02:00
Alexandre Lissy
19ed4e950a Bump VERSION to 0.9.0-alpha.7 2020-08-18 12:23:41 +02:00
lissyx
c40f90cbff
Merge pull request #3227 from lissyx/use-r2.3
Move to TensorFlow r2.3
2020-08-18 10:56:50 +02:00
lissyx
90e04fb365
Merge pull request #3251 from Jendker/ctc_multiple_transc
Add num_results param to ctc_beam_search_decoder
2020-08-17 20:07:59 +02:00
Jedrzej Beniamin Orbik
c20af74d51 Add num_results param to ctc_beam_search_decoder 2020-08-17 18:29:08 +02:00
Alexandre Lissy
8619665fe1 Move to TensorFlow r2.3 2020-08-14 11:26:09 +02:00
lissyx
ce71910ab4
Merge pull request #3248 from lissyx/rtd-rename
Update name of readthedocs
2020-08-13 22:55:14 +02:00
Alexandre Lissy
fffc6ad455 Update name of readthedocs 2020-08-13 22:50:57 +02:00
lissyx
7c6108a199
Merge pull request #3236 from tilmankamp/tarexport
Resolves #3235 - Support for .tar(.gz) targets in bin/data_set_tool.py
2020-08-13 15:52:24 +02:00
Tilman Kamp
96f37a403d Resolves #3235 - Support for .tar(.gz) targets in bin/data_set_tool.py 2020-08-13 10:21:45 +02:00
lissyx
a6f40a3b2f
Merge pull request #3244 from lissyx/bump-v0.9.0a6
Bump VERSION to 0.9.0-alpha.6
2020-08-12 19:01:14 +02:00
Alexandre Lissy
2838df25e0 Bump VERSION to 0.9.0-alpha.6 2020-08-12 17:54:35 +02:00
lissyx
c01fda56c0
Merge pull request #3246 from lissyx/fix-docker
Fix docker path with new project name
2020-08-12 17:49:10 +02:00
Alexandre Lissy
1ad6ad9708 Fix docker path with new project name 2020-08-12 17:12:51 +02:00
lissyx
3e99b0d8b2
Merge pull request #3243 from lissyx/rename-stt-master
Rename DeepSpeech -> STT
2020-08-12 16:18:32 +02:00
Alexandre Lissy
9bca7a9044 Rename DeepSpeech -> STT 2020-08-12 13:52:17 +02:00
lissyx
3a8c45cb61
Merge pull request #3242 from lissyx/improve-tc-cleanup
Try to properly clean up the TaskCluster workdir
2020-08-12 12:03:18 +02:00
Alexandre Lissy
60fe2450a7 Try to properly clean up the TaskCluster workdir 2020-08-12 10:58:14 +02:00
lissyx
fd4185f141
Merge pull request #3241 from lissyx/rename-ctcdecoder
Rename ctcdecoder python package
2020-08-11 19:06:56 +02:00
lissyx
1a7dd87601
Merge pull request #3238 from lissyx/rename-index
Rename TaskCluster index
2020-08-11 19:06:14 +02:00
Alexandre Lissy
ccd9241bd0 Rename ctcdecoder python package 2020-08-10 22:45:43 +02:00
Alexandre Lissy
5795173c14 Rename TaskCluster index 2020-08-10 22:08:39 +02:00
lissyx
08cebeda3c
Merge pull request #3239 from lissyx/rename-circleci
Use new name for Docker container and Docker Hub repo
2020-08-10 20:26:15 +02:00
Alexandre Lissy
e83d92c93a Use new name for Docker container and Docker Hub repo 2020-08-10 20:24:45 +02:00
lissyx
86845dd022
Merge pull request #3233 from lissyx/examples-rename-master
Rename DeepSpeech-examples to STT-examples
2020-08-10 19:02:53 +02:00
Alexandre Lissy
7d31f5e349 Rename DeepSpeech-examples to STT-examples 2020-08-10 18:35:34 +02:00
lissyx
3dcb3743ac
Merge pull request #3237 from lissyx/rename-training-package
Rename deepspeech_training package
2020-08-10 18:35:03 +02:00
Alexandre Lissy
6f84bd1996 Rename deepspeech_training package 2020-08-10 16:58:18 +02:00
lissyx
457198c88d
Merge pull request #3232 from lissyx/bump-v0.9.0-alpha.5
Bump VERSION to 0.9.0-alpha.5
2020-08-07 13:16:38 +02:00
Alexandre Lissy
41dcb41691 Bump VERSION to 0.9.0-alpha.5 2020-08-07 11:39:47 +02:00
lissyx
402fc71abf
Merge pull request #3229 from mozilla/nodejs-scoped-name
Use scoped name for npm package
2020-08-07 00:52:05 +02:00
Reuben Morais
50de377953 Use scoped name for npm package 2020-08-06 18:55:42 +02:00
Reuben Morais
0610a7a76f
Merge pull request #3230 from mozilla/rename-nuget-gpu-to-cuda
Rename NuGet -GPU package to -CUDA
2020-08-06 17:43:25 +02:00
Reuben Morais
1e8213c385 Rename NuGet -GPU package to -CUDA 2020-08-06 16:16:33 +02:00
Reuben Morais
c31df0fd4c Bump VERSION to 0.9.0-alpha.4 2020-08-06 14:25:39 +02:00
Reuben Morais
ae9fdb183e Merge branch 'rename-real' 2020-08-06 14:20:39 +02:00
Reuben Morais
0b51004081 Address review comments 2020-08-06 14:20:05 +02:00
Reuben Morais
4d98958b77 iOS: Re-share workspace schemes and fix packaging 2020-08-05 17:49:51 +02:00
Reuben Morais
8c840bed23 Fix .NET build/package, resolve package conflict in Java app 2020-08-05 17:49:51 +02:00
lissyx
2eb75b6206
Merge pull request #3224 from lissyx/electronjs-example
Electron example
2020-08-04 22:23:27 +02:00
Alexandre Lissy
bb24fc89f0 Electron example 2020-08-04 21:57:23 +02:00
Reuben Morais
4d726e820d More renames 2020-08-04 18:04:08 +02:00
Reuben Morais
ee1235678d Missing renames in CI scripts 2020-08-04 15:25:46 +02:00
lissyx
3340cb6b8a
Merge pull request #3218 from lissyx/new-workerType
Fix #3181: Use finer-grained gcp workers
2020-08-04 13:12:06 +02:00
Reuben Morais
b301cdf83e JavaScript rename 2020-08-04 12:13:11 +02:00
Reuben Morais
5449f21a47 Python rename 2020-08-04 12:12:20 +02:00
Reuben Morais
b86a92a5b3 C docs 2020-08-04 12:10:31 +02:00
Reuben Morais
b18639f9c4 Swift rename 2020-08-04 12:09:41 +02:00
Reuben Morais
213590b326 Java rename 2020-08-04 11:39:22 +02:00
Reuben Morais
ee7bf86460 .NET rename 2020-08-04 11:15:27 +02:00
Alexandre Lissy
040f5eb2a3 Fix #3181: Use finer-grained gcp workers 2020-08-04 11:15:07 +02:00
lissyx
b65cd7e810
Merge pull request #3208 from lissyx/fix-linker
Fix #3207: do not force -shared on the linkage
2020-08-04 11:12:12 +02:00
lissyx
9cd6863e4a
Merge pull request #3214 from lissyx/win-workers
Fix #3211: Use win + win-gpu set
2020-08-03 21:55:53 +02:00
Alexandre Lissy
6d5d97abc4 Fix #3207: do not force -shared on the linkage 2020-08-03 18:58:43 +02:00
Alexandre Lissy
b6edcbe08c Fix #3211: Use win + win-gpu set 2020-08-03 18:32:20 +02:00
Reuben Morais
fa21911048 Rename packages, modules, headers, shared libraries to Mozilla Voice STT 2020-08-03 18:22:32 +02:00
Reuben Morais
21e5a74b0c
Merge pull request #3212 from mozilla/link-decoder-docs
Decoder docs: UTF-8 -> Bytes output mode, and link to scorer-scripts (Closes #2978)
2020-08-03 11:57:43 +02:00
Reuben Morais
d182cb7a58
Merge pull request #3213 from mozilla/remove-tensorflow-mention-cuda
Remove mention of TensorFlow docs for CUDA requirements
2020-08-03 11:57:27 +02:00
Reuben Morais
350575ba44 Remove mention of TensorFlow docs for CUDA requirements 2020-08-03 09:22:19 +02:00
Reuben Morais
d9f9d6ed89 Decoder docs: UTF-8 -> Bytes output mode, and link to scorer-scripts 2020-08-03 09:16:38 +02:00
Reuben Morais
04deda0239
Merge pull request #3206 from mrstegeman/alphabet-logic
Fix alphabet logic in generate_scorer_package.
2020-08-02 17:24:13 +02:00
lissyx
482cc534cf
Merge pull request #3204 from lissyx/no-dotnet-examples
Fix #3198: Do not rely on examples repo for building .Net
2020-08-02 11:35:17 +02:00
Alexandre Lissy
c55143d282 Fix #3198: Do not rely on examples repo for building .Net 2020-08-02 02:02:41 +02:00
Michael Stegeman
3024cffe49
Fix alphabet logic in generate_scorer_package.
Fixes #3205
2020-07-31 12:46:03 -08:00
Reuben Morais
41db367428
Merge pull request #3201 from mozilla/update-examples-models
Update examples model to match new naming
2020-07-31 08:42:48 +02:00
Reuben Morais
4b10f0b840 Update examples model to match new naming 2020-07-30 22:39:14 +02:00
Reuben Morais
d3efa4c438
Merge pull request #3199 from mozilla/ios-publish-all
Upload both native_client and .framework for iOS tasks
2020-07-30 22:26:25 +02:00
Reuben Morais
c8441d1f8d Upload both native_client and .framework for iOS tasks 2020-07-30 20:33:58 +02:00
Reuben Morais
4ce32c157b
Merge pull request #3195 from mozilla/remove-unused-decoder-method
Remove unused Scorer method
2020-07-30 13:50:52 +02:00
Reuben Morais
6141740f89 Remove unused Scorer method
This method was made unused by https://github.com/mozilla/DeepSpeech/pull/3021
after reports such as https://github.com/mozilla/DeepSpeech/issues/3004
of confusion interpreting the confidence values.
2020-07-30 11:00:00 +02:00
Reuben Morais
396504ea07
Merge pull request #3191 from reuben/swift-mic-streaming
iOS microphone streaming
2020-07-28 14:25:27 +02:00
Reuben Morais
c1fd93ac8d
Merge pull request #3192 from mozilla/remove-scorer
Remove external scorer file and documentation and flag references
2020-07-28 11:15:00 +02:00
Reuben Morais
216da91842 Remove Git LFS from docs 2020-07-28 11:05:10 +02:00
Reuben Morais
e3c34b29d6 Small adjustments to avoid hardcoding filenames and avoid generic DeepSpeech name 2020-07-28 10:28:29 +02:00
Erik Ziegler
35d2908db9 Add support for microphone streaming in swift native client test project 2020-07-28 10:28:28 +02:00
Reuben Morais
2835151951 Remove external scorer file and documentation and flag references 2020-07-27 21:09:32 +02:00
lissyx
d98bf84b41
Merge pull request #3188 from lissyx/msys2-keyring
Fix #3187: update msys2 installer
2020-07-27 15:56:26 +02:00
Alexandre Lissy
e13804321f Fix #3187: update msys2 installer 2020-07-27 13:05:50 +02:00
lissyx
aa597b3f1f
Merge pull request #3185 from lissyx/doc-generate-scorer
Doc generate scorer
2020-07-27 10:27:54 +02:00
Alexandre Lissy
9e3c4209b9 Fix #3184: add missing label for data augmentation doc
X-DeepSpeech: NOBUILD
2020-07-27 10:27:10 +02:00
Alexandre Lissy
8629573587 Fix #3182: document rebuild of generate_scorer_package
X-DeepSpeech: NOBUILD
2020-07-27 10:23:46 +02:00
Tilman Kamp
15a624134a
Merge pull request #3179 from tilmankamp/fix-for-librosa
Fixes #3178 - Librosa requires 1-dimensional array for mono samples
2020-07-24 14:32:17 +02:00
Tilman Kamp
9e023660ef
Merge pull request #3177 from tilmankamp/reverse
Resolves #1565 - Limiting and reversing data-sets
2020-07-24 11:26:53 +02:00
Tilman Kamp
ecbdf46940 Fixes #3178 - Librosa requires 1-dimensional array for mono samples 2020-07-24 10:37:30 +02:00
Tilman Kamp
9a5d19d7c5 Resolves #1565 - Limiting and reversing data-sets 2020-07-24 10:30:34 +02:00
Reuben Morais
816c2d84ce
Merge pull request #3176 from mozilla/alphabet-binding-docs
Document Alphabet methods in decoder binding as well
2020-07-23 16:06:54 +02:00
Reuben Morais
2cdc228db4 Use Alphabet.CanEncode in text_to_char_array 2020-07-23 13:16:12 +02:00
Reuben Morais
eb33fc1719 Document Alphabet methods in Python binding as well 2020-07-23 13:00:10 +02:00
Reuben Morais
38f6afdba8
Merge pull request #3173 from mozilla/ios-framework-publish
Build and publish iOS framework in GitHub release files
2020-07-22 11:44:51 +02:00
Reuben Morais
844b375e7d Address review comments 2020-07-22 10:35:56 +02:00
Reuben Morais
509d06d474 Fix typo in ios-package.sh 2020-07-22 09:49:19 +02:00
Reuben Morais
47685f059f Disable code signing in CI builds 2020-07-22 00:19:07 +02:00
Reuben Morais
2fd1474e69 Fix deepspeech_ios_project reference after folder move 2020-07-22 00:19:07 +02:00
Reuben Morais
ce0ef4fd1e Build and publish deepspeech_ios.framework 2020-07-22 00:19:06 +02:00
lissyx
a24d7ab5b1
Merge pull request #3100 from carlfm01/uwp
Add UWP Nuget packing support
2020-07-21 21:26:44 +02:00
Reuben Morais
9bdce0a305 Move deepspeech_ios_test projects to same level as deepspeech_ios 2020-07-21 19:09:14 +02:00
Carlos Fonseca M
48fb43c3eb Add UWP Nuget packing support 2020-07-21 10:38:37 -06:00
Tilman Kamp
b18a3a4ef5
Merge pull request #3147 from tilmankamp/data_set_tool
Resolves #3146 - Let build_sdb.py also output CSV files and rename it accordingly
2020-07-21 18:35:50 +02:00
Tilman Kamp
a982a61d83 Resolves #3146 - Let build_sdb.py also output CSV files and rename it accordingly 2020-07-21 17:02:01 +02:00
Reuben Morais
ffcec7f9aa
Merge pull request #3150 from mozilla/ios-build
iOS support
2020-07-20 14:10:52 +02:00
Reuben Morais
2672878618 Add docs to Swift bindings and missing methods 2020-07-20 11:52:35 +02:00
Reuben Morais
d9dac13343 Clean up tf_tc-build.sh 2020-07-20 11:17:05 +02:00
Reuben Morais
5e5db17371 Address review comments 2020-07-20 11:16:57 +02:00
Reuben Morais
de7a249fcd Fix linker issues during tests with new workers 2020-07-20 00:22:46 +02:00
Reuben Morais
be43b3fdc1 Bump caches for artifacts rebuilt on new worker 2020-07-20 00:22:46 +02:00
Reuben Morais
f0f4b0ddc1 Remove even more bazel flags 2020-07-20 00:22:46 +02:00
Reuben Morais
aa8e9b0647 Use correct build flags for ARM64 vs x86_64 2020-07-20 00:22:45 +02:00
Reuben Morais
2f568e7785 Don't use BAZEL_OPT_FLAGS in iOS builds 2020-07-20 00:22:45 +02:00
Reuben Morais
6c38d56968 Use submodule TF tc-vars.sh 2020-07-20 00:22:45 +02:00
Reuben Morais
e8d642bf44 Bump TensorFlow to remove usage of -z linker keyword on iOS 2020-07-20 00:22:45 +02:00
Reuben Morais
f7c50663e1 Checkout fixed formulas commit in tf_tc-brew.sh 2020-07-20 00:22:45 +02:00
Reuben Morais
a274c26a89 Add Swift wrapper framework 2020-07-20 00:22:45 +02:00
Reuben Morais
a1aa873259 Embed bitcode when linking 2020-07-20 00:22:44 +02:00
Reuben Morais
4ca0f94d78 client.cc iOS build 2020-07-20 00:22:44 +02:00
Reuben Morais
c85f95f781 Add DeepSpeech iOS tasks 2020-07-20 00:22:44 +02:00
Reuben Morais
3ce07afae0 Add TensorFlow iOS tasks 2020-07-20 00:22:42 +02:00
Reuben Morais
972f3031fe Merge branch 'new-workers' (Fixes #3168) 2020-07-19 22:36:48 +02:00
Reuben Morais
0e8a28de57 Bump caches and fix linker issues in new workers 2020-07-19 22:36:36 +02:00
lissyx
e1626c667e
Merge pull request #3149 from karansag/patch-1
Update TRAINING.rst
2020-07-17 10:20:21 +02:00
Karan Sagar
36a2f3b38d
Update TRAINING.rst
Update wording on relative / absolute paths.
2020-07-16 12:38:59 -04:00
lissyx
78ae08cdb4
Merge pull request #3161 from lissyx/bump-v0.9.0-alpha.3
Bump VERSION to 0.9.0-alpha.3
2020-07-15 16:22:46 +02:00
Alexandre Lissy
554cfae020 Bump VERSION to 0.9.0-alpha.3 2020-07-15 16:22:06 +02:00
lissyx
3fd8049bfd
Merge pull request #3160 from lissyx/circleci
Fix #3157: Add CircleCI config
2020-07-15 16:19:21 +02:00
Alexandre Lissy
c31c4843b3 Fix #3157: Add CircleCI config 2020-07-15 16:17:51 +02:00
lissyx
75804924f2
Merge pull request #3159 from tirkarthi/fix-xml
Use ElementTree instead of deprecated cElementTree.
2020-07-15 15:00:18 +02:00
Karthikeyan Singaravelan
0f27c802d9 Use ElementTree instead of deprecated cElementTree. 2020-07-15 12:46:00 +00:00
lissyx
bb7a0457a3
Merge pull request #3151 from pbxqdown/master
Fix several typos in docs.
2020-07-13 12:13:30 +02:00
Qian Xiao
37dc3e08a4 Fix several typos in docs. 2020-07-11 16:23:50 -07:00
Karan Sagar
058f53af3a
Update TRAINING.rst
I'm new to DeepSpeech, but I noticed when following the training instructions that the filenames _appear_ to be relative paths in the CSV. Let me know if I'm misinterpreting.

Thanks!
2020-07-10 15:24:45 -04:00
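For context, the training CSVs referenced here have wav_filename, wav_filesize and transcript columns; a minimal example using absolute paths (values are placeholders):

```
wav_filename,wav_filesize,transcript
/data/clips/sample_0001.wav,163214,the quick brown fox
/data/clips/sample_0002.wav,201874,jumped over the lazy dog
```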
Tilman Kamp
84f4c15278
Merge pull request #3145 from tilmankamp/build_sdb_aug
Resolves #3144 - Add augmentation support to build_sdb.py
2020-07-09 14:35:05 +02:00
Tilman Kamp
61bd5dd88d Resolves #3144 - Add augmentation support to build_sdb.py 2020-07-09 11:55:26 +02:00
lissyx
825923a652
Merge pull request #3142 from lissyx/electronjs-v9.1
Fix #3141: Add ElectronJS v9.1
2020-07-08 18:06:56 +02:00
Alexandre Lissy
9f953d12ba Fix nasty regression on some build/cache tasks 2020-07-08 17:09:25 +02:00
Alexandre Lissy
b832acb54b Fix #3141: Add ElectronJS v9.1 2020-07-08 17:00:33 +02:00
Reuben Morais
48cd53e474 Merge branch 'reference-training-decoder-docs' (Fixes #3140) 2020-07-08 13:18:29 +02:00
Reuben Morais
5fd4a0451f Address review comments 2020-07-08 13:18:18 +02:00
lissyx
fe7fdb95f6
Merge pull request #3139 from lissyx/electron-builder-win
Fix #3127: Adjust PATH for electronjs/windows with electron-builder
2020-07-08 01:13:10 +02:00
Alexandre Lissy
48f904ac27 Fix #3127: Adjust PATH for electronjs/windows with electron-builder 2020-07-07 22:09:32 +02:00
Reuben Morais
672ce377ac Only update examples submodule from remote 2020-07-07 20:02:31 +02:00
Reuben Morais
daf28086e5 Add note on model input data considerations and reference training/scorer docs 2020-07-07 20:02:31 +02:00
Tilman Kamp
6882248ab0
Merge pull request #3137 from tilmankamp/fix_missing_alphabet
Fix: #3130 - Missing deepspeech_training.util.text.Alphabet
2020-07-07 17:50:07 +02:00
Reuben Morais
d412b86b0d
Merge pull request #3135 from mozilla/java-inconsistencies
Fix some style inconsistencies in Java bindings (Fixes #3121)
2020-07-07 17:13:05 +02:00
Tilman Kamp
084da3724d Fix: #3130 - Missing deepspeech_training.util.text.Alphabet 2020-07-07 17:02:24 +02:00
Reuben Morais
2471b10c27 Update Java tests 2020-07-07 15:24:15 +02:00
Reuben Morais
16f89dff9e Update Java docs 2020-07-07 15:22:36 +02:00
Reuben Morais
417b8e4fe3 Fix style inconsistencies in Java bindings 2020-07-07 15:18:23 +02:00
Reuben Morais
18ea7391f3 Bump VERSION to 0.9.0-alpha.2 2020-07-07 10:53:28 +02:00
Reuben Morais
c64e416f61
Merge pull request #3131 from mozilla/alphabet-fallible
Add methods to check for label presence in Alphabet
2020-07-07 10:51:33 +02:00
Reuben Morais
c6dc7ba8c0 Add methods to check for label presence in Alphabet 2020-07-06 19:34:03 +02:00
Reuben Morais
7f2964e6ab Bump VERSION to 0.9.0-alpha.1 2020-07-06 11:30:35 +02:00
Reuben Morais
66d1f167fc
Merge pull request #3125 from mozilla/utf8-regressions
Fix some regressions from Alphabet refactoring (Fixes #3123)
2020-07-04 13:21:09 +02:00
Reuben Morais
03ed4a45f8 Don't add empty lines to Alphabet when parsing 2020-07-04 11:28:57 +02:00
Reuben Morais
30de5153bc Fix regressions in bytes output mode 2020-07-04 11:27:33 +02:00
Reuben Morais
1964e80efe
Merge pull request #3124 from DanBmh/patch-1
Update TRAINING.rst
2020-07-04 11:26:59 +02:00
DanBmh
91697ace49
Update TRAINING.rst
Related to #3123
2020-07-03 17:09:08 +02:00
lissyx
c2dfc7118a
Merge pull request #3117 from lissyx/docker-save
Name, tag and save docker image
2020-07-03 12:27:39 +02:00
Alexandre Lissy
436561b0e4 Support for Docker Hub automated builds 2020-07-03 11:56:52 +02:00
lissyx
556ea0c16f
Merge pull request #3103 from lissyx/tflite-delegated-r2.2
Enable TFLite delegations
2020-07-03 09:41:17 +02:00
Alexandre Lissy
67004ca137 Enable TFLite delegations 2020-07-03 01:34:54 +02:00
Reuben Morais
d2d46c3aee
Merge pull request #3113 from mozilla/generate-package-cpp
Rewrite generate_package.py in C++ to avoid training dependencies
2020-07-02 23:13:25 +02:00
Reuben Morais
65915c7f57 Address review comments 2020-07-02 14:09:42 +02:00
Reuben Morais
24526aa82d
Merge pull request #3118 from mozilla/model-type-docs
Add more doc text around distinction between various pre-trained model files (Fixes #2941)
2020-06-30 22:16:25 +02:00
Reuben Morais
5c41b8966e Fix broken link to C API docs
X-DeepSpeech: NOBUILD
2020-06-30 20:33:00 +02:00
Reuben Morais
d0bd1e5c8e Add more doc text around distinction between various pre-trained model files 2020-06-30 20:01:44 +02:00
Reuben Morais
8f6106b35d Update docs to refer to new generate_scorer_package 2020-06-30 16:47:41 +02:00
Tilman Kamp
3762a9b588
Merge pull request #3091 from tilmankamp/warp
Warp augmentation
2020-06-30 14:56:51 +02:00
Tilman Kamp
a48ebdfde8 Reverb augmentation: Workaround for import problem in scikit-learn dependency of librosa 2020-06-30 14:13:12 +02:00
Reuben Morais
2504360e95 Handle universal newlines in Alphabet file parsing 2020-06-30 09:52:45 +02:00
Reuben Morais
5039fb51d5 Package generate_scorer_package on Android 2020-06-30 09:52:45 +02:00
Reuben Morais
6618148e9b Update tensorflow with Boost rules 2020-06-30 09:52:45 +02:00
Reuben Morais
4a589dd897 Build/package/publish generate_scorer_package in CI 2020-06-30 09:52:45 +02:00
Reuben Morais
a84abf813c Deduplicate Alphabet implementations, use C++ one everywhere 2020-06-30 09:52:45 +02:00
Reuben Morais
f82c77392d Rewrite data/lm/generate_package.py into native_client/generate_scorer_package.cpp 2020-06-30 09:52:44 +02:00
Reuben Morais
03ca94887c Move DS_ErrorCodeToErrorMessage impl to its own object so it can be used without including all of libdeepspeech 2020-06-30 09:52:44 +02:00
lissyx
39696f0d67
Merge pull request #3116 from lissyx/fix-git-clone
Set git remote origin before fetching
2020-06-30 09:49:19 +02:00
Alexandre Lissy
f365576517 Set git remote origin before fetching 2020-06-30 00:53:12 +02:00
lissyx
837902ff10
Hotfix TensorFlow repo reference 2020-06-29 22:37:37 +02:00
lissyx
f340f96323
Merge pull request #3107 from lissyx/git-submodule
Use TensorFlow as a submodule
2020-06-29 22:11:28 +02:00
Alexandre Lissy
80ee63fac6 Use TensorFlow as a submodule 2020-06-29 17:03:07 +02:00
Tilman Kamp
eebf12134e Warp augmentation 2020-06-29 16:22:31 +02:00
lissyx
db717d7f73
Merge pull request #3108 from DanBmh/update_docker
Build kenlm in training container image.
2020-06-26 16:17:44 +02:00
Daniel
91b3db33c3 Build kenlm in training container image. 2020-06-26 15:07:18 +02:00
lissyx
be006da2d2
Merge pull request #3098 from lissyx/bump-0.9
Bump VERSION to 0.9.0-alpha.0
2020-06-24 00:09:47 +02:00
Alexandre Lissy
6d1f0c73ef Bump VERSION to 0.9.0-alpha.0 2020-06-23 19:30:09 +02:00
lissyx
7cc6ea959f
Merge pull request #3094 from lissyx/update-msys2
Update msys2 to 2020-06-20 release
2020-06-23 17:59:38 +02:00
Alexandre Lissy
da471ecbab Fix #3095: Update msys2 to 2020-06-20 release 2020-06-23 16:39:05 +02:00
lissyx
3a8bb2e066
Merge pull request #2952 from lissyx/r2.2
Use TensorFlow r2.2 in native client
2020-06-23 14:16:36 +02:00
Alexandre Lissy
eca69d1c84 Trusty -> Xenial 2020-06-22 10:48:12 +02:00
Alexandre Lissy
f169e8f921 Linux cleanup 2020-06-22 10:48:12 +02:00
Alexandre Lissy
4a174f6adc Remove libssl 1.0.2 hack 2020-06-22 10:48:12 +02:00
Alexandre Lissy
bc086ec998 Build DeepSpeech using TensorFlow r2.2 2020-06-22 10:48:12 +02:00
Alexandre Lissy
41d7b4e6f0 Use TensorFlow r2.2 artifacts 2020-06-22 10:48:12 +02:00
Tilman Kamp
6f2ba4b5b4
Merge pull request #3090 from tilmankamp/augext
Fix #3089 - Recreate overlay queue on augmentation restart
2020-06-19 12:22:34 +02:00
Tilman Kamp
da96d14eaa Fix #3089 - Recreate overlay queue on augmentation restart 2020-06-19 11:10:09 +02:00
lissyx
9c1dbd43a0
Merge pull request #3087 from DanBmh/fix_docker
Install checkpoint converting tool in container.
2020-06-18 18:11:55 +02:00
Daniel
e17619bec8 Make paths relative. 2020-06-18 17:26:38 +02:00
Reuben Morais
fcd9563fcd
Merge pull request #3085 from mozilla/new-version-074
Bump VERSION to 0.7.4
2020-06-18 16:56:15 +02:00
Tilman Kamp
4c6245d155
Merge pull request #3055 from tilmankamp/augext
Refactoring of TF based augmentations
2020-06-18 16:13:46 +02:00
Reuben Morais
5edc1cf503 Bump VERSION to 0.7.4 2020-06-18 15:21:31 +02:00
Reuben Morais
bc31eb4b9e Fix usage of ARG instead of ENV in Dockerfile.train 2020-06-18 15:21:31 +02:00
Daniel
eda5f69f2d Install checkpoint converting tool. 2020-06-18 15:20:28 +02:00
Reuben Morais
188a6f2c1e
Merge pull request #3080 from mozilla/install-instructions
Remove --force-reinstall from training code install
2020-06-18 14:38:08 +02:00
lissyx
12a24b8e98
Merge pull request #3083 from DanBmh/fix_docker
Add dependencies for new audio augmentation flags. Fixes #3082.
2020-06-18 13:18:56 +02:00
Daniel
3f8033e1f1 Add dependencies for new audio augmentation flags. Fixes #3082. 2020-06-18 12:24:56 +02:00
Reuben Morais
6ccbbede09 Remove --force-reinstall from training code install
No longer needed since we started publishing ds_ctcdecoder on PyPI.
2020-06-17 15:27:52 +02:00
lissyx
b7fa0ade33
Merge pull request #3072 from lissyx/docker-train
Fix #3071: Don't reinstall TensorFlow on top of TensorFlow
2020-06-17 12:50:25 +02:00
lissyx
07c8daef43
Update setup.py
Co-authored-by: Reuben Morais <reuben.morais@gmail.com>
2020-06-17 12:50:17 +02:00
lissyx
7fc1ac8fe1
Merge pull request #3069 from lissyx/generic-tc-caching
Fix #3068: More generic TaskCluster build/caching tasks
2020-06-17 12:49:15 +02:00
Alexandre Lissy
f4f4903b2b Fix #3071: Don't reinstall TensorFlow on top of TensorFlow 2020-06-17 11:34:11 +02:00
lissyx
2514b67933
Merge pull request #3066 from lissyx/output-stream-error
Fix #3053: Check output stream when producing scorer
2020-06-17 10:09:15 +02:00
Kelly Davis
e6135bbbfa
Merge pull request #3076 from eagledot/master
Added third-party bindings for NIM-lang.
2020-06-17 08:12:20 +02:00
Anubhav
4e3b4bb3a6
Added third-party bindings for NIM-lang. 2020-06-17 10:07:57 +05:30
Alexandre Lissy
a47c9a2b8c Request android default instead of google_apis
It seems some armv7a image disappeared
2020-06-17 02:06:25 +02:00
Alexandre Lissy
0fd28cfbdf Updating caches 2020-06-17 02:06:25 +02:00
Alexandre Lissy
b52139ceb6 Fix #3075: Add Android 11 to CI 2020-06-17 02:06:25 +02:00
Alexandre Lissy
4f7842c966 Fix #3068: More generic TaskCluster build/caching tasks 2020-06-17 02:06:25 +02:00
lissyx
c1353892b4
Merge pull request #3074 from lissyx/fix-ssl
Fix #3073: Update libssl version
2020-06-17 00:12:35 +02:00
Alexandre Lissy
7768c89e2a Fix #3073: Update libssl version 2020-06-16 23:38:12 +02:00
Alexandre Lissy
6c2cbbd725 Fix #3053: Check output stream when producing scorer 2020-06-16 23:28:01 +02:00
Tilman Kamp
5dd08d2f8e Deactivated scorer in graph augmentation test 2020-06-16 16:57:09 +02:00
Reuben Morais
d538d80ddb
Merge pull request #3067 from DanBmh/update_ignore
Ignore generated dockerfiles
2020-06-16 16:28:53 +02:00
Daniel
e10b807e92 Ignore generated dockerfiles. 2020-06-16 16:24:55 +02:00
Tilman Kamp
a28df45192 Respect None case for augmentations list 2020-06-16 15:46:28 +02:00
Tilman Kamp
7a835bee5a Updated training tests 2020-06-16 13:51:07 +02:00
Tilman Kamp
2d5dcc359a Tests for TF based value range picking 2020-06-16 13:32:32 +02:00
lissyx
c839ab5355
Merge pull request #3065 from lissyx/supported-platforms
Fix #2942: Document supported platforms
2020-06-16 12:31:46 +02:00
Alexandre Lissy
aeb4c5b105 Fix #2942: Document supported platforms 2020-06-16 12:30:50 +02:00
Tilman Kamp
5b6de213d8 Follow-up on PR comments; removed warp augmentation; split pitch_and_tempo augmentation 2020-06-16 11:07:57 +02:00
Tilman Kamp
ea21c7d24e
Apply suggestions from code review
Co-authored-by: Reuben Morais <reuben.morais@gmail.com>
2020-06-16 10:22:45 +02:00
Tilman Kamp
0bec67d74c
Update bin/play.py
Co-authored-by: Reuben Morais <reuben.morais@gmail.com>
2020-06-16 10:10:01 +02:00
lissyx
ff83f1b8f4
Merge pull request #3060 from lissyx/docker-decouple
Decouple Dockerfile into build and train
2020-06-15 14:30:13 +02:00
Alexandre Lissy
4d541394e8 Decouple Dockerfile into build and train 2020-06-15 14:13:22 +02:00
lissyx
cbb9c28e2c
Merge pull request #3062 from ObliviousParadigm/patch-1
DOC: Fixed grammatical mistake.
2020-06-14 13:31:48 +02:00
ObliviousParadigm
c294d80a93
DOC: Fixed grammatical mistake. 2020-06-14 16:00:32 +05:30
Tilman Kamp
d94db7ca43 Refactoring of TF based augmentations 2020-06-10 13:42:45 +02:00
lissyx
e99b938ebf
Merge pull request #3054 from lissyx/import-time
Report imported vs total audio time
2020-06-10 13:35:01 +02:00
Alexandre Lissy
cfc79799ec Report imported vs total audio time 2020-06-10 13:12:15 +02:00
Reuben Morais
bfaa68945a
Merge pull request #3051 from mozilla/add-metrics-tracking
Add read-only metrics tracking
2020-06-09 13:14:50 +02:00
Reuben Morais
ecd79531c8 Add training test with --metrics_files 2020-06-08 18:06:21 +02:00
Reuben Morais
07d2c39138 Split SDB tests from basic training tests to speed up CI dependents 2020-06-08 18:06:21 +02:00
Reuben Morais
e069b6d61f Add read only validation metrics
For now this is just CTC loss like a validation set, but without affecting
best validation checkpoint tracking logic. Eventually this could compute WER
on a smaller set, for example.
2020-06-08 15:26:37 +02:00
Reuben Morais
572963e7bd
Merge pull request #3043 from mozilla/version-not-symlink
Move VERSION and GRAPH_VERSION to training directory
2020-06-08 14:34:28 +02:00
Reuben Morais
fdf5700d37
Merge pull request #3045 from ricky-cck/master
Fix csv writer parameter
2020-06-08 14:01:15 +02:00
Reuben Morais
b3ae9701b1
Merge pull request #3046 from mozilla/setup-decoder-pypi
Use decoder package from PyPI (Fixes #3044)
2020-06-08 14:00:17 +02:00
Reuben Morais
ba7b0f7436 Merge branch 'alphabet-leak' (Fixes #3049) 2020-06-08 13:59:01 +02:00
Reuben Morais
daba4278ff Add explanation of SWIG ignore side effects 2020-06-08 13:58:32 +02:00
Reuben Morais
7a739c9b98
Merge pull request #3047 from mozilla/dev-test-no-rounding
Only use drop_remainder in dataset for train phase
2020-06-08 13:05:03 +02:00
Reuben Morais
06408b8ddd Flip direction of VERSION and GRAPH_VERSION links 2020-06-08 11:22:24 +02:00
Reuben Morais
53192b68b8 Be more specific in %ignoring symbols since it applies to all imports 2020-06-08 11:20:22 +02:00
Reuben Morais
28c7f4c35d Only use drop_remainder in dataset for train phase 2020-06-08 10:47:13 +02:00
Reuben Morais
209056ceb5 Test PyPI decoder package after upload 2020-06-08 10:27:14 +02:00
Reuben Morais
1a808b216e Download decoder wheel from PyPI 2020-06-05 16:28:07 +02:00
Reuben Morais
544aa364fc Publish decoder wheel to PyPI 2020-06-05 16:26:56 +02:00
RickyChan
a252ae01a0 Fix csv DictWriter parameter 2020-06-05 22:27:19 +09:00
RickyChan
3c83f9f24a Fix csv writer parameter [https://docs.python.org/3/library/csv.html#csv.writer] 2020-06-05 19:00:52 +08:00
lissyx
80a3d70686
Merge pull request #3042 from lissyx/tflite-upload
Add missing TFLite binaries
2020-06-05 11:10:16 +02:00
Alexandre Lissy
c074bb2f6d Add missing TFLite binaries 2020-06-05 11:09:17 +02:00
lissyx
11f347b4da
Merge pull request #3037 from lissyx/rtd-taskcluster-united
Make TaskCluster build the docs like RTD
2020-06-04 15:22:07 +02:00
Alexandre Lissy
23139b2430 Make TaskCluster build the docs like RTD 2020-06-04 14:27:16 +02:00
Tilman Kamp
c5ca78a4ed
Merge pull request #3030 from marekjg/master
Fix argument order in pcm_to_np call
2020-06-04 14:08:29 +02:00
lissyx
88584941bc
Merge pull request #3036 from lissyx/doc-fix
Install npm deps for ReadTheDocs
2020-06-04 11:25:58 +02:00
Alexandre Lissy
cd571ff4be Bump VERSION to 0.7.3 2020-06-04 09:48:10 +02:00
Alexandre Lissy
a3b0eb6589 Install npm deps for RTD 2020-06-04 09:48:10 +02:00
lissyx
61696afedc
Merge pull request #3034 from lissyx/bump-v0.7.2
Bump VERSION to 0.7.2
2020-06-04 09:04:36 +02:00
Alexandre Lissy
e3ae74f80a Bump VERSION to 0.7.2 2020-06-04 09:03:50 +02:00
lissyx
222a25f979
Merge pull request #3029 from lissyx/fix-nodejs-win-tests
Fix wrong nodejs version for Windows tests
2020-06-04 07:59:13 +02:00
Marek Grzegorek
8a87d1d100
Fix argument order in pcm_to_np call
pcm_to_np takes the segment buffer as its first argument and the audio format as its second. Passing them in the wrong order causes a "bytes object has no attribute 'channels'" error.
2020-06-03 18:05:52 +02:00
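A small, self-contained stand-in (not the project's actual util.audio code) illustrating the argument order described in the commit message — buffer first, format second:

```
# Stand-in types and function for illustration only.
from collections import namedtuple
import numpy as np

AudioFormat = namedtuple("AudioFormat", "rate channels width")


def pcm_to_np(pcm_buffer, audio_format):
    # Buffer first, format second; swapping them puts a bytes object here
    # and the .channels access below fails.
    channels = audio_format.channels
    samples = np.frombuffer(pcm_buffer, dtype=np.int16)
    return samples.reshape(-1, channels)


fmt = AudioFormat(rate=16000, channels=1, width=2)
buf = np.zeros(160, dtype=np.int16).tobytes()

pcm_to_np(buf, fmt)    # correct order
# pcm_to_np(fmt, buf)  # wrong order -> "bytes object has no attribute 'channels'"
```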
Alexandre Lissy
3ba3e10ecd Fix wrong nodejs version for Windows tests 2020-06-03 17:06:25 +02:00
lissyx
64fd79f9c1
Merge pull request #3027 from lissyx/node-v14-electron-v9
Node v14 electron v9
2020-06-03 15:56:35 +02:00
Alexandre Lissy
75a320e87b Enable ElectronJS / TFLite / Windows tests 2020-06-03 13:45:36 +02:00
Alexandre Lissy
f5d4f7f9d6 Fix typo in description for NodeJS / ARMbian 2020-06-03 11:52:32 +02:00
Alexandre Lissy
d24fb70869 Update node-gyp cache 2020-06-03 11:50:37 +02:00
Alexandre Lissy
aa4c746899 Maximize binary compatibility 2020-06-03 11:50:37 +02:00
Alexandre Lissy
cdeb933c0b Add ElectronJS v9.0 2020-06-03 11:50:37 +02:00
Alexandre Lissy
dc8dbbd398 Add NodeJS v14 2020-06-03 11:50:37 +02:00
Alexandre Lissy
f925dd9fc8 Fix Homebrew checks 2020-06-03 11:50:37 +02:00
Alexandre Lissy
00577873ce Update Homebrew 2020-06-03 11:50:37 +02:00
Reuben Morais
60397964e1 Add some native_client build outputs to .gitignore
X-DeepSpeech: NOBUILD
2020-06-02 12:29:52 +02:00
Reuben Morais
b327fa3c73
Merge pull request #3025 from reuben/pr3024
PR #3024
2020-05-30 18:27:21 +02:00
Reuben Morais
ab2ba41c7f Convert path to str to fix Python 3.5 compat 2020-05-30 15:16:30 +02:00
Shubham Kumar
84d2f2a5f1
update description of test-training-unittests*yml 2020-05-29 20:54:27 +05:30
Shubham Kumar
73c4f3a201
update tc-train-unittest.sh 2020-05-29 20:50:36 +05:30
Shubham Kumar
ccca1c1fed
add tests to TC and update travis 2020-05-29 20:28:52 +05:30
Shubham Kumar
0b78f4ff01
move unittest to TC 2020-05-29 17:06:08 +05:30
Reuben Morais
b9f9b3cedd Merge branch 'import_cv2_multiprocessing' (Fixes #3008) 2020-05-28 00:00:43 +02:00
Reuben Morais
3d0ec01853 Fix typo from argument reordering 2020-05-27 19:02:55 +02:00
Reuben Morais
af4bc31c27
Merge pull request #3021 from mozilla/confidence-raw-scores
Return raw scores in confidence value (Fixes #3004)
2020-05-27 18:40:02 +02:00
Reuben Morais
d548222518 Return raw scores in confidence value 2020-05-27 16:48:57 +02:00
Reuben Morais
65bbc9ae34
Explicitly mention Bazel 0.24.1 since the TensorFlow documentation page skips 1.15 2020-05-27 09:24:01 +02:00
Reuben Morais
99c34df368
Update dangling reference to removed scorer scripts docs
See #3016
2020-05-26 17:35:08 +02:00
Reuben Morais
aef9ed9792 Add missing artifacts_deps entries to RTD task 2020-05-26 15:12:29 +02:00
Reuben Morais
28cefa0c78 Fix .yaml -> .yml extension in RTD update task 2020-05-26 15:05:28 +02:00
Reuben Morais
05f246e48c Make sure Travis can handle PR w/ non-master base
X-DeepSpeech: NOBUILD
2020-05-26 12:55:44 +02:00
Reuben Morais
b12c7be710
Merge pull request #3012 from reuben/index-ts
Switch index.js to TypeScript
2020-05-25 19:57:02 +02:00
Reuben Morais
45d8f7cd61 Explicitly pass filter context to multiprocessing function 2020-05-25 18:00:08 +02:00
Reuben Morais
a462d951cf Remove unneeded npm dependencies for doc build 2020-05-25 17:09:50 +02:00
Reuben Morais
31ba2898a8
Merge pull request #3009 from mozilla/lm-docs-rtd
Add data/lm doc to RTD, and some general doc improvements and fixes
2020-05-25 16:18:57 +02:00
Reuben Morais
facdff8c70 Switch JavaScript index.js to TypeScript 2020-05-25 13:08:02 +02:00
Reuben Morais
0ed7d301e3
Merge pull request #3011 from mozilla/pr3010
PR #3010 - Fix Stream.intermediateDecodeWithMetadata + tests
2020-05-25 12:01:29 +02:00
Reuben Morais
fdd3b319a5 Exercise intermediateDecode and intermediateDecodeWithMetadata in streaming tests 2020-05-25 11:01:40 +02:00
Reuben Morais
1f30cf2717 Use Buffer type in TS definitions that take a Buffer 2020-05-25 11:01:15 +02:00
Reuben Morais
83320c1a10 Remove bogus Stream parameter in Stream.intermediateDecode TS definition 2020-05-25 11:00:58 +02:00
Greg Richardson
b8f9d036c2 Fix JS IntermediateDecodeWithMetadata binding 2020-05-24 21:38:56 -06:00
Reuben Morais
4356a2764b Add data/lm doc to RTD, and some general doc improvements and fixes 2020-05-24 15:35:10 +02:00
Reuben Morais
50b0b8c010 Propagate error code from load_lm in decoder binding 2020-05-20 12:05:35 +02:00
Reuben Morais
420b2c5673
Merge pull request #3002 from mozilla/update-msys2
Update MSYS2 base archive and work around startup problem
2020-05-19 22:21:55 +02:00
Reuben Morais
90ce0921bb Adjust TC_MSYS_VERSION to match uname in new MSYS2 base 2020-05-19 19:17:36 +02:00
Alexandre Lissy
b6b2ec6d64 Update MSYS2 base archive and work around startup problem 2020-05-19 18:05:52 +02:00
Reuben Morais
4ce7da717b
Merge pull request #3001 from mozilla/setup-py-decoder-windows
Windows support in setup.py decoder wheel installation (Fixes #2992)
2020-05-19 16:10:35 +02:00
Reuben Morais
ac2b63c0bf Windows support in setup.py decoder wheel installation 2020-05-19 14:50:26 +02:00
Reuben Morais
430132c5a5
Merge pull request #2998 from mozilla/scorer-error
Improve error handling around Scorer (Fixes #2995 and #2996)
2020-05-19 13:55:54 +02:00
Tilman Kamp
1c2d89723f
Merge pull request #2897 from tilmankamp/live_augmentation
Live augmentation
2020-05-19 09:05:47 +02:00
Reuben Morais
836707d3ab Disable pacman update to workaround zstd package issuee 2020-05-19 00:10:13 +02:00
Reuben Morais
4cfe5e535d
Merge pull request #2999 from mozilla/readthedocs-automation
Add task to trigger ReadTheDocs builds & version update
2020-05-18 21:05:58 +02:00
Reuben Morais
a590e3726b Add link to RTD, actually exit on error 2020-05-18 19:04:33 +02:00
Reuben Morais
1276c47d8f Trigger same error message for all input formats 2020-05-18 18:57:23 +02:00
Reuben Morais
e8647aa5fa Add missing import in generate_package.py 2020-05-18 18:52:30 +02:00
Reuben Morais
ce00feffaa Add task to trigger ReadTheDocs builds & version update
X-DeepSpeech: NOBUILD
2020-05-18 18:46:08 +02:00
Reuben Morais
bfd90f1f9b Include error descriptions in documentation page 2020-05-18 17:41:53 +02:00
Reuben Morais
361e216297 Only ignore (expected) missing trie error in generate_package.py 2020-05-18 17:30:49 +02:00
Reuben Morais
4db20b3cd6 Expose DS error codes in ds_ctcdecoder package 2020-05-18 17:30:48 +02:00
Reuben Morais
afb8c55b6e Basic coverage of DS_ErrorCodeToErrorMessage 2020-05-18 17:30:48 +02:00
Reuben Morais
d3d5398d6b Move error definition and description together 2020-05-18 17:30:48 +02:00
Tilman Kamp
ac9a17d8a7 Moved signal augmentation tests to own test config 2020-05-18 15:34:09 +02:00
Tilman Kamp
96caa2d115 Follow up on PR comments 2020-05-18 14:23:02 +02:00
Tilman Kamp
7b08e595a7 Value range unit tests 2020-05-18 12:39:15 +02:00
Reuben Morais
7ed900e333 Add better error information in Scorer initialization 2020-05-18 11:19:19 +02:00
Kelly Davis
3b4c39f80d
Merge pull request #2994 from Jendker/patch-1
Bug fix - test_csvs argument was ignored
2020-05-15 19:33:48 +02:00
Jędrzej Beniamin Orbik
0849261b38
Bug fix - test_csvs argument was not used
Removed spurious overwriting of argument 'test_csvs' in evaluate.py
This bug led to problems in lm_optimizer.py
2020-05-15 18:59:56 +02:00
Tilman Kamp
a5303ccca6 Renamed prepare_samples to augment_samples 2020-05-14 16:50:18 +02:00
Tilman Kamp
64e14886b8 Apply suggestions from code review
Co-authored-by: Reuben Morais <reuben.morais@gmail.com>
2020-05-14 15:04:52 +02:00
Reuben Morais
2e9c281d06
Merge pull request #2990 from mozilla/release-071
Bump VERSION to 0.7.1
2020-05-12 17:29:44 +02:00
Reuben Morais
d1b4ea8538 Bump VERSION to 0.7.1 2020-05-12 16:06:01 +02:00
Reuben Morais
e23390eb89
Merge pull request #2982 from mozilla/node-tflite-stream-tests
Run streaming tests in Node TFLite tasks too
2020-05-12 16:01:26 +02:00
Tilman Kamp
c5ceee26dd Live audio augmentation 2020-05-12 10:18:21 +02:00
Tilman Kamp
927859728f Named tuple AudioFormat, parameter re-ordering in util.audio and NP to PCM conversion support 2020-05-12 10:09:15 +02:00
Tilman Kamp
f8acf5cba7
Merge pull request #2989 from JRMeyer/pip-install-fix
remove bad reference to requirements.txt
2020-05-12 08:41:28 +02:00
josh meyer
de710ab3d6 remove bad reference to requirements.txt 2020-05-11 18:27:49 -07:00
Tilman Kamp
ef81bb9512
Merge pull request #2984 from tilmankamp/cv-train-all
CV2 importer: train-all.csv
2020-05-07 16:54:44 +02:00
Tilman Kamp
3871cdc67f CV2 importer: Writes additional train-all.csv with all validated samples except speakers and/or transcripts already in dev or test 2020-05-07 15:09:42 +02:00
Reuben Morais
33cba89227
Merge pull request #2983 from mozilla/alpha-071a2
Bump VERSION to 0.7.1-alpha.2
2020-05-07 14:11:25 +02:00
Reuben Morais
e188e255b0 Bump VERSION to 0.7.1-alpha.2 2020-05-07 13:12:43 +02:00
Reuben Morais
7394b13141 Handle missing xxd gracefully 2020-05-06 13:58:06 +02:00
Reuben Morais
3b53f92c0a Run streaming tests in Node TFLite tasks too 2020-05-06 13:58:06 +02:00
Reuben Morais
9a7ec1ae0d
Merge pull request #2981 from mozilla/pr2980-tests
PR #2980 + tests (Fixes #2979)
2020-05-06 13:26:58 +02:00
Reuben Morais
a02eddec38 Add JS test for streaming + metadata 2020-05-06 12:04:17 +02:00
Reuben Morais
b0e0972b78 Fix reference to Stream.finishStreamWithMetadata 2020-05-06 12:03:50 +02:00
Reuben Morais
e6e3cc539b Mark aNumResults parameters in *withMetadata methods as optional 2020-05-06 12:03:14 +02:00
Matt McCartney
41d990e538 fix(js): declare FinishStreamWithMetadata result object
- as demanded by strict mode

close #2979
2020-05-05 14:32:07 -07:00
Reuben Morais
553c8ec6f1
Merge pull request #2976 from lwalejko/fix_beam_width_setting_in_python_client
Fix beam width setting in python client
2020-05-05 17:06:57 +02:00
Łukasz Wałejko
d8c3b71033 fix beam width setting in python client 2020-05-05 14:54:36 +02:00
Reuben Morais
409d19346a Merge branch 'issue2968' (Fixes #2968) 2020-05-04 14:09:16 +02:00
Reuben Morais
def8214454 Remove dts-gen docs from JavaScript package README
X-DeepSpeech: NOBUILD
2020-05-04 14:08:52 +02:00
Reuben Morais
f848bf4940
Merge pull request #2971 from mozilla/bump-0.7.1a1
Bump VERSION to 0.7.1-alpha.1
2020-05-04 13:33:59 +02:00
Reuben Morais
524f7a7646
Bump VERSION to 0.7.1-alpha.1 2020-05-04 11:42:43 +02:00
Reuben Morais
d36092cd9b
Merge pull request #2970 from mozilla/enabledecoder-error-handling
Improve error handling for DS_EnableExternalScorer (Fixes #2969)
2020-05-04 11:41:32 +02:00
Reuben Morais
48971413e5 Improve EnableExternalScorer error handling in Python and JS bindings 2020-05-03 15:39:31 +02:00
Reuben Morais
5deb8a2f7b Don't leave partially initialized scorer on failure 2020-05-03 15:39:07 +02:00
Reuben Morais
fdee05f321
Merge pull request #2960 from mozilla/new-alpha-071
Bump VERSION to 0.7.1-alpha.0
2020-05-01 20:58:54 +02:00
Reuben Morais
8240666bb3 Bump VERSION to 0.7.1-alpha.0 2020-05-01 19:28:27 +02:00
Reuben Morais
de2b58ae1e Merge branch 'pr/2966' (Fixes #2966) 2020-05-01 19:27:21 +02:00
Reuben Morais
1afabc0e69 Minor tweaks to the automatic mixed precision docs 2020-05-01 19:27:13 +02:00
Reuben Morais
b71294d6c2
Merge pull request #2964 from mozilla/python-client-no-candidates
Add --candidate_transcripts flag to Python client
2020-05-01 19:07:13 +02:00
Reuben Morais
c324e4c8c2 Retry one more time 2020-05-01 17:32:39 +02:00
Reuben Morais
efd536bfa3 Retry tasks on TaskCluster 2020-05-01 00:14:46 +02:00
Łukasz Wałejko
ae2d3754e6
Update TRAINING.rst 2020-04-30 15:24:15 +02:00
Reuben Morais
e0283f529a Add --candidate_transcripts flag to Python client 2020-04-30 10:09:09 +02:00
Reuben Morais
26e2f88bfe
Merge pull request #2959 from mozilla/test-output-utf8
Don't escape non-ASCII chars in test_output_file JSON & other small fixes
2020-04-29 18:05:57 +02:00
Reuben Morais
6eb784bd3e Make DeepSpeech.py executable and call Python interpreter explicitly in docs
X-DeepSpeech: NOBUILD
2020-04-29 15:58:44 +02:00
Reuben Morais
aa143e1b9e Add missing log_warn import 2020-04-29 15:45:09 +02:00
Reuben Morais
b283aadae6 Don't escape non-ASCII characters in test_output_file 2020-04-29 15:45:09 +02:00
Reuben Morais
65b7c41746
Merge pull request #2957 from mozilla/nodejs-stream-wrapper
Return Stream wrapper in JS Model.createStream, add test coverage (Fixes #2956)
2020-04-29 15:18:20 +02:00
Reuben Morais
09b756aa0c
Merge pull request #2951 from mozilla/hack-mac-sox-static
Ugly, very ugly, incredibly ugly static linking of libsox on macOS
2020-04-29 14:16:39 +02:00
Reuben Morais
6f83e05341 Test JS client streaming mode 2020-04-29 14:05:18 +02:00
Reuben Morais
be1bd04b50 Add streaming mode to JS client 2020-04-29 13:43:40 +02:00
Reuben Morais
b0415af4b4 Return Stream wrapper in JS Model.createStream 2020-04-29 13:43:28 +02:00
lissyx
d120c4096a
Merge pull request #2954 from lissyx/py-tflite
Python TFLite tests
2020-04-29 11:27:10 +02:00
Alexandre Lissy
ecdcf9e28a Python TFLite tests 2020-04-29 02:05:14 +02:00
Reuben Morais
4930186197
Merge pull request #2949 from mozilla/docs-rtd
Docs centered on ReadTheDocs instead of GitHub
2020-04-28 16:06:44 +02:00
Reuben Morais
ea7475d09c Ugly, very ugly, incredibly ugly static linking of libsox on macOS
All of the brew installed dependencies have static libraries as well, but the macOS linker will always prefer a dynamic library if both exist under the same `-L/foo -lbar` resolution. The only way to force static linking is to include a full path to the static library. These changes basically reverse engineer the static library locations and then pass those to the linker.
2020-04-28 14:13:53 +02:00
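A minimal sketch of the approach described in the commit message above (illustrative Python, not the project's actual build scripts): resolve each Homebrew dependency's prefix and hand the full path of its static archive to the linker so the dynamic library cannot be picked up instead.

```python
# Sketch only: resolve the full path to a Homebrew formula's static archive,
# so it can be passed to the linker verbatim instead of "-L<dir> -l<name>".
# The helper name is hypothetical; "brew --prefix <formula>" is a real command.
import subprocess
from pathlib import Path


def static_lib_path(formula: str, lib: str) -> str:
    """Return the absolute path to the static archive installed by `formula`."""
    prefix = subprocess.check_output(
        ["brew", "--prefix", formula], text=True
    ).strip()
    archive = Path(prefix) / "lib" / f"lib{lib}.a"
    if not archive.is_file():
        raise FileNotFoundError(f"no static archive at {archive}")
    return str(archive)


if __name__ == "__main__":
    # e.g. prints something like /usr/local/opt/sox/lib/libsox.a
    print(static_lib_path("sox", "sox"))
```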
Reuben Morais
6f9fcf3029 Embed flag definitions 2020-04-28 13:33:45 +02:00
david gauchard
117324e665
Add a --discount_fallback option to generate_lm.py (#2945) 2020-04-28 11:58:41 +02:00
Reuben Morais
d85b0960eb Address review comment 2020-04-28 11:55:58 +02:00
Reuben Morais
1838a1e0d4 Remove FAQ reference and reword SUPPORT.rst a bit 2020-04-28 11:55:20 +02:00
lissyx
060bddde8c
Merge pull request #2946 from jschueller/patch-1
Install deepspeech header
2020-04-28 00:29:25 +02:00
Reuben Morais
a584c8e6b6 Docs centered on ReadTheDocs instead of GitHub 2020-04-27 20:31:11 +02:00
Reuben Morais
6e9b251da2 Re-introduce master warning to main README 2020-04-26 09:22:51 +02:00
Julien Schueller
1eed145aa1
Install deepspeech header 2020-04-26 09:19:03 +02:00
lissyx
6e834b088d
Merge pull request #2944 from jschueller/patch-1
Doc: Mention we explicitely need Bazel 0.24.1
2020-04-25 15:31:34 +02:00
Julien Schueller
1a6fabf346
Doc: Mention we explicitely need Bazel 0.24.1
ref https://github.com/mozilla/DeepSpeech/issues/2943
2020-04-25 14:15:16 +02:00
Reuben Morais
3fbbca2b55
Merge pull request #2939 from mozilla/new-version
Bump VERSION to 0.7.0 and update docs
2020-04-24 18:14:56 +02:00
Reuben Morais
b25404e294 Bump VERSION to 0.7.0 and update docs 2020-04-24 16:50:31 +02:00
Reuben Morais
8c0fbcbbbf Merge branch 'new-alpha' (Fixes #2938) 2020-04-24 14:37:59 +02:00
Reuben Morais
0720bcb713 Skip prebuilt decoder wheel in Dockerfile 2020-04-24 13:46:17 +02:00
Reuben Morais
43a85518d2 Bump VERSION to 0.7.0-alpha.4 2020-04-24 13:19:59 +02:00
Reuben Morais
01c269da9c
Merge pull request #2936 from mozilla/update-prod-models
Update prod model
2020-04-24 13:15:37 +02:00
Reuben Morais
ff8570c1f3 Update prod model expected inference results 2020-04-24 09:00:15 +02:00
Reuben Morais
3353f6a60a Update prod model 2020-04-23 21:29:31 +02:00
Reuben Morais
b8416bd4bf
Merge pull request #2935 from GoldschmittGabriel/patch-1
Update import_swc.py
2020-04-23 14:58:59 +02:00
GaGo
3daca6f209
Update import_swc.py
I tried to use the importer and hit an error: path not defined.

I think it appeared after the refactor by @reuben.
Nothing big, but I think it is worth a commit. :)
2020-04-23 14:45:39 +02:00
Kelly Davis
1cf002731f
Merge pull request #2934 from mozilla/unpin-numpy-training
Unpin numpy version in training package
2020-04-23 11:55:25 +02:00
Kelly Davis
ba6ff124d5
Merge pull request #2933 from mozilla/scorer_params
Updated alpha and beta
2020-04-23 11:17:13 +02:00
Reuben Morais
301def71e0 Unpin numpy in training package 2020-04-23 11:00:09 +02:00
kdavis-mozilla
f3b4943e18 Updated alpha and beta 2020-04-23 07:19:37 +02:00
lissyx
cc449e0443
Merge pull request #2925 from Jendker/evaluate_tflite_fix
Add missing external scorer
2020-04-21 15:34:29 +02:00
lissyx
de5bfed8b3
Merge pull request #2929 from lissyx/ci-py37-py38
Fix #2928: Add Python 3.7 CI coverage
2020-04-20 18:21:10 +02:00
Tilman Kamp
eed66e3b75
Merge pull request #2926 from tilmankamp/fix-mailab-importer
M-AILAB importer: Ensure all samples are 16 kHz
2020-04-20 17:44:06 +02:00
Tilman Kamp
e99e06a278 M-AILAB importer: Ensure all samples are 16 kHz 2020-04-20 17:34:58 +02:00
Alexandre Lissy
a48341f57f Fix #2928: Add Python 3.7, 3.8 CI coverage 2020-04-20 17:21:19 +02:00
lissyx
d4822a2f97
Merge pull request #2927 from lissyx/m_or_mu
Do not use m/mu ABI for Py3.8+
2020-04-20 17:12:34 +02:00
Alexandre Lissy
08ff548d26 Do not use m/mu ABI for Py3.8+ 2020-04-20 16:23:42 +02:00
lissyx
933623b9be
Merge pull request #2921 from lissyx/fix-dsswig-priority
Force ds-swig first in PATH to avoid conflicts if a system-wide version exists
2020-04-20 16:16:30 +02:00
Jedrzej Beniamin Orbik
a4b08594eb Added missing external scorer 2020-04-20 16:04:54 +02:00
Alexandre Lissy
8835b4d64a Force numba pinned version 2020-04-20 13:49:34 +02:00
Alexandre Lissy
67522cc986 Force numba pinned version 2020-04-20 13:28:25 +02:00
Alexandre Lissy
9b7724e559 Force ds-swig first in PATH to avoid conflicts if a system-wide version exists 2020-04-20 12:39:48 +02:00
Reuben Morais
bf3ebefd60
Merge pull request #2911 from jimregan/patch-1
import_lingua_libre.py: n channels + bitdepth
2020-04-17 21:21:12 +02:00
DanBmh
bfe778482c
Refactor generate_package.py (#2903)
* Improve formatting and paths in LM README

* Improve logging in generate_package.py

Co-authored-by: Daniel <daniel@mail.de>
2020-04-17 21:20:45 +02:00
Reuben Morais
a019e979ff
Merge pull request #2915 from mozilla/delay-beam-expansion (Fixes #2867)
Delay beam expansion until a non-blank label has probability >0.1%
2020-04-17 21:03:08 +02:00
Reuben Morais
33760a6bcd Delay beam expansion until a non-blank label has probability >0.1% 2020-04-17 14:33:40 +02:00
Kelly Davis
7efdfc54a6
Merge pull request #2912 from JRMeyer/transfer-learning-docs2
Transfer Learning docs
2020-04-17 13:11:49 +02:00
JRMeyer
2342ba7956 rebased docs on master 2020-04-16 18:29:53 -07:00
Jim Regan
5a7e4ea348
import_lingua_libre.py: n channels + bitdepth 2020-04-16 22:44:32 +02:00
lissyx
8c76c92694
Merge pull request #2909 from madprogramer/patch-1
Documentation Suggestion: native_client can be found in the releases
2020-04-16 21:21:26 +02:00
madprogramer
e6779e8f84
Added: native_client can be found in the releases
I wanted to install the binary without cloning the entire repository (and with it the required `util/taskcluster.py`) and was only able to find it pretty much by chance. I feel that adding this to the README could save people a few headaches.
2020-04-16 15:45:56 +03:00
Reuben Morais
40250988db Merge branch 'pr2801' (Fixes #2801) 2020-04-14 13:07:50 +02:00
Reuben Morais
c27387fd98 README tweaks 2020-04-14 13:07:44 +02:00
Daniel
f82a77f249 Update readme. 2020-04-14 13:00:23 +02:00
Kelly Davis
c80d7f6f3d
Merge pull request #2900 from chrillemanden/fix-train-doc-typos
Fix documentation typos in section Augmentation
2020-04-14 10:06:34 +02:00
Reuben Morais
2b88c76737
Merge pull request #2901 from mozilla/issue2526
Use Alphabet to compute string values in get_prev_* (Fixes #2526)
2020-04-13 13:48:38 +02:00
Reuben Morais
5fa1839a7f Use Alphabet to compute string values in get_prev_* 2020-04-13 10:23:23 +02:00
chrillemanden
737c92f962 Fix documentation typos in section Augmentation 2020-04-12 15:59:32 +02:00
Daniel
8c73bf6fbf Small fixes. 2020-04-09 16:58:22 +02:00
Daniel
00e4dbe3fd Merge remote-tracking branch 'upstream/master' 2020-04-08 20:27:43 +02:00
Daniel
c29c0beb72 Default to required params. 2020-04-08 20:23:04 +02:00
lissyx
0f71bd2493
Merge pull request #2894 from lissyx/local-swig
Fix #2885: Improve ds-swig integration
2020-04-08 19:45:49 +02:00
Alexandre Lissy
a7deb9ee79 Ensure docker build pip really install locally built package 2020-04-08 18:48:12 +02:00
Alexandre Lissy
d8d5e6f358 Fix #2885: Improve ds-swig integration 2020-04-08 18:48:12 +02:00
Reuben Morais
96418bea15
Merge pull request #2884 from mozilla/issue2883
Only allow graph/layer initialization at start of training (Fixes #2883)
2020-04-08 13:47:41 +02:00
lissyx
83675f5ad8
Merge pull request #2893 from lissyx/example-api-net
Example api net
2020-04-08 12:47:40 +02:00
Alexandre Lissy
a699896282 Update Sphinx deps 2020-04-08 12:27:35 +02:00
Alexandre Lissy
c28d7dd9c4 Fix useless parsing of doc/node_modules/ 2020-04-08 12:21:02 +02:00
Alexandre Lissy
2d398b64d8 Fix DeepSpeechStream reference 2020-04-08 12:20:46 +02:00
Alexandre Lissy
2b51316d68 Add .Net API usage example 2020-04-08 11:57:38 +02:00
lissyx
e22890c251
Merge pull request #2890 from lissyx/doc-training
Name section more explicit
2020-04-07 19:58:38 +02:00
Alexandre Lissy
75a0205b37 Name section more explicit 2020-04-07 19:49:05 +02:00
Reuben Morais
0c6e90868e Split --load into two to avoid unexpected behavior at evaluation time 2020-04-07 14:24:05 +02:00
Reuben Morais
cc7a0ada46 Only allow graph/layer initialization at start of training 2020-04-07 14:24:05 +02:00
lissyx
3c20251684
Merge pull request #2889 from lissyx/api-example-refs
Fix #2888: Use start-after / end-before for API example line references
2020-04-07 14:19:48 +02:00
Alexandre Lissy
5e3c5e9131 Fix #2888: Use start-after / end-before for API example line references 2020-04-07 14:01:09 +02:00
lissyx
b5a805056f
Merge pull request #2882 from lissyx/PR2876
Pr2876
2020-04-06 23:52:45 +02:00
Alexandre Lissy
88ac227ebe Fix decoder doc not generated 2020-04-06 22:12:00 +02:00
Alexandre Lissy
5723dba180 Update doc for TypeScript support 2020-04-06 22:12:00 +02:00
Alexandre Lissy
3581bdf9fe Add TypeScript CI 2020-04-06 22:12:00 +02:00
Alexandre Lissy
bf31b2e351 Expose Stream-related within Stream class 2020-04-06 22:12:00 +02:00
Alexandre Lissy
567595aa5a Package and expose TypeScript for JS interface 2020-04-06 11:15:42 +02:00
Daniel
a291e23041 Update readme. 2020-04-04 10:50:28 +02:00
Daniel
e16b72ff28 Use os.join and kenlm parameter usage description. 2020-04-03 17:58:52 +02:00
Anas Abou Allaban
510d71353f Update README
Signed-off-by: Anas Abou Allaban <aabouallaban@pm.me>
2020-04-03 13:28:56 +02:00
Anas Abou Allaban
be40b07307 Add type declaration file for v0.7.0
Signed-off-by: Anas Abou Allaban <aabouallaban@pm.me>
2020-04-03 13:28:56 +02:00
Tilman Kamp
510e29fe65
Merge pull request #2879 from JRMeyer/batch-transcribe
batch transcribe dir (recursively) with transcribe.py
2020-04-03 09:05:28 +02:00
JRMeyer
cad03d33ea simplify function 2020-04-02 21:40:57 -07:00
lissyx
0cee75c295
Merge pull request #2880 from NormanTUD/master
Corrected Typo in Dockerfile
2020-04-02 10:29:14 +02:00
Norman Koch
b88b63399a Corrected Typo in Dockerfile 2020-04-02 04:18:14 -04:00
JRMeyer
aa223d7090 batch transcribe dir (recursively) with transcribe.py 2020-04-01 20:40:53 -07:00
Daniel
e862cd41db Read from input.txt.gz again. 2020-04-01 17:29:02 +02:00
Daniel
38afe38f0b Implement some change request. 2020-04-01 17:15:52 +02:00
Daniel
b27e0347b1 Add more parameters.
Implement some change request.
2020-04-01 16:54:58 +02:00
Reuben Morais
4d567cef38
Merge pull request #2877 from mozilla/fix-compute-for-setup.py
Fix .compute for packaged training code
2020-04-01 16:35:28 +02:00
Reuben Morais
7dab19ebe5 Fix .compute for packaged training code 2020-04-01 16:24:36 +02:00
Kelly Davis
0cc815f1f0
Merge pull request #2826 from TeHikuMedia/add_trial_pruning
Add trial pruning to lm_optimizer.py
2020-04-01 14:56:31 +02:00
lissyx
d80cdb564a
Merge pull request #2868 from lissyx/doc-validate_label
Mention validate_label_locale in training doc
2020-04-01 14:55:00 +02:00
Alexandre Lissy
0a11a8293e Mention validate_label_locale in training doc
Fixes #2865
2020-04-01 14:54:20 +02:00
lissyx
7b8b678310
Merge pull request #2869 from lissyx/scorer-fail-early
Add some early checks, for Scorer at first
2020-04-01 14:51:05 +02:00
Tilman Kamp
5acc5282c7
Merge pull request #2859 from tilmankamp/dot-compute
Updated .compute
2020-04-01 11:42:00 +02:00
Reuben Morais
7c3c9d0b8d
Merge pull request #2871 from mozilla/decoder-as-dep
Automatically install ds_ctcdecoder in setup.py
2020-04-01 10:29:50 +02:00
Reuben Morais
af7e2c294d Pin versions of pip, setuptools, wheel in training readme
X-DeepSpeech: NOBUILD
2020-04-01 10:29:22 +02:00
Reuben Morais
c428acf478 Automatically install ds_ctcdecoder in setup.py 2020-03-31 18:47:45 +02:00
Alexandre Lissy
950d097ca1 Add some early checks, for Scorer at first
Fixes #2807
2020-03-31 15:46:57 +02:00
Reuben Morais
83d22e591b
Merge pull request #2856 from reuben/training-install
Package training code to avoid sys.path hacks
2020-03-31 15:42:42 +02:00
Reuben Morais
02fa9c781c Fix Python 3.5 compat issue 2020-03-31 14:59:32 +02:00
Reuben Morais
dc119880b9 Sync training package version with main version 2020-03-31 14:02:26 +02:00
Reuben Morais
c738d55012 Remove unneeded six.moves import 2020-03-31 13:57:44 +02:00
Reuben Morais
2f68ed1001 Remove unneeded future imports from importers 2020-03-31 13:55:32 +02:00
Reuben Morais
6f0bf3b3a8 Reformat importers with black 2020-03-31 13:43:30 +02:00
Reuben Morais
b7e6b8c3e6 Sort importer imports with isort 2020-03-31 13:43:00 +02:00
Reuben Morais
20b0ab17ea Remove unused GPU usage tools 2020-03-31 13:42:41 +02:00
lissyx
e1206f47d3
Merge pull request #2833 from alexcannan/master
Fixed sample rate logic in Python client
2020-03-30 23:45:59 +02:00
Reuben Morais
967043ef95
Merge pull request #2863 from mozilla/check-alphabet-generate-package
Error early in generate_package.py if no alphabet was specified and not using UTF-8 mode
2020-03-30 19:42:19 +02:00
Reuben Morais
09673581a4 Error early in generate_package.py if no alphabet was specified and not using UTF-8 mode
X-DeepSpeech: NOBUILD
2020-03-30 18:16:56 +02:00
Daniel
a79cc0cee9 Merge remote-tracking branch 'upstream/master' 2020-03-29 12:34:03 +02:00
Daniel
f97c79e0e8 Make generate_lm.py language independent. 2020-03-29 12:29:18 +02:00
Tilman Kamp
0c4a2050b8 Updated .compute 2020-03-27 17:08:57 +01:00
Reuben Morais
080dc7df3c
Add a little more documentation around the decoder and UTF-8 mode (#2850)
* Add decoder and UTF-8 docs

* Address review comments
2020-03-27 11:26:19 +01:00
Reuben Morais
a05baa35c9 Package training code to avoid sys.path hacks 2020-03-25 21:34:50 +01:00
lissyx
58bc2f2bb1
Merge pull request #2853 from lissyx/bump-v0.7.0-alpha.3
Bump VERSION to 0.7.0-alpha.3
2020-03-25 13:04:43 +01:00
Alexandre Lissy
60150bd0bd Bump VERSION to 0.7.0-alpha.3 2020-03-25 13:04:11 +01:00
lissyx
6720662cfd
Merge pull request #2848 from lissyx/PR2843+build-ctc-windows
Pr2843+build ctc windows
2020-03-25 10:50:16 +01:00
Alexandre Lissy
1598be8124 Add CTC decoder build on TaskCluster 2020-03-25 09:35:12 +01:00
lissyx
ea233439ce
Merge pull request #2843 from ryojiysd/ctcdecoder-wheels-win
Support building ctc_decoder wheel package on Windows system
2020-03-25 09:34:47 +01:00
lissyx
c8046cbff0
Fix -fPIC typo 2020-03-25 09:18:58 +01:00
Ryoji Yoshida
4ffbd46cea Address review comments 2020-03-25 10:33:22 +09:00
Tilman Kamp
8088b574fc
Merge pull request #2849 from tilmankamp/unlabeled-samples
Fix #2830 - Support for unlabeled samples
2020-03-24 18:20:22 +01:00
Tilman Kamp
41da7b2870 Fix #2830 - Support for unlabeled samples 2020-03-24 16:53:02 +01:00
Ryoji Yoshida
d7cca2a791 Support building ctc_decoder wheel package on Windows system 2020-03-24 09:48:30 +09:00
Kelly Davis
5740d64e6e
Merge pull request #2840 from NormanTUD/master
Corrected typo in flags
2020-03-22 20:57:16 +01:00
Norman Koch
6ec19b9498 Corrected typo in flags 2020-03-22 20:39:41 +01:00
Shubham Kumar
5d50d21da0
Exposing ErrorCode API in Tree bindings (#2806)
* Added API to Python bindings

* Added API to JavaScript bindings

* Added API to Java bindings

* Added API to .NET binding
2020-03-21 15:40:39 +01:00
Reuben Morais
dfd69e47f1
Merge pull request #2835 from mozilla/reuben-bump-tf
Bump dependency to TensorFlow 1.15.2
2020-03-20 21:59:53 +01:00
lissyx
a270d23814
Merge pull request #2834 from lissyx/new-android
Add CI for Android 8.0, 9.0 and 10.0
2020-03-20 20:28:34 +01:00
Alexandre Lissy
1bedf9ef60 Add CI for Android 8.0, 9.0 and 10.0
We limit ourselves to x86_64 because it seems Google does not provide
any system images after API level 25 for arm64-v8a and armeabi-v7a.
There is also no system image for API level 27 for x86_64.
2020-03-20 19:40:36 +01:00
Reuben Morais
017c9a6f8c
Bump dependency to TensorFlow 1.15.2
Fixes security issues in TensorFlow and stops GitHub nagging us.
2020-03-20 19:39:03 +01:00
Reuben Morais
903d0b8fe4
Merge pull request #2792 from reuben/multiple_transcriptions
Expose multiple transcriptions in "WithMetadata" API
2020-03-20 16:58:32 +01:00
Reuben Morais
ee30a1c9de Adapt Java bindings to const structs 2020-03-20 13:51:29 +01:00
Alex Cannan
4fd39175b3 Fixed sample rate logic in Python client 2020-03-19 14:37:59 -04:00
lissyx
5e46d702af
Merge pull request #2821 from lissyx/win-cuda-tests
Add Windows CUDA CI
2020-03-19 18:10:46 +01:00
Alexandre Lissy
28ff863b55 Add Windows CUDA CI
Fixes #1948
2020-03-19 16:56:02 +01:00
lissyx
ff9a720764
Merge pull request #2818 from lissyx/validate_label_locale+multiprocessing.notDummy
Validate label locale+multiprocessing.not dummy
2020-03-19 10:14:06 +01:00
Caleb Moses
7072daa05c
Remove try_loading from evaluate call 2020-03-19 10:54:22 +13:00
Reuben Morais
1547498e82 Const members in structs 2020-03-18 19:32:57 +01:00
Alexandre Lissy
7b2a409f9f Converting importers from multiprocessing.dummy to multiprocessing
Fixes #2817
2020-03-18 11:04:36 +01:00
Alexandre Lissy
ce59228824 Localizeable validate_label
Fixes #2804
2020-03-18 11:04:36 +01:00
Alexandre Lissy
f9e05fe0c3 Share argparser amongst importers 2020-03-18 11:04:36 +01:00
Caleb Moses
8e37a5cfb4 Run reset_default_graph before every evaluate 2020-03-18 10:38:51 +13:00
Caleb Moses
c9e6cbc958 Add trial pruning to lm_optimizer.py 2020-03-18 10:14:47 +13:00
Reuben Morais
29a2ac37f0
Merge pull request #2779 from reuben/export-metadata
Write model metadata to export folder unconditionally
2020-03-17 12:08:35 -03:00
Reuben Morais
2ec34d5a06 Address review comments 2020-03-17 14:48:31 +01:00
Reuben Morais
e9ae38bf47 Update docs 2020-03-17 14:48:31 +01:00
Reuben Morais
c52f3b32fa Adapt Java bindings to new API 2020-03-17 14:47:58 +01:00
Reuben Morais
bb709ff955 Adapt .NET bindings to new API 2020-03-17 14:47:58 +01:00
Reuben Morais
09048e2ea2 Adapt JavaScript bindings to new API 2020-03-17 14:47:58 +01:00
Reuben Morais
6e88a37ad4 Adapt Python bindings to new API 2020-03-17 14:47:58 +01:00
Reuben Morais
c74dcffe79 Adjust client.cc for new API and small cleanup of code and function names 2020-03-17 14:47:58 +01:00
Reuben Morais
ea8c7d2957 Add DS_IntermediateDecodeWithMetadata 2020-03-17 14:47:58 +01:00
Reuben Morais
69bd032605 Improve API naming around Metadata objects 2020-03-17 14:47:58 +01:00
dabinat
e1fec4e818 Client - Change JSON output to return alternatives transcripts in an "alternatives" array 2020-03-17 14:47:58 +01:00
dabinat
e0c42f01a4 Moved result limiting to ModelState instead of CTC decoder 2020-03-17 14:47:58 +01:00
dabinat
969b2ac4ba Changed variable names to match coding style 2020-03-17 14:47:58 +01:00
dabinat
004d66d224 Client changes to show multiple transcriptions in JSON output 2020-03-17 14:47:58 +01:00
dabinat
32c969c184 Expose multiple transcriptions through the API 2020-03-17 14:47:58 +01:00
Kelly Davis
b57eaa19d6
Merge pull request #2783 from mozilla/optuna
Added optimizer for lm_alpha + lm_beta
2020-03-17 09:58:01 +01:00
kdavis-mozilla
f0dbdf7855 Renamed optimizer 2020-03-17 09:56:33 +01:00
Reuben Morais
8ca087d955
Merge pull request #2820 from mozilla/taskcluster-py-nonstable
Correctly handle non stable versions in `--branch`
2020-03-12 14:02:10 -03:00
Reuben Morais
94cca3c651
Correctly handle non stable versions in --branch 2020-03-12 13:43:27 -03:00
Reuben Morais
e54eb5a783
Make readthedocs link more obvious 2020-03-12 13:31:42 -03:00
Tilman Kamp
87f70693c3
Merge pull request #2819 from tilmankamp/sdb
Process pool for audio preparation
2020-03-12 15:13:25 +01:00
Tilman Kamp
63bc695600 Process pool for audio preparation 2020-03-12 14:34:24 +01:00
Tilman Kamp
60304da5a6
Merge pull request #2723 from tilmankamp/sdb
Sample DBs
2020-03-12 13:57:28 +01:00
Tilman Kamp
6b1d6773de SDB support 2020-03-10 10:32:58 +01:00
Daniel
f808720b5b Update readme. 2020-03-09 16:34:04 +01:00
Daniel
9c73700ac7 Add error hint and default values for alpha and beta. 2020-03-09 16:26:57 +01:00
lissyx
3bd0b20bf7
Merge pull request #2814 from AI-ML-Projects/master
Enhancement debian package manager tweaks
2020-03-06 21:27:29 +01:00
Pratik Raj
00e3350ea1
Update Dockerfile 2020-03-07 01:33:32 +05:30
Pratik Raj
69586e8c75
Enhancement debian package manager tweaks
Major change 1: Debian package manager tweaks

By default, apt/apt-get on Ubuntu and Debian-based systems installs recommended (but not suggested) packages.

Passing the "--no-install-recommends" option tells apt-get not to treat recommended packages as dependencies to install.

This results in smaller downloads and fewer installed packages.

See the [Ubuntu Blog](https://ubuntu.com/blog/we-reduced-our-docker-images-by-60-with-no-install-recommends) for details.

Major change 2: added the apt-utils and ca-certificates packages

Because the build:

1. is slow, and the log shows this is because "apt-utils" is not installed;

2. exits with an error when no certificate is available.
2020-03-07 00:57:04 +05:30
lissyx
43b93f3164
Merge pull request #2813 from lissyx/enforce-newline-removal
Enforce proper line ending removal when reading alphabet
2020-03-06 15:52:21 +01:00
Alexandre Lissy
763ed38bae Enforce proper line ending removal when reading alphabet
Fixes #2611
2020-03-06 15:19:56 +01:00
lissyx
b52a4a96c6
Merge pull request #2812 from lissyx/real-alphabet-path
Show actual alphabet path in error message
2020-03-06 10:58:03 +01:00
lissyx
0f8291df71
Proper arguments ordering 2020-03-06 10:15:18 +01:00
Alexandre Lissy
61fa1ad428 Show actual alphabet path in error message 2020-03-06 10:13:18 +01:00
Daniel
ef095881ca Fix too many arguments for format string. 2020-03-03 16:58:47 +01:00
Daniel
c6109c30f3 Add some statistics. 2020-03-03 16:49:52 +01:00
Daniel
c9a433486f Add more arguments. Rename file variables. 2020-03-03 16:48:43 +01:00
lissyx
fe8ee4f778
Merge pull request #2802 from lissyx/sample_rate_checking
Ensure sample rate comparison with proper types
2020-02-28 11:56:27 +01:00
Alexandre Lissy
639a68d2ae Ensure sample rate comparison with proper types
Fixes #2798
2020-02-28 11:16:34 +01:00
Daniel
c505a4ec6c Update some comments. 2020-02-27 17:46:16 +01:00
Daniel
15a75c77ff Rewrite generate_lm.py to allow usage with other languages. 2020-02-27 17:18:19 +01:00
lissyx
d18720a7d8
Merge pull request #2790 from lissyx/test-kvm
Use KVM
2020-02-27 17:02:17 +01:00
lissyx
8c96634e6b
Merge pull request #2797 from dabinat/cudnn-doc-change
Doc: change cuDNN dependency to 7.6
2020-02-26 20:53:41 +01:00
dabinat
81dd30847c Doc: change cuDNN dependency to 7.6 2020-02-26 11:33:59 -08:00
Alexandre Lissy
af45400461 Use KVM for Android emulator 2020-02-26 19:49:02 +01:00
Reuben Morais
a9e72eb152 Merge branch 'pr/2794' (Fixes PR #2794) 2020-02-26 14:47:55 +01:00
Reuben Morais
a97e961d16 Fix nits 2020-02-26 14:47:45 +01:00
Shubham Kumar
03196c875d used strdup for showing error 2020-02-26 18:17:50 +05:30
kdavis-mozilla
561131a05c Added optimizer for lm_alpha + lm_beta 2020-02-26 11:41:22 +01:00
Shubham Kumar
c77d3d6f2d added documentation 2020-02-26 14:26:43 +05:30
lissyx
84ac39769c
Merge pull request #2787 from lissyx/gradle-android-prebuilt
Cache gradle deps and Android emulator setup
2020-02-26 09:29:31 +01:00
Shubham Kumar
4fd747e540 added implementation of DS_ErrorCodeToErrorMessage 2020-02-26 13:30:05 +05:30
Alexandre Lissy
c5ded5adfe Cache gradle deps and Android emulator setup 2020-02-25 19:17:13 +01:00
Reuben Morais
377f0bc4b8
Merge pull request #2789 from reuben/issue2786
Make const functions receive const ModelState pointers (Fixes #2786)
2020-02-25 11:57:24 +01:00
Reuben Morais
669aa497cc Address review comments 2020-02-25 11:52:48 +01:00
Reuben Morais
b74738a405 Make const functions receive const ModelState pointers 2020-02-25 11:15:45 +01:00
Reuben Morais
1f1f5a98e4
Merge pull request #2781 from rhamnett/patch-2
Add flag to force reinitialisation of learning rate after lr_plateau
2020-02-24 16:08:13 +01:00
lissyx
b3b357cdac
Merge pull request #2776 from lissyx/pyenv-prebuilt
Produce pyenv ready-to-use
2020-02-24 13:09:33 +01:00
lissyx
4059da2869
Merge pull request #2785 from imskr/sk-fix
added ctc decoder builds tasks
2020-02-24 12:34:21 +01:00
Shubham Kumar
b03d7fe4ee added against dependencies 2020-02-24 17:01:32 +05:30
Alexandre Lissy
e9f530f7c7 Make webrtcvad really optional 2020-02-24 12:08:12 +01:00
Alexandre Lissy
bd5044fe31 Ensure proper python ABI 2020-02-24 12:02:30 +01:00
Alexandre Lissy
8029f3d7dd Produce pyenv ready-to-use 2020-02-24 12:01:36 +01:00
Shubham Kumar
279dc947f1 added ctc decoder builds tasks 2020-02-23 15:01:57 +05:30
Richard Hamnett
a3268545ab
Update flags.py
change flag datatype to boolean
2020-02-21 19:24:13 +00:00
Reuben Morais
46e7993075
Merge pull request #2771 from reuben/warn-sample-rate
Warn if --audio_sample_rate does not match training sample
2020-02-21 19:46:05 +01:00
Richard Hamnett
5e1f54ae4f
Reset learning rate if force set 2020-02-21 18:33:43 +00:00
Richard Hamnett
0de9e4bf80
Add force_initialize_learning_rate
Ability to reset a learning rate that has been reduced by reduce_lr_on_plateau
2020-02-21 18:32:03 +00:00
Reuben Morais
aff310d73a
Merge pull request #2780 from rhamnett/patch-1
Fix transcribe.py - use new checkpoint load method
2020-02-21 18:04:24 +01:00
Reuben Morais
f264134a61
Merge pull request #2778 from mozilla/ftyers-patch-1
Create BIBLIOGRAPHY.md
2020-02-21 18:01:07 +01:00
Francis Tyers
965927d91b
Update BIBLIOGRAPHY.md 2020-02-21 16:59:23 +00:00
Francis Tyers
ba21d4434b
Update BIBLIOGRAPHY.md 2020-02-21 16:56:25 +00:00
Richard Hamnett
e101cb8cc5
Fix transcribe.py - use new checkpoint load method
Replaced the non-existent try_loading() method with the saver method and respected the load flag

Removed tf.train.Saver()
2020-02-21 15:56:58 +00:00
Reuben Morais
48178005a2 Write model metadata to export folder unconditionally 2020-02-21 12:46:44 +01:00
Francis Tyers
943f19e1d5
Create BIBLIOGRAPHY.md 2020-02-20 23:12:46 +00:00
Reuben Morais
4291db7309 Handle graph without learning rate variable for export case 2020-02-20 15:40:35 +01:00
lissyx
234a64c6ea
Merge pull request #2766 from lissyx/homebrew-prebuilt-caches
Generate one-time Homebrew tarball
2020-02-20 12:47:05 +01:00
Alexandre Lissy
7d1663b1c5 Generate one-time Homebrew tarball 2020-02-20 11:46:43 +01:00
lissyx
fc39433f9b
Merge pull request #2775 from lissyx/remove-irc
Remove IRC notifications
2020-02-20 10:47:33 +01:00
Alexandre Lissy
2c69273a49 Remove IRC notifications 2020-02-20 10:09:11 +01:00
Reuben Morais
536b821d24
Merge pull request #2772 from reuben/error-code-hex
Report error code as hexadecimal numbers for easier lookup
2020-02-19 15:19:21 +01:00
Reuben Morais
f47c7f8421 Report error code as hexadecimal numbers for easier lookup 2020-02-19 14:11:01 +01:00
Reuben Morais
0b82c751db
Merge pull request #2770 from reuben/lr_reduction_rebased
Reduce learning rate on plateau
2020-02-18 22:20:54 +01:00
Reuben Morais
1178215423 Warn if --audio_sample_rate does not match training sample
In PR #2688, we started specifying the upper frequency limit when computing MFCCs.
This value was computed as half of the --audio_sample_rate value. Although the MFCC
computation accepts a variable sample rate input, the TensorFlow op only takes
a constant upper frequency limit, so we can't pass a dynamic value computed from each
sample to the op.

This means we lost the ability to transparently train on data with multiple sample
rates. This commit adds a warning message in case a training sample does not match
the --audio_sample_rate flag.
2020-02-18 18:15:01 +01:00
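A minimal sketch of such a check (Python, assuming WAV input; the helper name is hypothetical and not the project's actual training code):

```python
# Sketch only: warn when a training sample's rate differs from the value of
# the --audio_sample_rate flag mentioned in the commit message above.
import logging
import wave


def warn_on_sample_rate_mismatch(wav_path: str, expected_rate: int) -> None:
    """Log a warning if the WAV file's sample rate differs from expected_rate."""
    with wave.open(wav_path, "rb") as wav_file:
        actual_rate = wav_file.getframerate()
    if actual_rate != expected_rate:
        logging.warning(
            "Sample %s has rate %d Hz but --audio_sample_rate is %d Hz; the "
            "MFCC upper frequency limit is a constant derived from the flag, "
            "so this sample will not be featurized as intended.",
            wav_path, actual_rate, expected_rate,
        )
```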
Reuben Morais
559042a218 Increase epoch count in train tests to guarantee outputs in 8kHz mode 2020-02-18 18:14:15 +01:00
Reuben Morais
78e8dfdf38 Disable early stopping and LR reduction on plateau by default 2020-02-18 16:18:37 +01:00
Reuben Morais
6e12b7caed Allow missing learning rate variable in older checkpoints 2020-02-18 16:18:31 +01:00
Daniel
17ddc5600e Reduce learning rate on plateau. 2020-02-18 16:17:51 +01:00
lissyx
47f702bf37
Merge pull request #2768 from lissyx/auto-decompress-artifacts
Automagically decompress GZipped artifacts
2020-02-18 15:29:46 +01:00
Alexandre Lissy
020619fa97 Automagically decompress GZipped artifacts
Fixes #2760
2020-02-18 13:59:30 +01:00
Reuben Morais
44ff4c54b9
Merge pull request #2767 from reuben/readable_repr_bindings
Add a better __repr__ for Metadata objects in Python bindings
2020-02-18 13:28:57 +01:00
Reuben Morais
ac26a785df Add a better __repr__ for Metadata objects in Python bindings 2020-02-18 12:19:10 +01:00
Reuben Morais
1c69d93b4e Update training docs to mention new CuDNN flags and checkpoint dir flags
X-DeepSpeech: NOBUILD
2020-02-17 17:40:44 +01:00
Reuben Morais
200a46711a
Merge pull request #2763 from reuben/transfer-learning-rebase
Transfer learning support
2020-02-17 17:29:03 +01:00
Reuben Morais
bd8b96c19d Remove unneeded Saver instances 2020-02-17 15:55:16 +01:00
Reuben Morais
0e2f34f8cf Synchronize TensorFlow logging with --log_level flag 2020-02-17 12:46:00 +01:00
Reuben Morais
c46d8396bc Respect --load when exporting 2020-02-17 12:45:37 +01:00
lissyx
9dd63d5c87
Merge pull request #2765 from lissyx/bump-v0.7.0-alpha.2
Bump VERSION to 0.7.0-alpha.2
2020-02-17 12:13:38 +01:00
Alexandre Lissy
685fb1cc9b Bump VERSION to 0.7.0-alpha.2 2020-02-17 12:12:57 +01:00
lissyx
8fc6b7b216
Merge pull request #2764 from lissyx/electronjsv8.0
Support ElectronJS v8.0
2020-02-17 12:11:44 +01:00
Reuben Morais
cedd72da9b Force UTF-8 IO encoding 2020-02-17 10:46:40 +01:00
Alexandre Lissy
82344b9fe2 Support ElectronJS v8.0
Fixes #2759
2020-02-17 09:59:39 +01:00
Reuben Morais
f32fd7a33f Add transfer learning test 2020-02-17 08:29:10 +01:00
JRMeyer
5bba9ea5d1 Transfer-learning support 2020-02-17 08:29:10 +01:00
Reuben Morais
f27457cbbe
Merge pull request #2762 from reuben/remove-generate-trie-docs
Remove references to generate_trie from docs (Fixes #2761)
2020-02-16 18:09:55 +01:00
Reuben Morais
27e2e44400 Remove references to generate_trie from docs
X-DeepSpeech: NOBUILD
2020-02-16 11:36:27 +01:00
lissyx
22b518f8fa
Merge pull request #2757 from lissyx/speed-python-builds
Ensure Python builds use all resources
2020-02-14 20:03:55 +01:00
Alexandre Lissy
0add08e30d Ensure Python builds use all resources 2020-02-14 18:55:03 +01:00
lissyx
551bded23e
Merge pull request #2756 from lissyx/no-remirror
Manually update the TC index when needed
2020-02-14 14:52:45 +01:00
Alexandre Lissy
e0277853b2 Manually update the TC index when needed 2020-02-14 12:55:47 +01:00
lissyx
5155f0afcb
Merge pull request #2755 from lissyx/ts-mirror
Use other storage for TrainingSpeech dataset
2020-02-13 20:27:05 +01:00
Alexandre Lissy
95b931df4f Use other storage for TrainingSpeech dataset
Fixes #2715

X-DeepSpeech: NOBUILD
2020-02-13 20:26:25 +01:00
Reuben Morais
40f1827a24
Merge pull request #2746 from reuben/example-linenos
Update example line numbers
2020-02-13 17:37:47 +01:00
Reuben Morais
e9cf022e21 Merge branch 'expose-version' (Fixes #2745) 2020-02-13 17:35:13 +01:00
Reuben Morais
7001a17418 Expose version in a consumable way 2020-02-13 17:34:58 +01:00
lissyx
aa6f84ac56
Merge pull request #2749 from lissyx/tc-scripts
Explode tc-tests-utils into several smaller chunks
2020-02-13 17:18:55 +01:00
PedroDKE
50c5e5b23a
Make the webrtcvad dependency optional (#2754) 2020-02-13 17:16:37 +01:00
Alexandre Lissy
d6f0a5026c Explode tc-tests-utils into several smaller chunks
Fixes #1840
2020-02-13 15:42:34 +01:00
lissyx
a334c333de
Merge pull request #2731 from lissyx/swig-build
Build SWIG locally
2020-02-12 16:17:19 +01:00
Alexandre Lissy
e5149ded12 Build SWIG locally 2020-02-12 15:12:30 +01:00
Reuben Morais
09224cea46 Update example line numbers 2020-02-12 15:02:52 +01:00
Reuben Morais
52775504e5 Point to master branch of examples 2020-02-12 13:18:37 +01:00
Reuben Morais
88a1048322 Merge branch 'embed-beam-width' (Fixes #2744) 2020-02-12 13:15:43 +01:00
Reuben Morais
c2f336b3e3 Remove hardcoded constants from evaluate_tflite.py 2020-02-12 13:08:34 +01:00
Reuben Morais
3637f88c06 Fix CI errors, address comments, update examples 2020-02-12 10:13:02 +01:00
Reuben Morais
c512383aec Fix consumers of DS_CreateModel 2020-02-12 10:13:02 +01:00
Reuben Morais
8e9b6ef7b3 Embed default beam width into exported graph and remove param from DS_CreateModel 2020-02-11 21:23:05 +01:00
Reuben Morais
5366f90375 Merge branch 'decoder-api-changes' (PR #2681) 2020-02-11 21:18:57 +01:00
Reuben Morais
6efc3ccf50 Update examples model asset 2020-02-11 19:44:36 +01:00
Reuben Morais
8dedda7759 Address review comments 2020-02-11 19:44:36 +01:00
Reuben Morais
1d3b3a31a1 Address review comments and update docs 2020-02-11 19:44:36 +01:00
Reuben Morais
efbed73d5c Improve error handling around Scorer loading 2020-02-11 19:44:36 +01:00
Reuben Morais
3b54f54524 Fix linter errors
X-DeepSpeech: NOBUILD
2020-02-11 19:44:36 +01:00
Reuben Morais
1e2eb96248 Update all API consumers 2020-02-11 19:44:36 +01:00
Reuben Morais
708b21a63e Add tool to extract vocabulary from the old LM binary format 2020-02-11 19:44:36 +01:00
Reuben Morais
a156d28504 Switch smoke test scorer to new format 2020-02-11 19:44:36 +01:00
Reuben Morais
b34723588d Switch to new scorer format 2020-02-11 19:44:29 +01:00
Reuben Morais
ab08f5ee5a Change decoder API 2020-02-11 19:44:26 +01:00
Reuben Morais
16d5632d6f Write default values for alpha and beta into trie header 2020-02-11 19:44:26 +01:00
Reuben Morais
b33d90b7bd Load combined format from Scorer 2020-02-11 19:44:26 +01:00
Reuben Morais
214b50f490 Add generate_package tool to create combined scorer package 2020-02-11 19:44:26 +01:00
Reuben Morais
be2229ef29 Refactor Scorer so model/trie package can be created by an external tool 2020-02-11 19:44:26 +01:00
Reuben Morais
7c0354483e Stop including vocabulary data in LM.binary. 2020-02-11 19:44:19 +01:00
Reuben Morais
d65422c8ab Update KenLM to b9f35777d112ce2fc10bd3986302517a16dc3883 2020-02-11 19:44:17 +01:00
lissyx
93e5ce498a
Merge pull request #2737 from lissyx/win-nodegyp-cache
Produce and use node-gyp cache
2020-02-11 18:57:43 +01:00
Reuben Morais
3b9a39cb67
Merge pull request #2740 from reuben/irc-to-matrix
Point people to Matrix room instead of IRC
2020-02-11 17:45:48 +01:00
Reuben Morais
5eac447de4 Point people to Matrix room instead of IRC
X-DeepSpeech: NOBUILD
2020-02-11 17:44:44 +01:00
Alexandre Lissy
ce5629d33a Produce and use node-gyp cache
Fixes #2718
2020-02-11 12:46:19 +01:00
Reuben Morais
ed335f42cb
Merge pull request #2739 from reuben/virtualenv-windows
Use venv instead of virtualenv package on Windows
2020-02-11 12:06:23 +01:00
Reuben Morais
0a138c717e Use venv instead of virtualenv package 2020-02-11 11:16:32 +01:00
Reuben Morais
49fe54e0e3
Merge pull request #2688 from mozilla/mfcc-upper-frequency
Specify upper frequency limit when computing Mfccs
2020-02-11 10:26:09 +01:00
Reuben Morais
550c572962
Merge pull request #2736 from reuben/fix-extraneous-or-tests
Remove extraneous OR operators in tc-tests-utils.sh
2020-02-10 17:38:58 +01:00
Reuben Morais
92be76657f Stop using 8kHz data with concurrent streams test as it does not resample 2020-02-10 16:22:10 +01:00
Reuben Morais
245fb24946 Fix expected outputs 2020-02-10 16:22:10 +01:00
Reuben Morais
a3ce59de58 Remove extraneous OR operators in tc-tests-utils.sh
Fixes:

+ '[' -o 16k = 8k ']'
/home/build-user/DeepSpeech/ds/taskcluster/tc-tests-utils.sh: line 262: [: too many arguments
2020-02-10 16:22:10 +01:00
lissyx
33efd9b7ff
Merge pull request #2724 from DanBmh/master
Print best and worst results in a WER report.
2020-02-07 11:27:16 +01:00
Daniel
726cc20586 Rename dataset param. 2020-02-06 14:47:59 +01:00
Daniel
8cc91fafb2 Moved summary printing to samples printing. 2020-02-06 14:44:41 +01:00
Daniel
de92142986 Named example sections. 2020-02-06 13:31:30 +01:00
Daniel
4186cbef88 Reverse ordered loss again. 2020-02-06 12:50:20 +01:00
Reuben Morais
f09aad48aa
Merge pull request #2726 from carlfm01/decoding-fix
Fix Intermediate decoding
2020-02-06 12:30:30 +01:00
Daniel
f5145526f0 Dont need flags. 2020-02-06 11:43:48 +01:00
Daniel
9ec88b7f28 Add whitespace again. 2020-02-06 11:41:40 +01:00
Daniel
63a07e6834 Added summary to evaluate_tflite.py and moved method to evaluate_tools.py. 2020-02-06 11:39:31 +01:00
Daniel
a0b5d3e7e0 Restore order of imports. 2020-02-06 11:07:11 +01:00
Daniel
320e815bb7 Remove semicolon. 2020-02-06 11:05:15 +01:00
Daniel
369e3c9fc3 Revert linebreak. 2020-02-06 11:01:37 +01:00
Daniel
272ed99d24 Add median examples. Fix sorting. 2020-02-06 10:55:48 +01:00
Carlos Fonseca M
17e011c18d Fix Intermediate decoding 2020-02-05 22:47:09 -06:00
Daniel
a2f05ccabe Print best and worst results in a WER report. 2020-02-05 17:52:10 +01:00
lissyx
6f5af8ec2c
Merge pull request #2719 from lissyx/doc-pydev
Ensure documentation mentions python3-dev
2020-02-04 20:43:46 +01:00
Alexandre Lissy
00d11e2fec Ensure documentation mentions python3-dev
Fixes #2712
2020-02-04 18:18:41 +01:00
lissyx
60cbe3b201
Merge pull request #2714 from lissyx/bump-v0.7.0-alpha.1
Bump VERSION to 0.7.0-alpha.1
2020-02-03 14:54:27 +01:00
Alexandre Lissy
dd982b2224 Bump VERSION to 0.7.0-alpha.1 2020-02-03 14:53:09 +01:00
lissyx
92d8bad7c1
Merge pull request #2711 from lissyx/ctcdecoder_version_check
Enforce CTC decoder version check
2020-02-03 13:40:25 +01:00
Alexandre Lissy
ff401732a3 Enforce CTC decoder version check
Fix #2710
2020-02-03 10:24:21 +01:00
lissyx
6af68efa00
Merge pull request #2708 from lissyx/bump-v0.7.0-alpha.0
Bump VERSION to 0.7.0-alpha.0
2020-01-31 12:03:16 +01:00
Alexandre Lissy
4e452dcd0b Bump VERSION to 0.7.0-alpha.0 2020-01-31 12:02:43 +01:00
lissyx
8c5a7b07c9
Merge pull request #2705 from lissyx/nodejs-tflite
Produce TFLite-specific NPM package
2020-01-31 11:48:35 +01:00
lissyx
5d0e4cc8ed
Merge pull request #2704 from lissyx/remove-benchmark-nc
Remove unused benchmark_nc
2020-01-31 11:47:30 +01:00
lissyx
3d18ab8058
Merge pull request #2706 from lissyx/win-tflite
Produce TFLite NuGet package
2020-01-30 16:03:32 +01:00
Alexandre Lissy
0f9869fb00 Produce TFLite NuGet package 2020-01-30 11:49:49 +01:00
Alexandre Lissy
6e521ff3a2 Produce TFLite-specific NPM package 2020-01-30 10:47:42 +01:00
Alexandre Lissy
dc25509950 Remove unused benchmark_nc 2020-01-30 08:47:59 +01:00
lissyx
0428c846f2
Merge pull request #2700 from lissyx/swig-doc
Improve SWIG reference
2020-01-29 11:16:03 +01:00
Alexandre Lissy
38db9a2441 Improve SWIG reference 2020-01-29 11:11:30 +01:00
lissyx
d74ab7dc1a
Merge pull request #2695 from lissyx/tf_unique_ptr
Use std::unique_ptr<> for TensorFlow session
2020-01-29 10:54:37 +01:00
Alexandre Lissy
656eea4622 Use std::unique_ptr<> for TensorFlow session 2020-01-29 09:42:05 +01:00
Reuben Morais
1d6a337ab4 Merge branch 'javascript-buffer-length' (Fixes #2693) 2020-01-27 21:22:25 +01:00
Reuben Morais
8d42c2bdd9 Adjust Buffer length to account for element size inside the JS binding 2020-01-27 18:20:57 +01:00
Reuben Morais
502436f8f3 Merge branch 'PedroDKE-args_export_model_name' (Fixes #2690) 2020-01-27 15:59:47 +01:00
Reuben Morais
50830d7022 Fix whitespace 2020-01-27 15:59:39 +01:00
PedroDKE
3e349497ed added an argument to choose the final export model name 2020-01-25 12:30:07 +01:00
Reuben Morais
9735d066c5 Bump graph version 2020-01-24 10:20:42 +01:00
Reuben Morais
709cd0d2f2 Specify upper frequency limit when computing Mfccs 2020-01-24 09:52:09 +01:00
lissyx
a9855f1e4e
Merge pull request #2685 from juandspy/patch-1
Update TRAINING.rst  (mmap-able model)
2020-01-22 16:58:55 +01:00
juandspy
29a92e098f
Update taskcluster.py
I copied ``maybe_download_tc_bin`` syntax in order to make the code easier to follow.
2020-01-22 16:22:15 +01:00
juandspy
616760eb52
Update TRAINING.rst 2020-01-22 16:15:26 +01:00
juandspy
b6008d0454
Update util/taskcluster.py 2020-01-22 14:50:42 +01:00
lissyx
d6ca542722
Merge pull request #2686 from lissyx/ensure-r1.15
Ensure properly link to TensorFlow r1.15
2020-01-22 12:31:27 +01:00
Alexandre Lissy
d9072c2a87 Ensure properly link to TensorFlow r1.15 2020-01-22 10:50:55 +01:00
juandspy
685c1f7c1b
Update TRAINING.rst (mmap-able model)
I had this "command not found" problem and solved it with https://discourse.mozilla.org/t/how-to-create-a-mmap-able-model-from-the-output-graph-pb-file/28984/13?, so I'm adding it to the documentation.
2020-01-22 08:58:33 +01:00
lissyx
f02a993820
Merge pull request #2682 from juandspy/patch-1
Update evaluate.py
2020-01-21 10:03:18 +01:00
juandspy
a0e528f52e
Update evaluate.py
The ```sys``` package was not imported, which gave a ```NameError: name 'sys' is not defined``` on line 96 when no test_file was provided. I added the import statement.
2020-01-21 09:57:33 +01:00
Reuben Morais
94882fb1c9
Mention use of CuDNN RNN in release checkpoints
X-DeepSpeech: NOBUILD
2020-01-18 18:07:55 +00:00
Reuben Morais
3cea430f7e
Merge pull request #2679 from mozilla/pr-2548-multistream
Re-land PR #2548 multistream support for .NET bindings
2020-01-18 15:04:21 +01:00
Carlos Fonseca M
923729d920 Multi-stream support .NET
Adds multi-stream support for the .NET client using the same acoustic model.
2020-01-18 12:35:29 +01:00
Carlos Fonseca M
fe2477b25c Remove unused members
FreeString and FreeMetadata are both for private use only.
2020-01-18 12:34:13 +01:00
lissyx
ff4906a45a
Merge pull request #2677 from lissyx/arm64-tflite
Arm64 tflite
2020-01-18 12:24:45 +01:00
Alexandre Lissy
c5136fd4ac Update ARM64 tests against TFLite assets
Fixes #2676
2020-01-18 10:08:04 +01:00
Tilman Kamp
65f2a09023
Merge pull request #2673 from tilmankamp/helpers
Introducing utils.helpers for miscellaneous helper functions
2020-01-15 16:49:48 +01:00
lissyx
c9385d4c11
Merge pull request #2674 from lissyx/fix-metadata-time
Fix word detection for time computation
2020-01-14 17:39:52 +01:00
Alexandre Lissy
66fe634cfe Fix word detection for time computation
Fixes #2623
2020-01-14 16:33:51 +01:00
Tilman Kamp
ad9f0c581b Introducing utils.helpers for miscellaneous helper functions 2020-01-14 16:04:18 +01:00
lissyx
7b3bc31171
Merge pull request #2667 from lissyx/remove-python-nodejs-oldies
Remove python nodejs oldies
2020-01-13 18:03:03 +01:00
lissyx
ea6377c2aa
Merge pull request #2666 from lissyx/fix-prodmodel
Remove v0.6.0 TFLite prod model workaround
2020-01-13 16:31:35 +01:00
lissyx
1752d6d03f
Merge pull request #2644 from lissyx/android-binding-typo
Fix DS_EnableDecoderWithLM typo in Android bindings
2020-01-13 16:31:27 +01:00
Alexandre Lissy
e4acdd7545 Remove Homebrew node workaround 2020-01-13 16:17:13 +01:00
Alexandre Lissy
b18675eae5 Switch to NodeJS v12 (LTS) for build 2020-01-13 16:17:13 +01:00
Alexandre Lissy
b216b943b9 Update Python, NodeJS and ElectronJS to latest stables 2020-01-13 16:17:13 +01:00
Alexandre Lissy
2eaa9e4a18 Remove ElectronJS < 5.0 (unsupported) 2020-01-13 13:53:17 +01:00
Alexandre Lissy
03892fb3fd Remove NodeJS < v10 2020-01-13 13:53:17 +01:00
Alexandre Lissy
d911ccb2b7 Fix DS_EnableDecoderWithLM typo in Android bindings
Fixes #2643
2020-01-13 12:43:19 +01:00
Alexandre Lissy
d76f1929b0 Remove v0.6.0 TFLite prod model workaround 2020-01-13 12:42:41 +01:00
Alexandre Lissy
197704b868 Remove Python 2.7
Fixes #2659
2020-01-13 12:42:34 +01:00
Reuben Morais
2df62da147
Merge pull request #2668 from mozilla/fix-evaluate-1.15
Upgrade pip, setuptools and wheel before installing requirements
2020-01-13 12:39:22 +01:00
Reuben Morais
700c9747d9 Pin versions of pip, setuptools, wheel 2020-01-13 11:24:10 +01:00
Reuben Morais
22aabae55a Upgrade pip, setuptools and wheel before installing requirements 2020-01-13 11:14:13 +01:00
Reuben Morais
7225336cea
Merge pull request #2645 from reuben/tf-1.15
Update to TensorFlow 1.15
2020-01-13 10:07:03 +01:00
Reuben Morais
faed282cfc
Merge pull request #2664 from mozilla/evaluate_tflite_fixes
evaluate_tflite.py fixes
2020-01-13 10:03:40 +01:00
Reuben Morais
17597f4526 Fix benchmark_model test 2020-01-12 15:26:08 +01:00
Reuben Morais
7e1f4a2d68 Fix linter errors
X-DeepSpeech: NOBUILD
2020-01-12 13:59:25 +01:00
Reuben Morais
d0e86fe10a Add a test for evaluate_tflite.py 2020-01-12 13:35:21 +01:00
Reuben Morais
cce1cec740 Upgrade pip, setuptools and wheel before installing requirements 2020-01-12 12:23:56 +01:00
Reuben Morais
42cb00dafd Switch TF dependency to r1.15 branch 2020-01-12 12:23:56 +01:00
Reuben Morais
fa66a04798 Update evaluate_tflite requirements 2020-01-12 11:02:15 +01:00
Reuben Morais
c28f61d370 Output full paths to results dump in evaluate_tflite.py 2020-01-12 11:01:54 +01:00
Reuben Morais
33e725bb25 Make evaluate_tflite.py work with v0.6.1 calculate_report 2020-01-12 11:01:33 +01:00
Reuben Morais
42fe30d572 Make evaluate_tflite.py work with relative paths in the CSV 2020-01-12 10:59:30 +01:00
Reuben Morais
3df20fee52
Merge pull request #2658 from mozilla/bump-v0.6.1
Bump version to v0.6.1
2020-01-10 18:08:45 +01:00
lissyx
c024464c12
Merge pull request #2640 from lissyx/train-8k
Add 8kHz training test coverage
2020-01-10 16:21:15 +01:00
Reuben Morais
fc63ce0c04 Bump version to v0.6.1 2020-01-10 15:02:47 +01:00
Alexandre Lissy
581515e094 Add 8kHz training test coverage
Fixes #2638
2020-01-10 14:51:45 +01:00
lissyx
2d47855e21
Merge pull request #2657 from lissyx/node-10
Temp workaround to move to node@10 on Homebrew
2020-01-10 13:38:49 +01:00
Alexandre Lissy
3bbed56f1e Temp workaround to move to node@10 on Homebrew 2020-01-10 13:10:04 +01:00
Reuben Morais
0facbb03d3
Merge pull request #2647 from mozilla/issue1709-feature-cache-help
Improve --feature_cache help text (Fixes #1709)
2020-01-10 11:31:31 +01:00
Reuben Morais
e5eedf5252
Improve --feature_cache help text
X-DeepSpeech: NOBUILD
2020-01-10 10:06:02 +00:00
lissyx
2cb48e72f7
Merge pull request #2639 from lissyx/allthedocs
Publish README/USING/TRAINING to readthedocs
2020-01-08 20:56:43 +01:00
Reuben Morais
f6cd28ba2d
Fix handling of InvalidArgumentError in training loop
X-DeepSpeech: NOBUILD
2020-01-08 17:10:47 +00:00
Alexandre Lissy
4c7d5fb0e1 Publish README/USING/TRAINING to readthedocs
Fixes #2581
2020-01-08 16:56:13 +01:00
Reuben Morais
1dfba839ea
Merge pull request #2637 from Jendker/patch-1
Remove information about 16 kHz only support

X-DeepSpeech: NOBUILD
2020-01-07 16:30:56 +00:00
Jędrzej Beniamin Orbik
7c5d37312e
Remove information about 16 kHz only support
Based on the discussion here: https://discourse.mozilla.org/t/inference-with-model-different-than-16khz/43217/17, models for data at sample rates other than 16 kHz can now be trained and used with the client.
2020-01-07 17:26:02 +01:00
lissyx
73278cf8d6
Merge pull request #2630 from lissyx/win-redist
Ensure vc_redist is documented
2020-01-03 17:14:45 +01:00
Alexandre Lissy
af8b64f3bc Ensure vc_redist is documented
Fixes #2606
2020-01-03 17:11:57 +01:00
Reuben Morais
8d1f52a677
Merge pull request #2629 from mychiux413/fix-axis-inversion-problem
fix axis inversion problem
2020-01-03 13:25:43 +01:00
Reuben Morais
e1d14eb9a9
Merge pull request #2560 from mychiux413/sparse_warp
[SpecAugment] Refactor sparse_image_warp for dynamic shape of spectrogram
2020-01-03 13:25:31 +01:00
Yi-Hua Chiu
6e2befacb2 swap freq <-> time 2020-01-03 11:14:28 +08:00
Yi-Hua Chiu
4133e620bd remove debugging leftover, add UNSUPPORTED note, just skip invertible error 2020-01-03 10:32:02 +08:00
lissyx
56eae497ce
Merge pull request #2615 from lissyx/tflite-prod-tests
Add TFLite prod tests
2020-01-02 17:49:45 +01:00
Alexandre Lissy
13d05c4a6f Run with fixed release model 2020-01-02 16:16:58 +01:00
Alexandre Lissy
89cd481d52 Add TFLite prod tests
Fixes #2614
2020-01-02 16:16:58 +01:00
lissyx
3cd79aecdf
Merge pull request #2628 from lissyx/bintray-readme
Re-enable Markdown small README for Bintray hosting
2020-01-02 16:11:54 +01:00
Alexandre Lissy
b90f80e7ed Re-enable Markdown small README for Bintray hosting 2020-01-02 16:11:19 +01:00
lissyx
48401e96fd
Merge pull request #2607 from lissyx/fix-silences
Don't OOV_SCORE on empty prefix
2020-01-02 15:07:47 +01:00
Tilman Kamp
242d70dc8c
Merge pull request #2625 from tilmankamp/swc_debug
Implements #2624 - SWC importer: CSV columns for article and speaker
2020-01-02 13:19:05 +01:00
Alexandre Lissy
f44d2ddeb9 Don't OOV_SCORE on empty prefix
Fixes #2579
2020-01-02 12:39:12 +01:00
Reuben Morais
6fa2babdfd
Merge pull request #2620 from mozilla/fix-js-readme
Fix JS package README
2020-01-02 10:22:33 +01:00
Reuben Morais
de5afc8871
Merge pull request #2626 from dabinat/mmap-readme-change
TRAINING.rst - Include exact command for getting mmap tool
2020-01-02 10:21:25 +01:00
Yi-Hua Chiu
c570cb670a sparse_warp can still occasionally raise an error even after millions of steps, so just recover from the invertible error during training; if the error is raised 3 times, training is aborted 2020-01-02 11:06:15 +08:00
dabinat
d1b8eaa402 TRAINING.rst - Include exact command for getting mmap tool 2019-12-31 14:20:25 -08:00
Tilman Kamp
259a60b7b1 Implements #2624 - SWC importer: CSV columns for article and speaker 2019-12-31 16:23:37 +01:00
Yi-Hua Chiu
fa41809a40 [MOD] change time_warping_para to 20 so the spectrogram does not sound too vague [FIX] make sure the invertible error is not raised after many epochs 2019-12-31 16:31:46 +08:00
Reuben Morais
3fd25badb5 Fix JS package README 2019-12-30 10:13:58 +01:00
Reuben Morais
85a61a3ab7
Merge pull request #2616 from KathyReid/patch-1
Update to specify which package libpthread is in
2019-12-24 08:03:36 +00:00
Kathy Reid
2a19c444d4
Update to specify which package libpthread is in
libpthread on Ubuntu (and presumably any other Debian derivative) is in the libpthread-stubs0-dev package; it took me a bit of digging to find it. Best, Kathy
2019-12-21 12:05:46 +11:00
lissyx
1d0035ce7f
Merge pull request #2613 from lissyx/fix-tflite-forget_bias
Set forget_bias=0 for static RNN implementation
2019-12-20 14:54:54 +01:00
Alexandre Lissy
5f003cfbd6 Set forget_bias=0 for static RNN implementation
Fixes #2612
2019-12-20 13:28:46 +01:00
Yi-Hua Chiu
3aedcc4222 [FIX] deprecate fixed-frequency-edge, which always has a chance of raising a tensor invertible error 2019-12-19 11:26:05 +08:00
Yi-Hua Chiu
72c09ebb38 [FIX] reversible error if dest_time == 0 2019-12-18 18:22:38 +08:00
Yi-Hua Chiu
533d15645f [FIX] constraint time_warping_para to protect short audio augment 2019-12-18 14:26:39 +08:00
Reuben Morais
551b3dd5f5
Merge pull request #2599 from mozilla/pypi-tflite-windows
Also upload Windows TFLite Python package to PyPI
2019-12-13 18:36:04 +01:00
Reuben Morais
83338541ae Also upload Windows TFLite Python package to PyPI
X-DeepSpeech: NOBUILD
2019-12-13 18:21:41 +01:00
Reuben Morais
dd645d5a06
Merge pull request #2593 from mozilla/make-new-alpha
Bump version to v0.6.1-alpha.0
2019-12-13 17:09:57 +01:00
Reuben Morais
d3d337c10e Bump version to v0.6.1-alpha.0 2019-12-13 15:57:48 +01:00
lissyx
a0d01a5186
Merge pull request #2597 from lissyx/android-emulator
Use Xvfb for emulator
2019-12-13 15:39:04 +01:00
Alexandre Lissy
399a4f76e1 Run emulator under xvfb 2019-12-13 14:12:17 +01:00
Reuben Morais
2675cfc6fc
Merge pull request #2598 from mozilla/fix-tests-ldc93s1
Use data/smoke_test in tests to avoid depending on LDC servers
2019-12-13 13:53:34 +01:00
Reuben Morais
fa8061022e Use data/smoke_test in tests to avoid depending on LDC servers 2019-12-13 12:48:05 +01:00
lissyx
7791f6245c
Merge pull request #2596 from thecodrr/master
Added third-party bindings for Vlang
2019-12-12 10:04:43 +01:00
Abdullah Atta
1c2a153841
added vspeech third-party bindings 2019-12-12 04:42:03 +05:00
Reuben Morais
3ea3a58f92
Merge pull request #2574 from mozilla/move-examples
Move examples to separate repository (Fixes #2564)
2019-12-10 19:14:35 +01:00
Reuben Morais
2471cf709d Download examples repository in Windows tasks 2019-12-10 16:25:01 +01:00
Reuben Morais
31991ff90c Remove individual example links from main README
X-DeepSpeech: NOBUILD
2019-12-10 16:25:01 +01:00
Reuben Morais
808b154ef9 Use submodule for building contrib examples into docs 2019-12-10 16:25:01 +01:00
Reuben Morais
bce5544595 Build WPF example from examples repo 2019-12-10 16:25:01 +01:00
Reuben Morais
a5deaa5f48 Check out and point to external examples repo in automation 2019-12-10 16:25:01 +01:00
Reuben Morais
5a0adc8846 Remove example code 2019-12-10 16:25:00 +01:00
Reuben Morais
911743a0b8
Merge pull request #2591 from mozilla/revert-2548-net-streams
Revert "Multi-stream support .NET"
2019-12-10 16:16:16 +01:00
Reuben Morais
03a822b670
Revert "Multi-stream support .NET" 2019-12-10 14:04:39 +01:00
Reuben Morais
35e04d383e
Merge pull request #2586 from mozilla/tflite-packages
Build and publish Python TFLite package as deepspeech-tflite
2019-12-09 19:38:36 +01:00
Reuben Morais
9e48845c51
Merge pull request #2585 from mozilla/package-readmes
Use simple README pointing to GitHub for JS/Python packages
2019-12-09 19:37:08 +01:00
Reuben Morais
a9fff3f866
Merge pull request #2548 from carlfm01/net-streams
Multi-stream support .NET
2019-12-06 19:48:10 +00:00
Reuben Morais
53fcfd5096 Build and publish Python TFLite package as deepspeech-tflite 2019-12-06 19:16:58 +01:00
Reuben Morais
c8a57b192c Use simple README pointing to GitHub for JS/Python packages 2019-12-06 17:33:00 +01:00
Reuben Morais
797ff4f8a9
Update example links in README
X-DeepSpeech: NOBUILD
2019-12-05 16:27:57 +00:00
Reuben Morais
8d87606f00
Update example links in README
X-DeepSpeech: NOBUILD
2019-12-05 16:22:30 +00:00
Reuben Morais
0427c1572a
Merge pull request #2582 from mozilla/intermediate-decode-docs
Remove outdated mention of DS_IntermediateDecode being expensive to call
2019-12-05 14:44:51 +00:00
Reuben Morais
13fdfee844 Remove outdated mention of DS_IntermediateDecode being expensive to call
X-DeepSpeech: NOBUILD
2019-12-05 15:44:06 +01:00
Reuben Morais
ae7b455c51
Merge pull request #2565 from JRMeyer/docs
Remove bad formatting in documentation

X-DeepSpeech: NOBUILD
2019-12-04 15:45:29 +00:00
Reuben Morais
b0beb1fefa
Merge pull request #2568 from mozilla/warn-fractional-window-size
Error early if audio sample rate and feature window/step length are invalid (Fixes #2323)
2019-12-04 15:44:22 +00:00
Reuben Morais
122b0f2d98
Merge pull request #2573 from mozilla/examples-v0.6.0
Pin examples to 0.6.0
2019-12-04 15:43:52 +00:00
Reuben Morais
1021ee10ed Pin examples to 0.6.0 2019-12-04 15:53:09 +01:00
lissyx
146ab7ae00
Merge pull request #2571 from pietrop/master
Update Readme.md
2019-12-04 15:32:32 +01:00
Pietro
62103fc505
Update Readme.md 2019-12-04 14:27:25 +00:00
Reuben Morais
240646b708 Error early if audio sample rate and feature window/step length are invalid 2019-12-04 11:06:16 +01:00
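For context, a minimal sketch of the kind of early validation this commit describes: the helper name and error wording below are hypothetical, but the idea is to reject feature window/step lengths that don't map to a whole number of samples at the given sample rate.

```python
def validate_feature_params(sample_rate_hz, window_ms, step_ms):
    # Fail fast if the feature window/step length does not map to a whole
    # number of samples at this sample rate (hypothetical helper; the real
    # check lives in the training configuration code).
    for name, length_ms in (("window", window_ms), ("step", step_ms)):
        samples = sample_rate_hz * length_ms / 1000
        if samples != int(samples):
            raise ValueError(
                "feature %s length of %s ms is %.2f samples at %d Hz; "
                "choose values that yield a whole number of samples"
                % (name, length_ms, samples, sample_rate_hz)
            )


validate_feature_params(16000, 32, 20)    # ok: 512 and 320 samples
# validate_feature_params(22050, 32, 20)  # raises: 705.6 samples per window
```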
Reuben Morais
b4ffbfa307
Merge pull request #2559 from mozilla/docs-master-warning
Warn about master docs applying to master only
2019-12-04 09:44:16 +00:00
Yi-Hua Chiu
5fbc7e8596 [FIX] use time_warping_para to constrain control width, add logic to disable cache [ADD] num_control_points 2019-12-04 10:25:36 +08:00
josh meyer
e7c70a67be remove bad ticks 2019-12-03 13:42:35 -08:00
Reuben Morais
6d43e213a7 Retry tag on TaskCluster 2019-12-03 18:12:34 +01:00
Reuben Morais
e6b5d227c4
Merge pull request #2562 from mozilla/bump-v0.6.0
Bump version to v0.6.0
2019-12-03 17:07:39 +00:00
Reuben Morais
c11cbd4b4b Bump version to v0.6.0 2019-12-03 16:46:28 +01:00
Reuben Morais
b2c4de3cb0 Update docs to refer to v0.6.0 release links 2019-12-03 16:46:28 +01:00
Reuben Morais
d79166514a Merge branch 'update-prod-models' (Fixes #2561) 2019-12-03 16:45:39 +01:00
Reuben Morais
a45706c40d Drop support for Python 3.4 as it is EOL and no longer builds on macOS 2019-12-03 16:44:21 +01:00
Reuben Morais
2aab67f520 Update prod model expected inference results 2019-12-03 15:28:12 +01:00
Reuben Morais
551570616b Reduce training task from 399 epochs to 220, enough to overfit LDC93S1 2019-12-03 13:07:15 +01:00
Reuben Morais
1f98373274 Update examples model asset for v0.6 release 2019-12-03 12:15:05 +01:00
Reuben Morais
9b3d2c1c37 Update prod model assets for v0.6 release 2019-12-03 12:10:07 +01:00
Yi-Hua Chiu
ec0ee65eb0 [FIX] prevent random_uniform generate from x to x 2019-12-03 18:40:52 +08:00
Yi-Hua Chiu
9b3e4aa9d3 don't touch original lint 2019-12-03 18:28:24 +08:00
Yi-Hua Chiu
450483c30b clean code 2019-12-03 18:28:24 +08:00
Yi-Hua Chiu
368f0d413a sparse image warp to dynamic tensor 2019-12-03 18:28:24 +08:00
Yi-Hua Chiu
271a58e464 prepare files for refactoring 2019-12-03 18:28:24 +08:00
Reuben Morais
34d4fb7b96 Warn about master docs applying to master only
X-DeepSpeech: NOBUILD
2019-12-02 17:55:01 +01:00
Kelly Davis
0ab348878c
Merge pull request #2558 from mozilla/readthedocs
Updated old docs in prep for 0.6.0
2019-12-02 15:02:55 +01:00
kdavis-mozilla
3d7dc179e9 Addressed review comments 2019-12-02 14:11:31 +01:00
kdavis-mozilla
f75b9cc926 Updating Geometry 2019-12-02 11:04:27 +01:00
kdavis-mozilla
7d96540d66 Updating Introduction 2019-12-02 08:13:38 +01:00
Carlos Fonseca M
d024c333bc Multi-stream support .NET
Adds multi-stream support for the .NET client using the same acoustic model.
2019-11-29 08:09:17 -06:00
Carlos Fonseca M
4c6a95db39 Remove unused members
FreeString and FreeMetadata are both for private use only.
2019-11-29 08:09:17 -06:00
Reuben Morais
b6bc46f3fb
Merge pull request #2554 from mozilla/disable-cache-to-memory
Disable caching features to memory
2019-11-29 12:17:21 +00:00
Reuben Morais
e3b1b5fd42 Disable caching features to memory 2019-11-29 10:17:15 +01:00
Reuben Morais
271e3639a7
Merge pull request #2555 from mozilla/ubuntu-16.04
Remove ubuntu-advantage-tools package to work around ESM repository 401 problem
2019-11-29 07:33:14 +00:00
Reuben Morais
a4b6431e2f Remove ubuntu-advantage-tools package to work around ESM repository availability 2019-11-28 22:25:54 +01:00
lissyx
caf039bcf8
Merge pull request #2553 from lissyx/fix-examples-link
Link to proper README for examples/
2019-11-27 13:47:25 +01:00
Alexandre Lissy
c1c038bbdf Link to proper README for examples/
Fixes #2552
2019-11-27 13:44:27 +01:00
Reuben Morais
d925e6b5fc
Merge pull request #2528 from mozilla/filter-lm
Filter LM by removing very rare words
2019-11-26 13:16:15 +00:00
lissyx
670d24ae03
Merge pull request #2546 from aayagar001/master
Fixed store_true in --json
2019-11-24 16:27:42 +01:00
aayagar001
2057305df7
Fixed store_true in --json 2019-11-24 20:31:24 +05:30
lissyx
abe9583b16
Merge pull request #2544 from lissyx/electron-7.1
Support ElectronJS v7.1.2
2019-11-24 15:45:32 +01:00
Alexandre Lissy
dde09757d4 Support ElectronJS v7.1.2
Fixes #2543
2019-11-24 14:23:37 +01:00
Tilman Kamp
b2338368f4
Merge pull request #2541 from tilmankamp/fixexeflag
Fix: Added executable flag to DeepSpeech.py again
2019-11-21 14:02:41 +01:00
Tilman Kamp
a160dc72c4 Fix: Added executable flag to DeepSpeech.py again 2019-11-21 13:08:53 +01:00
Tilman Kamp
f3d69147fe
Merge pull request #2538 from tilmankamp/transcribe
Tool for bulk transcription
2019-11-21 13:05:08 +01:00
Tilman Kamp
29528ed7b7 Separate process per file; less log noise 2019-11-20 17:29:13 +01:00
lissyx
a8081692a4
Merge pull request #2524 from aayagar001/master
Added support of --json timestamp in python client.py
2019-11-20 13:43:10 +01:00
Tilman Kamp
c24c510fd9 Tool for bulk transcription 2019-11-18 16:03:03 +01:00
Reuben Morais
381faaf6b6 Switch to --prune 0 0 1 model and move generation code to a script 2019-11-15 13:28:45 +01:00
lissyx
f7ee7e4995
Merge pull request #2535 from lissyx/ensure-tcworkdir-cleanup
Do not fail cleanup if some cache dir is missing on macOS
2019-11-14 17:06:38 +01:00
Alexandre Lissy
c04dd6798b Do not fail cleanup if some cache dir is missing on macOS 2019-11-14 15:52:36 +01:00
lissyx
8242bb4751
Merge pull request #2534 from lissyx/doc-line-refs
Doc line refs
2019-11-14 11:20:59 +01:00
Alexandre Lissy
3a068f3fac Update example line numbers
Fixes #2533

X-DeepSpeech: NOBUILD
2019-11-14 11:14:07 +01:00
Alexandre Lissy
0ad6c10e8c Update Python example line numbers 2019-11-14 11:13:42 +01:00
Alexandre Lissy
b1d1ef6450 Update JavaScript example line numbers 2019-11-14 10:55:06 +01:00
Alexandre Lissy
849ed712e1 Update Java example line numbers 2019-11-14 10:53:25 +01:00
Alexandre Lissy
20c4ced80b Update C example line numbers 2019-11-14 09:53:07 +01:00
lissyx
b3787ee188
Merge pull request #2532 from lissyx/bump-v0.6.0-alpha.15
Bump VERSION to 0.6.0-alpha.15
2019-11-14 09:19:19 +01:00
Alexandre Lissy
1bf246dfab Bump VERSION to 0.6.0-alpha.15 2019-11-14 09:18:30 +01:00
lissyx
dab5183a98
Merge pull request #2527 from lissyx/pyenv-virtualenv-cache
Don't fail on existing pyenv virtualenv symlink
2019-11-13 18:43:57 +01:00
Reuben Morais
ad2769f479 Filter LM by removing very rare words 2019-11-13 17:38:40 +01:00
Alexandre Lissy
024f8e4ddf Avoid force-reinstalling existing pyenv virtualenv 2019-11-13 17:02:55 +01:00
lissyx
f18643b1b9
Merge pull request #2525 from carlfm01/wpf-mvvm
Move WPF example to MVVM
2019-11-13 15:41:25 +01:00
lissyx
c5935644a1
Merge pull request #2522 from lissyx/package_as_zip
Support packaging as Zip file
2019-11-13 13:18:45 +01:00
Reuben Morais
1eaec6eb5e
Merge pull request #2521 from mozilla/utf8
UTF-8 target
2019-11-13 11:24:45 +00:00
Carlos Fonseca M
42e533016c Move WPF example to MVVM 2019-11-13 04:25:05 -06:00
Alexandre Lissy
fe6230020c Support packaging as Zip file 2019-11-13 11:16:11 +01:00
Reuben Morais
7cd8e20045 Re-export model used by examples with UTF-8 trie 2019-11-13 10:37:50 +01:00
aayagar001
c07e0a9208
Fixed PEP8 standard 2019-11-13 11:45:52 +05:30
aayagar001
7530543286
Added --json timestamp support in python client 2019-11-13 11:30:14 +05:30
aayagar001
c19ff7d3f1
Merge pull request #1 from aayagar001/json-timestamp
client.py for supporting --json argument for timestamp info
2019-11-13 11:17:25 +05:30
aayagar001
5b74d9b1ce
client.py for supporting --json argument for timestamp info
Added function to convert metadata info into timestamp based json.
2019-11-13 11:10:52 +05:30
Reuben Morais
b05b48a0df Force build 2019-11-12 22:38:55 +01:00
Reuben Morais
d2eb305b73 Address review comment and add missing check for presence of scorer 2019-11-12 21:56:42 +01:00
Reuben Morais
b70de5a9ba
Merge pull request #2509 from bprfh/patch-2
Add README.rst to example Folder
2019-11-12 13:02:12 +00:00
Reuben Morais
da8a3e546f
Adjust title formatting 2019-11-12 13:01:54 +00:00
Reuben Morais
0e6952c3a8 Avoid reconstructing strings twice on decode 2019-11-11 12:53:05 +01:00
Reuben Morais
c1b1a59423 Score prefix as soon as a grapheme is formed rather than 1 byte later 2019-11-11 12:52:48 +01:00
Reuben Morais
f4cdd988df UTF-8 target 2019-11-11 11:36:16 +01:00
lissyx
b1093b6cd3
Merge pull request #2516 from lissyx/fix-macos-cleanup
Fix cleanup on macOS
2019-11-08 18:03:13 +01:00
lissyx
6a9222ba2b
Merge pull request #2511 from bprfh/patch-1
Better instructions for example/vad_transcriber
2019-11-08 18:01:59 +01:00
Reuben Morais
acbf6583ea
Merge pull request #2514 from mozilla/remove-first-party-common
Remove first party code from common.a
2019-11-08 16:54:29 +00:00
bprfh
26f20f989d
Removed wrong bug note, spelling, wording. 2019-11-08 17:51:46 +01:00
Alexandre Lissy
3d3b8f1f13 Fix cleanup on macOS 2019-11-08 16:59:53 +01:00
Reuben Morais
ae705cf95f Remove first party code from common.a 2019-11-08 14:33:06 +01:00
lissyx
179d4d6f31
Merge pull request #2510 from bprfh/patch-3
Add links to examples to README.rst
2019-11-07 19:42:40 +01:00
bprfh
0f475f70a4
Moved Examples list
Moved the examples list between "Using a Pre-trained Model" and "Training your own Model" as requested
2019-11-07 19:40:00 +01:00
bprfh
65d81add96
Fixed the title markdown as requested
Changed wrong title markdown from "===============" to "=====", as requested.
2019-11-07 19:21:31 +01:00
bprfh
1f3f3cbf53
Add links to examples to README.rst
Show examples on the first page, so people can easily find them.
2019-11-07 16:34:21 +01:00
bprfh
0ddc83c707
Give a short description on what is in the folder
Give a short description and overview of the content of the folder, so we can link from the main README.rst
2019-11-07 16:31:54 +01:00
lissyx
d754e53f21
Merge pull request #2507 from lissyx/bump-v0.6.0-alpha.14
Bump VERSION to 0.6.0-alpha.14
2019-11-07 07:29:04 +01:00
Alexandre Lissy
c7816ee5dd Bump VERSION to 0.6.0-alpha.14 2019-11-07 07:27:18 +01:00
lissyx
23e72a6074
Merge pull request #2504 from lissyx/node-13
Add NodeJS v13
2019-11-07 06:34:19 +01:00
Alexandre Lissy
de71d6559d Split NodeJS testing per-arch/system
Fixes #2497
2019-11-06 14:35:00 +01:00
Alexandre Lissy
953bee9381 Add NodeJS v13
Fixes #2501
2019-11-06 14:35:00 +01:00
lissyx
43a187aecf
Merge pull request #2500 from lissyx/spurious-workspace
Improve spurious rebuild checks
2019-11-06 14:34:46 +01:00
Tilman Kamp
31b36c76b3
Merge pull request #2505 from tilmankamp/fixtudaswc
Fix: ms per char minimum for SWC and TUDA importers
2019-11-06 13:29:20 +01:00
Tilman Kamp
343d07173f Fix: ms per char minimum for SWC and TUDA importers 2019-11-06 13:14:55 +01:00
bprfh
5eb47053e7
Added clearer instructions for setup and errors
The tutorial is missing a few setup steps and doesn't mention some common errors.
I also tried to format it better.
2019-11-06 12:02:48 +01:00
Alexandre Lissy
ddb88e18e4 Improve spurious rebuild checks 2019-11-06 08:17:01 +01:00
lissyx
80493c83c3
Merge pull request #2498 from lissyx/bump-v0.6.0-alpha.13
Bump VERSION to 0.6.0-alpha.13
2019-11-05 16:22:49 +01:00
Alexandre Lissy
f5708a99d5 Bump VERSION to 0.6.0-alpha.13 2019-11-05 16:22:15 +01:00
lissyx
3d03353e64
Merge pull request #2495 from lissyx/electronjs7
Add ElectronJS v7.0
2019-11-05 15:15:52 +01:00
Reuben Morais
af5d18cf29
Merge pull request #2481 from mozilla/embed-alphabet
Embed alphabet in model file
2019-11-05 13:37:19 +00:00
Alexandre Lissy
b152802fd1 Add ElectronJS v7.0
Fixes #2494
2019-11-05 12:26:52 +01:00
Reuben Morais
10c652b420 Document serialization format 2019-11-05 09:15:18 +01:00
Reuben Morais
bd6a9d03b1 Use model from Python 3.6 training run 2019-11-05 09:10:10 +01:00
Reuben Morais
b8ebf9011b Address review comments 2019-11-05 09:10:10 +01:00
Reuben Morais
34314767f7 Fix prod model tests 2019-11-05 09:10:09 +01:00
Reuben Morais
3fdc7d422d Remove alphabet param usage 2019-11-05 09:02:42 +01:00
Reuben Morais
8c82081779 Embed alphabet directly in model 2019-11-05 09:02:21 +01:00
lissyx
493aaed151
Merge pull request #2493 from lissyx/bump-v0.6.0-alpha.12
Bump VERSION to 0.6.0-alpha.12
2019-11-05 09:00:15 +01:00
Alexandre Lissy
e1be01b1d6 Bump VERSION to 0.6.0-alpha.12 2019-11-05 08:57:31 +01:00
lissyx
1e400760b2
Merge pull request #2491 from lissyx/tc-community
Move to TC Community
2019-11-05 08:48:57 +01:00
Alexandre Lissy
2c898d92cb Move to TC Community 2019-11-05 07:42:39 +01:00
lissyx
1089b59e72
Merge pull request #2486 from djmitche/bug1574659
Bug 1574659 - migrate from taskcluster.net to community-tc
2019-11-04 20:14:27 +01:00
lissyx
8235dd2a4e
Merge pull request #2492 from lissyx/fix-js-doc
Update JS doc for changed API
2019-11-04 20:07:26 +01:00
Alexandre Lissy
8cbcf1da3c Update JS doc for changed API 2019-11-04 20:06:47 +01:00
lissyx
c1da680ae9
Merge pull request #2483 from lissyx/alphabet-consistency
Check unicode normalization
2019-11-04 11:43:52 +01:00
Alexandre Lissy
489dbad3a4 Check unicode normalization 2019-11-04 11:41:13 +01:00
lissyx
6929fee2d3
Merge pull request #2482 from lissyx/fix-eval-tflite
Update evaluate_tflite with wav_filename
2019-11-04 10:55:43 +01:00
Dustin J. Mitchell
c86eca944f remove unnecessary lowest-priority scope 2019-11-03 03:57:57 +00:00
Dustin J. Mitchell
bd12eacafa include /api/ in community-tc URLs 2019-11-03 03:54:18 +00:00
Miles Crabill
3501ce15c2
update taskcluster.yml for community-tc 2019-11-01 14:14:03 -07:00
Miles Crabill
27efcf470a
swap taskcluster.net references for community-tc.services.mozilla.com 2019-11-01 14:12:50 -07:00
Reuben Morais
c746361d4a
Merge pull request #2480 from tilmankamp/fixmailab
Relative paths in M-AILAB importer
2019-11-01 09:56:03 +00:00
Reuben Morais
19efb47a45
Merge pull request #2479 from tilmankamp/keepcolons
Removing exclamation-marks, colons and semi-colons from labels
2019-11-01 09:53:37 +00:00
Alexandre Lissy
f3240bffbc Update evaluate_tflite with wav_filename 2019-10-31 17:31:29 +01:00
Tilman Kamp
96a720c597 Relative paths in M-AILAB importer 2019-10-30 16:18:14 +01:00
Tilman Kamp
d38a3f13f7 Removing exclamation-marks, colons and semi-colons from labels 2019-10-30 16:14:58 +01:00
Tilman Kamp
0ba549b83f
Merge pull request #2478 from tilmankamp/fixmailab
Fix for empty skip list case; making linter happy
2019-10-30 13:08:45 +01:00
Tilman Kamp
df1df83720 Fix for empty skip list case; making linter happy 2019-10-30 12:59:16 +01:00
Reuben Morais
31ec7a71f2 Fix undefined variable when saving test samples to --test_output_file
X-DeepSpeech: NOBUILD
2019-10-30 10:30:00 +01:00
Reuben Morais
62d592fc1e
Merge pull request #2475 from mozilla/faster-startup
Improve training startup time
2019-10-29 12:41:42 +00:00
Reuben Morais
b39da7f8b7 Improve training startup time 2019-10-29 12:47:34 +01:00
Reuben Morais
02a1cc0cbf
Merge pull request #2473 from safa0/add-alias-for-flags-to-match-deepspeech-module-flags
Add aliases for lm, trie, alphabet in util/flags.py
2019-10-29 11:44:42 +00:00
Sam Safaei
4138a2571a added aliases for trie, alphabet, lm_binary in util/flags to match module flags
This matches $deepspeech --lm --trie --alphabet
with DeepSpeech.py and evaluate.py, which use other names for the same flags
2019-10-29 00:23:04 +01:00
Reuben Morais
26d10a5df3
Merge pull request #2471 from tilmankamp/exeflag
Added executable flag to some importers
2019-10-28 16:32:08 +00:00
Tilman Kamp
cf6245847f Added executable flag to some importers 2019-10-28 14:33:40 +01:00
Tilman Kamp
36b510221d
Merge pull request #2470 from tilmankamp/cvfixes
Making sample paths relative; additional sub-sets
2019-10-28 14:26:03 +01:00
Tilman Kamp
cef7c45f03 Making sample paths relative; additional sub-sets 2019-10-28 12:25:12 +01:00
lissyx
3a2eb28983
Merge pull request #2469 from vinhngx/amp-doc
adding amp doc
2019-10-28 08:05:50 +01:00
Vinh Nguyen
b105640d28 adding amp doc 2019-10-27 23:29:05 +00:00
lissyx
f3694efbca
Merge pull request #2467 from lissyx/bump-v0.6.0-alpha.11
Bump VERSION to 0.6.0-alpha.11
2019-10-26 12:03:43 +02:00
Alexandre Lissy
3889739a9b Bump VERSION to 0.6.0-alpha.11 2019-10-26 12:02:32 +02:00
lissyx
969146c908
Merge pull request #2462 from lissyx/py38
Build Python 3.8 wheels
2019-10-25 19:15:40 +02:00
Tilman Kamp
1af17c9e2d
Merge pull request #2464 from tilmankamp/tuda_importer
TUDA importer
2019-10-25 16:59:33 +02:00
Tilman Kamp
2cdfcff4c6 TUDA importer 2019-10-25 14:57:25 +02:00
Alexandre Lissy
f80dbcda75 Build Python 3.8 wheels
Fixes #2461
2019-10-25 12:28:27 +02:00
Reuben Morais
44a605c8b7
Merge pull request #2435 from mozilla/uplift-utf8-fixes
Uplift general fixes from UTF-8 work
2019-10-25 09:09:48 +00:00
Reuben Morais
1d86469b00
Merge pull request #2460 from mozilla/fix-evaluate-py
Avoid using references to the same object in sparse_tuple_to_text
2019-10-24 16:14:16 +00:00
Tilman Kamp
ede4dd6f93
Merge pull request #2459 from tilmankamp/import_swc
Spoken Wikipedia importer
2019-10-24 15:35:49 +02:00
Tilman Kamp
4be08fa6d3 Removed dutch ij digraph from normalization blacklist 2019-10-24 12:43:38 +02:00
Reuben Morais
68251d6944 Avoid using references to the same object in sparse_tuple_to_text 2019-10-24 10:17:18 +02:00
Tilman Kamp
122a007d33 Linter induced changes 2019-10-23 16:47:50 +02:00
Tilman Kamp
010f24578f Better alphabet access 2019-10-23 15:10:08 +02:00
lissyx
bde5c31fd4
Merge pull request #2458 from lissyx/doc-dotnet
Add .Net Framework API doc
2019-10-23 14:28:43 +02:00
Tilman Kamp
3424ab2b5d Spoken Wikipedia importer 2019-10-23 14:22:37 +02:00
Reuben Morais
d35107acdb
Merge pull request #2453 from mozilla/expose_cutoff
Expose cutoff_prob and cutoff_top_n as flags
2019-10-23 12:21:36 +00:00
Reuben Morais
ca401b0813
Merge pull request #2454 from mozilla/encapsulate-alphabet
Add Alphabet.encode analog to .decode and better encapsulate implementation details
2019-10-23 12:21:02 +00:00
Alexandre Lissy
60cec3722f Add .Net Framework API doc
Fixes #2457
2019-10-23 14:11:45 +02:00
Reuben Morais
f0688ec941 Add Alphabet.encode analog to .decode and better encapsulate implementation details 2019-10-23 11:22:56 +02:00
Reuben Morais
707281ce31
Merge pull request #2455 from Murcurio/patch-3
Update Dockerfile

X-DeepSpeech: NOBUILD
2019-10-23 09:18:09 +00:00
Reuben Morais
12baf5ffbc Expose cutoff_prob and cutoff_top_n as flags 2019-10-23 11:15:23 +02:00
Murcurio
4aa52738aa
Update Dockerfile
Added a line to initialise git-lfs before cloning the repo; without this command, lm.binary doesn't pull. If we want this to be version-specific, it might also be worth doing a git checkout <version>
2019-10-23 13:31:31 +11:00
Reuben Morais
6e287bd340
Merge pull request #2447 from Murcurio/patch-2
Use explicit encoding when opening files in import_cv2.py
2019-10-22 10:12:22 +00:00
lissyx
469ddd2cf7
Merge pull request #2448 from lissyx/fix-ctc-leak
Use std::shared_ptr instead of raw pointer for dictionary_
2019-10-18 11:59:08 +02:00
Alexandre Lissy
ef3f8004ce Use std::shared_ptr instead of raw pointer for dictionary_
Fixes #2403
2019-10-18 10:15:59 +02:00
Murcurio
9055d49b47
Update import_cv2.py
Requires UTF-8 encoding; without this it tries to read the file as ASCII and fails
2019-10-18 12:49:33 +11:00
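A minimal sketch of the explicit-encoding pattern this change applies, assuming a Common Voice-style TSV with a `sentence` column; the path and column name here are illustrative, not the actual importer code.

```python
import csv
import io

# Hypothetical TSV path; the point is the explicit encoding argument.
tsv_path = "clips/train.tsv"

# Without encoding="utf-8", Python may fall back to the locale default
# (often ASCII in minimal environments) and fail on non-ASCII transcripts.
with io.open(tsv_path, mode="r", encoding="utf-8") as fin:
    for row in csv.DictReader(fin, delimiter="\t"):
        print(row["sentence"])  # column name as used by Common Voice exports
```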
lissyx
336daa1641
Merge pull request #2445 from lissyx/bump-v0.6.0-alpha.10
Bump VERSION to 0.6.0-alpha.10
2019-10-17 08:54:21 +02:00
Alexandre Lissy
2cbc79fb8a Bump VERSION to 0.6.0-alpha.10 2019-10-17 08:53:38 +02:00
Reuben Morais
6eebdda7c5
Merge pull request #2437 from mozilla/update-brew
Update Homebrew to 2.1.14
2019-10-16 17:29:08 +00:00
lissyx
82fcaa5b23
Merge pull request #2439 from lissyx/tflite_metadata
Store graph version on TFLite
2019-10-16 18:35:21 +02:00
Alexandre Lissy
1939f74ec0 Store graph version on TFLite 2019-10-16 15:36:45 +02:00
Reuben Morais
4d5c9d1868 Point to updated TensorFlow artifacts 2019-10-16 11:30:51 +02:00
Reuben Morais
a902d7b343 Update Homebrew to 2.1.14 2019-10-16 11:29:39 +02:00
Reuben Morais
cd2e8c8947
Merge pull request #2438 from mozilla/debugging-utils
Add some debugging helpers behind a preprocessor flag
2019-10-15 14:54:25 +00:00
Reuben Morais
ef3bdb2540 Merge PR #2434 - Add flag for automatic mixed precision training 2019-10-15 13:45:19 +02:00
Reuben Morais
31922cb3dc Clarify docs and fix linter 2019-10-15 13:44:09 +02:00
Reuben Morais
31d81740ee Add debugging helpers to PathTrie 2019-10-15 12:49:38 +02:00
Reuben Morais
83a89dcae6 Add debugging code to trie_load.cc 2019-10-15 12:43:03 +02:00
lissyx
818ea6a40f
Merge pull request #2433 from lissyx/cli_lm
Expose beam width, lm_alpha and lm_beta in CLI args
2019-10-14 21:22:03 +02:00
Reuben Morais
abc687b3b9 Specify BOS for final scoring at decode if applicable 2019-10-14 21:03:14 +02:00
Reuben Morais
c8802a38e7 Don't add special tokens to vocabulary 2019-10-14 21:03:14 +02:00
Reuben Morais
3015237e8d Replace incomplete sorts with partial sorts 2019-10-14 21:03:14 +02:00
Reuben Morais
739841d731 Respect FLAGS.load in evaluate.py 2019-10-14 21:03:14 +02:00
Reuben Morais
e0d8ef75e8 Use try_loading and FLAGS.load in --one_shot_infer code 2019-10-14 21:03:14 +02:00
Reuben Morais
e75d1e4b61 Respect --test_output_files from DeepSpeech.py 2019-10-14 21:03:14 +02:00
Alexandre Lissy
ef5ae5c0b4 Expose beam width, lm_alpha and lm_beta in CLI args 2019-10-14 15:07:59 +02:00
Vinh Nguyen
e0bd1423b5 adding automatic mixed precision training support 2019-10-14 12:34:29 +00:00
Vinh Nguyen
909fa60601 adding automatic mixed precision training support 2019-10-14 12:34:10 +00:00
lissyx
65a53aeb4a
Merge pull request #2432 from lissyx/update-doc-link
Update doc link for model compatibility
2019-10-12 17:14:33 +02:00
Alexandre Lissy
c15d9b4b8b Update doc link for model compatibility
X-DeepSpeech: NOBUILD
2019-10-12 17:13:59 +02:00
lissyx
cb3d1a3f36
Merge pull request #2431 from lissyx/update-doc-api
Add missing doc for new API
2019-10-12 17:05:11 +02:00
Alexandre Lissy
b536f5761f Add missing doc for new API
X-DeepSpeech: NOBUILD
2019-10-12 17:04:12 +02:00
Reuben Morais
49b8509c7a Add macOS debug symbol generation target for libdeepspeech
X-DeepSpeech: NOBUILD
2019-10-11 18:13:12 +02:00
lissyx
b9bb019f30
Merge pull request #2429 from mozilla/ctcdecode-debug-build
Debug build for decoder package
2019-10-11 18:06:19 +02:00
Reuben Morais
aad1b2234b Debug build for ds_ctcdecoder package 2019-10-11 18:01:02 +02:00
Reuben Morais
d9b79a6689
Merge pull request #2428 from mone27/patch-1
Set default param of skiplist to '' in M-AILABS importer
2019-10-11 17:07:22 +02:00
mone27
a867c919bd
Set default param of skiplist to ''
fixing issue found in https://github.com/MozillaItalia/DeepSpeech-Italian-Model/pull/21
2019-10-11 16:42:32 +02:00
lissyx
3d20350502
Merge pull request #2427 from lissyx/bump-v0.6.0-alpha.9
Bump VERSION to 0.6.0-alpha.9
2019-10-11 12:57:07 +02:00
Alexandre Lissy
156a16e330 Bump VERSION to 0.6.0-alpha.9 2019-10-11 12:56:22 +02:00
lissyx
a1fcb6e350
Merge pull request #2426 from mozilla/bump-graph-version
Bump graph version due to forget_bias change
2019-10-11 12:54:09 +02:00
Reuben Morais
7dc2be1a9b Point examples to ldc93s1 model with new graph version 2019-10-11 11:40:23 +02:00
Reuben Morais
fcbebbe71a
Merge pull request #2425 from mozilla/expose-sample-rate-value
Expose sample rate value in API and use it in in-tree consumers
2019-10-11 09:39:31 +02:00
Reuben Morais
ce0292b92c Bump graph version due to forget_bias change 2019-10-10 23:59:54 +02:00
Reuben Morais
673d620a67 Expose and use model sample rate in Java 2019-10-10 23:03:45 +02:00
Reuben Morais
4dc18dd8ee Expose and use model sample rate in .NET 2019-10-10 23:03:45 +02:00
lissyx
f94ed4c744
Merge pull request #2424 from lissyx/readme-links
Fix broken references to README.md
2019-10-10 22:05:31 +02:00
Reuben Morais
5cb15ca6ed Use model sample rate in examples 2019-10-10 22:04:33 +02:00
Alexandre Lissy
32e9b8cd3e Fix broken references to README.md
Fixes #2423

X-DeepSpeech: NOBUILD
2019-10-10 22:04:31 +02:00
Reuben Morais
0be2787e4e Expose and use model sample rate in JavaScript 2019-10-10 21:57:26 +02:00
Reuben Morais
afea2b4231 Expose and use model sample rate in Python 2019-10-10 21:50:15 +02:00
Reuben Morais
c1ed6d711d Use model sample rate in client.cc 2019-10-10 21:46:01 +02:00
Reuben Morais
0241f725cd Expose model sample rate in API 2019-10-10 21:45:33 +02:00
Reuben Morais
315a67bf69
Merge pull request #2420 from mozilla/remove-sr-param
Remove unused sample rate param from API
2019-10-10 21:05:29 +02:00
Reuben Morais
2b68c56025 Sync all the docs with sample rate changes
X-DeepSpeech: NOBUILD
2019-10-10 17:15:58 +02:00
Reuben Morais
9200b720c3 Remove sample rate parameter usage from concurrent streams test 2019-10-10 15:44:11 +02:00
Reuben Morais
998daa5bca Remove sample rate parameter usage from evaluate_tflite.py 2019-10-10 15:44:11 +02:00
Reuben Morais
baaa5842b2 Remove sample rate parameter usage from examples 2019-10-10 15:44:11 +02:00
Reuben Morais
11ad23cc1f Remove sample rate parameter usage from .NET binding 2019-10-10 15:44:11 +02:00
Reuben Morais
1007d93da2 Remove sample rate parameter usage from Java binding 2019-10-10 15:44:11 +02:00
Reuben Morais
385279bc20 Remove sample rate parameter usage from JavaScript binding 2019-10-10 15:44:11 +02:00
Reuben Morais
97bab38a7e Remove sample rate parameter usage from Python binding 2019-10-10 15:44:11 +02:00
Reuben Morais
abb11f040d Remove sample rate parameter usage from client.cc 2019-10-10 15:44:11 +02:00
Reuben Morais
2f4116695f Remove unused sample rate param from API 2019-10-10 14:34:00 +02:00
lissyx
42726b3612
Merge pull request #2409 from lissyx/electrons-6
Add ElectronJS v6.0
2019-10-10 14:33:34 +02:00
Alexandre Lissy
eab8bf5dec Add ElectronJS v6.0
Fixes #2408
2019-10-10 13:48:37 +02:00
lissyx
3026d8706b
Merge pull request #2417 from lissyx/tflite-post_training_quantize
Use TFLite optimizations flag
2019-10-09 14:55:32 +02:00
lissyx
747889d16c
Merge pull request #2419 from lissyx/tc-from-tag
Move default branch to current VERSION content instead of master
2019-10-09 14:54:50 +02:00
Alexandre Lissy
5e7679593b Move default branch to current VERSION content instead of master
Fixes #2418

X-DeepSpeech: NOBUILD
2019-10-09 14:29:14 +02:00
Alexandre Lissy
4f1f67f55d Use TFLite optimizations flag
Fixes #2415
2019-10-09 12:21:35 +02:00
Reuben Morais
f893dc8c23
Merge pull request #2413 from mozilla/issue2410
Set sample_rate attribute of ds_audio_buffer in NO_SOX client (Fixes #2410)
2019-10-09 10:15:18 +02:00
Reuben Morais
ce785534fe Set sample_rate attribute of ds_audio_buffer in NO_SOX client 2019-10-09 09:38:38 +02:00
lissyx
031479d88b
Merge pull request #2406 from lissyx/disable-cache-dataaug
Disable cache when data augmentation is set
2019-10-08 18:10:32 +02:00
Alexandre Lissy
c35068f880 Disable cache when data augmentation is set
Fixes #2396
2019-10-08 17:29:59 +02:00
Reuben Morais
8c1fd5b31e
Merge pull request #2407 from lissyx/fix-doc-links
Fix bogus cross-file links
2019-10-08 15:46:41 +02:00
Reuben Morais
fe4451f22b
Merge pull request #2405 from mozilla/fix-cpu-graph
Set forget_bias=0 in CPU graph for compatibility with CudnnRNN
2019-10-08 12:10:51 +02:00
Alexandre Lissy
3dae00b4ab Fix bogus cross-file links 2019-10-08 06:46:52 +02:00
Reuben Morais
5a287a65e5 Set forget_bias=0 in CPU graph for compatibility with CudnnRNN 2019-10-07 21:06:29 +02:00
Reuben Morais
fb611efd00
Merge pull request #2400 from carlfm01/remove-intptr
Replace structs with IntPtr .NET
2019-10-07 08:44:32 +02:00
Carlos Fonseca
0f826f6324 Add thread-safe close 2019-10-05 04:00:00 +00:00
Carlos Fonseca
acabb26378 Move structs to IntPtr 2019-10-05 03:58:40 +00:00
lissyx
30e0da9029
Merge pull request #2395 from lissyx/md-to-rst
Move from Markdown to reStructuredText
2019-10-04 14:11:40 +02:00
Alexandre Lissy
65c942efbb Update cardboardlint configuration 2019-10-04 13:56:41 +02:00
Alexandre Lissy
d1936c60b3 Refer to examples from doc
Fixes #2338
2019-10-04 12:07:32 +02:00
Alexandre Lissy
9ce8c24165 Move from Markdown to reStructuredText 2019-10-04 12:07:32 +02:00
lissyx
9ac8cebb3b
Merge pull request #2394 from lissyx/importers-slr-mailabs
Importers slr mailabs
2019-10-02 15:24:40 +02:00
Alexandre Lissy
0ac4df6f82 Add M-AILABS importer 2019-10-02 13:40:19 +02:00
Alexandre Lissy
e22f9787be Add SLR57 importer: African Accented French 2019-10-02 13:40:19 +02:00
lissyx
b888058e4e
Merge pull request #2392 from lissyx/fix-macos-cleanup
Improve macOS tc-workdir cleanup
2019-09-30 17:43:16 +02:00
Alexandre Lissy
0969b4f9b9 Improve macOS tc-workdir cleanup 2019-09-30 15:24:20 +02:00
Reuben Morais
d0a578221d
Merge pull request #2391 from mozilla/optional-lm-test
Make language model scoring optional in Python inference code
2019-09-30 13:35:26 +02:00
Reuben Morais
4302a5f767 Make language model scoring optional in Python inference code 2019-09-30 11:43:00 +02:00
Reuben Morais
14c0db7294 Merge branch 'remove-wrong-docs' 2019-09-30 10:56:54 +02:00
Reuben Morais
67e3eefb95 Remove incorrect docs on Scorer
X-DeepSpeech: NOBUILD
2019-09-30 10:54:59 +02:00
lissyx
f0e954183f
Merge pull request #2387 from lissyx/bump-v0.6.0-alpha.8
Bump VERSION to 0.6.0-alpha.8
2019-09-27 11:11:02 +02:00
Alexandre Lissy
8595f2a7bb Bump VERSION to 0.6.0-alpha.8 2019-09-27 11:10:10 +02:00
Reuben Morais
ba56407376
Merge pull request #2383 from mozilla/scorer-cleanup
Don't explicitly score the BOS token, and avoid copies when scoring sentences
2019-09-27 11:06:24 +02:00
Reuben Morais
a323973521 Address review comments 2019-09-27 11:02:22 +02:00
lissyx
f51f9d9704
Merge pull request #2386 from artie-inc/typo
Fixing typo s/StreamingContext/StreamingState/
2019-09-27 10:46:56 +02:00
lissyx
9f988de6ba
Merge pull request #2385 from lissyx/lm-lazy
Load KenLM with LAZY
2019-09-26 23:13:48 +02:00
Alexandre Lissy
86b44a7cb7 Load KenLM with LAZY
Fixes #2384
2019-09-26 21:08:27 +02:00
JRMeyer
fed5039cc7 Fixing typo s/StreamingContext/StreamingState/
X-DeepSpeech: NOBUILD
2019-09-26 12:05:37 -07:00
Reuben Morais
6dba6d4a95 Don't explicitly score the BOS token, and avoid copies when scoring sentences 2019-09-26 14:08:32 +02:00
lissyx
513c8e9ab7
Merge pull request #2381 from lissyx/rtd-badge
Put back ReadTheDocs badge
2019-09-24 18:28:00 +02:00
Alexandre Lissy
3a8d395729 Put back ReadTheDocs badge
X-DeepSpeech: NOBUILD
2019-09-24 18:27:23 +02:00
lissyx
5196fa6e9b
Merge pull request #2362 from lissyx/all-the-docs
All the docs
2019-09-24 18:23:07 +02:00
Alexandre Lissy
2b7ab99478 Fix pylint 2019-09-24 18:22:45 +02:00
Alexandre Lissy
bf7cc1df54 Sphinx doc 2019-09-24 18:22:45 +02:00
lissyx
f67818e7b0
Merge pull request #2379 from lissyx/bump-v0.6.0-alpha.7
Bump VERSION to 0.6.0-alpha.7
2019-09-24 13:38:52 +02:00
Alexandre Lissy
693648657f Bump VERSION to 0.6.0-alpha.7 2019-09-24 11:06:29 +02:00
Alexandre Lissy
33281c4aac Add TaskCluster documentation generation 2019-09-24 10:55:26 +02:00
Alexandre Lissy
6c4fa52e42 Re-enable readthedocs.io 2019-09-24 10:55:26 +02:00
Alexandre Lissy
458692692e Fix header preprocessor alignment 2019-09-24 10:55:26 +02:00
lissyx
ea1f2b1995
Merge pull request #2373 from lissyx/tflite_arm
Switch to TFLite for RPi3/4
2019-09-24 08:11:52 +02:00
Alexandre Lissy
4103247a21 Use TFLite runtime on RPi3/RPi4 2019-09-23 14:44:27 +02:00
Chirag Ahuja
58fdd55eea Augmentation Documentation (#2355)
Augmentation Documentation
2019-09-23 12:43:17 +02:00
Reuben Morais
7995e4230b
Merge pull request #2358 from mozilla/rename-probability-confidence
Rename metadata probability field to confidence
2019-09-23 12:41:51 +02:00
Reuben Morais
005b5a8c3b
Merge pull request #2366 from mozilla/issue2365
Allow specifying --branch when getting decoder URL
2019-09-23 11:59:54 +02:00
Reuben Morais
b2e3a43767
Merge pull request #2375 from rhamnett/master
Create import_vctk.py
2019-09-23 11:59:22 +02:00
Richard Hamnett
792d8e0a27
Create import_vctk.py 2019-09-21 20:46:55 +01:00
lissyx
ccf1b2e73e
Merge pull request #2372 from lissyx/bump-v0.6.0-alpha.6
Bump VERSION to 0.6.0-alpha.6
2019-09-19 17:20:00 +02:00
Alexandre Lissy
579925483b Bump VERSION to 0.6.0-alpha.6 2019-09-19 16:30:03 +02:00
lissyx
f98bfefc77
Merge pull request #2354 from lissyx/run-examples-taskcluster
Run examples on TaskCluster
2019-09-18 20:56:22 +02:00
Alexandre Lissy
5465747e37 Fix linter errors 2019-09-18 15:54:19 +02:00
Reuben Morais
d0e11c73cd Address review comments 2019-09-18 15:09:11 +02:00
Alexandre Lissy
b5a3e328da Update examples to run latest DeepSpeech
Fixes #2351
2019-09-17 20:19:09 +02:00
Alexandre Lissy
5ef0117df0 Run examples on TaskCluster
Fixes #2353
2019-09-17 20:19:09 +02:00
Reuben Morais
82a5b37073 Allow specifying --branch when getting decoder URL 2019-09-15 15:11:15 +02:00
Reuben Morais
2bf8161ca4 Merge branch 'pr-2361' (Fixes #2361) 2019-09-13 12:16:49 +02:00
Mahmoud Hashem
bfcc7e86e7 Expose flag to allow incremental GPU memory allocation 2019-09-13 12:13:31 +02:00
Reuben Morais
150fb67a02
Validate WAV header duration against file size
X-DeepSpeech: NOBUILD
2019-09-12 10:16:54 +00:00
Reuben Morais
0ac498bc50 Rename metadata probability field to confidence 2019-09-11 11:09:44 +02:00
Reuben Morais
fcb9bf6d9f
Also remove samples with noise
X-DeepSpeech: NOBUILD
2019-09-11 09:02:21 +00:00
Reuben Morais
90c2acd810
Merge pull request #2357 from mozilla/magicdata-importer
Add MAGICDATA importer
2019-09-11 10:29:44 +02:00
Reuben Morais
732d0b221d Add MAGICDATA importer 2019-09-11 10:06:44 +02:00
Reuben Morais
889f069b1c
Merge pull request #2352 from mozilla/data-augmentation-pr
Data augmentation PR
2019-09-09 21:29:02 +02:00
Reuben Morais
6b7ebf47f2
Merge pull request #2350 from mozilla/breaking-api-cleanup
[BREAKING] API cleanup
2019-09-09 21:28:43 +02:00
Reuben Morais
7abbe077d8 Add a #warning to deepspeech_compat.h
X-DeepSpeech: NOBUILD
2019-09-09 16:10:19 +02:00
Reuben Morais
b95ebea9ba Fix linter error 2019-09-09 13:47:26 +02:00
Reuben Morais
b6af8c5dc7 Remove some duplicated code 2019-09-09 12:20:16 +02:00
Reuben Morais
d051d4fd0e Remove sparse image warp, fix boolean flags type, rebase to master 2019-09-09 12:11:28 +02:00
Bernardo
0e4eed7be3 removing trailing space 2019-09-09 12:07:51 +02:00
Bernardo
b89fb04b97 space after comma 2019-09-09 12:07:51 +02:00
Bernardo Henz
49c6a9c973 adding 'train_phase' to create_dataset. Now we can augment only the training-set. 2019-09-09 12:07:51 +02:00
Bernardo Henz
0cc5ff230f -spectrogram augmentations 2019-09-09 12:07:51 +02:00
Bernardo Henz
5d5ef15ab7 -data-aug via additive and multiplicative noise in feature-space 2019-09-09 12:07:51 +02:00
Reuben Morais
9c92f909b3
Merge pull request #2346 from mozilla/save-flags
Save flag values next to checkpoints (Fixes #2345)
2019-09-09 12:07:13 +02:00
Reuben Morais
f8f6b33cce Update .NET examples 2019-09-09 12:03:14 +02:00
Reuben Morais
f4e57902ba Update Java bindings 2019-09-09 12:03:14 +02:00
Reuben Morais
a8c53d2154 Update .NET bindings and client 2019-09-09 12:03:14 +02:00
Reuben Morais
bc6741cd41 Update JS bindings and client 2019-09-09 12:03:14 +02:00
Reuben Morais
249fdadc32 Update Python bindings and client 2019-09-09 12:03:14 +02:00
Reuben Morais
a815426918 Update client.cc 2019-09-09 11:38:28 +02:00
Reuben Morais
61b9b0e84d Add convenience header for backwards compatibility 2019-09-09 11:36:17 +02:00
Reuben Morais
c402b971d6 Remove unused params and make function names more consistent 2019-09-09 11:35:44 +02:00
Reuben Morais
896ac9d6c7
Merge pull request #2343 from mozilla/readme-update
Update README
2019-09-06 17:03:42 +02:00
Reuben Morais
36403cb64b Add model download and extraction to initial example
X-DeepSpeech: NOBUILD
2019-09-06 14:02:31 +02:00
Reuben Morais
a9851c949a Address review comments 2019-09-06 14:01:58 +02:00
Reuben Morais
371e73eb69 Create checkpoint dir before writing flags file in it 2019-09-06 13:55:55 +02:00
Reuben Morais
ba2d29b36f Save flag values next to checkpoints 2019-09-06 11:35:10 +02:00
Reuben Morais
4a5d6dcf00 Address review comments
X-DeepSpeech: NOBUILD
2019-09-04 17:17:07 +02:00
Reuben Morais
cdd4530e66 Update README 2019-09-04 16:52:56 +02:00
Reuben Morais
935ede3f28
Merge pull request #2329 from mozilla/tf_upgrade_v2
Run tf_upgrade_v2 to ease eventual transition to TF 2.0
2019-08-28 22:00:23 +02:00
Reuben Morais
85d646350f Update name of audio ops package in TF 1.14/TF 2.0 2019-08-28 18:16:13 +02:00
Reuben Morais
670e06365e Run tf_upgrade_v2 on our code 2019-08-28 17:53:24 +02:00
Reuben Morais
06dee673c7
Merge pull request #2327 from mozilla/abseil-flags
Switch from deprecated tfv1.app to absl-py
2019-08-28 17:22:20 +02:00
Reuben Morais
24bcdeb3d6 Switch from deprecated tfv1.app to absl-py 2019-08-28 10:55:33 +02:00
Reuben Morais
289e346a66
Merge pull request #2324 from bjornbytes/c-compat
Make deepspeech.h compilable as C
2019-08-28 10:38:15 +02:00
bjorn
f7fc74c078 rm trailing whitespace; 2019-08-27 15:38:39 -07:00
bjorn
73b2bbe8da Use struct typedefs for C compatibility; 2019-08-27 15:34:03 -07:00
bjorn
5d24f19115 Only use extern C when compiling as C++; 2019-08-27 15:25:30 -07:00
Reuben Morais
7e96961e35
Merge pull request #2322 from rcgale/master
Fixed issue where multiple csvs could not load
2019-08-26 22:46:50 +02:00
Robert Gale
05448441d3 Fixed issue where multiple csvs could not load
With the new `create_dataset` approach introduced by PR #2283 (read: mine, sorry!), duplicate
indices in the df would cause a fatal error where the columns could not be referenced by
name. Adding `ignore_index=True` during append allows pandas to assign new indices to
rows, and fixes the issue.
2019-08-26 13:37:05 -07:00
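A small illustration of the pandas behaviour described above, shown with `pd.concat` (the modern equivalent of the `append` call the commit refers to); the column names are placeholders.

```python
import pandas as pd

a = pd.DataFrame({"wav_filename": ["a.wav"], "transcript": ["hello"]})
b = pd.DataFrame({"wav_filename": ["b.wav"], "transcript": ["world"]})

# Plain concatenation keeps each source frame's indices, so both rows end up
# labelled 0 and label-based lookups can return more rows than expected.
combined = pd.concat([a, b])
print(combined.index.tolist())        # [0, 0]

# ignore_index=True renumbers the rows, giving every row a unique index.
combined = pd.concat([a, b], ignore_index=True)
print(combined.index.tolist())        # [0, 1]
```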
Reuben Morais
4c14c6b78b
Merge pull request #2303 from mozilla/simplify-decoder
Simplify decoder impl by making it object oriented, avoid pointers where possible
2019-08-26 17:02:39 +02:00
Reuben Morais
89f63dcd69 Make Scorer init fallible and check it in callers 2019-08-26 13:59:15 +02:00
Reuben Morais
47b9b71776 Automatically format BUILD file with buildifier 2019-08-26 12:01:26 +02:00
Reuben Morais
c730102867 Avoid rebuilding decoder sources for every binary target 2019-08-26 12:01:26 +02:00
Reuben Morais
1b18494e63 Address review comments 2019-08-23 12:19:32 +02:00
Reuben Morais
b2ef9cca83 Make Alphabet init fallible and check it in model creation 2019-08-23 12:19:32 +02:00
Reuben Morais
4dabd248bc Make Alphabet copyable and default-constructable and avoid pointers 2019-08-23 12:19:32 +02:00
Reuben Morais
4d882a8aec Simplify decoder impl by making it object oriented, avoid pointers where possible 2019-08-23 12:19:32 +02:00
Reuben Morais
f442b69aeb
Merge pull request #2320 from mozilla/swig-4.0.1
Use globbing instead of hardcoding SWIG version
2019-08-23 12:19:03 +02:00
Reuben Morais
43b60a621c Use globbing instead of hardcoding SWIG version 2019-08-23 10:45:13 +02:00
lissyx
5fa6d23782
Merge pull request #2316 from lissyx/bump-v0.6.0-alpha.5
Bump VERSION to 0.6.0-alpha.5
2019-08-22 09:53:46 +02:00
Alexandre Lissy
97c373a8a9 Bump VERSION to 0.6.0-alpha.5 2019-08-22 09:53:17 +02:00
lissyx
18281b9e89
Merge pull request #2311 from lissyx/armbian-buster
Move to ARMbian Buster
2019-08-22 09:31:51 +02:00
Alexandre Lissy
dfe8be30b4 Move to ARMbian Buster
Fixes #2310
2019-08-21 22:58:10 +02:00
lissyx
d5544b4a15
Merge pull request #2313 from lissyx/static-libsox
Statically link libsox
2019-08-21 22:53:18 +02:00
lissyx
1ec34190a8
Merge pull request #2309 from carlfm01/wpf-build
Add WPF example build
2019-08-21 22:24:38 +02:00
Alexandre Lissy
8534c0f93a Statically link libsox 2019-08-21 21:35:08 +02:00
Carlos Fonseca
4812276d88 Add WPF example build 2019-08-21 19:04:20 +00:00
lissyx
50e2a99316
Merge pull request #2308 from lissyx/remove-prealloc
Remove unused prealloc frames
2019-08-21 14:53:47 +02:00
lissyx
0e47048f9c
Merge pull request #2307 from lissyx/rpi-buster
Build for Raspbian Buster
2019-08-20 21:49:51 +02:00
Alexandre Lissy
81b3b159c4 Remove unused prealloc frames
Fixes #2298
2019-08-20 18:38:46 +02:00
Alexandre Lissy
e06fce51ac Move Raspbian support to Buster
Fixes #2272
2019-08-20 16:37:26 +02:00
Reuben Morais
b25de5ac05
Merge pull request #2302 from mozilla/issue2294
Only update time step of leaf prefixes (Fixes #2294)
2019-08-20 12:04:35 +02:00
Reuben Morais
e3bf5d3cc6 Only update time step of leaf prefixes
The intention of this check is to improve the accuracy of the timings by recording the time step where the character saw its highest probability rather than the first time step where it was seen. The problem happens when updating the time step of a prefix that already has children. In that case, if any of the children have a time step that is earlier than `new_timestep`, it'll break the linearity of the timings. My fix is to simply check that the prefix we're updating is a leaf.

For example, say during decoding we have the following beams (format is `(char | time)`, tree node id below, nodes with same id are the same object):

```
1. (-1 | 0 ) -> ('s' | 10) -> ('h' | 13) -> ('e' | 14)
       A            B             C             D

2. (-1 | 0 ) -> ('s' | 10) -> ('h' | 14)
       A            B             E
```

And the prefix list is [B, C, D, E]. Currently, if we process character 'h' in time step 15 with a probability higher than both C and E, we update both nodes to have time step 15, which breaks linearity in beam 1. With my fix, we only update node E, which is a leaf. In my tests this does fix the problem, but since we don't have any known good quality data to verify against, it's hard to know if it has other side effects.
2019-08-20 12:03:59 +02:00
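A toy Python sketch of the rule described in this message; the actual fix lives in the C++ CTC decoder's PathTrie, so the class and method names below are purely illustrative.

```python
class PrefixNode:
    # Toy stand-in for the decoder's PathTrie node, for illustration only.
    def __init__(self, character, timestep, parent=None):
        self.character = character
        self.timestep = timestep
        self.children = {}
        if parent is not None:
            parent.children[character] = self

    def is_leaf(self):
        return not self.children

    def maybe_update_timestep(self, new_timestep):
        # The fix described above: only a leaf may move its timestep,
        # otherwise a parent could end up later than its own children and the
        # per-character timings would no longer be monotonic along a beam.
        if self.is_leaf():
            self.timestep = new_timestep


root = PrefixNode(-1, 0)
s = PrefixNode("s", 10, parent=root)
h = PrefixNode("h", 13, parent=s)
e = PrefixNode("e", 14, parent=h)

h.maybe_update_timestep(15)   # ignored: 'h' has a child ('e' | 14)
e.maybe_update_timestep(15)   # applied: 'e' is a leaf
```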
lissyx
3e60413f27
Merge pull request #2305 from rhamnett/patch-3
Update import_voxforge.py
2019-08-18 13:53:09 +02:00
Richard Hamnett
57156fffd0
Update import_voxforge.py
Fix importer
2019-08-18 09:44:12 +01:00
Reuben Morais
7f642eda94
Merge pull request #2283 from rcgale/master
Checking for empty transcripts during character encoding
2019-08-16 09:18:22 +02:00
Reuben Morais
29836c9dcc
Merge pull request #2289 from mozilla/error-non-finite-loss
Error message if a sample has non-finite loss
2019-08-12 10:48:59 +02:00
Robert Gale
85e25fa2d7 Applying text_to_char_array to each row in DataFrame so we can provide wav_filename context on exception 2019-08-07 14:43:41 -07:00
Reuben Morais
86fff2f660
Merge pull request #2265 from mozilla/cudnnrnn_compatible
Allow loading a CuDNN RNN checkpoint in a CPU-capable graph (Fixes #2264)
2019-08-07 14:57:16 +02:00
Reuben Morais
248c01001e Error message if a sample has non-finite loss 2019-08-07 10:31:15 +02:00
Reuben Morais
c76070be19
Merge pull request #2282 from mozilla/dynamic-batch-size-in-train-val-graph
Use dynamic batch size in train/val graph
2019-08-07 10:03:53 +02:00
dabinat
1cf2d6a8e6
Merge pull request #2287 from dabinat/label-validation
Label validation - Replace hyphens with spaces
2019-08-06 06:19:45 -07:00
dabinat
abc05b4a4d Label validation - Replace hyphens with spaces 2019-08-05 09:20:13 -07:00
Robert Gale
a3e0e9f9bc
Update text.py
"characters" was a bad variable name now that I think about it
2019-08-01 12:14:13 -07:00
Robert Gale
8ec6ac8079 Checking for empty transcripts during character encoding
This way we can get a plain English exception early, rather than a matrix shape error during training.
2019-08-01 11:19:21 -07:00
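A minimal sketch of the early check this commit adds, assuming an `alphabet.encode`-style API; names are illustrative rather than the exact training code.

```python
def encode_transcript(transcript, alphabet):
    # Sketch only: `alphabet.encode` stands in for the project's Alphabet API.
    if not transcript or not transcript.strip():
        raise ValueError(
            "empty transcript found; it would produce a zero-length label "
            "vector and surface later as a confusing matrix shape error"
        )
    return alphabet.encode(transcript)
```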
Reuben Morais
3636d9b481 Use dynamic batch size in train/val graph
Avoid needing to use the same batch size for training and validation.
2019-08-01 14:53:46 +02:00
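A short TF 1.x-style sketch of what a dynamic batch dimension looks like, assuming graph mode; the tensor shape and names are illustrative, not the actual DeepSpeech graph code.

```python
import tensorflow.compat.v1 as tfv1

n_input = 26  # features per frame (illustrative value)

# Leaving the leading dimension as None lets one graph accept training and
# validation batches of different sizes; the batch size is resolved at run time.
features = tfv1.placeholder(tfv1.float32, shape=[None, None, n_input], name="features")
batch_size = tfv1.shape(features)[0]
```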
Tilman Kamp
daa6167829
Merge pull request #2268 from tilmankamp/reportfilename
Fix #2180 - Added wav_filename to WER report
2019-07-23 17:19:32 +02:00
Tilman Kamp
007e512c00 Fix #2180 - Added wav_filename to WER report 2019-07-23 16:36:10 +02:00
Reuben Morais
84e1fa98b9
Remove unnecessary validator for --export_dir
X-DeepSpeech: NOBUILD
2019-07-22 14:19:19 +00:00
Reuben Morais
9da99d74e4 Remove additional unneeded op lib deps 2019-07-22 15:40:11 +02:00
Reuben Morais
b68bfdbb6e
Merge pull request #2263 from mozilla/remove-unneeded-ops
Remove use of StridedSlice and update op/kernel deps (Fixes #2179)
2019-07-22 13:39:01 +00:00
Reuben Morais
2c29cda641 Remove trailing whitespace 2019-07-22 14:31:29 +02:00
Reuben Morais
7fb3b8f22d Update kernel/op dependencies 2019-07-22 14:31:29 +02:00
Reuben Morais
6afd96e30f Use static batch size whenever it's known 2019-07-22 14:29:45 +02:00
Reuben Morais
23f5bc090d Update prod model to version re-exported with latest master 2019-07-22 14:29:45 +02:00
Reuben Morais
e3d0a44e83 Allow loading a CuDNN RNN checkpoint in a CPU-capable graph 2019-07-22 12:56:26 +02:00
Reuben Morais
1d50667234
Merge pull request #2240 from mozilla/cudnnrnn
Add CuDNN RNN support
2019-07-22 07:27:28 +00:00
Reuben Morais
fd3fbcaa78 Address review comments 2019-07-20 09:15:07 +02:00
Reuben Morais
f7a715d506 Use CuDNN RNN for training 2019-07-20 09:15:07 +02:00
lissyx
7fd7381871
Merge pull request #2253 from lissyx/fix-clean-target
Remove useless make target 'bindings-clean'
2019-07-19 20:38:31 +02:00
Alexandre Lissy
eca7882b14 Remove useless make target 'bindings-clean'
Fixes #2175
2019-07-19 17:39:56 +02:00
lissyx
03e2818649
Merge pull request #2258 from lissyx/fix-lzma
Add liblzma-dev to build python from pyenv with _lzma
2019-07-19 17:23:13 +02:00
Alexandre Lissy
938fdaf118 Add liblzma-dev to build python from pyenv with _lzma
Fixes #2256
2019-07-19 16:45:38 +02:00
lissyx
89338546ac
Merge pull request #2254 from lissyx/fix-android-doc
Fix typo in Android doc
2019-07-19 15:24:45 +02:00
Alexandre Lissy
4fc3c48462 Fix typo in Android doc
X-DeepSpeech: NOBUILD
2019-07-19 15:24:10 +02:00
lissyx
3b04fdef88
Merge pull request #2250 from alchemi5t/master
Added reference about generate_trie for clarity
2019-07-18 19:31:46 +02:00
alchemi5t
5c3e8f5e79 Added reference about generate_trie for clarity 2019-07-18 16:59:40 +05:30
Reuben Morais
0d8eee195a
Merge pull request #2247 from mozilla/exclude_swig_wrapper
Exclude generated SWIG wrapper code from decoder sources
2019-07-17 14:35:45 +00:00
Reuben Morais
8af1286d87
Exclude generated SWIG wrapper code from decoder sources 2019-07-17 13:16:38 +00:00
lissyx
69ad6016a7
Merge pull request #2239 from lissyx/try-bottle
Use TaskCluster caching for Python and Homebrew setup on macOS
2019-07-16 16:27:14 +02:00
Alexandre Lissy
cfb8f81611 Leverage TaskCluster-level caching for Homebrew and Python on macOS 2019-07-16 14:14:06 +02:00
Reuben Morais
aa88e57403
Merge pull request #2245 from mozilla/revert-dc78f8d
Revert "Remove deprecated SessionConfig" as it inadvertently disables soft placement of ops
2019-07-15 13:46:09 +00:00
Reuben Morais
6566299adf Revert "Remove deprecated SessionConfig in favor of just using defaults" as it inadvertently disables soft placement of ops
This reverts commit dc78f8d1e65c4b387aa335df243898cd04704f98.
2019-07-15 15:03:23 +02:00
lissyx
078d179ae1
Merge pull request #2242 from lissyx/package-libdev-win
Package Windows link-lib
2019-07-12 20:07:38 +02:00
Alexandre Lissy
df2a1d6479 Force rebuild 2019-07-12 17:44:34 +02:00
Alexandre Lissy
8588c192b6 Force rebuild 2019-07-12 17:44:22 +02:00
Reuben Morais
4da92084d1 Bump version to 0.6.0-alpha.4
X-DeepSpeech: NOBUILD
2019-07-12 15:24:36 +02:00
Reuben Morais
0f67078549
Merge pull request #2244 from lissyx/fix-rename_to_gpu
Ensure --cuda flag passed for CUDA builds
2019-07-12 13:23:43 +00:00
Alexandre Lissy
e9d9b33ff6 Ensure --cuda flag passed for CUDA builds
Fixes #2243
2019-07-12 14:15:45 +02:00
Alexandre Lissy
e6c05a2313 Package Windows link-lib 2019-07-12 12:06:40 +02:00
Reuben Morais
d52498ff4f
Merge pull request #2241 from mozilla/silence-unknown-opkernel-warnings
Include additional op libs to silence warnings
2019-07-12 08:04:02 +00:00
Reuben Morais
fe88960c69 Include additional op libs to silence warnings 2019-07-12 01:25:45 +02:00
lissyx
e97cc41a91
Merge pull request #2238 from lissyx/bump-v0.6.0-alpha.3
Bump VERSION to 0.6.0-alpha.3
2019-07-11 13:48:55 +02:00
Alexandre Lissy
0c63cf80b3 Bump VERSION to 0.6.0-alpha.3 2019-07-11 13:48:07 +02:00
Reuben Morais
b2a76bd182
Merge pull request #2218 from mozilla/update-1.14
Update to TF 1.14
2019-07-11 10:03:05 +00:00
Reuben Morais
4ca74f72d1 Switch to mozilla/tensorflow r1.14 indexes 2019-07-11 09:34:46 +02:00
Reuben Morais
0b96ba40c2 Workaround 7z missing path problem by removing redundant destination argument 2019-07-10 21:47:33 +02:00
lissyx
1db549a52c
Merge pull request #2234 from lissyx/fix-invalid-prefix
Ensure proper removal of prefix
2019-07-10 15:10:29 +02:00
Alexandre Lissy
c3a2b6ea48 Ensure proper removal of prefix
Fixes #2230
2019-07-10 13:27:18 +02:00
Reuben Morais
615b31a094 Switch to deepspeech-win-b worker type with cudafe++ fix 2019-07-08 18:59:53 +02:00
Reuben Morais
93e9ac19c5 Bump references to TF 1.13.1 to TF 1.14.0 2019-07-08 18:56:59 +02:00
Reuben Morais
6f3e824ef7 Use tf.compat.v1 to silence deprecation warnings and enable TF 2.0 testing 2019-07-08 18:56:59 +02:00
Reuben Morais
dc78f8d1e6 Remove deprecated SessionConfig in favor of just using defaults 2019-07-08 18:56:59 +02:00
Reuben Morais
d52ab9df0b Add TF op libs to libdeepspeech dependencies and rename darwin -> macos 2019-07-08 18:56:59 +02:00
lissyx
fdda74d498
Merge pull request #2232 from lissyx/bump-v0.6.0-alpha.2
Bump VERSION to 0.6.0-alpha.2
2019-07-05 17:44:16 +02:00
Alexandre Lissy
922e0c54fb Bump VERSION to 0.6.0-alpha.2 2019-07-05 17:43:34 +02:00
lissyx
1515d3ebb7
Merge pull request #2225 from lissyx/debug-node9-macos
WIP: Debug NodeJS v9 macOS failures
2019-07-05 16:57:59 +02:00
Alexandre Lissy
39b388c1ea Fix NodeJS bindings leakage / GC crash with MetadataItem querying
Fixes #2217
2019-07-05 14:06:28 +02:00
Reuben Morais
5c8af86aca
Merge pull request #2200 from rhamnett/patch-2
Allow for different sample rate exports
2019-07-04 15:04:22 +00:00
Richard Hamnett
a5c61ea588 Abstract sample rate / update beam width (#2199)
* Abstract sample rate
2019-07-04 14:58:45 +00:00
lissyx
06056949bb
Merge pull request #2228 from lissyx/fix-npm-prefix
Ensure 'npm install' under task directory
2019-07-03 12:50:55 +02:00
Alexandre Lissy
46bf75074f Ensure 'npm install' under task directory 2019-07-03 10:32:28 +02:00
lissyx
464d89d9ff
Merge pull request #2222 from lissyx/update-nodejs-python
Update versions of Python, NodeJS and ElectronJS
2019-07-02 19:50:12 +02:00
Reuben Morais
dbd62fc1c6
Merge pull request #2226 from eggonlea/trivial_download_fix
Fix trivial shadow variable in util/taskcluster.py
2019-07-02 17:45:19 +00:00
Li Li
0190c48d5e Fix trivial shadow variable in util script
Signed-off-by: Li Li <eggonlea@msn.com>
2019-07-02 10:39:29 -07:00
Alexandre Lissy
8753b19eb3 Update versions of Python, NodeJS and ElectronJS 2019-07-02 16:41:23 +02:00
Reuben Morais
c45c70cdaa
Merge pull request #2216 from lissyx/doc-allow-growth
Document TF_FORCE_GPU_ALLOW_GROWTH

X-DeepSpeech: NOBUILD
2019-06-27 18:16:35 +00:00
Reuben Morais
182c405eeb
Merge pull request #2160 from mozilla/more-mandarin-importers
More mandarin importers
2019-06-27 17:43:24 +00:00
Reuben Morais
67b4f6826a Add importer for aidatatang_200zh corpus 2019-06-27 14:40:20 -03:00
Reuben Morais
67a769e0d7 Add importer for Free ST Chinese Mandarin Corpus 2019-06-27 14:40:20 -03:00
Reuben Morais
ee78d471a2 Add importer for Primewords Chinese Corpus Set 1 2019-06-27 14:40:20 -03:00
Alexandre Lissy
1acfadfc98 Document TF_FORCE_GPU_ALLOW_GROWTH
Fixes #2211
2019-06-27 16:51:19 +02:00
lissyx
f7ae19a16a
Merge pull request #2215 from lissyx/package-header
Package DeepSpeech header
2019-06-27 15:34:09 +02:00
Alexandre Lissy
2f225dc987 Package DeepSpeech header
Fixes #2214
2019-06-27 15:07:01 +02:00
lissyx
f50fefda90
Merge pull request #2195 from lissyx/tflite-all
TFLite runtime on Linux/macOS/Windows
2019-06-26 21:45:07 +02:00
Alexandre Lissy
ad11e02582 Testing TFLite runtime on Linux/macOS/Windows 2019-06-26 20:18:53 +02:00
Alexandre Lissy
a9ec2b5cd6 Building TFLite runtime on Linux/macOS/Windows 2019-06-26 20:18:53 +02:00
Reuben Morais
87a9605886
Merge pull request #2212 from mozilla/workspace-status-version
Use bazel workspace status to generate versions
2019-06-26 10:58:06 +00:00
Reuben Morais
d50c0397c7 Use bazel workspace status to generate versions 2019-06-25 18:03:52 -03:00
lissyx
570f9867d6
Merge pull request #2210 from lissyx/git-subdmodules
Handle git submodules for getting DeepSpeech / TensorFlow versions
2019-06-25 18:16:41 +02:00
Alexandre Lissy
fd156a63f6 Handle git submodules for getting DeepSpeech / TensorFlow versions
Fixes #2209
2019-06-25 15:01:12 +02:00
lissyx
90e79804ad
Merge pull request #2207 from lissyx/bump-0.6.0a1
Bump VERSION to 0.6.0-alpha.1
2019-06-25 11:09:53 +02:00
Alexandre Lissy
08bb1cbdef Bump VERSION to 0.6.0-alpha.1
Fixes #2206
2019-06-25 11:09:13 +02:00
Reuben Morais
67d6b5f8ca
Merge pull request #2190 from mozilla/update-prodmodel-0.6.0-alpha.0
Update prod tests to point to re-exported 0.5 model
2019-06-24 21:53:11 +00:00
Reuben Morais
a618652283
Merge pull request #2198 from mozilla/update-brew
Update to Homebrew 2.1.6 to fix macOS failures
2019-06-24 11:24:33 -03:00
Reuben Morais
d7241df241 Update to Homebrew 2.1.6 2019-06-24 09:24:26 -03:00
Richard Hamnett
c248ed0435
Allow for different sample rate exports
Sets output format to Sox transformer
2019-06-23 19:47:12 +01:00
Francis Tyers
6e53fd8fe0
typo
X-DeepSpeech: NOBUILD
2019-06-22 22:18:25 +01:00
Francis Tyers
bc4d266cb2
Update CODINGSTYLE.md 2019-06-22 21:41:48 +01:00
Francis Tyers
11d688fe1a
Create CODINGSTYLE.md
Make a file for keeping track of coding guidelines.
2019-06-22 21:39:55 +01:00
Reuben Morais
0ea580449c
Merge pull request #2194 from mozilla/const-fst
Switch to ConstFst from VectorFst and mmap trie file when reading
2019-06-22 09:41:15 -03:00
Reuben Morais
585d94df7f Update trie files to regenerated versions 2019-06-21 23:24:21 -03:00
Reuben Morais
31afc6811f Switch to ConstFst from VectorFst and mmap trie file when reading 2019-06-21 22:21:19 -03:00
Francis Tyers
58f3758a8c Add validators for command line arguments that require paths (#2192) 2019-06-21 00:02:17 -03:00
Reuben Morais
ceb9de9fad Fix concurrent_streams.py on Python 2.7 2019-06-20 21:53:39 -03:00
Reuben Morais
79ab22086e Fix tests to preserve multiline output and stderr output 2019-06-20 20:55:08 -03:00
Reuben Morais
f70f068484 Update prod tests to point to re-exported 0.5 model 2019-06-20 18:32:30 -03:00
Reuben Morais
b2ebb65a67
Merge pull request #2189 from mozilla/bump-0.6-alpha0
Bump version to 0.6.0-alpha.0
2019-06-20 17:54:14 -03:00
Reuben Morais
dc1c716907 Bump version to 0.6.0-alpha.0
X-DeepSpeech: NOBUILD
2019-06-20 17:31:06 -03:00
Reuben Morais
a2306cf822
Merge pull request #2146 from mozilla/refactor-model-impls
Refactor TF and TFLite implementations into their own classes/files and fix concurrent/interleaved stream bugs by tracking LSTM state in StreamingState
2019-06-20 17:03:33 -03:00
Reuben Morais
4b29b78832 Bump version to 0.5.1 2019-06-20 14:25:14 -03:00
Reuben Morais
080fc27c65
Merge pull request #2184 from mozilla/fix-intermediate-decode
Fix for DS_IntermediateDecode modifying live prefixes in DecoderState
2019-06-20 10:37:26 -03:00
Reuben Morais
37b82f93ac Add a test for calling DS_IntermediateDecode during streaming 2019-06-20 09:48:00 -03:00
Reuben Morais
95a54bfdb8 Modify copies rather than live prefix scores in decoder_decode 2019-06-19 19:39:57 -03:00
Reuben Morais
d5d55958da Clean up formatting in ctc_beam_saerch_decoder.cpp 2019-06-19 19:39:57 -03:00
Reuben Morais
f12ea5e958 Add a test for interleaved/concurrent streams with a single model instance 2019-06-18 19:38:59 -03:00
Reuben Morais
ea1422d47b Document vector/tensor copy functions 2019-06-18 19:38:59 -03:00
Reuben Morais
4b305d2f5e Remove --use_seq_length flag 2019-06-18 19:38:59 -03:00
Reuben Morais
e51b9d987d Remove previous state model variable, track by hand in StreamingState instead 2019-06-18 19:38:59 -03:00
Reuben Morais
6e78bac799 Address review comments 2019-06-18 19:38:59 -03:00
Reuben Morais
6f953837fa Refactor TF and TFLite model implementations into their own classes/files 2019-06-18 19:38:59 -03:00
Reuben Morais
e136b5299a
Merge pull request #2181 from eggonlea/streaming
Add option to native client binary to print intermediate transcriptions while streaming the input file
2019-06-18 16:57:31 -03:00
Li Li
2604da4bc2 native_client: option to run streaming mode
Signed-off-by: Li Li <eggonlea@msn.com>
2019-06-18 10:44:52 -07:00
lissyx
dce068f3cb
Merge pull request #2174 from lissyx/strip-unused
Apply strip_unused on inference graph
2019-06-14 19:39:00 +02:00
Alexandre Lissy
8466d5e4eb Apply strip_unused on inference graph 2019-06-14 17:34:23 +02:00
lissyx
fece16a9be
Merge pull request #2176 from lissyx/clean-tc-scripts
Move all tc-* scripts under taskcluster/
2019-06-14 17:32:47 +02:00
Alexandre Lissy
bec5be7640 Move all tc-* scripts under taskcluster/
Fixes #1912
2019-06-14 16:06:01 +02:00
lissyx
7c8ccf1d8a
Merge pull request #2173 from lissyx/electronjs-disturl
Update ElectronJS headers dist-url
2019-06-14 13:19:22 +02:00
Alexandre Lissy
e87e5616ed Update ElectronJS headers dist-url
Fixes #2172
2019-06-14 11:26:51 +02:00
Reuben Morais
193e054a14
Merge pull request #2168 from eggonlea/evaluate_tflite
Add option to dump output in evaluate_tflite.py
2019-06-13 15:07:39 -03:00
Li Li
863c5544ca evaluate_tflite: Fix shared Queue
Also dump output to a file
Fixed some trivial pylint issues at the same time

Signed-off-by: Li Li <eggonlea@msn.com>
2019-06-13 10:47:15 -07:00
Reuben Morais
3db7a99fad
Merge pull request #2167 from mozilla/bump-0.5.0
Bump version to 0.5.0
2019-06-11 12:23:58 -03:00
Reuben Morais
a7e1d819ad Bump version to 0.5.0
X-DeepSpeech: NOBUILD
2019-06-11 12:12:40 -03:00
Reuben Morais
6f8c902f25
Merge pull request #2165 from mozilla/keep-stream-absolute-timestep
Keep absolute per-stream time step in DecoderState (Fixes #2163)
2019-06-11 11:23:04 -03:00
Reuben Morais
3ceb05e8b9 Keep absolute per-stream time step in DecoderState 2019-06-11 07:58:16 -03:00
Reuben Morais
90efbf0b4f
Merge pull request #2161 from mozilla/fix-decoder-leak
Move decoder state to StreamingState and fix leak when creating multiple streams per model
2019-06-10 15:12:52 -03:00
Reuben Morais
d9c2bec35d Move decoder state to StreamingState and fix leak when creating multiple streams per model 2019-06-10 12:21:09 -03:00
lissyx
31e8a3834e
Merge pull request #2158 from lissyx/frames
Frame counter should be initialized on all importers
2019-06-09 08:01:05 +02:00
Alexandre Lissy
f33ead8af9 Frame counter should be initialized on all importers
Fixes #2150
2019-06-09 07:43:58 +02:00
Reuben Morais
9224d4de2b Remove unused flag
X-DeepSpeech: NOBUILD
2019-06-08 18:27:44 -03:00
Reuben Morais
94df405ec4
Merge pull request #2156 from mozilla/update-prodmodel-0.5
Update prod tests to point to 0.5 model
2019-06-07 14:34:36 -03:00
Reuben Morais
f02d79cca2 Update prod tests to point to 0.5 model 2019-06-07 13:14:45 -03:00
lissyx
3572ded5da
Merge pull request #2152 from lissyx/fix-frames
Ensure frames counter is initialized
2019-06-06 11:16:52 +02:00
Alexandre Lissy
1313b51a5d Ensure frames counter is initialized
Fixes #2150
2019-06-06 10:42:26 +02:00
lissyx
35cbc16697
Merge pull request #2149 from lissyx/fix-ll
Do not fail without --bogus-records
2019-06-05 16:11:31 +02:00
Alexandre Lissy
32a73b7224 Do not fail without --bogus-records 2019-06-05 16:04:45 +02:00
lissyx
8174f3f6db
Merge pull request #2148 from lissyx/lingua-libre-bogus
Do not import known bogus Lingua Libre records
2019-06-04 18:48:56 +02:00
Alexandre Lissy
3a17896463 Do not import known bogus Lingua Libre records
Fixes #2147
2019-06-04 18:45:34 +02:00
Reuben Morais
10d98e1df9
Merge pull request #2145 from mozilla/decoder-optimizations
Decoder optimizations
2019-06-04 13:37:59 -03:00
Reuben Morais
494b573c80 Address review comments 2019-06-04 11:58:38 -03:00
Reuben Morais
1c87bf781a Avoid sorting prefix array twice 2019-06-01 15:06:56 -03:00
Reuben Morais
a46288e1c8 Only create Output structure for beams that are returned in the API 2019-06-01 15:02:41 -03:00
Reuben Morais
1201739af2
Merge pull request #2143 from mozilla/bump-0.5-alpha.11
Bump VERSION to 0.5.0-alpha.11
2019-05-30 22:55:00 -03:00
Reuben Morais
47066bc9d8 Bump VERSION to 0.5.0-alpha.11
X-DeepSpeech: NOBUILD
2019-05-30 22:48:24 -03:00
Reuben Morais
4fcf47c3c6
Merge pull request #2141 from mozilla/tflite-separate-exec-plans
Use separate execution plans for acoustic model and feature computation invoke calls (Fixes #2139)
2019-05-30 21:07:09 -03:00
Reuben Morais
be0da45878 Use separate execution plans for acoustic model and feature computation invoke calls 2019-05-30 17:39:42 -03:00
Reuben Morais
b92cea1c0b
Merge pull request #2131 from carlfm01/dotnet-errorcodes
Error codes for .NET bindings
2019-05-29 10:00:07 -03:00
Carlos Fonseca Murillo
73511e48b2 Match native error codes for .NET 2019-05-29 08:08:35 +00:00
lissyx
df5545299e
Merge pull request #2134 from lissyx/audio-hours
Computing audio hours at import
2019-05-28 17:52:51 +02:00
Alexandre Lissy
17e3f284a5 Computing audio hours at import 2019-05-28 16:46:20 +02:00
Reuben Morais
f14460ffb8
Specify minimum SWIG version 2019-05-28 13:09:43 +00:00
lissyx
f352f0efc7
Merge pull request #2136 from lissyx/win-swig4
Windows/pacman provides SWIG 4.0.0
2019-05-28 15:06:11 +02:00
Alexandre Lissy
a9f237e654 Windows/pacman provides SWIG 4.0.0 2019-05-28 13:24:47 +02:00
Reuben Morais
07370cceca
Merge pull request #2117 from areyliu6/master
Add flag to Common Voice importer to separate every character with spaces
2019-05-27 12:33:53 -03:00
lissyx
16afca194d
Merge pull request #2132 from lissyx/fix-macOS-caches
Ensure proper cleanup on macOS workers
2019-05-27 11:13:50 +02:00
Alexandre Lissy
81b333c2f5 Ensure proper cleanup on macOS workers
It seems like `-e` does its job and bash catches the faulty exit code, thus
avoiding the required `mv` for dealing with caches. Relying on 'trap' is
the proper way to always go through this code path and ensure picking
the exit code.
2019-05-27 08:53:52 +02:00
Carlos Fonseca
5576b501b9
Merge pull request #2130 from carlfm01/master
Change wrong BEAM_WIDTH .NET console client
2019-05-26 05:17:59 +00:00
Carlos Fonseca Murillo
2bf4b4385f Change wrong BEAM_WIDTH 2019-05-26 02:12:56 +00:00
lissyx
685a0dbb89
Merge pull request #2127 from lissyx/bump-v0.5.0-alpha.10
Bump VERSION to 0.5.0-alpha.10
2019-05-22 20:07:42 +02:00
Alexandre Lissy
66b96c48a7 Bump VERSION to 0.5.0-alpha.10
X-DeepSpeech: NOBUILD
2019-05-22 20:07:05 +02:00
lissyx
325071a2a4
Merge pull request #2126 from lissyx/init-previous_state
Move init of previous_state_{c,h}
2019-05-22 20:06:14 +02:00
Alexandre Lissy
c95397c7ce Move init of previous_state_{c,h}
Fixes #2125
2019-05-22 18:29:24 +02:00
lissyx
82df4a326b
Merge pull request #2124 from lissyx/bump-v0.5.0-alpha.9
Bump VERSION to 0.5.0-alpha.9
2019-05-22 08:45:46 +02:00
Alexandre Lissy
475a18c9b2 Bump VERSION to 0.5.0-alpha.9
X-DeepSpeech: NOBUILD
2019-05-22 08:44:03 +02:00
Arey
8f806f7a3a fit PEP8 2019-05-22 13:05:17 +08:00
Arey
fbedbbc9f9 make flag name more explicit 2019-05-22 12:47:11 +08:00
dabinat
69538f2f62
Merge pull request #2121 from dabinat/streaming-decoder
CTC streaming decoder
2019-05-21 21:41:48 -07:00
dabinat
d9a269412e CTC beam search streaming decoder (+6 squashed commits)
Squashed commits:
[2941b47] Fixed nits
[700572e] Restored old CTC decoder API
[5aaf75d] Fixed nits
[969d71a] Added a destructor for DecoderState
[af0be6e] Removed accumulated_logits
[9dcb7b4] CTC beam search streaming decoder
2019-05-21 20:54:19 -07:00
Arey
53f88f0c33 common voice mandarin 2019-05-17 16:26:22 +08:00
Reuben Morais
df5bb31046
Merge pull request #2111 from mozilla/test-epoch-oom
Revert to a pipelined approach for test epochs to avoid CPU OOM with large alphabets
2019-05-14 18:57:30 +00:00
Reuben Morais
699e4ebcd7 Revert to a pipelined approach for test epochs to avoid CPU OOM with large alphabets 2019-05-13 23:49:14 -03:00
lissyx
a4b35d2f24
Merge pull request #2109 from lissyx/bump-v0.5.0-alpha.8
Bump VERSION to 0.5.0-alpha.8
2019-05-10 19:29:27 +02:00
Alexandre Lissy
6d7399add2 Bump VERSION to 0.5.0-alpha.8
X-DeepSpeech: NOBUILD
2019-05-10 19:28:52 +02:00
lissyx
e491705b40
Merge pull request #2105 from lissyx/export-c
Exported symbols should not be C++ mangled
2019-05-10 19:28:15 +02:00
Alexandre Lissy
5e7ee43174 Exported symbols should not be C++ mangled 2019-05-10 17:30:59 +02:00
lissyx
0c56d1111f
Merge pull request #2107 from lissyx/swig4-macos
Swig4 macos
2019-05-10 17:30:23 +02:00
Alexandre Lissy
af26d8b2bb Deprecate training on Python 2.7 2019-05-10 17:23:54 +02:00
Alexandre Lissy
d47839c71a Remove upstream-merged changed and move to SWIG 4.0.0 2019-05-10 12:42:07 +02:00
lissyx
41c3ffbed2
Merge pull request #2100 from lissyx/ts-wav-convert
Ensure TrainingSpeech is properly formatted
2019-05-06 16:11:28 +02:00
Alexandre Lissy
d41f98f25c Ensure TrainingSpeech is properly formatted
Fixes #2097
2019-05-06 15:58:36 +02:00
lissyx
687c07001b
Merge pull request #2096 from lissyx/ts_apr
Update TrainingSpeech dataset
2019-05-06 10:15:53 +02:00
Alexandre Lissy
8402e7ac9b Update TrainingSpeech dataset
Fixes #2092

X-DeepSpeech: NOBUILD
2019-05-06 10:13:42 +02:00
lissyx
2b05a02163
Merge pull request #2095 from mozilla/lissyx-patch-1
Bring back CUDA and CuDNN versions.
2019-05-05 15:32:13 +02:00
lissyx
f81d9ddb76
Bring back CUDA and CuDNN versions. 2019-05-05 15:31:41 +02:00
lissyx
c1816d6dfc
Merge pull request #2088 from lissyx/cuda-devices
Force only one GPU on LDC93S1 scripts
2019-04-30 19:13:52 +02:00
Alexandre Lissy
333a175dfd Force only one GPU on LDC93S1 scripts
Fixes #2087
2019-04-30 17:35:26 +02:00
Reuben Morais
f64aa73e7f
Merge pull request #2086 from mozilla/import_aishell
Add AISHELL dataset importer
2019-04-29 23:12:05 +00:00
Reuben Morais
feacdea4aa Add AISHELL dataset importer 2019-04-29 10:00:32 -03:00
Reuben Morais
cb0e9763be
Fix pre-commit hook docs on README.md
X-DeepSpeech: NOBUILD
2019-04-29 12:59:28 +00:00
lissyx
ccf7ab362b
Merge pull request #2079 from cfreemoser/patch-1
Updated Dockerfile
2019-04-29 13:40:30 +02:00
Cem Philipp Freimoser
7cfd7b85dd
Removed software-properties-common and curl 2019-04-28 20:08:31 +02:00
Cem Philipp Freimoser
2050aa4cf6
make git lfs optinal 2019-04-28 17:50:07 +02:00
Cem Philipp Freimoser
f5bb103f88
Updated Dockerfile
* use python 3
* include git-lfs
* allow checkout from GitHub instead of local copy step
2019-04-28 11:37:44 +02:00
lissyx
b240502de9
Merge pull request #2076 from lissyx/bump-v0.5.0-alpha.7
Bump VERSION to 0.5.0-alpha.7
2019-04-26 08:23:30 +02:00
Alexandre Lissy
5a77f09770 Bump VERSION to 0.5.0-alpha.7
X-DeepSpeech: NOBUILD
2019-04-26 08:22:57 +02:00
lissyx
e9af8b1634
Merge pull request #2068 from lissyx/lingua-libre
LinguaLibre importer
2019-04-25 18:48:45 +02:00
Alexandre Lissy
664813134e LinguaLibre importer
Fixes #2067
2019-04-25 18:47:59 +02:00
lissyx
34866df97b
Merge pull request #2073 from lissyx/nodejs-v12
Add NodeJS v12 support
2019-04-25 18:29:55 +02:00
Alexandre Lissy
348dd0e315 Add ElectronJS v5.0.0 2019-04-25 15:43:16 +02:00
Alexandre Lissy
f7f6a1480f Add NodeJS v12 support
Fixes #2070
2019-04-25 15:43:16 +02:00
Reuben Morais
656ab5734a
Merge pull request #2074 from mozilla/bindings-idiomatic
Some bindings clean-up
2019-04-25 13:20:50 +00:00
Reuben Morais
41e6daaff2 Expose deallocation functions in NodeJS binding 2019-04-25 08:49:25 -03:00
lissyx
ec06f942c4
Merge pull request #2072 from lissyx/bump-v0.5.0-alpha.6
Bump VERSION to 0.5.0-alpha.6
2019-04-25 08:41:12 +02:00
Alexandre Lissy
fab9c24fc7 Bump VERSION to 0.5.0-alpha.6
X-DeepSpeech: NOBUILD
2019-04-25 08:40:36 +02:00
Reuben Morais
f397006436 Make Metadata.items more idiomatic in Python bindings 2019-04-24 21:35:10 -03:00
lissyx
9815d54218
Merge pull request #2022 from lissyx/expose-metadata
Expose extended metadata information to bindings
2019-04-24 23:06:33 +02:00
Alexandre Lissy
a9717e702a Fix python linter 2019-04-24 20:12:40 +02:00
Alexandre Lissy
c3c3a3fb81 Expose extended metadata information to bindings
Fixes #2006
2019-04-24 20:12:39 +02:00
Reuben Morais
8f01cca448
Merge pull request #2058 from dabinat/json-output
Change CSV output to JSON
2019-04-18 01:12:54 +00:00
dabinat
fa09736d9b Simplified string formatting 2019-04-17 13:46:05 -07:00
dabinat
8ad7e8e6d5 Changed CSV output to JSON 2019-04-16 15:25:01 -07:00
Reuben Morais
1e601d5c4a
Merge pull request #2038 from mozilla/split-dev-test-epochs
Perform separate validation and test epochs per dataset when multiple files are specified (Fixes #1634 and #2043)
2019-04-16 15:23:16 +00:00
Reuben Morais
904ab1e288 Centralize progress logging and progress bar logic 2019-04-16 11:06:26 -03:00
Reuben Morais
9586fbbd30 Rename --train_cached_features_path to --feature_cache 2019-04-16 11:06:26 -03:00
Reuben Morais
bfa070e6c3 Compute weighted average of individual dev set losses 2019-04-16 11:06:26 -03:00
Reuben Morais
911a1ce4b1 Do separate test epochs if multiple input files are specified 2019-04-16 11:06:26 -03:00
Reuben Morais
58e9b1a78e Log total optimization time 2019-04-16 11:01:40 -03:00
Reuben Morais
a85af3da49 Do separate validation epochs if multiple input files are specified 2019-04-16 11:01:38 -03:00
Reuben Morais
68c17611c6
Merge pull request #2040 from mozilla/pylint-hook
Add linter config for Python and CI integration on PRs
2019-04-12 15:51:33 +00:00
Reuben Morais
f27ed522af Git pre-commit hook instructions 2019-04-12 11:06:14 -03:00
lissyx
718fd437ec
Merge pull request #2044 from dabinat/word-timings
Fix spaces being appended to word
2019-04-12 08:48:28 +02:00
lissyx
cdf4fe3dcd
Merge pull request #2046 from lissyx/fix-decision-task
Upgrade system for Python 3.5 for decision task
2019-04-12 00:40:23 +02:00
Alexandre Lissy
42e649809c Upgrade system for Python 3.5 for decision task
NetworkX upgrade has no more support for Python 3.4

Fixes #2045
X-DeepSpeech: NOBUILD
2019-04-12 00:28:04 +02:00
dabinat
ec50fb9839 Fixed an issue where spaces may be appended to word 2019-04-11 12:52:10 -07:00
Reuben Morais
2645da0290
Merge pull request #2042 from mozilla/move-dotnet
Move .NET bindings to native_client folder
2019-04-11 18:30:43 +00:00
Josh Meyer
6fcad513e8
Native Client README split (#2002)
Refactoring the `native_client` README.

This PR removes redundancies between the master README and the `native_client` README, keeping only instructions for building in the `native_client` README. 

All installation instructions for built binaries / language bindings remain in the master README.
2019-04-11 20:27:59 +02:00
Reuben Morais
97941db3d8 Move .NET bindings to native_client folder 2019-04-11 13:09:45 -03:00
Reuben Morais
6a0c186b5c Correct mistake in len check
X-DeepSpeech: NOBUILD
2019-04-11 12:07:04 -03:00
Reuben Morais
91421a3466 Mention linting in README 2019-04-11 07:02:24 -03:00
Reuben Morais
13757a4258 Fix pylint warnings 2019-04-11 07:02:21 -03:00
lissyx
a05989439e
Merge pull request #2035 from lissyx/electronjs
Build for ElectronJS
2019-04-11 09:43:55 +02:00
Alexandre Lissy
7bd1619120 Build for ElectronJS
Fixes #2032
2019-04-11 08:32:10 +02:00
Reuben Morais
a16e468498 Add pylint CI 2019-04-10 21:21:26 -03:00
lissyx
c51acc1c3b
Merge pull request #2033 from lissyx/test
Remove useless install step
2019-04-10 22:07:33 +02:00
Alexandre Lissy
5a6e32de5c Enable 'use strict' and remove 'node-pre-gyp install'
Fixes #2034
2019-04-10 22:03:51 +02:00
Tilman Kamp
78247c2377
Merge pull request #2037 from tilmankamp/fix2036
Fix #2036 - Using dev_loss in validation log message
2019-04-10 12:10:40 +02:00
Tilman Kamp
4f2d1ecc25 Fix #2036 - Using dev_loss in validation log message 2019-04-10 11:51:17 +02:00
Reuben Morais
625064c4ae
Rename Windows CUDA native client package to match Linux package
X-DeepSpeech: NOBUILD
2019-04-09 22:36:25 +00:00
Reuben Morais
ed7f6bf4ce
Update CUDA dependency section for TF 1.13
X-DeepSpeech: NOBUILD
2019-04-09 13:56:52 +00:00
Tilman Kamp
ca4300fe7f
Merge pull request #2028 from tilmankamp/fix2020
Fix #2020 - Testing best-dev checkpoint
2019-04-09 15:25:12 +02:00
Tilman Kamp
a0c0918e25 Lazy-create global_step 2019-04-09 15:08:09 +02:00
Tilman Kamp
42e5d78e9a Fix #2020 - Testing best-dev checkpoint 2019-04-09 15:08:09 +02:00
Reuben Morais
4b7c00fc36
Merge pull request #2023 from mozilla/dset-size
Don't calculate dataset size by hand, use tf.errors.OutOfRangeError
2019-04-09 09:19:35 -03:00
Reuben Morais
fdc7d77ad6 Log start and end of epoch if progress bar is disabled 2019-04-09 09:12:58 -03:00
Kelly Davis
1c84898f7b
Merge pull request #2029 from mdigiorgio/fix-clang-compilation
Use std::map to fix compilation issue with clang
2019-04-09 14:03:15 +02:00
Michele Di Giorgio
aa27c61d01 Use std::map to fix compilation issue with clang
When building with clang the following compilation error is thrown:

symbol-table.h:199:3: error: no template named 'map'
    map<int64, int64> key_map_;
    ^

This patch solves that issue by explicitly specifying the std namespace.
2019-04-09 11:37:59 +01:00
lissyx
a4a9d365d2
Merge pull request #2027 from lissyx/ec2-decision
Use EC2 Ubuntu mirrors for decision task
2019-04-09 09:49:20 +02:00
Alexandre Lissy
beb811e0f3 Use EC2 Ubuntu mirrors for decision task 2019-04-09 09:11:18 +02:00
Reuben Morais
8fa35518ea
Merge pull request #2024 from mozilla/fix-warning-typo
Fix typo in warning name
2019-04-08 21:33:46 -03:00
Reuben Morais
1c52008572
Fix typo in warning name 2019-04-08 22:29:30 +00:00
Reuben Morais
0b4b806cbf
Merge pull request #2011 from mozilla/free-strings
Provide an API function to free strings returned by API (Fixes #1979)
2019-04-08 19:25:34 -03:00
Reuben Morais
8053548e34 Check for KeyboardInterrupt directly instead of using tf.train.Coordinator 2019-04-08 18:05:20 -03:00
Reuben Morais
cc351cd607 Clean up progress bars for unknown size datasets 2019-04-08 18:04:08 -03:00
Reuben Morais
6ab91f37ec Don't calculate dataset size by hand, use tf.errors.OutOfRangeError 2019-04-08 16:18:15 -03:00
lissyx
15adf008ae
Merge pull request #2019 from lissyx/bump-v0.5.0-alpha-5
Bump VERSION to 0.5.0-alpha.5
2019-04-08 14:45:26 +02:00
Alexandre Lissy
4b3e896202 Bump VERSION to 0.5.0-alpha.5
X-DeepSpeech: NOBUILD
2019-04-08 14:44:44 +02:00
lissyx
3e93b1263b
Merge pull request #2018 from lissyx/win-pkgs
Add Windows Python packages to upload tasks
2019-04-08 14:44:31 +02:00
Alexandre Lissy
47ef5d3328 Add Windows Python packages to upload tasks
X-DeepSpeech: NOBUILD
2019-04-08 14:43:31 +02:00
lissyx
594cf5af99
Merge pull request #2003 from lissyx/py-win
Windows Python bindings
2019-04-08 14:39:09 +02:00
Reuben Morais
5054731751 Use DS_FreeStrings in JNI bindings 2019-04-08 09:38:47 -03:00
Alexandre Lissy
6dbf65cce8 Produce Windows Python bindings
Fixes #1937
2019-04-08 14:38:43 +02:00
Reuben Morais
89ed0f47f0
Merge pull request #2012 from mozilla/output-probability
Add probability to Metadata struct (Fixes #900)
2019-04-08 09:24:42 -03:00
lissyx
a4058e2199
Merge pull request #2017 from lissyx/fix-win-tests
Fix Windows tests execution
2019-04-08 12:35:48 +02:00
Alexandre Lissy
98418f7628 Use Windows paths for NodeJS extraction
This allows overcoming the 260-character limit being hit with some NodeJS
instances.
2019-04-08 10:55:55 +02:00
Alexandre Lissy
7f345fcccc Remove NodeJS v5.x tests on Windows
There is no nodejs-v5.*-win-x64.zip file
2019-04-08 10:22:29 +02:00
Alexandre Lissy
70fb52b125 Fix Windows tests execution 2019-04-08 09:54:06 +02:00
Reuben Morais
4b9f3fbe7d Provide an API function to free strings returned by API 2019-04-05 21:37:34 -03:00
Reuben Morais
a72b69020d Add probability to Metadata struct and fix memory management of metadata 2019-04-05 21:37:24 -03:00
Reuben Morais
d08fc4b6a2 Fix use of export_language flag 2019-04-05 21:37:05 -03:00
Reuben Morais
5779d298e1 Merge branch 'more-metadata' 2019-04-05 14:38:56 -03:00
Reuben Morais
5b80f21668 Rename language flag 2019-04-05 11:54:02 -03:00
Reuben Morais
243f69e682
Merge pull request #2005 from mozilla/relative-epochs
Ignore epochs in checkpoints, always start epoch count from zero
2019-04-05 10:31:19 -03:00
Reuben Morais
7f6fd8b48b Embed more metadata in exported model and read it in native client 2019-04-05 09:35:23 -03:00
Reuben Morais
97c36291af Rename epoch flag to epochs 2019-04-05 09:30:50 -03:00
Reuben Morais
2f3f095048 Ignore epochs in checkpoints, always start epoch count from zero 2019-04-05 00:21:04 -03:00
Reuben Morais
57450893ea
Merge pull request #1919 from mozilla/tfdatatest
Implement input pipeline with tf.data API
2019-04-05 00:13:48 -03:00
Reuben Morais
6154150317 Pass missing dropout rate parameters 2019-04-04 22:56:12 -03:00
Reuben Morais
5ee856d075 Clarify early stopping dependency on validation 2019-04-04 22:41:38 -03:00
Reuben Morais
d70753cc0f Use TASKCLUSTER_TMP_DIR instead of hardcoding /tmp 2019-04-04 22:41:38 -03:00
Reuben Morais
ed15caf3c5 Check if train/dev/test files were passed in instead of having explicit flags 2019-04-04 22:41:38 -03:00
lissyx
f458337710
Merge pull request #2001 from lissyx/cuda-3.5
WIP: TensorFlow
2019-04-03 20:49:50 +02:00
Reuben Morais
232df740db Fix TFLite bug in feature computation graph and clean up deepspeech.cc a bit 2019-04-03 10:19:22 -03:00
Alexandre Lissy
3aa286f615 Update to new TensorFlow r1.13 artifacts
Fixes #1970
2019-04-03 14:31:57 +02:00
Kelly Davis
25a254f1fc
Merge pull request #2000 from mozilla/issue1906
Gram Vaani importer
2019-04-03 13:43:46 +02:00
kdavis-mozilla
0bc132cabe Addressed review comments 2019-04-03 12:33:30 +02:00
Tilman Kamp
033ee0f4c2
Merge pull request #1996 from tilmankamp/fix1991
Fix #1991 - Additional import options for import_cv2.py
2019-04-03 10:58:31 +02:00
Tilman Kamp
c1e75eaa8d Pack to data set 2019-04-03 10:57:36 +02:00
Reuben Morais
a7cda8e761 Add version info to exported graphs 2019-04-02 21:06:03 -03:00
Reuben Morais
4e9e78fefe Infer number of MFCC features from input shape 2019-04-02 18:31:32 -03:00
Reuben Morais
d6babfb8f3 Speed up training tests and make sure they fully converge 2019-04-02 18:31:32 -03:00
Reuben Morais
6632504ad1 Don't overwrite exported graph from training task with the TFLite version 2019-04-02 18:31:32 -03:00
Reuben Morais
e7bbd4a70f Fix illegal summary names 2019-04-02 18:31:32 -03:00
Reuben Morais
12fe93bfe4 Remove c_speech_features and kiss_fft130 code 2019-04-02 18:31:32 -03:00
Reuben Morais
51f80744c6 Remove DS_AudioToInputVector and dep on c_speech_features 2019-04-02 18:31:32 -03:00
Reuben Morais
1cea2b0fe8 Rewrite input pipeline to use tf.data API 2019-04-02 18:31:32 -03:00
Reuben Morais
bd7358d94e Clarify meaning of build target
X-DeepSpeech: NOBUILD
2019-04-02 16:19:21 -03:00
kdavis-mozilla
441ac5869f Gram Vaani importer 2019-04-02 19:55:58 +02:00
Tilman Kamp
94c088be87 Updated README, some code beautification 2019-04-02 19:41:33 +02:00
Tilman Kamp
7dc236bab4 Removed unnecessary default value 2019-04-02 18:02:23 +02:00
Tilman Kamp
8e78e17904 Some code beautification 2019-04-02 18:02:23 +02:00
Tilman Kamp
5645285d25 Fix #1991 - Additional import options for import_cv2.py 2019-04-02 18:02:23 +02:00
Josh Meyer
7d7a7f7be5
Merge pull request #1965 from mozilla/import_cv2
import_cv2
2019-04-02 08:49:13 -07:00
Reuben Morais
9ca61b077e Fix checkpointing logic 2019-04-01 18:53:06 -03:00
Reuben Morais
b7b44f3573
Merge pull request #1988 from tilmankamp/remdist
Removed distributed training support
2019-04-01 16:54:37 -03:00
Josh Meyer
12d31c11bd
import_cv2 information correction
as per Kelly's review -- `import_cv2.py` does not download data, and assumes different args
2019-04-01 19:16:53 +02:00
Tilman Kamp
a179a2389f Fix #1986 - Remove distributed training support 2019-04-01 18:43:22 +02:00
lissyx
a009361e47
Merge pull request #1974 from dabinat/word-timings
Output word timings
2019-03-30 22:26:25 +01:00
Reuben Morais
cc0bc8b5a4
Merge pull request #1994 from Mozilla-GitHub-Standards/master
Add Mozilla Code of Conduct
2019-03-30 16:57:37 -03:00
dabinat
7cda855cb6 Client - Whitespace fix 2019-03-30 10:23:00 -07:00
dabinat
594f74efe9 API - Null pointer check 2019-03-30 10:22:43 -07:00
Mozilla-GitHub-Standards
1f7babda1a Add Mozilla Code of Conduct file
Fixes #1993.

_(Message COC002)_
2019-03-29 14:58:39 -07:00
dabinat
26af3b292d API - Minor cleanup 2019-03-22 10:08:13 -07:00
dabinat
0364bfa518 API - Switched to std::unique_ptr 2019-03-22 10:07:44 -07:00
dabinat
2f255307e3 Client - Added support for detecting UTF-8 spaces 2019-03-22 10:07:06 -07:00
dabinat
c56ec7ac7d Allocate memory C++-style in API 2019-03-22 02:20:00 -07:00
dabinat
c90828921e Removed unnecessary variables in API 2019-03-22 01:37:00 -07:00
dabinat
d79989c916 Client whitespace tweaks 2019-03-22 01:36:03 -07:00
dabinat
a0304eec1a Output word-level metadata from the client with the -e tag 2019-03-21 18:07:24 -07:00
dabinat
192e17f2d5 Expose letter timings on the API 2019-03-21 18:07:24 -07:00
dabinat
1fcf8a4cc3 Fixed whitespace 2019-03-21 18:03:47 -07:00
dabinat
6f667713bf Client whitespace fixes 2019-03-21 16:36:46 -07:00
dabinat
a3b81d054e Output word-level metadata from the client with the -e tag 2019-03-21 15:53:04 -07:00
dabinat
79830fe512 Expose letter timings on the API 2019-03-21 15:50:02 -07:00
Tilman Kamp
730ef1b5c8
Merge pull request #1973 from tilmankamp/fix1972
Fix #1972
2019-03-21 14:40:26 +01:00
Tilman Kamp
42f04dc9aa Fix #1972 2019-03-21 13:39:12 +01:00
lissyx
c092213096
Merge pull request #1969 from lissyx/bump-v0.5.0-alpha.4
Bump VERSION to v0.5.0-alpha.4
2019-03-20 19:29:43 +01:00
Alexandre Lissy
4f261b7d82 Bump VERSION to v0.5.0-alpha.4
X-DeepSpeech: NOBUILD
2019-03-20 19:29:00 +01:00
lissyx
97bacdd544
Merge pull request #1967 from lissyx/npm-win
Npm win
2019-03-20 19:27:35 +01:00
Alexandre Lissy
d421daa2ca Add NodeJS Windows tests 2019-03-20 17:51:15 +01:00
Alexandre Lissy
fd133c4bdc Add NPM GPU packaging task 2019-03-20 11:58:56 +01:00
Alexandre Lissy
d18697d432 Rename task node-package to node-package-cpu 2019-03-20 11:58:56 +01:00
Alexandre Lissy
2a73a76ac3 Add NodeJS build for Windows 2019-03-20 11:58:56 +01:00
Alexandre Lissy
16b7237e70 Install and patch swig on msys64 2019-03-20 09:41:49 +01:00
josh
69569aab0b import_cv2 2019-03-19 19:00:16 +01:00
1250 changed files with 118776 additions and 21455 deletions

3
.cardboardlint.yml Normal file
View File

@ -0,0 +1,3 @@
linters:
- pylint:
filefilter: ['+ *.py', '+ bin/*.py']

View File

@ -2,15 +2,15 @@
set -xe
apt-get install -y python3-venv
apt-get install -y python3-venv libopus0
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
pip install -r <(grep -v tensorflow requirements.txt)
pip install tensorflow-gpu==1.13.0-rc2
# Install ds_ctcdecoder package from TaskCluster
pip install $(python3 util/taskcluster.py --decoder)
pip install -U setuptools wheel pip
pip install .
pip uninstall -y tensorflow
pip install tensorflow-gpu==1.14
mkdir -p ../keep/summaries
@ -18,19 +18,22 @@ data="${SHARED_DIR}/data"
fis="${data}/LDC/fisher"
swb="${data}/LDC/LDC97S62/swb"
lbs="${data}/OpenSLR/LibriSpeech/librivox"
cv="${data}/mozilla/CommonVoice/en_1087h_2019-06-12/clips"
npr="${data}/NPR/WAMU/sets/v0.3"
python -u DeepSpeech.py \
--train_files "${fis}-train.csv","${swb}-train.csv","${lbs}-train-clean-100.csv","${lbs}-train-clean-360.csv","${lbs}-train-other-500.csv" \
--dev_files "${lbs}-dev-clean.csv"\
--test_files "${lbs}-test-clean.csv" \
--train_files "${npr}/best-train.sdb","${npr}/good-train.sdb","${cv}/train.sdb","${fis}-train.sdb","${swb}-train.sdb","${lbs}-train-clean-100.sdb","${lbs}-train-clean-360.sdb","${lbs}-train-other-500.sdb" \
--dev_files "${lbs}-dev-clean.sdb" \
--test_files "${lbs}-test-clean.sdb" \
--train_batch_size 24 \
--dev_batch_size 48 \
--test_batch_size 48 \
--train_cudnn \
--n_hidden 2048 \
--learning_rate 0.0001 \
--dropout_rate 0.2 \
--epoch 13 \
--display_step 0 \
--validation_step 1 \
--dropout_rate 0.40 \
--epochs 150 \
--noearly_stop \
--feature_cache "../tmp/feature.cache" \
--checkpoint_dir "../keep" \
--summary_dir "../keep/summaries"

5
.dockerignore Normal file
View File

@ -0,0 +1,5 @@
.git/lfs
native_client/ds-swig
native_client/python/dist/*.whl
native_client/ctcdecode/*.a
native_client/javascript/build/

5
.gitattributes vendored
View File

@ -1,3 +1,2 @@
*.binary filter=lfs diff=lfs merge=lfs -crlf
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
data/lm/kenlm.scorer filter=lfs diff=lfs merge=lfs -text
.github/actions/check_artifact_exists/dist/index.js binary

40
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@ -0,0 +1,40 @@
---
name: Bug report
about: Create a report to help us improve
title: 'Bug: '
labels: bug
assignees: ''
---
Welcome to the 🐸STT project! We are excited to see your interest, and appreciate your support!
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
If you've found a bug, please provide the following information:
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Run the following command '...'
2. ...
3. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Environment (please complete the following information):**
- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
- **TensorFlow installed from (our builds, or upstream TensorFlow)**:
- **TensorFlow version (use command below)**:
- **Python version**:
- **Bazel version (if compiling from source)**:
- **GCC/Compiler version (if compiling from source)**:
- **CUDA/cuDNN version**:
- **GPU model and memory**:
- **Exact command to reproduce**:
**Additional context**
Add any other context about the problem here.

8
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@ -0,0 +1,8 @@
blank_issues_enabled: false
contact_links:
- name: Coqui STT GitHub Discussions
url: https://github.com/coqui-ai/STT/discussions
about: Please ask and answer questions here.
- name: Coqui Security issue disclosure
url: mailto:info@coqui.ai
about: Please report security vulnerabilities here.

View File

@ -0,0 +1,26 @@
---
name: Feature request
about: Suggest an idea for this project
title: 'Feature request: '
labels: enhancement
assignees: ''
---
Welcome to the 🐸STT project! We are excited to see your interest, and appreciate your support!
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
If you have a feature request, then please provide the following information:
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

View File

@ -0,0 +1,11 @@
name: "Build TensorFlow"
description: "Build TensorFlow Build"
inputs:
flavor:
description: "Build flavor"
required: true
runs:
using: "composite"
steps:
- run: ./ci_scripts/tf-build.sh ${{ inputs.flavor }}
shell: bash

View File

@ -0,0 +1,43 @@
Building and using a TensorFlow cache:
======================================
This action checks whether an artifact exists in the list of the repo's
release artifacts. Since we don't always want to download the artifact, we
can't rely on the official download-artifact action.
Rationale:
----------
Because of the amount of code required to build TensorFlow, the library build
is split into two main parts to make PRs much faster to run:
- a TensorFlow prebuild cache
- the actual code of the library
The TensorFlow prebuild cache exists because building TensorFlow (even just
`libtensorflow_cpp.so`) involves a huge amount of code and can take several
hours even on decent systems. So we build it once and cache it, because the
TensorFlow version does not change that often.
However, each PR might change the actual library code, so we rebuild that part
every time.
The `tensorflow_opt-macOS` job checks whether such a build cache already exists.
These caches are stored as artifacts because [GitHub Actions
cache](https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows)
has size limitations.
The `build-tensorflow-macOS` job depends on the cache check to know whether it
needs to run an actual build or not.
Hacking:
--------
To hack on the action, please follow the [GitHub JavaScript
Actions](https://docs.github.com/en/actions/creating-actions/creating-a-javascript-action#commit-tag-and-push-your-action-to-github)
documentation, specifically the usage of `ncc`.
```
$ npm install
$ npx ncc build main.js --license licenses.txt
$ git add dist/
```
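Example usage:
--------------
Below is a minimal sketch of how the two jobs mentioned above can be wired
together. The artifact file name, runner label and the path of the TensorFlow
build action are assumptions for illustration; the `name`, `path` and
`download` inputs and the `status` output come from this action's `action.yml`.
```
# Sketch only: cache-check job feeding the dependent build job.
jobs:
  tensorflow_opt-macOS:
    runs-on: macos-latest
    outputs:
      status: ${{ steps.check_artifact_exists.outputs.status }}
    steps:
      - uses: actions/checkout@v2
      - id: check_artifact_exists
        uses: ./.github/actions/check_artifact_exists
        with:
          name: tensorflow_opt-macOS.tar.xz            # assumed artifact name
          path: ${{ github.workspace }}/tensorflow_opt-macOS.tar.xz
          download: false                              # only check existence here
  build-tensorflow-macOS:
    needs: tensorflow_opt-macOS
    # Only rebuild the TensorFlow cache when no artifact was found.
    if: needs.tensorflow_opt-macOS.outputs.status == 'missing'
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v2
        with:
          submodules: recursive
      - uses: ./.github/actions/build-tensorflow       # assumed local action path
        with:
          flavor: opt
```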

View File

@ -0,0 +1,32 @@
name: "check/download artifacts"
description: "Check and download that an artifact exists"
inputs:
name:
description: "Artifact name"
required: true
github_token:
description: "GitHub token"
required: false
default: ${{ github.token }}
download:
description: "Should we download?"
required: false
default: false
path:
description: "Where to unpack the artifact"
required: false
default: "./"
repo:
description: "Repository name with owner (like actions/checkout)"
required: false
default: ${{ github.repository }}
release-tag:
description: "Tag of release to check artifacts under"
required: false
default: "v0.10.0-alpha.7"
outputs:
status:
description: "Status string of the artifact: 'missing' or 'found'"
runs:
using: "node12"
main: "dist/index.js"

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,132 @@
const core = require('@actions/core');
const github = require('@actions/github');
const AdmZip = require('adm-zip');
const filesize = require('filesize');
const pathname = require('path');
const fs = require('fs');
const { throttling } = require('@octokit/plugin-throttling');
const { GitHub } = require('@actions/github/lib/utils');
const Download = require('download');
const Util = require('util');
const Stream = require('stream');
const Pipeline = Util.promisify(Stream.pipeline);
async function getGoodArtifacts(client, owner, repo, releaseId, name) {
console.log(`==> GET /repos/${owner}/${repo}/releases/${releaseId}/assets`);
const goodRepoArtifacts = await client.paginate(
"GET /repos/{owner}/{repo}/releases/{release_id}/assets",
{
owner: owner,
repo: repo,
release_id: releaseId,
per_page: 100,
},
(releaseAssets, done) => {
console.log(" ==> releaseAssets", releaseAssets);
const goodAssets = releaseAssets.data.filter((a) => {
console.log("==> Asset check", a);
return a.name == name
});
if (goodAssets.length > 0) {
done();
}
return goodAssets;
}
);
console.log("==> maybe goodRepoArtifacts:", goodRepoArtifacts);
return goodRepoArtifacts;
}
async function main() {
try {
const token = core.getInput("github_token", { required: true });
const [owner, repo] = core.getInput("repo", { required: true }).split("/");
const path = core.getInput("path", { required: true });
const name = core.getInput("name");
const download = core.getInput("download");
const releaseTag = core.getInput("release-tag");
const OctokitWithThrottling = GitHub.plugin(throttling);
const client = new OctokitWithThrottling({
auth: token,
throttle: {
onRateLimit: (retryAfter, options) => {
console.log(
`Request quota exhausted for request ${options.method} ${options.url}`
);
// Retry twice after hitting a rate limit error, then give up
if (options.request.retryCount <= 2) {
console.log(`Retrying after ${retryAfter} seconds!`);
return true;
} else {
console.log("Exhausted 2 retries");
core.setFailed("Exhausted 2 retries");
}
},
onAbuseLimit: (retryAfter, options) => {
// does not retry, only logs a warning
console.log(
`Abuse detected for request ${options.method} ${options.url}`
);
core.setFailed(`GitHub REST API Abuse detected for request ${options.method} ${options.url}`)
},
},
});
console.log("==> Repo:", owner + "/" + repo);
const releaseInfo = await client.repos.getReleaseByTag({
owner,
repo,
tag: releaseTag,
});
console.log(`==> Release info for tag ${releaseTag} = ${JSON.stringify(releaseInfo.data, null, 2)}`);
const releaseId = releaseInfo.data.id;
const goodArtifacts = await getGoodArtifacts(client, owner, repo, releaseId, name);
console.log("==> goodArtifacts:", goodArtifacts);
const artifactStatus = goodArtifacts.length === 0 ? "missing" : "found";
console.log("==> Artifact", name, artifactStatus);
console.log("==> download", download);
core.setOutput("status", artifactStatus);
if (artifactStatus === "found" && download == "true") {
console.log("==> # artifacts:", goodArtifacts.length);
const artifact = goodArtifacts[0];
console.log("==> Artifact:", artifact.id)
const size = filesize(artifact.size, { base: 10 })
console.log(`==> Downloading: ${artifact.name} (${size}) to path: ${path}`)
const dir = pathname.dirname(path)
console.log(`==> Creating containing dir if needed: ${dir}`)
fs.mkdirSync(dir, { recursive: true })
await Pipeline(
Download(artifact.url, {
headers: {
"Accept": "application/octet-stream",
"Authorization": `token ${token}`,
},
}),
fs.createWriteStream(path)
)
}
if (artifactStatus === "missing" && download == "true") {
core.setFailed("Required", name, "that is missing");
}
return;
} catch (err) {
console.error(err.stack);
core.setFailed(err.message);
}
}
main();

1139
.github/actions/check_artifact_exists/package-lock.json generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,13 @@
{
"name": "check_artifact_exists",
"main": "main.js",
"devDependencies": {
"@actions/core": "^1.2.6",
"@actions/github": "^4.0.0",
"@octokit/plugin-throttling": "^3.4.1",
"@vercel/ncc": "^0.27.0",
"adm-zip": "^0.5.2",
"download": "^8.0.0",
"filesize": "^6.1.0"
}
}

View File

@ -0,0 +1,29 @@
name: "chroot bind mount"
description: "Bind mount into chroot"
inputs:
mounts:
description: "Path to consider"
required: true
runs:
using: "composite"
steps:
- id: install_qemu
run: |
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends qemu-user-static
shell: bash
- id: bind_mount_chroot
run: |
set -xe
# Bind-mount so that we have the same tree inside the chroot
for dev in ${{ github.workspace }} ${{ inputs.mounts }};
do
sudo mount -o bind ${dev} ${{ env.SYSTEM_RASPBIAN }}${dev}
done;
for dev in ${{ inputs.mounts }};
do
sudo mount -o bind /${dev} ${{ env.SYSTEM_RASPBIAN }}/${dev}
done;
shell: bash

15
.github/actions/get_cache_key/README.md vendored Normal file
View File

@ -0,0 +1,15 @@
GitHub Action to compute cache key
==================================
It is intended to work in harmony with `check_artifact_exists`:
- compute a stable cache key
- be as simple to use as possible (fewer parameters)
It expects to run in a GitHub Actions job whose name follows
`SUBMODULE_FLAVOR-PLATFORM`:
- the `SUBMODULE` part is used to look up the current SHA1 of that git submodule
- the `FLAVOR` distinguishes e.g. opt/dbg builds
- the `PLATFORM` defines an OS/arch pair
An `extras` field is available for further customization, such as forcing a
rebuild.
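As a rough usage sketch (the runner label, the `.tar.xz` artifact naming and
the `extras` value are assumptions for illustration; the action paths, the
`extras` input and the `key` output come from the files in this tree), a job
named `tensorflow_opt-macOS` could compose the two actions like this:
```
# Sketch only: the job name encodes SUBMODULE_FLAVOR-PLATFORM, so here
# SUBMODULE=tensorflow, FLAVOR=opt, OSARCH=macOS.
jobs:
  tensorflow_opt-macOS:
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v2
      - id: get_cache_key
        uses: ./.github/actions/get_cache_key
        with:
          extras: "1"                                  # bump to force a re-build
      - uses: ./.github/actions/check_artifact_exists
        with:
          name: ${{ steps.get_cache_key.outputs.key }}.tar.xz
          path: ${{ github.workspace }}/tensorflow-cache.tar.xz
          download: true
```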

View File

@ -0,0 +1,34 @@
name: "get cache key for submodule"
description: "Compute a cache key based on git submodule"
inputs:
extras:
description: "Extra cache key value"
required: true
osarch:
description: "Override automatic OSARCH value"
required: false
outputs:
key:
description: "Computed cache key name"
value: ${{ steps.compute_cache_key.outputs.key }}
runs:
using: "composite"
steps:
- id: compute_cache_key
run: |
set -xe
JOB=${{ github.job }}
SUBMODULE=$(echo $JOB | cut -d'-' -f1 | cut -d'_' -f1)
FLAVOR=$(echo $JOB | cut -d'-' -f1 | cut -d'_' -f2)
if [ -z "${{ inputs.osarch }}" ]; then
OSARCH=$(echo $JOB | cut -d'-' -f2)
else
OSARCH=${{ inputs.osarch }}
fi
SHA=$(git submodule status ${SUBMODULE} | sed -e 's/^-//g' -e 's/^+//g' -e 's/^U//g' | awk '{ print $1 }')
KEY=${SUBMODULE}-${FLAVOR}_${OSARCH}_${SHA}_${{ inputs.extras }}
echo "::set-output name=key::${KEY}"
shell: bash

View File

@ -0,0 +1,30 @@
name: "Install Python"
description: "Installing an upstream python release"
inputs:
version:
description: "Python version"
required: true
runs:
using: "composite"
steps:
- shell: bash
run: |
set -xe
curl https://www.python.org/ftp/python/${{ inputs.version }}/python-${{ inputs.version }}-macosx10.9.pkg -o "python.pkg"
- shell: bash
run: ls -hal .
- shell: bash
run: |
set -xe
sudo installer -verbose -pkg python.pkg -target /
- shell: bash
run: |
set -xe
which python3
python3 --version
python3 -c "import sysconfig; print(sysconfig.get_config_var('MACOSX_DEPLOYMENT_TARGET'))"
- shell: bash
name: Set up venv with upstream Python
run: |
python3 -m venv /tmp/venv
echo "/tmp/venv/bin" >> $GITHUB_PATH

18
.github/actions/install-xldd/action.yml vendored Normal file
View File

@ -0,0 +1,18 @@
name: "xldd install"
description: "Install xldd"
inputs:
target:
description: "System target"
required: true
runs:
using: "composite"
steps:
- id: install_xldd
run: |
source ./ci_scripts/all-vars.sh
# -s required to avoid the noisy output like "Entering / Leaving directories"
toolchain=$(make -s -C ${DS_DSDIR}/native_client/ TARGET=${{ inputs.target }} TFDIR=${DS_TFDIR} print-toolchain)
if [ ! -x "${toolchain}ldd" ]; then
cp "${DS_DSDIR}/native_client/xldd" "${toolchain}ldd" && chmod +x "${toolchain}ldd"
fi
shell: bash

12
.github/actions/libstt-build/action.yml vendored Normal file
View File

@ -0,0 +1,12 @@
name: "Build libstt.so"
description: "Build libstt.so"
inputs:
arch:
description: "Target arch for loading script (host/armv7/aarch64)"
required: false
default: "host"
runs:
using: "composite"
steps:
- run: ./ci_scripts/${{ inputs.arch }}-build.sh
shell: bash

67
.github/actions/multistrap/action.yml vendored Normal file
View File

@ -0,0 +1,67 @@
name: "multistrap install"
description: "Install a system root using multistrap"
inputs:
arch:
description: "Target arch"
required: true
packages:
description: "Extra packages to install"
required: false
default: ""
runs:
using: "composite"
steps:
- id: install_multistrap
run: |
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends multistrap qemu-user-static
shell: bash
- id: create_chroot
run: |
set -xe
multistrap_conf=""
if [ "${{ inputs.arch }}" = "armv7" ]; then
multistrap_conf=multistrap_raspbian_buster.conf
wget http://archive.raspbian.org/raspbian/pool/main/r/raspbian-archive-keyring/raspbian-archive-keyring_20120528.2_all.deb && sudo dpkg -i raspbian-archive-keyring_20120528.2_all.deb
fi
if [ "${{ inputs.arch }}" = "aarch64" ]; then
multistrap_conf=multistrap_armbian64_buster.conf
fi
multistrap -d ${{ env.SYSTEM_RASPBIAN }} -f ${{ github.workspace }}/native_client/${multistrap_conf}
if [ ! -z "${{ inputs.packages }}" ]; then
TO_MOUNT=${{ github.workspace }}
# Prepare target directory to bind-mount the github tree
mkdir -p ${{ env.SYSTEM_RASPBIAN }}/${{ github.workspace }}
# Bind-mount so that we have the same tree inside the chroot
for dev in ${TO_MOUNT};
do
sudo mount -o bind ${dev} ${{ env.SYSTEM_RASPBIAN }}${dev}
done;
# Copy some host data:
# resolv.conf: for getting DNS working
# passwd, group, shadow: to have user accounts and apt-get install working
for ff in resolv.conf passwd group shadow;
do
sudo cp /etc/${ff} ${{ env.SYSTEM_RASPBIAN }}/etc/
done;
# Perform apt steps.
# Preserving the env is required
sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ apt-get update -y
sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ apt-get install -y --no-install-recommends ${{ inputs.packages }}
# Cleanup apt info to save space
sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ rm -fr /var/cache/apt/* /var/lib/apt/lists/*
# Unmount what has been mounted
for dev in ${TO_MOUNT};
do
sudo umount ${{ env.SYSTEM_RASPBIAN }}${dev}
done;
fi
shell: bash

77
.github/actions/node-build/action.yml vendored Normal file
View File

@ -0,0 +1,77 @@
name: "NodeJS binding"
description: "Binding a nodejs binding"
inputs:
nodejs_versions:
description: "NodeJS versions supported"
required: true
electronjs_versions:
description: "ElectronJS versions supported"
required: true
local_cflags:
description: "CFLAGS for NodeJS package"
required: false
default: ""
local_ldflags:
description: "LDFLAGS for NodeJS package"
required: false
default: ""
local_libs:
description: "LIBS for NodeJS package"
required: false
default: ""
target:
description: "TARGET value"
required: false
default: "host"
chroot:
description: "RASPBIAN value"
required: false
default: ""
runs:
using: "composite"
steps:
- run: |
node --version
npm --version
shell: bash
- run: |
npm update
shell: bash
- run: |
mkdir -p tmp/headers/nodejs tmp/headers/electronjs
shell: bash
- run: |
for node in ${{ inputs.nodejs_versions }}; do
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/javascript \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
NODE_ABI_TARGET=--target=${node} \
NODE_DEVDIR=--devdir=headers/nodejs \
clean node-wrapper
done;
shell: bash
- run: |
for electron in ${{ inputs.electronjs_versions }}; do
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/javascript \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
NODE_ABI_TARGET=--target=${electron} \
NODE_DIST_URL=--disturl=https://electronjs.org/headers \
NODE_RUNTIME=--runtime=electron \
NODE_DEVDIR=--devdir=headers/electronjs \
clean node-wrapper
done;
shell: bash
- run: |
make -C native_client/javascript clean npm-pack
shell: bash
- run: |
tar -czf native_client/javascript/wrapper.tar.gz \
-C native_client/javascript/ lib/
shell: bash

22
.github/actions/node-install/action.yml vendored Normal file
View File

@ -0,0 +1,22 @@
name: "nodejs install"
description: "Install nodejs in a chroot"
inputs:
node:
description: "NodeJS version"
required: true
runs:
using: "composite"
steps:
- id: add_apt_source
run: |
set -ex
(echo "Package: nodejs" && echo "Pin: origin deb.nodesource.com" && echo "Pin-Priority: 999") > ${{ env.SYSTEM_RASPBIAN }}/etc/apt/preferences
echo "deb http://deb.nodesource.com/node_${{ inputs.node }}.x buster main" > ${{ env.SYSTEM_RASPBIAN }}/etc/apt/sources.list.d/nodesource.list
wget -qO- https://deb.nodesource.com/gpgkey/nodesource.gpg.key | sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ apt-key add -
shell: bash
- id: install_nodejs
run: |
set -ex
sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ apt-get update -y
sudo --preserve-env chroot ${{ env.SYSTEM_RASPBIAN }}/ apt-get install -y nodejs
shell: bash

14
.github/actions/numpy_vers/README.md vendored Normal file
View File

@ -0,0 +1,14 @@
GitHub Action to set NumPy versions
===================================
This action aims at computing correct values for the NumPy dependencies:
- `NUMPY_BUILD_VERSION`: range of accepted versions at Python binding build time
- `NUMPY_DEP_VERSION`: range of accepted versions at execution time
Versions are set considering several factors:
- API and ABI compatibility; otherwise the binding wrapper can throw errors
like "Illegal instruction", or compute wrong values because of a changed
memory layout
- wheel availability: for CI and end users, we want to avoid having to rebuild
NumPy, so we stick to versions for which an upstream `wheel` file already
exists
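A minimal usage sketch follows; the Python version string is an assumption for
illustration, while the action paths, the `pyver` input, the
`build_version`/`dep_version` outputs, and the `numpy_build`/`numpy_dep`
inputs of the Python build action come from the action files in this diff:
```
# Sketch only: feed the computed NumPy ranges into the Python binding build.
steps:
  - uses: actions/checkout@v2
  - id: numpy_vers
    uses: ./.github/actions/numpy_vers
    with:
      pyver: "3.7.9"                                   # assumed Python version
  - uses: ./.github/actions/python-build
    with:
      numpy_build: ${{ steps.numpy_vers.outputs.build_version }}
      numpy_dep: ${{ steps.numpy_vers.outputs.dep_version }}
```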

93
.github/actions/numpy_vers/action.yml vendored Normal file
View File

@ -0,0 +1,93 @@
name: "get numpy versions"
description: "Get proper NumPy build and runtime versions dependencies range"
inputs:
pyver:
description: "Python version"
required: true
outputs:
build_version:
description: "NumPy build dependency"
value: ${{ steps.numpy.outputs.build }}
dep_version:
description: "NumPy runtime dependency"
value: ${{ steps.numpy.outputs.dep }}
runs:
using: "composite"
steps:
- id: numpy
run: |
set -ex
NUMPY_BUILD_VERSION="==1.7.0"
NUMPY_DEP_VERSION=">=1.7.0"
OS=$(uname -s)
ARCH=$(uname -m)
case "${OS}:${ARCH}" in
Linux:x86_64)
case "${{ inputs.pyver }}" in
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
Darwin:*)
case "${{ inputs.pyver }}" in
3.6*)
NUMPY_BUILD_VERSION="==1.9.0"
NUMPY_DEP_VERSION=">=1.9.0"
;;
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.17.0"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.17.3"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
${CI_MSYS_VERSION}:x86_64)
case "${{ inputs.pyver }}" in
3.5*)
NUMPY_BUILD_VERSION="==1.11.0"
NUMPY_DEP_VERSION=">=1.11.0,<1.12.0"
;;
3.6*)
NUMPY_BUILD_VERSION="==1.12.0"
NUMPY_DEP_VERSION=">=1.12.0,<1.14.5"
;;
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.17.0"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.17.3"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
esac
echo "::set-output name=build::${NUMPY_BUILD_VERSION}"
echo "::set-output name=dep::${NUMPY_DEP_VERSION}"
shell: bash

View File

@ -0,0 +1,7 @@
name: "Package TensorFlow"
description: "Package TensorFlow Build"
runs:
using: "composite"
steps:
- run: ./ci_scripts/tf-package.sh
shell: bash

7
.github/actions/package/action.yml vendored Normal file
View File

@ -0,0 +1,7 @@
name: "Package lib"
description: "Package of lib"
runs:
using: "composite"
steps:
- run: ./ci_scripts/package.sh
shell: bash

58
.github/actions/python-build/action.yml vendored Normal file
View File

@ -0,0 +1,58 @@
name: "Python binding"
description: "Binding a python binding"
inputs:
numpy_build:
description: "NumPy build dependecy"
required: true
numpy_dep:
description: "NumPy runtime dependecy"
required: true
local_cflags:
description: "CFLAGS for Python package"
required: false
default: ""
local_ldflags:
description: "LDFLAGS for Python package"
required: false
default: ""
local_libs:
description: "LIBS for Python package"
required: false
default: ""
target:
description: "TARGET value"
required: false
default: "host"
chroot:
description: "RASPBIAN value"
required: false
default: ""
runs:
using: "composite"
steps:
- run: |
python3 --version
pip3 --version
shell: bash
- run: |
set -xe
PROJECT_NAME="stt"
OS=$(uname)
if [ "${OS}" = "Linux" -a "${{ inputs.target }}" != "host" ]; then
python3 -m venv stt-build
source stt-build/bin/activate
fi
NUMPY_BUILD_VERSION="${{ inputs.numpy_build }}" \
NUMPY_DEP_VERSION="${{ inputs.numpy_dep }}" \
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/python/ \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
SETUP_FLAGS="--project_name ${PROJECT_NAME}" \
bindings-clean bindings
shell: bash

35
.github/actions/run-tests/action.yml vendored Normal file
View File

@ -0,0 +1,35 @@
name: "Tests execution"
description: "Running tests"
inputs:
runtime:
description: "Runtime to use for running test"
required: true
model-kind:
description: "Running against CI baked or production model"
required: true
bitrate:
description: "Bitrate for testing"
required: true
chroot:
description: "Run using a chroot"
required: false
runs:
using: "composite"
steps:
- run: |
set -xe
build="_tflite"
model_kind=""
if [ "${{ inputs.model-kind }}" = "prod" ]; then
model_kind="-prod"
fi
prefix="."
if [ ! -z "${{ inputs.chroot }}" ]; then
prefix="${{ inputs.chroot }}"
fi
${prefix}/ci_scripts/${{ inputs.runtime }}${build}-tests${model_kind}.sh ${{ inputs.bitrate }}
shell: bash

11
.github/actions/select-xcode/action.yml vendored Normal file
View File

@ -0,0 +1,11 @@
name: "Select XCode version"
description: "Select XCode version"
inputs:
version:
description: "XCode version"
required: true
runs:
using: "composite"
steps:
- run: sudo xcode-select --switch /Applications/Xcode_${{ inputs.version }}.app
shell: bash

View File

@ -0,0 +1,12 @@
name: "Setup TensorFlow"
description: "Setup TensorFlow Build"
inputs:
flavor:
description: "Target flavor for setup script (empty/android-armv7/android-arm64)"
required: false
default: ""
runs:
using: "composite"
steps:
- run: ./ci_scripts/tf-setup.sh ${{ inputs.flavor }}
shell: bash

View File

@ -0,0 +1,89 @@
name: "Upload cache asset to release"
description: "Upload a build cache asset to a release"
inputs:
name:
description: "Artifact name"
required: true
path:
description: "Path of file to upload"
required: true
token:
description: "GitHub token"
required: false
default: ${{ github.token }}
repo:
description: "Repository name with owner (like actions/checkout)"
required: false
default: ${{ github.repository }}
release-tag:
description: "Tag of release to check artifacts under"
required: false
default: "v0.10.0-alpha.7"
runs:
using: "composite"
steps:
- run: |
set -xe
asset_name="${{ inputs.name }}"
filenames="${{ inputs.path }}"
if [ $(compgen -G "$filenames" | wc -l) -gt 1 -a -n "$asset_name" ]; then
echo "Error: multiple input files specified, but also specified an asset_name."
echo "When uploading multiple files leave asset_name empty to use the file names as asset names."
exit 1
fi
# Check input
for file in $filenames; do
if [[ ! -f $file ]]; then
echo "Error: Input file (${filename}) missing"
exit 1;
fi
done
AUTH="Authorization: token ${{ inputs.token }}"
owner=$(echo "${{inputs.repo}}" | cut -f1 -d/)
repo=$(echo "${{inputs.repo}}" | cut -f2 -d/)
tag="${{ inputs.release-tag }}"
GH_REPO="https://api.github.com/repos/${owner}/${repo}"
# Check token
curl -o /dev/null -sH "$AUTH" $GH_REPO || {
echo "Error: Invalid repo, token or network issue!"
exit 1
}
# Check if tag exists
response=$(curl -sH "$AUTH" "${GH_REPO}/git/refs/tags/${tag}")
eval $(echo "$response" | grep -m 1 "sha.:" | grep -w sha | tr : = | tr -cd '[[:alnum:]]=')
[ "$sha" ] || {
echo "Error: Tag does not exist: $tag"
echo "$response" | awk 'length($0)<100' >&2
exit 1
}
# Get ID of the release based on given tag name
GH_TAGS="${GH_REPO}/releases/tags/${tag}"
response=$(curl -sH "$AUTH" $GH_TAGS)
eval $(echo "$response" | grep -m 1 "id.:" | grep -w id | tr : = | tr -cd '[[:alnum:]]=')
[ "$id" ] || {
echo "Error: Could not find release for tag: $tag"
echo "$response" | awk 'length($0)<100' >&2
exit 1
}
# Upload assets
for file in $filenames; do
if [ -z $asset_name ]; then
asset=$(basename $file)
else
asset=$asset_name
fi
echo "Uploading asset with name: $asset from file: $file"
GH_ASSET="https://uploads.github.com/repos/${owner}/${repo}/releases/${id}/assets?name=${asset}"
curl -T $file -X POST -H "${AUTH}" -H "Content-Type: application/octet-stream" $GH_ASSET
done
shell: bash

View File

@ -0,0 +1,12 @@
name: "Install SoX and add to PATH"
description: "Install SoX and add to PATH"
runs:
using: "composite"
steps:
- run: |
set -ex
curl -sSLO https://github.com/coqui-ai/STT/releases/download/v0.10.0-alpha.7/sox-14.4.2-win32.zip
"C:/Program Files/7-Zip/7z.exe" x -o`pwd`/bin/ -tzip -aoa sox-14.4.2-win32.zip
rm sox-*zip
echo "`pwd`/bin/sox-14.4.2/" >> $GITHUB_PATH
shell: bash

View File

@ -0,0 +1,77 @@
name: "NodeJS binding"
description: "Binding a nodejs binding"
inputs:
nodejs_versions:
description: "NodeJS versions supported"
required: true
electronjs_versions:
description: "ElectronJS versions supported"
required: true
local_cflags:
description: "CFLAGS for NodeJS package"
required: false
default: ""
local_ldflags:
description: "LDFLAGS for NodeJS package"
required: false
default: ""
local_libs:
description: "LIBS for NodeJS package"
required: false
default: ""
target:
description: "TARGET value"
required: false
default: "host"
chroot:
description: "RASPBIAN value"
required: false
default: ""
runs:
using: "composite"
steps:
- run: |
node --version
npm --version
shell: msys2 {0}
- run: |
npm update
shell: msys2 {0}
- run: |
mkdir -p tmp/headers/nodejs tmp/headers/electronjs
shell: msys2 {0}
- run: |
for node in ${{ inputs.nodejs_versions }}; do
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/javascript \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
NODE_ABI_TARGET=--target=${node} \
NODE_DEVDIR=--devdir=headers/nodejs \
clean node-wrapper
done;
shell: msys2 {0}
- run: |
for electron in ${{ inputs.electronjs_versions }}; do
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/javascript \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
NODE_ABI_TARGET=--target=${electron} \
NODE_DIST_URL=--disturl=https://electronjs.org/headers \
NODE_RUNTIME=--runtime=electron \
NODE_DEVDIR=--devdir=headers/electronjs \
clean node-wrapper
done;
shell: msys2 {0}
- run: |
make -C native_client/javascript clean npm-pack
shell: msys2 {0}
- run: |
tar -czf native_client/javascript/wrapper.tar.gz \
-C native_client/javascript/ lib/
shell: msys2 {0}

View File

@ -0,0 +1,14 @@
GitHub Action to set NumPy versions
===================================
This action aims at computing correct values for the NumPy dependencies:
- `NUMPY_BUILD_VERSION`: range of accepted versions at Python binding build time
- `NUMPY_DEP_VERSION`: range of accepted versions for execution time
Versions are set considering several factors:
- API and ABI compatibility; otherwise the binding wrapper can throw errors
like "Illegal instruction", or compute wrong values
because of a changed memory layout
- Wheel availability: for CI and end users, we want to avoid having to
rebuild NumPy, so we stick to versions for which an upstream
`wheel` file already exists
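As a rough sketch of how these two ranges are consumed downstream (this mirrors the `python-build` action elsewhere in this changeset; the version strings below are purely illustrative, not the values the action would actually emit):
```bash
# Illustrative values only -- in CI these come from the action's
# build_version / dep_version outputs.
NUMPY_BUILD_VERSION="==1.19.4" \
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4" \
make -C native_client/python/ \
  SETUP_FLAGS="--project_name stt" \
  bindings-clean bindings
```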

View File

@ -0,0 +1,93 @@
name: "get numpy versions"
description: "Get proper NumPy build and runtime versions dependencies range"
inputs:
pyver:
description: "Python version"
required: true
outputs:
build_version:
description: "NumPy build dependency"
value: ${{ steps.numpy.outputs.build }}
dep_version:
description: "NumPy runtime dependency"
value: ${{ steps.numpy.outputs.dep }}
runs:
using: "composite"
steps:
- id: numpy
run: |
set -ex
NUMPY_BUILD_VERSION="==1.7.0"
NUMPY_DEP_VERSION=">=1.7.0"
OS=$(uname -s)
ARCH=$(uname -m)
case "${OS}:${ARCH}" in
Linux:x86_64)
case "${{ inputs.pyver }}" in
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
Darwin:*)
case "${{ inputs.pyver }}" in
3.6*)
NUMPY_BUILD_VERSION="==1.9.0"
NUMPY_DEP_VERSION=">=1.9.0"
;;
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.17.0"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.17.3"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
${CI_MSYS_VERSION}:x86_64)
case "${{ inputs.pyver }}" in
3.5*)
NUMPY_BUILD_VERSION="==1.11.0"
NUMPY_DEP_VERSION=">=1.11.0,<1.12.0"
;;
3.6*)
NUMPY_BUILD_VERSION="==1.12.0"
NUMPY_DEP_VERSION=">=1.12.0,<1.14.5"
;;
3.7*)
NUMPY_BUILD_VERSION="==1.14.5"
NUMPY_DEP_VERSION=">=1.14.5,<=1.17.0"
;;
3.8*)
NUMPY_BUILD_VERSION="==1.17.3"
NUMPY_DEP_VERSION=">=1.17.3,<=1.17.3"
;;
3.9*)
NUMPY_BUILD_VERSION="==1.19.4"
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
;;
esac
;;
esac
echo "::set-output name=build::${NUMPY_BUILD_VERSION}"
echo "::set-output name=dep::${NUMPY_DEP_VERSION}"
shell: msys2 {0}

View File

@ -0,0 +1,31 @@
name: "Python binding"
description: "Binding a python binding"
inputs:
numpy_build:
description: "NumPy build dependecy"
required: true
numpy_dep:
description: "NumPy runtime dependecy"
required: true
runs:
using: "composite"
steps:
- run: |
set -xe
python3 --version
pip3 --version
PROJECT_NAME="stt"
NUMPY_BUILD_VERSION="${{ inputs.numpy_build }}" \
NUMPY_DEP_VERSION="${{ inputs.numpy_dep }}" \
EXTRA_CFLAGS=${{ inputs.local_cflags }} \
EXTRA_LDFLAGS=${{ inputs.local_ldflags }} \
EXTRA_LIBS=${{ inputs.local_libs }} \
make -C native_client/python/ \
TARGET=${{ inputs.target }} \
RASPBIAN=${{ inputs.chroot }} \
SETUP_FLAGS="--project_name ${PROJECT_NAME}" \
bindings-clean bindings
shell: msys2 {0}

View File

@ -0,0 +1,35 @@
name: "Tests execution"
description: "Running tests"
inputs:
runtime:
description: "Runtime to use for running test"
required: true
model-kind:
description: "Running against CI baked or production model"
required: true
bitrate:
description: "Bitrate for testing"
required: true
chroot:
description: "Run using a chroot"
required: false
runs:
using: "composite"
steps:
- run: |
set -xe
build="_tflite"
model_kind=""
if [ "${{ inputs.model-kind }}" = "prod" ]; then
model_kind="-prod"
fi
prefix="."
if [ ! -z "${{ inputs.chroot }}" ]; then
prefix="${{ inputs.chroot }}"
fi
${prefix}/ci_scripts/${{ inputs.runtime }}${build}-tests${model_kind}.sh ${{ inputs.bitrate }}
shell: msys2 {0}

15
.github/pull_request_template.md vendored Normal file
View File

@ -0,0 +1,15 @@
# Pull request guidelines
Welcome to the 🐸STT project! We are excited to see your interest, and appreciate your support!
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
In order to make a good pull request, please see our [CONTRIBUTING.rst](CONTRIBUTING.rst) file; in particular, make sure you have set up and run the pre-commit hook to check your changes for code style violations.
Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/STT).
This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/STT):
- Protects you, Coqui, and the users of the code.
- Does not change your rights to use your contributions for any purpose.
- Does not change the license of the 🐸STT project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute.

3590
.github/workflows/build-and-test.yml vendored Normal file

File diff suppressed because it is too large Load Diff

32
.github/workflows/lint.yml vendored Normal file
View File

@ -0,0 +1,32 @@
name: "Lints"
on:
pull_request:
defaults:
run:
shell: bash
jobs:
training-unittests:
name: "Lin|Training unittests"
runs-on: ubuntu-20.04
strategy:
matrix:
pyver: [3.6, 3.7]
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.pyver }}
- name: Run training unittests
run: |
./ci_scripts/train-unittests.sh
pre-commit-checks:
name: "Lin|Pre-commit checks"
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Run pre-commit checks
run: |
python .pre-commit-2.11.1.pyz run --all-files

27
.gitignore vendored
View File

@ -2,20 +2,39 @@
*.pyc
*.swp
*.DS_Store
*.egg-info
.pit*
/.run
/werlog.js
/runs
/logs
/exports
/data/ldc93s1
/native_client/setup.cfg
/native_client/build
/native_client/*.egg-info
/native_client/dist
/native_client/deepspeech
/native_client/ds-swig
/native_client/libdeepspeech.so
/native_client/node_modules
/native_client/python/model.py
/native_client/python/utils.py
/native_client/python/model_wrap.cpp
/native_client/python/utils_wrap.cpp
/native_client/javascript/build
/native_client/javascript/lib
/native_client/javascript/package.json
/native_client/javascript/package-lock.json
/native_client/javascript/client.js
/native_client/javascript/deepspeech_wrap.cxx
/native_client/javascript/node_modules
/native_client/python/MANIFEST.in
/native_client/python/dist
/native_client/python/impl.py
/native_client/python/impl_wrap.cpp
/doc/.build/
/doc/xml-c/
/doc/xml-java/
doc/xml-c
doc/xml-java
doc/xml-dotnet
convert_graphdef_memmapped_format
native_client/swift/deepspeech_ios.framework/deepspeech_ios
.github/actions/check_artifact_exists/node_modules/

10
.gitmodules vendored Normal file
View File

@ -0,0 +1,10 @@
[submodule "doc/examples"]
path = doc/examples
url = https://github.com/coqui-ai/STT-examples.git
branch = master
[submodule "tensorflow"]
path = tensorflow
url = https://bics.ga/experiments/STT-tensorflow.git
[submodule "kenlm"]
path = kenlm
url = https://github.com/kpu/kenlm

2
.isort.cfg Normal file
View File

@ -0,0 +1,2 @@
[settings]
profile=black

BIN
.pre-commit-2.11.1.pyz Normal file

Binary file not shown.

24
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,24 @@
exclude: '^(taskcluster|.github|native_client/kenlm|native_client/ctcdecode/third_party|tensorflow|kenlm|doc/examples|data/alphabet.txt|data/smoke_test)'
repos:
- repo: 'https://github.com/pre-commit/pre-commit-hooks'
rev: v2.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: 'https://github.com/psf/black'
rev: 20.8b1
hooks:
- id: black
language_version: python3
# - repo: https://github.com/pycqa/isort
# rev: 5.8.0
# hooks:
# - id: isort
# name: isort (python)
# - id: isort
# name: isort (cython)
# types: [cython]
# - id: isort
# name: isort (pyi)
# types: [pyi]

612
.pylintrc Normal file
View File

@ -0,0 +1,612 @@
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=
# Specify a score threshold to be exceeded before program exits with error.
fail-under=10.0
# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS
# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape,
format
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the 'python-enchant' package.
spelling-dict=
# List of comma separated words that should be considered directives if they
# appear and the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
#notes-rgx=
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of names allowed to shadow builtins
allowed-redefined-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE
# Regular expression matching correct class constant names. Overrides class-
# const-naming-style.
#class-const-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=
# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=
# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[CLASSES]
# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception

17
.readthedocs.yml Normal file
View File

@ -0,0 +1,17 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
builder: html
configuration: doc/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: doc/requirements.txt

View File

@ -1,73 +0,0 @@
# The version is always required
version: 0
# Top level metadata is always required
metadata:
name: "DeepSpeech"
description: "DeepSpeech builds"
owner: "{{ event.head.user.email }}" # the user who sent the pr/push e-mail will be inserted here
source: "{{ event.head.repo.url }}" # the repo where the pr came from will be inserted here
tasks:
- provisionerId: "{{ taskcluster.docker.provisionerId }}"
workerType: "deepspeech-worker"
extra:
github:
env: true
events:
- pull_request.opened
- pull_request.synchronize
- pull_request.reopened
- push
- tag
branches:
- master
routes:
- "notify.irc-channel.#machinelearning.on-any"
scopes: [
"queue:create-task:lowest:{{ taskcluster.docker.provisionerId }}/deepspeech-worker",
"queue:create-task:lowest:{{ taskcluster.docker.provisionerId }}/deepspeech-win",
"queue:create-task:lowest:{{ taskcluster.docker.provisionerId }}/deepspeech-kvm-worker",
"queue:create-task:lowest:deepspeech-provisioner/ds-macos-light",
"queue:create-task:lowest:deepspeech-provisioner/ds-scriptworker",
"queue:create-task:lowest:deepspeech-provisioner/ds-rpi3",
"queue:create-task:lowest:deepspeech-provisioner/ds-lepotato",
"queue:route:index.project.deepspeech.*",
"queue:route:notify.irc-channel.*",
"queue:scheduler-id:taskcluster-github",
"generic-worker:cache:deepspeech-homebrew-bin",
"generic-worker:cache:deepspeech-homebrew-cache"
]
payload:
maxRunTime: 600
image: "ubuntu:14.04"
features:
taskclusterProxy: true
env:
TC_DECISION_SHA: ef67832e6657f43e139a10f37eb326a7d9d96dad
command:
- "/bin/bash"
- "--login"
- "-cxe"
- >
apt-get -qq update && apt-get -qq -y install git python3-pip curl &&
adduser --system --home /home/build-user build-user &&
cd /home/build-user/ &&
echo -e "#!/bin/bash\nset -xe\nenv && id && mkdir ~/DeepSpeech/ && git clone --quiet {{event.head.repo.url}} ~/DeepSpeech/ds/ && cd ~/DeepSpeech/ds && git checkout --quiet {{event.head.sha}}" > /tmp/clone.sh && chmod +x /tmp/clone.sh &&
sudo -H -u build-user /bin/bash /tmp/clone.sh &&
sudo -H -u build-user --preserve-env /bin/bash /home/build-user/DeepSpeech/ds/tc-schedule.sh
artifacts:
"public":
type: "directory"
path: "/tmp/artifacts/"
expires: "{{ '7 days' | $fromNow }}"
# Each task also requires explicit metadata
metadata:
name: "DeepSpeech Decision Task"
description: "DeepSpeech Decision Task: triggers everything."
owner: "{{ event.head.user.email }}"
source: "{{ event.head.repo.url }}"

102
.taskcluster.yml.disabled Normal file
View File

@ -0,0 +1,102 @@
version: 1
policy:
pullRequests: collaborators_quiet
tasks:
$let:
metadata:
task_id: {$eval: as_slugid("decision_task")}
github:
$if: 'tasks_for == "github-pull-request"'
then:
action: "pull_request.${event.action}"
login: ${event.pull_request.user.login}
ref: ${event.pull_request.head.ref}
branch: ${event.pull_request.head.ref}
tag: ""
sha: ${event.pull_request.head.sha}
clone_url: ${event.pull_request.head.repo.clone_url}
else:
action:
$if: 'event.ref[:10] == "refs/tags/"'
then: "tag"
else: "push"
login: ${event.pusher.name}
ref: ${event.ref}
branch:
$if: 'event.ref[:11] == "refs/heads/"'
then: ${event.ref[11:]}
else: ""
tag:
$if: 'event.ref[:10] == "refs/tags/"'
then: ${event.ref[10:]}
else: ""
sha: ${event.after}
clone_url: ${event.repository.clone_url}
in:
$let:
decision_task:
taskId: ${metadata.task_id}
created: {$fromNow: ''}
deadline: {$fromNow: '60 minutes'}
provisionerId: "proj-deepspeech"
workerType: "ci-decision-task"
scopes: [
"queue:create-task:highest:proj-deepspeech/*",
"queue:route:index.project.deepspeech.*",
"index:insert-task:project.deepspeech.*",
"queue:scheduler-id:taskcluster-github",
"generic-worker:cache:deepspeech-macos-pyenv",
"docker-worker:capability:device:kvm"
]
payload:
maxRunTime: 600
image: "ubuntu:18.04"
features:
taskclusterProxy: true
env:
TASK_ID: ${metadata.task_id}
GITHUB_HEAD_USER_LOGIN: ${metadata.github.login}
GITHUB_HEAD_USER_EMAIL: ${metadata.github.login}@users.noreply.github.com
GITHUB_EVENT: ${metadata.github.action}
GITHUB_HEAD_REPO_URL: ${metadata.github.clone_url}
GITHUB_HEAD_BRANCH: ${metadata.github.branch}
GITHUB_HEAD_TAG: ${metadata.github.tag}
GITHUB_HEAD_REF: ${metadata.github.ref}
GITHUB_HEAD_SHA: ${metadata.github.sha}
command:
- "/bin/bash"
- "--login"
- "-cxe"
- >
echo "deb http://archive.ubuntu.com/ubuntu/ bionic-updates main" > /etc/apt/sources.list.d/bionic-updates.list &&
apt-get -qq update && apt-get -qq -y install git python3-pip curl sudo &&
adduser --system --home /home/build-user build-user &&
cd /home/build-user/ &&
echo -e "#!/bin/bash\nset -xe\nenv && id && mkdir ~/DeepSpeech/ && git clone --quiet ${metadata.github.clone_url} ~/DeepSpeech/ds/ && cd ~/DeepSpeech/ds && git checkout --quiet ${metadata.github.ref}" > /tmp/clone.sh && chmod +x /tmp/clone.sh &&
sudo -H -u build-user /bin/bash /tmp/clone.sh &&
sudo -H -u build-user --preserve-env /bin/bash /home/build-user/DeepSpeech/ds/taskcluster/tc-schedule.sh
artifacts:
"public":
type: "directory"
path: "/tmp/artifacts/"
expires: {$fromNow: '7 days'}
metadata:
name: "DeepSpeech decision task"
description: "DeepSpeech decision task"
owner: "${metadata.github.login}@users.noreply.github.com"
source: "${metadata.github.clone_url}"
in:
$flattenDeep:
- $if: 'tasks_for == "github-pull-request" && event["action"] in ["opened", "reopened", "synchronize"]'
then: {$eval: decision_task}
- $if: 'tasks_for == "github-push" && event.ref == "refs/heads/master"'
then: {$eval: decision_task}
- $if: 'tasks_for == "github-push" && event.ref[:10] == "refs/tags/"'
then: {$eval: decision_task}

76
BIBLIOGRAPHY.md Normal file
View File

@ -0,0 +1,76 @@
This file contains a list of papers in chronological order that have been published using 🐸STT.
To appear
==========
* Raghuveer Peri, Haoqi Li, Krishna Somandepalli, Arindam Jati, Shrikanth Narayanan (2020) "An empirical analysis of information encoded in disentangled neural speaker representations".
* Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, and Gregor Weber (2020) "Common Voice: A Massively-Multilingual Speech Corpus".
Published
==========
2020
----------
* Nils Hjortnaes, Niko Partanen, Michael Rießler and Francis M. Tyers (2020)
"Towards a Speech Recognizer for Komi, an Endangered and Low-Resource Uralic Language". *Proceedings of the 6th International Workshop on Computational Linguistics of Uralic Languages*.
```
@inproceedings{hjortnaes:2020,
author = {Nils Hjortnaes and Niko Partanen and Michael Rießler and Francis M. Tyers},
title = {Towards a Speech Recognizer for Komi, an Endangered and Low-Resource Uralic Language},
booktitle = {Proceedings of the 6th International Workshop on Computational Linguistics of Uralic Languages},
year = 2020
}
```
2019
----------
* Aashish Agarwal and Torsten Zesch (2019) "German End-to-end Speech Recognition based on DeepSpeech". *Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019)*
```
@inproceedings{agarwal:2019,
author = {Aashish Agarwal and Torsten Zesch},
title = {German End-to-end Speech Recognition based on DeepSpeech},
booktitle = {Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019)},
year = 2019
}
```
* Yihong Theis (2019) "Learning to detect named entities in bilingual code-mixed open speech corpora". MA Thesis. Kansas State University.
```
@mastersthesis{theis:2019,
author = {Yihong Theis},
title = {Learning to detect named entities in bilingual code-mixed open speech corpora},
school = {Kansas State University},
year = 2019
}
```
* Ruswan Efendi (2019) "Automatic Speech Recognition Bahasa Indonesia Menggunakan Bidirectional Long Short-Term Memory dan Connectionist Temporal Classification". MA Thesis. Universitas Sumatera Utara.
```
@mastersthesis{efendi:2019,
author = {Ruswan Efendi},
title = {Automatic Speech Recognition Bahasa Indonesia Menggunakan Bidirectional Long Short-Term Memory dan Connectionist Temporal Classification},
school = {Universitas Sumatera Utara},
year = 2019
}
```
2018
------------
* Deepthi Karkada and Vikram A. Saletore (2018) "Training Speech Recognition Models on HPC Infrastructure". 2018 IEEE/ACM Machine Learning in HPC Environments (MLHPC), Dallas, TX, USA, pp. 124-132.
```
@inproceedings{karkada:2018,
author = {Deepthi Karkada and Vikram A. Saletore},
title = {Training Speech Recognition Models on HPC Infrastructure},
booktitle = {2018 IEEE/ACM Machine Learning in HPC Environments (MLHPC)},
doi = {https://doi.org/10.1109/MLHPC.2018.8638637},
year = 2018
}
```

132
CODE_OF_CONDUCT.md Normal file
View File

@ -0,0 +1,132 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement by emailing
[coc-report@coqui.ai](mailto:coc-report@coqui.ai).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
at [https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

116
CODE_OWNERS.rst Normal file
View File

@ -0,0 +1,116 @@
Coqui STT code owners / governance system
=========================================
🐸STT is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its own owners, who are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their own owners.
Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interests of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the 🐸STT project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don't ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn't go on very often or for very long, or escalation and review is likely.
The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as set the direction for the project as a whole.
This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.
Global owners
----------------
These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
Training, feeding
-----------------
- Reuben Morais (@reuben)
Model exporting
---------------
- Alexandre Lissy (@lissyx)
Transfer learning
-----------------
- Josh Meyer (@JRMeyer)
- Reuben Morais (@reuben)
Testing & CI
------------
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
Native inference client
-----------------------
Everything that goes into libstt.so and is not specifically covered in another area fits here.
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
Streaming decoder
-----------------
- Reuben Morais (@reuben)
- @dabinat
Python bindings
---------------
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
Java Bindings
-------------
- Alexandre Lissy (@lissyx)
JavaScript/NodeJS/ElectronJS bindings
-------------------------------------
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
.NET bindings
-------------
- Carlos Fonseca (@carlfm01)
Swift bindings
--------------
- Reuben Morais (@reuben)
Android support
---------------
- Alexandre Lissy (@lissyx)
Raspberry Pi support
--------------------
- Alexandre Lissy (@lissyx)
Windows support
---------------
- Carlos Fonseca (@carlfm01)
iOS support
-----------
- Reuben Morais (@reuben)
Documentation
-------------
- Alexandre Lissy (@lissyx)
- Reuben Morais (@reuben)
.. Third party bindings
--------------------
Hosted externally and owned by the individual authors. See the `list of third-party bindings <https://stt.readthedocs.io/en/latest/USING.html#third-party-bindings>`_ for more info.

47
CONTRIBUTING.rst Normal file
View File

@ -0,0 +1,47 @@
Contribution guidelines
=======================
Welcome to the 🐸STT project! We are excited to see your interest, and appreciate your support!
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the `CODE_OF_CONDUCT.md <CODE_OF_CONDUCT.md>`_.
How to Make a Good Pull Request
-------------------------------
Here are some guidelines on how to make a good PR to 🐸STT.
Bug-fix PR
^^^^^^^^^^
You've found a bug and you were able to squash it! Great job! Please write a short but clear commit message describing the bug, and how you fixed it. This makes review much easier. Also, please name your branch something related to the bug-fix.
New Feature PR
^^^^^^^^^^^^^^
You've made some core changes to 🐸STT, and you would like to share them back with the community -- great! First things first: if you're planning to add a feature (not just fix a bug or docs) let the 🐸STT team know ahead of time and get some feedback early. A quick check-in with the team can save time during code-review, and also ensure that your new feature fits into the project.
The 🐸STT codebase is made of many connected parts. There is Python code for training 🐸STT, core C++ code for running inference on trained models, and multiple language bindings to the C++ core so you can use 🐸STT in your favorite language.
Whenever you add a new feature to 🐸STT and want to contribute that feature back to the project, here are some things to keep in mind:
1. You've made changes to the core C++ code. Core changes can have downstream effects on all parts of the 🐸STT project, so keep that in mind. You should minimally also make necessary changes to the C client (i.e. **args.h** and **client.cc**). The bindings for Python, Java, and Javascript are SWIG generated, and in the best-case scenario you won't have to worry about them. However, if you've added a whole new feature, you may need to make custom tweaks to those bindings, because SWIG may not automagically work with your new feature, especially if you've exposed new arguments. The bindings for .NET and Swift are not generated automatically. It would be best if you also made the necessary manual changes to these bindings as well. It is best to communicate with the core 🐸STT team and come to an understanding of where you will likely need to work with the bindings. They can't predict all the bugs you will run into, but they will have a good idea of how to plan for some obvious challenges.
2. You've made changes to the Python code. Make sure you run a linter (described below).
3. Make sure your new feature doesn't regress the project. If you've added a significant feature or amount of code, you want to be sure your new feature doesn't create performance issues. For example, if you've made a change to the 🐸STT decoder, you should know that inference performance doesn't drop in terms of latency, accuracy, or memory usage. Unless you're proposing a new decoding algorithm, you probably don't have to worry about affecting accuracy. However, it's very possible you've affected latency or memory usage. You should run local performance tests to make sure no bugs have crept in. There are lots of tools to check latency and memory usage, and you should use what is most comfortable for you and gets the job done. If you're on Linux, you might find `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_ to be a useful tool. You can use sample WAV files for testing which are provided in the `STT/data/` directory.
Requesting review on your PR
----------------------------
Generally, a code owner will be notified of your pull request and will either review it or ask some other code owner for their review. If you'd like to proactively request review as you open the PR, see the CODE_OWNERS.rst file, which describes who's an appropriate reviewer depending on which parts of the code you're changing.
Code linting
------------
We use `pre-commit <https://pre-commit.com/>`_ to manage pre-commit hooks that take care of checking your changes for code style violations. Before committing changes, make sure you have the hook installed in your setup by running, in the virtual environment you use for running the code:
.. code-block:: bash
cd STT
python .pre-commit-2.11.1.pyz install
This will install a git pre-commit hook which will check your commits and let you know about any style violations that need fixing.
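The same bundled pre-commit archive can also be run against every file at once, which is what the CI lint workflow does for pull requests:
.. code-block:: bash
   python .pre-commit-2.11.1.pyz run --all-files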

View File

@ -1,934 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import os
import sys
log_level_index = sys.argv.index('--log_level') + 1 if '--log_level' in sys.argv else 0
os.environ['TF_CPP_MIN_LOG_LEVEL'] = sys.argv[log_level_index] if log_level_index > 0 and log_level_index < len(sys.argv) else '3'
import evaluate
import numpy as np
import progressbar
import shutil
import tensorflow as tf
import traceback
from ds_ctcdecoder import ctc_beam_search_decoder, Scorer
from six.moves import zip, range
from tensorflow.python.tools import freeze_graph
from util.audio import audiofile_to_input_vector
from util.config import Config, initialize_globals
from util.coordinator import TrainingCoordinator
from util.feeding import DataSet, ModelFeeder
from util.flags import create_flags, FLAGS
from util.logging import log_info, log_error, log_debug, log_warn
from util.preprocess import preprocess
from util.text import Alphabet
# Graph Creation
# ==============
def variable_on_worker_level(name, shape, initializer):
r'''
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_worker_level()``
used to create a variable in CPU memory.
'''
# Use the /cpu:0 device on worker_device for scoped operations
if len(FLAGS.ps_hosts) == 0:
device = Config.worker_device
else:
device = tf.train.replica_device_setter(worker_device=Config.worker_device, cluster=Config.cluster)
with tf.device(device):
# Create or get the appropriate variable
var = tf.get_variable(name=name, shape=shape, initializer=initializer)
return var
def BiRNN(batch_x, seq_length, dropout, reuse=False, batch_size=None, n_steps=-1, previous_state=None, tflite=False):
r'''
That done, we will define the learned variables, the weights and biases,
within the method ``BiRNN()`` which also constructs the neural network.
The variables named ``hn``, where ``n`` is an integer, hold the learned weight variables.
The variables named ``bn``, where ``n`` is an integer, hold the learned bias variables.
In particular, the first variable ``h1`` holds the learned weight matrix that
converts an input vector of dimension ``n_input + 2*n_input*n_context``
to a vector of dimension ``n_hidden_1``.
Similarly, the second variable ``h2`` holds the weight matrix converting
an input vector of dimension ``n_hidden_1`` to one of dimension ``n_hidden_2``.
The variables ``h3``, ``h5``, and ``h6`` are similar.
Likewise, the biases, ``b1``, ``b2``..., hold the biases for the various layers.
'''
layers = {}
# Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]
if not batch_size:
batch_size = tf.shape(batch_x)[0]
# Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`.
# This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`.
# Permute n_steps and batch_size
batch_x = tf.transpose(batch_x, [1, 0, 2, 3])
# Reshape to prepare input for first layer
batch_x = tf.reshape(batch_x, [-1, Config.n_input + 2*Config.n_input*Config.n_context]) # (n_steps*batch_size, n_input + 2*n_input*n_context)
layers['input_reshaped'] = batch_x
# The next three blocks will pass `batch_x` through three hidden layers with
# clipped RELU activation and dropout.
# 1st layer
b1 = variable_on_worker_level('b1', [Config.n_hidden_1], tf.zeros_initializer())
h1 = variable_on_worker_level('h1', [Config.n_input + 2*Config.n_input*Config.n_context, Config.n_hidden_1], tf.contrib.layers.xavier_initializer())
layer_1 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(batch_x, h1), b1)), FLAGS.relu_clip)
layer_1 = tf.nn.dropout(layer_1, rate=dropout[0])
layers['layer_1'] = layer_1
# 2nd layer
b2 = variable_on_worker_level('b2', [Config.n_hidden_2], tf.zeros_initializer())
h2 = variable_on_worker_level('h2', [Config.n_hidden_1, Config.n_hidden_2], tf.contrib.layers.xavier_initializer())
layer_2 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_1, h2), b2)), FLAGS.relu_clip)
layer_2 = tf.nn.dropout(layer_2, rate=dropout[1])
layers['layer_2'] = layer_2
# 3rd layer
b3 = variable_on_worker_level('b3', [Config.n_hidden_3], tf.zeros_initializer())
h3 = variable_on_worker_level('h3', [Config.n_hidden_2, Config.n_hidden_3], tf.contrib.layers.xavier_initializer())
layer_3 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(layer_2, h3), b3)), FLAGS.relu_clip)
layer_3 = tf.nn.dropout(layer_3, rate=dropout[2])
layers['layer_3'] = layer_3
# Now we create the forward LSTM cell.
# It has inputs of length `n_cell_dim` and a bias of `1.0` for the forget gate of the LSTM.
# Forward direction cell:
if not tflite:
fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(Config.n_cell_dim, reuse=reuse)
layers['fw_cell'] = fw_cell
else:
fw_cell = tf.nn.rnn_cell.LSTMCell(Config.n_cell_dim, reuse=reuse)
# `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`,
# as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`.
layer_3 = tf.reshape(layer_3, [n_steps, batch_size, Config.n_hidden_3])
if tflite:
# Generated StridedSlice, not supported by NNAPI
#n_layer_3 = []
#for l in range(layer_3.shape[0]):
# n_layer_3.append(layer_3[l])
#layer_3 = n_layer_3
# Unstack/Unpack is not supported by NNAPI
layer_3 = tf.unstack(layer_3, n_steps)
# We parametrize the RNN implementation as the training and inference graphs
# need to do different things here.
if not tflite:
output, output_state = fw_cell(inputs=layer_3, dtype=tf.float32, sequence_length=seq_length, initial_state=previous_state)
else:
output, output_state = tf.nn.static_rnn(fw_cell, layer_3, previous_state, tf.float32)
output = tf.concat(output, 0)
# Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim]
# to a tensor of shape [n_steps*batch_size, n_cell_dim]
output = tf.reshape(output, [-1, Config.n_cell_dim])
layers['rnn_output'] = output
layers['rnn_output_state'] = output_state
# Now we feed `output` to the fifth hidden layer with clipped RELU activation and dropout
b5 = variable_on_worker_level('b5', [Config.n_hidden_5], tf.zeros_initializer())
h5 = variable_on_worker_level('h5', [Config.n_cell_dim, Config.n_hidden_5], tf.contrib.layers.xavier_initializer())
layer_5 = tf.minimum(tf.nn.relu(tf.add(tf.matmul(output, h5), b5)), FLAGS.relu_clip)
layer_5 = tf.nn.dropout(layer_5, rate=dropout[5])
layers['layer_5'] = layer_5
# Now we apply the weight matrix `h6` and bias `b6` to the output of `layer_5`
# creating `n_classes` dimensional vectors, the logits.
b6 = variable_on_worker_level('b6', [Config.n_hidden_6], tf.zeros_initializer())
h6 = variable_on_worker_level('h6', [Config.n_hidden_5, Config.n_hidden_6], tf.contrib.layers.xavier_initializer())
layer_6 = tf.add(tf.matmul(layer_5, h6), b6)
layers['layer_6'] = layer_6
# Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6]
# to the slightly more useful shape [n_steps, batch_size, n_hidden_6].
# Note, that this differs from the input in that it is time-major.
layer_6 = tf.reshape(layer_6, [n_steps, batch_size, Config.n_hidden_6], name="raw_logits")
layers['raw_logits'] = layer_6
# Output shape: [n_steps, batch_size, n_hidden_6]
return layer_6, layers
# Accuracy and Loss
# =================
# In accord with 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# the loss function used by our network should be the CTC loss function
# (http://www.cs.toronto.edu/~graves/preprint.pdf).
# Conveniently, this loss function is implemented in TensorFlow.
# Thus, we can simply make use of this implementation to define our loss.
def calculate_mean_edit_distance_and_loss(model_feeder, tower, dropout, reuse):
r'''
This routine computes the CTC loss for a tower's mini-batch
and returns the average loss across the batch.
'''
# Obtain the next batch of data
batch_x, batch_seq_len, batch_y = model_feeder.next_batch(tower)
# Calculate the logits of the batch using BiRNN
logits, _ = BiRNN(batch_x, batch_seq_len, dropout, reuse)
# Compute the CTC loss using TensorFlow's `ctc_loss`
total_loss = tf.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
# Calculate the average loss across the batch
avg_loss = tf.reduce_mean(total_loss)
# Finally we return the average loss
return avg_loss
# Adam Optimization
# =================
# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning.
def create_optimizer():
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate,
beta1=FLAGS.beta1,
beta2=FLAGS.beta2,
epsilon=FLAGS.epsilon)
return optimizer
# Towers
# ======
# In order to properly make use of multiple GPUs, one must introduce new abstractions,
# not present when using a single GPU, that facilitate the multi-GPU use case.
# In particular, one must introduce a means to isolate the inference and gradient
# calculations on the various GPUs.
# The abstraction we introduce for this purpose is called a 'tower'.
# A tower is specified by two properties:
# * **Scope** - A scope, as provided by `tf.name_scope()`,
# is a means to isolate the operations within a tower.
# For example, all operations within 'tower 0' could have their name prefixed with `tower_0/`.
# * **Device** - A hardware device, as provided by `tf.device()`,
# on which all operations within the tower execute.
# For example, all operations of 'tower 0' could execute on the first GPU `tf.device('/gpu:0')`.
def get_tower_results(model_feeder, optimizer, dropout_rates):
r'''
With this preliminary step out of the way, we can, for each GPU, introduce a
tower for whose batch we calculate and return the optimization gradients
and the average loss across towers.
'''
# To calculate the mean of the losses
tower_avg_losses = []
# Tower gradients to return
tower_gradients = []
with tf.variable_scope(tf.get_variable_scope()):
# Loop over available_devices
for i in range(len(Config.available_devices)):
# Execute operations of tower i on device i
if len(FLAGS.ps_hosts) == 0:
device = Config.available_devices[i]
else:
device = tf.train.replica_device_setter(worker_device=Config.available_devices[i], cluster=Config.cluster)
with tf.device(device):
# Create a scope for all operations of tower i
with tf.name_scope('tower_%d' % i) as scope:
# Calculate the avg_loss and mean_edit_distance and retrieve the decoded
# batch along with the original batch's labels (Y) of this tower
avg_loss = calculate_mean_edit_distance_and_loss(model_feeder, i, dropout_rates, reuse=i>0)
# Allow for variables to be re-used by the next tower
tf.get_variable_scope().reuse_variables()
# Retain tower's avg losses
tower_avg_losses.append(avg_loss)
# Compute gradients for model parameters using tower's mini-batch
gradients = optimizer.compute_gradients(avg_loss)
# Retain tower's gradients
tower_gradients.append(gradients)
avg_loss_across_towers = tf.reduce_mean(tower_avg_losses, 0)
tf.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries'])
# Return gradients and the average loss
return tower_gradients, avg_loss_across_towers
def average_gradients(tower_gradients):
r'''
A routine for computing each variable's average of the gradients obtained from the GPUs.
Note also that this code acts as a synchronization point as it requires all
GPUs to be finished with their mini-batch before it can run to completion.
'''
# List of average gradients to return to the caller
average_grads = []
# Run this on cpu_device to conserve GPU memory
with tf.device(Config.cpu_device):
# Loop over gradient/variable pairs from all towers
for grad_and_vars in zip(*tower_gradients):
# Introduce grads to store the gradients for the current variable
grads = []
# Loop over the gradients for the current variable
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension
grad = tf.concat(grads, 0)
grad = tf.reduce_mean(grad, 0)
# Create a gradient/variable tuple for the current variable with its average gradient
grad_and_var = (grad, grad_and_vars[0][1])
# Add the current tuple to average_grads
average_grads.append(grad_and_var)
# Return result to caller
return average_grads
# Logging
# =======
def log_variable(variable, gradient=None):
r'''
We introduce a function for logging a tensor variable's current state.
It logs scalar values for the mean, standard deviation, minimum and maximum.
Furthermore it logs a histogram of its state and (if given) of an optimization gradient.
'''
name = variable.name
mean = tf.reduce_mean(variable)
tf.summary.scalar(name='%s/mean' % name, tensor=mean)
tf.summary.scalar(name='%s/sttdev' % name, tensor=tf.sqrt(tf.reduce_mean(tf.square(variable - mean))))
tf.summary.scalar(name='%s/max' % name, tensor=tf.reduce_max(variable))
tf.summary.scalar(name='%s/min' % name, tensor=tf.reduce_min(variable))
tf.summary.histogram(name=name, values=variable)
if gradient is not None:
if isinstance(gradient, tf.IndexedSlices):
grad_values = gradient.values
else:
grad_values = gradient
if grad_values is not None:
tf.summary.histogram(name='%s/gradients' % name, values=grad_values)
def log_grads_and_vars(grads_and_vars):
r'''
Let's also introduce a helper function for logging collections of gradient/variable tuples.
'''
for gradient, variable in grads_and_vars:
log_variable(variable, gradient=gradient)
# Helpers
# =======
def send_token_to_ps(session, kill=False):
# Sending our token (the task_index as a debug opportunity) to each parameter server.
# kill switch tokens are negative and decremented by 1 to deal with task_index 0
token = -FLAGS.task_index-1 if kill else FLAGS.task_index
kind = 'kill switch' if kill else 'stop'
for index, enqueue in enumerate(Config.done_enqueues):
log_debug('Sending %s token to ps %d...' % (kind, index))
session.run(enqueue, feed_dict={ Config.token_placeholder: token })
log_debug('Sent %s token to ps %d.' % (kind, index))
def train(server=None):
r'''
Trains the network on a given server of a cluster.
If no server provided, it performs single process training.
'''
# Initializing and starting the training coordinator
coord = TrainingCoordinator(Config.is_chief)
coord.start()
# Create a variable to hold the global_step.
# It will automagically get incremented by the optimizer.
global_step = tf.Variable(0, trainable=False, name='global_step')
dropout_rates = [tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6)]
# Reading training set
train_data = preprocess(FLAGS.train_files.split(','),
FLAGS.train_batch_size,
Config.n_input,
Config.n_context,
Config.alphabet,
hdf5_cache_path=FLAGS.train_cached_features_path)
train_set = DataSet(train_data,
FLAGS.train_batch_size,
limit=FLAGS.limit_train,
next_index=lambda i: coord.get_next_index('train'))
# Reading validation set
dev_data = preprocess(FLAGS.dev_files.split(','),
FLAGS.dev_batch_size,
Config.n_input,
Config.n_context,
Config.alphabet,
hdf5_cache_path=FLAGS.dev_cached_features_path)
dev_set = DataSet(dev_data,
FLAGS.dev_batch_size,
limit=FLAGS.limit_dev,
next_index=lambda i: coord.get_next_index('dev'))
# Combining all sets to a multi set model feeder
model_feeder = ModelFeeder(train_set,
dev_set,
Config.n_input,
Config.n_context,
Config.alphabet,
tower_feeder_count=len(Config.available_devices))
# Create the optimizer
optimizer = create_optimizer()
# Synchronous distributed training is facilitated by a special proxy-optimizer
if not server is None:
optimizer = tf.train.SyncReplicasOptimizer(optimizer,
replicas_to_aggregate=FLAGS.replicas_to_agg,
total_num_replicas=FLAGS.replicas)
# Get the data_set specific graph end-points
gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates)
# Average tower gradients across GPUs
avg_tower_gradients = average_gradients(gradients)
# Add summaries of all variables and gradients to log
log_grads_and_vars(avg_tower_gradients)
# Op to merge all summaries for the summary hook
merge_all_summaries_op = tf.summary.merge_all()
# These are saved on every step
step_summaries_op = tf.summary.merge_all('step_summaries')
step_summary_writers = {
'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120)
}
# Apply gradients to modify the model
apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)
if FLAGS.early_stop is True and not FLAGS.validation_step > 0:
log_warn('Parameter --validation_step needs to be >0 for early stopping to work')
class CoordHook(tf.train.SessionRunHook):
r'''
Embedded coordination hook-class that will use variables of the
surrounding Python context.
'''
def after_create_session(self, session, coord):
log_debug('Starting queue runners...')
model_feeder.start_queue_threads(session, coord)
log_debug('Queue runners started.')
def end(self, session):
# Closing the data_set queues
log_debug('Closing queues...')
model_feeder.close_queues(session)
log_debug('Queues closed.')
# Telling the ps that we are done
send_token_to_ps(session)
# Collecting the hooks
hooks = [CoordHook()]
# Hook to handle initialization and queues for sync replicas.
if not server is None:
hooks.append(optimizer.make_session_run_hook(Config.is_chief))
# Hook to save TensorBoard summaries
if FLAGS.summary_secs > 0:
hooks.append(tf.train.SummarySaverHook(save_secs=FLAGS.summary_secs, output_dir=FLAGS.summary_dir, summary_op=merge_all_summaries_op))
# Hook with the number of checkpoint files to save in checkpoint_dir
if FLAGS.train and FLAGS.max_to_keep > 0:
saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=FLAGS.checkpoint_dir, save_secs=FLAGS.checkpoint_secs, saver=saver))
no_dropout_feed_dict = {
dropout_rates[0]: 0.,
dropout_rates[1]: 0.,
dropout_rates[2]: 0.,
dropout_rates[3]: 0.,
dropout_rates[4]: 0.,
dropout_rates[5]: 0.,
}
# Progress Bar
def update_progressbar(set_name):
if not hasattr(update_progressbar, 'current_set_name'):
update_progressbar.current_set_name = None
if (update_progressbar.current_set_name != set_name or
update_progressbar.current_job_index == update_progressbar.total_jobs):
# finish prev pbar if it exists
if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar:
update_progressbar.pbar.finish()
update_progressbar.total_jobs = None
update_progressbar.current_job_index = 0
current_epoch = coord._epoch-1
if set_name == "train":
log_info('Training epoch %i...' % current_epoch)
update_progressbar.total_jobs = coord._num_jobs_train
else:
log_info('Validating epoch %i...' % current_epoch)
update_progressbar.total_jobs = coord._num_jobs_dev
# recreate pbar
update_progressbar.pbar = progressbar.ProgressBar(max_value=update_progressbar.total_jobs,
redirect_stdout=True).start()
update_progressbar.current_set_name = set_name
if update_progressbar.pbar:
update_progressbar.pbar.update(update_progressbar.current_job_index+1, force=True)
update_progressbar.current_job_index += 1
# Initialize update_progressbar()'s child fields to safe values
update_progressbar.pbar = None
# The MonitoredTrainingSession takes care of session initialization,
# restoring from a checkpoint, saving to a checkpoint, and closing when done
# or an error occurs.
try:
with tf.train.MonitoredTrainingSession(master='' if server is None else server.target,
is_chief=Config.is_chief,
hooks=hooks,
checkpoint_dir=FLAGS.checkpoint_dir,
save_checkpoint_secs=None, # already taken care of by a hook
log_step_count_steps=0, # disable logging of steps/s to avoid TF warning in validation sets
config=Config.session_config) as session:
tf.get_default_graph().finalize()
try:
if Config.is_chief:
# Retrieving global_step from the (potentially restored) model
model_feeder.set_data_set(no_dropout_feed_dict, model_feeder.train)
step = session.run(global_step, feed_dict=no_dropout_feed_dict)
coord.start_coordination(model_feeder, step)
# Get the first job
job = coord.get_job()
while job and not session.should_stop():
log_debug('Computing %s...' % job)
is_train = job.set_name == 'train'
# The feed_dict (mainly for switching between queues)
if is_train:
feed_dict = {
dropout_rates[0]: FLAGS.dropout_rate,
dropout_rates[1]: FLAGS.dropout_rate2,
dropout_rates[2]: FLAGS.dropout_rate3,
dropout_rates[3]: FLAGS.dropout_rate4,
dropout_rates[4]: FLAGS.dropout_rate5,
dropout_rates[5]: FLAGS.dropout_rate6,
}
else:
feed_dict = no_dropout_feed_dict
# Sets the current data_set for the respective placeholder in feed_dict
model_feeder.set_data_set(feed_dict, getattr(model_feeder, job.set_name))
# Initialize loss aggregator
total_loss = 0.0
# Setting the training operation in case of training requested
train_op = apply_gradient_op if is_train else []
# So far the only extra parameter is the feed_dict
extra_params = { 'feed_dict': feed_dict }
step_summary_writer = step_summary_writers.get(job.set_name)
# Loop over the batches
for job_step in range(job.steps):
if session.should_stop():
break
log_debug('Starting batch...')
# Compute the batch
_, current_step, batch_loss, step_summary = session.run([train_op, global_step, loss, step_summaries_op], **extra_params)
# Log step summaries
step_summary_writer.add_summary(step_summary, current_step)
# Uncomment the next line for debugging race conditions / distributed TF
log_debug('Finished batch step %d.' % current_step)
# Add batch to loss
total_loss += batch_loss
# Gathering job results
job.loss = total_loss / job.steps
# Display progressbar
if FLAGS.show_progressbar:
update_progressbar(job.set_name)
# Send the current job to coordinator and receive the next one
log_debug('Sending %s...' % job)
job = coord.next_job(job)
if update_progressbar.pbar:
update_progressbar.pbar.finish()
except Exception as e:
log_error(str(e))
traceback.print_exc()
# Calling all hook's end() methods to end blocking calls
for hook in hooks:
hook.end(session)
# Only chief has a SyncReplicasOptimizer queue runner that needs to be stopped for unblocking process exit.
# A rather graceful way to do this is by stopping the ps.
# Only one party can send it w/o failing.
if Config.is_chief:
send_token_to_ps(session, kill=True)
sys.exit(1)
log_debug('Session closed.')
except tf.errors.InvalidArgumentError as e:
log_error(str(e))
log_error('The checkpoint in {0} does not match the shapes of the model.'
' Did you change alphabet.txt or the --n_hidden parameter'
' between train runs using the same checkpoint dir? Try moving'
' or removing the contents of {0}.'.format(FLAGS.checkpoint_dir))
sys.exit(1)
# Stopping the coordinator
coord.stop()
def test():
# Reading test set
test_data = preprocess(FLAGS.test_files.split(','),
FLAGS.test_batch_size,
Config.n_input,
Config.n_context,
Config.alphabet,
hdf5_cache_path=FLAGS.test_cached_features_path)
graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)
evaluate.evaluate(test_data, graph)
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
batch_size = batch_size if batch_size > 0 else None
# Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
input_tensor = tf.placeholder(tf.float32, [batch_size, n_steps if n_steps > 0 else None, 2*Config.n_context+1, Config.n_input], name='input_node')
seq_length = tf.placeholder(tf.int32, [batch_size], name='input_lengths')
if batch_size <= 0:
# no state management since n_step is expected to be dynamic too (see below)
previous_state = previous_state_c = previous_state_h = None
else:
if not tflite:
previous_state_c = variable_on_worker_level('previous_state_c', [batch_size, Config.n_cell_dim], initializer=None)
previous_state_h = variable_on_worker_level('previous_state_h', [batch_size, Config.n_cell_dim], initializer=None)
else:
previous_state_c = tf.placeholder(tf.float32, [batch_size, Config.n_cell_dim], name='previous_state_c')
previous_state_h = tf.placeholder(tf.float32, [batch_size, Config.n_cell_dim], name='previous_state_h')
previous_state = tf.contrib.rnn.LSTMStateTuple(previous_state_c, previous_state_h)
no_dropout = [0.0] * 6
logits, layers = BiRNN(batch_x=input_tensor,
seq_length=seq_length if FLAGS.use_seq_length else None,
dropout=no_dropout,
batch_size=batch_size,
n_steps=n_steps,
previous_state=previous_state,
tflite=tflite)
# TF Lite runtime will check that input dimensions are 1, 2 or 4
# by default we get 3, the middle one being batch_size which is forced to
# one on inference graph, so remove that dimension
if tflite:
logits = tf.squeeze(logits, [1])
# Apply softmax for CTC decoder
logits = tf.nn.softmax(logits)
if batch_size <= 0:
if tflite:
raise NotImplementedError('dynamic batch_size does not support tflite nor streaming')
if n_steps > 0:
raise NotImplementedError('dynamic batch_size expect n_steps to be dynamic too')
return (
{
'input': input_tensor,
'input_lengths': seq_length,
},
{
'outputs': tf.identity(logits, name='logits'),
},
layers
)
new_state_c, new_state_h = layers['rnn_output_state']
if not tflite:
zero_state = tf.zeros([batch_size, Config.n_cell_dim], tf.float32)
initialize_c = tf.assign(previous_state_c, zero_state)
initialize_h = tf.assign(previous_state_h, zero_state)
initialize_state = tf.group(initialize_c, initialize_h, name='initialize_state')
with tf.control_dependencies([tf.assign(previous_state_c, new_state_c), tf.assign(previous_state_h, new_state_h)]):
logits = tf.identity(logits, name='logits')
return (
{
'input': input_tensor,
'input_lengths': seq_length,
},
{
'outputs': logits,
'initialize_state': initialize_state,
},
layers
)
else:
logits = tf.identity(logits, name='logits')
new_state_c = tf.identity(new_state_c, name='new_state_c')
new_state_h = tf.identity(new_state_h, name='new_state_h')
return (
{
'input': input_tensor,
'previous_state_c': previous_state_c,
'previous_state_h': previous_state_h,
},
{
'outputs': logits,
'new_state_c': new_state_c,
'new_state_h': new_state_h,
},
layers
)
def export():
r'''
Restores the trained variables into a simpler graph that will be exported for serving.
'''
log_info('Exporting the model...')
with tf.device('/cpu:0'):
from tensorflow.python.framework.ops import Tensor, Operation
tf.reset_default_graph()
session = tf.Session(config=Config.session_config)
inputs, outputs, _ = create_inference_graph(batch_size=FLAGS.export_batch_size, n_steps=FLAGS.n_steps, tflite=FLAGS.export_tflite)
input_names = ",".join(tensor.op.name for tensor in inputs.values())
output_names_tensors = [ tensor.op.name for tensor in outputs.values() if isinstance(tensor, Tensor) ]
output_names_ops = [ tensor.name for tensor in outputs.values() if isinstance(tensor, Operation) ]
output_names = ",".join(output_names_tensors + output_names_ops)
input_shapes = ":".join(",".join(map(str, tensor.shape)) for tensor in inputs.values())
if not FLAGS.export_tflite:
mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
else:
# Create a saver using variables from the above newly created graph
def fixup(name):
if name.startswith('rnn/lstm_cell/'):
return name.replace('rnn/lstm_cell/', 'lstm_fused_cell/')
return name
mapping = {fixup(v.op.name): v for v in tf.global_variables()}
saver = tf.train.Saver(mapping)
# Restore variables from training checkpoint
checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
checkpoint_path = checkpoint.model_checkpoint_path
output_filename = 'output_graph.pb'
if FLAGS.remove_export:
if os.path.isdir(FLAGS.export_dir):
log_info('Removing old export')
shutil.rmtree(FLAGS.export_dir)
try:
output_graph_path = os.path.join(FLAGS.export_dir, output_filename)
if not os.path.isdir(FLAGS.export_dir):
os.makedirs(FLAGS.export_dir)
def do_graph_freeze(output_file=None, output_node_names=None, variables_blacklist=None):
return freeze_graph.freeze_graph_with_def_protos(
input_graph_def=session.graph_def,
input_saver_def=saver.as_saver_def(),
input_checkpoint=checkpoint_path,
output_node_names=output_node_names,
restore_op_name=None,
filename_tensor_name=None,
output_graph=output_file,
clear_devices=False,
variable_names_blacklist=variables_blacklist,
initializer_nodes='')
if not FLAGS.export_tflite:
do_graph_freeze(output_file=output_graph_path, output_node_names=output_names, variables_blacklist='previous_state_c,previous_state_h')
else:
frozen_graph = do_graph_freeze(output_node_names=output_names, variables_blacklist='')
output_tflite_path = os.path.join(FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))
converter = tf.lite.TFLiteConverter(frozen_graph, input_tensors=inputs.values(), output_tensors=outputs.values())
converter.post_training_quantize = True
tflite_model = converter.convert()
with open(output_tflite_path, 'wb') as fout:
fout.write(tflite_model)
log_info('Exported model for TF Lite engine as {}'.format(os.path.basename(output_tflite_path)))
log_info('Models exported at %s' % (FLAGS.export_dir))
except RuntimeError as e:
log_error(str(e))
def do_single_file_inference(input_file_path):
with tf.Session(config=Config.session_config) as session:
inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)
# Create a saver using variables from the above newly created graph
mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
saver = tf.train.Saver(mapping)
# Restore variables from training checkpoint
# TODO: This restores the most recent checkpoint, but if we use validation to counteract
# over-fitting, we may want to restore an earlier checkpoint.
checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
if not checkpoint:
log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
exit(1)
checkpoint_path = checkpoint.model_checkpoint_path
saver.restore(session, checkpoint_path)
session.run(outputs['initialize_state'])
features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
num_strides = len(features) - (Config.n_context * 2)
# Create a view into the array with overlapping strides of size
# numcontext (past) + 1 (present) + numcontext (future)
window_size = 2*Config.n_context+1
features = np.lib.stride_tricks.as_strided(
features,
(num_strides, window_size, Config.n_input),
(features.strides[0], features.strides[0], features.strides[1]),
writeable=False)
logits = session.run(outputs['outputs'], feed_dict = {
inputs['input']: [features],
inputs['input_lengths']: [num_strides],
})
logits = np.squeeze(logits)
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
FLAGS.lm_binary_path, FLAGS.lm_trie_path,
Config.alphabet)
decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width, scorer=scorer)
# Print highest probability result
print(decoded[0][1])
def main(_):
initialize_globals()
if FLAGS.train or FLAGS.test:
if len(FLAGS.worker_hosts) == 0:
# Only one local task: this process (default case - no cluster)
with tf.Graph().as_default():
tf.set_random_seed(FLAGS.random_seed)
train()
# Now do a final test epoch
if FLAGS.test:
with tf.Graph().as_default():
test()
log_debug('Done.')
else:
# Create and start a server for the local task.
server = tf.train.Server(Config.cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
if FLAGS.job_name == 'ps':
# We are a parameter server and therefore we just wait for all workers to finish
# by waiting for their stop tokens.
with tf.Session(server.target) as session:
for worker in FLAGS.worker_hosts:
log_debug('Waiting for stop token...')
token = session.run(Config.done_dequeues[FLAGS.task_index])
if token < 0:
log_debug('Got a kill switch token from worker %i.' % abs(token + 1))
break
log_debug('Got a stop token from worker %i.' % token)
log_debug('Session closed.')
if FLAGS.test:
test()
elif FLAGS.job_name == 'worker':
# We are a worker and therefore we have to do some work.
# Assigns ops to the local worker by default.
with tf.device(tf.train.replica_device_setter(
worker_device=Config.worker_device,
cluster=Config.cluster)):
# Do the training
train(server)
log_debug('Server stopped.')
# Are we the main process?
if Config.is_chief:
# Doing solo/post-processing work just on the main process...
# Exporting the model
if FLAGS.export_dir:
export()
if len(FLAGS.one_shot_infer):
do_single_file_inference(FLAGS.one_shot_infer)
if __name__ == '__main__' :
create_flags()
tf.app.run(main)


@@ -1,220 +0,0 @@
# Need the devel version because we need /usr/include/cudnn.h
# for compiling libctc_decoder_with_kenlm.so
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
# >> START Install base software
# Get basic packages
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
wget \
git \
python \
python-dev \
python-pip \
python-wheel \
python-numpy \
libcurl3-dev \
ca-certificates \
gcc \
sox \
libsox-fmt-mp3 \
htop \
nano \
swig \
cmake \
libboost-all-dev \
zlib1g-dev \
libbz2-dev \
liblzma-dev \
locales \
pkg-config \
libsox-dev \
openjdk-8-jdk \
bash-completion \
g++ \
unzip
# Install NCCL 2.2
RUN apt-get install -qq -y --allow-downgrades --allow-change-held-packages libnccl2=2.3.7-1+cuda10.0 libnccl-dev=2.3.7-1+cuda10.0
# Install Bazel
RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/0.19.2/bazel_0.19.2-linux-x86_64.deb"
RUN dpkg -i bazel_*.deb
# Install CUDA CLI Tools
RUN apt-get install -qq -y cuda-command-line-tools-10-0
# Install pip
RUN wget https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# << END Install base software
# >> START Configure Tensorflow Build
# Clone TensorFlow from the Mozilla repo
RUN git clone https://github.com/mozilla/tensorflow/
WORKDIR /tensorflow
RUN git checkout r1.13
# GPU Environment Setup
ENV TF_NEED_CUDA 1
ENV CUDA_TOOLKIT_PATH /usr/local/cuda
ENV TF_CUDA_VERSION 10.0
ENV TF_CUDNN_VERSION 7
ENV CUDNN_INSTALL_PATH /usr/lib/x86_64-linux-gnu/
ENV TF_CUDA_COMPUTE_CAPABILITIES 6.0
ENV TF_NCCL_VERSION 2.3
# ENV NCCL_INSTALL_PATH /usr/lib/x86_64-linux-gnu/
# Common Environment Setup
ENV TF_BUILD_CONTAINER_TYPE GPU
ENV TF_BUILD_OPTIONS OPT
ENV TF_BUILD_DISABLE_GCP 1
ENV TF_BUILD_ENABLE_XLA 0
ENV TF_BUILD_PYTHON_VERSION PYTHON2
ENV TF_BUILD_IS_OPT OPT
ENV TF_BUILD_IS_PIP PIP
# Other Parameters
ENV CC_OPT_FLAGS -mavx -mavx2 -msse4.1 -msse4.2 -mfma
ENV TF_NEED_GCP 0
ENV TF_NEED_HDFS 0
ENV TF_NEED_JEMALLOC 1
ENV TF_NEED_OPENCL 0
ENV TF_CUDA_CLANG 0
ENV TF_NEED_MKL 0
ENV TF_ENABLE_XLA 0
ENV TF_NEED_AWS 0
ENV TF_NEED_KAFKA 0
ENV TF_NEED_NGRAPH 0
ENV TF_DOWNLOAD_CLANG 0
ENV TF_NEED_TENSORRT 0
ENV TF_NEED_GDR 0
ENV TF_NEED_VERBS 0
ENV TF_NEED_OPENCL_SYCL 0
ENV PYTHON_BIN_PATH /usr/bin/python2.7
ENV PYTHON_LIB_PATH /usr/lib/python2.7/dist-packages
# << END Configure Tensorflow Build
# >> START Configure Bazel
# Running bazel inside a `docker build` command causes trouble, cf:
# https://github.com/bazelbuild/bazel/issues/134
# The easiest solution is to set up a bazelrc file forcing --batch.
RUN echo "startup --batch" >>/etc/bazel.bazelrc
# Similarly, we need to workaround sandboxing issues:
# https://github.com/bazelbuild/bazel/issues/418
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Put CUDA libraries where they are expected to be
RUN mkdir /usr/local/cuda/lib && \
ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h && \
ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
ln -s /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h
# Set library paths
ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:/usr/local/cuda/lib64/stubs/
# << END Configure Bazel
# Copy DeepSpeech repo contents to container's /DeepSpeech
COPY . /DeepSpeech/
WORKDIR /DeepSpeech
RUN pip --no-cache-dir install -r requirements.txt
# Link DeepSpeech native_client libs to tf folder
RUN ln -s /DeepSpeech/native_client /tensorflow
# >> START Build and bind
WORKDIR /tensorflow
# Fix for not found script https://github.com/tensorflow/tensorflow/issues/471
RUN ./configure
# Using CPU optimizations:
# -mtune=generic -march=x86-64 -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx.
# Adding --config=cuda flag to build using CUDA.
# Passing LD_LIBRARY_PATH is required because Bazel doesn't pick it up from the environment
# Build DeepSpeech
RUN bazel build --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
###
### Using TensorFlow upstream should work
###
# # Build TF pip package
# RUN bazel build --config=opt --config=cuda --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx //tensorflow/tools/pip_package:build_pip_package --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
#
# # Build wheel
# RUN bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
#
# # Install tensorflow from our custom wheel
# RUN pip install /tmp/tensorflow_pkg/*.whl
# Copy built libs to /DeepSpeech/native_client
RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
&& cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
# Install TensorFlow
WORKDIR /DeepSpeech/
RUN pip install tensorflow-gpu==1.13.1
# Make DeepSpeech and install Python bindings
ENV TFDIR /tensorflow
WORKDIR /DeepSpeech/native_client
RUN make deepspeech
WORKDIR /DeepSpeech/native_client/python
RUN make bindings
RUN pip install dist/deepspeech*
WORKDIR /DeepSpeech/native_client/ctcdecode
RUN make
RUN pip install dist/*.whl
# << END Build and bind
# Allow Python printing utf-8
ENV PYTHONIOENCODING UTF-8
# Build KenLM in /DeepSpeech/native_client/kenlm folder
WORKDIR /DeepSpeech/native_client
RUN rm -rf kenlm \
&& git clone --depth 1 https://github.com/kpu/kenlm && cd kenlm \
&& mkdir -p build \
&& cd build \
&& cmake .. \
&& make -j 4
# Done
WORKDIR /DeepSpeech

183
Dockerfile.build Normal file

@@ -0,0 +1,183 @@
# Please refer to the USING documentation, "Dockerfile for building from source"
# Need the devel version because we need /usr/include/cudnn.h
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
# >> START Install base software
# Get basic packages
RUN apt-get update && apt-get install -y --no-install-recommends \
apt-utils \
bash-completion \
build-essential \
ca-certificates \
cmake \
curl \
g++ \
gcc \
git \
libbz2-dev \
libboost-all-dev \
libgsm1-dev \
libltdl-dev \
liblzma-dev \
libmagic-dev \
libpng-dev \
libsox-fmt-mp3 \
libsox-dev \
locales \
openjdk-8-jdk \
pkg-config \
python3 \
python3-dev \
python3-pip \
python3-wheel \
python3-numpy \
sox \
unzip \
wget \
zlib1g-dev
RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
# Install Bazel
RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel_3.1.0-linux-x86_64.deb"
RUN dpkg -i bazel_*.deb
# Try and free some space
RUN rm -rf /var/lib/apt/lists/*
# << END Install base software
# >> START Configure Tensorflow Build
# GPU Environment Setup
ENV TF_NEED_ROCM 0
ENV TF_NEED_OPENCL_SYCL 0
ENV TF_NEED_OPENCL 0
ENV TF_NEED_CUDA 1
ENV TF_CUDA_PATHS "/usr,/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu/"
ENV TF_CUDA_VERSION 10.1
ENV TF_CUDNN_VERSION 7.6
ENV TF_CUDA_COMPUTE_CAPABILITIES 6.0
ENV TF_NCCL_VERSION 2.8
# Common Environment Setup
ENV TF_BUILD_CONTAINER_TYPE GPU
ENV TF_BUILD_OPTIONS OPT
ENV TF_BUILD_DISABLE_GCP 1
ENV TF_BUILD_ENABLE_XLA 0
ENV TF_BUILD_PYTHON_VERSION PYTHON3
ENV TF_BUILD_IS_OPT OPT
ENV TF_BUILD_IS_PIP PIP
# Other Parameters
ENV CC_OPT_FLAGS -mavx -mavx2 -msse4.1 -msse4.2 -mfma
ENV TF_NEED_GCP 0
ENV TF_NEED_HDFS 0
ENV TF_NEED_JEMALLOC 1
ENV TF_NEED_OPENCL 0
ENV TF_CUDA_CLANG 0
ENV TF_NEED_MKL 0
ENV TF_ENABLE_XLA 0
ENV TF_NEED_AWS 0
ENV TF_NEED_KAFKA 0
ENV TF_NEED_NGRAPH 0
ENV TF_DOWNLOAD_CLANG 0
ENV TF_NEED_TENSORRT 0
ENV TF_NEED_GDR 0
ENV TF_NEED_VERBS 0
ENV TF_NEED_OPENCL_SYCL 0
ENV PYTHON_BIN_PATH /usr/bin/python3.6
ENV PYTHON_LIB_PATH /usr/local/lib/python3.6/dist-packages
# << END Configure Tensorflow Build
# >> START Configure Bazel
# Running bazel inside a `docker build` command causes trouble, cf:
# https://github.com/bazelbuild/bazel/issues/134
# The easiest solution is to set up a bazelrc file forcing --batch.
RUN echo "startup --batch" >>/etc/bazel.bazelrc
# Similarly, we need to workaround sandboxing issues:
# https://github.com/bazelbuild/bazel/issues/418
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# << END Configure Bazel
WORKDIR /
COPY . /STT/
# >> START Build and bind
WORKDIR /STT/tensorflow
# Fix for not found script https://github.com/tensorflow/tensorflow/issues/471
RUN ./configure
# Using CPU optimizations:
# -mtune=generic -march=x86-64 -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx.
# Adding --config=cuda flag to build using CUDA.
# Passing LD_LIBRARY_PATH is required because Bazel doesn't pick it up from the environment
# Build STT
RUN bazel build \
--verbose_failures \
--workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" \
-c opt \
--copt=-mtune=generic \
--copt=-march=x86-64 \
--copt=-msse \
--copt=-msse2 \
--copt=-msse3 \
--copt=-msse4.1 \
--copt=-msse4.2 \
--copt=-mavx \
--config=noaws \
--config=nogcp \
--config=nohdfs \
--config=nonccl \
//native_client:libstt.so
# Copy built libs to /STT/native_client
RUN cp bazel-bin/native_client/libstt.so /STT/native_client/
# Build client.cc and install Python client and decoder bindings
ENV TFDIR /STT/tensorflow
RUN nproc
WORKDIR /STT/native_client
RUN make NUM_PROCESSES=$(nproc) stt
WORKDIR /STT
RUN cd native_client/python && make NUM_PROCESSES=$(nproc) bindings
RUN pip3 install -U pip setuptools wheel
RUN pip3 install --upgrade native_client/python/dist/*.whl
RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
# << END Build and bind
# Allow Python printing utf-8
ENV PYTHONIOENCODING UTF-8
# Build KenLM in /STT/native_client/kenlm folder
WORKDIR /STT/native_client
RUN rm -rf kenlm && \
git clone https://github.com/kpu/kenlm && \
cd kenlm && \
git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j $(nproc)
# Done
WORKDIR /STT

97
Dockerfile.train Normal file

@@ -0,0 +1,97 @@
# This is a Dockerfile useful for training models with Coqui STT.
# You can train "acoustic models" with audio + Tensorflow, and
# you can create "scorers" with text + KenLM.
FROM nvcr.io/nvidia/tensorflow:20.06-tf1-py3 AS kenlm-build
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential cmake libboost-system-dev \
libboost-thread-dev libboost-program-options-dev \
libboost-test-dev libeigen3-dev zlib1g-dev \
libbz2-dev liblzma-dev && \
rm -rf /var/lib/apt/lists/*
# Build KenLM to generate new scorers
WORKDIR /code
COPY kenlm /code/kenlm
RUN cd /code/kenlm && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j $(nproc) || \
( echo "ERROR: Failed to build KenLM."; \
echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
exit 1; )
FROM ubuntu:20.04 AS wget-binaries
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends wget unzip xz-utils && \
rm -rf /var/lib/apt/lists/*
# Tool to convert output graph for inference
RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
unzip temp.zip && \
rm temp.zip
RUN wget --no-check-certificate https://github.com/reuben/STT/releases/download/v0.10.0-alpha.1/native_client.tar.xz -O temp.tar.xz && \
tar -xf temp.tar.xz && \
rm temp.tar.xz
FROM nvcr.io/nvidia/tensorflow:20.06-tf1-py3
ENV DEBIAN_FRONTEND=noninteractive
# We need to purge python3-xdg because
# it breaks the STT install later with
# errors about setuptools
#
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
wget \
libopus0 \
libopusfile0 \
libsndfile1 \
sox \
libsox-fmt-mp3 && \
apt-get purge -y python3-xdg && \
rm -rf /var/lib/apt/lists/*
# Make sure pip and its dependencies are up-to-date
RUN pip3 install --upgrade pip wheel setuptools
WORKDIR /code
COPY native_client /code/native_client
COPY .git /code/.git
COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
COPY setup.py /code/setup.py
COPY VERSION /code/VERSION
COPY training /code/training
# Copy files from previous build stages
RUN mkdir -p /code/kenlm/build/
COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
# Install STT
# No need for the decoder since we did it earlier
# TensorFlow GPU should already be installed on the base image,
# and we don't want to break that
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
# Copy rest of the code and test training
COPY . /code
RUN ./bin/run-ldc93s1.sh && rm -rf ~/.local/share/stt


@@ -0,0 +1,10 @@
.git/lfs
tensorflow
.git/modules/tensorflow
native_client/ds-swig
native_client/libstt.so
native_client/stt
native_client/ctcdecode/dist/
native_client/ctcdecode/temp_build
native_client/ctcdecode/third_party.a
native_client/ctcdecode/workspace_status.cc

12
Dockerfile.train.jupyter Normal file

@@ -0,0 +1,12 @@
# This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
FROM ghcr.io/coqui-ai/stt-train:latest
WORKDIR /code/notebooks
RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws
RUN jupyter serverextension enable --py jupyter_http_over_ws
EXPOSE 8888
CMD ["bash", "-c", "jupyter notebook --notebook-dir=/code/notebooks --ip 0.0.0.0 --no-browser --allow-root"]

1
GRAPH_VERSION Symbolic link

@@ -0,0 +1 @@
training/coqui_stt_training/GRAPH_VERSION


@@ -1,24 +0,0 @@
For support and discussions, please use our [Discourse forums](https://discourse.mozilla.org/c/deep-speech).
If you've found a bug, or have a feature request, then please create an issue with the following information:
- **Have I written custom code (as opposed to running examples on an unmodified clone of the repository)**:
- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
- **TensorFlow installed from (our builds, or upstream TensorFlow)**:
- **TensorFlow version (use command below)**:
- **Python version**:
- **Bazel version (if compiling from source)**:
- **GCC/Compiler version (if compiling from source)**:
- **CUDA/cuDNN version**:
- **GPU model and memory**:
- **Exact command to reproduce**:
You can obtain the TensorFlow version with
```bash
python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
```
Please describe the problem clearly. Be sure to convey here why it's a bug or a feature request.
Include any logs or source code that would be helpful to diagnose the problem. For larger logs, link to a Gist, not a screenshot. If including tracebacks, please include the full traceback. Try to provide a reproducible test case.

2
MANIFEST.in Normal file

@@ -0,0 +1,2 @@
include training/coqui_stt_training/VERSION
include training/coqui_stt_training/GRAPH_VERSION

8
Makefile Normal file

@@ -0,0 +1,8 @@
STT_REPO ?= https://github.com/coqui-ai/STT.git
STT_SHA ?= origin/main
Dockerfile%: Dockerfile%.tmpl
sed \
-e "s|#STT_REPO#|$(STT_REPO)|g" \
-e "s|#STT_SHA#|$(STT_SHA)|g" \
< $< > $@

402
README.md

@@ -1,402 +0,0 @@
# Project DeepSpeech
[![Task Status](https://github.taskcluster.net/v1/repository/mozilla/DeepSpeech/master/badge.svg)](https://github.taskcluster.net/v1/repository/mozilla/DeepSpeech/master/latest)
DeepSpeech is an open source Speech-To-Text engine, using a model trained by machine learning techniques based on [Baidu's Deep Speech research paper](https://arxiv.org/abs/1412.5567). Project DeepSpeech uses Google's [TensorFlow](https://www.tensorflow.org/) to make the implementation easier.
![Usage](images/usage.gif)
Pre-built binaries for performing inference with a trained model can be installed with `pip3`. Proper setup using a virtual environment is recommended, and you can find that documentation [below](#using-the-python-package).
A pre-trained English model is available for use and can be downloaded using [the instructions below](#getting-the-pre-trained-model). Currently, only 16-bit, 16 kHz, mono-channel WAVE audio files are supported in the Python client.
Once everything is installed, you can then use the `deepspeech` binary to do speech-to-text on short (approximately 5-second long) audio files as such:
```bash
pip3 install deepspeech
deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
Alternatively, quicker inference can be performed using a supported NVIDIA GPU on Linux. See the [release notes](https://github.com/mozilla/DeepSpeech/releases) to find which GPUs are supported. To run `deepspeech` on a GPU, install the GPU specific package:
```bash
pip3 install deepspeech-gpu
deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
Please ensure you have the required [CUDA dependency](#cuda-dependency).
See the output of `deepspeech -h` for more information on the use of `deepspeech`. (If you experience problems running `deepspeech`, please check [required runtime dependencies](native_client/README.md#required-dependencies)).
**Table of Contents**
- [Prerequisites](#prerequisites)
- [Getting the code](#getting-the-code)
- [Getting the pre-trained model](#getting-the-pre-trained-model)
- [CUDA dependency](#cuda-dependency)
- [Using the model](#using-the-model)
- [Using the Python package](#using-the-python-package)
- [Using the command line client](#using-the-command-line-client)
- [Using the Node.JS package](#using-the-nodejs-package)
- [Installing bindings from source](#installing-bindings-from-source)
- [Third party bindings](#third-party-bindings)
- [Training](#training)
- [Installing prerequisites for training](#installing-prerequisites-for-training)
- [Recommendations](#recommendations)
- [Common Voice training data](#common-voice-training-data)
- [Training a model](#training-a-model)
- [Checkpointing](#checkpointing)
- [Exporting a model for inference](#exporting-a-model-for-inference)
- [Exporting a model for TFLite](#exporting-a-model-for-tflite)
- [Distributed computing across more than one machine](#distributed-training-across-more-than-one-machine)
- [Continuing training from a release model](#continuing-training-from-a-release-model)
- [Contact/Getting Help](#contactgetting-help)
## Prerequisites
* [Python 3.6](https://www.python.org/)
* [Git Large File Storage](https://git-lfs.github.com/)
* Mac or Linux environment
* Go to [build README](examples/net_framework/README.md) to start building DeepSpeech for Windows from source.
## Getting the code
Install [Git Large File Storage](https://git-lfs.github.com/) either manually or through a package-manager if available on your system. Then clone the DeepSpeech repository normally:
```bash
git clone https://github.com/mozilla/DeepSpeech
```
## Getting the pre-trained model
If you want to use the pre-trained English model for performing speech-to-text, you can download it (along with other important inference material) from the DeepSpeech [releases page](https://github.com/mozilla/DeepSpeech/releases). Alternatively, you can run the following command to download and unzip the model files in your current directory:
```bash
wget https://github.com/mozilla/DeepSpeech/releases/download/v0.4.1/deepspeech-0.4.1-models.tar.gz
tar xvfz deepspeech-0.4.1-models.tar.gz
```
## Using the model
There are three ways to use DeepSpeech inference:
- [The Python package](#using-the-python-package)
- [The command-line client](#using-the-command-line-client)
- [The Node.JS package](#using-the-nodejs-package)
### CUDA dependency
The GPU capable builds (Python, NodeJS, C++ etc) depend on the same CUDA runtime as upstream TensorFlow. Currently with TensorFlow r1.12 it depends on CUDA 9.0 and CuDNN v7.2.
### Using the Python package
Pre-built binaries which can be used for performing inference with a trained model can be installed with `pip3`. You can then use the `deepspeech` binary to do speech-to-text on an audio file:
For the Python bindings, it is highly recommended that you perform the installation within a Python 3.5 or later virtual environment. You can find more information about those in [this documentation](http://docs.python-guide.org/en/latest/dev/virtualenvs/).
We will continue under the assumption that you already have your system properly setup to create new virtual environments.
#### Create a DeepSpeech virtual environment
In creating a virtual environment you will create a directory containing a `python3` binary and everything needed to run deepspeech. You can use whatever directory you want. For the purpose of the documentation, we will rely on `$HOME/tmp/deepspeech-venv`. You can create it using this command:
```
$ virtualenv -p python3 $HOME/tmp/deepspeech-venv/
```
Once this command completes successfully, the environment will be ready to be activated.
#### Activating the environment
Each time you need to work with DeepSpeech, you have to *activate* this virtual environment. This is done with this simple command:
```
$ source $HOME/tmp/deepspeech-venv/bin/activate
```
#### Installing DeepSpeech Python bindings
Once your environment has been set-up and loaded, you can use `pip3` to manage packages locally. On a fresh setup of the `virtualenv`, you will have to install the DeepSpeech wheel. You can check if `deepspeech` is already installed with `pip3 list`.
To perform the installation, just use `pip3` as such:
```
$ pip3 install deepspeech
```
If `deepspeech` is already installed, you can update it as such:
```
$ pip3 install --upgrade deepspeech
```
Alternatively, if you have a supported NVIDIA GPU on Linux, you can install the GPU specific package as follows:
```
$ pip3 install deepspeech-gpu
```
See the [release notes](https://github.com/mozilla/DeepSpeech/releases) to find which GPUs are supported. Please ensure you have the required [CUDA dependency](#cuda-dependency).
You can update `deepspeech-gpu` as follows:
```
$ pip3 install --upgrade deepspeech-gpu
```
In both cases, `pip3` should take care of installing all the required dependencies. After installation has finished, you should be able to call `deepspeech` from the command-line.
Note: the following command assumes you [downloaded the pre-trained model](#getting-the-pre-trained-model).
```bash
deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio my_audio_file.wav
```
The arguments `--lm` and `--trie` are optional, and represent a language model.
See [client.py](native_client/python/client.py) for an example of how to use the package programmatically.
### Using the command-line client
To download the pre-built binaries for the `deepspeech` command-line client, use `util/taskcluster.py`:
```bash
python3 util/taskcluster.py --target .
```
or if you're on macOS:
```bash
python3 util/taskcluster.py --arch osx --target .
```
Also, if you need binaries different from the current master, such as `v0.2.0-alpha.6`, you can use `--branch`:
```bash
python3 util/taskcluster.py --branch "v0.2.0-alpha.6" --target "."
```
The script `taskcluster.py` will download `native_client.tar.xz` (which includes the `deepspeech` binary and associated libraries) and extract it into the current folder. Also, `taskcluster.py` will download binaries for Linux/x86_64 by default, but you can override that behavior with the `--arch` parameter. See the help info with `python util/taskcluster.py -h` for more details. Specific branches of DeepSpeech or TensorFlow can be specified as well.
Note: the following command assumes you [downloaded the pre-trained model](#getting-the-pre-trained-model).
```bash
./deepspeech --model models/output_graph.pbmm --alphabet models/alphabet.txt --lm models/lm.binary --trie models/trie --audio audio_input.wav
```
See the help output with `./deepspeech -h` and the [native client README](native_client/README.md) for more details.
### Using the Node.JS package
You can download the Node.JS bindings using `npm`:
```bash
npm install deepspeech
```
Alternatively, if you're using Linux and have a supported NVIDIA GPU, you can install the GPU specific package as follows:
```bash
npm install deepspeech-gpu
```
See the [release notes](https://github.com/mozilla/DeepSpeech/releases) to find which GPUs are supported. Please ensure you have the required [CUDA dependency](#cuda-dependency).
See [client.js](native_client/javascript/client.js) for an example of how to use the bindings. Or download the [wav example](examples/nodejs_wav).
### Installing bindings from source
If pre-built binaries aren't available for your system, you'll need to install them from scratch. Follow these [`native_client` installation instructions](native_client/README.md).
### Third party bindings
In addition to the bindings above, third party developers have started to provide bindings to other languages:
* [Asticode](https://github.com/asticode) provides [Golang](https://golang.org) bindings in its [go-astideepspeech](https://github.com/asticode/go-astideepspeech) repo.
* [RustAudio](https://github.com/RustAudio) provide a [Rust](https://www.rust-lang.org) binding, the installation and use of which is described in their [deepspeech-rs](https://github.com/RustAudio/deepspeech-rs) repo.
* [stes](https://github.com/stes) provides preliminary [PKGBUILDs](https://wiki.archlinux.org/index.php/PKGBUILD) to install the client and python bindings on [Arch Linux](https://www.archlinux.org/) in the [arch-deepspeech](https://github.com/stes/arch-deepspeech) repo.
* [gst-deepspeech](https://github.com/Elleo/gst-deepspeech) provides a [GStreamer](https://gstreamer.freedesktop.org/) plugin which can be used from any language with GStreamer bindings.
## Training
### Installing prerequisites for training
Install the required dependencies using `pip3`:
```bash
cd DeepSpeech
pip3 install -r requirements.txt
```
You'll also need to install the `ds_ctcdecoder` Python package. `ds_ctcdecoder` is required for decoding the outputs of the `deepspeech` acoustic model into text. You can use `util/taskcluster.py` with the `--decoder` flag to get a URL to a binary of the decoder package appropriate for your platform and Python version:
```bash
pip3 install $(python3 util/taskcluster.py --decoder)
```
This command will download and install the `ds_ctcdecoder` package. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). You can override the platform with `--arch` if you want the package for ARMv7 (`--arch arm`) or ARM64 (`--arch arm64`).
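For example, assuming `--arch` combines with `--decoder` as described above, you can fetch and install the ARM64 decoder package in one step:
```bash
# Resolve the ARM64 decoder package URL and install it in one command
pip3 install $(python3 util/taskcluster.py --decoder --arch arm64)
```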
### Recommendations
If you have a capable GPU (NVIDIA, with at least 8GB of VRAM), it is highly recommended to install TensorFlow with GPU support. Training will be significantly faster than on the CPU. To enable GPU support, run:
```bash
pip3 uninstall tensorflow
pip3 install 'tensorflow-gpu==1.13.1'
```
Please ensure you have the required [CUDA dependency](#cuda-dependency).
### Common Voice training data
The Common Voice corpus consists of voice samples that were donated through Mozilla's [Common Voice](https://voice.mozilla.org/) Initiative.
We provide an importer (`bin/import_cv.py`) which automates downloading and preparing the Common Voice corpus:
```bash
bin/import_cv.py path/to/target/directory
```
If you already downloaded Common Voice from [here](https://voice.mozilla.org/data), simply run `bin/import_cv.py` on the directory where the corpus is located. The importer will detect that you've already downloaded the data and immediately proceed to unpacking and importing. If you haven't downloaded the data yet, `bin/import_cv.py` will download it for you and save it to the path you've specified.
Please be aware that training with the Common Voice corpus archive requires at least 70GB of free disk space and quite some time to complete. As this process creates a huge number of small files, using an SSD is highly recommended. If the import script gets interrupted, it will try to continue from where it stopped the next time you run it. Unfortunately, there are some cases where it will need to start over. Once the import is done, the directory will contain a bunch of CSV files.
The following files are official user-validated sets for training, validating and testing:
- `cv-valid-train.csv`
- `cv-valid-dev.csv`
- `cv-valid-test.csv`
The following files are the non-validated unofficial sets for training, validating and testing:
- `cv-other-train.csv`
- `cv-other-dev.csv`
- `cv-other-test.csv`
`cv-invalid.csv` contains all samples that users flagged as invalid.
A sub-directory called `cv_corpus_{version}` contains the mp3 and wav files that were extracted from an archive named `cv_corpus_{version}.tar.gz`.
All entries in the CSV files refer to their samples by absolute paths. So moving this sub-directory would require another import or tweaking the CSV files accordingly.
To use Common Voice data during training, validation and testing, you pass (comma-separated combinations of) their filenames to the `--train_files`, `--dev_files` and `--test_files` parameters of `DeepSpeech.py`.
If, for example, Common Voice was imported into `../data/CV`, `DeepSpeech.py` could be called like this:
```bash
./DeepSpeech.py --train_files ../data/CV/cv-valid-train.csv --dev_files ../data/CV/cv-valid-dev.csv --test_files ../data/CV/cv-valid-test.csv
```
If you are brave enough, you can also include the `other` dataset, which contains not-yet-validated content:
```bash
./DeepSpeech.py --train_files ../data/CV/cv-valid-train.csv,../data/CV/cv-other-train.csv --dev_files ../data/CV/cv-valid-dev.csv --test_files ../data/CV/cv-valid-test.csv
```
### Training a model
The central (Python) script is `DeepSpeech.py` in the project's root directory. For its list of command line options, you can call:
```bash
./DeepSpeech.py --helpfull
```
To get the output of this in a slightly better-formatted way, you can also look up the option definitions at the top of `DeepSpeech.py`.
For executing pre-configured training scenarios, there is a collection of convenience scripts in the `bin` folder. Most of them are named after the corpora they are configured for. Keep in mind that the other speech corpora are *very large*, on the order of tens of gigabytes, and some aren't free. Downloading and preprocessing them can take a very long time, and training on them without a fast GPU (GTX 10 series recommended) takes even longer.
**If you experience GPU OOM errors while training, try reducing the batch size with the `--train_batch_size`, `--dev_batch_size` and `--test_batch_size` parameters.**
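For example, a run with conservatively reduced batch sizes could look like this (the CSV paths reuse the Common Voice example above; the batch size values are illustrative, not recommendations):
```bash
./DeepSpeech.py --train_files ../data/CV/cv-valid-train.csv \
                --dev_files ../data/CV/cv-valid-dev.csv \
                --test_files ../data/CV/cv-valid-test.csv \
                --train_batch_size 8 --dev_batch_size 8 --test_batch_size 8
```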
As a simple first example you can open a terminal, change to the directory of the DeepSpeech checkout and run:
```bash
./bin/run-ldc93s1.sh
```
This script will train on a small sample dataset called LDC93S1, which can be overfitted on a GPU in a few minutes for demonstration purposes. From here, you can alter any variables with regard to which dataset is used, how many training iterations are run, and the default values of the network parameters.
Feel free to pass additional (or overriding) `DeepSpeech.py` parameters to these scripts, then just run the script to train the modified network.
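For example, a sketch of overriding a couple of parameters when invoking the LDC93S1 script (the values are illustrative only):
```bash
# Extra flags are forwarded to DeepSpeech.py by the convenience script
./bin/run-ldc93s1.sh --epoch 200 --learning_rate 0.0001
```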
Each dataset has a corresponding importer script in `bin/` that can be used to download (if it's freely available) and preprocess the dataset. See `bin/import_librivox.py` for an example of how to import and preprocess a large dataset for training with DeepSpeech.
If you've run the old importers (in `util/importers/`), they may have removed source files that are needed by the new importers. In that case, simply remove the extracted folders and let the importer extract and process the dataset from scratch, and things should work.
### Checkpointing
During training, so-called checkpoints are stored on disk at a configurable time interval. The purpose of checkpoints is to allow training to be interrupted (including by unexpected failures) and later continued without losing hours of training time. Resuming from checkpoints happens automatically by simply (re)starting training with the same `--checkpoint_dir` as the former run.
Be aware, however, that checkpoints are only valid for the same model geometry they were generated from. In other words, if you see error messages about certain `Tensors` having incompatible dimensions, this is most likely due to an incompatible model change. The usual way out is to wipe all checkpoint files in the checkpoint directory, or to change the directory, before starting the training.
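For example, assuming a previous run used `--checkpoint_dir ~/checkpoints/my_model`, restarting training with the same flag (and the same model geometry) resumes from the latest checkpoint; the CSV file names are placeholders:
```bash
./DeepSpeech.py --checkpoint_dir ~/checkpoints/my_model \
                --train_files my-train.csv --dev_files my-dev.csv --test_files my-test.csv
```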
### Exporting a model for inference
If the `--export_dir` parameter is provided, a model will be exported to this directory during training.
Refer to the corresponding [README.md](native_client/README.md) for information on building and running a client that can use the exported model.
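For instance, appending `--export_dir` to a normal training invocation (paths and file names are placeholders) will write the exported model to that directory once training finishes:
```bash
./DeepSpeech.py --train_files my-train.csv --dev_files my-dev.csv --test_files my-test.csv \
                --export_dir /model/export/destination
```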
### Exporting a model for TFLite
If you want to experiment with the TF Lite engine, you need to export a model that is compatible with it; to do so, use the `--export_tflite` flag. If you already have a trained model, you can re-export it for TFLite by running `DeepSpeech.py` again, specifying the same `--checkpoint_dir` that you used for training and passing `--notrain --notest --export_tflite --export_dir /model/export/destination`.
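Putting those flags together, a re-export from an existing checkpoint directory looks like this (paths are placeholders):
```bash
./DeepSpeech.py --checkpoint_dir path/to/checkpoint/folder \
                --notrain --notest --export_tflite --export_dir /model/export/destination
```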
### Making a mmap-able model for inference
The `output_graph.pb` model file generated in the above step is loaded fully into memory when running inference.
This results in extra loading time and memory consumption. One way to avoid this is to read the data directly from disk.
TensorFlow has tooling to achieve this: it requires building the target `//tensorflow/contrib/util:convert_graphdef_memmapped_format` (binaries are produced by our TaskCluster for some systems, including Linux/amd64 and macOS/amd64). Use the `util/taskcluster.py` tool to download it, specifying `tensorflow` as the source and `convert_graphdef_memmapped_format` as the artifact.
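A sketch of that download step, assuming `util/taskcluster.py` exposes `--source` and `--artifact` flags matching the wording above:
```bash
python3 util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --target .
```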
Producing a mmap-able model is as simple as:
```
$ convert_graphdef_memmapped_format --in_graph=output_graph.pb --out_graph=output_graph.pbmm
```
Upon a successful run, it should report the conversion of a non-zero number of nodes. If it reports converting `0` nodes, something is wrong: make sure your model is a frozen one, and that you have not applied any incompatible changes (this includes `quantize_weights`).
### Distributed training across more than one machine
DeepSpeech has built-in support for [distributed TensorFlow](https://www.tensorflow.org/deploy/distributed). To get an idea of how this works, you can use the script `bin/run-cluster.sh` to run a cluster with workers on the local machine.
```bash
$ bin/run-cluster.sh --help
Usage: run-cluster.sh [--help] [--script script] [p:w:g] <arg>*
--help print this help message
--script run the provided script instead of DeepSpeech.py
p number of local parameter servers
w number of local workers
g number of local GPUs per worker
<arg>* remaining parameters will be forwarded to DeepSpeech.py or a provided script
Example usage - The following example will create a local DeepSpeech.py cluster
with 1 parameter server, and 2 workers with 1 GPU each:
$ run-cluster.sh 1:2:1 --epoch 10
```
Be aware that for the help example to run, you need at least two CUDA-capable GPUs (2 workers x 1 GPU). The script uses the `CUDA_VISIBLE_DEVICES` environment variable so that `DeepSpeech.py` only sees the provided number of GPUs per worker.
The script is meant to be a template for your own distributed computing instrumentation. Just modify the startup code for the different servers (workers and parameter servers) accordingly. You could use SSH or something similar to run them on your remote hosts.
### Continuing training from a release model
If you'd like to use one of the pre-trained models released by Mozilla to bootstrap your training process (transfer learning, fine tuning), you can do so by using the `--checkpoint_dir` flag in `DeepSpeech.py`. Specify the path where you downloaded the checkpoint from the release, and training will resume from the pre-trained model.
For example, if you want to fine tune the entire graph using your own data in `my-train.csv`, `my-dev.csv` and `my-test.csv`, for three epochs, you can run something like the following, tuning the hyperparameters as needed:
```bash
mkdir fine_tuning_checkpoints
python3 DeepSpeech.py --n_hidden 2048 --checkpoint_dir path/to/checkpoint/folder --epoch -3 --train_files my-train.csv --dev_files my-dev.csv --test_files my-test.csv --learning_rate 0.0001
```
Note: the released models were trained with `--n_hidden 2048`, so you need to use that same value when initializing from the release models. Note as well the use of a negative epoch count -3 (meaning 3 more epochs) since the checkpoint you're loading from was already trained for several epochs.
## Contact/Getting Help
There are several ways to contact us or to get help:
1. [**FAQ**](https://github.com/mozilla/DeepSpeech/wiki#frequently-asked-questions) - We have a list of common questions, and their answers, in our [FAQ](https://github.com/mozilla/DeepSpeech/wiki#frequently-asked-questions). When just getting started, it's best to first check the [FAQ](https://github.com/mozilla/DeepSpeech/wiki#frequently-asked-questions) to see if your question is addressed.
2. [**Discourse Forums**](https://discourse.mozilla.org/c/deep-speech) - If your question is not addressed in the [FAQ](https://github.com/mozilla/DeepSpeech/wiki#frequently-asked-questions), the [Discourse Forums](https://discourse.mozilla.org/c/deep-speech) is the next place to look. They contain conversations on [General Topics](https://discourse.mozilla.org/t/general-topics/21075), [Using Deep Speech](https://discourse.mozilla.org/t/using-deep-speech/21076/4), and [Deep Speech Development](https://discourse.mozilla.org/t/deep-speech-development/21077).
3. [**IRC**](https://wiki.mozilla.org/IRC) - If your question is not addressed by either the [FAQ](https://github.com/mozilla/DeepSpeech/wiki#frequently-asked-questions) or [Discourse Forums](https://discourse.mozilla.org/c/deep-speech), you can contact us on the `#machinelearning` channel on [Mozilla IRC](https://wiki.mozilla.org/IRC); people there can try to answer or help.
4. [**Issues**](https://github.com/mozilla/deepspeech/issues) - Finally, if all else fails, you can open an issue in our repo.

README.rst Normal file

@ -0,0 +1,69 @@
.. image:: images/coqui-STT-logo-green.png
:alt: Coqui STT logo
.. |doc-img| image:: https://readthedocs.org/projects/stt/badge/?version=latest
:target: https://stt.readthedocs.io/?badge=latest
:alt: Documentation
.. |covenant-img| image:: https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg
:target: CODE_OF_CONDUCT.md
:alt: Contributor Covenant
.. |gitter-img| image:: https://badges.gitter.im/coqui-ai/STT.svg
:target: https://gitter.im/coqui-ai/STT?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge
:alt: Gitter Room
.. |doi| image:: https://zenodo.org/badge/344354127.svg
:target: https://zenodo.org/badge/latestdoi/344354127
|doc-img| |covenant-img| |gitter-img| |doi|
`👉 Subscribe to 🐸Coqui's Newsletter <https://coqui.ai/?subscription=true>`_
**Coqui STT** (🐸STT) is a fast, open-source, multi-platform, deep-learning toolkit for training and deploying speech-to-text models. 🐸STT is battle tested in both production and research 🚀
🐸STT features
---------------
* High-quality pre-trained STT model.
* Efficient training pipeline with Multi-GPU support.
* Streaming inference.
* Multiple possible transcripts, each with an associated confidence score.
* Real-time inference.
* Small-footprint acoustic model.
* Bindings for various programming languages.
Where to Ask Questions
----------------------
.. list-table::
:widths: 25 25
:header-rows: 1
* - Type
- Link
* - 🚨 **Bug Reports**
- `Github Issue Tracker <https://github.com/coqui-ai/STT/issues/>`_
* - 🎁 **Feature Requests & Ideas**
- `Github Issue Tracker <https://github.com/coqui-ai/STT/issues/>`_
* - ❔ **Questions**
- `Github Discussions <https://github.com/coqui-ai/stt/discussions/>`_
* - 💬 **General Discussion**
- `Github Discussions <https://github.com/coqui-ai/stt/discussions/>`_ or `Gitter Room <https://gitter.im/coqui-ai/STT?utm_source=share-link&utm_medium=link&utm_campaign=share-link>`_
Links & Resources
-----------------
.. list-table::
:widths: 25 25
:header-rows: 1
* - Type
- Link
* - 📰 **Documentation**
- `stt.readthedocs.io <https://stt.readthedocs.io/>`_
* - 🚀 **Latest release with pre-trained models**
- `see the latest release on GitHub <https://github.com/coqui-ai/STT/releases/latest>`_
* - 🤝 **Contribution Guidelines**
- `CONTRIBUTING.rst <CONTRIBUTING.rst>`_


@ -1,9 +0,0 @@
Making a (new) release of the codebase
======================================
- Update version in VERSION file, commit
- Open PR, ensure all tests are passing properly
- Merge the PR
- Fetch the new master, tag it with (hopefully) the same version as in VERSION
- Push that to Github
- New build should be triggered and new packages should be made
- TaskCluster should schedule a merge build **including** a "DeepSpeech Packages" task

RELEASE_NOTES.md Normal file

@ -0,0 +1,95 @@
# General
This is the 1.0.0 release for Coqui STT, the deep learning toolkit for speech-to-text. In accordance with [semantic versioning](https://semver.org/), this version is not completely backwards compatible with previous versions. The compatibility guarantees of our semantic versioning cover the inference APIs: the C API and all the official language bindings: Python, Node.JS/ElectronJS and Android. You can get started today with Coqui STT 1.0.0 by following the steps in our [documentation](https://stt.readthedocs.io/).
This release includes pre-trained English models, available in the Coqui Model Zoo:
- [Coqui English STT v1.0.0-huge-vocab](https://coqui.ai/english/coqui/v1.0.0-huge-vocab)
- [Coqui English STT v1.0.0-yesno](https://coqui.ai/english/coqui/v1.0.0-yesno)
- [Coqui English STT v1.0.0-large-vocab](https://coqui.ai/english/coqui/v1.0.0-large-vocab)
- [Coqui English STT v1.0.0-digits](https://coqui.ai/english/coqui/v1.0.0-digits)
all under the Apache 2.0 license.
The acoustic models were trained on American English data with synthetic noise augmentation. The model achieves a 4.5% word error rate on the [LibriSpeech clean test corpus](http://www.openslr.org/12) and 13.6% word error rate on the [LibriSpeech other test corpus](http://www.openslr.org/12) with the largest release language model.
Note that the model currently performs best in low-noise environments with clear recordings. This does not mean the model cannot be used outside of these conditions, but that accuracy may be lower. Some users may need to further fine tune the model to meet their intended use-case.
We also include example audio files:
[audio-1.0.0.tar.gz](https://github.com/coqui-ai/STT/releases/download/v1.0.0/audio-1.0.0.tar.gz)
which can be used to test the engine, and checkpoint files for the English model:
[coqui-stt-1.0.0-checkpoint.tar.gz](https://github.com/coqui-ai/STT/releases/download/v1.0.0/coqui-stt-1.0.0-checkpoint.tar.gz)
which are under the Apache 2.0 license and can be used as the basis for further fine-tuning. Finally, this release also includes a source code tarball:
[v1.0.0.tar.gz](https://github.com/coqui-ai/STT/archive/v1.0.0.tar.gz)
under the [MPL-2.0 license](https://www.mozilla.org/en-US/MPL/2.0/). Note that this tarball is for archival purposes only, since GitHub does not include submodules in the automatic tarballs. For usage and development with the source code, clone the repository using Git, following our [documentation](https://stt.readthedocs.io/).
# Notable changes
- Removed support for protocol buffer input in native client and consolidated all packages under a single "STT" name accepting TFLite inputs
- Added programmatic interface to training code and example Jupyter Notebooks, including how to train with Common Voice data
- Added transparent handling of mixed sample rates and stereo audio in training inputs
- Moved CI setup to GitHub Actions, making code contributions easier to test
- Added configuration management via Coqpit, providing a more flexible config interface that's compatible with Coqui TTS
- Handle Opus audio files transparently in training inputs
- Added support for automatic dataset subset splitting
- Added support for automatic alphabet generation and loading
- Started publishing the training code CI for a faster notebook setup
- Refactor training code into self-contained modules and deprecate train.py as universal entry point for training
# Training Regimen + Hyperparameters for fine-tuning
The hyperparameters used to train the model are useful for fine tuning. Thus, we document them here along with the training regimen, hardware used (a server with 8 NVIDIA A100 GPUs each with 40GB of VRAM), along with the full training hyperparameters. The full training configuration in JSON format is available [here](https://gist.github.com/reuben/6ced6a8b41e3d0849dafb7cae301e905).
The datasets used were:
- Common Voice 7.0 (with custom train/dev/test splits)
- Multilingual LibriSpeech (English, Opus)
- LibriSpeech
The optimal `lm_alpha` and `lm_beta` values with respect to the Common Voice 7.0 (custom Coqui splits) and a large vocabulary language model:
- lm_alpha: 0.5891777425167632
- lm_beta: 0.6619145283338659
# Documentation
Documentation is available on [stt.readthedocs.io](https://stt.readthedocs.io/).
# Contact/Getting Help
1. [GitHub Discussions](https://github.com/coqui-ai/STT/discussions/) - best place to ask questions, get support, and discuss anything related to 🐸STT with other users.
2. [Gitter](https://gitter.im/coqui-ai/) - You can also join our Gitter chat.
3. [Issues](https://github.com/coqui-ai/STT/issues) - If you have discussed a problem and identified a bug in 🐸STT, or if you have a feature request, please open an issue in our repo. Please make sure you search for an already existing issue beforehand!
# Contributors to 1.0.0 release
- Alexandre Lissy
- Anon-Artist
- Anton Yaroshenko
- Catalin Voss
- CatalinVoss
- dag7dev
- Dustin Zubke
- Eren Gölge
- Erik Ziegler
- Francis Tyers
- Ideefixze
- Ilnar Salimzianov
- imrahul3610
- Jeremiah Rose
- Josh Meyer
- Kathy Reid
- Kelly Davis
- Kenneth Heafield
- NanoNabla
- Neil Stoker
- Reuben Morais
- zaptrem
We'd also like to thank all the members of our [Gitter chat room](https://gitter.im/coqui-ai/STT) who have been helping to shape this release!


@ -1 +0,0 @@
0.5.0-alpha.3

VERSION Symbolic link

@ -0,0 +1 @@
training/coqui_stt_training/VERSION


@ -9,23 +9,23 @@ index c7aa4cb63..e084bc27c 100644
+import java.io.PrintWriter;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@@ -73,6 +74,8 @@ public final class FileWriteAction extends AbstractFileWriteAction {
*/
private final CharSequence fileContents;
+ private final Artifact output;
+
/** Minimum length (in chars) for content to be eligible for compression. */
private static final int COMPRESS_CHARS_THRESHOLD = 256;
@@ -90,6 +93,7 @@ public final class FileWriteAction extends AbstractFileWriteAction {
fileContents = new CompressedString((String) fileContents);
}
this.fileContents = fileContents;
+ this.output = output;
}
/**
@@ -230,11 +234,32 @@ public final class FileWriteAction extends AbstractFileWriteAction {
*/
@ -59,7 +59,7 @@ index c7aa4cb63..e084bc27c 100644
+ computeKeyDebugWriter.close();
+ return rv;
}
/**
diff --git a/src/main/java/com/google/devtools/build/lib/analysis/actions/SpawnAction.java b/src/main/java/com/google/devtools/build/lib/analysis/actions/SpawnAction.java
index 580788160..26883eb92 100644
@ -74,9 +74,9 @@ index 580788160..26883eb92 100644
import java.util.Collections;
import java.util.LinkedHashMap;
@@ -91,6 +92,9 @@ public class SpawnAction extends AbstractAction implements ExecutionInfoSpecifie
private final CommandLine argv;
+ private final Iterable<Artifact> inputs;
+ private final Iterable<Artifact> outputs;
+
@ -91,10 +91,10 @@ index 580788160..26883eb92 100644
+ this.inputs = inputs;
+ this.outputs = outputs;
}
@Override
@@ -312,23 +319,89 @@ public class SpawnAction extends AbstractAction implements ExecutionInfoSpecifie
@Override
protected String computeKey() {
+ boolean genruleSetup = String.valueOf(Iterables.get(inputs, 0).getExecPath()).contains("genrule/genrule-setup.sh");
@ -182,14 +182,14 @@ index 580788160..26883eb92 100644
+ }
+ return rv;
}
@Override
diff --git a/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java b/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
index 3559fffde..3ba39617c 100644
--- a/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
+++ b/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
@@ -1111,10 +1111,30 @@ public class CppCompileAction extends AbstractAction
@Override
public String computeKey() {
+ // ".ckd" Compute Key Debug
@ -216,7 +216,7 @@ index 3559fffde..3ba39617c 100644
+ for (Map.Entry<String, String> entry : executionInfo.entrySet()) {
+ computeKeyDebugWriter.println("EXECINFO: " + entry.getKey() + "=" + entry.getValue());
+ }
// For the argv part of the cache key, ignore all compiler flags that explicitly denote module
// file (.pcm) inputs. Depending on input discovery, some of the unused ones are removed from
@@ -1124,6 +1144,9 @@ public class CppCompileAction extends AbstractAction
@ -226,7 +226,7 @@ index 3559fffde..3ba39617c 100644
+ for (String input : compileCommandLine.getArgv(getInternalOutputFile(), null)) {
+ computeKeyDebugWriter.println("COMMAND: " + input);
+ }
/*
* getArgv() above captures all changes which affect the compilation
@@ -1133,19 +1156,31 @@ public class CppCompileAction extends AbstractAction
@ -260,5 +260,5 @@ index 3559fffde..3ba39617c 100644
+ computeKeyDebugWriter.close();
+ return rv;
}
@Override


@ -1,3 +1,4 @@
# Utility scripts
Utility scripts
===============
This folder contains scripts that can be used to do training on the various included importers from the command line. This is useful to be able to run training without a browser open, or unattended on a remote machine. They should be run from the base directory of the repository. Note that the default settings assume a very well-specified machine. In the situation that out-of-memory errors occur, you may find decreasing the values of `--train_batch_size`, `--dev_batch_size` and `--test_batch_size` will allow you to continue, at the expense of speed.
This folder contains scripts that can be used to do training on the various included importers from the command line. This is useful to be able to run training without a browser open, or unattended on a remote machine. They should be run from the base directory of the repository. Note that the default settings assume a very well-specified machine. In the situation that out-of-memory errors occur, you may find decreasing the values of ``--train_batch_size``\ , ``--dev_batch_size`` and ``--test_batch_size`` will allow you to continue, at the expense of speed.


@ -1,506 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import os
import sys
# To use util.tc
sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(sys.argv[0]))))
import util.taskcluster as tcu
from util.benchmark import keep_only_digits
import paramiko
import argparse
import tempfile
import shutil
import subprocess
import stat
import numpy
import matplotlib.pyplot as plt
import scipy.stats as scipy_stats
import csv
import getpass
import zipfile
from six import iteritems
from six.moves import range, map
r'''
Tool to:
- run local or remote (ssh) native_client
- handles copying models (as protocolbuffer files)
- run native_client in benchmark mode
- collect timing results
- compute mean values (with variances)
- output as CSV
'''
ssh_conn = None
def exec_command(command, cwd=None):
r'''
Helper to exec locally (subprocess) or remotely (paramiko)
'''
rc = None
stdout = stderr = None
if ssh_conn is None:
ld_library_path = {'LD_LIBRARY_PATH': '.:%s' % os.environ.get('LD_LIBRARY_PATH', '')}
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=ld_library_path, cwd=cwd)
stdout, stderr = p.communicate()
rc = p.returncode
else:
# environment= requires paramiko >= 2.1 (fails with 2.0.2)
final_command = command if cwd is None else 'cd %s && %s %s' % (cwd, 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH', command)
ssh_stdin, ssh_stdout, ssh_stderr = ssh_conn.exec_command(final_command)
stdout = ''.join(ssh_stdout.readlines())
stderr = ''.join(ssh_stderr.readlines())
rc = ssh_stdout.channel.recv_exit_status()
return rc, stdout, stderr
def assert_valid_dir(dir):
if dir is None:
raise AssertionError('Invalid temp directory')
return True
def get_arch_string():
r'''
Check local or remote system arch, to produce TaskCluster proper link.
'''
rc, stdout, stderr = exec_command('uname -sm')
if rc > 0:
raise AssertionError('Error checking OS')
stdout = stdout.lower().strip()
if not 'linux' in stdout:
raise AssertionError('Unsupported OS')
if 'armv7l' in stdout:
return 'arm'
if 'x86_64' in stdout:
nv_rc, nv_stdout, nv_stderr = exec_command('nvidia-smi')
nv_stdout = nv_stdout.lower().strip()
if 'NVIDIA-SMI' in nv_stdout:
return 'gpu'
else:
return 'cpu'
raise AssertionError('Unsupported arch:', stdout)
def maybe_download_binaries(dir):
assert_valid_dir(dir)
tcu.maybe_download_tc(target_dir=dir, tc_url=tcu.get_tc_url(get_arch_string()), progress=True)
def extract_native_client_tarball(dir):
r'''
Download a native_client.tar.xz file from TaskCluster and extract it to dir.
'''
assert_valid_dir(dir)
target_tarball = os.path.join(dir, 'native_client.tar.xz')
if os.path.isfile(target_tarball) and os.stat(target_tarball).st_size == 0:
return
subprocess.check_call(['pixz', '-d', 'native_client.tar.xz'], cwd=dir)
subprocess.check_call(['tar', 'xf', 'native_client.tar'], cwd=dir)
os.unlink(os.path.join(dir, 'native_client.tar'))
open(target_tarball, 'w').close()
def is_zip_file(models):
r'''
Ensure that a path is a zip file by:
- checking length is 1
- checking extension is '.zip'
'''
ext = os.path.splitext(models[0])[1]
return (len(models) == 1) and (ext == '.zip')
def maybe_inspect_zip(models):
r'''
Detect if models is a list of protocolbuffer files or a ZIP file.
If the latter, then unzip it and return the list of protocolbuffer files
that were inside.
'''
if not(is_zip_file(models)):
return models
if len(models) > 1:
return models
if len(models) < 1:
raise AssertionError('No models at all')
return zipfile.ZipFile(models[0]).namelist()
def all_files(models=[]):
r'''
Return a list of full path of files matching 'models', sorted in human
numerical order (i.e., 0 1 2 ..., 10 11 12, ..., 100, ..., 1000).
Files are supposed to be named identically except one variable component
e.g. the list,
test.weights.e5.lstm1200.ldc93s1.pb
test.weights.e5.lstm1000.ldc93s1.pb
test.weights.e5.lstm800.ldc93s1.pb
gets sorted:
test.weights.e5.lstm800.ldc93s1.pb
test.weights.e5.lstm1000.ldc93s1.pb
test.weights.e5.lstm1200.ldc93s1.pb
'''
def nsort(a, b):
fa = os.path.basename(a).split('.')
fb = os.path.basename(b).split('.')
elements_to_remove = []
assert len(fa) == len(fb)
for i in range(0, len(fa)):
if fa[i] == fb[i]:
elements_to_remove.append(fa[i])
for e in elements_to_remove:
fa.remove(e)
fb.remove(e)
assert len(fa) == len(fb)
assert len(fa) == 1
fa = keep_only_digits(fa[0])
fb = keep_only_digits(fb[0])
if fa < fb:
return -1
if fa == fb:
return 0
if fa > fb:
return 1
base = list(map(lambda x: os.path.abspath(x), maybe_inspect_zip(models)))
base.sort(cmp=nsort)
return base
def copy_tree(dir):
assert_valid_dir(dir)
sftp = ssh_conn.open_sftp()
# IOError will get triggered if the path does not exists remotely
try:
if stat.S_ISDIR(sftp.stat(dir).st_mode):
print('Directory already existent: %s' % dir)
except IOError:
print('Creating remote directory: %s' % dir)
sftp.mkdir(dir)
print('Copy files to remote')
for fname in os.listdir(dir):
fullpath = os.path.join(dir, fname)
local_stat = os.stat(fullpath)
try:
remote_mode = sftp.stat(fullpath).st_mode
except IOError:
remote_mode = 0
if not stat.S_ISREG(remote_mode):
print('Copying %s ...' % fullpath)
remote_mode = sftp.put(fullpath, fullpath, confirm=True).st_mode
if local_stat.st_mode != remote_mode:
print('Setting proper remote mode: %s' % local_stat.st_mode)
sftp.chmod(fullpath, local_stat.st_mode)
sftp.close()
def delete_tree(dir):
assert_valid_dir(dir)
sftp = ssh_conn.open_sftp()
# IOError will get triggered if the path does not exists remotely
try:
if stat.S_ISDIR(sftp.stat(dir).st_mode):
print('Removing remote files')
for fname in sftp.listdir(dir):
fullpath = os.path.join(dir, fname)
remote_stat = sftp.stat(fullpath)
if stat.S_ISREG(remote_stat.st_mode):
print('Removing %s ...' % fullpath)
sftp.remove(fullpath)
print('Removing directory %s ...' % dir)
sftp.rmdir(dir)
sftp.close()
except IOError:
print('No remote directory: %s' % dir)
def setup_tempdir(dir, models, wav, alphabet, lm_binary, trie, binaries):
r'''
Copy models, libs and binary to a directory (new one if dir is None)
'''
if dir is None:
dir = tempfile.mkdtemp(suffix='dsbench')
sorted_models = all_files(models=models)
if binaries is None:
maybe_download_binaries(dir)
else:
print('Using local binaries: %s' % (binaries))
shutil.copy2(binaries, dir)
extract_native_client_tarball(dir)
filenames = map(lambda x: os.path.join(dir, os.path.basename(x)), sorted_models)
missing_models = filter(lambda x: not os.path.isfile(x), filenames)
if len(missing_models) > 0:
# If we have a ZIP file, directly extract it to the proper path
if is_zip_file(models):
print('Extracting %s to %s' % (models[0], dir))
zipfile.ZipFile(models[0]).extractall(path=dir)
print('Extracted %s.' % models[0])
else:
# If one model is missing, let's copy everything again. Be safe.
for f in sorted_models:
print('Copying %s to %s' % (f, dir))
shutil.copy2(f, dir)
for extra_file in [ wav, alphabet, lm_binary, trie ]:
if extra_file and not os.path.isfile(os.path.join(dir, os.path.basename(extra_file))):
print('Copying %s to %s' % (extra_file, dir))
shutil.copy2(extra_file, dir)
if ssh_conn:
copy_tree(dir)
return dir, sorted_models
def teardown_tempdir(dir):
r'''
Cleanup temporary directory.
'''
if ssh_conn:
delete_tree(dir)
assert_valid_dir(dir)
shutil.rmtree(dir)
def get_sshconfig():
r'''
Read user's SSH configuration file
'''
with open(os.path.expanduser('~/.ssh/config')) as f:
cfg = paramiko.SSHConfig()
cfg.parse(f)
ret_dict = {}
for d in cfg._config:
_copy = dict(d)
# Avoid buggy behavior with strange host definitions, we need
# Hostname and not Host.
del _copy['host']
for host in d['host']:
ret_dict[host] = _copy['config']
return ret_dict
def establish_ssh(target=None, auto_trust=False, allow_agent=True, look_keys=True):
r'''
Establish a SSH connection to a remote host. It should be able to use
SSH's config file Host name declarations. By default, will not automatically
add trust for hosts, will use SSH agent and will try to load keys.
'''
def password_prompt(username, hostname):
r'''
If the Host is relying on password authentication, lets ask it.
Relying on SSH itself to take care of that would not work when the
remote authentication is password behind a SSH-key+2FA jumphost.
'''
return getpass.getpass('No SSH key for %s@%s, please provide password: ' % (username, hostname))
ssh_conn = None
if target is not None:
ssh_conf = get_sshconfig()
cfg = {
'hostname': None,
'port': 22,
'allow_agent': allow_agent,
'look_for_keys': look_keys
}
if ssh_conf.has_key(target):
user_config = ssh_conf.get(target)
# If ssh_config file's Host defined 'User' instead of 'Username'
if user_config.has_key('user') and not user_config.has_key('username'):
user_config['username'] = user_config['user']
del user_config['user']
for k in ('username', 'hostname', 'port'):
if k in user_config:
cfg[k] = user_config[k]
# Assume Password auth. If we don't do that, then when connecting
# through a jumphost we will run into issues and the user will
# not be able to input his password to the SSH prompt.
if 'identityfile' in user_config:
cfg['key_filename'] = user_config['identityfile']
else:
cfg['password'] = password_prompt(cfg['username'], cfg['hostname'] or target)
# Should be the last one, since ProxyCommand will issue connection to remote host
if 'proxycommand' in user_config:
cfg['sock'] = paramiko.ProxyCommand(user_config['proxycommand'])
else:
cfg['username'] = target.split('@')[0]
cfg['hostname'] = target.split('@')[1].split(':')[0]
cfg['password'] = password_prompt(cfg['username'], cfg['hostname'])
try:
cfg['port'] = int(target.split('@')[1].split(':')[1])
except IndexError:
# IndexError will happen if no :PORT is there.
# Default value 22 is defined above in 'cfg'.
pass
ssh_conn = paramiko.SSHClient()
if auto_trust:
ssh_conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_conn.connect(**cfg)
return ssh_conn
def run_benchmarks(dir, models, wav, alphabet, lm_binary=None, trie=None, iters=-1):
r'''
Core of the running of the benchmarks. We will run on all of models, against
the WAV file provided as wav, and the provided alphabet.
'''
assert_valid_dir(dir)
inference_times = [ ]
for model in models:
model_filename = model
current_model = {
'name': model,
'iters': [ ],
'mean': numpy.infty,
'stddev': numpy.infty
}
if lm_binary and trie:
cmdline = './deepspeech --model "%s" --alphabet "%s" --lm "%s" --trie "%s" --audio "%s" -t' % (model_filename, alphabet, lm_binary, trie, wav)
else:
cmdline = './deepspeech --model "%s" --alphabet "%s" --audio "%s" -t' % (model_filename, alphabet, wav)
for it in range(iters):
sys.stdout.write('\rRunning %s: %d/%d' % (os.path.basename(model), (it+1), iters))
sys.stdout.flush()
rc, stdout, stderr = exec_command(cmdline, cwd=dir)
if rc == 0:
inference_time = float(stdout.split('\n')[1].split('=')[-1])
# print("[%d] model=%s inference=%f" % (it, model, inference_time))
current_model['iters'].append(inference_time)
else:
print('exec_command("%s") failed with rc=%d' % (cmdline, rc))
print('stdout: %s' % stdout)
print('stderr: %s' % stderr)
raise AssertionError('Execution failure: rc=%d' % (rc))
sys.stdout.write('\n')
sys.stdout.flush()
current_model['mean'] = numpy.mean(current_model['iters'])
current_model['stddev'] = numpy.std(current_model['iters'])
inference_times.append(current_model)
return inference_times
def produce_csv(input, output):
r'''
Take an input dictionary and write it to the object-file output.
'''
output.write('"model","mean","std"\n')
for model_data in input:
output.write('"%s",%f,%f\n' % (model_data['name'], model_data['mean'], model_data['stddev']))
output.flush()
output.close()
print("Wrote as %s" % output.name)
def handle_args():
parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
parser.add_argument('--target', required=False,
help='SSH user:pass@host string for remote benchmarking. This can also be a name of a matching \'Host\' in your SSH config.')
parser.add_argument('--autotrust', action='store_true', default=False,
help='SSH Paramiko policy to automatically trust unknown keys.')
parser.add_argument('--allowagent', action='store_true', dest='allowagent',
help='Allow the use of a SSH agent.')
parser.add_argument('--no-allowagent', action='store_false', dest='allowagent',
help='Disallow the use of a SSH agent.')
parser.add_argument('--lookforkeys', action='store_true', dest='lookforkeys',
help='Allow to look for SSH keys in ~/.ssh/.')
parser.add_argument('--no-lookforkeys', action='store_false', dest='lookforkeys',
help='Disallow to look for SSH keys in ~/.ssh/.')
parser.add_argument('--dir', required=False, default=None,
help='Local directory where to copy stuff. This will be mirrored to the remote system if needed (make sure to use path that will work on both).')
parser.add_argument('--models', nargs='+', required=False,
help='List of files (protocolbuffer) to work on. Might be a zip file.')
parser.add_argument('--wav', required=False,
help='WAV file to pass to native_client. Supply again in plotting mode to draw realtime line.')
parser.add_argument('--alphabet', required=False,
help='Text file to pass to native_client for the alphabet.')
parser.add_argument('--lm_binary', required=False,
help='Path to the LM binary file used by the decoder.')
parser.add_argument('--trie', required=False,
help='Path to the trie file used by the decoder.')
parser.add_argument('--iters', type=int, required=False, default=5,
help='How many iterations to perform on each model.')
parser.add_argument('--keep', required=False, action='store_true',
help='Keeping run files (binaries & models).')
parser.add_argument('--csv', type=argparse.FileType('w'), required=False,
help='Target CSV file where to dump data.')
parser.add_argument('--binaries', required=False, default=None,
help='Specify non TaskCluster native_client.tar.xz to use')
return parser.parse_args()
def do_main():
cli_args = handle_args()
if not cli_args.models or not cli_args.wav or not cli_args.alphabet:
raise AssertionError('Missing arguments (models, wav or alphabet)')
if cli_args.dir is not None and not os.path.isdir(cli_args.dir):
raise AssertionError('Inexistent temp directory')
if cli_args.binaries is not None and cli_args.binaries.find('native_client.tar.xz') == -1:
raise AssertionError('Local binaries must be bundled in a native_client.tar.xz file')
global ssh_conn
ssh_conn = establish_ssh(target=cli_args.target, auto_trust=cli_args.autotrust, allow_agent=cli_args.allowagent, look_keys=cli_args.lookforkeys)
tempdir, sorted_models = setup_tempdir(dir=cli_args.dir, models=cli_args.models, wav=cli_args.wav, alphabet=cli_args.alphabet, lm_binary=cli_args.lm_binary, trie=cli_args.trie, binaries=cli_args.binaries)
dest_sorted_models = list(map(lambda x: os.path.join(tempdir, os.path.basename(x)), sorted_models))
dest_wav = os.path.join(tempdir, os.path.basename(cli_args.wav))
dest_alphabet = os.path.join(tempdir, os.path.basename(cli_args.alphabet))
if cli_args.lm_binary and cli_args.trie:
dest_lm_binary = os.path.join(tempdir, os.path.basename(cli_args.lm_binary))
dest_trie = os.path.join(tempdir, os.path.basename(cli_args.trie))
inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, alphabet=dest_alphabet, lm_binary=dest_lm_binary, trie=dest_trie, iters=cli_args.iters)
else:
inference_times = run_benchmarks(dir=tempdir, models=dest_sorted_models, wav=dest_wav, alphabet=dest_alphabet, iters=cli_args.iters)
if cli_args.csv:
produce_csv(input=inference_times, output=cli_args.csv)
if not cli_args.keep:
teardown_tempdir(dir=tempdir)
if __name__ == '__main__' :
do_main()


@ -1,146 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import os
import sys
# To use util.tc
sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(sys.argv[0]))))
import util.taskcluster as tcu
from util.benchmark import keep_only_digits
import argparse
import numpy
import matplotlib.pyplot as plt
import scipy.stats as scipy_stats
import scipy.io.wavfile as wav
import csv
import getpass
from six import iteritems
from six.moves import range, map
r'''
Tool to:
- ingest CSV file produced by benchmark_nc and produce nice plots
'''
def reduce_filename(f):
r'''
Expects something like /tmp/tmpAjry4Gdsbench/test.weights.e5.XXX.YYY.pb
Where XXX is a variation on the model size for example
And where YYY is a const related to the training dataset
'''
f = os.path.basename(f).split('.')
return keep_only_digits(f[-3])
def ingest_csv(datasets=None, range=None):
existing_files = filter(lambda x: os.path.isfile(x[1]), datasets)
assert len(datasets) == len(existing_files)
if range:
range = map(int, range.split(','))
data = {}
for (dsname, dsfile) in datasets:
print('Reading %s from %s' % (dsname, dsfile))
with open(dsfile) as f:
d = csv.DictReader(f)
data[dsname] = []
for e in d:
if range:
re = reduce_filename(e['model'])
in_range = (re >= range[0] and re <= range[1])
if in_range:
data[dsname].append(e)
else:
data[dsname].append(e)
return data
def produce_plot(input=None, output=None):
x = range(len(input))
xlabels = list(map(lambda a: a['name'], input))
y = list(map(lambda a: a['mean'], input))
yerr = list(map(lambda a: a['stddev'], input))
print('y=', y)
print('yerr=', yerr)
plt.errorbar(x, y, yerr=yerr)
plt.show()
print("Wrote as %s" % output.name)
def produce_plot_multiseries(input=None, output=None, title=None, size=None, fig_dpi=None, source_wav=None):
fig, ax = plt.subplots()
# float() required because size.split()[] is a string
fig.set_figwidth(float(size.split('x')[0]) / fig_dpi)
fig.set_figheight(float(size.split('x')[1]) / fig_dpi)
nb_items = len(input[input.keys()[0]])
x_all = list(range(nb_items))
for serie, serie_values in iteritems(input):
xtics = list(map(lambda a: reduce_filename(a['model']), serie_values))
y = list(map(lambda a: float(a['mean']), serie_values))
yerr = list(map(lambda a: float(a['std']), serie_values))
linreg = scipy_stats.linregress(x_all, y)
ylin = linreg.intercept + linreg.slope * numpy.asarray(x_all)
ax.errorbar(x_all, y, yerr=yerr, label=('%s' % serie), fmt='-', capsize=4, elinewidth=1)
ax.plot(x_all, ylin, label=('%s ~= %0.4f*x+%0.4f (R=%0.4f)' % (serie, linreg.slope, linreg.intercept, linreg.rvalue)))
plt.xticks(x_all, xtics, rotation=60)
if source_wav:
audio = wav.read(source_wav)
print('Adding realtime')
for rt_factor in [ 0.5, 1.0, 1.5, 2.0 ]:
rt_secs = len(audio[1]) / audio[0] * rt_factor
y_rt = numpy.repeat(rt_secs, nb_items)
ax.plot(x_all, y_rt, label=('Realtime: %0.4f secs [%0.1f]' % (rt_secs, rt_factor)))
ax.set_title(title)
ax.set_xlabel('Model size')
ax.set_ylabel('Execution time (s)')
legend = ax.legend(loc='best')
plot_format = os.path.splitext(output.name)[-1].split('.')[-1]
plt.grid()
plt.tight_layout()
plt.savefig(output, transparent=False, frameon=True, dpi=fig_dpi, format=plot_format)
def handle_args():
parser = argparse.ArgumentParser(description='Benchmarking tooling for DeepSpeech native_client.')
parser.add_argument('--wav', required=False,
help='WAV file to pass to native_client. Supply again in plotting mode to draw realtime line.')
parser.add_argument('--dataset', action='append', nargs=2, metavar=('name','source'),
help='Include dataset NAME from file SOURCE. Repeat the option to add more datasets.')
parser.add_argument('--title', default=None, help='Title of the plot.')
parser.add_argument('--plot', type=argparse.FileType('w'), required=False,
help='Target file where to plot data. Format will be deduced from extension.')
parser.add_argument('--size', default='800x600',
help='Size (px) of the resulting plot.')
parser.add_argument('--dpi', type=int, default=96,
help='Set plot DPI.')
parser.add_argument('--range', default=None,
help='Range of model size to use. Comma-separated string of boundaries: min,max')
return parser.parse_args()
def do_main():
cli_args = handle_args()
if not cli_args.dataset or not cli_args.plot:
raise AssertionError('Missing arguments (dataset or target file)')
# This is required to avoid errors about missing DISPLAY env var
plt.switch_backend('agg')
all_inference_times = ingest_csv(datasets=cli_args.dataset, range=cli_args.range)
if cli_args.plot:
produce_plot_multiseries(input=all_inference_times, output=cli_args.plot, title=cli_args.title, size=cli_args.size, fig_dpi=cli_args.dpi, source_wav=cli_args.wav)
if __name__ == '__main__' :
do_main()

bin/compare_samples.py Executable file

@ -0,0 +1,85 @@
#!/usr/bin/env python
"""
Tool for comparing two wav samples
"""
import argparse
import sys
import numpy as np
from coqui_stt_training.util.audio import AUDIO_TYPE_NP, mean_dbfs
from coqui_stt_training.util.sample_collections import load_sample
def fail(message):
print(message, file=sys.stderr, flush=True)
sys.exit(1)
def compare_samples():
sample1 = load_sample(CLI_ARGS.sample1).unpack()
sample2 = load_sample(CLI_ARGS.sample2).unpack()
if sample1.audio_format != sample2.audio_format:
fail(
"Samples differ on: audio-format ({} and {})".format(
sample1.audio_format, sample2.audio_format
)
)
if abs(sample1.duration - sample2.duration) > 0.001:
fail(
"Samples differ on: duration ({} and {})".format(
sample1.duration, sample2.duration
)
)
sample1.change_audio_type(AUDIO_TYPE_NP)
sample2.change_audio_type(AUDIO_TYPE_NP)
samples = [sample1, sample2]
largest = np.argmax([sample1.audio.shape[0], sample2.audio.shape[0]])
smallest = (largest + 1) % 2
samples[largest].audio = samples[largest].audio[: len(samples[smallest].audio)]
audio_diff = samples[largest].audio - samples[smallest].audio
diff_dbfs = mean_dbfs(audio_diff)
differ_msg = "Samples differ on: sample data ({:0.2f} dB difference) ".format(
diff_dbfs
)
equal_msg = "Samples are considered equal ({:0.2f} dB difference)".format(diff_dbfs)
if CLI_ARGS.if_differ:
if diff_dbfs <= CLI_ARGS.threshold:
fail(equal_msg)
if not CLI_ARGS.no_success_output:
print(differ_msg, file=sys.stderr, flush=True)
else:
if diff_dbfs > CLI_ARGS.threshold:
fail(differ_msg)
if not CLI_ARGS.no_success_output:
print(equal_msg, file=sys.stderr, flush=True)
def handle_args():
parser = argparse.ArgumentParser(
description="Tool for checking similarity of two samples"
)
parser.add_argument("sample1", help="Filename of sample 1 to compare")
parser.add_argument("sample2", help="Filename of sample 2 to compare")
parser.add_argument(
"--threshold",
type=float,
default=-60.0,
help="dB of sample deltas above which they are considered different",
)
parser.add_argument(
"--if-differ",
action="store_true",
help="If to succeed and return status code 0 on different signals and fail on equal ones (inverse check)."
"This will still fail on different formats or durations.",
)
parser.add_argument(
"--no-success-output",
action="store_true",
help="Stay silent on success (if samples are equal of - with --if-differ - samples are not equal)",
)
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
compare_samples()

bin/data_set_tool.py Executable file

@ -0,0 +1,136 @@
#!/usr/bin/env python
"""
Tool for building a combined SDB or CSV sample-set from other sets
Use 'python3 data_set_tool.py -h' for help
"""
import argparse
import sys
from pathlib import Path
import progressbar
from coqui_stt_training.util.audio import (
AUDIO_TYPE_OPUS,
AUDIO_TYPE_PCM,
AUDIO_TYPE_WAV,
change_audio_types,
)
from coqui_stt_training.util.augmentations import (
SampleAugmentation,
apply_sample_augmentations,
parse_augmentations,
)
from coqui_stt_training.util.downloader import SIMPLE_BAR
from coqui_stt_training.util.sample_collections import (
CSVWriter,
DirectSDBWriter,
TarWriter,
samples_from_sources,
)
AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}
def build_data_set():
audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
augmentations = parse_augmentations(CLI_ARGS.augment)
print(f"Parsed augmentations from flags: {augmentations}")
if any(not isinstance(a, SampleAugmentation) for a in augmentations):
print(
"Warning: Some of the specified augmentations will not get applied, as this tool only supports "
"overlay, codec, reverb, resample and volume."
)
extension = Path(CLI_ARGS.target).suffix.lower()
labeled = not CLI_ARGS.unlabeled
if extension == ".csv":
writer = CSVWriter(
CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled
)
elif extension == ".sdb":
writer = DirectSDBWriter(
CLI_ARGS.target, audio_type=audio_type, labeled=labeled
)
elif extension == ".tar":
writer = TarWriter(
CLI_ARGS.target, labeled=labeled, gz=False, include=CLI_ARGS.include
)
elif extension == ".tgz" or CLI_ARGS.target.lower().endswith(".tar.gz"):
writer = TarWriter(
CLI_ARGS.target, labeled=labeled, gz=True, include=CLI_ARGS.include
)
else:
print(
"Unknown extension of target file - has to be either .csv, .sdb, .tar, .tar.gz or .tgz"
)
sys.exit(1)
with writer:
samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
num_samples = len(samples)
if augmentations:
samples = apply_sample_augmentations(
samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations
)
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for sample in bar(
change_audio_types(
samples,
audio_type=audio_type,
bitrate=CLI_ARGS.bitrate,
processes=CLI_ARGS.workers,
)
):
writer.add(sample)
def handle_args():
parser = argparse.ArgumentParser(
description="Tool for building a combined SDB or CSV sample-set from other sets"
)
parser.add_argument(
"sources",
nargs="+",
help="Source CSV and/or SDB files - "
"Note: For getting a correctly ordered target set, source SDBs have to have their samples "
"already ordered from shortest to longest.",
)
parser.add_argument("target", help="SDB, CSV or TAR(.gz) file to create")
parser.add_argument(
"--audio-type",
default="opus",
choices=AUDIO_TYPE_LOOKUP.keys(),
help="Audio representation inside target SDB",
)
parser.add_argument(
"--bitrate",
type=int,
help="Bitrate for lossy compressed SDB samples like in case of --audio-type opus",
)
parser.add_argument(
"--workers", type=int, default=None, help="Number of encoding SDB workers"
)
parser.add_argument(
"--unlabeled",
action="store_true",
help="If to build an data-set with unlabeled (audio only) samples - "
"typically used for building noise augmentation corpora",
)
parser.add_argument(
"--absolute-paths",
action="store_true",
help="If to reference samples by their absolute paths when writing CSV files",
)
parser.add_argument(
"--augment",
action="append",
help="Add an augmentation operation",
)
parser.add_argument(
"--include",
action="append",
help="Adds a file to the root directory of .tar(.gz) targets",
)
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
build_data_set()


@ -1,11 +0,0 @@
#!/usr/bin/env python
import sys
import os
sys.path.append(os.path.abspath('.'))
from util.gpu_usage import GPUUsage
gu = GPUUsage()
gu.start()


@ -1,10 +0,0 @@
#!/usr/bin/env python
import sys
import os
sys.path.append(os.path.abspath('.'))
from util.gpu_usage import GPUUsageChart
GPUUsageChart(sys.argv[1], sys.argv[2])


@ -1,14 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf
import sys
# Load and export as string
with tf.gfile.FastGFile(sys.argv[1], 'rb') as fin:
graph_def = tf.GraphDef()
graph_def.ParseFromString(fin.read())
import tensorflow.compat.v1 as tfv1
from google.protobuf import text_format
with tf.gfile.FastGFile(sys.argv[1] + 'txt', 'w') as fout:
from google.protobuf import text_format
fout.write(text_format.MessageToString(graph_def))
def main():
# Load and export as string
with tfv1.gfile.FastGFile(sys.argv[1], "rb") as fin:
graph_def = tfv1.GraphDef()
graph_def.ParseFromString(fin.read())
with tfv1.gfile.FastGFile(sys.argv[1] + "txt", "w") as fout:
fout.write(text_format.MessageToString(graph_def))
if __name__ == "__main__":
main()

bin/import_aidatatang.py Executable file

@ -0,0 +1,97 @@
#!/usr/bin/env python
import glob
import os
import tarfile
import pandas
from coqui_stt_training.util.importers import get_importers_parser
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
def extract(archive_path, target_dir):
print("Extracting {} into {}...".format(archive_path, target_dir))
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
def preprocess_data(tgz_file, target_dir):
# First extract main archive and sub-archives
extract(tgz_file, target_dir)
main_folder = os.path.join(target_dir, "aidatatang_200zh")
for targz in glob.glob(os.path.join(main_folder, "corpus", "*", "*.tar.gz")):
extract(targz, os.path.dirname(targz))
# Folder structure is now:
# - aidatatang_200zh/
# - transcript/aidatatang_200_zh_transcript.txt
# - corpus/train/*.tar.gz
# - corpus/train/*/*.{wav,txt,trn,metadata}
# - corpus/dev/*.tar.gz
# - corpus/dev/*/*.{wav,txt,trn,metadata}
# - corpus/test/*.tar.gz
# - corpus/test/*/*.{wav,txt,trn,metadata}
# Transcripts file has one line per WAV file, where each line consists of
# the WAV file name without extension followed by a single space followed
# by the transcript.
# Since the transcripts themselves can contain spaces, we split on space but
# only once, then build a mapping from file name to transcript
transcripts_path = os.path.join(
main_folder, "transcript", "aidatatang_200_zh_transcript.txt"
)
with open(transcripts_path) as fin:
transcripts = dict((line.split(" ", maxsplit=1) for line in fin))
def load_set(glob_path):
set_files = []
for wav in glob.glob(glob_path):
try:
wav_filename = wav
wav_filesize = os.path.getsize(wav)
transcript_key = os.path.splitext(os.path.basename(wav))[0]
transcript = transcripts[transcript_key].strip("\n")
set_files.append((wav_filename, wav_filesize, transcript))
except KeyError:
print("Warning: Missing transcript for WAV file {}.".format(wav))
return set_files
for subset in ("train", "dev", "test"):
print("Loading {} set samples...".format(subset))
subset_files = load_set(
os.path.join(main_folder, "corpus", subset, "*", "*.wav")
)
df = pandas.DataFrame(data=subset_files, columns=COLUMN_NAMES)
# Trim train set to under 10s by removing the last couple hundred samples
if subset == "train":
durations = (df["wav_filesize"] - 44) / 16000 / 2
df = df[durations <= 10.0]
print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
dest_csv = os.path.join(target_dir, "aidatatang_{}.csv".format(subset))
print("Saving {} set into {}...".format(subset, dest_csv))
df.to_csv(dest_csv, index=False)
def main():
# https://www.openslr.org/62/
parser = get_importers_parser(description="Import aidatatang_200zh corpus")
parser.add_argument("tgz_file", help="Path to aidatatang_200zh.tgz")
parser.add_argument(
"--target_dir",
default="",
help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
)
params = parser.parse_args()
if not params.target_dir:
params.target_dir = os.path.dirname(params.tgz_file)
preprocess_data(params.tgz_file, params.target_dir)
if __name__ == "__main__":
main()

bin/import_aishell.py Executable file

@ -0,0 +1,94 @@
#!/usr/bin/env python
import glob
import os
import tarfile
import pandas
from coqui_stt_training.util.importers import get_importers_parser
COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"]
def extract(archive_path, target_dir):
print("Extracting {} into {}...".format(archive_path, target_dir))
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
def preprocess_data(tgz_file, target_dir):
# First extract main archive and sub-archives
extract(tgz_file, target_dir)
main_folder = os.path.join(target_dir, "data_aishell")
wav_archives_folder = os.path.join(main_folder, "wav")
for targz in glob.glob(os.path.join(wav_archives_folder, "*.tar.gz")):
extract(targz, main_folder)
# Folder structure is now:
# - data_aishell/
# - train/S****/*.wav
# - dev/S****/*.wav
# - test/S****/*.wav
# - wav/S****.tar.gz
# - transcript/aishell_transcript_v0.8.txt
# Transcripts file has one line per WAV file, where each line consists of
# the WAV file name without extension followed by a single space followed
# by the transcript.
# Since the transcripts themselves can contain spaces, we split on space but
# only once, then build a mapping from file name to transcript
transcripts_path = os.path.join(
main_folder, "transcript", "aishell_transcript_v0.8.txt"
)
with open(transcripts_path) as fin:
transcripts = dict((line.split(" ", maxsplit=1) for line in fin))
def load_set(glob_path):
set_files = []
for wav in glob.glob(glob_path):
try:
wav_filename = wav
wav_filesize = os.path.getsize(wav)
transcript_key = os.path.splitext(os.path.basename(wav))[0]
transcript = transcripts[transcript_key].strip("\n")
set_files.append((wav_filename, wav_filesize, transcript))
except KeyError:
print("Warning: Missing transcript for WAV file {}.".format(wav))
return set_files
for subset in ("train", "dev", "test"):
print("Loading {} set samples...".format(subset))
subset_files = load_set(os.path.join(main_folder, subset, "S*", "*.wav"))
df = pandas.DataFrame(data=subset_files, columns=COLUMNNAMES)
# Trim the train set to clips of at most 10 seconds (duration estimated from the WAV file size)
if subset == "train":
durations = (df["wav_filesize"] - 44) / 16000 / 2
df = df[durations <= 10.0]
print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
dest_csv = os.path.join(target_dir, "aishell_{}.csv".format(subset))
print("Saving {} set into {}...".format(subset, dest_csv))
df.to_csv(dest_csv, index=False)
def main():
# http://www.openslr.org/33/
parser = get_importers_parser(description="Import AISHELL corpus")
parser.add_argument("aishell_tgz_file", help="Path to data_aishell.tgz")
parser.add_argument(
"--target_dir",
default="",
help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
)
params = parser.parse_args()
if not params.target_dir:
params.target_dir = os.path.dirname(params.aishell_tgz_file)
preprocess_data(params.aishell_tgz_file, params.target_dir)
if __name__ == "__main__":
main()

750
bin/import_ccpmf.py Executable file
View File

@ -0,0 +1,750 @@
#!/usr/bin/env python
"""
Importer for dataset published from Centre de Conférence Pierre Mendès-France
Ministère de l'Économie, des Finances et de la Relance
"""
import csv
import decimal
import hashlib
import math
import os
import re
import subprocess
import sys
import unicodedata
import xml.etree.ElementTree as ET
import zipfile
from glob import glob
from multiprocessing import Pool
import progressbar
import sox
try:
from num2words import num2words
except ImportError as ex:
print("pip install num2words")
sys.exit(1)
import json
import requests
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.helpers import secs_to_hours
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
get_importers_parser,
get_validate_label,
print_import_report,
)
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
CHANNELS = 1
BIT_DEPTH = 16
MAX_SECS = 10
MIN_SECS = 0.85
DATASET_RELEASE_CSV = "https://data.economie.gouv.fr/explore/dataset/transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
DATASET_RELEASE_SHA = [
(
"863d39a06a388c6491c6ff2f6450b151f38f1b57",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001",
),
(
"2f3a0305aa04c61220bb00b5a4e553e45dbf12e1",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.002",
),
(
"5e55e9f1f844097349188ac875947e5a3d7fe9f1",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.003",
),
(
"8bf54842cf07948ca5915e27a8bd5fa5139c06ae",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.004",
),
(
"c8963504aadc015ac48f9af80058a0bb3440b94f",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.005",
),
(
"d95e225e908621d83ce4e9795fd108d9d310e244",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.006",
),
(
"de6ed9c2b0ee80ca879aae8ba7923cc93217d811",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.007",
),
(
"234283c47dacfcd4450d836c52c25f3e807fc5f2",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.008",
),
(
"4e6b67a688639bb72f8cd81782eaba604a8d32a6",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.009",
),
(
"4165a51389777c8af8e6253d87bdacb877e8b3b0",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.010",
),
(
"34322e7009780d97ef5bd02bf2f2c7a31f00baff",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.011",
),
(
"48c5be3b2ca9d6108d525da6a03e91d93a95dbac",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.012",
),
(
"87573172f506a189c2ebc633856fe11a2e9cd213",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.013",
),
(
"6ab2c9e508e9278d5129f023e018725c4a7c69e8",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.014",
),
(
"4f84df831ef46dce5d3ab3e21817687a2d8c12d0",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.015",
),
(
"e69bfb079885c299cb81080ef88b1b8b57158aa6",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.016",
),
(
"5f764ba788ee273981cf211b242c29b49ca22c5e",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.017",
),
(
"b6aa81a959525363223494830c1e7307d4c4bae6",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.018",
),
(
"91ddcf43c7bf113a6f2528b857c7ec22a50a148a",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.019",
),
(
"fa1b29273dd77b9a7494983a2f9ae52654b931d7",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.020",
),
(
"1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.021",
),
(
"ce6420d5d0b6b5135ba559f83e1a82d4d615c470",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.022",
),
(
"d0976ed292ac24fcf1590d1ea195077c74b05471",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.023",
),
(
"ec746cd6af066f62d9bf8d3b2f89174783ff4e3c",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.024",
),
(
"570d9e1e84178e32fd867171d4b3aaecda1fd4fb",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.025",
),
(
"c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.026",
),
(
"08406a51146d88e208704ce058c060a1e44efa50",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.027",
),
(
"199aedad733a78ea1e7d47def9c71c6fd5795e02",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.028",
),
(
"db856a068f92fb4f01f410bba42c7271de0f231a",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.029",
),
(
"e3c0135f16c6c9d25a09dcb4f99a685438a84740",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.030",
),
(
"e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.031",
),
(
"be5e80cbc49b59b31ae33c30576ef0e1a162d84e",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.032",
),
(
"501df58e3ff55fcfd75b93dab57566dc536948b8",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.033",
),
(
"1a114875811a8cdcb8d85a9f6dbee78be3e05131",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.034",
),
(
"465d824e7ee46448369182c0c28646d155a2249b",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.035",
),
(
"37f341b1b266d143eb73138c31cfff3201b9d619",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.036",
),
(
"9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.037",
),
(
"54886755630cb080a53098cb1b6c951c6714a143",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.038",
),
(
"4b7cbb0154697be795034f7a49712e882a97197a",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.039",
),
(
"c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.040",
),
]
def _download_and_preprocess_data(csv_url, target_dir):
dataset_sources = os.path.join(
target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "data.txt"
)
if os.path.exists(dataset_sources):
return dataset_sources
# Making path absolute
target_dir = os.path.abspath(target_dir)
csv_ref = requests.get(csv_url).text.split("\r\n")[1:-1]
for part in csv_ref:
part_filename = (
requests.head(part)
.headers.get("Content-Disposition")
.split(" ")[1]
.split("=")[1]
.replace('"', "")
)
if not os.path.exists(os.path.join(target_dir, part_filename)):
part_path = maybe_download(part_filename, target_dir, part)
def _big_sha1(fname):
s = hashlib.sha1()
buffer_size = 65536
with open(fname, "rb") as f:
while True:
data = f.read(buffer_size)
if not data:
break
s.update(data)
return s.hexdigest()
for (sha1, filename) in DATASET_RELEASE_SHA:
print("Checking {} SHA1:".format(filename))
csum = _big_sha1(os.path.join(target_dir, filename))
if csum == sha1:
print("\t{}: OK {}".format(filename, sha1))
else:
print("\t{}: ERROR: expected {}, computed {}".format(filename, sha1, csum))
assert csum == sha1
# Conditionally extract data
_maybe_extract(
target_dir,
"transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020",
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip",
"transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip",
)
# Produce source text for extraction / conversion
return _maybe_create_sources(
os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020")
)
def _maybe_extract(target_dir, extracted_data, archive, final):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = os.path.join(target_dir, extracted_data)
archive_path = os.path.join(target_dir, archive)
final_archive = os.path.join(extracted_path, final)
if not os.path.exists(extracted_path):
if not os.path.exists(archive_path):
print('No archive "%s" - building ...' % archive_path)
all_zip_parts = glob(archive_path + ".*")
all_zip_parts.sort()
cmdline = "cat {} > {}".format(" ".join(all_zip_parts), archive_path)
print('Building with "%s"' % cmdline)
subprocess.check_call(cmdline, shell=True, cwd=target_dir)
assert os.path.exists(archive_path)
print(
'No directory "%s" - extracting archive %s ...'
% (extracted_path, archive_path)
)
with zipfile.ZipFile(archive_path) as zip_f:
zip_f.extractall(extracted_path)
with zipfile.ZipFile(final_archive) as zip_f:
zip_f.extractall(target_dir)
else:
print('Found directory "%s" - not extracting it from archive.' % extracted_path)
def _maybe_create_sources(dir):
dataset_sources = os.path.join(dir, "data.txt")
MP3 = glob(os.path.join(dir, "**", "*.mp3"))
XML = glob(os.path.join(dir, "**", "*.xml"))
MP3_XML_Scores = []
MP3_XML_Fin = {}
for f_mp3 in MP3:
for f_xml in XML:
b_mp3 = os.path.splitext(os.path.basename(f_mp3))[0]
b_xml = os.path.splitext(os.path.basename(f_xml))[0]
a_mp3 = b_mp3.split("_")
a_xml = b_xml.split("_")
score = 0
date_mp3 = a_mp3[0]
date_xml = a_xml[0]
if date_mp3 != date_xml:
continue
for i in range(min(len(a_mp3), len(a_xml))):
if a_mp3[i] == a_xml[i]:
score += 1
if score >= 1:
MP3_XML_Scores.append((f_mp3, f_xml, score))
# sort by score
MP3_XML_Scores.sort(key=lambda x: x[2], reverse=True)
for s_mp3, s_xml, score in MP3_XML_Scores:
# print(s_mp3, s_xml, score)
if score not in MP3_XML_Fin:
MP3_XML_Fin[score] = {}
if s_mp3 not in MP3_XML_Fin[score]:
try:
MP3.index(s_mp3)
MP3.remove(s_mp3)
MP3_XML_Fin[score][s_mp3] = s_xml
except ValueError as ex:
pass
else:
print("here:", MP3_XML_Fin[score][s_mp3], s_xml, file=sys.stderr)
with open(dataset_sources, "w") as ds:
for score in MP3_XML_Fin:
for mp3 in MP3_XML_Fin[score]:
xml = MP3_XML_Fin[score][mp3]
if os.path.getsize(mp3) > 0 and os.path.getsize(xml) > 0:
mp3 = os.path.relpath(mp3, dir)
xml = os.path.relpath(xml, dir)
ds.write("{},{},{:0.2e}\n".format(xml, mp3, 2.5e-4))
else:
print("Empty file {} or {}".format(mp3, xml), file=sys.stderr)
print("Missing XML pairs:", MP3, file=sys.stderr)
return dataset_sources
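The MP3/XML pairing above scores each candidate pair by counting matching underscore-separated tokens in the two base names (pairs whose leading date token differs are skipped outright), then greedily keeps the highest-scoring XML for each MP3. A small illustration of the scoring, with made-up base names modelled on the file names mentioned in this importer's comments:

# Hypothetical base names; the scoring mirrors the loop in _maybe_create_sources above.
a_mp3 = "20181017_Innovation_wmv_0_fre_minefi".split("_")
a_xml = "20181017_Innovation".split("_")
score = sum(1 for i in range(min(len(a_mp3), len(a_xml))) if a_mp3[i] == a_xml[i])
print(score)  # 2 -> the date token and the "Innovation" token match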
def maybe_normalize_for_digits(label):
# first, try to identify numbers like "50 000", "260 000"
if " " in label:
if any(s.isdigit() for s in label):
thousands = re.compile(r"(\d{1,3}(?:\s*\d{3})*(?:,\d+)?)")
maybe_thousands = thousands.findall(label)
if len(maybe_thousands) > 0:
while True:
(label, r) = re.subn(r"(\d)\s(\d{3})", "\\1\\2", label)
if r == 0:
break
# this might be a time or duration in the form "hh:mm" or "hh:mm:ss"
if ":" in label:
for s in label.split(" "):
if any(i.isdigit() for i in s):
date_or_time = re.compile(r"(\d{1,2}):(\d{2}):?(\d{2})?")
maybe_date_or_time = date_or_time.findall(s)
if len(maybe_date_or_time) > 0:
maybe_hours = maybe_date_or_time[0][0]
maybe_minutes = maybe_date_or_time[0][1]
maybe_seconds = maybe_date_or_time[0][2]
if len(maybe_seconds) > 0:
label = label.replace(
"{}:{}:{}".format(
maybe_hours, maybe_minutes, maybe_seconds
),
"{} heures {} minutes et {} secondes".format(
maybe_hours, maybe_minutes, maybe_seconds
),
)
else:
label = label.replace(
"{}:{}".format(maybe_hours, maybe_minutes),
"{} heures et {} minutes".format(
maybe_hours, maybe_minutes
),
)
new_label = []
# pylint: disable=too-many-nested-blocks
for s in label.split(" "):
if any(i.isdigit() for i in s):
s = s.replace(",", ".") # num2words requires "." for floats
s = s.replace('"', "") # clean some data, num2words would choke on 1959"
last_c = s[-1]
if not last_c.isdigit(): # num2words will choke on "0.6.", "24 ?"
s = s[:-1]
if any(
i.isalpha() for i in s
):  # So we have any(isdigit()) **and** any(isalpha()), like "3D"
ns = []
for c in s:
nc = c
if c.isdigit(): # convert "3" to "trois-"
try:
nc = num2words(c, lang="fr") + "-"
except decimal.InvalidOperation as ex:
print("decimal.InvalidOperation: '{}'".format(s))
raise ex
ns.append(nc)
s = "".join(ns)  # join the digit-converted characters back into a token
else:
try:
s = num2words(s, lang="fr")
except decimal.InvalidOperation as ex:
print("decimal.InvalidOperation: '{}'".format(s))
raise ex
new_label.append(s)
return " ".join(new_label)
def maybe_normalize_for_specials_chars(label):
label = label.replace("%", "pourcents")
label = label.replace("/", ", ") # clean intervals like 2019/2022 to "2019 2022"
label = label.replace("-", ", ") # clean intervals like 70-80 to "70 80"
label = label.replace("+", " plus ") # clean + and make it speakable
label = label.replace("€", " euros ")  # clean euro symbol and make it speakable
label = label.replace(
"., ", ", "
) # clean some strange "4.0., " (20181017_Innovation.xml)
label = label.replace(
"°", " degré "
) # clean some strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
label = label.replace("...", ".") # remove ellipsis
label = label.replace("..", ".") # remove broken ellipsis
label = label.replace(
"m²", "mètre-carrés"
)  # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
label = label.replace(
"[end]", ""
) # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
label = label.replace(
u"\xB8c", " ç"
) # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
label = label.replace(
"C0²", "CO 2"
) # 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
return label
def maybe_normalize_for_anglicisms(label):
label = label.replace("B2B", "B to B")
label = label.replace("B2C", "B to C")
label = label.replace("#", "hashtag ")
label = label.replace("@", "at ")
return label
def maybe_normalize(label):
label = maybe_normalize_for_specials_chars(label)
label = maybe_normalize_for_anglicisms(label)
label = maybe_normalize_for_digits(label)
return label
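Chained together, the three passes above turn a raw label into something speakable: special characters are spelled out, a couple of anglicisms are rewritten, and digit groups and hh:mm times are expanded into French words via num2words. A hedged usage sketch; the sentence is invented and the exact output depends on the installed num2words version:

# Invented label, run after the maybe_normalize_* functions above are defined.
print(maybe_normalize("Le budget 2019/2022 est de 50 000 euros, soit +3 %"))
# Expected shape: the "/" interval is split, "+" and "%" become words, "50 000" is
# collapsed to 50000, and both years and the amount are written out by num2words.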
def one_sample(sample):
file_size = -1
frames = 0
audio_source = sample[0]
target_dir = sample[1]
dataset_basename = sample[2]
start_time = sample[3]
duration = sample[4]
label = label_filter_fun(sample[5])
sample_id = sample[6]
_wav_filename = os.path.basename(
audio_source.replace(".wav", "_{:06}.wav".format(sample_id))
)
wav_fullname = os.path.join(target_dir, dataset_basename, _wav_filename)
if not os.path.exists(wav_fullname):
subprocess.check_output(
[
"ffmpeg",
"-i",
audio_source,
"-ss",
str(start_time),
"-t",
str(duration),
"-c",
"copy",
wav_fullname,
],
stdin=subprocess.DEVNULL,
stderr=subprocess.STDOUT,
)
file_size = os.path.getsize(wav_fullname)
frames = int(
subprocess.check_output(["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT)
)
_counter = get_counter()
_rows = []
if file_size == -1:
# Excluding samples that failed upon conversion
_counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
_counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
_counter["too_short"] += 1
elif frames / SAMPLE_RATE < MIN_SECS:
# Excluding samples that are too short
_counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
_counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
_rows.append((os.path.join(dataset_basename, _wav_filename), file_size, label))
_counter["imported_time"] += frames
_counter["all"] += 1
_counter["total_time"] += frames
return (_counter, _rows)
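The third exclusion above is effectively a CTC feasibility check: the acoustic model must produce at least as many output steps as there are characters in the transcript. A small sketch of that arithmetic, assuming 10 ms feature windows and a 2x time reduction in the network, i.e. one reading of the constants baked into the expression above (the helper name is illustrative):

def enough_output_steps(frames, transcript, sample_rate=16000):
    # seconds of audio * 100 windows per second / 2 (time reduction) = approximate output steps
    output_steps = int(frames / sample_rate * 1000 / 10 / 2)
    return output_steps >= len(transcript)

# Example: 16000 frames (1 s at 16 kHz) give 50 output steps, enough for up to 50 characters.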
def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
dataset_basename = os.path.splitext(os.path.split(xml_file)[1])[0]
wav_root = os.path.join(target_dir, dataset_basename)
if not os.path.exists(wav_root):
os.makedirs(wav_root)
source_frames = int(
subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT)
)
print("Source audio length: %s" % secs_to_hours(source_frames / SAMPLE_RATE))
# Get audiofile path and transcript for each sentence in tsv
samples = []
tree = ET.parse(xml_file)
root = tree.getroot()
seq_id = 0
this_time = 0.0
this_duration = 0.0
prev_time = 0.0
prev_duration = 0.0
this_text = ""
for child in root:
if child.tag == "row":
cur_time = float(child.attrib["timestamp"])
cur_duration = float(child.attrib["timedur"])
cur_text = child.text
if this_time == 0.0:
this_time = cur_time
delta = cur_time - (prev_time + prev_duration)
# rel_tol value was chosen by trial and error as a compromise between:
# - cutting enough to skip missing words
# - not too short, not too long sentences
is_close = math.isclose(
cur_time, this_time + this_duration, rel_tol=rel_tol
)
is_short = (this_duration + cur_duration + delta) < MAX_SECS
# when the previous element is close enough **and** this does not
# go over MAX_SECS, we append content
if is_close and is_short:
this_duration += cur_duration + delta
this_text += cur_text
else:
samples.append(
(
audio_source,
target_dir,
dataset_basename,
this_time,
this_duration,
this_text,
seq_id,
)
)
this_time = cur_time
this_duration = cur_duration
this_text = cur_text
seq_id += 1
prev_time = cur_time
prev_duration = cur_duration
# Keep track of how many samples are good vs. problematic
_counter = get_counter()
num_samples = len(samples)
_rows = []
print("Processing XML data: {}".format(xml_file))
pool = Pool()
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
_counter += processed[0]
_rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
imported_samples = get_imported_samples(_counter)
assert _counter["all"] == num_samples
assert len(_rows) == imported_samples
print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
print(
"Import efficiency: %.1f%%" % ((_counter["total_time"] / source_frames) * 100)
)
print("")
return _counter, _rows
def _maybe_convert_wav(mp3_filename, _wav_filename):
if not os.path.exists(_wav_filename):
print("Converting {} to WAV file: {}".format(mp3_filename, _wav_filename))
transformer = sox.Transformer()
transformer.convert(
samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH
)
try:
transformer.build(mp3_filename, _wav_filename)
except sox.core.SoxError:
pass
def write_general_csv(target_dir, _rows, _counter):
target_csv_template = os.path.join(target_dir, "ccpmf_{}.csv")
with open(target_csv_template.format("train"), "w") as train_csv_file: # 80%
with open(target_csv_template.format("dev"), "w") as dev_csv_file: # 10%
with open(target_csv_template.format("test"), "w") as test_csv_file: # 10%
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
train_writer.writeheader()
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
dev_writer.writeheader()
test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
test_writer.writeheader()
bar = progressbar.ProgressBar(max_value=len(_rows), widgets=SIMPLE_BAR)
for i, item in enumerate(bar(_rows)):
i_mod = i % 10
if i_mod == 0:
writer = test_writer
elif i_mod == 1:
writer = dev_writer
else:
writer = train_writer
writer.writerow(
{
"wav_filename": item[0],
"wav_filesize": item[1],
"transcript": item[2],
}
)
print("")
print("~~~~ FINAL STATISTICS ~~~~")
print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
print("~~~~ (FINAL STATISTICS) ~~~~")
print("")
if __name__ == "__main__":
PARSER = get_importers_parser(
description="Import XML from Conference Centre for Economics, France"
)
PARSER.add_argument("target_dir", help="Destination directory")
PARSER.add_argument(
"--filter_alphabet",
help="Exclude samples with characters not in provided alphabet",
)
PARSER.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
PARAMS = PARSER.parse_args()
validate_label = get_validate_label(PARAMS)
ALPHABET = Alphabet(PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None
def label_filter_fun(label):
if PARAMS.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = maybe_normalize(label)
label = validate_label(label)
if ALPHABET and label:
try:
ALPHABET.encode(label)
except KeyError:
label = None
return label
dataset_sources = _download_and_preprocess_data(
csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir
)
sources_root_dir = os.path.dirname(dataset_sources)
all_counter = get_counter()
all_rows = []
with open(dataset_sources, "r") as sources:
for line in sources.readlines():
d = line.split(",")
this_xml = os.path.join(sources_root_dir, d[0])
this_mp3 = os.path.join(sources_root_dir, d[1])
this_rel = float(d[2])
wav_filename = os.path.join(
sources_root_dir,
os.path.splitext(os.path.basename(this_mp3))[0] + ".wav",
)
_maybe_convert_wav(this_mp3, wav_filename)
counter, rows = _maybe_import_data(
this_xml, wav_filename, sources_root_dir, this_rel
)
all_counter += counter
all_rows += rows
write_general_csv(sources_root_dir, _counter=all_counter, _rows=all_rows)
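write_general_csv above assigns rows round-robin: every row whose index ends in 0 goes to test, an index ending in 1 goes to dev, and the remaining eight out of ten go to train, which yields the deterministic 80/10/10 split noted in the comments. A quick check of those proportions:

from collections import Counter
split = Counter(
    "test" if i % 10 == 0 else "dev" if i % 10 == 1 else "train" for i in range(1000)
)
print(split)  # Counter({'train': 800, 'test': 100, 'dev': 100})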

View File

@ -1,61 +1,107 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv
import sox
import tarfile
import os
import subprocess
import progressbar
import sys
import tarfile
from glob import glob
from os import path
from threading import RLock
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from util.text import validate_label
from util.downloader import maybe_download, SIMPLE_BAR
from multiprocessing import Pool
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
import progressbar
import sox
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
print_import_report,
)
from coqui_stt_training.util.importers import validate_label_eng as validate_label
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
MAX_SECS = 10
ARCHIVE_DIR_NAME = 'cv_corpus_v1'
ARCHIVE_NAME = ARCHIVE_DIR_NAME + '.tar.gz'
ARCHIVE_URL = 'https://s3.us-east-2.amazonaws.com/common-voice-data-download/' + ARCHIVE_NAME
ARCHIVE_DIR_NAME = "cv_corpus_v1"
ARCHIVE_NAME = ARCHIVE_DIR_NAME + ".tar.gz"
ARCHIVE_URL = (
"https://s3.us-east-2.amazonaws.com/common-voice-data-download/" + ARCHIVE_NAME
)
def _download_and_preprocess_data(target_dir):
# Making path absolute
target_dir = path.abspath(target_dir)
target_dir = os.path.abspath(target_dir)
# Conditionally download data
archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
# Conditionally extract common voice data
_maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
# Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav
# Conditionally convert common voice CSV files and mp3 data to Coqui STT CSVs and wav
_maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = path.join(target_dir, extracted_data)
if not path.exists(extracted_path):
extracted_path = os.path.join(target_dir, extracted_data)
if not os.path.exists(extracted_path):
print('No directory "%s" - extracting archive...' % extracted_path)
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
else:
print('Found directory "%s" - not extracting it from archive.' % extracted_path)
def _maybe_convert_sets(target_dir, extracted_data):
extracted_dir = path.join(target_dir, extracted_data)
for source_csv in glob(path.join(extracted_dir, '*.csv')):
_maybe_convert_set(extracted_dir, source_csv, path.join(target_dir, os.path.split(source_csv)[-1]))
extracted_dir = os.path.join(target_dir, extracted_data)
for source_csv in glob(os.path.join(extracted_dir, "*.csv")):
_maybe_convert_set(
extracted_dir,
source_csv,
os.path.join(target_dir, os.path.split(source_csv)[-1]),
)
def one_sample(sample):
mp3_filename = sample[0]
# Storing wav files next to the mp3 ones - just with a different suffix
wav_filename = path.splitext(mp3_filename)[0] + ".wav"
_maybe_convert_wav(mp3_filename, wav_filename)
frames = int(
subprocess.check_output(["soxi", "-s", wav_filename], stderr=subprocess.STDOUT)
)
file_size = -1
if os.path.exists(wav_filename):
file_size = path.getsize(wav_filename)
frames = int(
subprocess.check_output(
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = validate_label(sample[1])
rows = []
counter = get_counter()
if file_size == -1:
# Excluding samples that failed upon conversion
counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter["imported_time"] += frames
counter["all"] += 1
counter["total_time"] += frames
return (counter, rows)
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
print()
if path.exists(target_csv):
if os.path.exists(target_csv):
print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
return
print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
@ -63,73 +109,47 @@ def _maybe_convert_set(extracted_dir, source_csv, target_csv):
with open(source_csv) as source_csv_file:
reader = csv.DictReader(source_csv_file)
for row in reader:
samples.append((row['filename'], row['text']))
samples.append((os.path.join(extracted_dir, row["filename"]), row["text"]))
# Mutable counters for the concurrent embedded routine
counter = { 'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0 }
lock = RLock()
counter = get_counter()
num_samples = len(samples)
rows = []
def one_sample(sample):
mp3_filename = path.join(*(sample[0].split('/')))
mp3_filename = path.join(extracted_dir, mp3_filename)
# Storing wav files next to the mp3 ones - just with a different suffix
wav_filename = path.splitext(mp3_filename)[0] + ".wav"
_maybe_convert_wav(mp3_filename, wav_filename)
frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
file_size = -1
if path.exists(wav_filename):
file_size = path.getsize(wav_filename)
frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
label = validate_label(sample[1])
with lock:
if file_size == -1:
# Excluding samples that failed upon conversion
counter['failed'] += 1
elif label is None:
# Excluding samples that failed on label validation
counter['invalid_label'] += 1
elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter['too_short'] += 1
elif frames/SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter['too_long'] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter['all'] += 1
print('Importing mp3 files...')
pool = Pool(cpu_count())
print("Importing mp3 files...")
pool = Pool()
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
print('Writing "%s"...' % target_csv)
with open(target_csv, 'w') as target_csv_file:
with open(target_csv, "w", encoding="utf-8", newline="") as target_csv_file:
writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
writer.writeheader()
bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
for filename, file_size, transcript in bar(rows):
writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
writer.writerow(
{
"wav_filename": filename,
"wav_filesize": file_size,
"transcript": transcript,
}
)
imported_samples = get_imported_samples(counter)
assert counter["all"] == num_samples
assert len(rows) == imported_samples
print_import_report(counter, SAMPLE_RATE, MAX_SECS)
print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
if counter['failed'] > 0:
print('Skipped %d samples that failed upon conversion.' % counter['failed'])
if counter['invalid_label'] > 0:
print('Skipped %d samples that failed on transcript validation.' % counter['invalid_label'])
if counter['too_short'] > 0:
print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
if counter['too_long'] > 0:
print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
def _maybe_convert_wav(mp3_filename, wav_filename):
if not path.exists(wav_filename):
if not os.path.exists(wav_filename):
transformer = sox.Transformer()
transformer.convert(samplerate=SAMPLE_RATE)
try:
@ -137,5 +157,6 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
except sox.core.SoxError:
pass
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])
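The diff above replaces the thread pool, the shared counter dict and the RLock with a process pool whose workers each return their own counter and rows, which the parent then merges additively; get_counter() provides a Counter-like object, so no locking is needed. A minimal sketch of that pattern, with illustrative names:

from collections import Counter
from multiprocessing import Pool

def work(item):
    counter, rows = Counter(), []
    # ... convert and validate the item here ...
    counter["all"] += 1
    rows.append(item)
    return counter, rows

if __name__ == "__main__":
    total, all_rows = Counter(), []
    with Pool() as pool:
        for counter, rows in pool.imap_unordered(work, range(100)):
            total += counter   # additive merge replaces the lock-protected dict
            all_rows += rows
    print(total["all"], len(all_rows))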

View File

@ -1,144 +1,250 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import csv
import sox
import subprocess
import progressbar
from os import path
from threading import RLock
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from util.downloader import SIMPLE_BAR
from util.text import validate_label
'''
"""
Broadly speaking, this script takes the audio downloaded from Common Voice
for a certain language, in addition to the *.tsv files output by CorporaCreator,
and the script formats the data and transcripts to be in a state usable by
DeepSpeech.py
train.py
Use "python3 import_cv2.py -h" for help
"""
import csv
import os
import subprocess
import unicodedata
from multiprocessing import Pool
Usage:
$ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir
import progressbar
import sox
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
get_importers_parser,
get_validate_label,
print_import_report,
)
Input:
(1) audio_dir (string) path to dir of audio downloaded from Common Voice
(2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
which were generated by CorporaCreator
Output:
(1) csv files in format needed by DeepSpeech.py, saved into audio_dir
(2) wav files, saved into audio_dir alongside their mp3s
'''
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
CHANNELS = 1
MAX_SECS = 10
PARAMS = None
FILTER_OBJ = None
def _preprocess_data(audio_dir, tsv_dir):
for dataset in ['train','test','dev']:
input_tsv= path.join(path.abspath(tsv_dir), dataset+".tsv")
if os.path.isfile(input_tsv):
print("Loading TSV file: ", input_tsv)
_maybe_convert_set(audio_dir, input_tsv)
else:
print("ERROR: no TSV file found: ", input_tsv)
def _maybe_convert_set(audio_dir, input_tsv):
output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
class LabelFilter:
def __init__(self, normalize, alphabet, validate_fun):
self.normalize = normalize
self.alphabet = alphabet
self.validate_fun = validate_fun
# Get audiofile path and transcript for each sentence in tsv
samples = []
with open(input_tsv) as input_tsv_file:
reader = csv.DictReader(input_tsv_file, delimiter='\t')
for row in reader:
samples.append((row['path'], row['sentence']))
def filter(self, label):
if self.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = self.validate_fun(label)
if self.alphabet and label and not self.alphabet.CanEncode(label):
label = None
return label
# Keep track of how many samples are good vs. problematic
counter = { 'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0 }
lock = RLock()
num_samples = len(samples)
def init_worker(params):
global FILTER_OBJ # pylint: disable=global-statement
validate_label = get_validate_label(params)
alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None
FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_label)
def one_sample(sample):
""" Take an audio file, and optionally convert it to 16kHz WAV """
mp3_filename = sample[0]
if not os.path.splitext(mp3_filename.lower())[1] == ".mp3":
mp3_filename += ".mp3"
# Storing wav files next to the mp3 ones - just with a different suffix
wav_filename = os.path.splitext(mp3_filename)[0] + ".wav"
_maybe_convert_wav(mp3_filename, wav_filename)
file_size = -1
frames = 0
if os.path.exists(wav_filename):
file_size = os.path.getsize(wav_filename)
frames = int(
subprocess.check_output(
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = FILTER_OBJ.filter(sample[1])
rows = []
counter = get_counter()
if file_size == -1:
# Excluding samples that failed upon conversion
counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
rows.append((os.path.split(wav_filename)[-1], file_size, label, sample[2]))
counter["imported_time"] += frames
counter["all"] += 1
counter["total_time"] += frames
def one_sample(sample):
""" Take an audio file, and optionally convert it to 16kHz WAV """
mp3_filename = path.join(audio_dir, sample[0])
if not path.splitext(mp3_filename.lower())[1] == '.mp3':
mp3_filename += ".mp3"
# Storing wav files next to the mp3 ones - just with a different suffix
wav_filename = path.splitext(mp3_filename)[0] + ".wav"
_maybe_convert_wav(mp3_filename, wav_filename)
file_size = -1
if path.exists(wav_filename):
file_size = path.getsize(wav_filename)
frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
label = validate_label(sample[1])
with lock:
if file_size == -1:
# Excluding samples that failed upon conversion
counter['failed'] += 1
elif label is None:
# Excluding samples that failed on label validation
counter['invalid_label'] += 1
elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter['too_short'] += 1
elif frames/SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter['too_long'] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter['all'] += 1
return (counter, rows)
print("Importing mp3 files...")
pool = Pool(cpu_count())
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
with open(output_csv, 'w') as output_csv_file:
print('Writing CSV file for DeepSpeech.py as: ', output_csv)
def _maybe_convert_set(
dataset,
tsv_dir,
audio_dir,
filter_obj,
space_after_every_character=None,
rows=None,
exclude=None,
):
exclude_transcripts = set()
exclude_speakers = set()
if exclude is not None:
for sample in exclude:
exclude_transcripts.add(sample[2])
exclude_speakers.add(sample[3])
if rows is None:
rows = []
input_tsv = os.path.join(os.path.abspath(tsv_dir), dataset + ".tsv")
if not os.path.isfile(input_tsv):
return rows
print("Loading TSV file: ", input_tsv)
# Get audiofile path and transcript for each sentence in tsv
samples = []
with open(input_tsv, encoding="utf-8") as input_tsv_file:
reader = csv.DictReader(input_tsv_file, delimiter="\t")
for row in reader:
samples.append(
(
os.path.join(audio_dir, row["path"]),
row["sentence"],
row["client_id"],
)
)
counter = get_counter()
num_samples = len(samples)
print("Importing mp3 files...")
pool = Pool(initializer=init_worker, initargs=(PARAMS,))
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, processed in enumerate(
pool.imap_unordered(one_sample, samples), start=1
):
counter += processed[0]
rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
imported_samples = get_imported_samples(counter)
assert counter["all"] == num_samples
assert len(rows) == imported_samples
print_import_report(counter, SAMPLE_RATE, MAX_SECS)
output_csv = os.path.join(os.path.abspath(audio_dir), dataset + ".csv")
print("Saving new Coqui STT-formatted CSV file to: ", output_csv)
with open(output_csv, "w", encoding="utf-8", newline="") as output_csv_file:
print("Writing CSV file for train.py as: ", output_csv)
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
writer.writeheader()
bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
for filename, file_size, transcript in bar(rows):
writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
for filename, file_size, transcript, speaker in bar(rows):
if transcript in exclude_transcripts or speaker in exclude_speakers:
continue
if space_after_every_character:
writer.writerow(
{
"wav_filename": filename,
"wav_filesize": file_size,
"transcript": " ".join(transcript),
}
)
else:
writer.writerow(
{
"wav_filename": filename,
"wav_filesize": file_size,
"transcript": transcript,
}
)
return rows
def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
exclude = []
for dataset in ["test", "dev", "train", "validated", "other"]:
set_samples = _maybe_convert_set(
dataset, tsv_dir, audio_dir, space_after_every_character
)
if dataset in ["test", "dev"]:
exclude += set_samples
if dataset == "validated":
_maybe_convert_set(
"train-all",
tsv_dir,
audio_dir,
space_after_every_character,
rows=set_samples,
exclude=exclude,
)
print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
if counter['failed'] > 0:
print('Skipped %d samples that failed upon conversion.' % counter['failed'])
if counter['invalid_label'] > 0:
print('Skipped %d samples that failed on transcript validation.' % counter['invalid_label'])
if counter['too_short'] > 0:
print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
if counter['too_long'] > 0:
print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
def _maybe_convert_wav(mp3_filename, wav_filename):
if not path.exists(wav_filename):
if not os.path.exists(wav_filename):
transformer = sox.Transformer()
transformer.convert(samplerate=SAMPLE_RATE)
transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
try:
transformer.build(mp3_filename, wav_filename)
except sox.core.SoxError:
pass
def parse_args():
parser = get_importers_parser(description="Import CommonVoice v2.0 corpora")
parser.add_argument("tsv_dir", help="Directory containing tsv files")
parser.add_argument(
"--audio_dir",
help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
)
parser.add_argument(
"--filter_alphabet",
help="Exclude samples with characters not in provided alphabet",
)
parser.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
parser.add_argument(
"--space_after_every_character",
action="store_true",
help="To help transcript join by white space",
)
return parser.parse_args()
def main():
audio_dir = (
PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(PARAMS.tsv_dir, "clips")
)
_preprocess_data(PARAMS.tsv_dir, audio_dir, PARAMS.space_after_every_character)
if __name__ == "__main__":
audio_dir = sys.argv[1]
tsv_dir = sys.argv[2]
print('Expecting your audio from Common Voice to be in: ', audio_dir)
print('Looking for *.tsv files (generated by CorporaCreator) in: ', tsv_dir)
_preprocess_data(audio_dir, tsv_dir)
PARAMS = parse_args()
main()
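_preprocess_data above converts test and dev first and carries their rows forward as exclude, so that when the larger validated set is turned into a train-all CSV, any clip whose sentence or client_id already appears in test or dev is skipped, preventing leakage between splits. A compact restatement of that guard, with illustrative row tuples of the form (wav_filename, wav_filesize, transcript, client_id):

dev_rows = [("a.wav", 1000, "bonjour", "spk1")]
validated_rows = [
    ("b.wav", 1200, "bonjour", "spk2"),    # same sentence as a dev clip -> dropped
    ("c.wav", 900, "merci", "spk1"),       # same speaker as a dev clip -> dropped
    ("d.wav", 1100, "au revoir", "spk3"),  # kept for train-all
]
exclude_transcripts = {r[2] for r in dev_rows}
exclude_speakers = {r[3] for r in dev_rows}
kept = [r for r in validated_rows
        if r[2] not in exclude_transcripts and r[3] not in exclude_speakers]
print([r[0] for r in kept])  # ['d.wav']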

View File

@ -1,25 +1,20 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import codecs
import fnmatch
import os
import random
import subprocess
import sys
import unicodedata
import librosa
import pandas
import soundfile # <= Has an external dependency on libsndfile
from coqui_stt_training.util.importers import validate_label_eng as validate_label
# Prerequisite: Having the sph2pipe tool in your PATH:
# https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import codecs
import fnmatch
import os
import pandas
import subprocess
import unicodedata
import librosa
import soundfile # <= Has an external dependency on libsndfile
from util.text import validate_label
def _download_and_preprocess_data(data_dir):
# Assume data_dir contains extracted LDC2004S13, LDC2004T19, LDC2005S13, LDC2005T19
@ -29,33 +24,55 @@ def _download_and_preprocess_data(data_dir):
_maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav")
# Conditionally split Fisher wav data
all_2004 = _split_wav_and_sentences(data_dir,
original_data="fisher-2004-wav",
converted_data="fisher-2004-split-wav",
trans_data=os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"))
all_2005 = _split_wav_and_sentences(data_dir,
original_data="fisher-2005-wav",
converted_data="fisher-2005-split-wav",
trans_data=os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"))
all_2004 = _split_wav_and_sentences(
data_dir,
original_data="fisher-2004-wav",
converted_data="fisher-2004-split-wav",
trans_data=os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"),
)
all_2005 = _split_wav_and_sentences(
data_dir,
original_data="fisher-2005-wav",
converted_data="fisher-2005-split-wav",
trans_data=os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"),
)
# The following files have incorrect transcripts that are much longer than
# their audio source. The result is that we end up with more labels than time
# slices, which breaks CTC.
all_2004.loc[all_2004["wav_filename"].str.endswith("fe_03_00265-33.53-33.81.wav"), "transcript"] = "correct"
all_2004.loc[all_2004["wav_filename"].str.endswith("fe_03_00991-527.39-528.3.wav"), "transcript"] = "that's one of those"
all_2005.loc[all_2005["wav_filename"].str.endswith("fe_03_10282-344.42-344.84.wav"), "transcript"] = "they don't want"
all_2005.loc[all_2005["wav_filename"].str.endswith("fe_03_10677-101.04-106.41.wav"), "transcript"] = "uh my mine yeah the german shepherd pitbull mix he snores almost as loud as i do"
all_2004.loc[
all_2004["wav_filename"].str.endswith("fe_03_00265-33.53-33.81.wav"),
"transcript",
] = "correct"
all_2004.loc[
all_2004["wav_filename"].str.endswith("fe_03_00991-527.39-528.3.wav"),
"transcript",
] = "that's one of those"
all_2005.loc[
all_2005["wav_filename"].str.endswith("fe_03_10282-344.42-344.84.wav"),
"transcript",
] = "they don't want"
all_2005.loc[
all_2005["wav_filename"].str.endswith("fe_03_10677-101.04-106.41.wav"),
"transcript",
] = "uh my mine yeah the german shepherd pitbull mix he snores almost as loud as i do"
# The following file is just a short sound and not at all transcribed like provided.
# So we just exclude it.
all_2004 = all_2004[~all_2004["wav_filename"].str.endswith("fe_03_00027-393.8-394.05.wav")]
all_2004 = all_2004[
~all_2004["wav_filename"].str.endswith("fe_03_00027-393.8-394.05.wav")
]
# The following file is far too long and would ruin our training batch size.
# So we just exclude it.
all_2005 = all_2005[~all_2005["wav_filename"].str.endswith("fe_03_11487-31.09-234.06.wav")]
all_2005 = all_2005[
~all_2005["wav_filename"].str.endswith("fe_03_11487-31.09-234.06.wav")
]
# The following file is too large for its transcript, so we just exclude it.
all_2004 = all_2004[~all_2004["wav_filename"].str.endswith("fe_03_01326-307.42-307.93.wav")]
all_2004 = all_2004[
~all_2004["wav_filename"].str.endswith("fe_03_01326-307.42-307.93.wav")
]
# Conditionally split Fisher data into train/validation/test sets
train_2004, dev_2004, test_2004 = _split_sets(all_2004)
@ -71,6 +88,7 @@ def _download_and_preprocess_data(data_dir):
dev_files.to_csv(os.path.join(data_dir, "fisher-dev.csv"), index=False)
test_files.to_csv(os.path.join(data_dir, "fisher-test.csv"), index=False)
def _maybe_convert_wav(data_dir, original_data, converted_data):
source_dir = os.path.join(data_dir, original_data)
target_dir = os.path.join(data_dir, converted_data)
@ -88,10 +106,18 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
for filename in fnmatch.filter(filenames, "*.sph"):
sph_file = os.path.join(root, filename)
for channel in ["1", "2"]:
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "_c" + channel + ".wav"
wav_filename = (
os.path.splitext(os.path.basename(sph_file))[0]
+ "_c"
+ channel
+ ".wav"
)
wav_file = os.path.join(target_dir, wav_filename)
print("converting {} to {}".format(sph_file, wav_file))
subprocess.check_call(["sph2pipe", "-c", channel, "-p", "-f", "rif", sph_file, wav_file])
subprocess.check_call(
["sph2pipe", "-c", channel, "-p", "-f", "rif", sph_file, wav_file]
)
def _parse_transcriptions(trans_file):
segments = []
@ -109,18 +135,23 @@ def _parse_transcriptions(trans_file):
# We need to do the encode-decode dance here because encode
# returns a bytes() object on Python 3, and text_to_char_array
# expects a string.
transcript = unicodedata.normalize("NFKD", transcript) \
.encode("ascii", "ignore") \
.decode("ascii", "ignore")
transcript = (
unicodedata.normalize("NFKD", transcript)
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
segments.append({
"start_time": start_time,
"stop_time": stop_time,
"speaker": speaker,
"transcript": transcript,
})
segments.append(
{
"start_time": start_time,
"stop_time": stop_time,
"speaker": speaker,
"transcript": transcript,
}
)
return segments
def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data):
trans_dir = os.path.join(data_dir, trans_data)
source_dir = os.path.join(data_dir, original_data)
@ -137,59 +168,115 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data
segments = _parse_transcriptions(trans_file)
# Open wav corresponding to transcription file
wav_filenames = [os.path.splitext(os.path.basename(trans_file))[0] + "_c" + channel + ".wav" for channel in ["1", "2"]]
wav_files = [os.path.join(source_dir, wav_filename) for wav_filename in wav_filenames]
wav_filenames = [
os.path.splitext(os.path.basename(trans_file))[0]
+ "_c"
+ channel
+ ".wav"
for channel in ["1", "2"]
]
wav_files = [
os.path.join(source_dir, wav_filename) for wav_filename in wav_filenames
]
print("splitting {} according to {}".format(wav_files, trans_file))
origAudios = [librosa.load(wav_file, sr=16000, mono=False) for wav_file in wav_files]
origAudios = [
librosa.load(wav_file, sr=16000, mono=False) for wav_file in wav_files
]
# Loop over segments and split wav_file for each segment
for segment in segments:
# Create wav segment filename
start_time = segment["start_time"]
stop_time = segment["stop_time"]
new_wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".wav"
new_wav_filename = (
os.path.splitext(os.path.basename(trans_file))[0]
+ "-"
+ str(start_time)
+ "-"
+ str(stop_time)
+ ".wav"
)
new_wav_file = os.path.join(target_dir, new_wav_filename)
channel = 0 if segment["speaker"] == "A:" else 1
_split_and_resample_wav(origAudios[channel], start_time, stop_time, new_wav_file)
_split_and_resample_wav(
origAudios[channel], start_time, stop_time, new_wav_file
)
new_wav_filesize = os.path.getsize(new_wav_file)
transcript = validate_label(segment["transcript"])
if transcript != None:
files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript))
files.append(
(os.path.abspath(new_wav_file), new_wav_filesize, transcript)
)
return pandas.DataFrame(
data=files, columns=["wav_filename", "wav_filesize", "transcript"]
)
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
def _split_audio(origAudio, start_time, stop_time):
audioData, frameRate = origAudio
nChannels = len(audioData.shape)
startIndex = int(start_time * frameRate)
stopIndex = int(stop_time * frameRate)
return audioData[startIndex: stopIndex] if 1 == nChannels else audioData[:, startIndex: stopIndex]
return (
audioData[startIndex:stopIndex]
if 1 == nChannels
else audioData[:, startIndex:stopIndex]
)
def _split_and_resample_wav(origAudio, start_time, stop_time, new_wav_file):
frameRate = origAudio[1]
chunkData = _split_audio(origAudio, start_time, stop_time)
soundfile.write(new_wav_file, chunkData, frameRate, "PCM_16")
def _split_sets(filelist):
# We initially split the entire set into 80% train and 20% test, then
# split the train set into 80% train and 20% validation.
train_beg = 0
train_end = int(0.8 * len(filelist))
dev_beg = int(0.8 * train_end)
dev_end = train_end
train_end = dev_beg
def _split_sets(filelist):
"""
randomly split the dataset into train, validation, and test sets, where the sizes of the
validation and test sets are determined by the `get_sample_size` function.
"""
random.shuffle(filelist)
sample_size = get_sample_size(len(filelist))
train_beg = 0
train_end = len(filelist) - 2 * sample_size
dev_beg = train_end
dev_end = train_end + sample_size
test_beg = dev_end
test_end = len(filelist)
return (filelist[train_beg:train_end],
filelist[dev_beg:dev_end],
filelist[test_beg:test_end])
return (
filelist[train_beg:train_end],
filelist[dev_beg:dev_end],
filelist[test_beg:test_end],
)
def get_sample_size(population_size):
"""Calculates the sample size for a 99% confidence level and a 1% margin of error."""
margin_of_error = 0.01
fraction_picking = 0.50
z_score = 2.58 # Corresponds to confidence level 99%
numerator = (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2
)
sample_size = 0
for train_size in range(population_size, 0, -1):
denominator = 1 + (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2 * train_size
)
sample_size = int(numerator / denominator)
if 2 * sample_size + train_size <= population_size:
break
return sample_size
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])
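get_sample_size above iterates a finite-population-corrected Cochran formula, shrinking the candidate train split until that split plus a dev and a test sample of the resulting size fit inside the full file list. The closed form it evaluates on each iteration, with the constants used above and a hypothetical train split of 100,000 files:

# n0 = z^2 * p * (1 - p) / e^2 with z = 2.58, p = 0.5, e = 0.01
n0 = (2.58 ** 2 * 0.5 * 0.5) / (0.01 ** 2)   # 16641.0
n = int(n0 / (1 + n0 / 100000))              # ~14266 after the finite-population correction
print(n0, n)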

93
bin/import_freestmandarin.py Executable file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env python
import glob
import os
import tarfile
import numpy as np
import pandas
from coqui_stt_training.util.importers import get_importers_parser
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
def extract(archive_path, target_dir):
print("Extracting {} into {}...".format(archive_path, target_dir))
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
def preprocess_data(tgz_file, target_dir):
# First extract main archive and sub-archives
extract(tgz_file, target_dir)
main_folder = os.path.join(target_dir, "ST-CMDS-20170001_1-OS")
# Folder structure is now:
# - ST-CMDS-20170001_1-OS/
# - *.wav
# - *.txt
# - *.metadata
def load_set(glob_path):
set_files = []
for wav in glob.glob(glob_path):
wav_filename = wav
wav_filesize = os.path.getsize(wav)
txt_filename = os.path.splitext(wav_filename)[0] + ".txt"
with open(txt_filename, "r") as fin:
transcript = fin.read()
set_files.append((wav_filename, wav_filesize, transcript))
return set_files
# Load all files, then deterministically split into train/dev/test sets
all_files = load_set(os.path.join(main_folder, "*.wav"))
df = pandas.DataFrame(data=all_files, columns=COLUMN_NAMES)
df.sort_values(by="wav_filename", inplace=True)
indices = np.arange(0, len(df))
np.random.seed(12345)
np.random.shuffle(indices)
# Total corpus size: 102600 samples. 5000 samples gives us 99% confidence
# level with a margin of error of under 2%.
test_indices = indices[-5000:]
dev_indices = indices[-10000:-5000]
train_indices = indices[:-10000]
train_files = df.iloc[train_indices]
durations = (train_files["wav_filesize"] - 44) / 16000 / 2
train_files = train_files[durations <= 10.0]
print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
dest_csv = os.path.join(target_dir, "freestmandarin_train.csv")
print("Saving train set into {}...".format(dest_csv))
train_files.to_csv(dest_csv, index=False)
dev_files = df.iloc[dev_indices]
dest_csv = os.path.join(target_dir, "freestmandarin_dev.csv")
print("Saving dev set into {}...".format(dest_csv))
dev_files.to_csv(dest_csv, index=False)
test_files = df.iloc[test_indices]
dest_csv = os.path.join(target_dir, "freestmandarin_test.csv")
print("Saving test set into {}...".format(dest_csv))
test_files.to_csv(dest_csv, index=False)
def main():
# https://www.openslr.org/38/
parser = get_importers_parser(description="Import Free ST Chinese Mandarin corpus")
parser.add_argument("tgz_file", help="Path to ST-CMDS-20170001_1-OS.tar.gz")
parser.add_argument(
"--target_dir",
default="",
help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
)
params = parser.parse_args()
if not params.target_dir:
params.target_dir = os.path.dirname(params.tgz_file)
preprocess_data(params.tgz_file, params.target_dir)
if __name__ == "__main__":
main()
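The comment above claims that holding out 5,000 of the 102,600 samples for test (and another 5,000 for dev) gives a 99% confidence level with a margin of error under 2%. A quick check of that claim, assuming the worst-case proportion p = 0.5 and z = 2.58 for 99% confidence:

import math
n = 5000
margin_of_error = 2.58 * math.sqrt(0.25 / n)
print(margin_of_error)  # ~0.018, i.e. roughly 1.8%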

365
bin/import_gram_vaani.py Executable file
View File

@ -0,0 +1,365 @@
#!/usr/bin/env python
import csv
import logging
import math
import os
import subprocess
import sys
import urllib
from pathlib import Path
import pandas as pd
import swifter
from coqui_stt_training.util.importers import get_importers_parser, get_validate_label
from sox import Transformer
__version__ = "0.1.0"
_logger = logging.getLogger(__name__)
MAX_SECS = 10
BITDEPTH = 16
N_CHANNELS = 1
SAMPLE_RATE = 16000
DEV_PERCENTAGE = 0.10
TRAIN_PERCENTAGE = 0.80
def parse_args(args):
"""Parse command line parameters
Args:
args ([str]): Command line parameters as list of strings
Returns:
:obj:`argparse.Namespace`: command line parameters namespace
"""
parser = get_importers_parser(description="Imports GramVaani data for Deep Speech")
parser.add_argument(
"--version",
action="version",
version="GramVaaniImporter {ver}".format(ver=__version__),
)
parser.add_argument(
"-v",
"--verbose",
action="store_const",
required=False,
help="set loglevel to INFO",
dest="loglevel",
const=logging.INFO,
)
parser.add_argument(
"-vv",
"--very-verbose",
action="store_const",
required=False,
help="set loglevel to DEBUG",
dest="loglevel",
const=logging.DEBUG,
)
parser.add_argument(
"-c",
"--csv_filename",
required=True,
help="Path to the GramVaani csv",
dest="csv_filename",
)
parser.add_argument(
"-t",
"--target_dir",
required=True,
help="Directory in which to save the importer GramVaani data",
dest="target_dir",
)
return parser.parse_args(args)
def setup_logging(level):
"""Setup basic logging
Args:
level (int): minimum log level for emitting messages
"""
format = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
logging.basicConfig(
level=level, stream=sys.stdout, format=format, datefmt="%Y-%m-%d %H:%M:%S"
)
class GramVaaniCSV:
"""GramVaaniCSV representing a GramVaani dataset.
Args:
csv_filename (str): Path to the GramVaani csv
Attributes:
data (:class:`pandas.DataFrame`): `pandas.DataFrame` Containing the GramVaani csv data
"""
def __init__(self, csv_filename):
self.data = self._parse_csv(csv_filename)
def _parse_csv(self, csv_filename):
_logger.info("Parsing csv file...%s", os.path.abspath(csv_filename))
data = pd.read_csv(
os.path.abspath(csv_filename),
names=[
"piece_id",
"audio_url",
"transcript_labelled",
"transcript",
"labels",
"content_filename",
"audio_length",
"user_id",
],
usecols=["audio_url", "transcript", "audio_length"],
skiprows=[0],
engine="python",
encoding="utf-8",
quotechar='"',
quoting=csv.QUOTE_ALL,
)
data.dropna(inplace=True)
_logger.info("Parsed %d lines csv file." % len(data))
return data
class GramVaaniDownloader:
"""GramVaaniDownloader downloads a GramVaani dataset.
Args:
gram_vaani_csv (GramVaaniCSV): A GramVaaniCSV representing the data to download
target_dir (str): The path to download the data to
Attributes:
data (:class:`pandas.DataFrame`): `pandas.DataFrame` Containing the GramVaani csv data
"""
def __init__(self, gram_vaani_csv, target_dir):
self.target_dir = target_dir
self.data = gram_vaani_csv.data
def download(self):
"""Downloads the data associated with this instance
Return:
mp3_directory (os.path): The directory into which the associated mp3's were downloaded
"""
mp3_directory = self._pre_download()
self.data.swifter.apply(
func=lambda arg: self._download(*arg, mp3_directory), axis=1, raw=True
)
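# swifter picks a vectorised or parallel backend for the row-wise apply, so the
# per-row _download calls can run concurrently instead of strictly one at a time.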
return mp3_directory
def _pre_download(self):
mp3_directory = os.path.join(self.target_dir, "mp3")
if not os.path.exists(self.target_dir):
_logger.info("Creating directory...%s", self.target_dir)
os.mkdir(self.target_dir)
if not os.path.exists(mp3_directory):
_logger.info("Creating directory...%s", mp3_directory)
os.mkdir(mp3_directory)
return mp3_directory
def _download(self, audio_url, transcript, audio_length, mp3_directory):
if audio_url == "audio_url":
return
mp3_filename = os.path.join(mp3_directory, os.path.basename(audio_url))
if not os.path.exists(mp3_filename):
_logger.debug("Downloading mp3 file...%s", audio_url)
urllib.request.urlretrieve(audio_url, mp3_filename)
else:
_logger.debug("Already downloaded mp3 file...%s", audio_url)
class GramVaaniConverter:
"""GramVaaniConverter converts the mp3's to wav's for a GramVaani dataset.
Args:
target_dir (str): The root directory the converted wav data will be written under
mp3_directory (os.path): The path containing the GramVaani mp3's
Attributes:
target_dir (str): The target directory passed as a command line argument
mp3_directory (os.path): The path containing the GramVaani mp3's
"""
def __init__(self, target_dir, mp3_directory):
self.target_dir = target_dir
self.mp3_directory = Path(mp3_directory)
def convert(self):
"""Converts the mp3's associated with this instance to wav's
Return:
wav_directory (os.path): The directory into which the associated wav's were written
"""
wav_directory = self._pre_convert()
for mp3_filename in self.mp3_directory.glob("**/*.mp3"):
wav_filename = os.path.join(
wav_directory,
os.path.splitext(os.path.basename(mp3_filename))[0] + ".wav",
)
if not os.path.exists(wav_filename):
_logger.debug(
"Converting mp3 file %s to wav file %s"
% (mp3_filename, wav_filename)
)
transformer = Transformer()
transformer.convert(
samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH
)
transformer.build(str(mp3_filename), str(wav_filename))
else:
_logger.debug(
"Already converted mp3 file %s to wav file %s"
% (mp3_filename, wav_filename)
)
return wav_directory
def _pre_convert(self):
wav_directory = os.path.join(self.target_dir, "wav")
if not os.path.exists(self.target_dir):
_logger.info("Creating directory...%s", self.target_dir)
os.mkdir(self.target_dir)
if not os.path.exists(wav_directory):
_logger.info("Creating directory...%s", wav_directory)
os.mkdir(wav_directory)
return wav_directory
class GramVaaniDataSets:
def __init__(self, target_dir, wav_directory, gram_vaani_csv):
self.target_dir = target_dir
self.wav_directory = wav_directory
self.csv_data = gram_vaani_csv.data
self.raw = pd.DataFrame(columns=["wav_filename", "wav_filesize", "transcript"])
self.valid = pd.DataFrame(
columns=["wav_filename", "wav_filesize", "transcript"]
)
self.train = pd.DataFrame(
columns=["wav_filename", "wav_filesize", "transcript"]
)
self.dev = pd.DataFrame(columns=["wav_filename", "wav_filesize", "transcript"])
self.test = pd.DataFrame(columns=["wav_filename", "wav_filesize", "transcript"])
def create(self):
self._convert_csv_data_to_raw_data()
self.raw.index = range(len(self.raw.index))
self.valid = self.raw[self._is_valid_raw_rows()]
self.valid = self.valid.sample(frac=1).reset_index(drop=True)
train_size, dev_size, test_size = self._calculate_data_set_sizes()
self.train = self.valid.loc[0:train_size]
self.dev = self.valid.loc[train_size : train_size + dev_size]
self.test = self.valid.loc[
train_size + dev_size : train_size + dev_size + test_size
]
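# Note: pandas .loc slices are end-inclusive, so adjacent splits share a boundary
# row and the resulting sizes are only approximately TRAIN/DEV_PERCENTAGE of valid.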
def _convert_csv_data_to_raw_data(self):
self.raw[["wav_filename", "wav_filesize", "transcript"]] = self.csv_data[
["audio_url", "transcript", "audio_length"]
].swifter.apply(
func=lambda arg: self._convert_csv_data_to_raw_data_impl(*arg),
axis=1,
raw=True,
)
self.raw.reset_index()
def _convert_csv_data_to_raw_data_impl(self, audio_url, transcript, audio_length):
if audio_url == "audio_url":
return pd.Series(["wav_filename", "wav_filesize", "transcript"])
mp3_filename = os.path.basename(audio_url)
wav_relative_filename = os.path.join(
"wav", os.path.splitext(os.path.basename(mp3_filename))[0] + ".wav"
)
wav_filesize = os.path.getsize(
os.path.join(self.target_dir, wav_relative_filename)
)
transcript = validate_label(transcript)
if transcript is None:
transcript = ""
return pd.Series([wav_relative_filename, wav_filesize, transcript])
def _is_valid_raw_rows(self):
is_valid_raw_transcripts = self._is_valid_raw_transcripts()
is_valid_raw_wav_frames = self._is_valid_raw_wav_frames()
is_valid_raw_row = [
(is_valid_raw_transcript & is_valid_raw_wav_frame)
for is_valid_raw_transcript, is_valid_raw_wav_frame in zip(
is_valid_raw_transcripts, is_valid_raw_wav_frames
)
]
series = pd.Series(is_valid_raw_row)
return series
def _is_valid_raw_transcripts(self):
return pd.Series([bool(transcript) for transcript in self.raw.transcript])
def _is_valid_raw_wav_frames(self):
transcripts = [str(transcript) for transcript in self.raw.transcript]
wav_filepaths = [
os.path.join(self.target_dir, str(wav_filename))
for wav_filename in self.raw.wav_filename
]
wav_frames = [
int(
subprocess.check_output(
["soxi", "-s", wav_filepath], stderr=subprocess.STDOUT
)
)
for wav_filepath in wav_filepaths
]
is_valid_raw_wav_frames = [
self._is_wav_frame_valid(wav_frame, transcript)
for wav_frame, transcript in zip(wav_frames, transcripts)
]
return pd.Series(is_valid_raw_wav_frames)
def _is_wav_frame_valid(self, wav_frame, transcript):
is_wav_frame_valid = True
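# Heuristic: frames / SAMPLE_RATE * 1000 / 10 / 2 is the number of 20 ms feature
# windows in the clip; if there are fewer windows than transcript characters, CTC
# cannot emit every label, so the sample is rejected as too short.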
if int(wav_frame / SAMPLE_RATE * 1000 / 10 / 2) < len(str(transcript)):
is_wav_frame_valid = False
elif wav_frame / SAMPLE_RATE > MAX_SECS:
is_wav_frame_valid = False
return is_wav_frame_valid
def _calculate_data_set_sizes(self):
total_size = len(self.valid)
dev_size = math.floor(total_size * DEV_PERCENTAGE)
train_size = math.floor(total_size * TRAIN_PERCENTAGE)
test_size = total_size - (train_size + dev_size)
return (train_size, dev_size, test_size)
def save(self):
datasets = ["train", "dev", "test"]
for dataset in datasets:
self._save(dataset)
def _save(self, dataset):
dataset_path = os.path.join(self.target_dir, dataset + ".csv")
dataframe = getattr(self, dataset)
dataframe.to_csv(
dataset_path,
index=False,
encoding="utf-8",
escapechar="\\",
quoting=csv.QUOTE_MINIMAL,
)
def main(args):
"""Main entry point allowing external calls
Args:
args ([str]): command line parameter list
"""
args = parse_args(args)
# GramVaaniDataSets looks up validate_label at module scope, so publish it there.
global validate_label
validate_label = get_validate_label(args)
setup_logging(args.loglevel)
_logger.info("Starting GramVaani importer...")
_logger.info("Starting loading GramVaani csv...")
csv = GramVaaniCSV(args.csv_filename)
_logger.info("Starting downloading GramVaani mp3's...")
downloader = GramVaaniDownloader(csv, args.target_dir)
mp3_directory = downloader.download()
_logger.info("Starting converting GramVaani mp3's to wav's...")
converter = GramVaaniConverter(args.target_dir, mp3_directory)
wav_directory = converter.convert()
datasets = GramVaaniDataSets(args.target_dir, wav_directory, csv)
datasets.create()
datasets.save()
_logger.info("Finished GramVaani importer...")
if __name__ == "__main__":
main(sys.argv[1:])


@ -1,28 +1,32 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import sys
import pandas
from coqui_stt_training.util.downloader import maybe_download
from util.downloader import maybe_download
def _download_and_preprocess_data(data_dir):
# Conditionally download data
LDC93S1_BASE = "LDC93S1"
LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
local_file = maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
trans_file = maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
local_file = maybe_download(
LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav"
)
trans_file = maybe_download(
LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt"
)
with open(trans_file, "r") as fin:
transcript = ' '.join(fin.read().strip().lower().split(' ')[2:]).replace('.', '')
transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace(
".", ""
)
df = pandas.DataFrame(data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
columns=["wav_filename", "wav_filesize", "transcript"])
df = pandas.DataFrame(
data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
columns=["wav_filename", "wav_filesize", "transcript"],
)
df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])


@ -1,31 +1,38 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import codecs
import fnmatch
import pandas
import progressbar
import os
import subprocess
import sys
import tarfile
import unicodedata
import pandas
import progressbar
from coqui_stt_training.util.downloader import maybe_download
from sox import Transformer
from util.downloader import maybe_download
from tensorflow.python.platform import gfile
SAMPLE_RATE = 16000
def _download_and_preprocess_data(data_dir):
# Conditionally download data to data_dir
print("Downloading Librivox data set (55GB) into {} if not already present...".format(data_dir))
print(
"Downloading Librivox data set (55GB) into {} if not already present...".format(
data_dir
)
)
with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"
TRAIN_CLEAN_100_URL = (
"http://www.openslr.org/resources/12/train-clean-100.tar.gz"
)
TRAIN_CLEAN_360_URL = (
"http://www.openslr.org/resources/12/train-clean-360.tar.gz"
)
TRAIN_OTHER_500_URL = (
"http://www.openslr.org/resources/12/train-other-500.tar.gz"
)
DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"
@ -33,12 +40,20 @@ def _download_and_preprocess_data(data_dir):
TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"
def filename_of(x): return os.path.split(x)[1]
train_clean_100 = maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
def filename_of(x):
return os.path.split(x)[1]
train_clean_100 = maybe_download(
filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL
)
bar.update(0)
train_clean_360 = maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
train_clean_360 = maybe_download(
filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL
)
bar.update(1)
train_other_500 = maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
train_other_500 = maybe_download(
filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL
)
bar.update(2)
dev_clean = maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
@ -46,9 +61,13 @@ def _download_and_preprocess_data(data_dir):
dev_other = maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
bar.update(4)
test_clean = maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
test_clean = maybe_download(
filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL
)
bar.update(5)
test_other = maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
test_other = maybe_download(
filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL
)
bar.update(6)
# Conditionally extract LibriSpeech data
@ -59,11 +78,17 @@ def _download_and_preprocess_data(data_dir):
LIBRIVOX_DIR = "LibriSpeech"
work_dir = os.path.join(data_dir, LIBRIVOX_DIR)
_maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100)
_maybe_extract(
data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100
)
bar.update(0)
_maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360)
_maybe_extract(
data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360
)
bar.update(1)
_maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500)
_maybe_extract(
data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500
)
bar.update(2)
_maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean)
@ -89,28 +114,48 @@ def _download_and_preprocess_data(data_dir):
# data_dir/LibriSpeech/split-wav/1-2-2.txt
# ...
print("Converting FLAC to WAV and splitting transcriptions...")
with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
train_100 = _convert_audio_and_split_sentences(work_dir, "train-clean-100", "train-clean-100-wav")
with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
train_100 = _convert_audio_and_split_sentences(
work_dir, "train-clean-100", "train-clean-100-wav"
)
bar.update(0)
train_360 = _convert_audio_and_split_sentences(work_dir, "train-clean-360", "train-clean-360-wav")
train_360 = _convert_audio_and_split_sentences(
work_dir, "train-clean-360", "train-clean-360-wav"
)
bar.update(1)
train_500 = _convert_audio_and_split_sentences(work_dir, "train-other-500", "train-other-500-wav")
train_500 = _convert_audio_and_split_sentences(
work_dir, "train-other-500", "train-other-500-wav"
)
bar.update(2)
dev_clean = _convert_audio_and_split_sentences(work_dir, "dev-clean", "dev-clean-wav")
dev_clean = _convert_audio_and_split_sentences(
work_dir, "dev-clean", "dev-clean-wav"
)
bar.update(3)
dev_other = _convert_audio_and_split_sentences(work_dir, "dev-other", "dev-other-wav")
dev_other = _convert_audio_and_split_sentences(
work_dir, "dev-other", "dev-other-wav"
)
bar.update(4)
test_clean = _convert_audio_and_split_sentences(work_dir, "test-clean", "test-clean-wav")
test_clean = _convert_audio_and_split_sentences(
work_dir, "test-clean", "test-clean-wav"
)
bar.update(5)
test_other = _convert_audio_and_split_sentences(work_dir, "test-other", "test-other-wav")
test_other = _convert_audio_and_split_sentences(
work_dir, "test-other", "test-other-wav"
)
bar.update(6)
# Write sets to disk as CSV files
train_100.to_csv(os.path.join(data_dir, "librivox-train-clean-100.csv"), index=False)
train_360.to_csv(os.path.join(data_dir, "librivox-train-clean-360.csv"), index=False)
train_500.to_csv(os.path.join(data_dir, "librivox-train-other-500.csv"), index=False)
train_100.to_csv(
os.path.join(data_dir, "librivox-train-clean-100.csv"), index=False
)
train_360.to_csv(
os.path.join(data_dir, "librivox-train-clean-360.csv"), index=False
)
train_500.to_csv(
os.path.join(data_dir, "librivox-train-other-500.csv"), index=False
)
dev_clean.to_csv(os.path.join(data_dir, "librivox-dev-clean.csv"), index=False)
dev_other.to_csv(os.path.join(data_dir, "librivox-dev-other.csv"), index=False)
@ -118,6 +163,7 @@ def _download_and_preprocess_data(data_dir):
test_clean.to_csv(os.path.join(data_dir, "librivox-test-clean.csv"), index=False)
test_other.to_csv(os.path.join(data_dir, "librivox-test-other.csv"), index=False)
def _maybe_extract(data_dir, extracted_data, archive):
# If data_dir/extracted_data does not exist, extract archive in data_dir
if not gfile.Exists(os.path.join(data_dir, extracted_data)):
@ -125,6 +171,7 @@ def _maybe_extract(data_dir, extracted_data, archive):
tar.extractall(data_dir)
tar.close()
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
source_dir = os.path.join(extracted_dir, data_set)
target_dir = os.path.join(extracted_dir, dest_dir)
@ -147,20 +194,22 @@ def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
# We also convert the corresponding FLACs to WAV in the same pass
files = []
for root, dirnames, filenames in os.walk(source_dir):
for filename in fnmatch.filter(filenames, '*.trans.txt'):
for filename in fnmatch.filter(filenames, "*.trans.txt"):
trans_filename = os.path.join(root, filename)
with codecs.open(trans_filename, "r", "utf-8") as fin:
for line in fin:
# Parse each segment line
first_space = line.find(" ")
seqid, transcript = line[:first_space], line[first_space+1:]
seqid, transcript = line[:first_space], line[first_space + 1 :]
# We need to do the encode-decode dance here because encode
# returns a bytes() object on Python 3, and text_to_char_array
# expects a string.
transcript = unicodedata.normalize("NFKD", transcript) \
.encode("ascii", "ignore") \
.decode("ascii", "ignore")
transcript = (
unicodedata.normalize("NFKD", transcript)
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
transcript = transcript.lower().strip()
@ -168,12 +217,17 @@ def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
flac_file = os.path.join(root, seqid + ".flac")
wav_file = os.path.join(target_dir, seqid + ".wav")
if not os.path.exists(wav_file):
Transformer().build(flac_file, wav_file)
tfm = Transformer()
tfm.set_output_format(rate=SAMPLE_RATE)
tfm.build(flac_file, wav_file)
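# set_output_format(rate=SAMPLE_RATE) pins the WAV output to 16 kHz explicitly
# instead of inheriting whatever rate the source FLAC happens to use.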
wav_filesize = os.path.getsize(wav_file)
files.append((os.path.abspath(wav_file), wav_filesize, transcript))
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
return pandas.DataFrame(
data=files, columns=["wav_filename", "wav_filesize", "transcript"]
)
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])

266
bin/import_lingua_libre.py Executable file

@ -0,0 +1,266 @@
#!/usr/bin/env python3
import argparse
import csv
import os
import re
import subprocess
import unicodedata
import zipfile
from glob import glob
from multiprocessing import Pool
import progressbar
import sox
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
get_importers_parser,
get_validate_label,
print_import_report,
)
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
BITDEPTH = 16
N_CHANNELS = 1
MAX_SECS = 10
ARCHIVE_DIR_NAME = "lingua_libre"
ARCHIVE_NAME = "Q{qId}-{iso639_3}-{language_English_name}.zip"
ARCHIVE_URL = "https://lingualibre.fr/datasets/" + ARCHIVE_NAME
def _download_and_preprocess_data(target_dir):
# Making path absolute
target_dir = os.path.abspath(target_dir)
# Conditionally download data
archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
# Conditionally extract data
_maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
# Produce CSV files and convert ogg data to wav
_maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = os.path.join(target_dir, extracted_data)
if not os.path.exists(extracted_path):
print('No directory "%s" - extracting archive...' % extracted_path)
if not os.path.isdir(extracted_path):
os.mkdir(extracted_path)
with zipfile.ZipFile(archive_path) as zip_f:
zip_f.extractall(extracted_path)
else:
print('Found directory "%s" - not extracting it from archive.' % archive_path)
def one_sample(sample):
""" Take a audio file, and optionally convert it to 16kHz WAV """
ogg_filename = sample[0]
# Storing wav files next to the ogg ones - just with a different suffix
wav_filename = os.path.splitext(ogg_filename)[0] + ".wav"
_maybe_convert_wav(ogg_filename, wav_filename)
file_size = -1
frames = 0
if os.path.exists(wav_filename):
file_size = os.path.getsize(wav_filename)
frames = int(
subprocess.check_output(
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = label_filter(sample[1])
rows = []
counter = get_counter()
if file_size == -1:
# Excluding samples that failed upon conversion
counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter["imported_time"] += frames
counter["all"] += 1
counter["total_time"] += frames
return (counter, rows)
def _maybe_convert_sets(target_dir, extracted_data):
extracted_dir = os.path.join(target_dir, extracted_data)
# override existing CSV with normalized one
target_csv_template = os.path.join(
target_dir, ARCHIVE_DIR_NAME + "_" + ARCHIVE_NAME.replace(".zip", "_{}.csv")
)
if os.path.isfile(target_csv_template.format("train")):
return
ogg_root_dir = os.path.join(extracted_dir, ARCHIVE_NAME.replace(".zip", ""))
# Get audiofile path and transcript for each sentence in tsv
samples = []
glob_dir = os.path.join(ogg_root_dir, "**/*.ogg")
for record in glob(glob_dir, recursive=True):
record_file = record.replace(ogg_root_dir + os.path.sep, "")
if record_filter(record_file):
samples.append(
(
os.path.join(ogg_root_dir, record_file),
os.path.splitext(os.path.basename(record_file))[0],
)
)
counter = get_counter()
num_samples = len(samples)
rows = []
print("Importing ogg files...")
pool = Pool()
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
with open(
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
) as train_csv_file: # 80%
with open(
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
) as dev_csv_file: # 10%
with open(
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
) as test_csv_file: # 10%
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
train_writer.writeheader()
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
dev_writer.writeheader()
test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
test_writer.writeheader()
for i, item in enumerate(rows):
transcript = validate_label(item[2])
if not transcript:
continue
wav_filename = os.path.join(
ogg_root_dir, item[0].replace(".ogg", ".wav")
)
i_mod = i % 10
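# Deterministic round-robin split on the row index: i % 10 == 0 goes to test,
# == 1 goes to dev, everything else to train, i.e. the 80/10/10 split noted above.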
if i_mod == 0:
writer = test_writer
elif i_mod == 1:
writer = dev_writer
else:
writer = train_writer
writer.writerow(
dict(
wav_filename=wav_filename,
wav_filesize=os.path.getsize(wav_filename),
transcript=transcript,
)
)
imported_samples = get_imported_samples(counter)
assert counter["all"] == num_samples
assert len(rows) == imported_samples
print_import_report(counter, SAMPLE_RATE, MAX_SECS)
def _maybe_convert_wav(ogg_filename, wav_filename):
if not os.path.exists(wav_filename):
transformer = sox.Transformer()
transformer.convert(
samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH
)
try:
transformer.build(ogg_filename, wav_filename)
except sox.core.SoxError as ex:
print("SoX processing error", ex, ogg_filename, wav_filename)
def handle_args():
parser = get_importers_parser(
description="Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details."
)
parser.add_argument(dest="target_dir")
parser.add_argument(
"--qId", type=int, required=True, help="LinguaLibre language qId"
)
parser.add_argument(
"--iso639-3", type=str, required=True, help="ISO639-3 language code"
)
parser.add_argument(
"--english-name", type=str, required=True, help="English name of the language"
)
parser.add_argument(
"--filter_alphabet",
help="Exclude samples with characters not in provided alphabet",
)
parser.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
parser.add_argument(
"--bogus-records",
type=argparse.FileType("r"),
required=False,
help="Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items",
)
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
validate_label = get_validate_label(CLI_ARGS)
bogus_regexes = []
if CLI_ARGS.bogus_records:
for line in CLI_ARGS.bogus_records:
bogus_regexes.append(re.compile(line.strip()))
def record_filter(path):
if any(regex.match(path) for regex in bogus_regexes):
print("Reject", path)
return False
return True
def label_filter(label):
if CLI_ARGS.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label
ARCHIVE_NAME = ARCHIVE_NAME.format(
qId=CLI_ARGS.qId,
iso639_3=CLI_ARGS.iso639_3,
language_English_name=CLI_ARGS.english_name,
)
ARCHIVE_URL = ARCHIVE_URL.format(
qId=CLI_ARGS.qId,
iso639_3=CLI_ARGS.iso639_3,
language_English_name=CLI_ARGS.english_name,
)
_download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)

242
bin/import_m-ailabs.py Executable file

@ -0,0 +1,242 @@
#!/usr/bin/env python3
# pylint: disable=invalid-name
import csv
import os
import subprocess
import tarfile
import unicodedata
from glob import glob
from multiprocessing import Pool
import progressbar
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
get_importers_parser,
get_validate_label,
print_import_report,
)
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
MAX_SECS = 15
ARCHIVE_DIR_NAME = "{language}"
ARCHIVE_NAME = "{language}.tgz"
ARCHIVE_URL = "http://www.caito.de/data/Training/stt_tts/" + ARCHIVE_NAME
def _download_and_preprocess_data(target_dir):
# Making path absolute
target_dir = os.path.abspath(target_dir)
# Conditionally download data
archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
# Conditionally extract data
_maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
# Produce CSV files
_maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = os.path.join(target_dir, extracted_data)
if not os.path.exists(extracted_path):
print('No directory "%s" - extracting archive...' % extracted_path)
if not os.path.isdir(extracted_path):
os.mkdir(extracted_path)
tar = tarfile.open(archive_path)
tar.extractall(extracted_path)
tar.close()
else:
print('Found directory "%s" - not extracting it from archive.' % archive_path)
def one_sample(sample):
""" Take a audio file, and optionally convert it to 16kHz WAV """
wav_filename = sample[0]
file_size = -1
frames = 0
if os.path.exists(wav_filename):
tmp_filename = os.path.splitext(wav_filename)[0] + ".tmp.wav"
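# Re-encode in place to 16 kHz, mono, 16-bit via sox: write to a temp file first,
# then rename over the original so a failed conversion never clobbers the source.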
subprocess.check_call(
[
"sox",
wav_filename,
"-r",
str(SAMPLE_RATE),
"-c",
"1",
"-b",
"16",
tmp_filename,
],
stderr=subprocess.STDOUT,
)
os.rename(tmp_filename, wav_filename)
file_size = os.path.getsize(wav_filename)
frames = int(
subprocess.check_output(
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = label_filter(sample[1])
counter = get_counter()
rows = []
if file_size == -1:
# Excluding samples that failed upon conversion
print("conversion failure", wav_filename)
counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 15 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter["imported_time"] += frames
counter["all"] += 1
counter["total_time"] += frames
return (counter, rows)
def _maybe_convert_sets(target_dir, extracted_data):
extracted_dir = os.path.join(target_dir, extracted_data)
# override existing CSV with normalized one
target_csv_template = os.path.join(
target_dir, ARCHIVE_DIR_NAME, ARCHIVE_NAME.replace(".tgz", "_{}.csv")
)
if os.path.isfile(target_csv_template.format("train")):
return
wav_root_dir = os.path.join(extracted_dir)
# Get audiofile path and transcript for each sentence in tsv
samples = []
glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
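# Each book directory carries an LJSpeech-style metadata.csv with "|"-separated
# columns (assumed here to be: file id | raw transcript | normalised transcript);
# the normalised text in column 2 is used as the training transcript below.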
for record in glob(glob_dir, recursive=True):
if any(
map(lambda sk: sk in record, SKIP_LIST)
): # pylint: disable=cell-var-from-loop
continue
with open(record, "r") as rec:
for re in rec.readlines():
re = re.strip().split("|")
audio = os.path.join(os.path.dirname(record), "wavs", re[0] + ".wav")
transcript = re[2]
samples.append((audio, transcript))
counter = get_counter()
num_samples = len(samples)
rows = []
print("Importing WAV files...")
pool = Pool()
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
with open(
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
) as train_csv_file: # 80%
with open(
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
) as dev_csv_file: # 10%
with open(
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
) as test_csv_file: # 10%
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
train_writer.writeheader()
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
dev_writer.writeheader()
test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
test_writer.writeheader()
for i, item in enumerate(rows):
transcript = validate_label(item[2])
if not transcript:
continue
wav_filename = item[0]
i_mod = i % 10
if i_mod == 0:
writer = test_writer
elif i_mod == 1:
writer = dev_writer
else:
writer = train_writer
writer.writerow(
dict(
wav_filename=os.path.relpath(wav_filename, extracted_dir),
wav_filesize=os.path.getsize(wav_filename),
transcript=transcript,
)
)
imported_samples = get_imported_samples(counter)
assert counter["all"] == num_samples
assert len(rows) == imported_samples
print_import_report(counter, SAMPLE_RATE, MAX_SECS)
def handle_args():
parser = get_importers_parser(
description="Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/."
)
parser.add_argument(dest="target_dir")
parser.add_argument(
"--filter_alphabet",
help="Exclude samples with characters not in provided alphabet",
)
parser.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
parser.add_argument(
"--skiplist",
type=str,
default="",
help="Directories / books to skip, comma separated",
)
parser.add_argument(
"--language", required=True, type=str, help="Dataset language to use"
)
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
SKIP_LIST = filter(None, CLI_ARGS.skiplist.split(","))
validate_label = get_validate_label(CLI_ARGS)
def label_filter(label):
if CLI_ARGS.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label
ARCHIVE_DIR_NAME = ARCHIVE_DIR_NAME.format(language=CLI_ARGS.language)
ARCHIVE_NAME = ARCHIVE_NAME.format(language=CLI_ARGS.language)
ARCHIVE_URL = ARCHIVE_URL.format(language=CLI_ARGS.language)
_download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)

127
bin/import_magicdata.py Executable file

@ -0,0 +1,127 @@
#!/usr/bin/env python
import glob
import os
import tarfile
import wave
import pandas
from coqui_stt_training.util.importers import get_importers_parser
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
def extract(archive_path, target_dir):
print("Extracting {} into {}...".format(archive_path, target_dir))
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
def is_file_truncated(wav_filename, wav_filesize):
with wave.open(wav_filename, mode="rb") as fin:
assert fin.getframerate() == 16000
assert fin.getsampwidth() == 2
assert fin.getnchannels() == 1
header_duration = fin.getnframes() / fin.getframerate()
filesize_duration = (wav_filesize - 44) / 16000 / 2
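# For an intact 16 kHz/16-bit/mono WAV the header's frame count and the payload
# size (file size minus the 44-byte header) imply the same duration; a mismatch
# means the file was truncated after the header was written.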
return header_duration != filesize_duration
def preprocess_data(folder_with_archives, target_dir):
# First extract subset archives
for subset in ("train", "dev", "test"):
extract(
os.path.join(
folder_with_archives, "magicdata_{}_set.tar.gz".format(subset)
),
target_dir,
)
# Folder structure is now:
# - magicdata_{train,dev,test}.tar.gz
# - magicdata/
# - train/*.wav
# - train/TRANS.txt
# - dev/*.wav
# - dev/TRANS.txt
# - test/*.wav
# - test/TRANS.txt
# The TRANS files are CSVs with three columns, one containing the WAV file
# name, one containing the speaker ID, and one containing the transcription
def load_set(set_path):
transcripts = pandas.read_csv(
os.path.join(set_path, "TRANS.txt"), sep="\t", index_col=0
)
glob_path = os.path.join(set_path, "*", "*.wav")
set_files = []
for wav in glob.glob(glob_path):
try:
wav_filename = wav
wav_filesize = os.path.getsize(wav)
transcript_key = os.path.basename(wav)
transcript = transcripts.loc[transcript_key, "Transcription"]
# Some files in this dataset are truncated, the header duration
# doesn't match the file size. This causes errors at training
# time, so check here if things are fine before including a file
if is_file_truncated(wav_filename, wav_filesize):
print(
"Warning: File {} is corrupted, header duration does "
"not match file size. Ignoring.".format(wav_filename)
)
continue
set_files.append((wav_filename, wav_filesize, transcript))
except KeyError:
print("Warning: Missing transcript for WAV file {}.".format(wav))
return set_files
for subset in ("train", "dev", "test"):
print("Loading {} set samples...".format(subset))
subset_files = load_set(os.path.join(target_dir, subset))
df = pandas.DataFrame(data=subset_files, columns=COLUMN_NAMES)
# Trim train set to under 10s
if subset == "train":
durations = (df["wav_filesize"] - 44) / 16000 / 2
df = df[durations <= 10.0]
print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
with_noise = df["transcript"].str.contains(r"\[(?:FIL|SPK)\]")
df = df[~with_noise]
print(
"Trimming {} samples with noise ([FIL] or [SPK])".format(
sum(with_noise)
)
)
dest_csv = os.path.join(target_dir, "magicdata_{}.csv".format(subset))
print("Saving {} set into {}...".format(subset, dest_csv))
df.to_csv(dest_csv, index=False)
def main():
# https://openslr.org/68/
parser = get_importers_parser(description="Import MAGICDATA corpus")
parser.add_argument(
"folder_with_archives",
help="Path to folder containing magicdata_{train,dev,test}.tar.gz",
)
parser.add_argument(
"--target_dir",
default="",
help="Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives",
)
params = parser.parse_args()
if not params.target_dir:
params.target_dir = os.path.join(params.folder_with_archives, "magicdata")
preprocess_data(params.folder_with_archives, params.target_dir)
if __name__ == "__main__":
main()

99
bin/import_mls_english.py Normal file

@ -0,0 +1,99 @@
#!/usr/bin/env python
import argparse
import ctypes
import os
from pathlib import Path
import pandas
import pyogg
from tqdm import tqdm
def read_ogg_opus_duration(ogg_file_path):
error = ctypes.c_int()
opusfile = pyogg.opus.op_open_file(
ogg_file_path.encode("utf-8"), ctypes.pointer(error)
)
if error.value != 0:
raise ValueError(
("Ogg/Opus file could not be read." "Error code: {}").format(error.value)
)
pcm_buffer_size = pyogg.opus.op_pcm_total(opusfile, -1)
channel_count = pyogg.opus.op_channel_count(opusfile, -1)
sample_rate = 48000 # opus files are always 48kHz
sample_width = 2 # always 16-bit
pyogg.opus.op_free(opusfile)
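# op_pcm_total() counts decoded PCM samples at the fixed 48 kHz Opus decode rate,
# so dividing by sample_rate below yields the clip duration in seconds.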
return pcm_buffer_size / sample_rate
def main(root_dir):
for subset in (
"train",
"dev",
"test",
):
print("Processing {} subset...".format(subset))
with open(Path(root_dir) / subset / "transcripts.txt") as fin:
subset_entries = []
for i, line in tqdm(enumerate(fin)):
audio_id, transcript = line.split("\t")
audio_id_parts = audio_id.split("_")
# e.g. 4800_10003_000000 -> train/audio/4800/10003/4800_10003_000000.opus
audio_path = (
Path(root_dir)
/ subset
/ "audio"
/ audio_id_parts[0]
/ audio_id_parts[1]
/ "{}.opus".format(audio_id)
)
audio_duration = read_ogg_opus_duration(audio_path)
# TODO: support other languages
transcript = (
transcript.strip()
.replace("-", " ")
.replace("ñ", "n")
.replace(".", "")
.translate(
{
ord(ch): None
for ch in (
"а",
"в",
"е",
"и",
"к",
"м",
"н",
"о",
"п",
"р",
"т",
"ы",
"я",
)
}
)
)
subset_entries.append(
(
audio_path.relative_to(root_dir),
audio_duration,
transcript.strip(),
)
)
df = pandas.DataFrame(
columns=["wav_filename", "wav_filesize", "transcript"],
data=subset_entries,
)
csv_name = Path(root_dir) / "{}.csv".format(subset)
df.to_csv(csv_name, index=False)
print("Wrote {}".format(csv_name))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("root_dir", help="Path to the mls_english_opus directory.")
args = parser.parse_args()
main(args.root_dir)

102
bin/import_primewords.py Executable file

@ -0,0 +1,102 @@
#!/usr/bin/env python
import glob
import json
import os
import tarfile
import numpy as np
import pandas
from coqui_stt_training.util.importers import get_importers_parser
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
def extract(archive_path, target_dir):
print("Extracting {} into {}...".format(archive_path, target_dir))
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
def preprocess_data(tgz_file, target_dir):
# First extract main archive and sub-archives
extract(tgz_file, target_dir)
main_folder = os.path.join(target_dir, "primewords_md_2018_set1")
# Folder structure is now:
# - primewords_md_2018_set1/
# - audio_files/
# - [0-f]/[00-0f]/*.wav
# - set1_transcript.json
transcripts_path = os.path.join(main_folder, "set1_transcript.json")
with open(transcripts_path) as fin:
transcripts = json.load(fin)
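# set1_transcript.json is a list of objects with (at least) "file" and "text" keys;
# flatten it into a wav-filename -> transcript lookup for the glob pass below.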
transcripts = {entry["file"]: entry["text"] for entry in transcripts}
def load_set(glob_path):
set_files = []
for wav in glob.glob(glob_path):
try:
wav_filename = wav
wav_filesize = os.path.getsize(wav)
transcript_key = os.path.basename(wav)
transcript = transcripts[transcript_key]
set_files.append((wav_filename, wav_filesize, transcript))
except KeyError:
print("Warning: Missing transcript for WAV file {}.".format(wav))
return set_files
# Load all files, then deterministically split into train/dev/test sets
all_files = load_set(os.path.join(main_folder, "audio_files", "*", "*", "*.wav"))
df = pandas.DataFrame(data=all_files, columns=COLUMN_NAMES)
df.sort_values(by="wav_filename", inplace=True)
indices = np.arange(0, len(df))
np.random.seed(12345)
np.random.shuffle(indices)
# Total corpus size: 50287 samples. 5000 samples gives us 99% confidence
# level with a margin of error of under 2%.
test_indices = indices[-5000:]
dev_indices = indices[-10000:-5000]
train_indices = indices[:-10000]
train_files = df.iloc[train_indices]
durations = (train_files["wav_filesize"] - 44) / 16000 / 2
train_files = train_files[durations <= 15.0]
print("Trimming {} samples > 15 seconds".format((durations > 15.0).sum()))
dest_csv = os.path.join(target_dir, "primewords_train.csv")
print("Saving train set into {}...".format(dest_csv))
train_files.to_csv(dest_csv, index=False)
dev_files = df.iloc[dev_indices]
dest_csv = os.path.join(target_dir, "primewords_dev.csv")
print("Saving dev set into {}...".format(dest_csv))
dev_files.to_csv(dest_csv, index=False)
test_files = df.iloc[test_indices]
dest_csv = os.path.join(target_dir, "primewords_test.csv")
print("Saving test set into {}...".format(dest_csv))
test_files.to_csv(dest_csv, index=False)
def main():
# https://www.openslr.org/47/
parser = get_importers_parser(description="Import Primewords Chinese corpus set 1")
parser.add_argument("tgz_file", help="Path to primewords_md_2018_set1.tar.gz")
parser.add_argument(
"--target_dir",
default="",
help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
)
params = parser.parse_args()
if not params.target_dir:
params.target_dir = os.path.dirname(params.tgz_file)
preprocess_data(params.tgz_file, params.target_dir)
if __name__ == "__main__":
main()

236
bin/import_slr57.py Executable file

@ -0,0 +1,236 @@
#!/usr/bin/env python3
import csv
import os
import subprocess
import tarfile
import unicodedata
from glob import glob
from multiprocessing import Pool
import progressbar
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.importers import (
get_counter,
get_imported_samples,
get_importers_parser,
get_validate_label,
print_import_report,
)
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
MAX_SECS = 15
ARCHIVE_DIR_NAME = "African_Accented_French"
ARCHIVE_NAME = "African_Accented_French.tar.gz"
ARCHIVE_URL = "http://www.openslr.org/resources/57/" + ARCHIVE_NAME
def _download_and_preprocess_data(target_dir):
# Making path absolute
target_dir = os.path.abspath(target_dir)
# Conditionally download data
archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
# Conditionally extract data
_maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
# Produce CSV files
_maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = os.path.join(target_dir, extracted_data)
if not os.path.exists(extracted_path):
print('No directory "%s" - extracting archive...' % extracted_path)
if not os.path.isdir(extracted_path):
os.mkdir(extracted_path)
tar = tarfile.open(archive_path)
tar.extractall(target_dir)
tar.close()
else:
print('Found directory "%s" - not extracting it from archive.' % archive_path)
def one_sample(sample):
""" Take a audio file, and optionally convert it to 16kHz WAV """
wav_filename = sample[0]
file_size = -1
frames = 0
if os.path.exists(wav_filename):
file_size = os.path.getsize(wav_filename)
frames = int(
subprocess.check_output(
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = label_filter(sample[1])
counter = get_counter()
rows = []
if file_size == -1:
# Excluding samples that failed upon conversion
counter["failed"] += 1
elif label is None:
# Excluding samples that failed on label validation
counter["invalid_label"] += 1
elif int(frames / SAMPLE_RATE * 1000 / 15 / 2) < len(str(label)):
# Excluding samples that are too short to fit the transcript
counter["too_short"] += 1
elif frames / SAMPLE_RATE > MAX_SECS:
# Excluding very long samples to keep a reasonable batch-size
counter["too_long"] += 1
else:
# This one is good - keep it for the target CSV
rows.append((wav_filename, file_size, label))
counter["imported_time"] += frames
counter["all"] += 1
counter["total_time"] += frames
return (counter, rows)
def _maybe_convert_sets(target_dir, extracted_data):
extracted_dir = os.path.join(target_dir, extracted_data)
# override existing CSV with normalized one
target_csv_template = os.path.join(
target_dir, ARCHIVE_DIR_NAME, ARCHIVE_NAME.replace(".tar.gz", "_{}.csv")
)
if os.path.isfile(target_csv_template.format("train")):
return
wav_root_dir = os.path.join(extracted_dir)
all_files = [
"transcripts/train/yaounde/fn_text.txt",
"transcripts/train/ca16_conv/transcripts.txt",
"transcripts/train/ca16_read/conditioned.txt",
"transcripts/dev/niger_west_african_fr/transcripts.txt",
"speech/dev/niger_west_african_fr/niger_wav_file_name_transcript.tsv",
"transcripts/devtest/ca16_read/conditioned.txt",
"transcripts/test/ca16/prompts.txt",
]
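# The listings above come in slightly different formats (space- or tab-separated,
# utterance ids with or without a ".wav"/".tdf" suffix); merge them all into a
# single wav-basename -> transcript map before matching against the audio files.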
transcripts = {}
for tr in all_files:
with open(os.path.join(target_dir, ARCHIVE_DIR_NAME, tr), "r") as tr_source:
for line in tr_source.readlines():
line = line.strip()
if ".tsv" in tr:
sep = " "
else:
sep = " "
audio = os.path.basename(line.split(sep)[0])
if not (".wav" in audio):
if ".tdf" in audio:
audio = audio.replace(".tdf", ".wav")
else:
audio += ".wav"
transcript = " ".join(line.split(sep)[1:])
transcripts[audio] = transcript
# Get audiofile path and transcript for each sentence in tsv
samples = []
glob_dir = os.path.join(wav_root_dir, "**/*.wav")
for record in glob(glob_dir, recursive=True):
record_file = os.path.basename(record)
if record_file in transcripts:
samples.append((record, transcripts[record_file]))
# Keep track of how many samples are good vs. problematic
counter = get_counter()
num_samples = len(samples)
rows = []
print("Importing WAV files...")
pool = Pool()
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
bar.update(num_samples)
pool.close()
pool.join()
with open(
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
) as train_csv_file: # 80%
with open(
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
) as dev_csv_file: # 10%
with open(
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
) as test_csv_file: # 10%
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
train_writer.writeheader()
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
dev_writer.writeheader()
test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
test_writer.writeheader()
for i, item in enumerate(rows):
transcript = validate_label(item[2])
if not transcript:
continue
wav_filename = item[0]
i_mod = i % 10
if i_mod == 0:
writer = test_writer
elif i_mod == 1:
writer = dev_writer
else:
writer = train_writer
writer.writerow(
dict(
wav_filename=wav_filename,
wav_filesize=os.path.getsize(wav_filename),
transcript=transcript,
)
)
imported_samples = get_imported_samples(counter)
assert counter["all"] == num_samples
assert len(rows) == imported_samples
print_import_report(counter, SAMPLE_RATE, MAX_SECS)
def handle_args():
parser = get_importers_parser(
description="Importer for African Accented French dataset. More information on http://www.openslr.org/57/."
)
parser.add_argument(dest="target_dir")
parser.add_argument(
"--filter_alphabet",
help="Exclude samples with characters not in provided alphabet",
)
parser.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
return parser.parse_args()
if __name__ == "__main__":
CLI_ARGS = handle_args()
ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
validate_label = get_validate_label(CLI_ARGS)
def label_filter(label):
if CLI_ARGS.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label
_download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)


@ -1,44 +1,38 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
# ensure that you have downloaded the LDC dataset LDC97S62 and tar exists in a folder e.g.
# ./data/swb/swb1_LDC97S62.tgz
# from the deepspeech directory run with: ./bin/import_swb.py ./data/swb/
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# from the Coqui STT directory run with: ./bin/import_swb.py ./data/swb/
import codecs
import fnmatch
import pandas
import os
import random
import subprocess
import sys
import tarfile
import unicodedata
import wave
import codecs
import tarfile
import requests
from util.text import validate_label
import librosa
import soundfile # <= Has an external dependency on libsndfile
import pandas
import requests
import soundfile # <= Has an external dependency on libsndfile
from coqui_stt_training.util.importers import validate_label_eng as validate_label
# ARCHIVE_NAME refers to ISIP alignments from 01/29/03
ARCHIVE_NAME = 'switchboard_word_alignments.tar.gz'
ARCHIVE_URL = 'http://www.openslr.org/resources/5/'
ARCHIVE_DIR_NAME = 'LDC97S62'
LDC_DATASET = 'swb1_LDC97S62.tgz'
ARCHIVE_NAME = "switchboard_word_alignments.tar.gz"
ARCHIVE_URL = "http://www.openslr.org/resources/5/"
ARCHIVE_DIR_NAME = "LDC97S62"
LDC_DATASET = "swb1_LDC97S62.tgz"
def download_file(folder, url):
# https://stackoverflow.com/a/16696317/738515
local_filename = url.split('/')[-1]
local_filename = url.split("/")[-1]
full_filename = os.path.join(folder, local_filename)
r = requests.get(url, stream=True)
with open(full_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
with open(full_filename, "wb") as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
return full_filename
@ -46,10 +40,10 @@ def download_file(folder, url):
def maybe_download(archive_url, target_dir, ldc_dataset):
# If archive file does not exist, download it...
archive_path = os.path.join(target_dir, ldc_dataset)
ldc_path = archive_url+ldc_dataset
ldc_path = archive_url + ldc_dataset
if not os.path.exists(target_dir):
print('No path "%s" - creating ...' % target_dir)
makedirs(target_dir)
os.makedirs(target_dir)
if not os.path.exists(archive_path):
print('No archive "%s" - downloading...' % archive_path)
@ -65,17 +59,23 @@ def _download_and_preprocess_data(data_dir):
archive_path = os.path.abspath(os.path.join(data_dir, LDC_DATASET))
# Check swb1_LDC97S62.tgz then extract
assert(os.path.isfile(archive_path))
assert os.path.isfile(archive_path)
_extract(target_dir, archive_path)
# Transcripts
transcripts_path = maybe_download(ARCHIVE_URL, target_dir, ARCHIVE_NAME)
_extract(target_dir, transcripts_path)
# Check swb1_d1/2/3/4/swb_ms98_transcriptions
expected_folders = ["swb1_d1","swb1_d2","swb1_d3","swb1_d4","swb_ms98_transcriptions"]
assert(all([os.path.isdir(os.path.join(target_dir,e)) for e in expected_folders]))
expected_folders = [
"swb1_d1",
"swb1_d2",
"swb1_d3",
"swb1_d4",
"swb_ms98_transcriptions",
]
assert all([os.path.isdir(os.path.join(target_dir, e)) for e in expected_folders])
# Conditionally convert swb sph data to wav
_maybe_convert_wav(target_dir, "swb1_d1", "swb1_d1-wav")
_maybe_convert_wav(target_dir, "swb1_d2", "swb1_d2-wav")
@ -83,13 +83,21 @@ def _download_and_preprocess_data(data_dir):
_maybe_convert_wav(target_dir, "swb1_d4", "swb1_d4-wav")
# Conditionally split wav data
d1 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d1-wav", "swb1_d1-split-wav")
d2 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d2-wav", "swb1_d2-split-wav")
d3 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d3-wav", "swb1_d3-split-wav")
d4 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d4-wav", "swb1_d4-split-wav")
d1 = _maybe_split_wav_and_sentences(
target_dir, "swb_ms98_transcriptions", "swb1_d1-wav", "swb1_d1-split-wav"
)
d2 = _maybe_split_wav_and_sentences(
target_dir, "swb_ms98_transcriptions", "swb1_d2-wav", "swb1_d2-split-wav"
)
d3 = _maybe_split_wav_and_sentences(
target_dir, "swb_ms98_transcriptions", "swb1_d3-wav", "swb1_d3-split-wav"
)
d4 = _maybe_split_wav_and_sentences(
target_dir, "swb_ms98_transcriptions", "swb1_d4-wav", "swb1_d4-split-wav"
)
swb_files = d1.append(d2).append(d3).append(d4)
train_files, dev_files, test_files = _split_sets(swb_files)
# Write sets to disk as CSV files
@ -97,7 +105,7 @@ def _download_and_preprocess_data(data_dir):
dev_files.to_csv(os.path.join(target_dir, "swb-dev.csv"), index=False)
test_files.to_csv(os.path.join(target_dir, "swb-test.csv"), index=False)
def _extract(target_dir, archive_path):
with tarfile.open(archive_path) as tar:
tar.extractall(target_dir)
@ -118,25 +126,46 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
# Loop over sph files in source_dir and convert each to 16-bit PCM wav
for root, dirnames, filenames in os.walk(source_dir):
for filename in fnmatch.filter(filenames, "*.sph"):
for channel in ['1', '2']:
for channel in ["1", "2"]:
sph_file = os.path.join(root, filename)
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + ".wav"
wav_filename = (
os.path.splitext(os.path.basename(sph_file))[0]
+ "-"
+ channel
+ ".wav"
)
wav_file = os.path.join(target_dir, wav_filename)
temp_wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + "-temp.wav"
temp_wav_filename = (
os.path.splitext(os.path.basename(sph_file))[0]
+ "-"
+ channel
+ "-temp.wav"
)
temp_wav_file = os.path.join(target_dir, temp_wav_filename)
print("converting {} to {}".format(sph_file, temp_wav_file))
subprocess.check_call(["sph2pipe", "-c", channel, "-p", "-f", "rif", sph_file, temp_wav_file])
subprocess.check_call(
[
"sph2pipe",
"-c",
channel,
"-p",
"-f",
"rif",
sph_file,
temp_wav_file,
]
)
print("upsampling {} to {}".format(temp_wav_file, wav_file))
audioData, frameRate = librosa.load(temp_wav_file, sr=16000, mono=True)
soundfile.write(wav_file, audioData, frameRate, "PCM_16")
os.remove(temp_wav_file)
def _parse_transcriptions(trans_file):
segments = []
with codecs.open(trans_file, "r", "utf-8") as fin:
for line in fin:
if line.startswith("#") or len(line) <= 1:
if line.startswith("#") or len(line) <= 1:
continue
tokens = line.split()
@ -150,15 +179,19 @@ def _parse_transcriptions(trans_file):
# We need to do the encode-decode dance here because encode
# returns a bytes() object on Python 3, and text_to_char_array
# expects a string.
transcript = unicodedata.normalize("NFKD", transcript) \
.encode("ascii", "ignore") \
.decode("ascii", "ignore")
transcript = (
unicodedata.normalize("NFKD", transcript)
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
segments.append({
"start_time": start_time,
"stop_time": stop_time,
"transcript": transcript,
})
segments.append(
{
"start_time": start_time,
"stop_time": stop_time,
"transcript": transcript,
}
)
return segments
@@ -183,8 +216,16 @@ def _maybe_split_wav_and_sentences(data_dir, trans_data, original_data, converte
segments = _parse_transcriptions(trans_file)
# Open wav corresponding to transcription file
channel = ("2","1")[(os.path.splitext(os.path.basename(trans_file))[0])[6] == 'A']
wav_filename = "sw0" + (os.path.splitext(os.path.basename(trans_file))[0])[2:6] + "-" + channel + ".wav"
channel = ("2", "1")[
(os.path.splitext(os.path.basename(trans_file))[0])[6] == "A"
]
wav_filename = (
"sw0"
+ (os.path.splitext(os.path.basename(trans_file))[0])[2:6]
+ "-"
+ channel
+ ".wav"
)
wav_file = os.path.join(source_dir, wav_filename)
print("splitting {} according to {}".format(wav_file, trans_file))
@@ -200,26 +241,39 @@ def _maybe_split_wav_and_sentences(data_dir, trans_data, original_data, converte
# Create wav segment filename
start_time = segment["start_time"]
stop_time = segment["stop_time"]
new_wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(
start_time) + "-" + str(stop_time) + ".wav"
new_wav_filename = (
os.path.splitext(os.path.basename(trans_file))[0]
+ "-"
+ str(start_time)
+ "-"
+ str(stop_time)
+ ".wav"
)
if _is_wav_too_short(new_wav_filename):
continue
new_wav_file = os.path.join(target_dir, new_wav_filename)
_split_wav(origAudio, start_time, stop_time, new_wav_file)
new_wav_filesize = os.path.getsize(new_wav_file)
transcript = segment["transcript"]
files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript))
files.append(
(os.path.abspath(new_wav_file), new_wav_filesize, transcript)
)
# Close origAudio
origAudio.close()
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
return pandas.DataFrame(
data=files, columns=["wav_filename", "wav_filesize", "transcript"]
)
def _is_wav_too_short(wav_filename):
short_wav_filenames = ['sw2986A-ms98-a-trans-80.6385-83.358875.wav', 'sw2663A-ms98-a-trans-161.12025-164.213375.wav']
short_wav_filenames = [
"sw2986A-ms98-a-trans-80.6385-83.358875.wav",
"sw2663A-ms98-a-trans-161.12025-164.213375.wav",
]
return wav_filename in short_wav_filenames
@@ -234,24 +288,61 @@ def _split_wav(origAudio, start_time, stop_time, new_wav_file):
chunkAudio.writeframes(chunkData)
chunkAudio.close()
def _split_sets(filelist):
# We initially split the entire set into 80% train and 20% test, then
# split the train set into 80% train and 20% validation.
train_beg = 0
train_end = int(0.8 * len(filelist))
dev_beg = int(0.8 * train_end)
dev_end = train_end
train_end = dev_beg
def _split_sets(filelist):
"""
Randomly split the dataset into train, validation, and test sets, where the size of the
validation and test sets is determined by the `get_sample_size` function.
"""
random.shuffle(filelist)
sample_size = get_sample_size(len(filelist))
train_beg = 0
train_end = len(filelist) - 2 * sample_size
dev_beg = train_end
dev_end = train_end + sample_size
test_beg = dev_end
test_end = len(filelist)
return (filelist[train_beg:train_end], filelist[dev_beg:dev_end], filelist[test_beg:test_end])
return (
filelist[train_beg:train_end],
filelist[dev_beg:dev_end],
filelist[test_beg:test_end],
)
def _read_data_set(filelist, thread_count, batch_size, numcep, numcontext, stride=1, offset=0, next_index=lambda i: i + 1, limit=0):
def get_sample_size(population_size):
"""calculates the sample size for a 99% confidence and 1% margin of error"""
margin_of_error = 0.01
fraction_picking = 0.50
z_score = 2.58 # Corresponds to confidence level 99%
numerator = (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2
)
sample_size = 0
for train_size in range(population_size, 0, -1):
denominator = 1 + (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2 * train_size
)
sample_size = int(numerator / denominator)
if 2 * sample_size + train_size <= population_size:
break
return sample_size
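For reference, the loop above is an integer search over Cochran's sample-size formula with a finite-population correction, n = (z^2 * p * (1 - p) / e^2) / (1 + z^2 * p * (1 - p) / (e^2 * N_train)) with z = 2.58, p = 0.5 and e = 0.01, stopping at the largest train size N_train that still leaves room for a dev and a test set of n samples each. A minimal sanity check (the population size is hypothetical):
# Hypothetical usage: for a corpus of 10,000 files, dev and test each receive
# get_sample_size(10000) files and the remainder stays in train.
population = 10000
n = get_sample_size(population)
print("dev/test: {} files each, train: {} files".format(n, population - 2 * n))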
def _read_data_set(
filelist,
thread_count,
batch_size,
numcep,
numcontext,
stride=1,
offset=0,
next_index=lambda i: i + 1,
limit=0,
):
# Optionally apply dataset size limit
if limit > 0:
filelist = filelist.iloc[:limit]
@@ -259,7 +350,9 @@ def _read_data_set(filelist, thread_count, batch_size, numcep, numcontext, strid
filelist = filelist[offset::stride]
# Return DataSet
return DataSet(txt_files, thread_count, batch_size, numcep, numcontext, next_index=next_index)
return DataSet(
txt_files, thread_count, batch_size, numcep, numcontext, next_index=next_index
)
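The `filelist[offset::stride]` slice above is what shards the file list between readers. A minimal sketch of the same slicing on a plain list (the stride of 3 is arbitrary):
# Illustrative only: offset/stride give each of three readers a disjoint shard.
files = list(range(10))
shards = [files[offset::3] for offset in range(3)]
print(shards)  # [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]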
if __name__ == "__main__":

bin/import_swc.py (Executable file, 577 lines added)

@@ -0,0 +1,577 @@
#!/usr/bin/env python
"""
Downloads and prepares (parts of) the "Spoken Wikipedia Corpora" for train.py
Use "python3 import_swc.py -h" for help
"""
import argparse
import csv
import os
import random
import re
import shutil
import sys
import tarfile
import unicodedata
import wave
import xml.etree.ElementTree as ET
from collections import Counter
from glob import glob
from multiprocessing.pool import ThreadPool
import progressbar
import sox
from coqui_stt_ctcdecoder import Alphabet
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
from coqui_stt_training.util.importers import validate_label_eng as validate_label
SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.tar"
SWC_ARCHIVE = "SWC_{language}.tar"
LANGUAGES = ["dutch", "english", "german"]
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
FIELDNAMES_EXT = FIELDNAMES + ["article", "speaker"]
CHANNELS = 1
SAMPLE_RATE = 16000
UNKNOWN = "<unknown>"
AUDIO_PATTERN = "audio*.ogg"
WAV_NAME = "audio.wav"
ALIGNED_NAME = "aligned.swc"
SUBSTITUTIONS = {
"german": [
(re.compile(r"\$"), "dollar"),
(re.compile(r""), "euro"),
(re.compile(r"£"), "pfund"),
(
re.compile(r"ein tausend ([^\s]+) hundert ([^\s]+) er( |$)"),
r"\1zehnhundert \2er ",
),
(re.compile(r"ein tausend (acht|neun) hundert"), r"\1zehnhundert"),
(
re.compile(
r"eins punkt null null null punkt null null null punkt null null null"
),
"eine milliarde",
),
(
re.compile(
r"punkt null null null punkt null null null punkt null null null"
),
"milliarden",
),
(re.compile(r"eins punkt null null null punkt null null null"), "eine million"),
(re.compile(r"punkt null null null punkt null null null"), "millionen"),
(re.compile(r"eins punkt null null null"), "ein tausend"),
(re.compile(r"punkt null null null"), "tausend"),
(re.compile(r"punkt null"), None),
]
}
DONT_NORMALIZE = {"german": "ÄÖÜäöüß"}
PRE_FILTER = str.maketrans(dict.fromkeys("/()[]{}<>:"))
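The substitution rules above rewrite spelled-out German currency and number sequences before alphabet filtering; rules whose replacement is None instead cause `label_filter` below to drop the whole sample. A minimal walk-through of the non-dropping rules (the input label is hypothetical):
# Hypothetical example: "$" becomes "dollar", then the spelled-out thousand is collapsed.
label = "eins punkt null null null $"
for pattern, replacement in SUBSTITUTIONS["german"]:
    if replacement is not None:
        label = pattern.sub(replacement, label)
print(label)  # ein tausend dollar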
class Sample:
def __init__(self, wav_path, start, end, text, article, speaker, sub_set=None):
self.wav_path = wav_path
self.start = start
self.end = end
self.text = text
self.article = article
self.speaker = speaker
self.sub_set = sub_set
def fail(message):
print(message)
sys.exit(1)
def group(lst, get_key):
groups = {}
for obj in lst:
key = get_key(obj)
if key in groups:
groups[key].append(obj)
else:
groups[key] = [obj]
return groups
def get_sample_size(population_size):
margin_of_error = 0.01
fraction_picking = 0.50
z_score = 2.58 # Corresponds to confidence level 99%
numerator = (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2
)
sample_size = 0
for train_size in range(population_size, 0, -1):
denominator = 1 + (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
margin_of_error ** 2 * train_size
)
sample_size = int(numerator / denominator)
if 2 * sample_size + train_size <= population_size:
break
return sample_size
def maybe_download_language(language):
lang_upper = language[0].upper() + language[1:]
return maybe_download(
SWC_ARCHIVE.format(language=lang_upper),
CLI_ARGS.base_dir,
SWC_URL.format(language=lang_upper),
)
def maybe_extract(data_dir, extracted_data, archive):
extracted = os.path.join(data_dir, extracted_data)
if os.path.isdir(extracted):
print('Found directory "{}" - not extracting.'.format(extracted))
else:
print('Extracting "{}"...'.format(archive))
with tarfile.open(archive) as tar:
members = tar.getmembers()
bar = progressbar.ProgressBar(max_value=len(members), widgets=SIMPLE_BAR)
for member in bar(members):
tar.extract(member=member, path=extracted)
return extracted
def ignored(node):
if node is None:
return False
if node.tag == "ignored":
return True
return ignored(node.find(".."))
def read_token(token):
texts, start, end = [], None, None
notes = token.findall("n")
if len(notes) > 0:
for note in notes:
attributes = note.attrib
if start is None and "start" in attributes:
start = int(attributes["start"])
if "end" in attributes:
token_end = int(attributes["end"])
if end is None or token_end > end:
end = token_end
if "pronunciation" in attributes:
t = attributes["pronunciation"]
texts.append(t)
elif "text" in token.attrib:
texts.append(token.attrib["text"])
return start, end, " ".join(texts)
def in_alphabet(alphabet, c):
return alphabet.CanEncode(c) if alphabet else True
ALPHABETS = {}
def get_alphabet(language):
if language in ALPHABETS:
return ALPHABETS[language]
alphabet_path = getattr(CLI_ARGS, language + "_alphabet")
alphabet = Alphabet(alphabet_path) if alphabet_path else None
ALPHABETS[language] = alphabet
return alphabet
def label_filter(label, language):
label = label.translate(PRE_FILTER)
label = validate_label(label)
if label is None:
return None, "validation"
substitutions = SUBSTITUTIONS[language] if language in SUBSTITUTIONS else []
for pattern, replacement in substitutions:
if replacement is None:
if pattern.match(label):
return None, "substitution rule"
else:
label = pattern.sub(replacement, label)
chars = []
dont_normalize = DONT_NORMALIZE[language] if language in DONT_NORMALIZE else ""
alphabet = get_alphabet(language)
for c in label:
if (
CLI_ARGS.normalize
and c not in dont_normalize
and not in_alphabet(alphabet, c)
):
c = (
unicodedata.normalize("NFKD", c)
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
for sc in c:
if not in_alphabet(alphabet, sc):
return None, "illegal character"
chars.append(sc)
label = "".join(chars)
label = validate_label(label)
return label, "validation" if label is None else None
def collect_samples(base_dir, language):
roots = []
for root, _, files in os.walk(base_dir):
if ALIGNED_NAME in files and WAV_NAME in files:
roots.append(root)
samples = []
reasons = Counter()
def add_sample(
p_wav_path, p_article, p_speaker, p_start, p_end, p_text, p_reason="complete"
):
if p_start is not None and p_end is not None and p_text is not None:
duration = p_end - p_start
text, filter_reason = label_filter(p_text, language)
skip = False
if filter_reason is not None:
skip = True
p_reason = filter_reason
elif CLI_ARGS.exclude_unknown_speakers and p_speaker == UNKNOWN:
skip = True
p_reason = "unknown speaker"
elif CLI_ARGS.exclude_unknown_articles and p_article == UNKNOWN:
skip = True
p_reason = "unknown article"
elif duration > CLI_ARGS.max_duration > 0 and CLI_ARGS.ignore_too_long:
skip = True
p_reason = "exceeded duration"
elif int(duration / 30) < len(text):
skip = True
p_reason = "too short to decode"
elif duration / len(text) < 10:
skip = True
p_reason = "length duration ratio"
if skip:
reasons[p_reason] += 1
else:
samples.append(
Sample(p_wav_path, p_start, p_end, text, p_article, p_speaker)
)
elif p_start is None or p_end is None:
reasons["missing timestamps"] += 1
else:
reasons["missing text"] += 1
print("Collecting samples...")
bar = progressbar.ProgressBar(max_value=len(roots), widgets=SIMPLE_BAR)
for root in bar(roots):
wav_path = os.path.join(root, WAV_NAME)
aligned = ET.parse(os.path.join(root, ALIGNED_NAME))
article = UNKNOWN
speaker = UNKNOWN
for prop in aligned.iter("prop"):
attributes = prop.attrib
if "key" in attributes and "value" in attributes:
if attributes["key"] == "DC.identifier":
article = attributes["value"]
elif attributes["key"] == "reader.name":
speaker = attributes["value"]
for sentence in aligned.iter("s"):
if ignored(sentence):
continue
split = False
tokens = list(map(read_token, sentence.findall("t")))
sample_start, sample_end, token_texts, sample_texts = None, None, [], []
for token_start, token_end, token_text in tokens:
if CLI_ARGS.exclude_numbers and any(c.isdigit() for c in token_text):
add_sample(
wav_path,
article,
speaker,
sample_start,
sample_end,
" ".join(sample_texts),
p_reason="has numbers",
)
sample_start, sample_end, token_texts, sample_texts = (
None,
None,
[],
[],
)
continue
if sample_start is None:
sample_start = token_start
if sample_start is None:
continue
token_texts.append(token_text)
if token_end is not None:
if (
token_start != sample_start
and token_end - sample_start > CLI_ARGS.max_duration > 0
):
add_sample(
wav_path,
article,
speaker,
sample_start,
sample_end,
" ".join(sample_texts),
p_reason="split",
)
sample_start = sample_end
sample_texts = []
split = True
sample_end = token_end
sample_texts.extend(token_texts)
token_texts = []
add_sample(
wav_path,
article,
speaker,
sample_start,
sample_end,
" ".join(sample_texts),
p_reason="split" if split else "complete",
)
print("Skipped samples:")
for reason, n in reasons.most_common():
print(" - {}: {}".format(reason, n))
return samples
def maybe_convert_one_to_wav(entry):
root, _, files = entry
transformer = sox.Transformer()
transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
combiner = sox.Combiner()
combiner.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
output_wav = os.path.join(root, WAV_NAME)
if os.path.isfile(output_wav):
return
files = sorted(glob(os.path.join(root, AUDIO_PATTERN)))
try:
if len(files) == 1:
transformer.build(files[0], output_wav)
elif len(files) > 1:
wav_files = []
for i, file in enumerate(files):
wav_path = os.path.join(root, "audio{}.wav".format(i))
transformer.build(file, wav_path)
wav_files.append(wav_path)
combiner.set_input_format(file_type=["wav"] * len(wav_files))
combiner.build(wav_files, output_wav, "concatenate")
except sox.core.SoxError:
return
def maybe_convert_to_wav(base_dir):
roots = list(os.walk(base_dir))
print("Converting and joining source audio files...")
bar = progressbar.ProgressBar(max_value=len(roots), widgets=SIMPLE_BAR)
tp = ThreadPool()
for _ in bar(tp.imap_unordered(maybe_convert_one_to_wav, roots)):
pass
tp.close()
tp.join()
def assign_sub_sets(samples):
sample_size = get_sample_size(len(samples))
speakers = group(samples, lambda sample: sample.speaker).values()
speakers = list(sorted(speakers, key=len))
sample_sets = [[], []]
while any(map(lambda s: len(s) < sample_size, sample_sets)) and len(speakers) > 0:
for sample_set in sample_sets:
if len(sample_set) < sample_size and len(speakers) > 0:
sample_set.extend(speakers.pop(0))
train_set = sum(speakers, [])
if len(train_set) == 0:
print(
"WARNING: Unable to build dev and test sets without speaker bias as there is no speaker meta data"
)
random.seed(42) # same source data == same output
random.shuffle(samples)
for index, sample in enumerate(samples):
if index < sample_size:
sample.sub_set = "dev"
elif index < 2 * sample_size:
sample.sub_set = "test"
else:
sample.sub_set = "train"
else:
for sub_set, sub_set_samples in [
("train", train_set),
("dev", sample_sets[0]),
("test", sample_sets[1]),
]:
for sample in sub_set_samples:
sample.sub_set = sub_set
for sub_set, sub_set_samples in group(samples, lambda s: s.sub_set).items():
t = sum(map(lambda s: s.end - s.start, sub_set_samples)) / (1000 * 60 * 60)
print(
'Sub-set "{}" with {} samples (duration: {:.2f} h)'.format(
sub_set, len(sub_set_samples), t
)
)
def create_sample_dirs(language):
print("Creating sample directories...")
for set_name in ["train", "dev", "test"]:
dir_path = os.path.join(CLI_ARGS.base_dir, language + "-" + set_name)
if not os.path.isdir(dir_path):
os.mkdir(dir_path)
def split_audio_files(samples, language):
print("Splitting audio files...")
sub_sets = Counter()
src_wav_files = group(samples, lambda s: s.wav_path).items()
bar = progressbar.ProgressBar(max_value=len(src_wav_files), widgets=SIMPLE_BAR)
for wav_path, file_samples in bar(src_wav_files):
file_samples = sorted(file_samples, key=lambda s: s.start)
with wave.open(wav_path, "r") as src_wav_file:
rate = src_wav_file.getframerate()
for sample in file_samples:
index = sub_sets[sample.sub_set]
sample_wav_path = os.path.join(
CLI_ARGS.base_dir,
language + "-" + sample.sub_set,
"sample-{0:06d}.wav".format(index),
)
sample.wav_path = sample_wav_path
sub_sets[sample.sub_set] += 1
src_wav_file.setpos(int(sample.start * rate / 1000.0))
data = src_wav_file.readframes(
int((sample.end - sample.start) * rate / 1000.0)
)
with wave.open(sample_wav_path, "w") as sample_wav_file:
sample_wav_file.setnchannels(src_wav_file.getnchannels())
sample_wav_file.setsampwidth(src_wav_file.getsampwidth())
sample_wav_file.setframerate(rate)
sample_wav_file.writeframes(data)
def write_csvs(samples, language):
for sub_set, set_samples in group(samples, lambda s: s.sub_set).items():
set_samples = sorted(set_samples, key=lambda s: s.wav_path)
base_dir = os.path.abspath(CLI_ARGS.base_dir)
csv_path = os.path.join(base_dir, language + "-" + sub_set + ".csv")
print('Writing "{}"...'.format(csv_path))
with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
writer = csv.DictWriter(
csv_file, fieldnames=FIELDNAMES_EXT if CLI_ARGS.add_meta else FIELDNAMES
)
writer.writeheader()
bar = progressbar.ProgressBar(
max_value=len(set_samples), widgets=SIMPLE_BAR
)
for sample in bar(set_samples):
row = {
"wav_filename": os.path.relpath(sample.wav_path, base_dir),
"wav_filesize": os.path.getsize(sample.wav_path),
"transcript": sample.text,
}
if CLI_ARGS.add_meta:
row["article"] = sample.article
row["speaker"] = sample.speaker
writer.writerow(row)
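Each per-subset CSV produced above uses the FIELDNAMES header, with wav paths written relative to base_dir; with --add_meta the article and speaker columns are appended. A minimal, hypothetical check of one generated file (the path is a placeholder; csv is already imported at the top of this script):
# Hypothetical: print the first row of a generated subset CSV.
with open("data/swc/german-train.csv", encoding="utf-8", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        print(row["wav_filename"], row["wav_filesize"], row["transcript"])
        break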
def cleanup(archive, language):
if not CLI_ARGS.keep_archive:
print('Removing archive "{}"...'.format(archive))
os.remove(archive)
language_dir = os.path.join(CLI_ARGS.base_dir, language)
if not CLI_ARGS.keep_intermediate and os.path.isdir(language_dir):
print('Removing intermediate files in "{}"...'.format(language_dir))
shutil.rmtree(language_dir)
def prepare_language(language):
archive = maybe_download_language(language)
extracted = maybe_extract(CLI_ARGS.base_dir, language, archive)
maybe_convert_to_wav(extracted)
samples = collect_samples(extracted, language)
assign_sub_sets(samples)
create_sample_dirs(language)
split_audio_files(samples, language)
write_csvs(samples, language)
cleanup(archive, language)
def handle_args():
parser = argparse.ArgumentParser(description="Import Spoken Wikipedia Corpora")
parser.add_argument("base_dir", help="Directory containing all data")
parser.add_argument(
"--language", default="all", help="One of (all|{})".format("|".join(LANGUAGES))
)
parser.add_argument(
"--exclude_numbers",
type=bool,
default=True,
help="If sequences with non-transliterated numbers should be excluded",
)
parser.add_argument(
"--max_duration",
type=int,
default=10000,
help="Maximum sample duration in milliseconds",
)
parser.add_argument(
"--ignore_too_long",
type=bool,
default=False,
help="If samples exceeding max_duration should be removed",
)
parser.add_argument(
"--normalize",
action="store_true",
help="Converts diacritic characters to their base ones",
)
for language in LANGUAGES:
parser.add_argument(
"--{}_alphabet".format(language),
help="Exclude {} samples with characters not in provided alphabet file".format(
language
),
)
parser.add_argument(
"--add_meta", action="store_true", help="Adds article and speaker CSV columns"
)
parser.add_argument(
"--exclude_unknown_speakers",
action="store_true",
help="Exclude unknown speakers",
)
parser.add_argument(
"--exclude_unknown_articles",
action="store_true",
help="Exclude unknown articles",
)
parser.add_argument(
"--keep_archive",
type=bool,
default=True,
help="If downloaded archives should be kept",
)
parser.add_argument(
"--keep_intermediate",
type=bool,
default=False,
help="If intermediate files should be kept",
)
return parser.parse_args()
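For reference, a typical invocation of this importer (paths are placeholders; `python3 import_swc.py -h` lists all flags):
# Hypothetical invocation; base_dir and the alphabet path are placeholders.
#   python3 bin/import_swc.py data/swc --language german --normalize \
#       --german_alphabet data/alphabet_de.txt --add_meta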
if __name__ == "__main__":
CLI_ARGS = handle_args()
if CLI_ARGS.language == "all":
for lang in LANGUAGES:
prepare_language(lang)
elif CLI_ARGS.language in LANGUAGES:
prepare_language(CLI_ARGS.language)
else:
fail("Wrong language id")


@@ -1,24 +1,17 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import codecs
import pandas
import tarfile
import unicodedata
import wave
from glob import glob
from os import makedirs, path, remove, rmdir
import pandas
from coqui_stt_training.util.downloader import maybe_download
from coqui_stt_training.util.stm import parse_stm_file
from sox import Transformer
from util.downloader import maybe_download
from tensorflow.python.platform import gfile
from util.stm import parse_stm_file
def _download_and_preprocess_data(data_dir):
# Conditionally download data
@@ -41,6 +34,7 @@ def _download_and_preprocess_data(data_dir):
dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
def _maybe_extract(data_dir, extracted_data, archive):
# If data_dir/extracted_data does not exist, extract archive in data_dir
if not gfile.Exists(path.join(data_dir, extracted_data)):
@@ -48,6 +42,7 @@ def _maybe_extract(data_dir, extracted_data, archive):
tar.extractall(data_dir)
tar.close()
def _maybe_convert_wav(data_dir, extracted_data):
# Create extracted_data dir
extracted_dir = path.join(data_dir, extracted_data)
@@ -61,6 +56,7 @@ def _maybe_convert_wav(data_dir, extracted_data):
# Conditionally convert test sph to wav
_maybe_convert_wav_dataset(extracted_dir, "test")
def _maybe_convert_wav_dataset(extracted_dir, data_set):
# Create source dir
source_dir = path.join(extracted_dir, data_set, "sph")
@@ -84,6 +80,7 @@ def _maybe_convert_wav_dataset(extracted_dir, data_set):
# Remove source_dir
rmdir(source_dir)
def _maybe_split_sentences(data_dir, extracted_data):
# Create extracted_data dir
extracted_dir = path.join(data_dir, extracted_data)
@@ -99,6 +96,7 @@ def _maybe_split_sentences(data_dir, extracted_data):
return train_files, dev_files, test_files
def _maybe_split_dataset(extracted_dir, data_set):
# Create stm dir
stm_dir = path.join(extracted_dir, data_set, "stm")
@@ -116,14 +114,21 @@ def _maybe_split_dataset(extracted_dir, data_set):
# Open wav corresponding to stm_file
wav_filename = path.splitext(path.basename(stm_file))[0] + ".wav"
wav_file = path.join(wav_dir, wav_filename)
origAudio = wave.open(wav_file,'r')
origAudio = wave.open(wav_file, "r")
# Loop over stm_segments and split wav_file for each segment
for stm_segment in stm_segments:
# Create wav segment filename
start_time = stm_segment.start_time
stop_time = stm_segment.stop_time
new_wav_filename = path.splitext(path.basename(stm_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".wav"
new_wav_filename = (
path.splitext(path.basename(stm_file))[0]
+ "-"
+ str(start_time)
+ "-"
+ str(stop_time)
+ ".wav"
)
new_wav_file = path.join(wav_dir, new_wav_filename)
# If the wav segment filename does not exist create it
@@ -131,23 +136,29 @@ def _maybe_split_dataset(extracted_dir, data_set):
_split_wav(origAudio, start_time, stop_time, new_wav_file)
new_wav_filesize = path.getsize(new_wav_file)
files.append((path.abspath(new_wav_file), new_wav_filesize, stm_segment.transcript))
files.append(
(path.abspath(new_wav_file), new_wav_filesize, stm_segment.transcript)
)
# Close origAudio
origAudio.close()
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
return pandas.DataFrame(
data=files, columns=["wav_filename", "wav_filesize", "transcript"]
)
def _split_wav(origAudio, start_time, stop_time, new_wav_file):
frameRate = origAudio.getframerate()
origAudio.setpos(int(start_time*frameRate))
chunkData = origAudio.readframes(int((stop_time - start_time)*frameRate))
chunkAudio = wave.open(new_wav_file,'w')
origAudio.setpos(int(start_time * frameRate))
chunkData = origAudio.readframes(int((stop_time - start_time) * frameRate))
chunkAudio = wave.open(new_wav_file, "w")
chunkAudio.setnchannels(origAudio.getnchannels())
chunkAudio.setsampwidth(origAudio.getsampwidth())
chunkAudio.setframerate(frameRate)
chunkAudio.writeframes(chunkData)
chunkAudio.close()
if __name__ == "__main__":
_download_and_preprocess_data(sys.argv[1])


@@ -1,6 +1,6 @@
#!/usr/bin/env python
'''
"""
NAME : LDC TIMIT Dataset
URL : https://catalog.ldc.upenn.edu/ldc93s1
HOURS : 5
@@ -8,29 +8,32 @@
AUTHORS : Garofolo, John, et al.
TYPE : LDC Membership
LICENCE : LDC User Agreement
'''
"""
import errno
import fnmatch
import os
from os import path
import subprocess
import sys
import tarfile
import fnmatch
from os import path
import pandas as pd
import subprocess
def clean(word):
# LC ALL & strip punctuation which are not required
new = word.lower().replace('.', '')
new = new.replace(',', '')
new = new.replace(';', '')
new = new.replace('"', '')
new = new.replace('!', '')
new = new.replace('?', '')
new = new.replace(':', '')
new = new.replace('-', '')
new = word.lower().replace(".", "")
new = new.replace(",", "")
new = new.replace(";", "")
new = new.replace('"', "")
new = new.replace("!", "")
new = new.replace("?", "")
new = new.replace(":", "")
new = new.replace("-", "")
return new
def _preprocess_data(args):
# Assume data is downloaded from LDC - https://catalog.ldc.upenn.edu/ldc93s1
@@ -40,16 +43,24 @@ def _preprocess_data(args):
if ignoreSASentences:
print("Using recommended ignore SA sentences")
print("Ignoring SA sentences (2 x sentences which are repeated by all speakers)")
print(
"Ignoring SA sentences (2 x sentences which are repeated by all speakers)"
)
else:
print("Using unrecommended setting to include SA sentences")
datapath = args
target = path.join(datapath, "TIMIT")
print("Checking to see if data has already been extracted in given argument: %s", target)
print(
"Checking to see if data has already been extracted in given argument: %s",
target,
)
if not path.isdir(target):
print("Could not find extracted data, trying to find: TIMIT-LDC93S1.tgz in: ", datapath)
print(
"Could not find extracted data, trying to find: TIMIT-LDC93S1.tgz in: ",
datapath,
)
filepath = path.join(datapath, "TIMIT-LDC93S1.tgz")
if path.isfile(filepath):
print("File found, extracting")
@@ -103,40 +114,58 @@ def _preprocess_data(args):
# if ignoreSAsentences we only want those without SA in the name
# OR
# if not ignoreSAsentences we want all to be added
if (ignoreSASentences and not ('SA' in os.path.basename(full_wav))) or (not ignoreSASentences):
if 'train' in full_wav.lower():
if (ignoreSASentences and not ("SA" in os.path.basename(full_wav))) or (
not ignoreSASentences
):
if "train" in full_wav.lower():
train_list_wavs.append(full_wav)
train_list_trans.append(trans)
train_list_size.append(wav_filesize)
elif 'test' in full_wav.lower():
elif "test" in full_wav.lower():
test_list_wavs.append(full_wav)
test_list_trans.append(trans)
test_list_size.append(wav_filesize)
else:
raise IOError
a = {'wav_filename': train_list_wavs,
'wav_filesize': train_list_size,
'transcript': train_list_trans
}
a = {
"wav_filename": train_list_wavs,
"wav_filesize": train_list_size,
"transcript": train_list_trans,
}
c = {'wav_filename': test_list_wavs,
'wav_filesize': test_list_size,
'transcript': test_list_trans
}
c = {
"wav_filename": test_list_wavs,
"wav_filesize": test_list_size,
"transcript": test_list_trans,
}
all = {'wav_filename': train_list_wavs + test_list_wavs,
'wav_filesize': train_list_size + test_list_size,
'transcript': train_list_trans + test_list_trans
}
all = {
"wav_filename": train_list_wavs + test_list_wavs,
"wav_filesize": train_list_size + test_list_size,
"transcript": train_list_trans + test_list_trans,
}
df_all = pd.DataFrame(all, columns=['wav_filename', 'wav_filesize', 'transcript'], dtype=int)
df_train = pd.DataFrame(a, columns=['wav_filename', 'wav_filesize', 'transcript'], dtype=int)
df_test = pd.DataFrame(c, columns=['wav_filename', 'wav_filesize', 'transcript'], dtype=int)
df_all = pd.DataFrame(
all, columns=["wav_filename", "wav_filesize", "transcript"], dtype=int
)
df_train = pd.DataFrame(
a, columns=["wav_filename", "wav_filesize", "transcript"], dtype=int
)
df_test = pd.DataFrame(
c, columns=["wav_filename", "wav_filesize", "transcript"], dtype=int
)
df_all.to_csv(
target + "/timit_all.csv", sep=",", header=True, index=False, encoding="ascii"
)
df_train.to_csv(
target + "/timit_train.csv", sep=",", header=True, index=False, encoding="ascii"
)
df_test.to_csv(
target + "/timit_test.csv", sep=",", header=True, index=False, encoding="ascii"
)
df_all.to_csv(target+"/timit_all.csv", sep=',', header=True, index=False, encoding='ascii')
df_train.to_csv(target+"/timit_train.csv", sep=',', header=True, index=False, encoding='ascii')
df_test.to_csv(target+"/timit_test.csv", sep=',', header=True, index=False, encoding='ascii')
if __name__ == "__main__":
_preprocess_data(sys.argv[1])

Some files were not shown because too many files have changed in this diff.