Run pre-commit hooks on all files

parent 14aee5d35b
commit 43a6c3e62a
.gitattributes
@@ -1,2 +1,2 @@
data/lm/kenlm.scorer filter=lfs diff=lfs merge=lfs -text
.github/actions/check_artifact_exists/dist/index.js binary
.pre-commit-config.yaml
@@ -22,7 +22,3 @@ repos:
       - id: isort
         name: isort (pyi)
         types: [pyi]
-  - repo: https://github.com/pycqa/pylint
-    rev: v2.8.2
-    hooks:
-      - id: pylint
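With pylint dropped from the configuration, the rest of this commit is the output of the remaining formatting hooks run across the whole tree (isort is visible above; the rewraps and quote changes below are characteristic of black). A small self-contained illustration of that kind of rewrite; the function names are illustrative, but the before/after shapes mirror the hunks below:

```python
def fail_old(duration_a, duration_b):
    # Pre-hook style: single quotes, one long line (how the tree looked before this commit).
    return 'Samples differ on: duration ({} and {})'.format(duration_a, duration_b)


def fail_new(duration_a, duration_b):
    # Post-hook style: black normalizes quotes and wraps the long call.
    return "Samples differ on: duration ({} and {})".format(
        duration_a, duration_b
    )


# The reformatting never changes behaviour, which is the point of the commit.
assert fail_old(1.0, 1.25) == fail_new(1.0, 1.25)
```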
@@ -3,16 +3,16 @@ This file contains a list of papers in chronological order that have been published
To appear
==========

* Raghuveer Peri, Haoqi Li, Krishna Somandepalli, Arindam Jati, Shrikanth Narayanan (2020) "An empirical analysis of information encoded in disentangled neural speaker representations".
* Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, and Gregor Weber (2020) "Common Voice: A Massively-Multilingual Speech Corpus".

Published
==========

2020
----------

* Nils Hjortnaes, Niko Partanen, Michael Rießler and Francis M. Tyers (2020)
  "Towards a Speech Recognizer for Komi, an Endangered and Low-Resource Uralic Language". *Proceedings of the 6th International Workshop on Computational Linguistics of Uralic Languages*.

```
@@ -72,5 +72,5 @@ Published
  booktitle = {2018 IEEE/ACM Machine Learning in HPC Environments (MLHPC)},
  doi = {https://doi.org/10.1109/MLHPC.2018.8638637}
  year = 2018
}
```
@@ -118,11 +118,11 @@ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
at [https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
@@ -112,5 +112,5 @@ Documentation

.. Third party bindings
--------------------

Hosted externally and owned by the individual authors. See the `list of third-party bindings <https://stt.readthedocs.io/en/latest/USING.html#third-party-bindings>`_ for more info.
@@ -1,6 +1,6 @@
# Please refer to the USING documentation, "Dockerfile for building from source"

# Need devel version cause we need /usr/include/cudnn.h
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04

ARG STT_REPO=https://github.com/coqui-ai/STT.git
README.rst (25 lines changed)
@@ -9,14 +9,14 @@
.. |covenant-img| image:: https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg
   :target: CODE_OF_CONDUCT.md
   :alt: Contributor Covenant

.. |gitter-img| image:: https://badges.gitter.im/coqui-ai/STT.svg
   :target: https://gitter.im/coqui-ai/STT?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge
   :alt: Gitter Room

.. |doi| image:: https://zenodo.org/badge/344354127.svg
   :target: https://zenodo.org/badge/latestdoi/344354127

|doc-img| |covenant-img| |gitter-img| |doi|

`👉 Subscribe to 🐸Coqui's Newsletter <https://coqui.ai/?subscription=true>`_
@@ -31,16 +31,16 @@
* Streaming inference.
* Multiple possible transcripts, each with an associated confidence score.
* Real-time inference.
* Small-footprint acoustic model.
* Bindings for various programming languages.

Where to Ask Questions
----------------------

.. list-table::
   :widths: 25 25
   :header-rows: 1

   * - Type
     - Link
   * - 🚨 **Bug Reports**
@@ -51,14 +51,14 @@ Where to Ask Questions
     - `Github Discussions <https://github.com/coqui-ai/stt/discussions/>`_
   * - 💬 **General Discussion**
     - `Github Discussions <https://github.com/coqui-ai/stt/discussions/>`_ or `Gitter Room <https://gitter.im/coqui-ai/STT?utm_source=share-link&utm_medium=link&utm_campaign=share-link>`_

Links & Resources
-----------------
.. list-table::
   :widths: 25 25
   :header-rows: 1

   * - Type
     - Link
   * - 📰 **Documentation**
@@ -67,4 +67,3 @@ Links & Resources
     - `see the latest release on GitHub <https://github.com/coqui-ai/STT/releases/latest>`_
   * - 🤝 **Contribution Guidelines**
     - `CONTRIBUTING.rst <CONTRIBUTING.rst>`_
bazel.patch (28 lines changed)
@@ -9,23 +9,23 @@ index c7aa4cb63..e084bc27c 100644
 +import java.io.PrintWriter;
  import java.util.zip.GZIPInputStream;
  import java.util.zip.GZIPOutputStream;

 @@ -73,6 +74,8 @@ public final class FileWriteAction extends AbstractFileWriteAction {
    */
   private final CharSequence fileContents;

 + private final Artifact output;
 +
   /** Minimum length (in chars) for content to be eligible for compression. */
   private static final int COMPRESS_CHARS_THRESHOLD = 256;

 @@ -90,6 +93,7 @@ public final class FileWriteAction extends AbstractFileWriteAction {
       fileContents = new CompressedString((String) fileContents);
     }
     this.fileContents = fileContents;
 +   this.output = output;
   }

   /**
 @@ -230,11 +234,32 @@ public final class FileWriteAction extends AbstractFileWriteAction {
    */
@@ -59,7 +59,7 @@ index c7aa4cb63..e084bc27c 100644
 +   computeKeyDebugWriter.close();
 +   return rv;
   }

   /**
 diff --git a/src/main/java/com/google/devtools/build/lib/analysis/actions/SpawnAction.java b/src/main/java/com/google/devtools/build/lib/analysis/actions/SpawnAction.java
 index 580788160..26883eb92 100644
@@ -74,9 +74,9 @@ index 580788160..26883eb92 100644
  import java.util.Collections;
  import java.util.LinkedHashMap;
 @@ -91,6 +92,9 @@ public class SpawnAction extends AbstractAction implements ExecutionInfoSpecifie

   private final CommandLine argv;

 + private final Iterable<Artifact> inputs;
 + private final Iterable<Artifact> outputs;
 +
@@ -91,10 +91,10 @@ index 580788160..26883eb92 100644
 +   this.inputs = inputs;
 +   this.outputs = outputs;
   }

   @Override
 @@ -312,23 +319,89 @@ public class SpawnAction extends AbstractAction implements ExecutionInfoSpecifie

   @Override
   protected String computeKey() {
 +   boolean genruleSetup = String.valueOf(Iterables.get(inputs, 0).getExecPath()).contains("genrule/genrule-setup.sh");
@@ -182,14 +182,14 @@ index 580788160..26883eb92 100644
 +   }
 +   return rv;
   }

   @Override
 diff --git a/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java b/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
 index 3559fffde..3ba39617c 100644
 --- a/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
 +++ b/src/main/java/com/google/devtools/build/lib/rules/cpp/CppCompileAction.java
 @@ -1111,10 +1111,30 @@ public class CppCompileAction extends AbstractAction

   @Override
   public String computeKey() {
 +   // ".ckd" Compute Key Debug
@@ -216,7 +216,7 @@ index 3559fffde..3ba39617c 100644
 +   for (Map.Entry<String, String> entry : executionInfo.entrySet()) {
 +     computeKeyDebugWriter.println("EXECINFO: " + entry.getKey() + "=" + entry.getValue());
 +   }

   // For the argv part of the cache key, ignore all compiler flags that explicitly denote module
   // file (.pcm) inputs. Depending on input discovery, some of the unused ones are removed from
 @@ -1124,6 +1144,9 @@ public class CppCompileAction extends AbstractAction
@@ -226,7 +226,7 @@ index 3559fffde..3ba39617c 100644
 +   for (String input : compileCommandLine.getArgv(getInternalOutputFile(), null)) {
 +     computeKeyDebugWriter.println("COMMAND: " + input);
 +   }

   /*
    * getArgv() above captures all changes which affect the compilation
 @@ -1133,19 +1156,31 @@ public class CppCompileAction extends AbstractAction
@@ -260,5 +260,5 @@ index 3559fffde..3ba39617c 100644
 +   computeKeyDebugWriter.close();
 +   return rv;
   }

   @Override
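The bazel.patch changes in this commit appear to be whitespace-only cleanup. The patch itself is unrelated to pre-commit: it instruments Bazel's FileWriteAction, SpawnAction and CppCompileAction so that the inputs of each action's cache key are also written out through the added computeKeyDebugWriter calls (the ".ckd", "Compute Key Debug", files named in the comment above), which helps when debugging unexpected cache-key changes.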
@@ -2,10 +2,10 @@
 """
 Tool for comparing two wav samples
 """
-import sys
 import argparse
-import numpy as np
+import sys
+
+import numpy as np

 from coqui_stt_training.util.audio import AUDIO_TYPE_NP, mean_dbfs
 from coqui_stt_training.util.sample_collections import load_sample
@@ -19,19 +19,29 @@ def compare_samples():
     sample1 = load_sample(CLI_ARGS.sample1).unpack()
     sample2 = load_sample(CLI_ARGS.sample2).unpack()
     if sample1.audio_format != sample2.audio_format:
-        fail('Samples differ on: audio-format ({} and {})'.format(sample1.audio_format, sample2.audio_format))
+        fail(
+            "Samples differ on: audio-format ({} and {})".format(
+                sample1.audio_format, sample2.audio_format
+            )
+        )
     if abs(sample1.duration - sample2.duration) > 0.001:
-        fail('Samples differ on: duration ({} and {})'.format(sample1.duration, sample2.duration))
+        fail(
+            "Samples differ on: duration ({} and {})".format(
+                sample1.duration, sample2.duration
+            )
+        )
     sample1.change_audio_type(AUDIO_TYPE_NP)
     sample2.change_audio_type(AUDIO_TYPE_NP)
     samples = [sample1, sample2]
     largest = np.argmax([sample1.audio.shape[0], sample2.audio.shape[0]])
     smallest = (largest + 1) % 2
-    samples[largest].audio = samples[largest].audio[:len(samples[smallest].audio)]
+    samples[largest].audio = samples[largest].audio[: len(samples[smallest].audio)]
     audio_diff = samples[largest].audio - samples[smallest].audio
     diff_dbfs = mean_dbfs(audio_diff)
-    differ_msg = 'Samples differ on: sample data ({:0.2f} dB difference) '.format(diff_dbfs)
-    equal_msg = 'Samples are considered equal ({:0.2f} dB difference)'.format(diff_dbfs)
+    differ_msg = "Samples differ on: sample data ({:0.2f} dB difference) ".format(
+        diff_dbfs
+    )
+    equal_msg = "Samples are considered equal ({:0.2f} dB difference)".format(diff_dbfs)
     if CLI_ARGS.if_differ:
         if diff_dbfs <= CLI_ARGS.threshold:
             fail(equal_msg)
@@ -50,13 +60,17 @@ def handle_args():
     )
     parser.add_argument("sample1", help="Filename of sample 1 to compare")
     parser.add_argument("sample2", help="Filename of sample 2 to compare")
-    parser.add_argument("--threshold", type=float, default=-60.0,
-                        help="dB of sample deltas above which they are considered different")
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=-60.0,
+        help="dB of sample deltas above which they are considered different",
+    )
     parser.add_argument(
         "--if-differ",
         action="store_true",
         help="If to succeed and return status code 0 on different signals and fail on equal ones (inverse check)."
         "This will still fail on different formats or durations.",
     )
     parser.add_argument(
         "--no-success-output",
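To recap what the rewritten tool does: both samples are trimmed to the shorter one, subtracted, and the residual level is measured in dB; anything quieter than the threshold (default -60.0 dB) counts as equal, and --if-differ inverts the check. A rough standalone sketch of the idea using plain numpy instead of the project's mean_dbfs helper; the RMS formula and the 1e-15 floor are assumptions of this sketch, not the project's exact implementation:

```python
import numpy as np


def rms_dbfs(x: np.ndarray) -> float:
    """Root-mean-square level of a float signal in [-1, 1], in dBFS."""
    rms = np.sqrt(np.mean(np.square(x))) + 1e-15  # floor avoids log10(0)
    return 20.0 * np.log10(rms)


def samples_roughly_equal(a: np.ndarray, b: np.ndarray, threshold_db: float = -60.0) -> bool:
    n = min(len(a), len(b))   # trim to the shorter signal
    diff = a[:n] - b[:n]      # residual between the two takes
    return rms_dbfs(diff) <= threshold_db
```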
@@ -1,19 +1,24 @@
 #!/usr/bin/env python
-'''
+"""
 Tool for building a combined SDB or CSV sample-set from other sets
 Use 'python3 data_set_tool.py -h' for help
-'''
-import sys
+"""
 import argparse
-import progressbar
+import sys
 from pathlib import Path
+
+import progressbar
 from coqui_stt_training.util.audio import (
-    AUDIO_TYPE_PCM,
     AUDIO_TYPE_OPUS,
+    AUDIO_TYPE_PCM,
     AUDIO_TYPE_WAV,
     change_audio_types,
 )
+from coqui_stt_training.util.augmentations import (
+    SampleAugmentation,
+    apply_sample_augmentations,
+    parse_augmentations,
+)
 from coqui_stt_training.util.downloader import SIMPLE_BAR
 from coqui_stt_training.util.sample_collections import (
     CSVWriter,
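The import reshuffles in this and the following importer scripts are isort's work: imports are regrouped and alphabetized, with standard-library modules first and other packages in later groups separated by blank lines. A hedged minimal illustration of the convention (not a file from this repository; how coqui_stt_training itself gets classified depends on the isort configuration):

```python
# Standard-library imports: one alphabetized group.
import argparse
import sys
from pathlib import Path

# Everything else follows in its own group(s); whether the project's own
# coqui_stt_training modules sit here or in a separate first-party group
# depends on the isort settings in use.
import numpy as np
import progressbar
from coqui_stt_training.util.downloader import SIMPLE_BAR
```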
@ -21,101 +26,110 @@ from coqui_stt_training.util.sample_collections import (
|
|||
TarWriter,
|
||||
samples_from_sources,
|
||||
)
|
||||
from coqui_stt_training.util.augmentations import (
|
||||
parse_augmentations,
|
||||
apply_sample_augmentations,
|
||||
SampleAugmentation
|
||||
)
|
||||
|
||||
AUDIO_TYPE_LOOKUP = {'wav': AUDIO_TYPE_WAV, 'opus': AUDIO_TYPE_OPUS}
|
||||
AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}
|
||||
|
||||
|
||||
def build_data_set():
|
||||
audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
|
||||
augmentations = parse_augmentations(CLI_ARGS.augment)
|
||||
if any(not isinstance(a, SampleAugmentation) for a in augmentations):
|
||||
print('Warning: Some of the specified augmentations will not get applied, as this tool only supports '
|
||||
'overlay, codec, reverb, resample and volume.')
|
||||
print(
|
||||
"Warning: Some of the specified augmentations will not get applied, as this tool only supports "
|
||||
"overlay, codec, reverb, resample and volume."
|
||||
)
|
||||
extension = Path(CLI_ARGS.target).suffix.lower()
|
||||
labeled = not CLI_ARGS.unlabeled
|
||||
if extension == '.csv':
|
||||
writer = CSVWriter(CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled)
|
||||
elif extension == '.sdb':
|
||||
writer = DirectSDBWriter(CLI_ARGS.target, audio_type=audio_type, labeled=labeled)
|
||||
elif extension == '.tar':
|
||||
writer = TarWriter(CLI_ARGS.target, labeled=labeled, gz=False, include=CLI_ARGS.include)
|
||||
elif extension == '.tgz' or CLI_ARGS.target.lower().endswith('.tar.gz'):
|
||||
writer = TarWriter(CLI_ARGS.target, labeled=labeled, gz=True, include=CLI_ARGS.include)
|
||||
if extension == ".csv":
|
||||
writer = CSVWriter(
|
||||
CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled
|
||||
)
|
||||
elif extension == ".sdb":
|
||||
writer = DirectSDBWriter(
|
||||
CLI_ARGS.target, audio_type=audio_type, labeled=labeled
|
||||
)
|
||||
elif extension == ".tar":
|
||||
writer = TarWriter(
|
||||
CLI_ARGS.target, labeled=labeled, gz=False, include=CLI_ARGS.include
|
||||
)
|
||||
elif extension == ".tgz" or CLI_ARGS.target.lower().endswith(".tar.gz"):
|
||||
writer = TarWriter(
|
||||
CLI_ARGS.target, labeled=labeled, gz=True, include=CLI_ARGS.include
|
||||
)
|
||||
else:
|
||||
print('Unknown extension of target file - has to be either .csv, .sdb, .tar, .tar.gz or .tgz')
|
||||
print(
|
||||
"Unknown extension of target file - has to be either .csv, .sdb, .tar, .tar.gz or .tgz"
|
||||
)
|
||||
sys.exit(1)
|
||||
with writer:
|
||||
samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
|
||||
num_samples = len(samples)
|
||||
if augmentations:
|
||||
samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
|
||||
samples = apply_sample_augmentations(
|
||||
samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations
|
||||
)
|
||||
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
|
||||
for sample in bar(change_audio_types(
|
||||
for sample in bar(
|
||||
change_audio_types(
|
||||
samples,
|
||||
audio_type=audio_type,
|
||||
bitrate=CLI_ARGS.bitrate,
|
||||
processes=CLI_ARGS.workers)):
|
||||
processes=CLI_ARGS.workers,
|
||||
)
|
||||
):
|
||||
writer.add(sample)
|
||||
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Tool for building a combined SDB or CSV sample-set from other sets'
|
||||
description="Tool for building a combined SDB or CSV sample-set from other sets"
|
||||
)
|
||||
parser.add_argument(
|
||||
'sources',
|
||||
nargs='+',
|
||||
help='Source CSV and/or SDB files - '
|
||||
'Note: For getting a correctly ordered target set, source SDBs have to have their samples '
|
||||
'already ordered from shortest to longest.',
|
||||
"sources",
|
||||
nargs="+",
|
||||
help="Source CSV and/or SDB files - "
|
||||
"Note: For getting a correctly ordered target set, source SDBs have to have their samples "
|
||||
"already ordered from shortest to longest.",
|
||||
)
|
||||
parser.add_argument("target", help="SDB, CSV or TAR(.gz) file to create")
|
||||
parser.add_argument(
|
||||
'target',
|
||||
help='SDB, CSV or TAR(.gz) file to create'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--audio-type',
|
||||
default='opus',
|
||||
"--audio-type",
|
||||
default="opus",
|
||||
choices=AUDIO_TYPE_LOOKUP.keys(),
|
||||
help='Audio representation inside target SDB',
|
||||
help="Audio representation inside target SDB",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--bitrate',
|
||||
"--bitrate",
|
||||
type=int,
|
||||
help='Bitrate for lossy compressed SDB samples like in case of --audio-type opus',
|
||||
help="Bitrate for lossy compressed SDB samples like in case of --audio-type opus",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--workers', type=int, default=None, help='Number of encoding SDB workers'
|
||||
"--workers", type=int, default=None, help="Number of encoding SDB workers"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--unlabeled',
|
||||
action='store_true',
|
||||
help='If to build an data-set with unlabeled (audio only) samples - '
|
||||
'typically used for building noise augmentation corpora',
|
||||
"--unlabeled",
|
||||
action="store_true",
|
||||
help="If to build an data-set with unlabeled (audio only) samples - "
|
||||
"typically used for building noise augmentation corpora",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--absolute-paths',
|
||||
action='store_true',
|
||||
help='If to reference samples by their absolute paths when writing CSV files',
|
||||
"--absolute-paths",
|
||||
action="store_true",
|
||||
help="If to reference samples by their absolute paths when writing CSV files",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--augment',
|
||||
action='append',
|
||||
help='Add an augmentation operation',
|
||||
"--augment",
|
||||
action="append",
|
||||
help="Add an augmentation operation",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--include',
|
||||
action='append',
|
||||
help='Adds a file to the root directory of .tar(.gz) targets',
|
||||
"--include",
|
||||
action="append",
|
||||
help="Adds a file to the root directory of .tar(.gz) targets",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
CLI_ARGS = handle_args()
|
||||
build_data_set()
|
||||
|
|
|
@ -3,9 +3,10 @@
|
|||
|
||||
import sys
|
||||
|
||||
import tensorflow.compat.v1 as tfv1
|
||||
from google.protobuf import text_format
|
||||
|
||||
import tensorflow.compat.v1 as tfv1
|
||||
|
||||
|
||||
def main():
|
||||
# Load and export as string
|
||||
|
|
|
@ -4,7 +4,6 @@ import os
|
|||
import tarfile
|
||||
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.importers import get_importers_parser
|
||||
|
||||
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
|
|
|
@ -4,7 +4,6 @@ import os
|
|||
import tarfile
|
||||
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.importers import get_importers_parser
|
||||
|
||||
COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
|
|
|
@@ -5,21 +5,21 @@ Ministère de l'Économie, des Finances et de la Relance
 """

 import csv
-import sys
+import decimal
+import hashlib
+import math
 import os
+import progressbar
+import re
 import subprocess
+import sys
+import unicodedata
+import xml.etree.ElementTree as ET
 import zipfile
 from glob import glob
 from multiprocessing import Pool

-import hashlib
-import decimal
-import math
-import unicodedata
-import re
-import progressbar
 import sox
-import xml.etree.ElementTree as ET

 try:
     from num2words import num2words
@@ -27,19 +27,19 @@ except ImportError as ex:
     print("pip install num2words")
     sys.exit(1)

-import requests
 import json
+
+import requests
+from coqui_stt_ctcdecoder import Alphabet
 from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
 from coqui_stt_training.util.helpers import secs_to_hours
 from coqui_stt_training.util.importers import (
     get_counter,
-    get_importers_parser,
     get_imported_samples,
+    get_importers_parser,
     get_validate_label,
     print_import_report,
 )
-from coqui_stt_ctcdecoder import Alphabet

 FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
 SAMPLE_RATE = 16000
|
|||
|
||||
DATASET_RELEASE_CSV = "https://data.economie.gouv.fr/explore/dataset/transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
|
||||
DATASET_RELEASE_SHA = [
|
||||
("863d39a06a388c6491c6ff2f6450b151f38f1b57", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001"),
|
||||
("2f3a0305aa04c61220bb00b5a4e553e45dbf12e1", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.002"),
|
||||
("5e55e9f1f844097349188ac875947e5a3d7fe9f1", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.003"),
|
||||
("8bf54842cf07948ca5915e27a8bd5fa5139c06ae", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.004"),
|
||||
("c8963504aadc015ac48f9af80058a0bb3440b94f", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.005"),
|
||||
("d95e225e908621d83ce4e9795fd108d9d310e244", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.006"),
|
||||
("de6ed9c2b0ee80ca879aae8ba7923cc93217d811", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.007"),
|
||||
("234283c47dacfcd4450d836c52c25f3e807fc5f2", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.008"),
|
||||
("4e6b67a688639bb72f8cd81782eaba604a8d32a6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.009"),
|
||||
("4165a51389777c8af8e6253d87bdacb877e8b3b0", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.010"),
|
||||
("34322e7009780d97ef5bd02bf2f2c7a31f00baff", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.011"),
|
||||
("48c5be3b2ca9d6108d525da6a03e91d93a95dbac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.012"),
|
||||
("87573172f506a189c2ebc633856fe11a2e9cd213", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.013"),
|
||||
("6ab2c9e508e9278d5129f023e018725c4a7c69e8", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.014"),
|
||||
("4f84df831ef46dce5d3ab3e21817687a2d8c12d0", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.015"),
|
||||
("e69bfb079885c299cb81080ef88b1b8b57158aa6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.016"),
|
||||
("5f764ba788ee273981cf211b242c29b49ca22c5e", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.017"),
|
||||
("b6aa81a959525363223494830c1e7307d4c4bae6", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.018"),
|
||||
("91ddcf43c7bf113a6f2528b857c7ec22a50a148a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.019"),
|
||||
("fa1b29273dd77b9a7494983a2f9ae52654b931d7", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.020"),
|
||||
("1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.021"),
|
||||
("ce6420d5d0b6b5135ba559f83e1a82d4d615c470", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.022"),
|
||||
("d0976ed292ac24fcf1590d1ea195077c74b05471", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.023"),
|
||||
("ec746cd6af066f62d9bf8d3b2f89174783ff4e3c", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.024"),
|
||||
("570d9e1e84178e32fd867171d4b3aaecda1fd4fb", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.025"),
|
||||
("c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.026"),
|
||||
("08406a51146d88e208704ce058c060a1e44efa50", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.027"),
|
||||
("199aedad733a78ea1e7d47def9c71c6fd5795e02", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.028"),
|
||||
("db856a068f92fb4f01f410bba42c7271de0f231a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.029"),
|
||||
("e3c0135f16c6c9d25a09dcb4f99a685438a84740", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.030"),
|
||||
("e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.031"),
|
||||
("be5e80cbc49b59b31ae33c30576ef0e1a162d84e", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.032"),
|
||||
("501df58e3ff55fcfd75b93dab57566dc536948b8", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.033"),
|
||||
("1a114875811a8cdcb8d85a9f6dbee78be3e05131", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.034"),
|
||||
("465d824e7ee46448369182c0c28646d155a2249b", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.035"),
|
||||
("37f341b1b266d143eb73138c31cfff3201b9d619", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.036"),
|
||||
("9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.037"),
|
||||
("54886755630cb080a53098cb1b6c951c6714a143", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.038"),
|
||||
("4b7cbb0154697be795034f7a49712e882a97197a", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.039"),
|
||||
("c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.040"),
|
||||
(
|
||||
"863d39a06a388c6491c6ff2f6450b151f38f1b57",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001",
|
||||
),
|
||||
(
|
||||
"2f3a0305aa04c61220bb00b5a4e553e45dbf12e1",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.002",
|
||||
),
|
||||
(
|
||||
"5e55e9f1f844097349188ac875947e5a3d7fe9f1",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.003",
|
||||
),
|
||||
(
|
||||
"8bf54842cf07948ca5915e27a8bd5fa5139c06ae",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.004",
|
||||
),
|
||||
(
|
||||
"c8963504aadc015ac48f9af80058a0bb3440b94f",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.005",
|
||||
),
|
||||
(
|
||||
"d95e225e908621d83ce4e9795fd108d9d310e244",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.006",
|
||||
),
|
||||
(
|
||||
"de6ed9c2b0ee80ca879aae8ba7923cc93217d811",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.007",
|
||||
),
|
||||
(
|
||||
"234283c47dacfcd4450d836c52c25f3e807fc5f2",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.008",
|
||||
),
|
||||
(
|
||||
"4e6b67a688639bb72f8cd81782eaba604a8d32a6",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.009",
|
||||
),
|
||||
(
|
||||
"4165a51389777c8af8e6253d87bdacb877e8b3b0",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.010",
|
||||
),
|
||||
(
|
||||
"34322e7009780d97ef5bd02bf2f2c7a31f00baff",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.011",
|
||||
),
|
||||
(
|
||||
"48c5be3b2ca9d6108d525da6a03e91d93a95dbac",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.012",
|
||||
),
|
||||
(
|
||||
"87573172f506a189c2ebc633856fe11a2e9cd213",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.013",
|
||||
),
|
||||
(
|
||||
"6ab2c9e508e9278d5129f023e018725c4a7c69e8",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.014",
|
||||
),
|
||||
(
|
||||
"4f84df831ef46dce5d3ab3e21817687a2d8c12d0",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.015",
|
||||
),
|
||||
(
|
||||
"e69bfb079885c299cb81080ef88b1b8b57158aa6",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.016",
|
||||
),
|
||||
(
|
||||
"5f764ba788ee273981cf211b242c29b49ca22c5e",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.017",
|
||||
),
|
||||
(
|
||||
"b6aa81a959525363223494830c1e7307d4c4bae6",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.018",
|
||||
),
|
||||
(
|
||||
"91ddcf43c7bf113a6f2528b857c7ec22a50a148a",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.019",
|
||||
),
|
||||
(
|
||||
"fa1b29273dd77b9a7494983a2f9ae52654b931d7",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.020",
|
||||
),
|
||||
(
|
||||
"1113aef4f5e2be2f7fbf2d54b6c710c1c0e7135f",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.021",
|
||||
),
|
||||
(
|
||||
"ce6420d5d0b6b5135ba559f83e1a82d4d615c470",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.022",
|
||||
),
|
||||
(
|
||||
"d0976ed292ac24fcf1590d1ea195077c74b05471",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.023",
|
||||
),
|
||||
(
|
||||
"ec746cd6af066f62d9bf8d3b2f89174783ff4e3c",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.024",
|
||||
),
|
||||
(
|
||||
"570d9e1e84178e32fd867171d4b3aaecda1fd4fb",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.025",
|
||||
),
|
||||
(
|
||||
"c29ccc7467a75b2cae3d7f2e9fbbb2ab276cb8ac",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.026",
|
||||
),
|
||||
(
|
||||
"08406a51146d88e208704ce058c060a1e44efa50",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.027",
|
||||
),
|
||||
(
|
||||
"199aedad733a78ea1e7d47def9c71c6fd5795e02",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.028",
|
||||
),
|
||||
(
|
||||
"db856a068f92fb4f01f410bba42c7271de0f231a",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.029",
|
||||
),
|
||||
(
|
||||
"e3c0135f16c6c9d25a09dcb4f99a685438a84740",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.030",
|
||||
),
|
||||
(
|
||||
"e51b8bb9c0ae4339f98b4f21e6d29b825109f0ac",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.031",
|
||||
),
|
||||
(
|
||||
"be5e80cbc49b59b31ae33c30576ef0e1a162d84e",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.032",
|
||||
),
|
||||
(
|
||||
"501df58e3ff55fcfd75b93dab57566dc536948b8",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.033",
|
||||
),
|
||||
(
|
||||
"1a114875811a8cdcb8d85a9f6dbee78be3e05131",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.034",
|
||||
),
|
||||
(
|
||||
"465d824e7ee46448369182c0c28646d155a2249b",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.035",
|
||||
),
|
||||
(
|
||||
"37f341b1b266d143eb73138c31cfff3201b9d619",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.036",
|
||||
),
|
||||
(
|
||||
"9e7d8255987a8a77a90e0d4b55c8fd38b9fb5694",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.037",
|
||||
),
|
||||
(
|
||||
"54886755630cb080a53098cb1b6c951c6714a143",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.038",
|
||||
),
|
||||
(
|
||||
"4b7cbb0154697be795034f7a49712e882a97197a",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.039",
|
||||
),
|
||||
(
|
||||
"c8e1e565a0e7a1f6ff1dbfcefe677aa74a41d2f2",
|
||||
"transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.040",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _download_and_preprocess_data(csv_url, target_dir):
|
||||
dataset_sources = os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "data.txt")
|
||||
dataset_sources = os.path.join(
|
||||
target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "data.txt"
|
||||
)
|
||||
if os.path.exists(dataset_sources):
|
||||
return dataset_sources
|
||||
|
||||
# Making path absolute
|
||||
target_dir = os.path.abspath(target_dir)
|
||||
csv_ref = requests.get(csv_url).text.split('\r\n')[1:-1]
|
||||
csv_ref = requests.get(csv_url).text.split("\r\n")[1:-1]
|
||||
for part in csv_ref:
|
||||
part_filename = requests.head(part).headers.get("Content-Disposition").split(" ")[1].split("=")[1].replace('"', "")
|
||||
part_filename = (
|
||||
requests.head(part)
|
||||
.headers.get("Content-Disposition")
|
||||
.split(" ")[1]
|
||||
.split("=")[1]
|
||||
.replace('"', "")
|
||||
)
|
||||
if not os.path.exists(os.path.join(target_dir, part_filename)):
|
||||
part_path = maybe_download(part_filename, target_dir, part)
|
||||
|
||||
|
@@ -126,10 +255,18 @@ def _download_and_preprocess_data(csv_url, target_dir):
         assert csum == sha1

     # Conditionally extract data
-    _maybe_extract(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020", "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip", "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip")
+    _maybe_extract(
+        target_dir,
+        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020",
+        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip",
+        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip",
+    )

     # Produce source text for extraction / conversion
-    return _maybe_create_sources(os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020"))
+    return _maybe_create_sources(
+        os.path.join(target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020")
+    )


 def _maybe_extract(target_dir, extracted_data, archive, final):
     # If target_dir/extracted_data does not exist, extract archive in target_dir
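The `assert csum == sha1` above checks each downloaded archive part against the DATASET_RELEASE_SHA list. A hedged sketch of that kind of verification with hashlib; the file name and chunk size are illustrative, and the importer's own checksum helper may differ:

```python
import hashlib


def sha1_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so multi-GB archive parts never have to fit in RAM."""
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


expected = "863d39a06a388c6491c6ff2f6450b151f38f1b57"  # SHA for part .001 in the list above
assert sha1_of_file("transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip.001") == expected
```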
@ -147,7 +284,10 @@ def _maybe_extract(target_dir, extracted_data, archive, final):
|
|||
subprocess.check_call(cmdline, shell=True, cwd=target_dir)
|
||||
assert os.path.exists(archive_path)
|
||||
|
||||
print('No directory "%s" - extracting archive %s ...' % (extracted_path, archive_path))
|
||||
print(
|
||||
'No directory "%s" - extracting archive %s ...'
|
||||
% (extracted_path, archive_path)
|
||||
)
|
||||
with zipfile.ZipFile(archive_path) as zip_f:
|
||||
zip_f.extractall(extracted_path)
|
||||
|
||||
|
@ -156,6 +296,7 @@ def _maybe_extract(target_dir, extracted_data, archive, final):
|
|||
else:
|
||||
print('Found directory "%s" - not extracting it from archive.' % extracted_path)
|
||||
|
||||
|
||||
def _maybe_create_sources(dir):
|
||||
dataset_sources = os.path.join(dir, "data.txt")
|
||||
MP3 = glob(os.path.join(dir, "**", "*.mp3"))
|
||||
|
@ -168,8 +309,8 @@ def _maybe_create_sources(dir):
|
|||
for f_xml in XML:
|
||||
b_mp3 = os.path.splitext(os.path.basename(f_mp3))[0]
|
||||
b_xml = os.path.splitext(os.path.basename(f_xml))[0]
|
||||
a_mp3 = b_mp3.split('_')
|
||||
a_xml = b_xml.split('_')
|
||||
a_mp3 = b_mp3.split("_")
|
||||
a_xml = b_xml.split("_")
|
||||
score = 0
|
||||
date_mp3 = a_mp3[0]
|
||||
date_xml = a_xml[0]
|
||||
|
@ -178,7 +319,7 @@ def _maybe_create_sources(dir):
|
|||
continue
|
||||
|
||||
for i in range(min(len(a_mp3), len(a_xml))):
|
||||
if (a_mp3[i] == a_xml[i]):
|
||||
if a_mp3[i] == a_xml[i]:
|
||||
score += 1
|
||||
|
||||
if score >= 1:
|
||||
|
@ -187,7 +328,7 @@ def _maybe_create_sources(dir):
|
|||
# sort by score
|
||||
MP3_XML_Scores.sort(key=lambda x: x[2], reverse=True)
|
||||
for s_mp3, s_xml, score in MP3_XML_Scores:
|
||||
#print(s_mp3, s_xml, score)
|
||||
# print(s_mp3, s_xml, score)
|
||||
if score not in MP3_XML_Fin:
|
||||
MP3_XML_Fin[score] = {}
|
||||
|
||||
|
@ -208,13 +349,14 @@ def _maybe_create_sources(dir):
|
|||
if os.path.getsize(mp3) > 0 and os.path.getsize(xml) > 0:
|
||||
mp3 = os.path.relpath(mp3, dir)
|
||||
xml = os.path.relpath(xml, dir)
|
||||
ds.write('{},{},{:0.2e}\n'.format(xml, mp3, 2.5e-4))
|
||||
ds.write("{},{},{:0.2e}\n".format(xml, mp3, 2.5e-4))
|
||||
else:
|
||||
print("Empty file {} or {}".format(mp3, xml), file=sys.stderr)
|
||||
|
||||
print("Missing XML pairs:", MP3, file=sys.stderr)
|
||||
return dataset_sources
|
||||
|
||||
|
||||
def maybe_normalize_for_digits(label):
|
||||
# first, try to identify numbers like "50 000", "260 000"
|
||||
if " " in label:
|
||||
|
@ -234,30 +376,44 @@ def maybe_normalize_for_digits(label):
|
|||
date_or_time = re.compile(r"(\d{1,2}):(\d{2}):?(\d{2})?")
|
||||
maybe_date_or_time = date_or_time.findall(s)
|
||||
if len(maybe_date_or_time) > 0:
|
||||
maybe_hours = maybe_date_or_time[0][0]
|
||||
maybe_hours = maybe_date_or_time[0][0]
|
||||
maybe_minutes = maybe_date_or_time[0][1]
|
||||
maybe_seconds = maybe_date_or_time[0][2]
|
||||
if len(maybe_seconds) > 0:
|
||||
label = label.replace("{}:{}:{}".format(maybe_hours, maybe_minutes, maybe_seconds), "{} heures {} minutes et {} secondes".format(maybe_hours, maybe_minutes, maybe_seconds))
|
||||
label = label.replace(
|
||||
"{}:{}:{}".format(
|
||||
maybe_hours, maybe_minutes, maybe_seconds
|
||||
),
|
||||
"{} heures {} minutes et {} secondes".format(
|
||||
maybe_hours, maybe_minutes, maybe_seconds
|
||||
),
|
||||
)
|
||||
else:
|
||||
label = label.replace("{}:{}".format(maybe_hours, maybe_minutes), "{} heures et {} minutes".format(maybe_hours, maybe_minutes))
|
||||
label = label.replace(
|
||||
"{}:{}".format(maybe_hours, maybe_minutes),
|
||||
"{} heures et {} minutes".format(
|
||||
maybe_hours, maybe_minutes
|
||||
),
|
||||
)
|
||||
|
||||
new_label = []
|
||||
# pylint: disable=too-many-nested-blocks
|
||||
for s in label.split(" "):
|
||||
if any(i.isdigit() for i in s):
|
||||
s = s.replace(",", ".") # num2words requires "." for floats
|
||||
s = s.replace("\"", "") # clean some data, num2words would choke on 1959"
|
||||
s = s.replace(",", ".") # num2words requires "." for floats
|
||||
s = s.replace('"', "") # clean some data, num2words would choke on 1959"
|
||||
|
||||
last_c = s[-1]
|
||||
if not last_c.isdigit(): # num2words will choke on "0.6.", "24 ?"
|
||||
if not last_c.isdigit(): # num2words will choke on "0.6.", "24 ?"
|
||||
s = s[:-1]
|
||||
|
||||
if any(i.isalpha() for i in s): # So we have any(isdigit()) **and** any(sialpha), like "3D"
|
||||
if any(
|
||||
i.isalpha() for i in s
|
||||
): # So we have any(isdigit()) **and** any(sialpha), like "3D"
|
||||
ns = []
|
||||
for c in s:
|
||||
nc = c
|
||||
if c.isdigit(): # convert "3" to "trois-"
|
||||
if c.isdigit(): # convert "3" to "trois-"
|
||||
try:
|
||||
nc = num2words(c, lang="fr") + "-"
|
||||
except decimal.InvalidOperation as ex:
|
||||
|
@ -274,22 +430,36 @@ def maybe_normalize_for_digits(label):
|
|||
new_label.append(s)
|
||||
return " ".join(new_label)
|
||||
|
||||
|
||||
def maybe_normalize_for_specials_chars(label):
|
||||
label = label.replace("%", "pourcents")
|
||||
label = label.replace("/", ", ") # clean intervals like 2019/2022 to "2019 2022"
|
||||
label = label.replace("-", ", ") # clean intervals like 70-80 to "70 80"
|
||||
label = label.replace("+", " plus ") # clean + and make it speakable
|
||||
label = label.replace("€", " euros ") # clean euro symbol and make it speakable
|
||||
label = label.replace("., ", ", ") # clean some strange "4.0., " (20181017_Innovation.xml)
|
||||
label = label.replace("°", " degré ") # clean some strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
|
||||
label = label.replace("...", ".") # remove ellipsis
|
||||
label = label.replace("..", ".") # remove broken ellipsis
|
||||
label = label.replace("m²", "mètre-carrés") # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
|
||||
label = label.replace("[end]", "") # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
|
||||
label = label.replace(u'\xB8c', " ç") # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
|
||||
label = label.replace("C0²", "CO 2") # 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
|
||||
label = label.replace("/", ", ") # clean intervals like 2019/2022 to "2019 2022"
|
||||
label = label.replace("-", ", ") # clean intervals like 70-80 to "70 80"
|
||||
label = label.replace("+", " plus ") # clean + and make it speakable
|
||||
label = label.replace("€", " euros ") # clean euro symbol and make it speakable
|
||||
label = label.replace(
|
||||
"., ", ", "
|
||||
) # clean some strange "4.0., " (20181017_Innovation.xml)
|
||||
label = label.replace(
|
||||
"°", " degré "
|
||||
) # clean some strange "°5" (20181210_EtatsGeneraux-1000_fre_750_und.xml)
|
||||
label = label.replace("...", ".") # remove ellipsis
|
||||
label = label.replace("..", ".") # remove broken ellipsis
|
||||
label = label.replace(
|
||||
"m²", "mètre-carrés"
|
||||
) # 20150616_Defi_Climat_3_wmv_0_fre_minefi.xml
|
||||
label = label.replace(
|
||||
"[end]", ""
|
||||
) # broken tag in 20150123_Entretiens_Tresor_PGM_wmv_0_fre_minefi.xml
|
||||
label = label.replace(
|
||||
u"\xB8c", " ç"
|
||||
) # strange cedilla in 20150417_Printemps_Economie_2_wmv_0_fre_minefi.xml
|
||||
label = label.replace(
|
||||
"C0²", "CO 2"
|
||||
) # 20121016_Syteme_sante_copie_wmv_0_fre_minefi.xml
|
||||
return label
|
||||
|
||||
|
||||
def maybe_normalize_for_anglicisms(label):
|
||||
label = label.replace("B2B", "B to B")
|
||||
label = label.replace("B2C", "B to C")
|
||||
|
@ -297,12 +467,14 @@ def maybe_normalize_for_anglicisms(label):
|
|||
label = label.replace("@", "at ")
|
||||
return label
|
||||
|
||||
|
||||
def maybe_normalize(label):
|
||||
label = maybe_normalize_for_specials_chars(label)
|
||||
label = maybe_normalize_for_anglicisms(label)
|
||||
label = maybe_normalize_for_digits(label)
|
||||
return label
|
||||
|
||||
|
||||
def one_sample(sample):
|
||||
file_size = -1
|
||||
frames = 0
|
||||
|
@ -316,14 +488,33 @@ def one_sample(sample):
|
|||
label = label_filter_fun(sample[5])
|
||||
sample_id = sample[6]
|
||||
|
||||
_wav_filename = os.path.basename(audio_source.replace(".wav", "_{:06}.wav".format(sample_id)))
|
||||
_wav_filename = os.path.basename(
|
||||
audio_source.replace(".wav", "_{:06}.wav".format(sample_id))
|
||||
)
|
||||
wav_fullname = os.path.join(target_dir, dataset_basename, _wav_filename)
|
||||
|
||||
if not os.path.exists(wav_fullname):
|
||||
subprocess.check_output(["ffmpeg", "-i", audio_source, "-ss", str(start_time), "-t", str(duration), "-c", "copy", wav_fullname], stdin=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
||||
subprocess.check_output(
|
||||
[
|
||||
"ffmpeg",
|
||||
"-i",
|
||||
audio_source,
|
||||
"-ss",
|
||||
str(start_time),
|
||||
"-t",
|
||||
str(duration),
|
||||
"-c",
|
||||
"copy",
|
||||
wav_fullname,
|
||||
],
|
||||
stdin=subprocess.DEVNULL,
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
|
||||
file_size = os.path.getsize(wav_fullname)
|
||||
frames = int(subprocess.check_output(["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT))
|
||||
frames = int(
|
||||
subprocess.check_output(["soxi", "-s", wav_fullname], stderr=subprocess.STDOUT)
|
||||
)
|
||||
|
||||
_counter = get_counter()
|
||||
_rows = []
|
||||
|
@ -334,13 +525,13 @@ def one_sample(sample):
|
|||
elif label is None:
|
||||
# Excluding samples that failed on label validation
|
||||
_counter["invalid_label"] += 1
|
||||
elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)):
|
||||
elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
|
||||
# Excluding samples that are too short to fit the transcript
|
||||
_counter["too_short"] += 1
|
||||
elif frames/SAMPLE_RATE < MIN_SECS:
|
||||
elif frames / SAMPLE_RATE < MIN_SECS:
|
||||
# Excluding samples that are too short
|
||||
_counter["too_short"] += 1
|
||||
elif frames/SAMPLE_RATE > MAX_SECS:
|
||||
elif frames / SAMPLE_RATE > MAX_SECS:
|
||||
# Excluding very long samples to keep a reasonable batch-size
|
||||
_counter["too_long"] += 1
|
||||
else:
|
||||
|
@ -352,56 +543,71 @@ def one_sample(sample):
|
|||
|
||||
return (_counter, _rows)
|
||||
|
||||
|
||||
def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
|
||||
dataset_basename = os.path.splitext(os.path.split(xml_file)[1])[0]
|
||||
wav_root = os.path.join(target_dir, dataset_basename)
|
||||
if not os.path.exists(wav_root):
|
||||
os.makedirs(wav_root)
|
||||
|
||||
source_frames = int(subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT))
|
||||
source_frames = int(
|
||||
subprocess.check_output(["soxi", "-s", audio_source], stderr=subprocess.STDOUT)
|
||||
)
|
||||
print("Source audio length: %s" % secs_to_hours(source_frames / SAMPLE_RATE))
|
||||
|
||||
# Get audiofile path and transcript for each sentence in tsv
|
||||
samples = []
|
||||
tree = ET.parse(xml_file)
|
||||
root = tree.getroot()
|
||||
seq_id = 0
|
||||
this_time = 0.0
|
||||
seq_id = 0
|
||||
this_time = 0.0
|
||||
this_duration = 0.0
|
||||
prev_time = 0.0
|
||||
prev_time = 0.0
|
||||
prev_duration = 0.0
|
||||
this_text = ""
|
||||
this_text = ""
|
||||
for child in root:
|
||||
if child.tag == "row":
|
||||
cur_time = float(child.attrib["timestamp"])
|
||||
cur_time = float(child.attrib["timestamp"])
|
||||
cur_duration = float(child.attrib["timedur"])
|
||||
cur_text = child.text
|
||||
cur_text = child.text
|
||||
|
||||
if this_time == 0.0:
|
||||
this_time = cur_time
|
||||
|
||||
delta = cur_time - (prev_time + prev_duration)
|
||||
delta = cur_time - (prev_time + prev_duration)
|
||||
# rel_tol value is made from trial/error to try and compromise between:
|
||||
# - cutting enough to skip missing words
|
||||
# - not too short, not too long sentences
|
||||
is_close = math.isclose(cur_time, this_time + this_duration, rel_tol=rel_tol)
|
||||
is_short = ((this_duration + cur_duration + delta) < MAX_SECS)
|
||||
is_close = math.isclose(
|
||||
cur_time, this_time + this_duration, rel_tol=rel_tol
|
||||
)
|
||||
is_short = (this_duration + cur_duration + delta) < MAX_SECS
|
||||
|
||||
# when the previous element is close enough **and** this does not
|
||||
# go over MAX_SECS, we append content
|
||||
if (is_close and is_short):
|
||||
if is_close and is_short:
|
||||
this_duration += cur_duration + delta
|
||||
this_text += cur_text
|
||||
this_text += cur_text
|
||||
else:
|
||||
samples.append((audio_source, target_dir, dataset_basename, this_time, this_duration, this_text, seq_id))
|
||||
samples.append(
|
||||
(
|
||||
audio_source,
|
||||
target_dir,
|
||||
dataset_basename,
|
||||
this_time,
|
||||
this_duration,
|
||||
this_text,
|
||||
seq_id,
|
||||
)
|
||||
)
|
||||
|
||||
this_time = cur_time
|
||||
this_time = cur_time
|
||||
this_duration = cur_duration
|
||||
this_text = cur_text
|
||||
this_text = cur_text
|
||||
|
||||
seq_id += 1
|
||||
|
||||
prev_time = cur_time
|
||||
prev_time = cur_time
|
||||
prev_duration = cur_duration
|
||||
|
||||
# Keep track of how many samples are good vs. problematic
|
||||
|
@ -425,21 +631,27 @@ def _maybe_import_data(xml_file, audio_source, target_dir, rel_tol=1e-1):
|
|||
assert len(_rows) == imported_samples
|
||||
|
||||
print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
|
||||
print("Import efficiency: %.1f%%" % ((_counter["total_time"] / source_frames)*100))
|
||||
print(
|
||||
"Import efficiency: %.1f%%" % ((_counter["total_time"] / source_frames) * 100)
|
||||
)
|
||||
print("")
|
||||
|
||||
return _counter, _rows
|
||||
|
||||
|
||||
def _maybe_convert_wav(mp3_filename, _wav_filename):
|
||||
if not os.path.exists(_wav_filename):
|
||||
print("Converting {} to WAV file: {}".format(mp3_filename, _wav_filename))
|
||||
transformer = sox.Transformer()
|
||||
transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH)
|
||||
transformer.convert(
|
||||
samplerate=SAMPLE_RATE, n_channels=CHANNELS, bitdepth=BIT_DEPTH
|
||||
)
|
||||
try:
|
||||
transformer.build(mp3_filename, _wav_filename)
|
||||
except sox.core.SoxError:
|
||||
pass
|
||||
|
||||
|
||||
def write_general_csv(target_dir, _rows, _counter):
|
||||
target_csv_template = os.path.join(target_dir, "ccpmf_{}.csv")
|
||||
with open(target_csv_template.format("train"), "w") as train_csv_file: # 80%
|
||||
|
@ -461,7 +673,13 @@ def write_general_csv(target_dir, _rows, _counter):
|
|||
writer = dev_writer
|
||||
else:
|
||||
writer = train_writer
|
||||
writer.writerow({"wav_filename": item[0], "wav_filesize": item[1], "transcript": item[2]})
|
||||
writer.writerow(
|
||||
{
|
||||
"wav_filename": item[0],
|
||||
"wav_filesize": item[1],
|
||||
"transcript": item[2],
|
||||
}
|
||||
)
|
||||
|
||||
print("")
|
||||
print("~~~~ FINAL STATISTICS ~~~~")
|
||||
|
@ -469,11 +687,21 @@ def write_general_csv(target_dir, _rows, _counter):
|
|||
print("~~~~ (FINAL STATISTICS) ~~~~")
|
||||
print("")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
PARSER = get_importers_parser(description="Import XML from Conference Centre for Economics, France")
|
||||
PARSER = get_importers_parser(
|
||||
description="Import XML from Conference Centre for Economics, France"
|
||||
)
|
||||
PARSER.add_argument("target_dir", help="Destination directory")
|
||||
PARSER.add_argument("--filter_alphabet", help="Exclude samples with characters not in provided alphabet")
|
||||
PARSER.add_argument("--normalize", action="store_true", help="Converts diacritic characters to their base ones")
|
||||
PARSER.add_argument(
|
||||
"--filter_alphabet",
|
||||
help="Exclude samples with characters not in provided alphabet",
|
||||
)
|
||||
PARSER.add_argument(
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
help="Converts diacritic characters to their base ones",
|
||||
)
|
||||
|
||||
PARAMS = PARSER.parse_args()
|
||||
validate_label = get_validate_label(PARAMS)
|
||||
|
@ -481,9 +709,11 @@ if __name__ == "__main__":
|
|||
|
||||
def label_filter_fun(label):
|
||||
if PARAMS.normalize:
|
||||
label = unicodedata.normalize("NFKD", label.strip()) \
|
||||
.encode("ascii", "ignore") \
|
||||
label = (
|
||||
unicodedata.normalize("NFKD", label.strip())
|
||||
.encode("ascii", "ignore")
|
||||
.decode("ascii", "ignore")
|
||||
)
|
||||
label = maybe_normalize(label)
|
||||
label = validate_label(label)
|
||||
if ALPHABET and label:
|
||||
|
@ -493,7 +723,9 @@ if __name__ == "__main__":
|
|||
label = None
|
||||
return label
|
||||
|
||||
dataset_sources = _download_and_preprocess_data(csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir)
|
||||
dataset_sources = _download_and_preprocess_data(
|
||||
csv_url=DATASET_RELEASE_CSV, target_dir=PARAMS.target_dir
|
||||
)
|
||||
sources_root_dir = os.path.dirname(dataset_sources)
|
||||
all_counter = get_counter()
|
||||
all_rows = []
|
||||
|
@ -504,9 +736,14 @@ if __name__ == "__main__":
|
|||
this_mp3 = os.path.join(sources_root_dir, d[1])
|
||||
this_rel = float(d[2])
|
||||
|
||||
wav_filename = os.path.join(sources_root_dir, os.path.splitext(os.path.basename(this_mp3))[0] + ".wav")
|
||||
wav_filename = os.path.join(
|
||||
sources_root_dir,
|
||||
os.path.splitext(os.path.basename(this_mp3))[0] + ".wav",
|
||||
)
|
||||
_maybe_convert_wav(this_mp3, wav_filename)
|
||||
counter, rows = _maybe_import_data(this_xml, wav_filename, sources_root_dir, this_rel)
|
||||
counter, rows = _maybe_import_data(
|
||||
this_xml, wav_filename, sources_root_dir, this_rel
|
||||
)
|
||||
|
||||
all_counter += counter
|
||||
all_rows += rows
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
#!/usr/bin/env python
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import sys
|
||||
import tarfile
|
||||
from glob import glob
|
||||
from multiprocessing import Pool
|
||||
|
||||
import progressbar
|
||||
import sox
|
||||
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
|
|
@ -14,7 +14,7 @@ from multiprocessing import Pool
|
|||
|
||||
import progressbar
|
||||
import sox
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
@ -23,7 +23,6 @@ from coqui_stt_training.util.importers import (
|
|||
get_validate_label,
|
||||
print_import_report,
|
||||
)
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
SAMPLE_RATE = 16000
|
||||
|
@@ -41,7 +40,11 @@ class LabelFilter:

     def filter(self, label):
         if self.normalize:
-            label = unicodedata.normalize("NFKD", label.strip()).encode("ascii", "ignore").decode("ascii", "ignore")
+            label = (
+                unicodedata.normalize("NFKD", label.strip())
+                .encode("ascii", "ignore")
+                .decode("ascii", "ignore")
+            )
         label = self.validate_fun(label)
         if self.alphabet and label and not self.alphabet.CanEncode(label):
             label = None
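The normalize/encode/decode chain above strips diacritics: NFKD decomposition splits accented characters into a base letter plus combining marks, and the ASCII round-trip drops the marks. A quick self-contained illustration of the effect:

```python
import unicodedata


def strip_diacritics(text: str) -> str:
    # NFKD turns "é" into "e" + a combining accent; encoding to ASCII with
    # errors="ignore" then drops the accent and keeps the base letter.
    return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii", "ignore")


print(strip_diacritics("Ministère de l'Économie"))  # Ministere de l'Economie
```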
@ -97,7 +100,15 @@ def one_sample(sample):
|
|||
return (counter, rows)
|
||||
|
||||
|
||||
def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None, exclude=None):
|
||||
def _maybe_convert_set(
|
||||
dataset,
|
||||
tsv_dir,
|
||||
audio_dir,
|
||||
filter_obj,
|
||||
space_after_every_character=None,
|
||||
rows=None,
|
||||
exclude=None,
|
||||
):
|
||||
exclude_transcripts = set()
|
||||
exclude_speakers = set()
|
||||
if exclude is not None:
|
||||
|
@ -116,7 +127,13 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
|
|||
with open(input_tsv, encoding="utf-8") as input_tsv_file:
|
||||
reader = csv.DictReader(input_tsv_file, delimiter="\t")
|
||||
for row in reader:
|
||||
samples.append((os.path.join(audio_dir, row["path"]), row["sentence"], row["client_id"]))
|
||||
samples.append(
|
||||
(
|
||||
os.path.join(audio_dir, row["path"]),
|
||||
row["sentence"],
|
||||
row["client_id"],
|
||||
)
|
||||
)
|
||||
|
||||
counter = get_counter()
|
||||
num_samples = len(samples)
|
||||
|
@ -124,7 +141,9 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
|
|||
print("Importing mp3 files...")
|
||||
pool = Pool(initializer=init_worker, initargs=(PARAMS,))
|
||||
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
|
||||
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
|
||||
for i, processed in enumerate(
|
||||
pool.imap_unordered(one_sample, samples), start=1
|
||||
):
|
||||
counter += processed[0]
|
||||
rows += processed[1]
|
||||
bar.update(i)
|
||||
|
@ -169,12 +188,20 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
|
|||
def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
|
||||
exclude = []
|
||||
for dataset in ["test", "dev", "train", "validated", "other"]:
|
||||
set_samples = _maybe_convert_set(dataset, tsv_dir, audio_dir, space_after_every_character)
|
||||
set_samples = _maybe_convert_set(
|
||||
dataset, tsv_dir, audio_dir, space_after_every_character
|
||||
)
|
||||
if dataset in ["test", "dev"]:
|
||||
exclude += set_samples
|
||||
if dataset == "validated":
|
||||
_maybe_convert_set("train-all", tsv_dir, audio_dir, space_after_every_character,
|
||||
rows=set_samples, exclude=exclude)
|
||||
_maybe_convert_set(
|
||||
"train-all",
|
||||
tsv_dir,
|
||||
audio_dir,
|
||||
space_after_every_character,
|
||||
rows=set_samples,
|
||||
exclude=exclude,
|
||||
)
|
||||
|
||||
|
||||
def _maybe_convert_wav(mp3_filename, wav_filename):
|
||||
|
@ -212,7 +239,9 @@ def parse_args():
|
|||
|
||||
|
||||
def main():
|
||||
audio_dir = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(PARAMS.tsv_dir, "clips")
|
||||
audio_dir = (
|
||||
PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(PARAMS.tsv_dir, "clips")
|
||||
)
|
||||
_preprocess_data(PARAMS.tsv_dir, audio_dir, PARAMS.space_after_every_character)
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,6 @@ import unicodedata
|
|||
import librosa
|
||||
import pandas
|
||||
import soundfile # <= Has an external dependency on libsndfile
|
||||
|
||||
from coqui_stt_training.util.importers import validate_label_eng as validate_label
|
||||
|
||||
# Prerequisite: Having the sph2pipe tool in your PATH:
|
||||
|
@ -239,7 +238,7 @@ def _split_and_resample_wav(origAudio, start_time, stop_time, new_wav_file):
|
|||
def _split_sets(filelist):
|
||||
"""
|
||||
randomly split the datasets into train, validation, and test sets where the size of the
|
||||
validation and test sets are determined by the `get_sample_size` function.
|
||||
validation and test sets are determined by the `get_sample_size` function.
|
||||
"""
|
||||
random.shuffle(filelist)
|
||||
sample_size = get_sample_size(len(filelist))
|
||||
|
@ -261,8 +260,7 @@ def _split_sets(filelist):
|
|||
|
||||
|
||||
def get_sample_size(population_size):
|
||||
"""calculates the sample size for a 99% confidence and 1% margin of error
|
||||
"""
|
||||
"""calculates the sample size for a 99% confidence and 1% margin of error"""
|
||||
margin_of_error = 0.01
|
||||
fraction_picking = 0.50
|
||||
z_score = 2.58 # Corresponds to confidence level 99%
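The constants above pin down the usual finite-population sample-size calculation: z = 2.58 for 99% confidence, a 1% margin of error, and p = 0.5 as the most conservative proportion. The importer's own arithmetic is outside this hunk, so the following is only a sketch of the standard Cochran formula those constants suggest:

```
def sample_size(population_size,
                margin_of_error=0.01,
                fraction_picking=0.50,
                z_score=2.58):
    # Cochran's formula with a finite-population correction.
    numerator = (z_score ** 2 * fraction_picking * (1 - fraction_picking)) / (
        margin_of_error ** 2
    )
    finite_correction = 1 + numerator / population_size
    return int(round(numerator / finite_correction))

# For a 100,000-utterance file list this works out to roughly 14,000 samples.
print(sample_size(100_000))
```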
|
||||
|
|
|
@ -5,7 +5,6 @@ import tarfile
|
|||
|
||||
import numpy as np
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.importers import get_importers_parser
|
||||
|
||||
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
|
|
|
@ -9,10 +9,9 @@ import urllib
|
|||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from sox import Transformer
|
||||
|
||||
import swifter
|
||||
from coqui_stt_training.util.importers import get_importers_parser, get_validate_label
|
||||
from sox import Transformer
|
||||
|
||||
__version__ = "0.1.0"
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
|
|
@ -3,7 +3,6 @@ import os
|
|||
import sys
|
||||
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.downloader import maybe_download
|
||||
|
||||
|
||||
|
|
|
@ -9,10 +9,10 @@ import unicodedata
|
|||
|
||||
import pandas
|
||||
import progressbar
|
||||
from sox import Transformer
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
from coqui_stt_training.util.downloader import maybe_download
|
||||
from sox import Transformer
|
||||
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
SAMPLE_RATE = 16000
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ from multiprocessing import Pool
|
|||
|
||||
import progressbar
|
||||
import sox
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
@ -20,7 +20,6 @@ from coqui_stt_training.util.importers import (
|
|||
get_validate_label,
|
||||
print_import_report,
|
||||
)
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
SAMPLE_RATE = 16000
|
||||
|
@ -137,9 +136,15 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
|||
pool.close()
|
||||
pool.join()
|
||||
|
||||
with open(target_csv_template.format("train"), "w", encoding="utf-8", newline="") as train_csv_file: # 80%
|
||||
with open(target_csv_template.format("dev"), "w", encoding="utf-8", newline="") as dev_csv_file: # 10%
|
||||
with open(target_csv_template.format("test"), "w", encoding="utf-8", newline="") as test_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
|
||||
) as train_csv_file: # 80%
|
||||
with open(
|
||||
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
|
||||
) as dev_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
|
||||
) as test_csv_file: # 10%
|
||||
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
|
||||
train_writer.writeheader()
|
||||
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
|
||||
|
@ -179,7 +184,9 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
|||
def _maybe_convert_wav(ogg_filename, wav_filename):
|
||||
if not os.path.exists(wav_filename):
|
||||
transformer = sox.Transformer()
|
||||
transformer.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
|
||||
transformer.convert(
|
||||
samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH
|
||||
)
|
||||
try:
|
||||
transformer.build(ogg_filename, wav_filename)
|
||||
except sox.core.SoxError as ex:
|
||||
|
|
|
@ -9,7 +9,7 @@ from glob import glob
|
|||
from multiprocessing import Pool
|
||||
|
||||
import progressbar
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
@ -18,7 +18,6 @@ from coqui_stt_training.util.importers import (
|
|||
get_validate_label,
|
||||
print_import_report,
|
||||
)
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
SAMPLE_RATE = 16000
|
||||
|
@ -60,9 +59,20 @@ def one_sample(sample):
|
|||
file_size = -1
|
||||
frames = 0
|
||||
if os.path.exists(wav_filename):
|
||||
tmp_filename = os.path.splitext(wav_filename)[0]+'.tmp.wav'
|
||||
tmp_filename = os.path.splitext(wav_filename)[0] + ".tmp.wav"
|
||||
subprocess.check_call(
|
||||
['sox', wav_filename, '-r', str(SAMPLE_RATE), '-c', '1', '-b', '16', tmp_filename], stderr=subprocess.STDOUT
|
||||
[
|
||||
"sox",
|
||||
wav_filename,
|
||||
"-r",
|
||||
str(SAMPLE_RATE),
|
||||
"-c",
|
||||
"1",
|
||||
"-b",
|
||||
"16",
|
||||
tmp_filename,
|
||||
],
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
os.rename(tmp_filename, wav_filename)
|
||||
file_size = os.path.getsize(wav_filename)
|
||||
|
@ -138,9 +148,15 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
|||
pool.close()
|
||||
pool.join()
|
||||
|
||||
with open(target_csv_template.format("train"), "w", encoding="utf-8", newline="") as train_csv_file: # 80%
|
||||
with open(target_csv_template.format("dev"), "w", encoding="utf-8", newline="") as dev_csv_file: # 10%
|
||||
with open(target_csv_template.format("test"), "w", encoding="utf-8", newline="") as test_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
|
||||
) as train_csv_file: # 80%
|
||||
with open(
|
||||
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
|
||||
) as dev_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
|
||||
) as test_csv_file: # 10%
|
||||
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
|
||||
train_writer.writeheader()
|
||||
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
|
||||
|
|
|
@ -5,7 +5,6 @@ import tarfile
|
|||
import wave
|
||||
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.importers import get_importers_parser
|
||||
|
||||
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
|
|
|
@ -2,10 +2,9 @@
|
|||
import argparse
|
||||
import ctypes
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas
|
||||
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,6 @@ import tarfile
|
|||
|
||||
import numpy as np
|
||||
import pandas
|
||||
|
||||
from coqui_stt_training.util.importers import get_importers_parser
|
||||
|
||||
COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
|
|
|
@ -8,7 +8,7 @@ from glob import glob
|
|||
from multiprocessing import Pool
|
||||
|
||||
import progressbar
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
@ -17,7 +17,6 @@ from coqui_stt_training.util.importers import (
|
|||
get_validate_label,
|
||||
print_import_report,
|
||||
)
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
||||
SAMPLE_RATE = 16000
|
||||
|
@ -157,9 +156,15 @@ def _maybe_convert_sets(target_dir, extracted_data):
|
|||
pool.close()
|
||||
pool.join()
|
||||
|
||||
with open(target_csv_template.format("train"), "w", encoding="utf-8", newline="") as train_csv_file: # 80%
|
||||
with open(target_csv_template.format("dev"), "w", encoding="utf-8", newline="") as dev_csv_file: # 10%
|
||||
with open(target_csv_template.format("test"), "w", encoding="utf-8", newline="") as test_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
|
||||
) as train_csv_file: # 80%
|
||||
with open(
|
||||
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
|
||||
) as dev_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
|
||||
) as test_csv_file: # 10%
|
||||
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
|
||||
train_writer.writeheader()
|
||||
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
|
||||
|
|
|
@ -16,7 +16,6 @@ import librosa
|
|||
import pandas
|
||||
import requests
|
||||
import soundfile # <= Has an external dependency on libsndfile
|
||||
|
||||
from coqui_stt_training.util.importers import validate_label_eng as validate_label
|
||||
|
||||
# ARCHIVE_NAME refers to ISIP alignments from 01/29/03
|
||||
|
@ -293,7 +292,7 @@ def _split_wav(origAudio, start_time, stop_time, new_wav_file):
|
|||
def _split_sets(filelist):
|
||||
"""
|
||||
randomly split the datasets into train, validation, and test sets where the size of the
|
||||
validation and test sets are determined by the `get_sample_size` function.
|
||||
validation and test sets are determined by the `get_sample_size` function.
|
||||
"""
|
||||
random.shuffle(filelist)
|
||||
sample_size = get_sample_size(len(filelist))
|
||||
|
@ -315,8 +314,7 @@ def _split_sets(filelist):
|
|||
|
||||
|
||||
def get_sample_size(population_size):
|
||||
"""calculates the sample size for a 99% confidence and 1% margin of error
|
||||
"""
|
||||
"""calculates the sample size for a 99% confidence and 1% margin of error"""
|
||||
margin_of_error = 0.01
|
||||
fraction_picking = 0.50
|
||||
z_score = 2.58 # Corresponds to confidence level 99%
|
||||
|
|
|
@ -21,10 +21,9 @@ from multiprocessing.pool import ThreadPool
|
|||
|
||||
import progressbar
|
||||
import sox
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import validate_label_eng as validate_label
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
SWC_URL = "https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_{language}.tar"
|
||||
SWC_ARCHIVE = "SWC_{language}.tar"
|
||||
|
@ -173,7 +172,6 @@ def in_alphabet(alphabet, c):
|
|||
return alphabet.CanEncode(c) if alphabet else True
|
||||
|
||||
|
||||
|
||||
ALPHABETS = {}
|
||||
|
||||
|
||||
|
@ -202,8 +200,16 @@ def label_filter(label, language):
|
|||
dont_normalize = DONT_NORMALIZE[language] if language in DONT_NORMALIZE else ""
|
||||
alphabet = get_alphabet(language)
|
||||
for c in label:
|
||||
if CLI_ARGS.normalize and c not in dont_normalize and not in_alphabet(alphabet, c):
|
||||
c = unicodedata.normalize("NFKD", c).encode("ascii", "ignore").decode("ascii", "ignore")
|
||||
if (
|
||||
CLI_ARGS.normalize
|
||||
and c not in dont_normalize
|
||||
and not in_alphabet(alphabet, c)
|
||||
):
|
||||
c = (
|
||||
unicodedata.normalize("NFKD", c)
|
||||
.encode("ascii", "ignore")
|
||||
.decode("ascii", "ignore")
|
||||
)
|
||||
for sc in c:
|
||||
if not in_alphabet(alphabet, sc):
|
||||
return None, "illegal character"
|
||||
|
|
|
@ -7,11 +7,11 @@ from glob import glob
|
|||
from os import makedirs, path, remove, rmdir
|
||||
|
||||
import pandas
|
||||
from sox import Transformer
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
from coqui_stt_training.util.downloader import maybe_download
|
||||
from coqui_stt_training.util.stm import parse_stm_file
|
||||
from sox import Transformer
|
||||
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
|
||||
def _download_and_preprocess_data(data_dir):
|
||||
|
|
|
@ -8,7 +8,6 @@ from multiprocessing import Pool
|
|||
|
||||
import progressbar
|
||||
import sox
|
||||
|
||||
import unidecode
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
|
@ -132,9 +131,15 @@ def _maybe_convert_sets(target_dir, extracted_data, english_compatible=False):
|
|||
pool.close()
|
||||
pool.join()
|
||||
|
||||
with open(target_csv_template.format("train"), "w", encoding="utf-8", newline="") as train_csv_file: # 80%
|
||||
with open(target_csv_template.format("dev"), "w", encoding="utf-8", newline="") as dev_csv_file: # 10%
|
||||
with open(target_csv_template.format("test"), "w", encoding="utf-8", newline="") as test_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("train"), "w", encoding="utf-8", newline=""
|
||||
) as train_csv_file: # 80%
|
||||
with open(
|
||||
target_csv_template.format("dev"), "w", encoding="utf-8", newline=""
|
||||
) as dev_csv_file: # 10%
|
||||
with open(
|
||||
target_csv_template.format("test"), "w", encoding="utf-8", newline=""
|
||||
) as test_csv_file: # 10%
|
||||
train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
|
||||
train_writer.writeheader()
|
||||
dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
|
||||
|
|
|
@ -13,10 +13,9 @@ import xml.etree.ElementTree as ET
|
|||
from collections import Counter
|
||||
|
||||
import progressbar
|
||||
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import validate_label_eng as validate_label
|
||||
from coqui_stt_ctcdecoder import Alphabet
|
||||
|
||||
TUDA_VERSION = "v2"
|
||||
TUDA_PACKAGE = "german-speechdata-package-{}".format(TUDA_VERSION)
|
||||
|
@ -55,7 +54,11 @@ def check_and_prepare_sentence(sentence):
|
|||
chars = []
|
||||
for c in sentence:
|
||||
if CLI_ARGS.normalize and c not in "äöüß" and not in_alphabet(c):
|
||||
c = unicodedata.normalize("NFKD", c).encode("ascii", "ignore").decode("ascii", "ignore")
|
||||
c = (
|
||||
unicodedata.normalize("NFKD", c)
|
||||
.encode("ascii", "ignore")
|
||||
.decode("ascii", "ignore")
|
||||
)
|
||||
for sc in c:
|
||||
if not in_alphabet(c):
|
||||
return None
|
||||
|
@ -118,7 +121,7 @@ def write_csvs(extracted):
|
|||
sentence = list(meta.iter("cleaned_sentence"))[0].text
|
||||
sentence = check_and_prepare_sentence(sentence)
|
||||
if sentence is None:
|
||||
reasons['alphabet filter'] += 1
|
||||
reasons["alphabet filter"] += 1
|
||||
continue
|
||||
for wav_name in wav_names:
|
||||
sample_counter += 1
|
||||
|
|
|
@ -10,7 +10,6 @@ from zipfile import ZipFile
|
|||
|
||||
import librosa
|
||||
import progressbar
|
||||
|
||||
from coqui_stt_training.util.downloader import SIMPLE_BAR, maybe_download
|
||||
from coqui_stt_training.util.importers import (
|
||||
get_counter,
|
||||
|
|
|
@ -13,9 +13,10 @@ from os import makedirs, path
|
|||
|
||||
import pandas
|
||||
from bs4 import BeautifulSoup
|
||||
from tensorflow.python.platform import gfile
|
||||
from coqui_stt_training.util.downloader import maybe_download
|
||||
|
||||
from tensorflow.python.platform import gfile
|
||||
|
||||
"""The number of jobs to run in parallel"""
|
||||
NUM_PARALLEL = 8
|
||||
|
||||
|
|
60
bin/play.py
|
@ -4,14 +4,26 @@ Tool for playing (and augmenting) single samples or samples from Sample Database
|
|||
Use "python3 play.py -h" for help
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import random
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
from coqui_stt_training.util.audio import get_loadable_audio_type_from_extension, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV
|
||||
from coqui_stt_training.util.sample_collections import SampleList, LabeledSample, samples_from_source
|
||||
from coqui_stt_training.util.augmentations import parse_augmentations, apply_sample_augmentations, SampleAugmentation
|
||||
from coqui_stt_training.util.audio import (
|
||||
AUDIO_TYPE_PCM,
|
||||
AUDIO_TYPE_WAV,
|
||||
get_loadable_audio_type_from_extension,
|
||||
)
|
||||
from coqui_stt_training.util.augmentations import (
|
||||
SampleAugmentation,
|
||||
apply_sample_augmentations,
|
||||
parse_augmentations,
|
||||
)
|
||||
from coqui_stt_training.util.sample_collections import (
|
||||
LabeledSample,
|
||||
SampleList,
|
||||
samples_from_source,
|
||||
)
|
||||
|
||||
|
||||
def get_samples_in_play_order():
|
||||
|
@ -43,11 +55,13 @@ def play_collection():
|
|||
if any(not isinstance(a, SampleAugmentation) for a in augmentations):
|
||||
print("Warning: Some of the augmentations cannot be simulated by this command.")
|
||||
samples = get_samples_in_play_order()
|
||||
samples = apply_sample_augmentations(samples,
|
||||
audio_type=AUDIO_TYPE_PCM,
|
||||
augmentations=augmentations,
|
||||
process_ahead=0,
|
||||
clock=CLI_ARGS.clock)
|
||||
samples = apply_sample_augmentations(
|
||||
samples,
|
||||
audio_type=AUDIO_TYPE_PCM,
|
||||
augmentations=augmentations,
|
||||
process_ahead=0,
|
||||
clock=CLI_ARGS.clock,
|
||||
)
|
||||
for sample in samples:
|
||||
if not CLI_ARGS.quiet:
|
||||
print('Sample "{}"'.format(sample.sample_id), file=sys.stderr)
|
||||
|
@ -57,10 +71,12 @@ def play_collection():
|
|||
sample.change_audio_type(AUDIO_TYPE_WAV)
|
||||
sys.stdout.buffer.write(sample.audio.getvalue())
|
||||
return
|
||||
wave_obj = simpleaudio.WaveObject(sample.audio,
|
||||
sample.audio_format.channels,
|
||||
sample.audio_format.width,
|
||||
sample.audio_format.rate)
|
||||
wave_obj = simpleaudio.WaveObject(
|
||||
sample.audio,
|
||||
sample.audio_format.channels,
|
||||
sample.audio_format.width,
|
||||
sample.audio_format.rate,
|
||||
)
|
||||
play_obj = wave_obj.play()
|
||||
play_obj.wait_done()
|
||||
|
||||
|
@ -70,7 +86,9 @@ def handle_args():
|
|||
description="Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) "
|
||||
"and Coqui STT CSV files"
|
||||
)
|
||||
parser.add_argument("source", help="Sample DB, CSV or WAV file to play samples from")
|
||||
parser.add_argument(
|
||||
"source", help="Sample DB, CSV or WAV file to play samples from"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--start",
|
||||
type=int,
|
||||
|
@ -90,7 +108,7 @@ def handle_args():
|
|||
)
|
||||
parser.add_argument(
|
||||
"--augment",
|
||||
action='append',
|
||||
action="append",
|
||||
help="Add an augmentation operation",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
@ -98,8 +116,8 @@ def handle_args():
|
|||
type=float,
|
||||
default=0.5,
|
||||
help="Simulates clock value used for augmentations during training."
|
||||
"Ranges from 0.0 (representing parameter start values) to"
|
||||
"1.0 (representing parameter end values)",
|
||||
"Ranges from 0.0 (representing parameter start values) to"
|
||||
"1.0 (representing parameter end values)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pipe",
|
||||
|
@ -120,7 +138,9 @@ if __name__ == "__main__":
|
|||
try:
|
||||
import simpleaudio
|
||||
except ModuleNotFoundError:
|
||||
print('Unless using the --pipe flag, play.py requires Python package "simpleaudio" for playing samples')
|
||||
print(
|
||||
'Unless using the --pipe flag, play.py requires Python package "simpleaudio" for playing samples'
|
||||
)
|
||||
sys.exit(1)
|
||||
try:
|
||||
play_collection()
|
||||
|
|
|
@ -8,4 +8,3 @@ This directory contains language-specific data files. Most importantly, you will
|
|||
2. A script used to generate a binary n-gram language model: ``data/lm/generate_lm.py``.
|
||||
|
||||
For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `stt.readthedocs.io <https://stt.readthedocs.io/>`_.
|
||||
|
||||
|
|
|
@ -78,20 +78,20 @@ def build_lm(args, data_lower, vocab_str):
|
|||
print("\nCreating ARPA file ...")
|
||||
lm_path = os.path.join(args.output_dir, "lm.arpa")
|
||||
subargs = [
|
||||
os.path.join(args.kenlm_bins, "lmplz"),
|
||||
"--order",
|
||||
str(args.arpa_order),
|
||||
"--temp_prefix",
|
||||
args.output_dir,
|
||||
"--memory",
|
||||
args.max_arpa_memory,
|
||||
"--text",
|
||||
data_lower,
|
||||
"--arpa",
|
||||
lm_path,
|
||||
"--prune",
|
||||
*args.arpa_prune.split("|"),
|
||||
]
|
||||
os.path.join(args.kenlm_bins, "lmplz"),
|
||||
"--order",
|
||||
str(args.arpa_order),
|
||||
"--temp_prefix",
|
||||
args.output_dir,
|
||||
"--memory",
|
||||
args.max_arpa_memory,
|
||||
"--text",
|
||||
data_lower,
|
||||
"--arpa",
|
||||
lm_path,
|
||||
"--prune",
|
||||
*args.arpa_prune.split("|"),
|
||||
]
|
||||
if args.discount_fallback:
|
||||
subargs += ["--discount_fallback"]
|
||||
subprocess.check_call(subargs)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
|
||||
|
||||
о
|
||||
е
|
||||
а
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
wav_filename,wav_filesize,transcript
|
||||
ru.wav,0,бедняга ребят на его месте должен был быть я
|
||||
ru.wav,0,бедняга ребят на его месте должен был быть я
|
||||
|
|
|
|
@ -3537,4 +3537,4 @@ p r o t e c t e d
|
|||
t h a t ' s
|
||||
f o r m e r
|
||||
m e a n t
|
||||
j o i n t
|
||||
j o i n t
|
||||
|
|
|
@ -5,7 +5,7 @@ Training Data Augmentation
|
|||
|
||||
This document is an overview of the augmentation techniques available for training with STT.
|
||||
|
||||
Training data augmentations can help STT models better transcribe new speech at deployment time. The basic intuition behind data augmentation is the following: by distorting, modifying, or adding to your existing audio data, you can create a training set many times larger than what you started with. If you use a larger training data set to train an STT model, you force the model to learn more generalizable characteristics of speech, making `overfitting <https://en.wikipedia.org/wiki/Overfitting>`_ more difficult. If you can't find a larger data set of speech, you can create one with data augmentation.
|
||||
Training data augmentations can help STT models better transcribe new speech at deployment time. The basic intuition behind data augmentation is the following: by distorting, modifying, or adding to your existing audio data, you can create a training set many times larger than what you started with. If you use a larger training data set to train an STT model, you force the model to learn more generalizable characteristics of speech, making `overfitting <https://en.wikipedia.org/wiki/Overfitting>`_ more difficult. If you can't find a larger data set of speech, you can create one with data augmentation.
|
||||
|
||||
We have implemented a pre-processing pipeline with various augmentation techniques on audio data (i.e. raw ``PCM`` and spectrograms).
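The pipeline referred to here is the same one exercised by the ``bin/play.py`` changes later in this diff: augmentation specs are parsed into objects and then applied lazily over a stream of samples. A rough sketch using only names imported in that file; the spec string and CSV path are placeholders, and the ``volume[...]`` syntax is an assumption rather than verified documentation:

```
from coqui_stt_training.util.audio import AUDIO_TYPE_PCM
from coqui_stt_training.util.augmentations import (
    apply_sample_augmentations,
    parse_augmentations,
)
from coqui_stt_training.util.sample_collections import samples_from_source

# Assumed spec syntax -- consult the augmentation documentation for the real format.
augmentations = parse_augmentations(["volume[p=0.5,dbfs=-10:-40]"])
samples = samples_from_source("train.csv")  # any CSV/SDB sample source

for sample in apply_sample_augmentations(
    samples,
    audio_type=AUDIO_TYPE_PCM,
    augmentations=augmentations,
    process_ahead=0,  # keyword arguments as used by bin/play.py
    clock=0.5,        # simulated position in the training run, 0.0 .. 1.0
):
    print(sample.sample_id)
```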
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ Building the native_client
|
|||
|
||||
There's one last command to run before building, you need to run the `configure.py <https://github.com/coqui-ai/tensorflow/blob/master/configure.py>`_ inside ``tensorflow`` cloned directory.
|
||||
|
||||
At this point we are ready to start building the ``native_client``. Go to the ``tensorflow`` sub-directory; following our examples it should be ``D:\cloned\STT\tensorflow``.
|
||||
At this point we are ready to start building the ``native_client``. Go to the ``tensorflow`` sub-directory; following our examples it should be ``D:\cloned\STT\tensorflow``.
|
||||
|
||||
CPU
|
||||
~~~
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
Checkpointing
|
||||
=============
|
||||
|
||||
Checkpoints are representations of the parameters of a neural network. During training, model parameters are continually updated, and checkpoints allow graceful interruption of a training run without data loss. If you interrupt a training run for any reason, you can pick up where you left off by using the checkpoints as a starting place. This is the exact same logic behind :ref:`model fine-tuning <transfer-learning>`.
|
||||
Checkpoints are representations of the parameters of a neural network. During training, model parameters are continually updated, and checkpoints allow graceful interruption of a training run without data loss. If you interrupt a training run for any reason, you can pick up where you left off by using the checkpoints as a starting place. This is the exact same logic behind :ref:`model fine-tuning <transfer-learning>`.
|
||||
|
||||
Checkpointing occurs at a configurable time interval. Resuming from checkpoints happens automatically by re-starting training with the same ``--checkpoint_dir`` of the former run. Alternatively, you can specify more fine grained options with ``--load_checkpoint_dir`` and ``--save_checkpoint_dir``, which specify separate locations to use for loading and saving checkpoints respectively.
|
||||
|
||||
|
|
|
@ -134,7 +134,7 @@ The script ``taskcluster.py`` will download ``native_client.tar.xz`` (which incl
|
|||
|
||||
Alternatively you may manually download the ``native_client.tar.xz`` from the `releases page <https://github.com/coqui-ai/STT/releases>`_.
|
||||
|
||||
Assuming you have :ref:`downloaded the pre-trained models <download-models>`, you can use the client as such:
|
||||
Assuming you have :ref:`downloaded the pre-trained models <download-models>`, you can use the client as such:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
Hot-word boosting API Usage example
|
||||
===================================
|
||||
|
||||
With the 🐸STT 0.9 release a new API feature was introduced that allows boosting probability from the scorer of given words. It is exposed in all bindings (C, Python, JS, Java and .Net).
|
||||
With the 🐸STT 0.9 release a new API feature was introduced that allows boosting probability from the scorer of given words. It is exposed in all bindings (C, Python, JS, Java and .Net).
|
||||
|
||||
Currently, it provides three methods for the Model class:
|
||||
|
||||
- ``AddHotWord(word, boost)``
|
||||
- ``EraseHotWord(word)``
|
||||
- ``EraseHotWord(word)``
|
||||
- ``ClearHotWords()``
|
||||
|
||||
Exact API binding for the language you are using can be found in API Reference.
|
||||
|
@ -14,7 +14,7 @@ Exact API binding for the language you are using can be found in API Reference.
|
|||
General usage
|
||||
-------------
|
||||
|
||||
It is worth noting that boosting words that do not exist in the scorer (mostly proper nouns), or words that share no phonetic prefix with any word in the input audio, does not change the final transcription. Additionally, a hot-word that contains a space will not be taken into consideration, meaning that combinations of words cannot be boosted and each word must be added as a hot-word separately.
|
||||
It is worth noting that boosting words that do not exist in the scorer (mostly proper nouns), or words that share no phonetic prefix with any word in the input audio, does not change the final transcription. Additionally, a hot-word that contains a space will not be taken into consideration, meaning that combinations of words cannot be boosted and each word must be added as a hot-word separately.
|
||||
|
||||
Adjusting the boosting value
|
||||
----------------------------
|
||||
|
@ -29,9 +29,9 @@ There is a user contributed script available on ``STT-examples`` repository for
|
|||
Positive value boosting
|
||||
-----------------------
|
||||
|
||||
By adding a positive boost value to one of the words it is possible to increase the probability of the word's occurrence. This is particularly useful for detecting speech that is expected by the system.
|
||||
By adding a positive boost value to one of the words it is possible to increase the probability of the word's occurrence. This is particularly useful for detecting speech that is expected by the system.
|
||||
|
||||
In the output, an excessively large positive boost value (e.g. 250.0, though it does vary) may cause a word following the boosted hot-word to be split into separate letters. This problem is related to the scorer structure, and currently the only way to avoid it is to tune the boost to a lower value.
|
||||
In the output, an excessively large positive boost value (e.g. 250.0, though it does vary) may cause a word following the boosted hot-word to be split into separate letters. This problem is related to the scorer structure, and currently the only way to avoid it is to tune the boost to a lower value.
|
||||
|
||||
Negative value boosting
|
||||
-----------------------
|
||||
|
@ -40,7 +40,7 @@ Respectively, applying negative boost value might cause the selected word to occ
|
|||
|
||||
Previously mentioned problem where extensive boost value caused letter splitting doesn't arise for negative boost values.
|
||||
|
||||
Example
|
||||
Example
|
||||
-------
|
||||
|
||||
To use hot-word boosting just add hot-words of your choice performing a speech-to-text operation with a ``Model``. You can also erase boosting of a chosen word or clear it for all hot-words.
|
||||
|
@ -52,5 +52,5 @@ To use hot-word boosting just add hot-words of your choice performing a speech-t
|
|||
ds.addHotWord(word, boosting)
|
||||
...
|
||||
print(ds.stt(audio))
|
||||
|
||||
|
||||
Adding a boost value to a word repeatedly, or erasing a hot-word without previously boosting it, results in an error.
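To make the error case just described concrete, a slightly fuller sketch of the Python binding is below. The model, scorer, and WAV paths are placeholders; ``addHotWord`` follows the snippet above, while the casing of the erase/clear calls is assumed from the method list at the top of this page:

```
import wave

import numpy as np
from stt import Model

ds = Model("model.tflite")               # placeholder model path
ds.enableExternalScorer("kenlm.scorer")  # hot-words require a loaded scorer

with wave.open("audio.wav", "rb") as fin:  # 16 kHz mono 16-bit WAV assumed
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

ds.addHotWord("coqui", 7.5)   # positive boost: make the word more likely
print(ds.stt(audio))

ds.eraseHotWord("coqui")      # fine: the word was boosted above
# ds.eraseHotWord("coqui")    # erasing a word that is no longer boosted is an error
ds.clearHotWords()
```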
|
||||
|
|
|
@ -138,7 +138,7 @@ Data Format
|
|||
|
||||
Audio data is expected to be stored as WAV, sampled at 16kHz, and mono-channel. There's no hard expectations for the length of individual audio files, but in our experience, training is most successful when WAV files range from 5 to 20 seconds in length. Your training data should match as closely as possible the kind of speech you expect at deployment. You can read more about the significant characteristics of speech with regard to STT :ref:`here <model-data-match>`.
|
||||
|
||||
Text transcripts should be formatted exactly as the transcripts you expect your model to produce at deployment. If you want your model to produce capital letters, your transcripts should include capital letters. If you want your model to produce punctuation, your transcripts should include punctuation. Keep in mind that the more characters you include in your transcripts, the more difficult the task becomes for your model. STT models learn from experience, and if there's very few examples in the training data, the model will have a hard time learning rare characters (e.g. the "ï" in "naïve").
|
||||
Text transcripts should be formatted exactly as the transcripts you expect your model to produce at deployment. If you want your model to produce capital letters, your transcripts should include capital letters. If you want your model to produce punctuation, your transcripts should include punctuation. Keep in mind that the more characters you include in your transcripts, the more difficult the task becomes for your model. STT models learn from experience, and if there's very few examples in the training data, the model will have a hard time learning rare characters (e.g. the "ï" in "naïve").
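Because format mismatches tend to fail silently during training, it can be worth checking incoming audio against the constraints above before writing the CSV. A small standard-library sketch (the file name is a placeholder, and the 16-bit check mirrors what the importers in this diff enforce with ``sox -b 16``):

```
import wave

def check_training_wav(path, expected_rate=16000):
    # Flag WAV files that are not 16 kHz, mono, 16-bit PCM, or 5-20 s long.
    with wave.open(path, "rb") as wav:
        rate = wav.getframerate()
        channels = wav.getnchannels()
        width = wav.getsampwidth()
        seconds = wav.getnframes() / rate
    problems = []
    if rate != expected_rate:
        problems.append("sample rate {} != {}".format(rate, expected_rate))
    if channels != 1:
        problems.append("{} channels (expected mono)".format(channels))
    if width != 2:
        problems.append("{}-bit samples (expected 16-bit)".format(8 * width))
    if not 5 <= seconds <= 20:
        problems.append("{:.1f}s long (5-20s trains best)".format(seconds))
    return problems

print(check_training_wav("clip.wav"))
```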
|
||||
|
||||
CSV file format
|
||||
"""""""""""""""
|
||||
|
|
106
doc/conf.py
|
@ -22,21 +22,27 @@
|
|||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath('../'))
|
||||
sys.path.insert(0, os.path.abspath("../"))
|
||||
|
||||
autodoc_mock_imports = ['stt']
|
||||
autodoc_mock_imports = ["stt"]
|
||||
|
||||
# This is in fact only relevant on ReadTheDocs, but we want to run the same way
|
||||
# on our CI as in RTD to avoid regressions on RTD that we would not catch on CI
|
||||
import subprocess
|
||||
|
||||
parent = subprocess.check_output("cd ../ && pwd", shell=True).decode().strip()
|
||||
os.environ["PATH"] = os.path.join(parent, 'node_modules', '.bin') + ':' + os.environ["PATH"]
|
||||
subprocess.check_call('cd ../ && npm install typedoc@0.17.4 typescript@3.8.3 @types/node@13.9.x', shell=True)
|
||||
subprocess.check_call('env', shell=True)
|
||||
subprocess.check_call('which typedoc', shell=True)
|
||||
subprocess.check_call('cd ../ && doxygen doc/doxygen-c.conf', shell=True)
|
||||
subprocess.check_call('cd ../ && doxygen doc/doxygen-java.conf', shell=True)
|
||||
subprocess.check_call('cd ../ && doxygen doc/doxygen-dotnet.conf', shell=True)
|
||||
os.environ["PATH"] = (
|
||||
os.path.join(parent, "node_modules", ".bin") + ":" + os.environ["PATH"]
|
||||
)
|
||||
subprocess.check_call(
|
||||
"cd ../ && npm install typedoc@0.17.4 typescript@3.8.3 @types/node@13.9.x",
|
||||
shell=True,
|
||||
)
|
||||
subprocess.check_call("env", shell=True)
|
||||
subprocess.check_call("which typedoc", shell=True)
|
||||
subprocess.check_call("cd ../ && doxygen doc/doxygen-c.conf", shell=True)
|
||||
subprocess.check_call("cd ../ && doxygen doc/doxygen-java.conf", shell=True)
|
||||
subprocess.check_call("cd ../ && doxygen doc/doxygen-dotnet.conf", shell=True)
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
|
@ -44,11 +50,11 @@ import semver
|
|||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = u'Coqui STT'
|
||||
copyright = '2021 Coqui GmbH, 2020 DeepSpeech authors, 2019-2020 Mozilla Corporation'
|
||||
author = 'Coqui GmbH'
|
||||
project = u"Coqui STT"
|
||||
copyright = "2021 Coqui GmbH, 2020 DeepSpeech authors, 2019-2020 Mozilla Corporation"
|
||||
author = "Coqui GmbH"
|
||||
|
||||
with open('../VERSION', 'r') as ver:
|
||||
with open("../VERSION", "r") as ver:
|
||||
v = ver.read().strip()
|
||||
vv = semver.parse(v)
|
||||
|
||||
|
@ -56,7 +62,7 @@ vv = semver.parse(v)
|
|||
# |version| and |release|, also used in various other places throughout the
|
||||
# built documents.
|
||||
# The short X.Y version
|
||||
version = '{}.{}'.format(vv['major'], vv['minor'])
|
||||
version = "{}.{}".format(vv["major"], vv["minor"])
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = v
|
||||
|
||||
|
@ -68,22 +74,22 @@ release = v
|
|||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.extlinks',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.mathjax',
|
||||
'sphinx.ext.viewcode',
|
||||
'sphinx_js',
|
||||
'sphinx_csharp',
|
||||
'breathe',
|
||||
'recommonmark',
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.extlinks",
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx.ext.mathjax",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx_js",
|
||||
"sphinx_csharp",
|
||||
"breathe",
|
||||
"recommonmark",
|
||||
]
|
||||
|
||||
|
||||
breathe_projects = {
|
||||
"stt-c": "xml-c/",
|
||||
"stt-java": "xml-java/",
|
||||
"stt-dotnet": "xml-dotnet/",
|
||||
"stt-c": "xml-c/",
|
||||
"stt-java": "xml-java/",
|
||||
"stt-dotnet": "xml-dotnet/",
|
||||
}
|
||||
|
||||
js_source_path = "../native_client/javascript/index.ts"
|
||||
|
@ -91,16 +97,16 @@ js_language = "typescript"
|
|||
jsdoc_config_path = "../native_client/javascript/tsconfig.json"
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['.templates']
|
||||
templates_path = [".templates"]
|
||||
|
||||
# The suffix(es) of source filenames.
|
||||
# You can specify multiple suffix as a list of string:
|
||||
#
|
||||
# source_suffix = ['.rst', '.md']
|
||||
source_suffix = '.rst'
|
||||
source_suffix = ".rst"
|
||||
|
||||
# The main toctree document.
|
||||
master_doc = 'index'
|
||||
master_doc = "index"
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
|
@ -112,10 +118,10 @@ language = None
|
|||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This patterns also effect to html_static_path and html_extra_path
|
||||
exclude_patterns = ['.build', 'Thumbs.db', '.DS_Store', 'node_modules', 'examples']
|
||||
exclude_patterns = [".build", "Thumbs.db", ".DS_Store", "node_modules", "examples"]
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
pygments_style = "sphinx"
|
||||
|
||||
# If true, `todo` and `todoList` produce output, else they produce nothing.
|
||||
todo_include_todos = False
|
||||
|
@ -128,18 +134,18 @@ add_module_names = False
|
|||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = 'furo'
|
||||
html_theme = "furo"
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['.static']
|
||||
html_static_path = [".static"]
|
||||
|
||||
|
||||
# -- Options for HTMLHelp output ------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'STTdoc'
|
||||
htmlhelp_basename = "STTdoc"
|
||||
|
||||
|
||||
# -- Options for LaTeX output ---------------------------------------------
|
||||
|
@ -148,15 +154,12 @@ latex_elements = {
|
|||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#
|
||||
# 'papersize': 'letterpaper',
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#
|
||||
# 'pointsize': '10pt',
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#
|
||||
# 'preamble': '',
|
||||
|
||||
# Latex figure (float) alignment
|
||||
#
|
||||
# 'figure_align': 'htbp',
|
||||
|
@ -166,8 +169,7 @@ latex_elements = {
|
|||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
(master_doc, 'STT.tex', u'Coqui STT Documentation',
|
||||
u'Coqui GmbH', 'manual'),
|
||||
(master_doc, "STT.tex", u"Coqui STT Documentation", u"Coqui GmbH", "manual"),
|
||||
]
|
||||
|
||||
|
||||
|
@ -175,10 +177,7 @@ latex_documents = [
|
|||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
(master_doc, 'stt', u'Coqui STT Documentation',
|
||||
[author], 1)
|
||||
]
|
||||
man_pages = [(master_doc, "stt", u"Coqui STT Documentation", [author], 1)]
|
||||
|
||||
|
||||
# -- Options for Texinfo output -------------------------------------------
|
||||
|
@ -187,16 +186,21 @@ man_pages = [
|
|||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
(master_doc, 'STT', u'Coqui STT Documentation',
|
||||
author, 'STT', 'One line description of project.',
|
||||
'Miscellaneous'),
|
||||
(
|
||||
master_doc,
|
||||
"STT",
|
||||
u"Coqui STT Documentation",
|
||||
author,
|
||||
"STT",
|
||||
"One line description of project.",
|
||||
"Miscellaneous",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
# Example configuration for intersphinx: refer to the Python standard library.
|
||||
intersphinx_mapping = {'https://docs.python.org/': None}
|
||||
intersphinx_mapping = {"https://docs.python.org/": None}
|
||||
|
||||
extlinks = {'github': ('https://github.com/coqui-ai/STT/blob/v{}/%s'.format(release),
|
||||
'%s')}
|
||||
extlinks = {
|
||||
"github": ("https://github.com/coqui-ai/STT/blob/v{}/%s".format(release), "%s")
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ Coqui STT
|
|||
Quickstart: Deployment
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The fastest way to deploy a pre-trained 🐸STT model is with `pip` with Python 3.5 or higher (*Note - only Linux supported at this time. We are working to get our normally supported packages back up and running.*):
|
||||
The fastest way to deploy a pre-trained 🐸STT model is with `pip` with Python 3.5 or higher (*Note - only Linux supported at this time. We are working to get our normally supported packages back up and running.*):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
+ [Testing the image by creating a container and running a script](#testing-the-image-by-creating-a-container-and-running-a-script)
|
||||
* [Setting up a bind mount to store persistent data](#setting-up-a-bind-mount-to-store-persistent-data)
|
||||
* [Extending the base `stt-train` Docker image for your needs](#extending-the-base--stt-train--docker-image-for-your-needs)
|
||||
|
||||
|
||||
This section of the Playbook assumes you are comfortable installing 🐸STT and using it with a pre-trained model, and that you are comfortable setting up a Python _virtual environment_.
|
||||
|
||||
Here, we provide information on setting up a Docker environment for training your own speech recognition model using 🐸STT. We also cover dependencies Docker has for NVIDIA GPUs, so that you can use your GPU(s) for training a model.
|
||||
|
@ -48,7 +48,7 @@ By default, your machine should already have GPU drivers installed. A good way t
|
|||
```
|
||||
$ nvidia-smi
|
||||
|
||||
Sat Jan 9 11:48:50 2021
|
||||
Sat Jan 9 11:48:50 2021
|
||||
+-----------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 450.80.02 Driver Version: 450.80.02 CUDA Version: 11.0 |
|
||||
|-------------------------------+----------------------+----------------------+
|
||||
|
@ -195,7 +195,7 @@ This command assumes that `/bin/bash` will be invoked as the `root` user. This i
|
|||
When you run the above command, you should see the following prompt:
|
||||
|
||||
```
|
||||
________ _______________
|
||||
________ _______________
|
||||
___ __/__________________________________ ____/__ /________ __
|
||||
__ / _ _ \_ __ \_ ___/ __ \_ ___/_ /_ __ /_ __ \_ | /| / /
|
||||
_ / / __/ / / /(__ )/ /_/ / / _ __/ _ / / /_/ /_ |/ |/ /
|
||||
|
|
|
@ -28,7 +28,7 @@ If you are training a model that uses a different alphabet to English, for examp
|
|||
|
||||
## [Building your own scorer](SCORER.md)
|
||||
|
||||
Learn what the scorer does, and how you can go about building your own.
|
||||
Learn what the scorer does, and how you can go about building your own.
|
||||
|
||||
## [Acoustic model and language model](AM_vs_LM.md)
|
||||
|
||||
|
@ -66,7 +66,7 @@ Here, we've linked to several resources that you may find helpful; they're liste
|
|||
|
||||
* [Google's machine learning crash course](https://developers.google.com/machine-learning/crash-course/ml-intro) provides a gentle introduction to the main concepts of machine learning, including _gradient descent_, _learning rate_, _training, test and validation sets_ and _overfitting_.
|
||||
|
||||
* If machine learning is something that sparks your interest, then you may enjoy [the MIT Open Learning Library's Introduction to Machine Learning course](https://openlearninglibrary.mit.edu/courses/course-v1:MITx+6.036+1T2019/course/), a 13-week college-level course covering perceptrons, neural networks, support vector machines and convolutional neural networks.
|
||||
* If machine learning is something that sparks your interest, then you may enjoy [the MIT Open Learning Library's Introduction to Machine Learning course](https://openlearninglibrary.mit.edu/courses/course-v1:MITx+6.036+1T2019/course/), a 13-week college-level course covering perceptrons, neural networks, support vector machines and convolutional neural networks.
|
||||
|
||||
---
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ When you invoked `train.py` in the [training](TRAINING.md) section, and trained
|
|||
|
||||
```
|
||||
Testing model on stt-data/cv-corpus-6.1-2020-12-11/id/clips/test.csv
|
||||
Test epoch | Steps: 1844 | Elapsed Time: 0:51:11
|
||||
Test epoch | Steps: 1844 | Elapsed Time: 0:51:11
|
||||
Test on stt-data/cv-corpus-6.1-2020-12-11/id/clips/test.csv - WER: 1.000000, CER: 0.824103, loss: 104.989326
|
||||
--------------------------------------------------------------------------------
|
||||
Best WER:
|
||||
|
@ -156,7 +156,7 @@ _Fine tuning_ and _transfer learning_ are two processes used to improve the accu
|
|||
|
||||
For more information on [fine tuning in 🐸STT, please consult the documentation](https://stt.readthedocs.io/en/latest/TRAINING.html#fine-tuning-same-alphabet).
|
||||
|
||||
For more information on [transfer learning in 🐸STT, please consult the documentation](https://stt.readthedocs.io/en/latest/TRAINING.html#transfer-learning-new-alphabet).
|
||||
For more information on [transfer learning in 🐸STT, please consult the documentation](https://stt.readthedocs.io/en/latest/TRAINING.html#transfer-learning-new-alphabet).
|
||||
|
||||
---
|
||||
|
||||
|
|
|
@ -2,11 +2,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
from coqui_stt_training import evaluate as ds_evaluate
|
||||
except ImportError:
|
||||
print('Training package is not installed. See training documentation.')
|
||||
print("Training package is not installed. See training documentation.")
|
||||
raise
|
||||
|
||||
ds_evaluate.run_script()
|
||||
|
|
|
@ -2,22 +2,22 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import absl.app
|
||||
import argparse
|
||||
import numpy as np
|
||||
import wave
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
import wave
|
||||
from functools import partial
|
||||
from multiprocessing import JoinableQueue, Manager, Process, cpu_count
|
||||
|
||||
from stt import Model
|
||||
import absl.app
|
||||
import numpy as np
|
||||
from coqui_stt_training.util.evaluate_tools import calculate_and_print_report
|
||||
from coqui_stt_training.util.flags import create_flags
|
||||
from functools import partial
|
||||
from multiprocessing import JoinableQueue, Process, cpu_count, Manager
|
||||
from six.moves import zip, range
|
||||
from six.moves import range, zip
|
||||
from stt import Model
|
||||
|
||||
r'''
|
||||
r"""
|
||||
This module should be self-contained:
|
||||
- build libstt.so with TFLite:
|
||||
- bazel build [...] --define=runtime=tflite [...] //native_client:libstt.so
|
||||
|
@ -27,10 +27,11 @@ This module should be self-contained:
|
|||
- pip install -r requirements_eval_tflite.txt
|
||||
|
||||
Then run with a TFLite model, a scorer and a CSV test file
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_mask)
|
||||
ds = Model(model)
|
||||
ds.enableExternalScorer(scorer)
|
||||
|
||||
|
@ -38,29 +39,41 @@ def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
|
|||
try:
|
||||
msg = queue_in.get()
|
||||
|
||||
filename = msg['filename']
|
||||
fin = wave.open(filename, 'rb')
|
||||
filename = msg["filename"]
|
||||
fin = wave.open(filename, "rb")
|
||||
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
|
||||
fin.close()
|
||||
|
||||
decoded = ds.stt(audio)
|
||||
|
||||
queue_out.put({'wav': filename, 'prediction': decoded, 'ground_truth': msg['transcript']})
|
||||
queue_out.put(
|
||||
{
|
||||
"wav": filename,
|
||||
"prediction": decoded,
|
||||
"ground_truth": msg["transcript"],
|
||||
}
|
||||
)
|
||||
except FileNotFoundError as ex:
|
||||
print('FileNotFoundError: ', ex)
|
||||
print("FileNotFoundError: ", ex)
|
||||
|
||||
print(queue_out.qsize(), end='\r') # Update the current progress
|
||||
print(queue_out.qsize(), end="\r") # Update the current progress
|
||||
queue_in.task_done()
|
||||
|
||||
|
||||
def main(args, _):
|
||||
manager = Manager()
|
||||
work_todo = JoinableQueue() # this is where we are going to store input data
|
||||
work_todo = JoinableQueue() # this is where we are going to store input data
|
||||
work_done = manager.Queue() # this where we are gonna push them out
|
||||
|
||||
processes = []
|
||||
for i in range(args.proc):
|
||||
worker_process = Process(target=tflite_worker, args=(args.model, args.scorer, work_todo, work_done, i), daemon=True, name='tflite_process_{}'.format(i))
|
||||
worker_process.start() # Launch reader() as a separate python process
|
||||
worker_process = Process(
|
||||
target=tflite_worker,
|
||||
args=(args.model, args.scorer, work_todo, work_done, i),
|
||||
daemon=True,
|
||||
name="tflite_process_{}".format(i),
|
||||
)
|
||||
worker_process.start() # Launch reader() as a separate python process
|
||||
processes.append(worker_process)
|
||||
|
||||
print([x.name for x in processes])
|
||||
|
@ -71,56 +84,75 @@ def main(args, _):
|
|||
losses = []
|
||||
wav_filenames = []
|
||||
|
||||
with open(args.csv, 'r') as csvfile:
|
||||
with open(args.csv, "r") as csvfile:
|
||||
csvreader = csv.DictReader(csvfile)
|
||||
count = 0
|
||||
for row in csvreader:
|
||||
count += 1
|
||||
# Relative paths are relative to the folder the CSV file is in
|
||||
if not os.path.isabs(row['wav_filename']):
|
||||
row['wav_filename'] = os.path.join(os.path.dirname(args.csv), row['wav_filename'])
|
||||
work_todo.put({'filename': row['wav_filename'], 'transcript': row['transcript']})
|
||||
wav_filenames.extend(row['wav_filename'])
|
||||
if not os.path.isabs(row["wav_filename"]):
|
||||
row["wav_filename"] = os.path.join(
|
||||
os.path.dirname(args.csv), row["wav_filename"]
|
||||
)
|
||||
work_todo.put(
|
||||
{"filename": row["wav_filename"], "transcript": row["transcript"]}
|
||||
)
|
||||
wav_filenames.extend(row["wav_filename"])
|
||||
|
||||
print('Totally %d wav entries found in csv\n' % count)
|
||||
print("Totally %d wav entries found in csv\n" % count)
|
||||
work_todo.join()
|
||||
print('\nTotally %d wav file transcripted' % work_done.qsize())
|
||||
print("\nTotally %d wav file transcripted" % work_done.qsize())
|
||||
|
||||
while not work_done.empty():
|
||||
msg = work_done.get()
|
||||
losses.append(0.0)
|
||||
ground_truths.append(msg['ground_truth'])
|
||||
predictions.append(msg['prediction'])
|
||||
wavlist.append(msg['wav'])
|
||||
ground_truths.append(msg["ground_truth"])
|
||||
predictions.append(msg["prediction"])
|
||||
wavlist.append(msg["wav"])
|
||||
|
||||
# Print test summary
|
||||
_ = calculate_and_print_report(wav_filenames, ground_truths, predictions, losses, args.csv)
|
||||
_ = calculate_and_print_report(
|
||||
wav_filenames, ground_truths, predictions, losses, args.csv
|
||||
)
|
||||
|
||||
if args.dump:
|
||||
with open(args.dump + '.txt', 'w') as ftxt, open(args.dump + '.out', 'w') as fout:
|
||||
with open(args.dump + ".txt", "w") as ftxt, open(
|
||||
args.dump + ".out", "w"
|
||||
) as fout:
|
||||
for wav, txt, out in zip(wavlist, ground_truths, predictions):
|
||||
ftxt.write('%s %s\n' % (wav, txt))
|
||||
fout.write('%s %s\n' % (wav, out))
|
||||
print('Reference texts dumped to %s.txt' % args.dump)
|
||||
print('Transcription dumped to %s.out' % args.dump)
|
||||
ftxt.write("%s %s\n" % (wav, txt))
|
||||
fout.write("%s %s\n" % (wav, out))
|
||||
print("Reference texts dumped to %s.txt" % args.dump)
|
||||
print("Transcription dumped to %s.out" % args.dump)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Computing TFLite accuracy')
|
||||
parser.add_argument('--model', required=True,
|
||||
help='Path to the model (protocol buffer binary file)')
|
||||
parser.add_argument('--scorer', required=True,
|
||||
help='Path to the external scorer file')
|
||||
parser.add_argument('--csv', required=True,
|
||||
help='Path to the CSV source file')
|
||||
parser.add_argument('--proc', required=False, default=cpu_count(), type=int,
|
||||
help='Number of processes to spawn, defaulting to number of CPUs')
|
||||
parser.add_argument('--dump', required=False,
|
||||
help='Path to dump the results as text file, with one line for each wav: "wav transcription".')
|
||||
parser = argparse.ArgumentParser(description="Computing TFLite accuracy")
|
||||
parser.add_argument(
|
||||
"--model", required=True, help="Path to the model (protocol buffer binary file)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scorer", required=True, help="Path to the external scorer file"
|
||||
)
|
||||
parser.add_argument("--csv", required=True, help="Path to the CSV source file")
|
||||
parser.add_argument(
|
||||
"--proc",
|
||||
required=False,
|
||||
default=cpu_count(),
|
||||
type=int,
|
||||
help="Number of processes to spawn, defaulting to number of CPUs",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dump",
|
||||
required=False,
|
||||
help='Path to dump the results as text file, with one line for each wav: "wav transcription".',
|
||||
)
|
||||
args, unknown = parser.parse_known_args()
|
||||
# Reconstruct argv for absl.flags
|
||||
sys.argv = [sys.argv[0]] + unknown
|
||||
return args
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
create_flags()
|
||||
absl.app.run(partial(main, parse_args()))
|
||||
|
|
|
@@ -2,35 +2,39 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function

import sys

import absl.app
import optuna
import sys
import tensorflow.compat.v1 as tfv1

from coqui_stt_ctcdecoder import Scorer
from coqui_stt_training.evaluate import evaluate
from coqui_stt_training.train import create_model
from coqui_stt_training.util.config import Config, initialize_globals
from coqui_stt_training.util.flags import create_flags, FLAGS
from coqui_stt_training.util.logging import log_error
from coqui_stt_training.util.evaluate_tools import wer_cer_batch
from coqui_stt_ctcdecoder import Scorer
from coqui_stt_training.util.flags import FLAGS, create_flags
from coqui_stt_training.util.logging import log_error

import tensorflow.compat.v1 as tfv1


def character_based():
is_character_based = False
if FLAGS.scorer_path:
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
scorer = Scorer(
FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet
)
is_character_based = scorer.is_utf8_mode()
return is_character_based

def objective(trial):
FLAGS.lm_alpha = trial.suggest_uniform('lm_alpha', 0, FLAGS.lm_alpha_max)
FLAGS.lm_beta = trial.suggest_uniform('lm_beta', 0, FLAGS.lm_beta_max)

is_character_based = trial.study.user_attrs['is_character_based']
def objective(trial):
FLAGS.lm_alpha = trial.suggest_uniform("lm_alpha", 0, FLAGS.lm_alpha_max)
FLAGS.lm_beta = trial.suggest_uniform("lm_beta", 0, FLAGS.lm_beta_max)

is_character_based = trial.study.user_attrs["is_character_based"]

samples = []
for step, test_file in enumerate(FLAGS.test_files.split(',')):
for step, test_file in enumerate(FLAGS.test_files.split(",")):
tfv1.reset_default_graph()

current_samples = evaluate([test_file], create_model)

@@ -47,12 +51,15 @@ def objective(trial):
wer, cer = wer_cer_batch(samples)
return cer if is_character_based else wer


def main(_):
initialize_globals()

if not FLAGS.test_files:
log_error('You need to specify what files to use for evaluation via '
'the --test_files flag.')
log_error(
"You need to specify what files to use for evaluation via "
"the --test_files flag."
)
sys.exit(1)

is_character_based = character_based()

@@ -60,11 +67,15 @@ def main(_):
study = optuna.create_study()
study.set_user_attr("is_character_based", is_character_based)
study.optimize(objective, n_jobs=1, n_trials=FLAGS.n_trials)
print('Best params: lm_alpha={} and lm_beta={} with WER={}'.format(study.best_params['lm_alpha'],
study.best_params['lm_beta'],
study.best_value))
print(
"Best params: lm_alpha={} and lm_beta={} with WER={}".format(
study.best_params["lm_alpha"],
study.best_params["lm_beta"],
study.best_value,
)
)


if __name__ == '__main__':
if __name__ == "__main__":
create_flags()
absl.app.run(main)

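The hunks above are formatting-only, but the underlying pattern is a plain Optuna search over the scorer weights `lm_alpha` and `lm_beta`. A minimal sketch of that pattern, with a dummy quadratic standing in for the real WER/CER computed by `evaluate()` (the 0.93/1.18 optimum is an arbitrary example, not a recommended setting):

```
import optuna


def objective(trial):
    # same kind of search space the script builds from --lm_alpha_max / --lm_beta_max
    lm_alpha = trial.suggest_uniform("lm_alpha", 0, 5)
    lm_beta = trial.suggest_uniform("lm_beta", 0, 5)
    # placeholder metric; the real objective decodes the test files and returns WER or CER
    return (lm_alpha - 0.93) ** 2 + (lm_beta - 1.18) ** 2


study = optuna.create_study()
study.set_user_attr("is_character_based", False)
study.optimize(objective, n_jobs=1, n_trials=20)
print("Best params:", study.best_params, "best value:", study.best_value)
```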
@@ -18,8 +18,8 @@ Variable naming
File naming
===========

* Source code files should have a `.cc` prefix and headers a `.h` prefix, excluding
code important from elsewhere, which should follow local conventions, e.g. `.cpp` and `.h`
* Source code files should have a `.cc` prefix and headers a `.h` prefix, excluding
code important from elsewhere, which should follow local conventions, e.g. `.cpp` and `.h`
in `ctcdecode/`.

Doubts

@@ -152,7 +152,7 @@ MetadataToJSON(Metadata* result)
}
}
}


out_string << "\n}\n";

return strdup(out_string.str().c_str());

@@ -20,4 +20,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -1,17 +1,18 @@
|
|||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from . import swigwrapper # pylint: disable=import-self
|
||||
from . import swigwrapper # pylint: disable=import-self
|
||||
|
||||
# This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR so we must handle
|
||||
# string encoding explicitly, here and throughout this file.
|
||||
__version__ = swigwrapper.__version__.decode('utf-8')
|
||||
__version__ = swigwrapper.__version__.decode("utf-8")
|
||||
|
||||
# Hack: import error codes by matching on their names, as SWIG unfortunately
|
||||
# does not support binding enums to Python in a scoped manner yet.
|
||||
for symbol in dir(swigwrapper):
|
||||
if symbol.startswith('STT_ERR_'):
|
||||
if symbol.startswith("STT_ERR_"):
|
||||
globals()[symbol] = getattr(swigwrapper, symbol)
|
||||
|
||||
|
||||
class Scorer(swigwrapper.Scorer):
|
||||
"""Wrapper for Scorer.
|
||||
|
||||
|
@ -23,130 +24,140 @@ class Scorer(swigwrapper.Scorer):
|
|||
:alphabet: Alphabet
|
||||
:type scorer_path: basestring
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
|
||||
super(Scorer, self).__init__()
|
||||
# Allow bare initialization
|
||||
if alphabet:
|
||||
assert alpha is not None, 'alpha parameter is required'
|
||||
assert beta is not None, 'beta parameter is required'
|
||||
assert scorer_path, 'scorer_path parameter is required'
|
||||
assert alpha is not None, "alpha parameter is required"
|
||||
assert beta is not None, "beta parameter is required"
|
||||
assert scorer_path, "scorer_path parameter is required"
|
||||
|
||||
err = self.init(scorer_path.encode('utf-8'), alphabet)
|
||||
err = self.init(scorer_path.encode("utf-8"), alphabet)
|
||||
if err != 0:
|
||||
raise ValueError('Scorer initialization failed with error code 0x{:X}'.format(err))
|
||||
raise ValueError(
|
||||
"Scorer initialization failed with error code 0x{:X}".format(err)
|
||||
)
|
||||
|
||||
self.reset_params(alpha, beta)
|
||||
|
||||
|
||||
class Alphabet(swigwrapper.Alphabet):
|
||||
"""Convenience wrapper for Alphabet which calls init in the constructor"""
|
||||
|
||||
def __init__(self, config_path):
|
||||
super(Alphabet, self).__init__()
|
||||
err = self.init(config_path.encode('utf-8'))
|
||||
err = self.init(config_path.encode("utf-8"))
|
||||
if err != 0:
|
||||
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
|
||||
raise ValueError(
|
||||
"Alphabet initialization failed with error code 0x{:X}".format(err)
|
||||
)
|
||||
|
||||
def CanEncodeSingle(self, input):
|
||||
'''
|
||||
"""
|
||||
Returns true if the single character/output class has a corresponding label
|
||||
in the alphabet.
|
||||
'''
|
||||
return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
|
||||
"""
|
||||
return super(Alphabet, self).CanEncodeSingle(input.encode("utf-8"))
|
||||
|
||||
def CanEncode(self, input):
|
||||
'''
|
||||
"""
|
||||
Returns true if the entire string can be encoded into labels in this
|
||||
alphabet.
|
||||
'''
|
||||
return super(Alphabet, self).CanEncode(input.encode('utf-8'))
|
||||
"""
|
||||
return super(Alphabet, self).CanEncode(input.encode("utf-8"))
|
||||
|
||||
def EncodeSingle(self, input):
|
||||
'''
|
||||
"""
|
||||
Encode a single character/output class into a label. Character must be in
|
||||
the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
|
||||
'''
|
||||
return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
||||
"""
|
||||
return super(Alphabet, self).EncodeSingle(input.encode("utf-8"))
|
||||
|
||||
def Encode(self, input):
|
||||
'''
|
||||
"""
|
||||
Encode a sequence of character/output classes into a sequence of labels.
|
||||
Characters are assumed to always take a single Unicode codepoint.
|
||||
Characters must be in the alphabet, this method will assert that. Use
|
||||
`CanEncode` and `CanEncodeSingle` to test.
|
||||
'''
|
||||
"""
|
||||
# Convert SWIG's UnsignedIntVec to a Python list
|
||||
res = super(Alphabet, self).Encode(input.encode('utf-8'))
|
||||
res = super(Alphabet, self).Encode(input.encode("utf-8"))
|
||||
return [el for el in res]
|
||||
|
||||
def DecodeSingle(self, input):
|
||||
res = super(Alphabet, self).DecodeSingle(input)
|
||||
return res.decode('utf-8')
|
||||
return res.decode("utf-8")
|
||||
|
||||
def Decode(self, input):
|
||||
'''Decode a sequence of labels into a string.'''
|
||||
"""Decode a sequence of labels into a string."""
|
||||
res = super(Alphabet, self).Decode(input)
|
||||
return res.decode('utf-8')
|
||||
return res.decode("utf-8")
|
||||
|
||||
|
||||
class UTF8Alphabet(swigwrapper.UTF8Alphabet):
|
||||
"""Convenience wrapper for Alphabet which calls init in the constructor"""
|
||||
|
||||
def __init__(self):
|
||||
super(UTF8Alphabet, self).__init__()
|
||||
err = self.init(b'')
|
||||
err = self.init(b"")
|
||||
if err != 0:
|
||||
raise ValueError('UTF8Alphabet initialization failed with error code 0x{:X}'.format(err))
|
||||
raise ValueError(
|
||||
"UTF8Alphabet initialization failed with error code 0x{:X}".format(err)
|
||||
)
|
||||
|
||||
def CanEncodeSingle(self, input):
|
||||
'''
|
||||
"""
|
||||
Returns true if the single character/output class has a corresponding label
|
||||
in the alphabet.
|
||||
'''
|
||||
return super(UTF8Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
|
||||
"""
|
||||
return super(UTF8Alphabet, self).CanEncodeSingle(input.encode("utf-8"))
|
||||
|
||||
def CanEncode(self, input):
|
||||
'''
|
||||
"""
|
||||
Returns true if the entire string can be encoded into labels in this
|
||||
alphabet.
|
||||
'''
|
||||
return super(UTF8Alphabet, self).CanEncode(input.encode('utf-8'))
|
||||
"""
|
||||
return super(UTF8Alphabet, self).CanEncode(input.encode("utf-8"))
|
||||
|
||||
def EncodeSingle(self, input):
|
||||
'''
|
||||
"""
|
||||
Encode a single character/output class into a label. Character must be in
|
||||
the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
|
||||
'''
|
||||
return super(UTF8Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
||||
"""
|
||||
return super(UTF8Alphabet, self).EncodeSingle(input.encode("utf-8"))
|
||||
|
||||
def Encode(self, input):
|
||||
'''
|
||||
"""
|
||||
Encode a sequence of character/output classes into a sequence of labels.
|
||||
Characters are assumed to always take a single Unicode codepoint.
|
||||
Characters must be in the alphabet, this method will assert that. Use
|
||||
`CanEncode` and `CanEncodeSingle` to test.
|
||||
'''
|
||||
"""
|
||||
# Convert SWIG's UnsignedIntVec to a Python list
|
||||
res = super(UTF8Alphabet, self).Encode(input.encode('utf-8'))
|
||||
res = super(UTF8Alphabet, self).Encode(input.encode("utf-8"))
|
||||
return [el for el in res]
|
||||
|
||||
def DecodeSingle(self, input):
|
||||
res = super(UTF8Alphabet, self).DecodeSingle(input)
|
||||
return res.decode('utf-8')
|
||||
return res.decode("utf-8")
|
||||
|
||||
def Decode(self, input):
|
||||
'''Decode a sequence of labels into a string.'''
|
||||
"""Decode a sequence of labels into a string."""
|
||||
res = super(UTF8Alphabet, self).Decode(input)
|
||||
return res.decode('utf-8')
|
||||
return res.decode("utf-8")
|
||||
|
||||
|
||||
|
||||
def ctc_beam_search_decoder(probs_seq,
|
||||
alphabet,
|
||||
beam_size,
|
||||
cutoff_prob=1.0,
|
||||
cutoff_top_n=40,
|
||||
scorer=None,
|
||||
hot_words=dict(),
|
||||
num_results=1):
|
||||
def ctc_beam_search_decoder(
|
||||
probs_seq,
|
||||
alphabet,
|
||||
beam_size,
|
||||
cutoff_prob=1.0,
|
||||
cutoff_top_n=40,
|
||||
scorer=None,
|
||||
hot_words=dict(),
|
||||
num_results=1,
|
||||
):
|
||||
"""Wrapper for the CTC Beam Search Decoder.
|
||||
|
||||
:param probs_seq: 2-D list of probability distributions over each time
|
||||
|
@ -175,22 +186,33 @@ def ctc_beam_search_decoder(probs_seq,
|
|||
:rtype: list
|
||||
"""
|
||||
beam_results = swigwrapper.ctc_beam_search_decoder(
|
||||
probs_seq, alphabet, beam_size, cutoff_prob, cutoff_top_n,
|
||||
scorer, hot_words, num_results)
|
||||
beam_results = [(res.confidence, alphabet.Decode(res.tokens)) for res in beam_results]
|
||||
probs_seq,
|
||||
alphabet,
|
||||
beam_size,
|
||||
cutoff_prob,
|
||||
cutoff_top_n,
|
||||
scorer,
|
||||
hot_words,
|
||||
num_results,
|
||||
)
|
||||
beam_results = [
|
||||
(res.confidence, alphabet.Decode(res.tokens)) for res in beam_results
|
||||
]
|
||||
return beam_results
|
||||
|
||||
|
||||
def ctc_beam_search_decoder_batch(probs_seq,
|
||||
seq_lengths,
|
||||
alphabet,
|
||||
beam_size,
|
||||
num_processes,
|
||||
cutoff_prob=1.0,
|
||||
cutoff_top_n=40,
|
||||
scorer=None,
|
||||
hot_words=dict(),
|
||||
num_results=1):
|
||||
def ctc_beam_search_decoder_batch(
|
||||
probs_seq,
|
||||
seq_lengths,
|
||||
alphabet,
|
||||
beam_size,
|
||||
num_processes,
|
||||
cutoff_prob=1.0,
|
||||
cutoff_top_n=40,
|
||||
scorer=None,
|
||||
hot_words=dict(),
|
||||
num_results=1,
|
||||
):
|
||||
"""Wrapper for the batched CTC beam search decoder.
|
||||
|
||||
:param probs_seq: 3-D list with each element as an instance of 2-D list
|
||||
|
@ -222,7 +244,18 @@ def ctc_beam_search_decoder_batch(probs_seq,
|
|||
results, in descending order of the confidence.
|
||||
:rtype: list
|
||||
"""
|
||||
batch_beam_results = swigwrapper.ctc_beam_search_decoder_batch(probs_seq, seq_lengths, alphabet, beam_size, num_processes, cutoff_prob, cutoff_top_n, scorer, hot_words, num_results)
|
||||
batch_beam_results = swigwrapper.ctc_beam_search_decoder_batch(
|
||||
probs_seq,
|
||||
seq_lengths,
|
||||
alphabet,
|
||||
beam_size,
|
||||
num_processes,
|
||||
cutoff_prob,
|
||||
cutoff_top_n,
|
||||
scorer,
|
||||
hot_words,
|
||||
num_results,
|
||||
)
|
||||
batch_beam_results = [
|
||||
[(res.confidence, alphabet.Decode(res.tokens)) for res in beam_results]
|
||||
for beam_results in batch_beam_results
|
||||
|
|
|
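The reformatting above leaves the decoder package's public API unchanged. A rough usage sketch, assuming the `coqui_stt_ctcdecoder` package has been built and that an alphabet file and a KenLM scorer exist on disk (the paths, the alpha/beta values, and the 29-column width are placeholders):

```
import numpy as np

from coqui_stt_ctcdecoder import Alphabet, Scorer, ctc_beam_search_decoder

alphabet = Alphabet("data/alphabet.txt")  # placeholder path
scorer = Scorer(
    alpha=0.93, beta=1.18, scorer_path="kenlm.scorer", alphabet=alphabet
)  # placeholder path and weights

# fake acoustic output: 50 timesteps, 28 characters + CTC blank = 29 columns (example size)
probs_seq = np.random.rand(50, 29)
probs_seq /= probs_seq.sum(axis=1, keepdims=True)

# returns (confidence, transcript) pairs, best first
for confidence, text in ctc_beam_search_decoder(
    probs_seq, alphabet, beam_size=16, scorer=scorer
):
    print(confidence, text)
```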
@ -6,84 +6,95 @@ import os
|
|||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from multiprocessing.dummy import Pool
|
||||
|
||||
if sys.platform.startswith('win'):
|
||||
ARGS = ['/nologo', '/D KENLM_MAX_ORDER=6', '/EHsc', '/source-charset:utf-8']
|
||||
OPT_ARGS = ['/O2', '/MT', '/D NDEBUG']
|
||||
DBG_ARGS = ['/Od', '/MTd', '/Zi', '/U NDEBUG', '/D DEBUG']
|
||||
OPENFST_DIR = 'third_party/openfst-1.6.9-win'
|
||||
if sys.platform.startswith("win"):
|
||||
ARGS = ["/nologo", "/D KENLM_MAX_ORDER=6", "/EHsc", "/source-charset:utf-8"]
|
||||
OPT_ARGS = ["/O2", "/MT", "/D NDEBUG"]
|
||||
DBG_ARGS = ["/Od", "/MTd", "/Zi", "/U NDEBUG", "/D DEBUG"]
|
||||
OPENFST_DIR = "third_party/openfst-1.6.9-win"
|
||||
else:
|
||||
ARGS = ['-fPIC', '-DKENLM_MAX_ORDER=6', '-std=c++11', '-Wno-unused-local-typedefs', '-Wno-sign-compare']
|
||||
OPT_ARGS = ['-O3', '-DNDEBUG']
|
||||
DBG_ARGS = ['-O0', '-g', '-UNDEBUG', '-DDEBUG']
|
||||
OPENFST_DIR = 'third_party/openfst-1.6.7'
|
||||
|
||||
ARGS = [
|
||||
"-fPIC",
|
||||
"-DKENLM_MAX_ORDER=6",
|
||||
"-std=c++11",
|
||||
"-Wno-unused-local-typedefs",
|
||||
"-Wno-sign-compare",
|
||||
]
|
||||
OPT_ARGS = ["-O3", "-DNDEBUG"]
|
||||
DBG_ARGS = ["-O0", "-g", "-UNDEBUG", "-DDEBUG"]
|
||||
OPENFST_DIR = "third_party/openfst-1.6.7"
|
||||
|
||||
|
||||
INCLUDES = [
|
||||
'..',
|
||||
'../kenlm',
|
||||
OPENFST_DIR + '/src/include',
|
||||
'third_party/ThreadPool',
|
||||
'third_party/object_pool'
|
||||
"..",
|
||||
"../kenlm",
|
||||
OPENFST_DIR + "/src/include",
|
||||
"third_party/ThreadPool",
|
||||
"third_party/object_pool",
|
||||
]
|
||||
|
||||
KENLM_FILES = (glob.glob('../kenlm/util/*.cc')
|
||||
+ glob.glob('../kenlm/lm/*.cc')
|
||||
+ glob.glob('../kenlm/util/double-conversion/*.cc'))
|
||||
KENLM_FILES = (
|
||||
glob.glob("../kenlm/util/*.cc")
|
||||
+ glob.glob("../kenlm/lm/*.cc")
|
||||
+ glob.glob("../kenlm/util/double-conversion/*.cc")
|
||||
)
|
||||
|
||||
KENLM_FILES += glob.glob(OPENFST_DIR + '/src/lib/*.cc')
|
||||
KENLM_FILES += glob.glob(OPENFST_DIR + "/src/lib/*.cc")
|
||||
|
||||
KENLM_FILES = [
|
||||
fn for fn in KENLM_FILES
|
||||
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
|
||||
'unittest.cc'))
|
||||
fn
|
||||
for fn in KENLM_FILES
|
||||
if not (
|
||||
fn.endswith("main.cc") or fn.endswith("test.cc") or fn.endswith("unittest.cc")
|
||||
)
|
||||
]
|
||||
|
||||
CTC_DECODER_FILES = [
|
||||
'ctc_beam_search_decoder.cpp',
|
||||
'scorer.cpp',
|
||||
'path_trie.cpp',
|
||||
'decoder_utils.cpp',
|
||||
'workspace_status.cc',
|
||||
'../alphabet.cc',
|
||||
"ctc_beam_search_decoder.cpp",
|
||||
"scorer.cpp",
|
||||
"path_trie.cpp",
|
||||
"decoder_utils.cpp",
|
||||
"workspace_status.cc",
|
||||
"../alphabet.cc",
|
||||
]
|
||||
|
||||
def build_archive(srcs=[], out_name='', build_dir='temp_build/temp_build', debug=False, num_parallel=1):
|
||||
compiler = os.environ.get('CXX', 'g++')
|
||||
if sys.platform.startswith('win'):
|
||||
|
||||
def build_archive(
|
||||
srcs=[], out_name="", build_dir="temp_build/temp_build", debug=False, num_parallel=1
|
||||
):
|
||||
compiler = os.environ.get("CXX", "g++")
|
||||
if sys.platform.startswith("win"):
|
||||
compiler = '"{}"'.format(compiler)
|
||||
ar = os.environ.get('AR', 'ar')
|
||||
libexe = os.environ.get('LIBEXE', 'lib.exe')
|
||||
libtool = os.environ.get('LIBTOOL', 'libtool')
|
||||
cflags = os.environ.get('CFLAGS', '') + os.environ.get('CXXFLAGS', '')
|
||||
ar = os.environ.get("AR", "ar")
|
||||
libexe = os.environ.get("LIBEXE", "lib.exe")
|
||||
libtool = os.environ.get("LIBTOOL", "libtool")
|
||||
cflags = os.environ.get("CFLAGS", "") + os.environ.get("CXXFLAGS", "")
|
||||
args = ARGS + (DBG_ARGS if debug else OPT_ARGS)
|
||||
|
||||
for file in srcs:
|
||||
outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o')
|
||||
outfile = os.path.join(build_dir, os.path.splitext(file)[0] + ".o")
|
||||
outdir = os.path.dirname(outfile)
|
||||
if not os.path.exists(outdir):
|
||||
print('mkdir', outdir)
|
||||
print("mkdir", outdir)
|
||||
os.makedirs(outdir)
|
||||
|
||||
def build_one(file):
|
||||
outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o')
|
||||
outfile = os.path.join(build_dir, os.path.splitext(file)[0] + ".o")
|
||||
if os.path.exists(outfile):
|
||||
return
|
||||
|
||||
if sys.platform.startswith('win'):
|
||||
file = '"{}"'.format(file.replace('\\', '/'))
|
||||
output = '/Fo"{}"'.format(outfile.replace('\\', '/'))
|
||||
if sys.platform.startswith("win"):
|
||||
file = '"{}"'.format(file.replace("\\", "/"))
|
||||
output = '/Fo"{}"'.format(outfile.replace("\\", "/"))
|
||||
else:
|
||||
output = '-o ' + outfile
|
||||
output = "-o " + outfile
|
||||
|
||||
cmd = '{cc} -c {cflags} {args} {includes} {infile} {output}'.format(
|
||||
cmd = "{cc} -c {cflags} {args} {includes} {infile} {output}".format(
|
||||
cc=compiler,
|
||||
cflags=cflags,
|
||||
args=' '.join(args),
|
||||
includes=' '.join('-I' + i for i in INCLUDES),
|
||||
args=" ".join(args),
|
||||
includes=" ".join("-I" + i for i in INCLUDES),
|
||||
infile=file,
|
||||
output=output,
|
||||
)
|
||||
|
@ -94,30 +105,28 @@ def build_archive(srcs=[], out_name='', build_dir='temp_build/temp_build', debug
|
|||
pool = Pool(num_parallel)
|
||||
obj_files = list(pool.imap_unordered(build_one, srcs))
|
||||
|
||||
if sys.platform.startswith('darwin'):
|
||||
cmd = '{libtool} -static -o {outfile} {infiles}'.format(
|
||||
if sys.platform.startswith("darwin"):
|
||||
cmd = "{libtool} -static -o {outfile} {infiles}".format(
|
||||
libtool=libtool,
|
||||
outfile=out_name,
|
||||
infiles=' '.join(obj_files),
|
||||
infiles=" ".join(obj_files),
|
||||
)
|
||||
print(cmd)
|
||||
subprocess.check_call(shlex.split(cmd))
|
||||
elif sys.platform.startswith('win'):
|
||||
elif sys.platform.startswith("win"):
|
||||
cmd = '"{libexe}" /OUT:"{outfile}" {infiles} /MACHINE:X64 /NOLOGO'.format(
|
||||
libexe=libexe,
|
||||
outfile=out_name,
|
||||
infiles=' '.join(obj_files))
|
||||
cmd = cmd.replace('\\', '/')
|
||||
libexe=libexe, outfile=out_name, infiles=" ".join(obj_files)
|
||||
)
|
||||
cmd = cmd.replace("\\", "/")
|
||||
print(cmd)
|
||||
subprocess.check_call(shlex.split(cmd))
|
||||
else:
|
||||
cmd = '{ar} rcs {outfile} {infiles}'.format(
|
||||
ar=ar,
|
||||
outfile=out_name,
|
||||
infiles=' '.join(obj_files)
|
||||
cmd = "{ar} rcs {outfile} {infiles}".format(
|
||||
ar=ar, outfile=out_name, infiles=" ".join(obj_files)
|
||||
)
|
||||
print(cmd)
|
||||
subprocess.check_call(shlex.split(cmd))
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
build_common()
|
||||
|
|
|
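The `build_archive` helper reformatted above is normally driven from `setup.py`, but it can also be called directly. A minimal sketch using the names from the hunk (paths assume the `native_client/ctcdecode` source tree and a non-Windows host, where the archive extension is `.a`):

```
from build_archive import CTC_DECODER_FILES, KENLM_FILES, build_archive

# build the KenLM/OpenFst objects first, then the decoder sources themselves
build_archive(
    srcs=KENLM_FILES,
    out_name="third_party.a",
    build_dir="temp_build/temp_build",
    num_parallel=4,
)
build_archive(
    srcs=CTC_DECODER_FILES,
    out_name="first_party.a",
    build_dir="temp_build/temp_build",
    num_parallel=4,
)
```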
@@ -161,4 +161,4 @@ bool add_word_to_dictionary(

add_word_to_fst(int_word, dictionary);
return true; // return with successful adding
}
}

@@ -545,7 +545,7 @@
const npy_intp *dims = array_dimensions(ary);
for (i=0; i < nd; ++i)
n_non_one += (dims[i] != 1) ? 1 : 0;
if (n_non_one > 1)
if (n_non_one > 1)
array_clearflags(ary,NPY_ARRAY_CARRAY);
array_enableflags(ary,NPY_ARRAY_FARRAY);
/* Recompute the strides */

@@ -93,8 +93,8 @@ public:
unsigned int character;
TimestepTreeNode* timesteps = nullptr;

// timestep temporary storage for each decoding step.
TimestepTreeNode* previous_timesteps = nullptr;
// timestep temporary storage for each decoding step.
TimestepTreeNode* previous_timesteps = nullptr;
unsigned int new_timestep;

PathTrie* parent;

@@ -1,10 +1,10 @@
#ifdef _MSC_VER
#include <stdlib.h>
#include <io.h>
#include <windows.h>
#include <windows.h>

#define R_OK 4 /* Read permission. */
#define W_OK 2 /* Write permission. */
#define W_OK 2 /* Write permission. */
#define F_OK 0 /* Existence. */

#define access _access

@@ -13,4 +13,3 @@ bdist-dir=temp_build/temp_build

[install_lib]
build-dir=temp_build/temp_build

@ -1,95 +1,105 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from distutils.command.build import build
|
||||
from setuptools import setup, Extension, distutils
|
||||
|
||||
import argparse
|
||||
import multiprocessing.pool
|
||||
import os
|
||||
import platform
|
||||
import sys
|
||||
from distutils.command.build import build
|
||||
|
||||
from build_archive import *
|
||||
from setuptools import Extension, distutils, setup
|
||||
|
||||
try:
|
||||
import numpy
|
||||
|
||||
try:
|
||||
numpy_include = numpy.get_include()
|
||||
except AttributeError:
|
||||
numpy_include = numpy.get_numpy_include()
|
||||
except ImportError:
|
||||
numpy_include = ''
|
||||
assert 'NUMPY_INCLUDE' in os.environ
|
||||
numpy_include = ""
|
||||
assert "NUMPY_INCLUDE" in os.environ
|
||||
|
||||
numpy_include = os.getenv('NUMPY_INCLUDE', numpy_include)
|
||||
numpy_min_ver = os.getenv('NUMPY_DEP_VERSION', '')
|
||||
numpy_include = os.getenv("NUMPY_INCLUDE", numpy_include)
|
||||
numpy_min_ver = os.getenv("NUMPY_DEP_VERSION", "")
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--num_processes",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Number of cpu processes to build package. (default: %(default)d)")
|
||||
help="Number of cpu processes to build package. (default: %(default)d)",
|
||||
)
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
debug = '--debug' in unknown_args
|
||||
debug = "--debug" in unknown_args
|
||||
|
||||
# reconstruct sys.argv to pass to setup below
|
||||
sys.argv = [sys.argv[0]] + unknown_args
|
||||
|
||||
|
||||
def read(fname):
|
||||
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
||||
|
||||
|
||||
def maybe_rebuild(srcs, out_name, build_dir):
|
||||
if not os.path.exists(out_name):
|
||||
if not os.path.exists(build_dir):
|
||||
os.makedirs(build_dir)
|
||||
|
||||
build_archive(srcs=srcs,
|
||||
out_name=out_name,
|
||||
build_dir=build_dir,
|
||||
num_parallel=known_args.num_processes,
|
||||
debug=debug)
|
||||
build_archive(
|
||||
srcs=srcs,
|
||||
out_name=out_name,
|
||||
build_dir=build_dir,
|
||||
num_parallel=known_args.num_processes,
|
||||
debug=debug,
|
||||
)
|
||||
|
||||
project_version = read('../../training/coqui_stt_training/VERSION').strip()
|
||||
|
||||
build_dir = 'temp_build/temp_build'
|
||||
project_version = read("../../training/coqui_stt_training/VERSION").strip()
|
||||
|
||||
if sys.platform.startswith('win'):
|
||||
archive_ext = 'lib'
|
||||
build_dir = "temp_build/temp_build"
|
||||
|
||||
if sys.platform.startswith("win"):
|
||||
archive_ext = "lib"
|
||||
else:
|
||||
archive_ext = 'a'
|
||||
archive_ext = "a"
|
||||
|
||||
third_party_build = 'third_party.{}'.format(archive_ext)
|
||||
ctc_decoder_build = 'first_party.{}'.format(archive_ext)
|
||||
third_party_build = "third_party.{}".format(archive_ext)
|
||||
ctc_decoder_build = "first_party.{}".format(archive_ext)
|
||||
|
||||
|
||||
maybe_rebuild(KENLM_FILES, third_party_build, build_dir)
|
||||
maybe_rebuild(CTC_DECODER_FILES, ctc_decoder_build, build_dir)
|
||||
|
||||
decoder_module = Extension(
|
||||
name='coqui_stt_ctcdecoder._swigwrapper',
|
||||
sources=['swigwrapper.i'],
|
||||
swig_opts=['-c++', '-extranative'],
|
||||
language='c++',
|
||||
name="coqui_stt_ctcdecoder._swigwrapper",
|
||||
sources=["swigwrapper.i"],
|
||||
swig_opts=["-c++", "-extranative"],
|
||||
language="c++",
|
||||
include_dirs=INCLUDES + [numpy_include],
|
||||
extra_compile_args=ARGS + (DBG_ARGS if debug else OPT_ARGS),
|
||||
extra_link_args=[ctc_decoder_build, third_party_build],
|
||||
)
|
||||
|
||||
|
||||
class BuildExtFirst(build):
|
||||
sub_commands = [('build_ext', build.has_ext_modules),
|
||||
('build_py', build.has_pure_modules),
|
||||
('build_clib', build.has_c_libraries),
|
||||
('build_scripts', build.has_scripts)]
|
||||
sub_commands = [
|
||||
("build_ext", build.has_ext_modules),
|
||||
("build_py", build.has_pure_modules),
|
||||
("build_clib", build.has_c_libraries),
|
||||
("build_scripts", build.has_scripts),
|
||||
]
|
||||
|
||||
|
||||
setup(
|
||||
name='coqui_stt_ctcdecoder',
|
||||
name="coqui_stt_ctcdecoder",
|
||||
version=project_version,
|
||||
description="""DS CTC decoder""",
|
||||
cmdclass = {'build': BuildExtFirst},
|
||||
cmdclass={"build": BuildExtFirst},
|
||||
ext_modules=[decoder_module],
|
||||
package_dir = {'coqui_stt_ctcdecoder': '.'},
|
||||
py_modules=['coqui_stt_ctcdecoder', 'coqui_stt_ctcdecoder.swigwrapper'],
|
||||
install_requires = ['numpy%s' % numpy_min_ver],
|
||||
package_dir={"coqui_stt_ctcdecoder": "."},
|
||||
py_modules=["coqui_stt_ctcdecoder", "coqui_stt_ctcdecoder.swigwrapper"],
|
||||
install_requires=["numpy%s" % numpy_min_ver],
|
||||
)
|
||||
|
|
|
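One detail in the `setup.py` hunks above that is easy to miss: `BuildExtFirst` reorders the build sub-commands so the SWIG extension is compiled before the pure-Python modules are collected, otherwise `build_py` would look for the generated wrapper before it exists. A stripped-down sketch of the same trick, with placeholder package and extension names:

```
from distutils.command.build import build

from setuptools import Extension, setup


class BuildExtFirst(build):
    # run build_ext ahead of build_py so generated modules are in place in time
    sub_commands = [
        ("build_ext", build.has_ext_modules),
        ("build_py", build.has_pure_modules),
        ("build_clib", build.has_c_libraries),
        ("build_scripts", build.has_scripts),
    ]


setup(
    name="example_pkg",  # placeholder
    ext_modules=[Extension("example_pkg._impl", sources=["impl.c"])],  # placeholder
    cmdclass={"build": BuildExtFirst},
)
```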
@ -221,7 +221,7 @@ ClientBin/
|
|||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
|
@ -317,7 +317,7 @@ __pycache__/
|
|||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
|
@ -326,5 +326,5 @@ ASALocalRun/
|
|||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
||||
|
|
|
@ -14,4 +14,4 @@
|
|||
/// </summary>
|
||||
public TokenMetadata[] Tokens { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,4 +10,4 @@
|
|||
/// </summary>
|
||||
public CandidateTranscript[] Transcripts { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,4 +18,4 @@
|
|||
/// </summary>
|
||||
public float StartTime;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
<Folder Include="Properties\" />
|
||||
</ItemGroup>
|
||||
|
||||
|
||||
|
||||
<PropertyGroup Condition=" '$(TargetFramework)' == 'uap10.0' ">
|
||||
<DefineConstants>$(DefineConstants);NO_HTTPS</DefineConstants>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<configuration>
|
||||
<startup>
|
||||
<startup>
|
||||
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.2" />
|
||||
</startup>
|
||||
</configuration>
|
||||
</configuration>
|
||||
|
|
|
@ -67,4 +67,4 @@
|
|||
</Content>
|
||||
</ItemGroup>
|
||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="NAudio" version="1.8.5" targetFramework="net462" />
|
||||
</packages>
|
||||
</packages>
|
||||
|
|
|
@ -221,7 +221,7 @@ ClientBin/
|
|||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
|
@ -317,7 +317,7 @@ __pycache__/
|
|||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
|
@ -326,5 +326,5 @@ ASALocalRun/
|
|||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<configuration>
|
||||
<startup>
|
||||
<startup>
|
||||
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.2" />
|
||||
</startup>
|
||||
</configuration>
|
||||
</configuration>
|
||||
|
|
|
@ -10,8 +10,8 @@
|
|||
|
||||
namespace STT.WPF.Properties {
|
||||
using System;
|
||||
|
||||
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// A strongly-typed resource class, for looking up localized strings, etc.
|
||||
/// </summary>
|
||||
|
@ -23,15 +23,15 @@ namespace STT.WPF.Properties {
|
|||
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
|
||||
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
||||
internal class Resources {
|
||||
|
||||
|
||||
private static global::System.Resources.ResourceManager resourceMan;
|
||||
|
||||
|
||||
private static global::System.Globalization.CultureInfo resourceCulture;
|
||||
|
||||
|
||||
[global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
|
||||
internal Resources() {
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Returns the cached ResourceManager instance used by this class.
|
||||
/// </summary>
|
||||
|
@ -45,7 +45,7 @@ namespace STT.WPF.Properties {
|
|||
return resourceMan;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Overrides the current thread's CurrentUICulture property for all
|
||||
/// resource lookups using this strongly typed resource class.
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<root>
|
||||
<!--
|
||||
Microsoft ResX Schema
|
||||
|
||||
<!--
|
||||
Microsoft ResX Schema
|
||||
|
||||
Version 2.0
|
||||
|
||||
The primary goals of this format is to allow a simple XML format
|
||||
that is mostly human readable. The generation and parsing of the
|
||||
various data types are done through the TypeConverter classes
|
||||
|
||||
The primary goals of this format is to allow a simple XML format
|
||||
that is mostly human readable. The generation and parsing of the
|
||||
various data types are done through the TypeConverter classes
|
||||
associated with the data types.
|
||||
|
||||
|
||||
Example:
|
||||
|
||||
|
||||
... ado.net/XML headers & schema ...
|
||||
<resheader name="resmimetype">text/microsoft-resx</resheader>
|
||||
<resheader name="version">2.0</resheader>
|
||||
|
@ -26,36 +26,36 @@
|
|||
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
|
||||
<comment>This is a comment</comment>
|
||||
</data>
|
||||
|
||||
There are any number of "resheader" rows that contain simple
|
||||
|
||||
There are any number of "resheader" rows that contain simple
|
||||
name/value pairs.
|
||||
|
||||
Each data row contains a name, and value. The row also contains a
|
||||
type or mimetype. Type corresponds to a .NET class that support
|
||||
text/value conversion through the TypeConverter architecture.
|
||||
Classes that don't support this are serialized and stored with the
|
||||
|
||||
Each data row contains a name, and value. The row also contains a
|
||||
type or mimetype. Type corresponds to a .NET class that support
|
||||
text/value conversion through the TypeConverter architecture.
|
||||
Classes that don't support this are serialized and stored with the
|
||||
mimetype set.
|
||||
|
||||
The mimetype is used for serialized objects, and tells the
|
||||
ResXResourceReader how to depersist the object. This is currently not
|
||||
|
||||
The mimetype is used for serialized objects, and tells the
|
||||
ResXResourceReader how to depersist the object. This is currently not
|
||||
extensible. For a given mimetype the value must be set accordingly:
|
||||
|
||||
Note - application/x-microsoft.net.object.binary.base64 is the format
|
||||
that the ResXResourceWriter will generate, however the reader can
|
||||
|
||||
Note - application/x-microsoft.net.object.binary.base64 is the format
|
||||
that the ResXResourceWriter will generate, however the reader can
|
||||
read any of the formats listed below.
|
||||
|
||||
|
||||
mimetype: application/x-microsoft.net.object.binary.base64
|
||||
value : The object must be serialized with
|
||||
value : The object must be serialized with
|
||||
: System.Serialization.Formatters.Binary.BinaryFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
|
||||
mimetype: application/x-microsoft.net.object.soap.base64
|
||||
value : The object must be serialized with
|
||||
value : The object must be serialized with
|
||||
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.bytearray.base64
|
||||
value : The object must be serialized into a byte array
|
||||
value : The object must be serialized into a byte array
|
||||
: using a System.ComponentModel.TypeConverter
|
||||
: and then encoded with base64 encoding.
|
||||
-->
|
||||
|
@ -114,4 +114,4 @@
|
|||
<resheader name="writer">
|
||||
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
</root>
|
||||
</root>
|
||||
|
|
|
@ -9,14 +9,14 @@
|
|||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace STT.WPF.Properties {
|
||||
|
||||
|
||||
|
||||
|
||||
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
|
||||
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "15.9.0.0")]
|
||||
internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase {
|
||||
|
||||
|
||||
private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));
|
||||
|
||||
|
||||
public static Settings Default {
|
||||
get {
|
||||
return defaultInstance;
|
||||
|
|
|
@ -4,4 +4,4 @@
|
|||
<Profile Name="(Default)" />
|
||||
</Profiles>
|
||||
<Settings />
|
||||
</SettingsFile>
|
||||
</SettingsFile>
|
||||
|
|
|
@ -131,7 +131,7 @@ namespace STT.WPF.ViewModels
|
|||
public MMDevice SelectedDevice
|
||||
{
|
||||
get => _selectedDevice;
|
||||
set => SetProperty(ref _selectedDevice, value,
|
||||
set => SetProperty(ref _selectedDevice, value,
|
||||
onChanged: UpdateSelectedDevice);
|
||||
}
|
||||
|
||||
|
@ -255,7 +255,7 @@ namespace STT.WPF.ViewModels
|
|||
private void LoadAvailableCaptureDevices()
|
||||
{
|
||||
AvailableRecordDevices = new ObservableCollection<MMDevice>(
|
||||
MMDeviceEnumerator.EnumerateDevices(DataFlow.All, DeviceState.Active)); //we get only enabled devices
|
||||
MMDeviceEnumerator.EnumerateDevices(DataFlow.All, DeviceState.Active)); //we get only enabled devices
|
||||
EnableStartRecord = true;
|
||||
if (AvailableRecordDevices?.Count != 0)
|
||||
SelectedDevice = AvailableRecordDevices[0];
|
||||
|
@ -282,14 +282,14 @@ namespace STT.WPF.ViewModels
|
|||
.ToWaveSource(16); //bits per sample
|
||||
|
||||
_convertedSource = _convertedSource.ToMono();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void Capture_DataAvailable(object sender, DataAvailableEventArgs e)
|
||||
{
|
||||
//read data from the converedSource
|
||||
//important: don't use the e.Data here
|
||||
//the e.Data contains the raw data provided by the
|
||||
//the e.Data contains the raw data provided by the
|
||||
//soundInSource which won't have the STT required audio format
|
||||
byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];
|
||||
|
||||
|
@ -319,7 +319,7 @@ namespace STT.WPF.ViewModels
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Enables the external scorer.
|
||||
/// </summary>
|
||||
|
@ -422,4 +422,4 @@ namespace STT.WPF.ViewModels
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,4 +6,4 @@
|
|||
<package id="CSCore" version="1.2.1.2" targetFramework="net462" />
|
||||
<package id="MvvmLightLibs" version="5.4.1.1" targetFramework="net462" />
|
||||
<package id="NAudio" version="1.9.0" targetFramework="net462" />
|
||||
</packages>
|
||||
</packages>
|
||||
|
|
|
@ -6,4 +6,4 @@
|
|||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
* DISCLAIMER
|
||||
* This file is part of the mingw-w64 runtime package.
|
||||
*
|
||||
* The mingw-w64 runtime package and its code is distributed in the hope that it
|
||||
* will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR
|
||||
* IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to
|
||||
* The mingw-w64 runtime package and its code is distributed in the hope that it
|
||||
* will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR
|
||||
* IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to
|
||||
* warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*/
|
||||
/*
|
||||
|
|
|
@ -26,4 +26,4 @@
|
|||
</extensions>
|
||||
</Objective-C-extensions>
|
||||
</code_scheme>
|
||||
</component>
|
||||
</component>
|
||||
|
|
|
@ -16,4 +16,4 @@
|
|||
</GradleProjectSettings>
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
||||
</project>
|
||||
|
|
|
@ -35,4 +35,4 @@
|
|||
<component name="ProjectType">
|
||||
<option name="id" value="Android" />
|
||||
</component>
|
||||
</project>
|
||||
</project>
|
||||
|
|
|
@ -9,4 +9,4 @@
|
|||
</set>
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
||||
</project>
|
||||
|
|
|
@ -2,4 +2,4 @@
|
|||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
</adaptive-icon>
|
||||
|
|
|
@ -2,4 +2,4 @@
|
|||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
||||
</adaptive-icon>
|
||||
|
|
|
@ -14,4 +14,4 @@ public class ExampleUnitTest {
|
|||
public void addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,5 +11,3 @@ org.gradle.jvmargs=-Xmx1536m
|
|||
# This option should only be used with decoupled projects. More details, visit
|
||||
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
|
||||
# org.gradle.parallel=true
|
||||
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
%extend struct CandidateTranscript {
|
||||
/**
|
||||
* Retrieve one TokenMetadata element
|
||||
*
|
||||
*
|
||||
* @param i Array index of the TokenMetadata to get
|
||||
*
|
||||
* @return The TokenMetadata requested or null
|
||||
|
@ -33,7 +33,7 @@
|
|||
%extend struct Metadata {
|
||||
/**
|
||||
* Retrieve one CandidateTranscript element
|
||||
*
|
||||
*
|
||||
* @param i Array index of the CandidateTranscript to get
|
||||
*
|
||||
* @return The CandidateTranscript requested or null
|
||||
|
|
|
@ -36,7 +36,7 @@ public class CandidateTranscript {
|
|||
}
|
||||
|
||||
/**
|
||||
* Size of the tokens array
|
||||
* Size of the tokens array
|
||||
*/
|
||||
public long getNumTokens() {
|
||||
return implJNI.CandidateTranscript_NumTokens_get(swigCPtr, this);
|
||||
|
|
|
@ -40,7 +40,7 @@ public class Metadata {
|
|||
}
|
||||
|
||||
/**
|
||||
* Size of the transcripts array
|
||||
* Size of the transcripts array
|
||||
*/
|
||||
public long getNumTranscripts() {
|
||||
return implJNI.Metadata_NumTranscripts_get(swigCPtr, this);
|
||||
|
|
|
@ -70,4 +70,3 @@ public enum STT_Error_Codes {
|
|||
private static int next = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -35,21 +35,21 @@ public class TokenMetadata {
|
|||
}
|
||||
|
||||
/**
|
||||
* The text corresponding to this token
|
||||
* The text corresponding to this token
|
||||
*/
|
||||
public String getText() {
|
||||
return implJNI.TokenMetadata_Text_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in units of 20ms
|
||||
* Position of the token in units of 20ms
|
||||
*/
|
||||
public long getTimestep() {
|
||||
return implJNI.TokenMetadata_Timestep_get(swigCPtr, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Position of the token in seconds
|
||||
* Position of the token in seconds
|
||||
*/
|
||||
public float getStartTime() {
|
||||
return implJNI.TokenMetadata_StartTime_get(swigCPtr, this);
|
||||
|
|
|
@ -14,4 +14,4 @@ public class ExampleUnitTest {
|
|||
public void addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
NODE_BUILD_TOOL ?= node-pre-gyp
|
||||
NODE_ABI_TARGET ?=
|
||||
NODE_ABI_TARGET ?=
|
||||
NODE_BUILD_VERBOSE ?= --verbose
|
||||
NPM_TOOL ?= npm
|
||||
PROJECT_NAME ?= stt
|
||||
|
|
|
@ -1,46 +1,44 @@
|
|||
{
|
||||
"targets": [
|
||||
{
|
||||
"target_name": "stt",
|
||||
"sources": [ "stt_wrap.cxx" ],
|
||||
"libraries": [
|
||||
"$(LIBS)"
|
||||
],
|
||||
"include_dirs": [
|
||||
"../"
|
||||
],
|
||||
"conditions": [
|
||||
[ "OS=='mac'", {
|
||||
"xcode_settings": {
|
||||
"OTHER_CXXFLAGS": [
|
||||
"-stdlib=libc++",
|
||||
"-mmacosx-version-min=10.10"
|
||||
],
|
||||
"OTHER_LDFLAGS": [
|
||||
"-stdlib=libc++",
|
||||
"-mmacosx-version-min=10.10"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
{
|
||||
"target_name": "action_after_build",
|
||||
"type": "none",
|
||||
"dependencies": [ "<(module_name)" ],
|
||||
"copies": [
|
||||
"targets": [
|
||||
{
|
||||
"files": [ "<(PRODUCT_DIR)/<(module_name).node" ],
|
||||
"destination": "<(module_path)"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"variables": {
|
||||
"build_v8_with_gn": 0,
|
||||
"v8_enable_pointer_compression": 0,
|
||||
"v8_enable_31bit_smis_on_64bit_arch": 0,
|
||||
"enable_lto": 1
|
||||
},
|
||||
"target_name": "stt",
|
||||
"sources": ["stt_wrap.cxx"],
|
||||
"libraries": ["$(LIBS)"],
|
||||
"include_dirs": ["../"],
|
||||
"conditions": [
|
||||
[
|
||||
"OS=='mac'",
|
||||
{
|
||||
"xcode_settings": {
|
||||
"OTHER_CXXFLAGS": [
|
||||
"-stdlib=libc++",
|
||||
"-mmacosx-version-min=10.10",
|
||||
],
|
||||
"OTHER_LDFLAGS": [
|
||||
"-stdlib=libc++",
|
||||
"-mmacosx-version-min=10.10",
|
||||
],
|
||||
}
|
||||
},
|
||||
]
|
||||
],
|
||||
},
|
||||
{
|
||||
"target_name": "action_after_build",
|
||||
"type": "none",
|
||||
"dependencies": ["<(module_name)"],
|
||||
"copies": [
|
||||
{
|
||||
"files": ["<(PRODUCT_DIR)/<(module_name).node"],
|
||||
"destination": "<(module_path)",
|
||||
}
|
||||
],
|
||||
},
|
||||
],
|
||||
"variables": {
|
||||
"build_v8_with_gn": 0,
|
||||
"v8_enable_pointer_compression": 0,
|
||||
"v8_enable_31bit_smis_on_64bit_arch": 0,
|
||||
"enable_lto": 1,
|
||||
},
|
||||
}
|
||||
|
|
|
@ -136,7 +136,7 @@ class StreamImpl {
|
|||
}
|
||||
/**
|
||||
* Exposes the type of Stream without actually exposing the class.
|
||||
* Because the Stream class should not be instantiated directly,
|
||||
* Because the Stream class should not be instantiated directly,
|
||||
* but instead be created via :js:func:`Model.createStream`.
|
||||
*/
|
||||
export type Stream = StreamImpl;