Add more parameters.
Implement some change request.

parent a79cc0cee9
commit b27e0347b1
@@ -1,32 +1,23 @@
-Download librispeech corpus
+The LM binary was generated from the LibriSpeech normalized LM training text, available `here <http://www.openslr.org/11>`_.
+It is created with `KenLM <https://github.com/kpu/kenlm>`_.
 
+You can download the librispeech corpus with the following commands:
 
 .. code-block:: bash
 
     wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -O librispeech.txt.gz
     gunzip librispeech.txt.gz
 
-
-| Generate vocab-500000.txt and lm.binary files
-| Optional Parameters:
 
-* '--kenlm_bins path/to/bins/': Change the path of the kenlm binaries (defaults to directory in docker container)
-* '--top_k 500000': Change the number of most frequent words
-* '--arpa_order 5': Change order of k-grams in arpa-file generation
-* '--max_arpa_memory 75%': Set maximum allowed memory usage for arpa-file generation
 
+Then use the `generate_lm.py` script to generate `lm.binary` and `vocab-500000.txt`.
 
 .. code-block:: bash
 
     python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/
 
-
-| Generate scorer package with the above vocab-500000.txt and lm.binary files
-| Optional Parameters:
 
-* '--default_alpha 0.75'
-* '--default_beta 1.85'
-* '--force_utf8 ""': See `link <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode>`_ for explanation
+Afterwards you can generate the scorer package with the above vocab-500000.txt and lm.binary files
 
 .. code-block:: bash
 
-    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab librispeech-vocab-500k.txt --package kenlm.scorer
+    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85
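The rewritten README keeps only the default invocations; the optional flags it used to list are still accepted by `generate_lm.py`, alongside the new ones this commit introduces (`--arpa_prune`, `--binary_a_bits`, `--binary_q_bits`, `--binary_type`). A sketch of a fully spelled-out call using the script's default values (the input and output paths are placeholders):

.. code-block:: bash

    python3 data/lm/generate_lm.py --input_txt path/to/vocab_sentences.txt --output_dir path/lm/ \
        --top_k 500000 --kenlm_bins /DeepSpeech/native_client/kenlm/build/bin/ \
        --arpa_order 5 --max_arpa_memory "75%" --arpa_prune "0|0|1" \
        --binary_a_bits 255 --binary_q_bits 8 --binary_type trie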
@@ -6,129 +6,172 @@ from collections import Counter
 import progressbar
 
 
-# ======================================================================================================================
 
 def convert_and_filter_topk(args):
     """ Convert to lowercase, count word occurrences and save top-k words to a file """
 
     counter = Counter()
-    data_lower = os.path.join(args.output_dir, 'lower.txt')
+    data_lower = os.path.join(args.output_dir, "lower.txt")
 
-    print('\nConverting to lowercase and counting word occurrences ...')
-    with open(data_lower, 'w+', encoding='utf8') as file_out:
-        with open(args.input_txt, encoding='utf8') as file_in:
+    print("\nConverting to lowercase and counting word occurrences ...")
+    with open(data_lower, "w+", encoding="utf8") as file_out:
+        with open(args.input_txt, encoding="utf8") as file_in:
             for line in progressbar.progressbar(file_in):
                 line_lower = line.lower()
                 counter.update(line_lower.split())
                 file_out.write(line_lower)
 
     # Save top-k words
-    print('\nSaving top {} words ...'.format(args.top_k))
+    print("\nSaving top {} words ...".format(args.top_k))
     top_counter = counter.most_common(args.top_k)
-    vocab_str = '\n'.join(word for word, count in top_counter)
-    vocab_path = 'vocab-{}.txt'.format(args.top_k)
+    vocab_str = "\n".join(word for word, count in top_counter)
+    vocab_path = "vocab-{}.txt".format(args.top_k)
     vocab_path = os.path.join(args.output_dir, vocab_path)
-    with open(vocab_path, 'w+') as file:
+    with open(vocab_path, "w+") as file:
         file.write(vocab_str)
 
-    print('\nCalculating word statistics ...')
+    print("\nCalculating word statistics ...")
     total_words = sum(counter.values())
-    print(' Your text file has {} words in total'.format(total_words))
-    print(' It has {} unique words'.format(len(counter)))
+    print(" Your text file has {} words in total".format(total_words))
+    print(" It has {} unique words".format(len(counter)))
     top_words_sum = sum(count for word, count in top_counter)
     word_fraction = (top_words_sum / total_words) * 100
-    print(' Your top-{} words are {:.4f} percent of all words'.format(args.top_k, word_fraction))
+    print(
+        " Your top-{} words are {:.4f} percent of all words".format(
+            args.top_k, word_fraction
+        )
+    )
     print(' Your most common word "{}" occurred {} times'.format(*top_counter[0]))
     last_word, last_count = top_counter[-1]
-    print(' The least common word in your top-k is "{}" with {} times'.format(last_word, last_count))
+    print(
+        ' The least common word in your top-k is "{}" with {} times'.format(
+            last_word, last_count
+        )
+    )
     for i, (w, c) in enumerate(reversed(top_counter)):
         if c > last_count:
-            print(' The first word with {} occurrences is "{}" at place {}'.format(c, w, len(top_counter) - 1 - i))
+            print(
+                ' The first word with {} occurrences is "{}" at place {}'.format(
+                    c, w, len(top_counter) - 1 - i
+                )
+            )
             break
 
     return data_lower, vocab_str
 
 
-# ======================================================================================================================
 
 def build_lm(args, data_lower, vocab_str):
-    print('\nCreating ARPA file ...')
-    lm_path = os.path.join(args.output_dir, 'lm.arpa')
-    subprocess.check_call([
-        args.kenlm_bins + 'lmplz',
-        '--order', str(args.arpa_order),
-        '--temp_prefix', args.output_dir,
-        '--memory', args.max_arpa_memory,
-        '--text', data_lower,
-        '--arpa', lm_path,
-        '--prune', '0', '0', '1'
-    ])
+    print("\nCreating ARPA file ...")
+    lm_path = os.path.join(args.output_dir, "lm.arpa")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "lmplz",
+            "--order",
+            str(args.arpa_order),
+            "--temp_prefix",
+            args.output_dir,
+            "--memory",
+            args.max_arpa_memory,
+            "--text",
+            data_lower,
+            "--arpa",
+            lm_path,
+            "--prune",
+            *args.arpa_prune.split("|"),
+        ]
+    )
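The hard-coded `'--prune', '0', '0', '1'` arguments are replaced by the new `--arpa_prune` option, whose `|`-separated value is split and spliced into the lmplz command. With the default `--arpa_prune "0|0|1"` and the other defaults, the resulting call is equivalent to this sketch (the output directory is a placeholder):

.. code-block:: bash

    /DeepSpeech/native_client/kenlm/build/bin/lmplz --order 5 --temp_prefix path/lm/ --memory 75% \
        --text path/lm/lower.txt --arpa path/lm/lm.arpa --prune 0 0 1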
 
     # Filter LM using vocabulary of top 500k words
-    print('\nFiltering ARPA file using vocabulary of top-k words ...')
-    filtered_path = os.path.join(args.output_dir, 'lm_filtered.arpa')
-    subprocess.run([
-        args.kenlm_bins + 'filter',
-        'single',
-        'model:{}'.format(lm_path),
-        filtered_path
-    ], input=vocab_str.encode('utf-8'), check=True)
+    print("\nFiltering ARPA file using vocabulary of top-k words ...")
+    filtered_path = os.path.join(args.output_dir, "lm_filtered.arpa")
+    subprocess.run(
+        [
+            args.kenlm_bins + "filter",
+            "single",
+            "model:{}".format(lm_path),
+            filtered_path,
+        ],
+        input=vocab_str.encode("utf-8"),
+        check=True,
+    )
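The filter step itself is unchanged apart from formatting: the top-k vocabulary is fed to KenLM's `filter` tool on stdin. As a shell command it amounts to the following sketch (placeholder paths; the vocabulary file holds the same words that the script passes in memory):

.. code-block:: bash

    /DeepSpeech/native_client/kenlm/build/bin/filter single model:path/lm/lm.arpa path/lm/lm_filtered.arpa \
        < path/lm/vocab-500000.txt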
 
     # Quantize and produce trie binary.
-    print('\nBuilding lm.binary ...')
-    binary_path = os.path.join(args.output_dir, 'lm.binary')
-    subprocess.check_call([
-        args.kenlm_bins + 'build_binary',
-        '-a', '255',
-        '-q', '8',
-        '-v',
-        'trie',
-        filtered_path,
-        binary_path
-    ])
+    print("\nBuilding lm.binary ...")
+    binary_path = os.path.join(args.output_dir, "lm.binary")
+    subprocess.check_call(
+        [
+            args.kenlm_bins + "build_binary",
+            "-a",
+            str(args.binary_a_bits),
+            "-q",
+            str(args.binary_q_bits),
+            "-v",
+            args.binary_type,
+            filtered_path,
+            binary_path,
+        ]
+    )
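The quantization and data-structure settings that used to be hard-coded (`-a 255`, `-q 8`, `trie`) are now taken from the new flags. With their defaults the build step still expands to the same command as before; a sketch with placeholder paths:

.. code-block:: bash

    /DeepSpeech/native_client/kenlm/build/bin/build_binary -a 255 -q 8 -v trie \
        path/lm/lm_filtered.arpa path/lm/lm.binary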
 
 
-# ======================================================================================================================
 
 def main():
     parser = argparse.ArgumentParser(
-        description='Generate an lm.binary and top-k vocab for DeepSpeech.'
+        description="Generate lm.binary and top-k vocab for DeepSpeech."
     )
     parser.add_argument(
-        '--input_txt',
-        help='File path to a .txt with sample sentences',
+        "--input_txt",
+        help="File path to a .txt with sample sentences",
         type=str,
-        required=True
+        required=True,
     )
     parser.add_argument(
-        '--output_dir',
-        help='Directory path for the output',
-        type=str,
-        required=True
+        "--output_dir", help="Directory path for the output", type=str, required=True
     )
     parser.add_argument(
-        '--top_k',
-        help='Use top_k most frequent words for the vocab.txt file',
+        "--top_k",
+        help="Use top_k most frequent words for the vocab.txt file",
         type=int,
-        default=500000
+        default=500000,
     )
     parser.add_argument(
-        '--kenlm_bins',
-        help='File path to the kenlm binaries lmplz, filter and build_binary',
+        "--kenlm_bins",
+        help="File path to the KENLM binaries lmplz, filter and build_binary",
         type=str,
-        default='/DeepSpeech/native_client/kenlm/build/bin/'
+        default="/DeepSpeech/native_client/kenlm/build/bin/",
     )
     parser.add_argument(
-        '--arpa_order',
-        help='Order of k-grams in arpa-file generation',
+        "--arpa_order",
+        help="Order of k-grams in ARPA-file generation",
         type=int,
-        default=5
+        default=5,
     )
     parser.add_argument(
-        '--max_arpa_memory',
-        help='Maximum allowed memory usage in arpa-file generation',
+        "--max_arpa_memory",
+        help="Maximum allowed memory usage for ARPA-file generation",
         type=str,
-        default='75%'
+        default="75%",
+    )
+    parser.add_argument(
+        "--arpa_prune",
+        help='ARPA pruning parameters. Separate values with "|"',
+        type=str,
+        default="0|0|1",
+    )
+    parser.add_argument(
+        "--binary_a_bits",
+        help="Build binary quantization value a in bits",
+        type=int,
+        default=255,
+    )
+    parser.add_argument(
+        "--binary_q_bits",
+        help="Build binary quantization value q in bits",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--binary_type",
+        help="Build binary data structure type",
+        type=str,
+        default="trie",
     )
     args = parser.parse_args()
 
@@ -136,12 +179,10 @@ def main():
     build_lm(args, data_lower, vocab_str)
 
     # Delete intermediate files
-    os.remove(os.path.join(args.output_dir, 'lower.txt'))
-    os.remove(os.path.join(args.output_dir, 'lm.arpa'))
-    os.remove(os.path.join(args.output_dir, 'lm_filtered.arpa'))
+    os.remove(os.path.join(args.output_dir, "lower.txt"))
+    os.remove(os.path.join(args.output_dir, "lm.arpa"))
+    os.remove(os.path.join(args.output_dir, "lm_filtered.arpa"))
 
 
-# ======================================================================================================================
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -16,13 +16,13 @@ from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet
 
 
 def create_bundle(
     alphabet_path,
     lm_path,
     vocab_path,
     package_path,
     force_utf8,
     default_alpha,
     default_beta,
 ):
     words = set()
     vocab_looks_char_based = True
@@ -112,27 +112,23 @@ def main():
         required=True,
         help="Path of vocabulary file. Must contain words separated by whitespace.",
     )
-    parser.add_argument(
-        "--package",
-        required=True,
-        help="Path to save scorer package."
-    )
+    parser.add_argument("--package", required=True, help="Path to save scorer package.")
     parser.add_argument(
         "--default_alpha",
         type=float,
-        default=0.75,
+        required=True,
         help="Default value of alpha hyperparameter.",
     )
     parser.add_argument(
         "--default_beta",
         type=float,
-        default=1.85,
+        required=True,
         help="Default value of beta hyperparameter.",
     )
     parser.add_argument(
         "--force_utf8",
         default="",
-        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. Using this wrong can result in a 'Segmentation fault' error at the model evaluation.",
+        help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://github.com/mozilla/DeepSpeech/blob/master/doc/Decoder.rst#utf-8-mode> for further explanation",
     )
     args = parser.parse_args()
 
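Because `--default_alpha` and `--default_beta` no longer have defaults, every call to `generate_package.py` now has to pass them explicitly, as the updated README example does. A minimal sketch (paths and values as in the README):

.. code-block:: bash

    python generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
        --package kenlm.scorer --default_alpha 0.75 --default_beta 1.85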