Rename --force_utf8 flag to --force_bytes_output_mode to avoid confusion
This commit is contained in:
parent
83a36b7a34
commit
09f0aa3d75
@ -62,7 +62,7 @@ At decoding time, the scorer is queried every time a Unicode codepoint is predic
|
|||||||
|
|
||||||
UTF-8 scorers can be built by using an input corpus with space-separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based model."
|
UTF-8 scorers can be built by using an input corpus with space-separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based model."
|
||||||
|
|
||||||
If the message "Doesn't look like a character based model." is printed, you should double-check your inputs to make sure they only contain single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_utf8`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.
|
If the message "Doesn't look like a character based model." is printed, you should double-check your inputs to make sure they only contain single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_bytes_output_mode`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED.
|
||||||
|
|
||||||
See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``.
|
See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``.
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ create_package(absl::optional<string> alphabet_path,
|
|||||||
string lm_path,
|
string lm_path,
|
||||||
string vocab_path,
|
string vocab_path,
|
||||||
string package_path,
|
string package_path,
|
||||||
absl::optional<bool> force_utf8,
|
absl::optional<bool> force_bytes_output_mode,
|
||||||
float default_alpha,
|
float default_alpha,
|
||||||
float default_beta)
|
float default_beta)
|
||||||
{
|
{
|
||||||
@ -43,27 +43,27 @@ create_package(absl::optional<string> alphabet_path,
|
|||||||
<< (vocab_looks_char_based ? "Looks" : "Doesn't look")
|
<< (vocab_looks_char_based ? "Looks" : "Doesn't look")
|
||||||
<< " like a character based (Bytes Are All You Need) model.\n";
|
<< " like a character based (Bytes Are All You Need) model.\n";
|
||||||
|
|
||||||
if (!force_utf8.has_value()) {
|
if (!force_bytes_output_mode.has_value()) {
|
||||||
force_utf8 = vocab_looks_char_based;
|
force_bytes_output_mode = vocab_looks_char_based;
|
||||||
cerr << "--force_utf8 was not specified, using value "
|
cerr << "--force_bytes_output_mode was not specified, using value "
|
||||||
<< "infered from vocabulary contents: "
|
<< "infered from vocabulary contents: "
|
||||||
<< (vocab_looks_char_based ? "true" : "false") << "\n";
|
<< (vocab_looks_char_based ? "true" : "false") << "\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!force_utf8.value() && !alphabet_path.has_value()) {
|
if (!force_bytes_output_mode.value() && !alphabet_path.has_value()) {
|
||||||
cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n";
|
cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n";
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer;
|
Scorer scorer;
|
||||||
if (force_utf8.value()) {
|
if (force_bytes_output_mode.value()) {
|
||||||
scorer.set_alphabet(UTF8Alphabet());
|
scorer.set_alphabet(UTF8Alphabet());
|
||||||
} else {
|
} else {
|
||||||
Alphabet alphabet;
|
Alphabet alphabet;
|
||||||
alphabet.init(alphabet_path->c_str());
|
alphabet.init(alphabet_path->c_str());
|
||||||
scorer.set_alphabet(alphabet);
|
scorer.set_alphabet(alphabet);
|
||||||
}
|
}
|
||||||
scorer.set_utf8_mode(force_utf8.value());
|
scorer.set_utf8_mode(force_bytes_output_mode.value());
|
||||||
scorer.reset_params(default_alpha, default_beta);
|
scorer.reset_params(default_alpha, default_beta);
|
||||||
int err = scorer.load_lm(lm_path);
|
int err = scorer.load_lm(lm_path);
|
||||||
if (err != DS_ERR_SCORER_NO_TRIE) {
|
if (err != DS_ERR_SCORER_NO_TRIE) {
|
||||||
@ -96,13 +96,13 @@ main(int argc, char** argv)
|
|||||||
po::options_description desc("Options");
|
po::options_description desc("Options");
|
||||||
desc.add_options()
|
desc.add_options()
|
||||||
("help", "show help message")
|
("help", "show help message")
|
||||||
("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.")
|
("alphabet", po::value<string>(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using bytes output mode.")
|
||||||
("lm", po::value<string>(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.")
|
("lm", po::value<string>(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.")
|
||||||
("vocab", po::value<string>(), "Path of vocabulary file. Must contain words separated by whitespace.")
|
("vocab", po::value<string>(), "Path of vocabulary file. Must contain words separated by whitespace.")
|
||||||
("package", po::value<string>(), "Path to save scorer package.")
|
("package", po::value<string>(), "Path to save scorer package.")
|
||||||
("default_alpha", po::value<float>(), "Default value of alpha hyperparameter (float).")
|
("default_alpha", po::value<float>(), "Default value of alpha hyperparameter (float).")
|
||||||
("default_beta", po::value<float>(), "Default value of beta hyperparameter (float).")
|
("default_beta", po::value<float>(), "Default value of beta hyperparameter (float).")
|
||||||
("force_utf8", po::value<bool>(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#utf-8-mode> for further explanation.")
|
("force_bytes_output_mode", po::value<bool>(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. See <https://deepspeech.readthedocs.io/en/master/Decoder.html#bytes-output-mode> for further explanation.")
|
||||||
;
|
;
|
||||||
|
|
||||||
po::variables_map vm;
|
po::variables_map vm;
|
||||||
@ -122,10 +122,10 @@ main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse optional --force_utf8
|
// Parse optional --force_bytes_output_mode
|
||||||
absl::optional<bool> force_utf8 = absl::nullopt;
|
absl::optional<bool> force_bytes_output_mode = absl::nullopt;
|
||||||
if (vm.count("force_utf8")) {
|
if (vm.count("force_bytes_output_mode")) {
|
||||||
force_utf8 = vm["force_utf8"].as<bool>();
|
force_bytes_output_mode = vm["force_bytes_output_mode"].as<bool>();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse optional --alphabet
|
// Parse optional --alphabet
|
||||||
@ -138,7 +138,7 @@ main(int argc, char** argv)
|
|||||||
vm["lm"].as<string>(),
|
vm["lm"].as<string>(),
|
||||||
vm["vocab"].as<string>(),
|
vm["vocab"].as<string>(),
|
||||||
vm["package"].as<string>(),
|
vm["package"].as<string>(),
|
||||||
force_utf8,
|
force_bytes_output_mode,
|
||||||
vm["default_alpha"].as<float>(),
|
vm["default_alpha"].as<float>(),
|
||||||
vm["default_beta"].as<float>());
|
vm["default_beta"].as<float>());
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user