diff --git a/doc/Decoder.rst b/doc/Decoder.rst index a491ba82..da974bc4 100644 --- a/doc/Decoder.rst +++ b/doc/Decoder.rst @@ -62,7 +62,7 @@ At decoding time, the scorer is queried every time a Unicode codepoint is predic UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable bytes output mode, and it should print the message "Looks like a character based model." -If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_utf8`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED. +If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. Bytes output mode can be forced by specifying the ``--force_bytes_output_mode`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED. See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``. diff --git a/native_client/generate_scorer_package.cpp b/native_client/generate_scorer_package.cpp index 4486b42c..1576a744 100644 --- a/native_client/generate_scorer_package.cpp +++ b/native_client/generate_scorer_package.cpp @@ -20,7 +20,7 @@ create_package(absl::optional alphabet_path, string lm_path, string vocab_path, string package_path, - absl::optional force_utf8, + absl::optional force_bytes_output_mode, float default_alpha, float default_beta) { @@ -43,27 +43,27 @@ create_package(absl::optional alphabet_path, << (vocab_looks_char_based ? "Looks" : "Doesn't look") << " like a character based (Bytes Are All You Need) model.\n"; - if (!force_utf8.has_value()) { - force_utf8 = vocab_looks_char_based; - cerr << "--force_utf8 was not specified, using value " + if (!force_bytes_output_mode.has_value()) { + force_bytes_output_mode = vocab_looks_char_based; + cerr << "--force_bytes_output_mode was not specified, using value " << "infered from vocabulary contents: " << (vocab_looks_char_based ? "true" : "false") << "\n"; } - if (!force_utf8.value() && !alphabet_path.has_value()) { + if (!force_bytes_output_mode.value() && !alphabet_path.has_value()) { cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n"; return 1; } Scorer scorer; - if (force_utf8.value()) { + if (force_bytes_output_mode.value()) { scorer.set_alphabet(UTF8Alphabet()); } else { Alphabet alphabet; alphabet.init(alphabet_path->c_str()); scorer.set_alphabet(alphabet); } - scorer.set_utf8_mode(force_utf8.value()); + scorer.set_utf8_mode(force_bytes_output_mode.value()); scorer.reset_params(default_alpha, default_beta); int err = scorer.load_lm(lm_path); if (err != DS_ERR_SCORER_NO_TRIE) { @@ -96,13 +96,13 @@ main(int argc, char** argv) po::options_description desc("Options"); desc.add_options() ("help", "show help message") - ("alphabet", po::value(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.") + ("alphabet", po::value(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using bytes output mode.") ("lm", po::value(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.") ("vocab", po::value(), "Path of vocabulary file. Must contain words separated by whitespace.") ("package", po::value(), "Path to save scorer package.") ("default_alpha", po::value(), "Default value of alpha hyperparameter (float).") ("default_beta", po::value(), "Default value of beta hyperparameter (float).") - ("force_utf8", po::value(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See for further explanation.") + ("force_bytes_output_mode", po::value(), "Boolean flag, force set or unset bytes output mode in the scorer package. If not set, infers from the vocabulary. See for further explanation.") ; po::variables_map vm; @@ -122,10 +122,10 @@ main(int argc, char** argv) } } - // Parse optional --force_utf8 - absl::optional force_utf8 = absl::nullopt; - if (vm.count("force_utf8")) { - force_utf8 = vm["force_utf8"].as(); + // Parse optional --force_bytes_output_mode + absl::optional force_bytes_output_mode = absl::nullopt; + if (vm.count("force_bytes_output_mode")) { + force_bytes_output_mode = vm["force_bytes_output_mode"].as(); } // Parse optional --alphabet @@ -138,7 +138,7 @@ main(int argc, char** argv) vm["lm"].as(), vm["vocab"].as(), vm["package"].as(), - force_utf8, + force_bytes_output_mode, vm["default_alpha"].as(), vm["default_beta"].as());