From d65422c8ab4a4ec9261ac42892483b0117ecb490 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 15 Jan 2020 23:25:59 +0100 Subject: [PATCH] Update KenLM to b9f35777d112ce2fc10bd3986302517a16dc3883 --- native_client/kenlm/.gitignore | 3 + native_client/kenlm/BUILDING | 4 + native_client/kenlm/GIT_REVISION | 2 +- native_client/kenlm/README.md | 16 +- native_client/kenlm/README.mozilla | 22 +- native_client/kenlm/lm/build_binary_main.cc | 9 +- native_client/kenlm/lm/max_order.hh | 2 +- native_client/kenlm/lm/query_main.cc | 4 +- native_client/kenlm/lm/read_arpa.cc | 24 +- native_client/kenlm/lm/vocab.cc | 4 +- native_client/kenlm/lm/vocab.hh | 6 +- native_client/kenlm/setup.py | 24 +- native_client/kenlm/util/bit_packing.hh | 4 +- .../util/double-conversion/bignum-dtoa.cc | 13 +- .../kenlm/util/double-conversion/bignum.cc | 26 ++- .../kenlm/util/double-conversion/bignum.h | 1 - .../util/double-conversion/cached-powers.cc | 10 +- .../kenlm/util/double-conversion/diy-fp.h | 22 +- .../double-conversion/double-conversion.cc | 217 +++++++++++++----- .../double-conversion/double-conversion.h | 33 +-- .../kenlm/util/double-conversion/fast-dtoa.cc | 19 +- .../util/double-conversion/fixed-dtoa.cc | 19 +- .../kenlm/util/double-conversion/ieee.h | 6 +- .../kenlm/util/double-conversion/strtod.cc | 13 +- .../kenlm/util/double-conversion/utils.h | 58 +++-- native_client/kenlm/util/exception.hh | 2 +- native_client/kenlm/util/file.cc | 2 +- native_client/kenlm/util/file_piece.hh | 2 +- native_client/kenlm/util/mmap.cc | 2 +- .../kenlm/util/probing_hash_table.hh | 4 +- native_client/kenlm/util/tokenize_piece.hh | 32 ++- 31 files changed, 386 insertions(+), 219 deletions(-) diff --git a/native_client/kenlm/.gitignore b/native_client/kenlm/.gitignore index 2e28eaf4..c921fff8 100644 --- a/native_client/kenlm/.gitignore +++ b/native_client/kenlm/.gitignore @@ -3,6 +3,9 @@ util/file_piece.cc.gz *.o doc/ build/ +/bin +/lib +/tests ._* windows/Win32 windows/x64 diff --git 
a/native_client/kenlm/BUILDING b/native_client/kenlm/BUILDING index f6a10812..da36b87f 100644 --- a/native_client/kenlm/BUILDING +++ b/native_client/kenlm/BUILDING @@ -12,3 +12,7 @@ If you only want the query code and do not care about compression (.gz, .bz2, an Windows: The windows directory has visual studio files. Note that you need to compile the kenlm project before build_binary and ngram_query projects. + +OSX: + Missing dependencies can be remedied with brew. + brew install cmake boost eigen diff --git a/native_client/kenlm/GIT_REVISION b/native_client/kenlm/GIT_REVISION index 36ed3dfd..d2243f52 100644 --- a/native_client/kenlm/GIT_REVISION +++ b/native_client/kenlm/GIT_REVISION @@ -1 +1 @@ -cdd794598ea15dc23a7daaf7a8cf89423c97f7e6 +b9f35777d112ce2fc10bd3986302517a16dc3883 diff --git a/native_client/kenlm/README.md b/native_client/kenlm/README.md index 2cef6588..45965c03 100644 --- a/native_client/kenlm/README.md +++ b/native_client/kenlm/README.md @@ -2,9 +2,9 @@ Language model inference code by Kenneth Heafield (kenlm at kheafield.com) -I do development in master on https://github.com/kpu/kenlm/. Normally, it works, but I do not guarantee it will compile, give correct answers, or generate non-broken binary files. For a more stable release, get http://kheafield.com/code/kenlm.tar.gz . +I do development in master on https://github.com/kpu/kenlm/. Normally, it works, but I do not guarantee it will compile, give correct answers, or generate non-broken binary files. For a more stable release, get https://kheafield.com/code/kenlm.tar.gz . -The website http://kheafield.com/code/kenlm/ has more documentation. If you're a decoder developer, please download the latest version from there instead of copying from another decoder. +The website https://kheafield.com/code/kenlm/ has more documentation. If you're a decoder developer, please download the latest version from there instead of copying from another decoder. 
## Compiling Use cmake, see [BUILDING](BUILDING) for more detail. @@ -33,7 +33,7 @@ lmplz estimates unpruned language models with modified Kneser-Ney smoothing. Af ```bash bin/lmplz -o 5 text.arpa ``` -The algorithm is on-disk, using an amount of memory that you specify. See http://kheafield.com/code/kenlm/estimation/ for more. +The algorithm is on-disk, using an amount of memory that you specify. See https://kheafield.com/code/kenlm/estimation/ for more. MT Marathon 2012 team members Ivan Pouzyrevsky and Mohammed Mediani contributed to the computation design and early implementation. Jon Clark contributed to the design, clarified points about smoothing, and added logging. @@ -43,15 +43,15 @@ filter takes an ARPA or count file and removes entries that will never be querie ```bash bin/filter ``` -and see http://kheafield.com/code/kenlm/filter/ for more documentation. +and see https://kheafield.com/code/kenlm/filter/ for more documentation. ## Querying -Two data structures are supported: probing and trie. Probing is a probing hash table with keys that are 64-bit hashes of n-grams and floats as values. Trie is a fairly standard trie but with bit-level packing so it uses the minimum number of bits to store word indices and pointers. The trie node entries are sorted by word index. Probing is the fastest and uses the most memory. Trie uses the least memory and a bit slower. +Two data structures are supported: probing and trie. Probing is a probing hash table with keys that are 64-bit hashes of n-grams and floats as values. Trie is a fairly standard trie but with bit-level packing so it uses the minimum number of bits to store word indices and pointers. The trie node entries are sorted by word index. Probing is the fastest and uses the most memory. Trie uses the least memory and is a bit slower. As is the custom in language modeling, all probabilities are log base 10. -With trie, resident memory is 58% of IRST's smallest version and 21% of SRI's compact version. 
Simultaneously, trie CPU's use is 81% of IRST's fastest version and 84% of SRI's fast version. KenLM's probing hash table implementation goes even faster at the expense of using more memory. See http://kheafield.com/code/kenlm/benchmark/. +With trie, resident memory is 58% of IRST's smallest version and 21% of SRI's compact version. Simultaneously, trie CPU's use is 81% of IRST's fastest version and 84% of SRI's fast version. KenLM's probing hash table implementation goes even faster at the expense of using more memory. See https://kheafield.com/code/kenlm/benchmark/. Binary format via mmap is supported. Run `./build_binary` to make one then pass the binary file name to the appropriate Model constructor. @@ -71,7 +71,7 @@ Hideo Okuma and Tomoyuki Yoshimura from NICT contributed ports to ARM and MinGW. - Select the macros you want, listed in the previous section. -- There are two build systems: compile.sh and Jamroot+Jamfile. They're pretty simple and are intended to be reimplemented in your build system. +- There are two build systems: compile.sh and cmake. They're pretty simple and are intended to be reimplemented in your build system. - Use either the interface in `lm/model.hh` or `lm/virtual_interface.hh`. Interface documentation is in comments of `lm/virtual_interface.hh` and `lm/model.hh`. @@ -101,4 +101,4 @@ See [python/example.py](python/example.py) and [python/kenlm.pyx](python/kenlm.p --- -The name was Hieu Hoang's idea, not mine. +The name was Hieu Hoang's idea, not mine. 
diff --git a/native_client/kenlm/README.mozilla b/native_client/kenlm/README.mozilla index 7bad32fd..f5badcbb 100644 --- a/native_client/kenlm/README.mozilla +++ b/native_client/kenlm/README.mozilla @@ -1,7 +1,7 @@ -KenLM source downloaded from http://kheafield.com/code/kenlm.tar.gz on 2017/08/05 -sha256 c4c9f587048470c9a6a592914f0609a71fbb959f0a4cad371e8c355ce81f7c6b +KenLM source downloaded from https://github.com/kpu/kenlm on 2020/01/15 +commit b9f35777d112ce2fc10bd3986302517a16dc3883 -This corresponds to https://github.com/kpu/kenlm/commit/cdd794598ea15dc23a7daaf7a8cf89423c97f7e6 +This corresponds to https://github.com/kpu/kenlm/commit/b9f35777d112ce2fc10bd3986302517a16dc3883 The following procedure was run to remove unneeded files: @@ -10,19 +10,3 @@ rm -rf windows include lm/filter lm/builder util/stream util/getopt.* python This was done in order to ensure uniqueness of double_conversion: git grep 'double_conversion' | cut -d':' -f1 | sort | uniq | xargs sed -ri 's/double_conversion/kenlm_double_conversion/g' - -Please apply this patch to be able to build on Android: -diff --git a/native_client/kenlm/util/file.cc b/native_client/kenlm/util/file.cc -index d53dc0a..b5e36b2 100644 ---- a/native_client/kenlm/util/file.cc -+++ b/native_client/kenlm/util/file.cc -@@ -540,7 +540,7 @@ std::string DefaultTempDirectory() { - const char *const vars[] = {"TMPDIR", "TMP", "TEMPDIR", "TEMP", 0}; - for (int i=0; vars[i]; ++i) { - char *val = --#if defined(_GNU_SOURCE) -+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) - #if __GLIBC_PREREQ(2,17) - secure_getenv - #else // __GLIBC_PREREQ - diff --git a/native_client/kenlm/lm/build_binary_main.cc b/native_client/kenlm/lm/build_binary_main.cc index 35206e60..cd377b03 100644 --- a/native_client/kenlm/lm/build_binary_main.cc +++ b/native_client/kenlm/lm/build_binary_main.cc @@ -10,7 +10,6 @@ #include #include #include -#include #ifdef WIN32 #include "util/getopt.hh" @@ -23,11 +22,12 @@ namespace ngram { namespace { void 
Usage(const char *name, const char *default_mem) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-v] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" " Default is -100. The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have <s> and </s>.\n" "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-v disables inclusion of the vocabulary in the binary file.\n" "-w mmap|after determines how writing is done.\n" " mmap maps the binary file and writes to it. Default for trie.\n" " after allocates anonymous memory, builds, and writes. 
Default for probing.\n" @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { lm::ngram::Config config; config.building_memory = util::ParseSize(default_mem); int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:vh")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -165,6 +165,9 @@ int main(int argc, char *argv[]) { ParseFileList(optarg, config.rest_lower_files); config.rest_function = Config::REST_LOWER; break; + case 'v': + config.include_vocab = false; + break; case 'h': // help default: Usage(argv[0], default_mem); diff --git a/native_client/kenlm/lm/max_order.hh b/native_client/kenlm/lm/max_order.hh index 0ad1379e..4e28031a 100644 --- a/native_client/kenlm/lm/max_order.hh +++ b/native_client/kenlm/lm/max_order.hh @@ -7,7 +7,7 @@ * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead */ #ifndef KENLM_ORDER_MESSAGE -#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." +#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. With cmake:\n cmake -DKENLM_MAX_ORDER=10 ..\nWith Moses:\n bjam --max-kenlm-order=10 -a\nOtherwise, edit lm/max_order.hh." #endif #endif // LM_MAX_ORDER_H diff --git a/native_client/kenlm/lm/query_main.cc b/native_client/kenlm/lm/query_main.cc index f3ca6e61..76466030 100644 --- a/native_client/kenlm/lm/query_main.cc +++ b/native_client/kenlm/lm/query_main.cc @@ -19,8 +19,8 @@ void Usage(const char *name) { "Each word in the output is formatted as:\n" " word=vocab_id ngram_length log10(p(word|context))\n" "where ngram_length is the length of n-gram matched. A vocab_id of 0 indicates\n" - "indicates the unknown word. 
Sentence-level output includes log10 probability of\n" - "the sentence and OOV count.\n"; + "the unknown word. Sentence-level output includes log10 probability of the\n" + "sentence and OOV count.\n"; exit(1); } diff --git a/native_client/kenlm/lm/read_arpa.cc b/native_client/kenlm/lm/read_arpa.cc index dc05a653..6ee9bfb2 100644 --- a/native_client/kenlm/lm/read_arpa.cc +++ b/native_client/kenlm/lm/read_arpa.cc @@ -19,8 +19,8 @@ namespace lm { -// 1 for '\t', '\n', and ' '. This is stricter than isspace. -const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +// 1 for '\t', '\n', '\r', and ' '. This is stricter than isspace. Apparently ARPA allows vertical tab inside a word. 
+const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; namespace { @@ -85,6 +85,11 @@ void ReadNGramHeader(util::FilePiece &in, unsigned int length) { if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead"); } +void ConsumeNewline(util::FilePiece &in) { + char follow = in.get(); + UTIL_THROW_IF('\n' != follow, FormatLoadException, "Expected newline got '" << follow << "'"); +} + void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { switch (in.get()) { case '\t': @@ -94,6 +99,9 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff"); } break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. case '\n': break; default: @@ -120,8 +128,18 @@ void ReadBackoff(util::FilePiece &in, float &backoff) { UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); #endif } - UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff"); + switch (char got = in.get()) { + case '\r': + ConsumeNewline(in); + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected newline after backoffs, got " << got); + } break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. 
case '\n': backoff = ngram::kNoExtensionBackoff; break; diff --git a/native_client/kenlm/lm/vocab.cc b/native_client/kenlm/lm/vocab.cc index 5df5ca27..7996ec7e 100644 --- a/native_client/kenlm/lm/vocab.cc +++ b/native_client/kenlm/lm/vocab.cc @@ -282,7 +282,7 @@ void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to if (have_words) ReadWords(fd, to, bound_, offset); } -void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { +void MissingUnknown(const Config &config) { switch(config.unknown_missing) { case SILENT: return; @@ -294,7 +294,7 @@ void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { } } -void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException) { +void MissingSentenceMarker(const Config &config, const char *str) { switch (config.sentence_marker_missing) { case SILENT: return; diff --git a/native_client/kenlm/lm/vocab.hh b/native_client/kenlm/lm/vocab.hh index 99c0aa83..f36e62ca 100644 --- a/native_client/kenlm/lm/vocab.hh +++ b/native_client/kenlm/lm/vocab.hh @@ -207,10 +207,10 @@ class ProbingVocabulary : public base::Vocabulary { detail::ProbingVocabularyHeader *header_; }; -void MissingUnknown(const Config &config) throw(SpecialWordMissingException); -void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException); +void MissingUnknown(const Config &config); +void MissingSentenceMarker(const Config &config, const char *str); -template void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) { +template void CheckSpecials(const Config &config, const Vocab &vocab) { if (!vocab.SawUnk()) MissingUnknown(config); if (vocab.BeginSentence() == vocab.NotFound()) MissingSentenceMarker(config, ""); if (vocab.EndSentence() == vocab.NotFound()) MissingSentenceMarker(config, ""); diff --git a/native_client/kenlm/setup.py b/native_client/kenlm/setup.py index 
9d40c019..9e0f0d15 100644 --- a/native_client/kenlm/setup.py +++ b/native_client/kenlm/setup.py @@ -2,6 +2,8 @@ from setuptools import setup, Extension import glob import platform import os +import sys +import re #Does gcc compile with this header and library? def compile_test(header, library): @@ -9,16 +11,28 @@ def compile_test(header, library): command = "bash -c \"g++ -include " + header + " -l" + library + " -x c++ - <<<'int main() {}' -o " + dummy_path + " >/dev/null 2>/dev/null && rm " + dummy_path + " 2>/dev/null\"" return os.system(command) == 0 +max_order = "6" +is_max_order = [s for s in sys.argv if "--max_order" in s] +for element in is_max_order: + max_order = re.split('[= ]',element)[1] + sys.argv.remove(element) -FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc') +FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc') + glob.glob('python/*.cc') FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))] -LIBS = ['stdc++'] -if platform.system() != 'Darwin': - LIBS.append('rt') +if platform.system() == 'Linux': + LIBS = ['stdc++', 'rt'] +elif platform.system() == 'Darwin': + LIBS = ['c++'] +else: + LIBS = [] #We don't need -std=c++11 but python seems to be compiled with it now. 
https://github.com/kpu/kenlm/issues/86 -ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11'] +ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER='+max_order, '-std=c++11'] + +#Attempted fix to https://github.com/kpu/kenlm/issues/186 and https://github.com/kpu/kenlm/issues/197 +if platform.system() == 'Darwin': + ARGS += ["-stdlib=libc++", "-mmacosx-version-min=10.7"] if compile_test('zlib.h', 'z'): ARGS.append('-DHAVE_ZLIB') diff --git a/native_client/kenlm/util/bit_packing.hh b/native_client/kenlm/util/bit_packing.hh index b24fd9c1..77abc0df 100644 --- a/native_client/kenlm/util/bit_packing.hh +++ b/native_client/kenlm/util/bit_packing.hh @@ -108,7 +108,7 @@ typedef union { float f; uint32_t i; } FloatEnc; inline float ReadFloat32(const void *base, uint64_t bit_off) { FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32)); return encoded.f; } inline void WriteFloat32(void *base, uint64_t bit_off, float value) { @@ -135,7 +135,7 @@ inline void UnsetSign(float &to) { inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31)); // Sign bit set means negative. encoded.i |= kSignBit; return encoded.f; diff --git a/native_client/kenlm/util/double-conversion/bignum-dtoa.cc b/native_client/kenlm/util/double-conversion/bignum-dtoa.cc index 4825888d..a687b90a 100644 --- a/native_client/kenlm/util/double-conversion/bignum-dtoa.cc +++ b/native_client/kenlm/util/double-conversion/bignum-dtoa.cc @@ -25,7 +25,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#include +#include #include "bignum-dtoa.h" @@ -192,13 +192,13 @@ static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, delta_plus = delta_minus; } *length = 0; - while (true) { + for (;;) { uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. - buffer[(*length)++] = digit + '0'; + buffer[(*length)++] = static_cast(digit + '0'); // Can we stop already? // If the remainder of the division is less than the distance to the lower @@ -282,7 +282,7 @@ static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, // exponent (decimal_point), when rounding upwards. static void GenerateCountedDigits(int count, int* decimal_point, Bignum* numerator, Bignum* denominator, - Vector(buffer), int* length) { + Vector buffer, int* length) { ASSERT(count >= 0); for (int i = 0; i < count - 1; ++i) { uint16_t digit; @@ -290,7 +290,7 @@ static void GenerateCountedDigits(int count, int* decimal_point, ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. - buffer[i] = digit + '0'; + buffer[i] = static_cast(digit + '0'); // Prepare for next iteration. numerator->Times10(); } @@ -300,7 +300,8 @@ static void GenerateCountedDigits(int count, int* decimal_point, if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { digit++; } - buffer[count - 1] = digit + '0'; + ASSERT(digit <= 10); + buffer[count - 1] = static_cast(digit + '0'); // Correct bad digits (in case we had a sequence of '9's). Propagate the // carry until we hat a non-'9' or til we reach the first digit. 
for (int i = count - 1; i > 0; --i) { diff --git a/native_client/kenlm/util/double-conversion/bignum.cc b/native_client/kenlm/util/double-conversion/bignum.cc index 3ff99d36..cbcc4ea2 100644 --- a/native_client/kenlm/util/double-conversion/bignum.cc +++ b/native_client/kenlm/util/double-conversion/bignum.cc @@ -40,6 +40,7 @@ Bignum::Bignum() template static int BitSize(S value) { + (void) value; // Mark variable as used. return 8 * sizeof(value); } @@ -103,7 +104,7 @@ void Bignum::AssignDecimalString(Vector value) { const int kMaxUint64DecimalDigits = 19; Zero(); int length = value.length(); - int pos = 0; + unsigned int pos = 0; // Let's just say that each digit needs 4 bits. while (length >= kMaxUint64DecimalDigits) { uint64_t digits = ReadUInt64(value, pos, kMaxUint64DecimalDigits); @@ -122,9 +123,8 @@ void Bignum::AssignDecimalString(Vector value) { static int HexCharValue(char c) { if ('0' <= c && c <= '9') return c - '0'; if ('a' <= c && c <= 'f') return 10 + c - 'a'; - if ('A' <= c && c <= 'F') return 10 + c - 'A'; - UNREACHABLE(); - return 0; // To make compiler happy. + ASSERT('A' <= c && c <= 'F'); + return 10 + c - 'A'; } @@ -501,13 +501,14 @@ uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) { // Start by removing multiples of 'other' until both numbers have the same // number of digits. while (BigitLength() > other.BigitLength()) { - // This naive approach is extremely inefficient if the this divided other - // might be big. This function is implemented for doubleToString where + // This naive approach is extremely inefficient if `this` divided by other + // is big. This function is implemented for doubleToString where // the result should be small (less than 10). ASSERT(other.bigits_[other.used_digits_ - 1] >= ((1 << kBigitSize) / 16)); + ASSERT(bigits_[used_digits_ - 1] < 0x10000); // Remove the multiples of the first digit. // Example this = 23 and other equals 9. -> Remove 2 multiples. 
- result += bigits_[used_digits_ - 1]; + result += static_cast(bigits_[used_digits_ - 1]); SubtractTimes(other, bigits_[used_digits_ - 1]); } @@ -523,13 +524,15 @@ uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) { // Shortcut for easy (and common) case. int quotient = this_bigit / other_bigit; bigits_[used_digits_ - 1] = this_bigit - other_bigit * quotient; - result += quotient; + ASSERT(quotient < 0x10000); + result += static_cast(quotient); Clamp(); return result; } int division_estimate = this_bigit / (other_bigit + 1); - result += division_estimate; + ASSERT(division_estimate < 0x10000); + result += static_cast(division_estimate); SubtractTimes(other, division_estimate); if (other_bigit * (division_estimate + 1) > this_bigit) { @@ -560,8 +563,8 @@ static int SizeInHexChars(S number) { static char HexCharOfValue(int value) { ASSERT(0 <= value && value <= 16); - if (value < 10) return value + '0'; - return value - 10 + 'A'; + if (value < 10) return static_cast(value + '0'); + return static_cast(value - 10 + 'A'); } @@ -755,7 +758,6 @@ void Bignum::SubtractTimes(const Bignum& other, int factor) { Chunk difference = bigits_[i] - borrow; bigits_[i] = difference & kBigitMask; borrow = difference >> (kChunkSize - 1); - ++i; } Clamp(); } diff --git a/native_client/kenlm/util/double-conversion/bignum.h b/native_client/kenlm/util/double-conversion/bignum.h index 03a20601..553189f7 100644 --- a/native_client/kenlm/util/double-conversion/bignum.h +++ b/native_client/kenlm/util/double-conversion/bignum.h @@ -49,7 +49,6 @@ class Bignum { void AssignPowerUInt16(uint16_t base, int exponent); - void AddUInt16(uint16_t operand); void AddUInt64(uint64_t operand); void AddBignum(const Bignum& other); // Precondition: this >= other. 
diff --git a/native_client/kenlm/util/double-conversion/cached-powers.cc b/native_client/kenlm/util/double-conversion/cached-powers.cc index e61d7f34..e186bba6 100644 --- a/native_client/kenlm/util/double-conversion/cached-powers.cc +++ b/native_client/kenlm/util/double-conversion/cached-powers.cc @@ -25,9 +25,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include -#include -#include +#include +#include +#include #include "utils.h" @@ -131,7 +131,6 @@ static const CachedPower kCachedPowers[] = { {UINT64_2PART_C(0xaf87023b, 9bf0ee6b), 1066, 340}, }; -static const int kCachedPowersLength = ARRAY_SIZE(kCachedPowers); static const int kCachedPowersOffset = 348; // -1 * the first decimal_exponent. static const double kD_1_LOG2_10 = 0.30102999566398114; // 1 / lg(10) // Difference between the decimal exponents in the table above. @@ -149,9 +148,10 @@ void PowersOfTenCache::GetCachedPowerForBinaryExponentRange( int foo = kCachedPowersOffset; int index = (foo + static_cast(k) - 1) / kDecimalExponentDistance + 1; - ASSERT(0 <= index && index < kCachedPowersLength); + ASSERT(0 <= index && index < static_cast(ARRAY_SIZE(kCachedPowers))); CachedPower cached_power = kCachedPowers[index]; ASSERT(min_exponent <= cached_power.binary_exponent); + (void) max_exponent; // Mark variable as used. 
ASSERT(cached_power.binary_exponent <= max_exponent); *decimal_exponent = cached_power.decimal_exponent; *power = DiyFp(cached_power.significand, cached_power.binary_exponent); diff --git a/native_client/kenlm/util/double-conversion/diy-fp.h b/native_client/kenlm/util/double-conversion/diy-fp.h index 71552b9b..6495d1d9 100644 --- a/native_client/kenlm/util/double-conversion/diy-fp.h +++ b/native_client/kenlm/util/double-conversion/diy-fp.h @@ -42,7 +42,7 @@ class DiyFp { static const int kSignificandSize = 64; DiyFp() : f_(0), e_(0) {} - DiyFp(uint64_t f, int e) : f_(f), e_(e) {} + DiyFp(uint64_t significand, int exponent) : f_(significand), e_(exponent) {} // this = this - other. // The exponents of both numbers must be the same and the significand of this @@ -76,22 +76,22 @@ class DiyFp { void Normalize() { ASSERT(f_ != 0); - uint64_t f = f_; - int e = e_; + uint64_t significand = f_; + int exponent = e_; // This method is mainly called for normalizing boundaries. In general // boundaries need to be shifted by 10 bits. We thus optimize for this case. const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000); - while ((f & k10MSBits) == 0) { - f <<= 10; - e -= 10; + while ((significand & k10MSBits) == 0) { + significand <<= 10; + exponent -= 10; } - while ((f & kUint64MSB) == 0) { - f <<= 1; - e--; + while ((significand & kUint64MSB) == 0) { + significand <<= 1; + exponent--; } - f_ = f; - e_ = e; + f_ = significand; + e_ = exponent; } static DiyFp Normalize(const DiyFp& a) { diff --git a/native_client/kenlm/util/double-conversion/double-conversion.cc b/native_client/kenlm/util/double-conversion/double-conversion.cc index 115fe16f..be5cf75f 100644 --- a/native_client/kenlm/util/double-conversion/double-conversion.cc +++ b/native_client/kenlm/util/double-conversion/double-conversion.cc @@ -25,8 +25,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#include -#include +#include +#include #include "double-conversion.h" @@ -118,7 +118,7 @@ void DoubleToStringConverter::CreateDecimalRepresentation( StringBuilder* result_builder) const { // Create a representation that is padded with zeros if needed. if (decimal_point <= 0) { - // "0.00000decimal_rep". + // "0.00000decimal_rep" or "0.000decimal_rep00". result_builder->AddCharacter('0'); if (digits_after_point > 0) { result_builder->AddCharacter('.'); @@ -129,7 +129,7 @@ void DoubleToStringConverter::CreateDecimalRepresentation( result_builder->AddPadding('0', remaining_digits); } } else if (decimal_point >= length) { - // "decimal_rep0000.00000" or "decimal_rep.0000" + // "decimal_rep0000.00000" or "decimal_rep.0000". result_builder->AddSubstring(decimal_digits, length); result_builder->AddPadding('0', decimal_point - length); if (digits_after_point > 0) { @@ -137,7 +137,7 @@ void DoubleToStringConverter::CreateDecimalRepresentation( result_builder->AddPadding('0', digits_after_point); } } else { - // "decima.l_rep000" + // "decima.l_rep000". ASSERT(digits_after_point > 0); result_builder->AddSubstring(decimal_digits, decimal_point); result_builder->AddCharacter('.'); @@ -348,7 +348,6 @@ static BignumDtoaMode DtoaToBignumDtoaMode( case DoubleToStringConverter::PRECISION: return BIGNUM_DTOA_PRECISION; default: UNREACHABLE(); - return BIGNUM_DTOA_SHORTEST; // To silence compiler. } } @@ -403,8 +402,8 @@ void DoubleToStringConverter::DoubleToAscii(double v, vector, length, point); break; default: - UNREACHABLE(); fast_worked = false; + UNREACHABLE(); } if (fast_worked) return; @@ -417,8 +416,9 @@ void DoubleToStringConverter::DoubleToAscii(double v, // Consumes the given substring from the iterator. // Returns false, if the substring does not match. 
-static bool ConsumeSubString(const char** current, - const char* end, +template +static bool ConsumeSubString(Iterator* current, + Iterator end, const char* substring) { ASSERT(**current == *substring); for (substring++; *substring != '\0'; substring++) { @@ -440,10 +440,36 @@ static bool ConsumeSubString(const char** current, const int kMaxSignificantDigits = 772; +static const char kWhitespaceTable7[] = { 32, 13, 10, 9, 11, 12 }; +static const int kWhitespaceTable7Length = ARRAY_SIZE(kWhitespaceTable7); + + +static const uc16 kWhitespaceTable16[] = { + 160, 8232, 8233, 5760, 6158, 8192, 8193, 8194, 8195, + 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8239, 8287, 12288, 65279 +}; +static const int kWhitespaceTable16Length = ARRAY_SIZE(kWhitespaceTable16); + + +static bool isWhitespace(int x) { + if (x < 128) { + for (int i = 0; i < kWhitespaceTable7Length; i++) { + if (kWhitespaceTable7[i] == x) return true; + } + } else { + for (int i = 0; i < kWhitespaceTable16Length; i++) { + if (kWhitespaceTable16[i] == x) return true; + } + } + return false; +} + + // Returns true if a nonspace found and false if the end has reached. -static inline bool AdvanceToNonspace(const char** current, const char* end) { +template +static inline bool AdvanceToNonspace(Iterator* current, Iterator end) { while (*current != end) { - if (**current != ' ') return true; + if (!isWhitespace(**current)) return true; ++*current; } return false; @@ -462,26 +488,57 @@ static double SignedZero(bool sign) { } +// Returns true if 'c' is a decimal digit that is valid for the given radix. +// +// The function is small and could be inlined, but VS2012 emitted a warning +// because it constant-propagated the radix and concluded that the last +// condition was always true. By moving it into a separate function the +// compiler wouldn't warn anymore. 
+#if _MSC_VER +#pragma optimize("",off) +static bool IsDecimalDigitForRadix(int c, int radix) { + return '0' <= c && c <= '9' && (c - '0') < radix; +} +#pragma optimize("",on) +#else +static bool inline IsDecimalDigitForRadix(int c, int radix) { + return '0' <= c && c <= '9' && (c - '0') < radix; +} +#endif +// Returns true if 'c' is a character digit that is valid for the given radix. +// The 'a_character' should be 'a' or 'A'. +// +// The function is small and could be inlined, but VS2012 emitted a warning +// because it constant-propagated the radix and concluded that the first +// condition was always false. By moving it into a separate function the +// compiler wouldn't warn anymore. +static bool IsCharacterDigitForRadix(int c, int radix, char a_character) { + return radix > 10 && c >= a_character && c < a_character + radix - 10; +} + + // Parsing integers with radix 2, 4, 8, 16, 32. Assumes current != end. -template -static double RadixStringToIeee(const char* current, - const char* end, +template +static double RadixStringToIeee(Iterator* current, + Iterator end, bool sign, bool allow_trailing_junk, double junk_string_value, bool read_as_double, - const char** trailing_pointer) { - ASSERT(current != end); + bool* result_is_junk) { + ASSERT(*current != end); const int kDoubleSize = Double::kSignificandSize; const int kSingleSize = Single::kSignificandSize; const int kSignificandSize = read_as_double? kDoubleSize: kSingleSize; + *result_is_junk = true; + // Skip leading 0s. 
- while (*current == '0') { - ++current; - if (current == end) { - *trailing_pointer = end; + while (**current == '0') { + ++(*current); + if (*current == end) { + *result_is_junk = false; return SignedZero(sign); } } @@ -492,14 +549,14 @@ static double RadixStringToIeee(const char* current, do { int digit; - if (*current >= '0' && *current <= '9' && *current < '0' + radix) { - digit = static_cast(*current) - '0'; - } else if (radix > 10 && *current >= 'a' && *current < 'a' + radix - 10) { - digit = static_cast(*current) - 'a' + 10; - } else if (radix > 10 && *current >= 'A' && *current < 'A' + radix - 10) { - digit = static_cast(*current) - 'A' + 10; + if (IsDecimalDigitForRadix(**current, radix)) { + digit = static_cast(**current) - '0'; + } else if (IsCharacterDigitForRadix(**current, radix, 'a')) { + digit = static_cast(**current) - 'a' + 10; + } else if (IsCharacterDigitForRadix(**current, radix, 'A')) { + digit = static_cast(**current) - 'A' + 10; } else { - if (allow_trailing_junk || !AdvanceToNonspace(¤t, end)) { + if (allow_trailing_junk || !AdvanceToNonspace(current, end)) { break; } else { return junk_string_value; @@ -523,14 +580,14 @@ static double RadixStringToIeee(const char* current, exponent = overflow_bits_count; bool zero_tail = true; - while (true) { - ++current; - if (current == end || !isDigit(*current, radix)) break; - zero_tail = zero_tail && *current == '0'; + for (;;) { + ++(*current); + if (*current == end || !isDigit(**current, radix)) break; + zero_tail = zero_tail && **current == '0'; exponent += radix_log_2; } - if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { + if (!allow_trailing_junk && AdvanceToNonspace(current, end)) { return junk_string_value; } @@ -552,13 +609,13 @@ static double RadixStringToIeee(const char* current, } break; } - ++current; - } while (current != end); + ++(*current); + } while (*current != end); ASSERT(number < ((int64_t)1 << kSignificandSize)); ASSERT(static_cast(static_cast(number)) == number); - 
*trailing_pointer = current; + *result_is_junk = false; if (exponent == 0) { if (sign) { @@ -573,13 +630,14 @@ static double RadixStringToIeee(const char* current, } +template double StringToDoubleConverter::StringToIeee( - const char* input, + Iterator input, int length, - int* processed_characters_count, - bool read_as_double) const { - const char* current = input; - const char* end = input + length; + bool read_as_double, + int* processed_characters_count) const { + Iterator current = input; + Iterator end = input + length; *processed_characters_count = 0; @@ -600,7 +658,7 @@ double StringToDoubleConverter::StringToIeee( if (allow_leading_spaces || allow_trailing_spaces) { if (!AdvanceToNonspace(¤t, end)) { - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return empty_string_value_; } if (!allow_leading_spaces && (input != current)) { @@ -626,7 +684,7 @@ double StringToDoubleConverter::StringToIeee( if (*current == '+' || *current == '-') { sign = (*current == '-'); ++current; - const char* next_non_space = current; + Iterator next_non_space = current; // Skip following spaces (if allowed). if (!AdvanceToNonspace(&next_non_space, end)) return junk_string_value_; if (!allow_spaces_after_sign && (current != next_non_space)) { @@ -649,7 +707,7 @@ double StringToDoubleConverter::StringToIeee( } ASSERT(buffer_pos == 0); - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return sign ? -Double::Infinity() : Double::Infinity(); } } @@ -668,7 +726,7 @@ double StringToDoubleConverter::StringToIeee( } ASSERT(buffer_pos == 0); - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return sign ? 
-Double::NaN() : Double::NaN(); } } @@ -677,7 +735,7 @@ double StringToDoubleConverter::StringToIeee( if (*current == '0') { ++current; if (current == end) { - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return SignedZero(sign); } @@ -690,17 +748,17 @@ double StringToDoubleConverter::StringToIeee( return junk_string_value_; // "0x". } - const char* tail_pointer = NULL; - double result = RadixStringToIeee<4>(current, + bool result_is_junk; + double result = RadixStringToIeee<4>(¤t, end, sign, allow_trailing_junk, junk_string_value_, read_as_double, - &tail_pointer); - if (tail_pointer != NULL) { - if (allow_trailing_spaces) AdvanceToNonspace(&tail_pointer, end); - *processed_characters_count = tail_pointer - input; + &result_is_junk); + if (!result_is_junk) { + if (allow_trailing_spaces) AdvanceToNonspace(¤t, end); + *processed_characters_count = static_cast(current - input); } return result; } @@ -709,7 +767,7 @@ double StringToDoubleConverter::StringToIeee( while (*current == '0') { ++current; if (current == end) { - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return SignedZero(sign); } } @@ -757,7 +815,7 @@ double StringToDoubleConverter::StringToIeee( while (*current == '0') { ++current; if (current == end) { - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return SignedZero(sign); } exponent--; // Move this 0 into the exponent. 
@@ -801,9 +859,9 @@ double StringToDoubleConverter::StringToIeee( return junk_string_value_; } } - char sign = '+'; + char exponen_sign = '+'; if (*current == '+' || *current == '-') { - sign = static_cast(*current); + exponen_sign = static_cast(*current); ++current; if (current == end) { if (allow_trailing_junk) { @@ -837,7 +895,7 @@ double StringToDoubleConverter::StringToIeee( ++current; } while (current != end && *current >= '0' && *current <= '9'); - exponent += (sign == '-' ? -num : num); + exponent += (exponen_sign == '-' ? -num : num); } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { @@ -855,16 +913,17 @@ double StringToDoubleConverter::StringToIeee( if (octal) { double result; - const char* tail_pointer = NULL; - result = RadixStringToIeee<3>(buffer, + bool result_is_junk; + char* start = buffer; + result = RadixStringToIeee<3>(&start, buffer + buffer_pos, sign, allow_trailing_junk, junk_string_value_, read_as_double, - &tail_pointer); - ASSERT(tail_pointer != NULL); - *processed_characters_count = current - input; + &result_is_junk); + ASSERT(!result_is_junk); + *processed_characters_count = static_cast(current - input); return result; } @@ -882,8 +941,42 @@ double StringToDoubleConverter::StringToIeee( } else { converted = Strtof(Vector(buffer, buffer_pos), exponent); } - *processed_characters_count = current - input; + *processed_characters_count = static_cast(current - input); return sign? 
-converted: converted; } + +double StringToDoubleConverter::StringToDouble( + const char* buffer, + int length, + int* processed_characters_count) const { + return StringToIeee(buffer, length, true, processed_characters_count); +} + + +double StringToDoubleConverter::StringToDouble( + const uc16* buffer, + int length, + int* processed_characters_count) const { + return StringToIeee(buffer, length, true, processed_characters_count); +} + + +float StringToDoubleConverter::StringToFloat( + const char* buffer, + int length, + int* processed_characters_count) const { + return static_cast(StringToIeee(buffer, length, false, + processed_characters_count)); +} + + +float StringToDoubleConverter::StringToFloat( + const uc16* buffer, + int length, + int* processed_characters_count) const { + return static_cast(StringToIeee(buffer, length, false, + processed_characters_count)); +} + } // namespace kenlm_double_conversion diff --git a/native_client/kenlm/util/double-conversion/double-conversion.h b/native_client/kenlm/util/double-conversion/double-conversion.h index d3a57c05..62e5bbf0 100644 --- a/native_client/kenlm/util/double-conversion/double-conversion.h +++ b/native_client/kenlm/util/double-conversion/double-conversion.h @@ -415,9 +415,10 @@ class StringToDoubleConverter { // junk, too. // - ALLOW_TRAILING_JUNK: ignore trailing characters that are not part of // a double literal. - // - ALLOW_LEADING_SPACES: skip over leading spaces. - // - ALLOW_TRAILING_SPACES: ignore trailing spaces. - // - ALLOW_SPACES_AFTER_SIGN: ignore spaces after the sign. + // - ALLOW_LEADING_SPACES: skip over leading whitespace, including spaces, + // new-lines, and tabs. + // - ALLOW_TRAILING_SPACES: ignore trailing whitespace. + // - ALLOW_SPACES_AFTER_SIGN: ignore whitespace after the sign. // Ex: StringToDouble("- 123.2") -> -123.2. // StringToDouble("+ 123.2") -> 123.2 // @@ -502,19 +503,24 @@ class StringToDoubleConverter { // in the 'processed_characters_count'. 
Trailing junk is never included. double StringToDouble(const char* buffer, int length, - int* processed_characters_count) const { - return StringToIeee(buffer, length, processed_characters_count, true); - } + int* processed_characters_count) const; + + // Same as StringToDouble above but for 16 bit characters. + double StringToDouble(const uc16* buffer, + int length, + int* processed_characters_count) const; // Same as StringToDouble but reads a float. // Note that this is not equivalent to static_cast(StringToDouble(...)) // due to potential double-rounding. float StringToFloat(const char* buffer, int length, - int* processed_characters_count) const { - return static_cast(StringToIeee(buffer, length, - processed_characters_count, false)); - } + int* processed_characters_count) const; + + // Same as StringToFloat above but for 16 bit characters. + float StringToFloat(const uc16* buffer, + int length, + int* processed_characters_count) const; private: const int flags_; @@ -523,10 +529,11 @@ class StringToDoubleConverter { const char* const infinity_symbol_; const char* const nan_symbol_; - double StringToIeee(const char* buffer, + template + double StringToIeee(Iterator start_pointer, int length, - int* processed_characters_count, - bool read_as_double) const; + bool read_as_double, + int* processed_characters_count) const; DISALLOW_IMPLICIT_CONSTRUCTORS(StringToDoubleConverter); }; diff --git a/native_client/kenlm/util/double-conversion/fast-dtoa.cc b/native_client/kenlm/util/double-conversion/fast-dtoa.cc index ff2936d6..1a3d8496 100644 --- a/native_client/kenlm/util/double-conversion/fast-dtoa.cc +++ b/native_client/kenlm/util/double-conversion/fast-dtoa.cc @@ -248,10 +248,7 @@ static void BiggestPowerTen(uint32_t number, // Note: kPowersOf10[i] == 10^(i-1). exponent_plus_one_guess++; // We don't have any guarantees that 2^number_bits <= number. - // TODO(floitsch): can we change the 'while' into an 'if'? 
We definitely see - // number < (2^number_bits - 1), but I haven't encountered - // number < (2^number_bits - 2) yet. - while (number < kSmallPowersOfTen[exponent_plus_one_guess]) { + if (number < kSmallPowersOfTen[exponent_plus_one_guess]) { exponent_plus_one_guess--; } *power = kSmallPowersOfTen[exponent_plus_one_guess]; @@ -350,7 +347,8 @@ static bool DigitGen(DiyFp low, // that is smaller than integrals. while (*kappa > 0) { int digit = integrals / divisor; - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; integrals %= divisor; (*kappa)--; @@ -379,13 +377,14 @@ static bool DigitGen(DiyFp low, ASSERT(one.e() >= -60); ASSERT(fractionals < one.f()); ASSERT(UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); - while (true) { + for (;;) { fractionals *= 10; unit *= 10; unsafe_interval.set_f(unsafe_interval.f() * 10); // Integer division by one. int digit = static_cast(fractionals >> -one.e()); - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; fractionals &= one.f() - 1; // Modulo by one. (*kappa)--; @@ -459,7 +458,8 @@ static bool DigitGenCounted(DiyFp w, // that is smaller than 'integrals'. while (*kappa > 0) { int digit = integrals / divisor; - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; requested_digits--; integrals %= divisor; @@ -492,7 +492,8 @@ static bool DigitGenCounted(DiyFp w, w_error *= 10; // Integer division by one. int digit = static_cast(fractionals >> -one.e()); - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; requested_digits--; fractionals &= one.f() - 1; // Modulo by one. 
diff --git a/native_client/kenlm/util/double-conversion/fixed-dtoa.cc b/native_client/kenlm/util/double-conversion/fixed-dtoa.cc index a1a16a62..d5eec223 100644 --- a/native_client/kenlm/util/double-conversion/fixed-dtoa.cc +++ b/native_client/kenlm/util/double-conversion/fixed-dtoa.cc @@ -25,7 +25,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include +#include #include "fixed-dtoa.h" #include "ieee.h" @@ -98,7 +98,7 @@ class UInt128 { return high_bits_ == 0 && low_bits_ == 0; } - int BitAt(int position) { + int BitAt(int position) const { if (position >= 64) { return static_cast(high_bits_ >> (position - 64)) & 1; } else { @@ -133,7 +133,7 @@ static void FillDigits32(uint32_t number, Vector buffer, int* length) { while (number != 0) { int digit = number % 10; number /= 10; - buffer[(*length) + number_length] = '0' + digit; + buffer[(*length) + number_length] = static_cast('0' + digit); number_length++; } // Exchange the digits. @@ -150,7 +150,7 @@ static void FillDigits32(uint32_t number, Vector buffer, int* length) { } -static void FillDigits64FixedLength(uint64_t number, int requested_length, +static void FillDigits64FixedLength(uint64_t number, Vector buffer, int* length) { const uint32_t kTen7 = 10000000; // For efficiency cut the number into 3 uint32_t parts, and print those. @@ -253,12 +253,14 @@ static void FillFractionals(uint64_t fractionals, int exponent, fractionals *= 5; point--; int digit = static_cast(fractionals >> point); - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; fractionals -= static_cast(digit) << point; } // If the first bit after the point is set we have to round up. 
- if (((fractionals >> (point - 1)) & 1) == 1) { + ASSERT(fractionals == 0 || point - 1 >= 0); + if ((fractionals != 0) && ((fractionals >> (point - 1)) & 1) == 1) { RoundUp(buffer, length, decimal_point); } } else { // We need 128 bits. @@ -274,7 +276,8 @@ static void FillFractionals(uint64_t fractionals, int exponent, fractionals128.Multiply(5); point--; int digit = fractionals128.DivModPowerOf2(point); - buffer[*length] = '0' + digit; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); (*length)++; } if (fractionals128.BitAt(point - 1) == 1) { @@ -358,7 +361,7 @@ bool FastFixedDtoa(double v, remainder = (dividend % divisor) << exponent; } FillDigits32(quotient, buffer, length); - FillDigits64FixedLength(remainder, divisor_power, buffer, length); + FillDigits64FixedLength(remainder, buffer, length); *decimal_point = *length; } else if (exponent >= 0) { // 0 <= exponent <= 11 diff --git a/native_client/kenlm/util/double-conversion/ieee.h b/native_client/kenlm/util/double-conversion/ieee.h index ee11508f..1525d1b2 100644 --- a/native_client/kenlm/util/double-conversion/ieee.h +++ b/native_client/kenlm/util/double-conversion/ieee.h @@ -99,7 +99,7 @@ class Double { } double PreviousDouble() const { - if (d64_ == (kInfinity | kSignMask)) return -Double::Infinity(); + if (d64_ == (kInfinity | kSignMask)) return -Infinity(); if (Sign() < 0) { return Double(d64_ + 1).value(); } else { @@ -256,6 +256,8 @@ class Double { return (significand & kSignificandMask) | (biased_exponent << kPhysicalSignificandSize); } + + DISALLOW_COPY_AND_ASSIGN(Double); }; class Single { @@ -391,6 +393,8 @@ class Single { static const uint32_t kNaN = 0x7FC00000; const uint32_t d32_; + + DISALLOW_COPY_AND_ASSIGN(Single); }; } // namespace kenlm_double_conversion diff --git a/native_client/kenlm/util/double-conversion/strtod.cc b/native_client/kenlm/util/double-conversion/strtod.cc index 2c66e6e5..33e68e1c 100644 --- a/native_client/kenlm/util/double-conversion/strtod.cc +++ 
b/native_client/kenlm/util/double-conversion/strtod.cc @@ -25,8 +25,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include -#include +#include +#include #include "strtod.h" #include "bignum.h" @@ -137,6 +137,7 @@ static void TrimAndCut(Vector buffer, int exponent, Vector right_trimmed = TrimTrailingZeros(left_trimmed); exponent += left_trimmed.length() - right_trimmed.length(); if (right_trimmed.length() > kMaxSignificantDecimalDigits) { + (void) space_size; // Mark variable as used. ASSERT(space_size >= kMaxSignificantDecimalDigits); CutToMaxSignificantDigits(right_trimmed, exponent, buffer_copy_space, updated_exponent); @@ -263,7 +264,6 @@ static DiyFp AdjustmentPowerOfTen(int exponent) { case 7: return DiyFp(UINT64_2PART_C(0x98968000, 00000000), -40); default: UNREACHABLE(); - return DiyFp(0, 0); } } @@ -286,7 +286,7 @@ static bool DiyFpStrtod(Vector buffer, const int kDenominator = 1 << kDenominatorLog; // Move the remaining decimals into the exponent. exponent += remaining_decimals; - int error = (remaining_decimals == 0 ? 0 : kDenominator / 2); + uint64_t error = (remaining_decimals == 0 ? 0 : kDenominator / 2); int old_e = input.e(); input.Normalize(); @@ -506,9 +506,7 @@ float Strtof(Vector buffer, int exponent) { double double_previous = Double(double_guess).PreviousDouble(); float f1 = static_cast(double_previous); -#ifndef NDEBUG float f2 = float_guess; -#endif float f3 = static_cast(double_next); float f4; if (is_correct) { @@ -517,9 +515,8 @@ float Strtof(Vector buffer, int exponent) { double double_next2 = Double(double_next).NextDouble(); f4 = static_cast(double_next2); } -#ifndef NDEBUG + (void) f2; // Mark variable as used. ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); -#endif // If the guess doesn't lie near a single-precision boundary we can simply // return its float-value. 
diff --git a/native_client/kenlm/util/double-conversion/utils.h b/native_client/kenlm/util/double-conversion/utils.h index ae40b116..41386253 100644 --- a/native_client/kenlm/util/double-conversion/utils.h +++ b/native_client/kenlm/util/double-conversion/utils.h @@ -33,14 +33,29 @@ #include #ifndef ASSERT -#define ASSERT(condition) (assert(condition)) +#define ASSERT(condition) \ + assert(condition); #endif #ifndef UNIMPLEMENTED #define UNIMPLEMENTED() (abort()) #endif +#ifndef DOUBLE_CONVERSION_NO_RETURN +#ifdef _MSC_VER +#define DOUBLE_CONVERSION_NO_RETURN __declspec(noreturn) +#else +#define DOUBLE_CONVERSION_NO_RETURN __attribute__((noreturn)) +#endif +#endif #ifndef UNREACHABLE +#ifdef _MSC_VER +void DOUBLE_CONVERSION_NO_RETURN abort_noreturn(); +inline void abort_noreturn() { abort(); } +#define UNREACHABLE() (abort_noreturn()) +#else #define UNREACHABLE() (abort()) #endif +#endif + // Double operations detection based on target architecture. // Linux uses a 80bit wide floating point stack on x86. This induces double @@ -55,11 +70,17 @@ #if defined(_M_X64) || defined(__x86_64__) || \ defined(__ARMEL__) || defined(__avr32__) || \ defined(__hppa__) || defined(__ia64__) || \ - defined(__mips__) || defined(__powerpc__) || \ + defined(__mips__) || \ + defined(__powerpc__) || defined(__ppc__) || defined(__ppc64__) || \ + defined(_POWER) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ defined(__sparc__) || defined(__sparc) || defined(__s390__) || \ defined(__SH4__) || defined(__alpha__) || \ - defined(_MIPS_ARCH_MIPS32R2) || defined(__aarch64__) + defined(_MIPS_ARCH_MIPS32R2) || \ + defined(__AARCH64EL__) || defined(__aarch64__) || \ + defined(__riscv) #define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 +#elif defined(__mc68000__) +#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS #elif defined(_M_IX86) || defined(__i386__) || defined(__i386) #if defined(_WIN32) // Windows uses a 64bit wide floating point stack. 
@@ -71,6 +92,11 @@ #error Target architecture was not detected as supported by Double-Conversion. #endif +#if defined(__GNUC__) +#define DOUBLE_CONVERSION_UNUSED __attribute__((unused)) +#else +#define DOUBLE_CONVERSION_UNUSED +#endif #if defined(_WIN32) && !defined(__MINGW32__) @@ -90,6 +116,8 @@ typedef unsigned __int64 uint64_t; #endif +typedef uint16_t uc16; + // The following macro works on both 32 and 64-bit platforms. // Usage: instead of writing 0x1234567890123456 // write UINT64_2PART_C(0x12345678,90123456); @@ -155,8 +183,8 @@ template class Vector { public: Vector() : start_(NULL), length_(0) {} - Vector(T* data, int length) : start_(data), length_(length) { - ASSERT(length == 0 || (length > 0 && data != NULL)); + Vector(T* data, int len) : start_(data), length_(len) { + ASSERT(len == 0 || (len > 0 && data != NULL)); } // Returns a vector using the same backing storage as this one, @@ -198,8 +226,8 @@ class Vector { // buffer bounds on all operations in debug mode. class StringBuilder { public: - StringBuilder(char* buffer, int size) - : buffer_(buffer, size), position_(0) { } + StringBuilder(char* buffer, int buffer_size) + : buffer_(buffer, buffer_size), position_(0) { } ~StringBuilder() { if (!is_finalized()) Finalize(); } @@ -218,8 +246,7 @@ class StringBuilder { // 0-characters; use the Finalize() method to terminate the string // instead. void AddCharacter(char c) { - // I just extract raw data not a cstr so null is fine. - //ASSERT(c != '\0'); + ASSERT(c != '\0'); ASSERT(!is_finalized() && position_ < buffer_.length()); buffer_[position_++] = c; } @@ -234,8 +261,7 @@ class StringBuilder { // builder. The input string must have enough characters. void AddSubstring(const char* s, int n) { ASSERT(!is_finalized() && position_ + n < buffer_.length()); - // I just extract raw data not a cstr so null is fine. 
- //ASSERT(static_cast(n) <= strlen(s)); + ASSERT(static_cast(n) <= strlen(s)); memmove(&buffer_[position_], s, n * kCharSize); position_ += n; } @@ -255,8 +281,7 @@ class StringBuilder { buffer_[position_] = '\0'; // Make sure nobody managed to add a 0-character to the // buffer while building the string. - // I just extract raw data not a cstr so null is fine. - //ASSERT(strlen(buffer_.start()) == static_cast(position_)); + ASSERT(strlen(buffer_.start()) == static_cast(position_)); position_ = -1; ASSERT(is_finalized()); return buffer_.start(); @@ -299,11 +324,8 @@ template inline Dest BitCast(const Source& source) { // Compile time assertion: sizeof(Dest) == sizeof(Source) // A compile error here means your Dest and Source have different sizes. - typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1] -#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 - __attribute__((unused)) -#endif - ; + DOUBLE_CONVERSION_UNUSED + typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]; Dest dest; memmove(&dest, &source, sizeof(dest)); diff --git a/native_client/kenlm/util/exception.hh b/native_client/kenlm/util/exception.hh index 03543a9b..614a88fa 100644 --- a/native_client/kenlm/util/exception.hh +++ b/native_client/kenlm/util/exception.hh @@ -134,7 +134,7 @@ class OverflowException : public Exception { template inline std::size_t CheckOverflowInternal(uint64_t value) { UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. 
This model is too big for 32-bit code."); - return value; + return static_cast(value); } template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { diff --git a/native_client/kenlm/util/file.cc b/native_client/kenlm/util/file.cc index b5e36b20..1a70387e 100644 --- a/native_client/kenlm/util/file.cc +++ b/native_client/kenlm/util/file.cc @@ -490,7 +490,7 @@ int mkstemp_and_unlink(char *tmpl) { int ret = mkstemp(tmpl); if (ret != -1) { - UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl); + UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting " << tmpl); } return ret; } diff --git a/native_client/kenlm/util/file_piece.hh b/native_client/kenlm/util/file_piece.hh index 67b28848..042a78e9 100644 --- a/native_client/kenlm/util/file_piece.hh +++ b/native_client/kenlm/util/file_piece.hh @@ -103,7 +103,7 @@ class FilePiece { if (position_ == position_end_) { try { Shift(); - } catch (const util::EndOfFileException &e) { return false; } + } catch (const util::EndOfFileException &) { return false; } // And break out at end of file. 
if (position_ == position_end_) return false; } diff --git a/native_client/kenlm/util/mmap.cc b/native_client/kenlm/util/mmap.cc index 4da5a975..39b9cd59 100644 --- a/native_client/kenlm/util/mmap.cc +++ b/native_client/kenlm/util/mmap.cc @@ -142,7 +142,7 @@ void UnmapOrThrow(void *start, size_t length) { #if defined(_WIN32) || defined(_WIN64) UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); #else - UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed with " << start << " for length " << length); #endif } diff --git a/native_client/kenlm/util/probing_hash_table.hh b/native_client/kenlm/util/probing_hash_table.hh index 438de92f..1d45b619 100644 --- a/native_client/kenlm/util/probing_hash_table.hh +++ b/native_client/kenlm/util/probing_hash_table.hh @@ -30,7 +30,7 @@ class DivMod { public: explicit DivMod(std::size_t buckets) : buckets_(buckets) {} - static std::size_t RoundBuckets(std::size_t from) { + static uint64_t RoundBuckets(uint64_t from) { return from; } @@ -58,7 +58,7 @@ class Power2Mod { } // Round up to next power of 2. 
- static std::size_t RoundBuckets(std::size_t from) { + static uint64_t RoundBuckets(uint64_t from) { --from; from |= from >> 1; from |= from >> 2; diff --git a/native_client/kenlm/util/tokenize_piece.hh b/native_client/kenlm/util/tokenize_piece.hh index 14ff9885..f5ce3367 100644 --- a/native_client/kenlm/util/tokenize_piece.hh +++ b/native_client/kenlm/util/tokenize_piece.hh @@ -5,10 +5,9 @@ #include "util/spaces.hh" #include "util/string_piece.hh" -#include - #include #include +#include namespace util { @@ -97,12 +96,12 @@ class AnyCharacterLast { StringPiece chars_; }; -template class TokenIter : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { +template class TokenIter : public std::iterator { public: TokenIter() {} template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { - increment(); + ++*this; } bool operator!() const { @@ -116,10 +115,15 @@ template class TokenIter : public boost::it return TokenIter(); } - private: - friend class boost::iterator_core_access; + bool operator==(const TokenIter &other) const { + return current_.data() == other.current_.data(); + } - void increment() { + bool operator!=(const TokenIter &other) const { + return !(*this == other); + } + + TokenIter &operator++() { do { StringPiece found(finder_.Find(after_)); current_ = StringPiece(after_.data(), found.data() - after_.data()); @@ -129,17 +133,25 @@ template class TokenIter : public boost::it after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); } } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. 
+ return *this; } - bool equal(const TokenIter &other) const { - return current_.data() == other.current_.data(); + TokenIter &operator++(int) { + TokenIter ret(*this); + ++*this; + return ret; } - const StringPiece &dereference() const { + const StringPiece &operator*() const { UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } + const StringPiece *operator->() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); + return ¤t_; + } + private: StringPiece current_; StringPiece after_;