From d2eb305b73823efacb3f8de2b480346017c50cd7 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Tue, 12 Nov 2019 21:56:42 +0100
Subject: [PATCH] Address review comment and add missing check for presence of scorer

---
 .../ctcdecode/ctc_beam_search_decoder.cpp | 38 +++++++++----------
 native_client/ctcdecode/decoder_utils.cpp | 6 ---
 native_client/ctcdecode/decoder_utils.h | 7 +++-
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/native_client/ctcdecode/ctc_beam_search_decoder.cpp b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
index 31999078..5a2c834e 100644
--- a/native_client/ctcdecode/ctc_beam_search_decoder.cpp
+++ b/native_client/ctcdecode/ctc_beam_search_decoder.cpp
@@ -109,27 +109,25 @@ DecoderState::next(const double *probs,
             log_p = log_prob_c + prefix->score;
           }
 
-          // skip scoring the space in word based LMs
-          PathTrie* prefix_to_score;
-          if (ext_scorer_->is_utf8_mode()) {
-            prefix_to_score = prefix_new;
-          } else {
-            prefix_to_score = prefix;
-          }
+          if (ext_scorer_ != nullptr) {
+            // skip scoring the space in word based LMs
+            PathTrie* prefix_to_score;
+            if (ext_scorer_->is_utf8_mode()) {
+              prefix_to_score = prefix_new;
+            } else {
+              prefix_to_score = prefix;
+            }
 
-          // check if we need to score
-          bool is_scoring_boundary = ext_scorer_ != nullptr &&
-            ext_scorer_->is_scoring_boundary(prefix_to_score, c);
-
-          // language model scoring
-          if (is_scoring_boundary) {
-            float score = 0.0;
-            std::vector ngram;
-            ngram = ext_scorer_->make_ngram(prefix_to_score);
-            bool bos = ngram.size() < ext_scorer_->get_max_order();
-            score = ext_scorer_->get_log_cond_prob(ngram, bos) * ext_scorer_->alpha;
-            log_p += score;
-            log_p += ext_scorer_->beta;
+            // language model scoring
+            if (ext_scorer_->is_scoring_boundary(prefix_to_score, c)) {
+              float score = 0.0;
+              std::vector ngram;
+              ngram = ext_scorer_->make_ngram(prefix_to_score);
+              bool bos = ngram.size() < ext_scorer_->get_max_order();
+              score = ext_scorer_->get_log_cond_prob(ngram, bos) * ext_scorer_->alpha;
+              log_p += score;
+              log_p += ext_scorer_->beta;
+            }
           }
 
           prefix_new->log_prob_nb_cur =
diff --git a/native_client/ctcdecode/decoder_utils.cpp b/native_client/ctcdecode/decoder_utils.cpp
index be810c07..ed244c3a 100644
--- a/native_client/ctcdecode/decoder_utils.cpp
+++ b/native_client/ctcdecode/decoder_utils.cpp
@@ -46,12 +46,6 @@ size_t get_utf8_str_len(const std::string &str) {
   return str_len;
 }
 
-// Return weather a byte is a code point boundary (not a continuation byte).
-bool byte_is_codepoint_boundary(unsigned char c) {
-  // only continuation bytes have their most significant bits set to 10
-  return (c & 0xC0) != 0x80;
-}
-
 std::vector split_into_codepoints(const std::string &str) {
   std::vector result;
   std::string out_str;
diff --git a/native_client/ctcdecode/decoder_utils.h b/native_client/ctcdecode/decoder_utils.h
index ec0a93fc..3ba1d7e6 100644
--- a/native_client/ctcdecode/decoder_utils.h
+++ b/native_client/ctcdecode/decoder_utils.h
@@ -89,8 +89,11 @@ std::vector split_into_bytes(const std::string &str);
 void add_word_to_fst(const std::vector &word,
                      fst::StdVectorFst *dictionary);
 
-// Return weather a byte is a code point boundary (not a continuation byte).
-bool byte_is_codepoint_boundary(unsigned char c);
+// Return whether a byte is a code point boundary (not a continuation byte).
+inline bool byte_is_codepoint_boundary(unsigned char c) {
+  // only continuation bytes have their most significant bits set to 10
+  return (c & 0xC0) != 0x80;
+}
 
 // Add a word in string to dictionary
 bool add_word_to_dictionary(