diff --git a/src/libime/core/historybigram.cpp b/src/libime/core/historybigram.cpp index 3e6e2c8..3d16518 100644 --- a/src/libime/core/historybigram.cpp +++ b/src/libime/core/historybigram.cpp @@ -96,6 +96,8 @@ struct WeightedTrie { auto v = trie_.traverse(wordAndCode.first, pos); if (TrieType::isValid(v)) { result += v; + } else if (TrieType::isNoPath(v)) { + return 0; } const char separator[] = {wordCodeSeparator, '\0'}; v = trie_.traverse(separator, pos); @@ -395,6 +397,32 @@ class HistoryBigramPool { bigram_.fillPredict(words, word, maxSize); } + /* Tries to extend the most recent sentence (recent_.front()) with newSentence instead of starting a new one. Returns false, changing nothing, if recent_ or newSentence is empty, or if the latest sentence does not end with exactly the words in context. Otherwise, with delta = 1: decrements the bigram (last word, {"", ""}), then for each new item increments its unigram frequency and the bigram (current last word, item) and moves the item onto the end of the latest sentence, and finally increments the bigram (new last word, {"", ""}); returns true. Items of newSentence are moved-from on success. NOTE(review): {"", ""} looks like the end-of-sentence marker -- confirm. */ bool maybeAppendToLatestSentence(const std::vector &context, + std::vector &newSentence) { + if (recent_.empty() || newSentence.empty()) { + return false; + } + auto &latestSentence = recent_.front(); + if (latestSentence.size() < context.size() || + !std::ranges::equal( + context, + std::views::drop(latestSentence, + latestSentence.size() - context.size()))) { + return false; + } + + const int delta = 1; + decBigram(latestSentence.back(), {"", ""}, delta); + for (auto &item : newSentence) { + unigram_.incFreq(item, delta); + incBigram(latestSentence.back(), item, delta); + latestSentence.push_back(std::move(item)); + } + incBigram(latestSentence.back(), {"", ""}, delta); + + return true; + } + private: template void remove(const R &sentence) { @@ -742,4 +770,13 @@ float HistoryBigram::scoreWithCode( {cur ? cur->word() : "", extractor && cur ? 
extractor(cur) : ""}); } +/* Adds newSentence to the history, first trying to continue the latest stored sentence. When context is non-empty and pools_[0].maybeAppendToLatestSentence(context, newSentence) returns true, nothing else is done; otherwise newSentence is added through addWithCode(). */ void HistoryBigram::addWithContext(const std::vector &context, + std::vector newSentence) { + FCITX_D(); + if (context.empty() || + !d->pools_[0].maybeAppendToLatestSentence(context, newSentence)) { + addWithCode(newSentence); + } +} + } // namespace libime diff --git a/src/libime/core/historybigram.h b/src/libime/core/historybigram.h index 82f22ec..1ab01cd 100644 --- a/src/libime/core/historybigram.h +++ b/src/libime/core/historybigram.h @@ -102,6 +102,9 @@ class LIBIMECORE_EXPORT HistoryBigram { int32_t rawBigramFrequency(WordWithCodeView prev, WordWithCodeView cur) const; + void addWithContext(const std::vector &context, + std::vector newSentence); + private: std::unique_ptr d_ptr; FCITX_DECLARE_PRIVATE(HistoryBigram); diff --git a/src/libime/pinyin/pinyincontext.cpp b/src/libime/pinyin/pinyincontext.cpp index 8e7a7d8..015e8f4 100644 --- a/src/libime/pinyin/pinyincontext.cpp +++ b/src/libime/pinyin/pinyincontext.cpp @@ -919,10 +919,10 @@ std::vector PinyinContext::selectedWords() const { return newSentence; } -std::vector> +std::vector PinyinContext::selectedWordsWithPinyin() const { FCITX_D(); - std::vector> newSentence; + std::vector newSentence; for (const auto &s : d->selected_) { for (const auto &item : s) { if (item.type_ != SelectedPinyinType::Separator) { @@ -976,31 +976,30 @@ void PinyinContext::learn() { return; } + std::vector newSentence; if (auto [result, encodedWordPinyin] = d->learnWord(); result != LearnWordResult::Ignored) { // Do not insert custom to history for the first time. if (result == LearnWordResult::Normal) { // Create new sentence with the whole new learned word. 
- std::vector newSentence{ - {sentence(), encodedWordPinyin}}; - d->ime_->model()->history().addWithCode(newSentence); + newSentence.push_back({sentence(), encodedWordPinyin}); + } else { + return; } } else { - std::vector newSentence; - for (auto &s : d->selected_) { - for (auto &item : s) { - if (item.type_ != SelectedPinyinType::Separator) { - // Non pinyin word. Skip it. - if (item.encodedPinyin().empty()) { - return; - } - newSentence.push_back( - {item.word_.word(), item.encodedPinyin()}); - } - } - } - d->ime_->model()->history().addWithCode(newSentence); + newSentence = selectedWordsWithPinyin(); + } + + if (std::ranges::any_of(newSentence, [](const auto &word) { + return word.second.empty(); + })) { + // Don't add to history if there is any non-pinyin word. + return; } + + /* addWithContext() extends the latest history sentence when it ends with these context words; otherwise it stores newSentence as a new entry. The context is computed once and passed by reference. */ auto context = contextWordsWithPinyin(); + d->ime_->model()->history().addWithContext(context, + std::move(newSentence)); } void PinyinContext::setContextWords( diff --git a/src/libime/pinyin/pinyinprediction.cpp b/src/libime/pinyin/pinyinprediction.cpp index c1f4f9e..e9fe618 100644 --- a/src/libime/pinyin/pinyinprediction.cpp +++ b/src/libime/pinyin/pinyinprediction.cpp @@ -48,12 +48,12 @@ PinyinPrediction::predict(const State &state, if (lastEncodedPinyin.empty() || sentence.empty()) { auto result = Prediction::predictWithScore(state, sentence, maxSize); - std::transform(result.begin(), result.end(), - std::back_inserter(finalResult), - [](std::pair &value) { - return std::make_pair(std::move(value.first), - PinyinPredictionSource::Model); - }); + std::ranges::transform(result, std::back_inserter(finalResult), + [](std::pair &value) { + return std::make_pair( + std::move(value.first), + PinyinPredictionSource::Model); + }); return finalResult; } @@ -119,11 +119,9 @@ PinyinPrediction::predict(const State &state, dup.insert(std::get(newItem)); intermedidateResult.push_back(std::move(newItem)); - std::push_heap(intermedidateResult.begin(), - intermedidateResult.end(), 
cmp); + std::ranges::push_heap(intermedidateResult, cmp); while (intermedidateResult.size() > maxSize) { - std::pop_heap(intermedidateResult.begin(), - intermedidateResult.end(), cmp); + std::ranges::pop_heap(intermedidateResult, cmp); dup.erase( std::get(intermedidateResult.back())); intermedidateResult.pop_back(); diff --git a/test/testhistorybigram.cpp b/test/testhistorybigram.cpp index 8a18912..7f5054e 100644 --- a/test/testhistorybigram.cpp +++ b/test/testhistorybigram.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "libime/core/historybigram.h" namespace { @@ -266,6 +267,24 @@ void testWithCodePredict() { } } +/* Exercises addWithContext(): after a three-word sentence is added, a context equal to that sentence's last two words is expected to get the new word appended to it, while a context that does not match is expected to produce a separate sentence. The dump must contain exactly two lines, with the later-added sentence first. */ void testAppend() { + using namespace libime; + HistoryBigram history; + history.addWithCode({{"你", "code1"}, {"是", "code2"}, {"一个", "code3"}}); + + history.addWithContext({{"是", "code2"}, {"一个", "code3"}}, + {{"好人", "code4"}}); + + history.addWithContext({{"不是", "code5"}}, {{"你的", "code6"}}); + std::stringstream ss; + history.dump(ss); + auto lines = fcitx::stringutils::split(ss.str(), "\n"); + FCITX_ASSERT(lines.size() == 2) << lines.size(); + FCITX_ASSERT(lines[0] == "你的\tcode6") << lines[0]; + FCITX_ASSERT(lines[1] == "你\tcode1 是\tcode2 一个\tcode3 好人\tcode4") + << lines[1]; +} + } // namespace int main() { @@ -276,5 +295,6 @@ int main() { testSaveAndLoadText(); testWithCode(); testWithCodePredict(); + testAppend(); return 0; } diff --git a/test/testpinyinprediction.cpp b/test/testpinyinprediction.cpp index f8c1459..06c3c58 100644 --- a/test/testpinyinprediction.cpp +++ b/test/testpinyinprediction.cpp @@ -10,6 +10,7 @@ #include "libime/core/userlanguagemodel.h" #include "libime/pinyin/pinyindictionary.h" #include "libime/pinyin/pinyinencoder.h" +#include "libime/pinyin/pinyinime.h" #include "libime/pinyin/pinyinprediction.h" #include "testdir.h" @@ -33,14 +34,21 @@ LogMessageBuilder &operator<<(LogMessageBuilder &log, } // namespace fcitx int main() { - UserLanguageModel model(LIBIME_BINARY_DIR "/data/sc.lm"); - PinyinDictionary 
dict; - dict.load(PinyinDictionary::SystemDict, - LIBIME_BINARY_DIR "/data/dict_sc.txt", PinyinDictFormat::Text); + PinyinIME ime( + std::make_unique(), + std::make_unique(LIBIME_BINARY_DIR "/data/sc.lm")); + ime.setNBest(2); + ime.dict()->load(PinyinDictionary::SystemDict, + LIBIME_BINARY_DIR "/data/sc.dict", + PinyinDictFormat::Binary); + ime.model()->history().addWithCode({{"可", "JF"}}); + auto &model = *ime.model(); + auto &dict = *ime.dict(); PinyinPrediction prediction; - prediction.setUserLanguageModel(&model); - prediction.setPinyinDictionary(&dict); + prediction.setUserLanguageModel(ime.model()); + prediction.setPinyinDictionary(ime.dict()); + auto py = PinyinEncoder::encodeFullPinyin("zhong'guo"); auto result = prediction.predict(model.nullState(), {"我", "喜欢", "中国"}, {py.data(), py.size()}, 20);