Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/libime/core/historybigram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ struct WeightedTrie {
auto v = trie_.traverse(wordAndCode.first, pos);
if (TrieType::isValid(v)) {
result += v;
} else if (TrieType::isNoPath(v)) {
return 0;
}
const char separator[] = {wordCodeSeparator, '\0'};
v = trie_.traverse(separator, pos);
Expand Down Expand Up @@ -395,6 +397,32 @@ class HistoryBigramPool {
bigram_.fillPredict(words, word, maxSize);
}

    // Try to merge `newSentence` into the most recently recorded sentence.
    //
    // Succeeds only when `context` equals the tail of the latest sentence
    // (recent_.front()); in that case every word of `newSentence` is moved
    // onto that sentence and the unigram/bigram counts are updated,
    // including re-pointing the sentence-end ("</s>") bigram to the new
    // last word. Returns true on success. On failure (empty inputs or a
    // context mismatch) it returns before the move loop, so `newSentence`
    // is left untouched and the caller can record it as a standalone
    // sentence instead.
    bool maybeAppendToLatestSentence(const std::vector<WordWithCode> &context,
                                     std::vector<WordWithCode> &newSentence) {
        if (recent_.empty() || newSentence.empty()) {
            return false;
        }
        // NOTE(review): assumes recent_.front() is the most recently added
        // sentence — confirm against where this pool pushes new sentences.
        auto &latestSentence = recent_.front();
        // The context must match the last context.size() words of the
        // latest sentence exactly (word and code both compared).
        if (latestSentence.size() < context.size() ||
            !std::ranges::equal(
                context,
                std::views::drop(latestSentence,
                                 latestSentence.size() - context.size()))) {
            return false;
        }

        const int delta = 1;
        // The latest sentence no longer ends at its old last word: remove
        // the old (last word -> </s>) bigram before extending it.
        decBigram(latestSentence.back(), {"</s>", ""}, delta);
        for (auto &item : newSentence) {
            unigram_.incFreq(item, delta);
            // Chain each appended word to the current sentence tail.
            incBigram(latestSentence.back(), item, delta);
            latestSentence.push_back(std::move(item));
        }
        // Re-add the sentence-end bigram for the new last word.
        incBigram(latestSentence.back(), {"</s>", ""}, delta);

        return true;
    }

private:
template <typename R>
void remove(const R &sentence) {
Expand Down Expand Up @@ -742,4 +770,13 @@ float HistoryBigram::scoreWithCode(
{cur ? cur->word() : "", extractor && cur ? extractor(cur) : ""});
}

/// Record `newSentence` in the history, taking `context` into account.
///
/// When `context` is non-empty and matches the tail of the most recent
/// sentence in the primary pool, the new words are appended to that
/// sentence instead of starting a fresh one; otherwise the sentence is
/// added on its own via addWithCode().
void HistoryBigram::addWithContext(const std::vector<WordWithCode> &context,
                                   std::vector<WordWithCode> newSentence) {
    FCITX_D();
    // Short-circuit: only attempt the merge when a context is given.
    const bool appended =
        !context.empty() &&
        d->pools_[0].maybeAppendToLatestSentence(context, newSentence);
    if (!appended) {
        // Merge failed (or no context): newSentence was left untouched,
        // so record it as a standalone sentence.
        addWithCode(newSentence);
    }
}

} // namespace libime
3 changes: 3 additions & 0 deletions src/libime/core/historybigram.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ class LIBIMECORE_EXPORT HistoryBigram {
int32_t rawBigramFrequency(WordWithCodeView prev,
WordWithCodeView cur) const;

void addWithContext(const std::vector<WordWithCode> &context,
std::vector<WordWithCode> newSentence);

private:
std::unique_ptr<HistoryBigramPrivate> d_ptr;
FCITX_DECLARE_PRIVATE(HistoryBigram);
Expand Down
37 changes: 18 additions & 19 deletions src/libime/pinyin/pinyincontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,10 +919,10 @@ std::vector<std::string> PinyinContext::selectedWords() const {
return newSentence;
}

std::vector<std::pair<std::string, std::string>>
std::vector<HistoryBigram::WordWithCode>
PinyinContext::selectedWordsWithPinyin() const {
FCITX_D();
std::vector<std::pair<std::string, std::string>> newSentence;
std::vector<HistoryBigram::WordWithCode> newSentence;
for (const auto &s : d->selected_) {
for (const auto &item : s) {
if (item.type_ != SelectedPinyinType::Separator) {
Expand Down Expand Up @@ -976,31 +976,30 @@ void PinyinContext::learn() {
return;
}

std::vector<HistoryBigram::WordWithCode> newSentence;
if (auto [result, encodedWordPinyin] = d->learnWord();
result != LearnWordResult::Ignored) {
// Do not insert custom to history for the first time.
if (result == LearnWordResult::Normal) {
// Create new sentence with the whole new learned word.
std::vector<HistoryBigram::WordWithCode> newSentence{
{sentence(), encodedWordPinyin}};
d->ime_->model()->history().addWithCode(newSentence);
newSentence.push_back({sentence(), encodedWordPinyin});
} else {
return;
}
} else {
std::vector<HistoryBigram::WordWithCode> newSentence;
for (auto &s : d->selected_) {
for (auto &item : s) {
if (item.type_ != SelectedPinyinType::Separator) {
// Non pinyin word. Skip it.
if (item.encodedPinyin().empty()) {
return;
}
newSentence.push_back(
{item.word_.word(), item.encodedPinyin()});
}
}
}
d->ime_->model()->history().addWithCode(newSentence);
newSentence = selectedWordsWithPinyin();
}

if (std::ranges::any_of(newSentence, [](const auto &word) {
return word.second.empty();
})) {
// Don't add to history if there is any non-pinyin word.
return;
}

auto context = contextWordsWithPinyin();
d->ime_->model()->history().addWithContext(contextWordsWithPinyin(),
std::move(newSentence));
}

void PinyinContext::setContextWords(
Expand Down
18 changes: 8 additions & 10 deletions src/libime/pinyin/pinyinprediction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ PinyinPrediction::predict(const State &state,

if (lastEncodedPinyin.empty() || sentence.empty()) {
auto result = Prediction::predictWithScore(state, sentence, maxSize);
std::transform(result.begin(), result.end(),
std::back_inserter(finalResult),
[](std::pair<std::string, float> &value) {
return std::make_pair(std::move(value.first),
PinyinPredictionSource::Model);
});
std::ranges::transform(result, std::back_inserter(finalResult),
[](std::pair<std::string, float> &value) {
return std::make_pair(
std::move(value.first),
PinyinPredictionSource::Model);
});
return finalResult;
}

Expand Down Expand Up @@ -119,11 +119,9 @@ PinyinPrediction::predict(const State &state,

dup.insert(std::get<std::string>(newItem));
intermedidateResult.push_back(std::move(newItem));
std::push_heap(intermedidateResult.begin(),
intermedidateResult.end(), cmp);
std::ranges::push_heap(intermedidateResult, cmp);
while (intermedidateResult.size() > maxSize) {
std::pop_heap(intermedidateResult.begin(),
intermedidateResult.end(), cmp);
std::ranges::pop_heap(intermedidateResult, cmp);
dup.erase(
std::get<std::string>(intermedidateResult.back()));
intermedidateResult.pop_back();
Expand Down
20 changes: 20 additions & 0 deletions test/testhistorybigram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <string>
#include <unordered_set>
#include <fcitx-utils/log.h>
#include <fcitx-utils/stringutils.h>
#include "libime/core/historybigram.h"

namespace {
Expand Down Expand Up @@ -266,6 +267,24 @@ void testWithCodePredict() {
}
}

// Verify HistoryBigram::addWithContext: a matching context extends the
// latest sentence in place, while a non-matching context starts a new one.
void testAppend() {
    using namespace libime;
    HistoryBigram history;
    // Seed the history with one three-word sentence.
    history.addWithCode({{"你", "code1"}, {"是", "code2"}, {"一个", "code3"}});

    // Context matches the tail of the seeded sentence -> should append.
    history.addWithContext({{"是", "code2"}, {"一个", "code3"}},
                           {{"好人", "code4"}});

    // Context does not match anything -> should become its own sentence.
    history.addWithContext({{"不是", "code5"}}, {{"你的", "code6"}});

    std::stringstream dumpStream;
    history.dump(dumpStream);
    const auto lines = fcitx::stringutils::split(dumpStream.str(), "\n");
    // Expect exactly two sentences: the newest one first, then the
    // original sentence extended with the appended word.
    FCITX_ASSERT(lines.size() == 2) << lines.size();
    FCITX_ASSERT(lines[0] == "你的\tcode6") << lines[0];
    FCITX_ASSERT(lines[1] == "你\tcode1 是\tcode2 一个\tcode3 好人\tcode4")
        << lines[1];
}

} // namespace

int main() {
Expand All @@ -276,5 +295,6 @@ int main() {
testSaveAndLoadText();
testWithCode();
testWithCodePredict();
testAppend();
return 0;
}
20 changes: 14 additions & 6 deletions test/testpinyinprediction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "libime/core/userlanguagemodel.h"
#include "libime/pinyin/pinyindictionary.h"
#include "libime/pinyin/pinyinencoder.h"
#include "libime/pinyin/pinyinime.h"
#include "libime/pinyin/pinyinprediction.h"
#include "testdir.h"

Expand All @@ -33,14 +34,21 @@ LogMessageBuilder &operator<<(LogMessageBuilder &log,
} // namespace fcitx

int main() {
UserLanguageModel model(LIBIME_BINARY_DIR "/data/sc.lm");
PinyinDictionary dict;
dict.load(PinyinDictionary::SystemDict,
LIBIME_BINARY_DIR "/data/dict_sc.txt", PinyinDictFormat::Text);
PinyinIME ime(
std::make_unique<PinyinDictionary>(),
std::make_unique<UserLanguageModel>(LIBIME_BINARY_DIR "/data/sc.lm"));
ime.setNBest(2);
ime.dict()->load(PinyinDictionary::SystemDict,
LIBIME_BINARY_DIR "/data/sc.dict",
PinyinDictFormat::Binary);
ime.model()->history().addWithCode({{"可", "JF"}});
auto &model = *ime.model();
auto &dict = *ime.dict();

PinyinPrediction prediction;
prediction.setUserLanguageModel(&model);
prediction.setPinyinDictionary(&dict);
prediction.setUserLanguageModel(ime.model());
prediction.setPinyinDictionary(ime.dict());

auto py = PinyinEncoder::encodeFullPinyin("zhong'guo");
auto result = prediction.predict(model.nullState(), {"我", "喜欢", "中国"},
{py.data(), py.size()}, 20);
Expand Down
Loading