Skip to content

Commit

Permalink
Fix pinyin parsing for cases like suang -> suan g.
Browse files Browse the repository at this point in the history
When backtracking while parsing pinyin, the pinyin map check should
take the fuzzy flags into account; otherwise an invalid pinyin such as
"sua" may be treated as valid during parsing.
  • Loading branch information
wengxt committed May 2, 2024
1 parent 9ed5f07 commit 46797cd
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 12 deletions.
32 changes: 20 additions & 12 deletions src/libime/pinyin/pinyinencoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,21 @@ struct LongestMatchResult {
bool isCompletePinyin;
};

// Reports whether `map` contains at least one entry keyed by `range` whose
// flags() are accepted by `flags.test(...)`.
//
// Returns false when no entry has that key, or when every entry with that key
// is rejected by the flag test.
// NOTE(review): presumably `flags.test` checks that all of the entry's fuzzy
// flags are enabled in `flags` -- confirm against PinyinFuzzyFlags.
bool hasMatchInMap(const PinyinMap &map, std::string_view range,
                   PinyinFuzzyFlags flags) {
    const auto candidates = map.equal_range(range);
    // An empty equal_range yields first == second, so the loop body is simply
    // never entered in that case.
    for (auto entry = candidates.first; entry != candidates.second; ++entry) {
        if (flags.test(entry->flags())) {
            return true;
        }
    }
    return false;
}

template <typename Iter>
LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags,
const PinyinMap &map) {
Expand All @@ -109,16 +124,10 @@ LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags,
}
auto range = std::string_view(&*iter, std::distance(iter, end));
for (; !range.empty(); range.remove_suffix(1)) {
auto iterPair = map.equal_range(range);
if (iterPair.first != iterPair.second) {
for (const auto &item :
boost::make_iterator_range(iterPair.first, iterPair.second)) {
if (flags.test(item.flags())) {
// do not consider m/n/r as complete pinyin
return {true, range,
(range != "m" && range != "n" && range != "r")};
}
}
if (hasMatchInMap(map, range, flags)) {
// do not consider m/n/r as complete pinyin
return {true, range,
(range != "m" && range != "n" && range != "r")};
}
if (range.size() <= 2) {
auto iter = initialMap.right.find(std::string{range});
Expand Down Expand Up @@ -217,8 +226,7 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin,
str.back() == 'o' || str.back() == 'r' ||
str.back() == 'h' ||
fuzzyFlags.test(PinyinFuzzyFlag::Correction)) &&
pinyinMap.find(str.substr(0, str.size() - 1)) !=
pinyinMap.end()) {
hasMatchInMap(pinyinMap, str.substr(0, str.size() - 1), fuzzyFlags)) {
// str[0:-1] is also a full pinyin, check next pinyin
auto nextMatch = longestMatch(iter + str.size(), end,
fuzzyFlags, pinyinMap);
Expand Down
4 changes: 4 additions & 0 deletions test/testpinyinencoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ int main() {
PinyinFuzzyFlag::Correction);
dfs(graph, {"xiang", "uao", "yi", "ge"});
dfs(graph, {"xian", "gu", "ao", "yi", "ge"});

graph = PinyinEncoder::parseUserPinyin("suang", &profile,
PinyinFuzzyFlag::Correction);
dfs(graph, {"suan", "g"});
}

return 0;
Expand Down

0 comments on commit 46797cd

Please sign in to comment.