修复分词问题;

This commit is contained in:
jixiaoxu 2022-05-18 10:32:47 +08:00 committed by iaom
parent 76c1a7f73b
commit 32c3747818
1 changed files with 18 additions and 8 deletions

View File

@ -138,15 +138,15 @@ public:
string str = GetStringFromRunes(s, words[i].left, words[i].right); string str = GetStringFromRunes(s, words[i].left, words[i].right);
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
if (words[i].left != words[i].right) { if (words[i].left != words[i].right) {
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
res[str].offsets.push_back(words[i].left->offset); res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0; res[str].weight += 1.0;
continue; continue;
} }
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune) if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
if (stopWords_.find(str) != stopWords_.end()) { if (stopWords_.find(str) != stopWords_.end()) {
@ -156,20 +156,27 @@ public:
res[str].weight += 1.0; res[str].weight += 1.0;
continue; continue;
} }
// if mp Get a single one and it is not in userdict, collect it in sequence // if mp Get a single one and it is not in userdict, collect it in sequence
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里i字符不是最后一个字符直接判定j字符 size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里i字符不是最后一个字符直接判定j字符
bool isLastWordsSingle(false);
while (j < (words.size() - 1) while (j <= (words.size() - 1)
&& words[j].left == words[j].right && words[j].left == words[j].right
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
if (j == (words.size() - 1)) {//最后一个分词结果是单字
isLastWordsSingle = true;
break;
}
j++; j++;
} }
// Cut the sequence with hmm // Cut the sequence with hmm
assert(j - 1 >= i); assert(j - 1 >= i);
// TODO // TODO
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes); if (isLastWordsSingle) {
hmmSeg_.CutRuneArray(words[i].left, words[j].left + 1, hmmRes);
} else {
hmmSeg_.CutRuneArray(words[i].left, words[j].left, hmmRes);
}
//put hmm result to result //put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) { for (size_t k = 0; k < hmmRes.size(); k++) {
@ -185,6 +192,9 @@ public:
hmmRes.clear(); hmmRes.clear();
//let i jump over this piece //let i jump over this piece
if (isLastWordsSingle) {
break;
}
i = j - 1; i = j - 1;
} }
} else {//不存在中文分词结果 } else {//不存在中文分词结果