From ff62a1e2b976a4a04eb46cb49fb8b59508327ab3 Mon Sep 17 00:00:00 2001 From: jixiaoxu Date: Wed, 7 Jul 2021 11:37:00 +0800 Subject: [PATCH] Merge DAG and DP code --- .../chinese-segmentation.cpp | 2 - libchinese-segmentation/cppjieba/DatTrie.hpp | 117 +++++++++++++++++- libchinese-segmentation/cppjieba/DictTrie.hpp | 7 ++ .../cppjieba/HMMSegment.hpp | 8 +- .../cppjieba/MPSegment.hpp | 39 +++++- .../cppjieba/MixSegment.hpp | 107 +++++++++------- .../cppjieba/PreFilter.hpp | 10 +- src/src.pro | 2 +- 8 files changed, 228 insertions(+), 64 deletions(-) diff --git a/libchinese-segmentation/chinese-segmentation.cpp b/libchinese-segmentation/chinese-segmentation.cpp index fe4e95f..3b6f04c 100644 --- a/libchinese-segmentation/chinese-segmentation.cpp +++ b/libchinese-segmentation/chinese-segmentation.cpp @@ -66,8 +66,6 @@ QVector ChineseSegmentation::callSegement(std::string s) { keywordres.clear(); // keywordres.shrink_to_fit(); - - return vecNeeds; } diff --git a/libchinese-segmentation/cppjieba/DatTrie.hpp b/libchinese-segmentation/cppjieba/DatTrie.hpp index a4967a3..0709a4f 100644 --- a/libchinese-segmentation/cppjieba/DatTrie.hpp +++ b/libchinese-segmentation/cppjieba/DatTrie.hpp @@ -167,6 +167,121 @@ public: } } + void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, + vector&res, size_t max_word_len) const { + + res.clear(); + res.resize(end - begin); + + string text_str; + EncodeRunesToString(begin, end, text_str); + + static const size_t max_num = 128; + JiebaDAT::result_pair_type result_pairs[max_num] = {}; + + size_t str_size = end - begin; + for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) { + + begin_pos -= (end - i - 1)->len; + std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num); + res[str_size - i - 1].nexts.push_back(pair(str_size - i, nullptr)); + + for (std::size_t idx = 0; idx < num_results; ++idx) { + auto & match = result_pairs[idx]; + if ((match.value < 0) || ((size_t)match.value >= elements_num_)) { + continue; + } + + auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length); + + if (char_num > max_word_len) { + continue; + } + + auto pValue = &elements_ptr_[match.value]; + + if (1 == char_num) { + res[str_size - i - 1].nexts[0].second = pValue; + continue; + } + + res[str_size - i - 1].nexts.push_back(pair(str_size - 1 - i + char_num, pValue)); + } + } + } + void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, + vector& words, size_t max_word_len) const { + + string text_str; + EncodeRunesToString(begin, end, text_str); + + static const size_t max_num = 128; + JiebaDAT::result_pair_type result_pairs[max_num] = {};//存放字典查询结果 + size_t str_size = end - begin; + double max_weight[str_size];//存放逆向路径最大weight + for (size_t i = 0; ilen; + + std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num); + if (0 == num_results) {//字典不存在则单独分词 + val = min_weight_; + + if (nextPos < str_size) { + val += max_weight[nextPos]; + } + if ((nextPos <= str_size) && (val > max_weight[nextPos - 1])) { + max_weight[nextPos - 1] = val; + max_next[nextPos - 1] = nextPos; + } + } else {//字典存在则根据查询结果数量计算最大概率路径 + for (std::size_t idx = 0; idx < num_results; ++idx) { + auto & match = result_pairs[idx]; + if ((match.value < 0) || ((size_t)match.value >= elements_num_)) { + continue; + } + auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length); + if (char_num > max_word_len) { + continue; + } + auto pValue = &elements_ptr_[match.value]; + + val = pValue->weight; + if (1 == char_num) { + if (nextPos < str_size) { + val += max_weight[nextPos]; + } + if ((nextPos <= str_size) && (val > max_weight[nextPos - 1])) { + max_weight[nextPos - 1] = val; + max_next[nextPos - 1] = nextPos; + } + } else { + if (nextPos - 1 + char_num < str_size) { + val += max_weight[nextPos - 1 + char_num]; + } + if ((nextPos - 1 + char_num <= str_size) && (val > max_weight[nextPos - 1])) { + max_weight[nextPos - 1] = val; + max_next[nextPos - 1] = nextPos - 1 + char_num; + } + } + } + } + } + for (size_t i = 0; i < str_size;) {//统计动态规划结果 + assert(max_next[i] > i); + assert(max_next[i] <= str_size); + WordRange wr(begin + i, begin + max_next[i] - 1); + words.push_back(wr); + i = max_next[i]; + } + } double GetMinWeight() const { return min_weight_; } @@ -284,7 +399,7 @@ private: //const int fd =::mkstemp(&tmp_filepath[0]); //原mkstemp用法有误,已修复--jxx20210519 const int fd =::mkstemp((char *)tmp_filepath.data()); - qDebug() << "mkstemp error:" << errno << tmp_filepath.data(); + qDebug() << "mkstemp :" << errno << tmp_filepath.data(); assert(fd >= 0); ::fchmod(fd, 0644); diff --git a/libchinese-segmentation/cppjieba/DictTrie.hpp b/libchinese-segmentation/cppjieba/DictTrie.hpp index 5ecee54..44a6cb9 100644 --- a/libchinese-segmentation/cppjieba/DictTrie.hpp +++ b/libchinese-segmentation/cppjieba/DictTrie.hpp @@ -49,6 +49,13 @@ public: dat_.Find(begin, end, res, max_word_len); } + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { + dat_.Find(begin, end, words, max_word_len); + } + bool IsUserDictSingleChineseWord(const Rune& word) const { return IsIn(user_dict_single_chinese_word_, word); } diff --git a/libchinese-segmentation/cppjieba/HMMSegment.hpp b/libchinese-segmentation/cppjieba/HMMSegment.hpp index 30af449..1a9937b 100644 --- a/libchinese-segmentation/cppjieba/HMMSegment.hpp +++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp @@ -138,10 +138,10 @@ private: size_t now, old, stat; double tmp, endE, endS; - vector path(XYSize); - vector weight(XYSize); - //int path[XYSize]; - //double weight[XYSize]; + //vector path(XYSize); + //vector weight(XYSize); + int path[XYSize]; + double weight[XYSize]; //start for (size_t y = 0; y < Y; y++) { diff --git a/libchinese-segmentation/cppjieba/MPSegment.hpp b/libchinese-segmentation/cppjieba/MPSegment.hpp index d615fe2..0158e4a 100644 --- a/libchinese-segmentation/cppjieba/MPSegment.hpp +++ b/libchinese-segmentation/cppjieba/MPSegment.hpp @@ -22,10 +22,11 @@ public: RuneStrArray::const_iterator end, vector& words, bool, size_t max_word_len) const override { - vector dags; - dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx - CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx - CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx +// vector dags; +// dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx +// CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx +// CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx + dictTrie_->Find(begin, end, words, max_word_len); } virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm, @@ -48,6 +49,7 @@ public: return dictTrie_->IsUserDictSingleChineseWord(value); } private: +/* void CalcDP(vector& dags) const { double val(0); for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) { @@ -73,6 +75,35 @@ private: } } } +*/ +/* 倒叙方式重写CalcDP函数,初步测试未发现问题*/ + void CalcDP(vector& dags) const { + double val(0); + size_t size = dags.size(); + + for (size_t i = 0; i < size; i++) { + dags[size - 1 - i].max_next = -1; + dags[size - 1 - i].max_weight = MIN_DOUBLE; + + for (const auto & it : dags[size - 1 - i].nexts) { + const auto nextPos = it.first; + val = dictTrie_->GetMinWeight(); + + if (nullptr != it.second) { + val = it.second->weight; + } + + if (nextPos < dags.size()) { + val += dags[nextPos].max_weight; + } + + if ((nextPos <= dags.size()) && (val > dags[size - 1 - i].max_weight)) { + dags[size - 1 - i].max_weight = val; + dags[size - 1 - i].max_next = nextPos; + } + } + } + } void CutByDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator, diff --git a/libchinese-segmentation/cppjieba/MixSegment.hpp b/libchinese-segmentation/cppjieba/MixSegment.hpp index a539039..9e67069 100644 --- a/libchinese-segmentation/cppjieba/MixSegment.hpp +++ b/libchinese-segmentation/cppjieba/MixSegment.hpp @@ -123,65 +123,76 @@ public: virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, size_t) const override { vector words; - assert(end >= begin); - words.reserve(end - begin); - mpSeg_.CutRuneArray(begin, end, words); - vector hmmRes; - hmmRes.reserve(end - begin); + assert(end >= begin); + if (3 == begin->len or 4 == begin->len) { + words.reserve(end - begin); + mpSeg_.CutRuneArray(begin, end, words); + hmmRes.reserve(words.size()); + } else { + hmmRes.reserve(end - begin); + } - for (size_t i = 0; i < words.size(); i++) { + if (words.size() != 0) {//存在中文分词结果 + for (size_t i = 0; i < words.size(); i++) { - string str = GetStringFromRunes(s, words[i].left, words[i].right); + string str = GetStringFromRunes(s, words[i].left, words[i].right); - if (stopWords_.find(str) != stopWords_.end()) { - continue; - } - - if (words[i].left != words[i].right) { - res[str].offsets.push_back(words[i].left->offset); - res[str].weight += 1.0; - continue; - } - if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune) - || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back if (stopWords_.find(str) != stopWords_.end()) { continue; } - res[str].offsets.push_back(words[i].left->offset); - res[str].weight += 1.0; - continue; - } - // if mp Get a single one and it is not in userdict, collect it in sequence - size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符 - - while (j < (words.size() - 1) - && words[j].left == words[j].right - && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { - j++; - } - - // Cut the sequence with hmm - assert(j - 1 >= i); - // TODO - hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes); - - //put hmm result to result - for (size_t k = 0; k < hmmRes.size(); k++) { - string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right); - if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) { + if (words[i].left != words[i].right) { + res[str].offsets.push_back(words[i].left->offset); + res[str].weight += 1.0; continue; } - res[hmmStr].offsets.push_back(hmmRes[k].left->offset); - res[hmmStr].weight += 1.0; + if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune) + || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back + if (stopWords_.find(str) != stopWords_.end()) { + continue; + } + res[str].offsets.push_back(words[i].left->offset); + res[str].weight += 1.0; + continue; + } + + // if mp Get a single one and it is not in userdict, collect it in sequence + size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符 + + while (j < (words.size() - 1) + && words[j].left == words[j].right + && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { + j++; + } + + // Cut the sequence with hmm + assert(j - 1 >= i); + // TODO + hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes); + + //put hmm result to result + for (size_t k = 0; k < hmmRes.size(); k++) { + string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right); + if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) { + continue; + } + res[hmmStr].offsets.push_back(hmmRes[k].left->offset); + res[hmmStr].weight += 1.0; + } + + //clear tmp vars + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; + } + } else {//不存在中文分词结果 + for (size_t i = 0; i < (size_t)(end - begin); i++) { + string str = s.substr((begin+i)->offset, (begin+i)->len); + res[str].offsets.push_back((begin+i)->offset); + res[str].weight += 1.0; } - - //clear tmp vars - hmmRes.clear(); - - //let i jump over this piece - i = j - 1; } } diff --git a/libchinese-segmentation/cppjieba/PreFilter.hpp b/libchinese-segmentation/cppjieba/PreFilter.hpp index 3f04dcf..2dd30dd 100644 --- a/libchinese-segmentation/cppjieba/PreFilter.hpp +++ b/libchinese-segmentation/cppjieba/PreFilter.hpp @@ -57,7 +57,6 @@ public: } wordRange.left = cursor_; - if (cursor_->rune == 0x20) { while (cursor_ != sentence_.end()) { if (cursor_->rune != 0x20) { @@ -71,7 +70,10 @@ public: cursor_ ++; } } - int num = 0; + + int max_num = 0; + uint32_t utf8_num = cursor_->len; + while (cursor_ != sentence_.end()) { if (cursor_->rune == 0x20) { if (wordRange.left == cursor_) { @@ -83,8 +85,8 @@ public: } cursor_ ++; - num++; - if (num >= 1024) { //todo 防止一次性传入过多字节,暂定限制为1024个字 + max_num++; + if (max_num >= 1024 or cursor_->len != utf8_num) { //todo 防止一次性传入过多字节,暂定限制为1024个字 wordRange.right = cursor_; return true; } diff --git a/src/src.pro b/src/src.pro index ff309e5..f61b5a7 100644 --- a/src/src.pro +++ b/src/src.pro @@ -9,7 +9,7 @@ TEMPLATE = app PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 CONFIG += c++11 link_pkgconfig no_keywords lrelease LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11 -#LIBS += -lukui-log4qt -L/usr/local/lib/libjemalloc -ljemalloc +LIBS += -lukui-log4qt #-L/usr/local/lib/libjemalloc -ljemalloc # The following define makes your compiler emit warnings if you use # any Qt feature that has been marked deprecated (the exact warnings # depend on your compiler). Please consult the documentation of the