Merge DAG and DP code
This commit is contained in:
parent
ec538ad214
commit
ff62a1e2b9
|
@ -66,8 +66,6 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
|
||||||
|
|
||||||
keywordres.clear();
|
keywordres.clear();
|
||||||
// keywordres.shrink_to_fit();
|
// keywordres.shrink_to_fit();
|
||||||
|
|
||||||
|
|
||||||
return vecNeeds;
|
return vecNeeds;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,6 +167,121 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the word DAG for the rune range [begin, end), visiting runes from
 * the last one to the first.
 *
 * For every rune position `pos`, res[pos].nexts receives:
 *   - entry 0: the single-rune edge (pos + 1, nullptr); its element pointer is
 *     replaced by the dictionary element when a one-rune dictionary word
 *     starts at `pos`;
 *   - one further entry (pos + char_num, element) per accepted multi-rune
 *     dictionary word starting at `pos`.
 *
 * @param begin        first rune of the range
 * @param end          one past the last rune of the range
 * @param res          output; cleared and resized to the number of runes
 * @param max_word_len dictionary matches longer than this many characters
 *                     are ignored
 */
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
                  vector<struct DatDag>& res, size_t max_word_len) const {
    const size_t str_size = end - begin;

    res.clear();
    res.resize(str_size);

    string text_str;
    EncodeRunesToString(begin, end, text_str);

    static const size_t max_num = 128;
    JiebaDAT::result_pair_type result_pairs[max_num] = {};

    // begin_pos tracks the offset into text_str of the rune being processed;
    // it is moved back by each rune's `len` (presumably its encoded byte
    // length -- confirm against RuneStr).
    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
        const size_t pos = str_size - i - 1;
        begin_pos -= (end - i - 1)->len;

        std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);

        // Only max_num slots exist in result_pairs. If the search reports more
        // matches than it could store, never read past the end of the buffer.
        if (num_results > max_num) {
            num_results = max_num;
        }

        // Default single-rune edge; the element stays nullptr unless a
        // one-rune dictionary word is found below.
        res[pos].nexts.push_back(pair<size_t, const DatMemElem *>(pos + 1, nullptr));

        for (std::size_t idx = 0; idx < num_results; ++idx) {
            const auto & match = result_pairs[idx];

            // Skip values that do not index into the element table.
            if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                continue;
            }

            const auto char_num = Utf8CharNum(&text_str[begin_pos], match.length);

            if (char_num > max_word_len) {
                continue;
            }

            const auto pValue = &elements_ptr_[match.value];

            if (1 == char_num) {
                res[pos].nexts[0].second = pValue;
                continue;
            }

            res[pos].nexts.push_back(pair<size_t, const DatMemElem *>(pos + char_num, pValue));
        }
    }
}
|
||||||
|
/**
 * Segments the rune range [begin, end) in a single backward pass that merges
 * DAG construction and the dynamic-programming path search.
 *
 * For each rune position, taken from last to first, every accepted dictionary
 * word starting there is scored as `word weight + best score of the position
 * right after the word`; the best-scoring word end is remembered. A position
 * with no usable dictionary word becomes a single-rune word scored with
 * min_weight_. The chosen path is then walked from the front and appended to
 * `words`.
 *
 * @param begin        first rune of the range
 * @param end          one past the last rune of the range
 * @param words        output; one WordRange(left, right) per chosen word is
 *                     appended, `right` being the word's last rune (inclusive)
 * @param max_word_len dictionary matches longer than this many characters
 *                     are ignored
 */
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
          vector<WordRange>& words, size_t max_word_len) const {
    string text_str;
    EncodeRunesToString(begin, end, text_str);

    static const size_t max_num = 128;
    JiebaDAT::result_pair_type result_pairs[max_num] = {}; // dictionary lookup results

    const size_t str_size = end - begin;
    const size_t unset = static_cast<size_t>(-1);

    // Heap-backed tables instead of variable-length arrays: VLAs are not
    // standard C++, and every entry must be initialised (the former memset
    // covered only str_size bytes rather than str_size ints).
    vector<double> max_weight(str_size, -3.14e+100); // best score of the path starting at each position
    vector<size_t> max_next(str_size, unset);        // end (exclusive) of the word chosen at each position

    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
        const size_t pos = str_size - i - 1; // processed back to front
        begin_pos -= (end - i - 1)->len;

        std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);

        // Only max_num slots exist in result_pairs. If the search reports more
        // matches than it could store, never read past the end of the buffer.
        if (num_results > max_num) {
            num_results = max_num;
        }

        // Score every usable dictionary word that starts at pos.
        for (std::size_t idx = 0; idx < num_results; ++idx) {
            const auto & match = result_pairs[idx];

            // Skip values that do not index into the element table.
            if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                continue;
            }

            const auto char_num = Utf8CharNum(&text_str[begin_pos], match.length);

            if (char_num > max_word_len) {
                continue;
            }

            const size_t next = pos + char_num;

            if (next > str_size) {
                continue;
            }

            double val = elements_ptr_[match.value].weight;

            if (next < str_size) {
                val += max_weight[next];
            }

            if (val > max_weight[pos]) {
                max_weight[pos] = val;
                max_next[pos] = next;
            }
        }

        // No dictionary word was accepted here (nothing found, or every match
        // was filtered out): emit the rune on its own with the minimum weight
        // so the path below always has a successor.
        if (unset == max_next[pos]) {
            double val = min_weight_;

            if (pos + 1 < str_size) {
                val += max_weight[pos + 1];
            }

            max_weight[pos] = val;
            max_next[pos] = pos + 1;
        }
    }

    // Walk the chosen path from the front and collect the words.
    for (size_t i = 0; i < str_size;) {
        const size_t next = max_next[i];
        assert(next > i);
        assert(next <= str_size);
        WordRange wr(begin + i, begin + next - 1);
        words.push_back(wr);
        i = next;
    }
}
|
||||||
double GetMinWeight() const {
|
double GetMinWeight() const {
|
||||||
return min_weight_;
|
return min_weight_;
|
||||||
}
|
}
|
||||||
|
@ -284,7 +399,7 @@ private:
|
||||||
//const int fd =::mkstemp(&tmp_filepath[0]);
|
//const int fd =::mkstemp(&tmp_filepath[0]);
|
||||||
//原mkstemp用法有误,已修复--jxx20210519
|
//原mkstemp用法有误,已修复--jxx20210519
|
||||||
const int fd =::mkstemp((char *)tmp_filepath.data());
|
const int fd =::mkstemp((char *)tmp_filepath.data());
|
||||||
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
|
qDebug() << "mkstemp :" << errno << tmp_filepath.data();
|
||||||
assert(fd >= 0);
|
assert(fd >= 0);
|
||||||
::fchmod(fd, 0644);
|
::fchmod(fd, 0644);
|
||||||
|
|
||||||
|
|
|
@ -49,6 +49,13 @@ public:
|
||||||
dat_.Find(begin, end, res, max_word_len);
|
dat_.Find(begin, end, res, max_word_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Overload that fills a list of word ranges: forwards the rune range, the
 * output vector and the length limit unchanged to dat_.Find().
 *
 * @param begin        first rune of the range
 * @param end          one past the last rune of the range
 * @param words        output vector handed through to dat_.Find()
 * @param max_word_len length limit handed through; defaults to MAX_WORD_LENGTH
 */
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
          vector<WordRange>& words, size_t max_word_len = MAX_WORD_LENGTH) const {
    dat_.Find(begin, end, words, max_word_len);
}
|
||||||
|
|
||||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
||||||
return IsIn(user_dict_single_chinese_word_, word);
|
return IsIn(user_dict_single_chinese_word_, word);
|
||||||
}
|
}
|
||||||
|
|
|
@ -138,10 +138,10 @@ private:
|
||||||
size_t now, old, stat;
|
size_t now, old, stat;
|
||||||
double tmp, endE, endS;
|
double tmp, endE, endS;
|
||||||
|
|
||||||
vector<int> path(XYSize);
|
//vector<int> path(XYSize);
|
||||||
vector<double> weight(XYSize);
|
//vector<double> weight(XYSize);
|
||||||
//int path[XYSize];
|
int path[XYSize];
|
||||||
//double weight[XYSize];
|
double weight[XYSize];
|
||||||
|
|
||||||
//start
|
//start
|
||||||
for (size_t y = 0; y < Y; y++) {
|
for (size_t y = 0; y < Y; y++) {
|
||||||
|
|
|
@ -22,10 +22,11 @@ public:
|
||||||
RuneStrArray::const_iterator end,
|
RuneStrArray::const_iterator end,
|
||||||
vector<WordRange>& words,
|
vector<WordRange>& words,
|
||||||
bool, size_t max_word_len) const override {
|
bool, size_t max_word_len) const override {
|
||||||
vector<DatDag> dags;
|
// vector<DatDag> dags;
|
||||||
dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
|
// dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
|
||||||
CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx
|
// CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx
|
||||||
CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
|
// CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
|
||||||
|
dictTrie_->Find(begin, end, words, max_word_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||||
|
@ -48,6 +49,7 @@ public:
|
||||||
return dictTrie_->IsUserDictSingleChineseWord(value);
|
return dictTrie_->IsUserDictSingleChineseWord(value);
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
|
/*
|
||||||
void CalcDP(vector<DatDag>& dags) const {
|
void CalcDP(vector<DatDag>& dags) const {
|
||||||
double val(0);
|
double val(0);
|
||||||
for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||||
|
@ -73,6 +75,35 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
/* 倒叙方式重写CalcDP函数,初步测试未发现问题*/
|
||||||
|
void CalcDP(vector<DatDag>& dags) const {
|
||||||
|
double val(0);
|
||||||
|
size_t size = dags.size();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i++) {
|
||||||
|
dags[size - 1 - i].max_next = -1;
|
||||||
|
dags[size - 1 - i].max_weight = MIN_DOUBLE;
|
||||||
|
|
||||||
|
for (const auto & it : dags[size - 1 - i].nexts) {
|
||||||
|
const auto nextPos = it.first;
|
||||||
|
val = dictTrie_->GetMinWeight();
|
||||||
|
|
||||||
|
if (nullptr != it.second) {
|
||||||
|
val = it.second->weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nextPos < dags.size()) {
|
||||||
|
val += dags[nextPos].max_weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((nextPos <= dags.size()) && (val > dags[size - 1 - i].max_weight)) {
|
||||||
|
dags[size - 1 - i].max_weight = val;
|
||||||
|
dags[size - 1 - i].max_next = nextPos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void CutByDag(RuneStrArray::const_iterator begin,
|
void CutByDag(RuneStrArray::const_iterator begin,
|
||||||
RuneStrArray::const_iterator,
|
RuneStrArray::const_iterator,
|
||||||
|
|
|
@ -123,65 +123,76 @@ public:
|
||||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
|
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
|
||||||
size_t) const override {
|
size_t) const override {
|
||||||
vector<WordRange> words;
|
vector<WordRange> words;
|
||||||
assert(end >= begin);
|
|
||||||
words.reserve(end - begin);
|
|
||||||
mpSeg_.CutRuneArray(begin, end, words);
|
|
||||||
|
|
||||||
vector<WordRange> hmmRes;
|
vector<WordRange> hmmRes;
|
||||||
hmmRes.reserve(end - begin);
|
assert(end >= begin);
|
||||||
|
if (3 == begin->len or 4 == begin->len) {
|
||||||
|
words.reserve(end - begin);
|
||||||
|
mpSeg_.CutRuneArray(begin, end, words);
|
||||||
|
hmmRes.reserve(words.size());
|
||||||
|
} else {
|
||||||
|
hmmRes.reserve(end - begin);
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < words.size(); i++) {
|
if (words.size() != 0) {//存在中文分词结果
|
||||||
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
|
||||||
string str = GetStringFromRunes(s, words[i].left, words[i].right);
|
string str = GetStringFromRunes(s, words[i].left, words[i].right);
|
||||||
|
|
||||||
if (stopWords_.find(str) != stopWords_.end()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (words[i].left != words[i].right) {
|
|
||||||
res[str].offsets.push_back(words[i].left->offset);
|
|
||||||
res[str].weight += 1.0;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|
|
||||||
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
|
|
||||||
if (stopWords_.find(str) != stopWords_.end()) {
|
if (stopWords_.find(str) != stopWords_.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
res[str].offsets.push_back(words[i].left->offset);
|
|
||||||
res[str].weight += 1.0;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if mp Get a single one and it is not in userdict, collect it in sequence
|
if (words[i].left != words[i].right) {
|
||||||
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符
|
res[str].offsets.push_back(words[i].left->offset);
|
||||||
|
res[str].weight += 1.0;
|
||||||
while (j < (words.size() - 1)
|
|
||||||
&& words[j].left == words[j].right
|
|
||||||
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
|
||||||
j++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cut the sequence with hmm
|
|
||||||
assert(j - 1 >= i);
|
|
||||||
// TODO
|
|
||||||
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
|
|
||||||
|
|
||||||
//put hmm result to result
|
|
||||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
|
||||||
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
|
|
||||||
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
|
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|
||||||
res[hmmStr].weight += 1.0;
|
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
|
||||||
|
if (stopWords_.find(str) != stopWords_.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
res[str].offsets.push_back(words[i].left->offset);
|
||||||
|
res[str].weight += 1.0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||||
|
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符
|
||||||
|
|
||||||
|
while (j < (words.size() - 1)
|
||||||
|
&& words[j].left == words[j].right
|
||||||
|
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cut the sequence with hmm
|
||||||
|
assert(j - 1 >= i);
|
||||||
|
// TODO
|
||||||
|
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
|
||||||
|
|
||||||
|
//put hmm result to result
|
||||||
|
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||||
|
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
|
||||||
|
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
|
||||||
|
res[hmmStr].weight += 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//clear tmp vars
|
||||||
|
hmmRes.clear();
|
||||||
|
|
||||||
|
//let i jump over this piece
|
||||||
|
i = j - 1;
|
||||||
|
}
|
||||||
|
} else {//不存在中文分词结果
|
||||||
|
for (size_t i = 0; i < (size_t)(end - begin); i++) {
|
||||||
|
string str = s.substr((begin+i)->offset, (begin+i)->len);
|
||||||
|
res[str].offsets.push_back((begin+i)->offset);
|
||||||
|
res[str].weight += 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//clear tmp vars
|
|
||||||
hmmRes.clear();
|
|
||||||
|
|
||||||
//let i jump over this piece
|
|
||||||
i = j - 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -57,7 +57,6 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
wordRange.left = cursor_;
|
wordRange.left = cursor_;
|
||||||
|
|
||||||
if (cursor_->rune == 0x20) {
|
if (cursor_->rune == 0x20) {
|
||||||
while (cursor_ != sentence_.end()) {
|
while (cursor_ != sentence_.end()) {
|
||||||
if (cursor_->rune != 0x20) {
|
if (cursor_->rune != 0x20) {
|
||||||
|
@ -71,7 +70,10 @@ public:
|
||||||
cursor_ ++;
|
cursor_ ++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int num = 0;
|
|
||||||
|
int max_num = 0;
|
||||||
|
uint32_t utf8_num = cursor_->len;
|
||||||
|
|
||||||
while (cursor_ != sentence_.end()) {
|
while (cursor_ != sentence_.end()) {
|
||||||
if (cursor_->rune == 0x20) {
|
if (cursor_->rune == 0x20) {
|
||||||
if (wordRange.left == cursor_) {
|
if (wordRange.left == cursor_) {
|
||||||
|
@ -83,8 +85,8 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
cursor_ ++;
|
cursor_ ++;
|
||||||
num++;
|
max_num++;
|
||||||
if (num >= 1024) { //todo 防止一次性传入过多字节,暂定限制为1024个字
|
if (max_num >= 1024 or cursor_->len != utf8_num) { //todo 防止一次性传入过多字节,暂定限制为1024个字
|
||||||
wordRange.right = cursor_;
|
wordRange.right = cursor_;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ TEMPLATE = app
|
||||||
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
|
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
|
||||||
CONFIG += c++11 link_pkgconfig no_keywords lrelease
|
CONFIG += c++11 link_pkgconfig no_keywords lrelease
|
||||||
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
|
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
|
||||||
#LIBS += -lukui-log4qt -L/usr/local/lib/libjemalloc -ljemalloc
|
LIBS += -lukui-log4qt #-L/usr/local/lib/libjemalloc -ljemalloc
|
||||||
# The following define makes your compiler emit warnings if you use
|
# The following define makes your compiler emit warnings if you use
|
||||||
# any Qt feature that has been marked deprecated (the exact warnings
|
# any Qt feature that has been marked deprecated (the exact warnings
|
||||||
# depend on your compiler). Please consult the documentation of the
|
# depend on your compiler). Please consult the documentation of the
|
||||||
|
|
Loading…
Reference in New Issue