Merge branch '0607-dev' into 'dev-unity'

Optimize Jieba keyword extraction; reduce memory usage and improve efficiency.

See merge request kylin-desktop/ukui-search!21
Zhai Kangning 2021-06-11 07:27:25 +00:00
commit 00365d291c
15 changed files with 257 additions and 98 deletions

View File

@@ -58,7 +58,7 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
 //    str.squeeze();
     const size_t topk = -1;
-    std::vector<cppjieba::KeywordExtractor::Word> keywordres;
+    std::vector<cppjieba::KeyWord> keywordres;
     ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk);
     std::string().swap(s);
     QVector<SKeyWord> vecNeeds;
@@ -72,16 +72,16 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
 }
-std::vector<cppjieba::KeywordExtractor::Word> ChineseSegmentation::callSegementStd(const std::string &str) {
+std::vector<cppjieba::KeyWord> ChineseSegmentation::callSegementStd(const std::string &str) {
     const size_t topk = -1;
-    std::vector<cppjieba::KeywordExtractor::Word> keywordres;
+    std::vector<cppjieba::KeyWord> keywordres;
     ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
     return keywordres;
 }
-void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
+void ChineseSegmentation::convert(std::vector<cppjieba::KeyWord> &keywordres, QVector<SKeyWord> &kw) {
     for(auto i : keywordres) {
         SKeyWord temp;
         temp.word = i.word;

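With callSegementStd now returning std::vector<cppjieba::KeyWord> directly, a caller can go from a std::string to weighted keywords without any QString round trip. A minimal usage sketch, not part of the patch; the header name and the surrounding function are assumptions, while getInstance(), callSegementStd() and the KeyWord fields come from the hunks in this merge request:

    #include <string>
    #include <vector>
    #include "chinese-segmentation.h"   // assumed header name for ChineseSegmentation

    // Index one plain-text document: std::string in, weighted keywords out,
    // with no QString/std::string conversion on the hot path.
    void indexPlainText(const std::string &content) {
        std::vector<cppjieba::KeyWord> terms =
                ChineseSegmentation::getInstance()->callSegementStd(content);
        for (const cppjieba::KeyWord &kw : terms) {
            // kw.word    - the extracted term
            // kw.offsets - byte offsets of every occurrence in content
            // kw.weight  - term frequency scaled by IDF in the extractor
            (void)kw;   // hand kw.word / kw.offsets / kw.weight to the indexer here
        }
    }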
View File

@@ -50,8 +50,8 @@ public:
     QVector<SKeyWord> callSegement(std::string s);
     // Added callSegementStd: return std::vector<cppjieba::KeywordExtractor::Word> directly and simplify the internal processing --jxx20210517
     // Changed the parameter to a reference, removing the QString/std::string conversion code --jxx20210519
-    std::vector<cppjieba::KeywordExtractor::Word> callSegementStd(const std::string& str);
-    void convert(std::vector<cppjieba::KeywordExtractor::Word>& keywordres, QVector<SKeyWord>& kw);
+    std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
+    void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
 private:
     static QMutex m_mutex;
     cppjieba::Jieba *m_jieba;

View File

@@ -47,7 +47,10 @@ public:
              size_t) const override {
   }
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+             size_t) const override {
+  }
 private:
   const DictTrie* dictTrie_;
 };

View File

@@ -21,7 +21,7 @@ public:
     RuneStrArray::const_iterator right = begin;
     while (right != end) {
-      if (right->rune < 0x80) {
+      if (right->rune < 0x80) { // ASCII
         if (left != right) {
           InternalCut(left, right, res);
         }
@@ -29,13 +29,13 @@ public:
         left = right;
         do {
-          right = SequentialLetterRule(left, end);
+          right = SequentialLetterRule(left, end); // returns left for a non-letter, otherwise the first non-letter position after left
           if (right != left) {
             break;
           }
-          right = NumbersRule(left, end);
+          right = NumbersRule(left, end); // returns left for a non-digit, otherwise the first non-digit position after left
           if (right != left) {
             break;
@@ -61,7 +61,10 @@ public:
              size_t) const override {
   }
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+             size_t) const override {
+  }
 private:
   // sequential letters rule
   RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
@@ -135,8 +138,10 @@ private:
     size_t now, old, stat;
     double tmp, endE, endS;
-    vector<int> path(XYSize);
-    vector<double> weight(XYSize);
+    //vector<int> path(XYSize);
+    //vector<double> weight(XYSize);
+    int path[XYSize];
+    double weight[XYSize];
     //start
     for (size_t y = 0; y < Y; y++) {

View File

@@ -18,9 +18,9 @@ public:
       model_(model_path),
       mp_seg_(&dict_trie_),
       hmm_seg_(&model_),
-      mix_seg_(&dict_trie_, &model_),
+      mix_seg_(&dict_trie_, &model_, stopWordPath),
       full_seg_(&dict_trie_),
-      query_seg_(&dict_trie_, &model_),
+      query_seg_(&dict_trie_, &model_, stopWordPath),
       extractor(&dict_trie_, &model_, idfPath, stopWordPath){ }
   ~Jieba() { }

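With this hunk the stop-word list travels from the Jieba constructor into MixSegment and QuerySegment, so stop words are dropped during segmentation itself rather than only inside KeywordExtractor. A hedged construction sketch; the dictionary file names are the stock cppjieba ones and are an assumption about how they are shipped here:

    #include <string>
    #include <vector>
    #include "cppjieba/Jieba.hpp"

    int main() {
        // The five resource paths are the usual cppjieba set; actual locations
        // used by ukui-search may differ.
        cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                              "dict/hmm_model.utf8",
                              "dict/user.dict.utf8",
                              "dict/idf.utf8",
                              "dict/stop_words.utf8");

        // The extractor now receives segments with stop words already filtered out.
        std::vector<cppjieba::KeyWord> keywords;
        jieba.extractor.Extract("小明硕士毕业于中国科学院计算所", keywords, 5);
        return 0;
    }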
View File

@@ -1,7 +1,6 @@
 #pragma once
 #include <cmath>
-#include <set>
 #include "MixSegment.hpp"
 namespace cppjieba {
@@ -12,25 +11,24 @@ using namespace std;
 /*utf8*/
 class KeywordExtractor {
  public:
-  struct Word {
-    string word;
-    vector<size_t> offsets;
-    double weight;
-  }; // struct Word
+  // struct Word {
+  //   string word;
+  //   vector<size_t> offsets;
+  //   double weight;
+  // }; // struct Word
   KeywordExtractor(const DictTrie* dictTrie,
                    const HMMModel* model,
                    const string& idfPath,
                    const string& stopWordPath)
-    : segment_(dictTrie, model) {
+    : segment_(dictTrie, model, stopWordPath) {
     LoadIdfDict(idfPath);
-    LoadStopWordDict(stopWordPath);
   }
   ~KeywordExtractor() {
   }
   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-    vector<Word> topWords;
+    vector<KeyWord> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
@@ -39,7 +37,7 @@ public:
   }
   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
-    vector<Word> topWords;
+    vector<KeyWord> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
@@ -47,34 +45,24 @@ public:
     }
   }
-  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
-    vector<string> words;
-    segment_.CutToStr(sentence, words); // cut the string into words and store them in the vector
-    map<string, Word> wordmap; // map keyed by string; identical strings accumulate term frequency into the weight
-    size_t offset = 0;
-    for (size_t i = 0; i < words.size(); ++i) {
-      size_t t = offset;
-      offset += words[i].size();
-      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+  void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
+    unordered_map<string, KeyWord> wordmap; // map keyed by string; identical strings accumulate term frequency into the weight
+    PreFilter pre_filter(symbols_, sentence);
+    RuneStrArray::const_iterator null_p;
+    WordRange range(null_p, null_p);
+    bool isNull(false);
+    while (pre_filter.Next(range, isNull)) {
+      if (isNull) {
         continue;
       }
-      wordmap[words[i]].offsets.push_back(t);
-      wordmap[words[i]].weight += 1.0;
-    }
-    if (offset != sentence.size()) {
-      XLOG(ERROR) << "words illegal";
-      return;
+      segment_.CutToStr(sentence, range, wordmap);
     }
     keywords.clear();
     keywords.reserve(wordmap.size());
-    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+    for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
       unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first); // look the word up in the IDF dictionary
       if (cit != idfMap_.end()) {
@@ -129,22 +117,8 @@ private:
     idfAverage_ = idfSum / lineno;
     assert(idfAverage_ > 0.0);
   }
-  void LoadStopWordDict(const string& filePath) {
-    ifstream ifs(filePath.c_str());
-    if(not ifs.is_open()){
-      return ;
-    }
-    XCHECK(ifs.is_open()) << "open " << filePath << " failed";
-    string line ;
-    while (getline(ifs, line)) {
-      stopWords_.insert(line);
-    }
-    assert(stopWords_.size());
-  }
-  static bool Compare(const Word& lhs, const Word& rhs) {
+  static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
     return lhs.weight > rhs.weight;
   }
@@ -152,10 +126,10 @@ private:
   unordered_map<string, double> idfMap_;
   double idfAverage_;
-  unordered_set<string> stopWords_;
+  unordered_set<Rune> symbols_;
 }; // class KeywordExtractor
-inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+inline ostream& operator << (ostream& os, const KeyWord& word) {
   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
          "}";
 }

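For reference, the step the last Extract hunk cuts off just before: each accumulated term frequency is multiplied by the word's IDF, or by idfAverage_ when the word is missing from the IDF dictionary, and the heaviest topN entries are kept. A standalone sketch of that weighting and ranking, following the upstream cppjieba logic; the free function, its parameters and the local KeyWord struct are illustrative only:

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct KeyWord {
        std::string word;
        std::vector<size_t> offsets;
        double weight;   // term frequency in, TF-IDF out
    };

    std::vector<KeyWord> RankKeywords(std::unordered_map<std::string, KeyWord> wordmap,
                                      const std::unordered_map<std::string, double> &idfMap,
                                      double idfAverage, size_t topN) {
        std::vector<KeyWord> keywords;
        keywords.reserve(wordmap.size());
        for (auto &entry : wordmap) {
            auto cit = idfMap.find(entry.first);
            // Known words use their IDF; unknown words fall back to the average IDF.
            entry.second.weight *= (cit != idfMap.end()) ? cit->second : idfAverage;
            entry.second.word = entry.first;
            keywords.push_back(entry.second);
        }
        // Keep the topN heaviest keywords, highest weight first.
        topN = std::min(topN, keywords.size());
        std::partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(),
                          [](const KeyWord &l, const KeyWord &r) { return l.weight > r.weight; });
        keywords.resize(topN);
        return keywords;
    }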
View File

@@ -32,7 +32,10 @@ public:
              size_t) const override {
   }
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+             size_t) const override {
+  }
   const DictTrie* GetDictTrie() const override {
     return dictTrie_;
   }
@@ -46,13 +49,14 @@ public:
   }
 private:
   void CalcDP(vector<DatDag>& dags) const {
+    double val(0);
     for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
       rit->max_next = -1;
       rit->max_weight = MIN_DOUBLE;
       for (const auto & it : rit->nexts) {
         const auto nextPos = it.first;
-        double val = dictTrie_->GetMinWeight();
+        val = dictTrie_->GetMinWeight();
         if (nullptr != it.second) {
           val = it.second->weight;

View File

@@ -9,8 +9,11 @@
 namespace cppjieba {
 class MixSegment: public SegmentTagged {
  public:
-  MixSegment(const DictTrie* dictTrie, const HMMModel* model)
+  MixSegment(const DictTrie* dictTrie,
+             const HMMModel* model,
+             const string& stopWordPath)
     : mpSeg_(dictTrie), hmmSeg_(model) {
+    LoadStopWordDict(stopWordPath);
   }
   ~MixSegment() {}
@@ -81,16 +84,20 @@ public:
     for (size_t i = 0; i < words.size(); i++) {
       //if mp Get a word, it's ok, put it into result
-      if (words[i].left != words[i].right || (words[i].left == words[i].right &&
-          mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
+      if (words[i].left != words[i].right) {
+        res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
+        continue;
+      }
+      if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
+          || i == (words.size() - 1)) { // if i is already the last character (after i++), push_back directly
         res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
         continue;
       }
       // if mp Get a single one and it is not in userdict, collect it in sequence
-      size_t j = i;
-      while (j < words.size() && words[j].left == words[j].right &&
+      size_t j = i + 1; // character i is a single character, not in the user dictionary and not the last one, so move straight on to character j
+      while (j < (words.size() - 1) && words[j].left == words[j].right &&
              !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
         j++;
       }
@@ -113,6 +120,70 @@ public:
     }
   }
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+               size_t) const override {
+    vector<WordRange> words;
+    assert(end >= begin);
+    words.reserve(end - begin);
+    mpSeg_.CutRuneArray(begin, end, words);
+    vector<WordRange> hmmRes;
+    hmmRes.reserve(end - begin);
+    for (size_t i = 0; i < words.size(); i++) {
+      string str = GetStringFromRunes(s, words[i].left, words[i].right);
+      if (stopWords_.find(str) != stopWords_.end()) {
+        continue;
+      }
+      if (words[i].left != words[i].right) {
+        res[str].offsets.push_back(words[i].left->offset);
+        res[str].weight += 1.0;
+        continue;
+      }
+      if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
+          || i == (words.size() - 1)) { // if i is already the last character (after i++), record it directly
+        if (stopWords_.find(str) != stopWords_.end()) {
+          continue;
+        }
+        res[str].offsets.push_back(words[i].left->offset);
+        res[str].weight += 1.0;
+        continue;
+      }
+      // if mp Get a single one and it is not in userdict, collect it in sequence
+      size_t j = i + 1; // character i is a single character, not in the user dictionary and not the last one, so move straight on to character j
+      while (j < (words.size() - 1) && words[j].left == words[j].right &&
+             !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+        j++;
+      }
+      // Cut the sequence with hmm
+      assert(j - 1 >= i);
+      // TODO
+      hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
+      //put hmm result to result
+      for (size_t k = 0; k < hmmRes.size(); k++) {
+        string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
+        if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
+          continue;
+        }
+        res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
+        res[hmmStr].weight += 1.0;
+      }
+      //clear tmp vars
+      hmmRes.clear();
+      //let i jump over this piece
+      i = j - 1;
+    }
+  }
   const DictTrie* GetDictTrie() const override {
     return mpSeg_.GetDictTrie();
   }
@@ -125,7 +196,23 @@ public:
     return tagger_.LookupTag(str, *this);
   }
+  void LoadStopWordDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    if(not ifs.is_open()){
+      return ;
+    }
+    XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+    string line ;
+    while (getline(ifs, line)) {
+      stopWords_.insert(line);
+    }
+    assert(stopWords_.size());
+  }
 private:
+  unordered_set<string> stopWords_;
   MPSegment mpSeg_;
   HMMSegment hmmSeg_;
   PosTagger tagger_;

View File

@@ -22,6 +22,73 @@ public:
   bool HasNext() const {
     return cursor_ != sentence_.end();
   }
+  bool Next(WordRange& wordRange) {
+    if (cursor_ == sentence_.end()) {
+      return false;
+    }
+    wordRange.left = cursor_;
+    while (cursor_->rune == 0x20 && cursor_ != sentence_.end()) {
+      cursor_++;
+    }
+    if (cursor_ == sentence_.end()) {
+      wordRange.right = cursor_;
+      return true;
+    }
+    while (++cursor_ != sentence_.end()) {
+      if (cursor_->rune == 0x20) {
+        wordRange.right = cursor_;
+        return true;
+      }
+    }
+    wordRange.right = sentence_.end();
+    return true;
+  }
+  bool Next(WordRange& wordRange, bool& isNull) {
+    isNull = false;
+    if (cursor_ == sentence_.end()) {
+      return false;
+    }
+    wordRange.left = cursor_;
+    if (cursor_->rune == 0x20) {
+      while (cursor_ != sentence_.end()) {
+        if (cursor_->rune != 0x20) {
+          if (wordRange.left == cursor_) {
+            cursor_ ++;
+          }
+          wordRange.right = cursor_;
+          isNull = true;
+          return true;
+        }
+        cursor_ ++;
+      }
+    }
+    while (cursor_ != sentence_.end()) {
+      if (cursor_->rune == 0x20) {
+        if (wordRange.left == cursor_) {
+          cursor_ ++;
+        }
+        wordRange.right = cursor_;
+        return true;
+      }
+      cursor_ ++;
+    }
+    wordRange.right = sentence_.end();
+    return true;
+  }
   WordRange Next() {
     WordRange range(cursor_, cursor_);

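The two Next overloads added above split the rune array on spaces (rune 0x20): the single-argument form yields space-delimited ranges, while the isNull form also reports runs of spaces so the caller (see KeywordExtractor::Extract earlier in this diff) can skip them. A standalone analogue over a plain std::string, meant only as a sketch of the intended behaviour, not of the exact iterator handling:

    #include <cstddef>
    #include <iostream>
    #include <string>

    // Yield the next maximal run of either spaces or non-spaces starting at cursor.
    // isNull is true for a run of spaces, mirroring PreFilter::Next(WordRange&, bool&).
    bool NextRange(const std::string &s, size_t &cursor, size_t &left, size_t &right, bool &isNull) {
        if (cursor == s.size()) return false;
        left = cursor;
        isNull = (s[cursor] == ' ');
        while (cursor < s.size() && (s[cursor] == ' ') == isNull) {
            ++cursor;                       // extend the run while the "is space" state matches
        }
        right = cursor;
        return true;
    }

    int main() {
        std::string s = "hello  world";
        size_t cur = 0, l = 0, r = 0;
        bool isNull = false;
        while (NextRange(s, cur, l, r, isNull)) {
            std::cout << (isNull ? "[space run] '" : "[text] '") << s.substr(l, r - l) << "'\n";
        }
        return 0;
    }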
View File

@@ -14,8 +14,10 @@
 namespace cppjieba {
 class QuerySegment: public SegmentBase {
  public:
-  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
-    : mixSeg_(dictTrie, model), trie_(dictTrie) {
+  QuerySegment(const DictTrie* dictTrie,
+               const HMMModel* model,
+               const string& stopWordPath)
+    : mixSeg_(dictTrie, model, stopWordPath), trie_(dictTrie) {
   }
   ~QuerySegment() {
   }
@@ -59,7 +61,10 @@ public:
              size_t) const override {
   }
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+             size_t) const override {
+  }
 private:
   bool IsAllAscii(const RuneArray& s) const {
     for (size_t i = 0; i < s.size(); i++) {

View File

@@ -23,23 +23,28 @@ public:
   // Added sentence-based Cut methods to reduce intermediate storage and format conversion --jxx20210517
   virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
              size_t max_word_len) const = 0;
+  virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+             size_t max_word_len) const = 0;
   // Rewrote CutToStr to simplify how vector<string>& words is produced and reduce memory usage --jxx20210517
   void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
                 size_t max_word_len = MAX_WORD_LENGTH) const {
-    /*
-    vector<Word> tmp;
-    CutToWord(sentence, tmp, hmm, max_word_len);
-    GetStringsFromWords(tmp, words);
-    */
     PreFilter pre_filter(symbols_, sentence);
     words.clear();
     words.reserve(sentence.size() / 2); // TODO: taken from the reference source, the factor is still provisional
-    while (pre_filter.HasNext()) {
-      auto range = pre_filter.Next();
+    RuneStrArray::const_iterator null_p;
+    WordRange range(null_p, null_p);
+    while (pre_filter.Next(range)) {
       CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
     }
   }
+  void CutToStr(const string& sentence, WordRange range, vector<string>& words, bool hmm = true,
+                size_t max_word_len = MAX_WORD_LENGTH) const {
+    CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+  }
+  void CutToStr(const string& sentence, WordRange range, unordered_map<string, KeyWord>& words, bool hmm = true,
+                size_t max_word_len = MAX_WORD_LENGTH) const {
+    CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+  }
   void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
                  size_t max_word_len = MAX_WORD_LENGTH) const {
     PreFilter pre_filter(symbols_, sentence);

View File

@@ -15,6 +15,12 @@ using std::vector;
 typedef uint32_t Rune;

+struct KeyWord {
+  string word;
+  vector<size_t> offsets;
+  double weight;
+}; // struct Word
+
 struct Word {
   string word;
   uint32_t offset;
@@ -63,7 +69,7 @@ struct WordRange {
     : left(l), right(r) {
   }
   size_t Length() const {
-    return right - left + 1;
+    return right - left;
   }
   bool IsAllAscii() const {
@@ -113,11 +119,13 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
   uint32_t tmp;
   uint32_t offset = 0;
   runes.clear();
-  for(size_t i = 0; i < s.size();) {
-    if(!(s.data()[i] & 0x80)) { // 0xxxxxxx
+  uint32_t len(0);
+  for (size_t i = 0; i < s.size();) {
+    if (!(s.data()[i] & 0x80)) { // 0xxxxxxx
       // 7bit, total 7bit
       tmp = (uint8_t)(s.data()[i]) & 0x7f;
       i++;
+      len = 1;
     } else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx
       // 5bit, total 5bit
       tmp = (uint8_t)(s.data()[i]) & 0x1f;
@@ -126,6 +134,7 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
       tmp <<= 6;
       tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
       i += 2;
+      len = 2;
     } else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx
       // 4bit, total 4bit
       tmp = (uint8_t)(s.data()[i]) & 0x0f;
@@ -139,6 +148,7 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
       tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
       i += 3;
+      len = 3;
     } else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx
       // 3bit, total 3bit
       tmp = (uint8_t)(s.data()[i]) & 0x07;
@@ -156,10 +166,10 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
       tmp |= (uint8_t)(s.data()[i+3]) & 0x3f;
       i += 4;
+      len = 4;
     } else {
       return false;
     }
-    uint32_t len = limonp::UnicodeToUtf8Bytes(tmp);
     RuneInfo x(tmp, offset, len, i, 1);
     runes.push_back(x);
     offset += len;
@@ -241,9 +251,8 @@ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left,
 inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
   assert(right->offset >= left->offset);
-  uint32_t len = right->offset - left->offset + right->len;
-  uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
-  return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length).word;
+  //uint32_t len = right->offset - left->offset + right->len;
+  return s.substr(left->offset, right->offset - left->offset + right->len);
 }
 inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {

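The DecodeRunesInString change above records the byte length of each code point directly in the branch that decoded it, instead of re-deriving it afterwards with limonp::UnicodeToUtf8Bytes. The length is fixed by the UTF-8 lead byte; a small self-contained sketch of the same rule (the helper name is illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Byte length of a UTF-8 sequence, read off its lead byte - the same values
    // (1..4) that the decoder above now stores into `len` branch by branch.
    inline uint32_t Utf8SequenceLength(uint8_t lead) {
        if ((lead & 0x80) == 0x00) return 1; // 0xxxxxxx
        if ((lead & 0xE0) == 0xC0) return 2; // 110xxxxx
        if ((lead & 0xF0) == 0xE0) return 3; // 1110xxxx
        if ((lead & 0xF8) == 0xF0) return 4; // 11110xxx
        return 0;                            // continuation or invalid lead byte
    }

    int main() {
        const char sample[] = "a\xE4\xB8\xAD"; // 'a' followed by U+4E2D (中) in UTF-8
        std::printf("%u %u\n",
                    Utf8SequenceLength(static_cast<uint8_t>(sample[0])),
                    Utf8SequenceLength(static_cast<uint8_t>(sample[1])));
        return 0;
    }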
View File

@@ -120,8 +120,7 @@ void ConstructDocumentForContent::run() {
     content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
     // QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
-    // Changed the return type and made the parameter a std::string reference --jxx20210519
-    std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
+    std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
     for(size_t i = 0; i < term.size(); ++i) {
         doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));

View File

@@ -31,9 +31,8 @@ void FileReader::getTextContent(QString path, QString &textContent) {
     QFileInfo file(path);
     QString strsfx = file.suffix();
     if(name == "application/zip") {
-        if(strsfx.endsWith("docx")){
+        if(strsfx.endsWith("docx"))
             FileUtils::getDocxTextContent(path, textContent);
-        }
         if(strsfx.endsWith("pptx"))
             FileUtils::getPptxTextContent(path, textContent);
         if(strsfx.endsWith("xlsx"))

View File

@@ -49,7 +49,7 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
     this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
     if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
         //this->q_content_index->enqueue(fileInfo.absoluteFilePath());
-        if(fileInfo.fileName().split(".").last() == "docx"){
+        if (fileInfo.fileName().split(".").last() == "docx") {
             QuaZip file(fileInfo.absoluteFilePath());
             if(!file.open(QuaZip::mdUnzip))
                 return;
@@ -57,10 +57,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
                 return;
             QuaZipFile fileR(&file);
             this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize())); // the xml extracted from the docx is the size that actually needs parsing
-            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
-            qDebug() << "文件大小:" << fileR.usize();
             file.close();
-        }else if(fileInfo.fileName().split(".").last() == "pptx"){
+        } else if (fileInfo.fileName().split(".").last() == "pptx") {
             QuaZip file(fileInfo.absoluteFilePath());
             if(!file.open(QuaZip::mdUnzip))
                 return;
@@ -79,10 +77,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
                 }
             }
             file.close();
-            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
-            qDebug() << "文件大小:" << fileSize;
             this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize)); // the xml extracted from the pptx is the size that actually needs parsing
-        }else if(fileInfo.fileName().split(".").last() == "xlsx"){
+        } else if (fileInfo.fileName().split(".").last() == "xlsx") {
             QuaZip file(fileInfo.absoluteFilePath());
             if(!file.open(QuaZip::mdUnzip))
                 return;
@@ -90,10 +86,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
                 return;
             QuaZipFile fileR(&file);
             this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize())); // the xml extracted from the xlsx is the size that actually needs parsing
-            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
-            qDebug() << "文件大小:" << fileR.usize();
             file.close();
-        }else{
+        } else {
             this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
         }
     }
@@ -220,9 +214,17 @@ void FirstIndex::run() {
     // for (size_t i = 0; (i < this->u_send_length) && (!this->q_content_index->empty()); ++i){
     qint64 fileSize = 0;
     // Changed the amount processed per batch from 30 files to a total file size below 50M; 50M is a provisional value --jxx20210519
-    for(size_t i = 0;/* (i < 30) && */(fileSize < 50*1024*1024) && (!this->q_content_index->empty()); ++i) {
+    for(size_t i = 0;/* (i < 30) && (fileSize < 52428800) && */(!this->q_content_index->empty()); ++i) {
         QPair<QString,qint64> tempPair = this->q_content_index->dequeue();
         fileSize += tempPair.second;
+        if (fileSize > 52428800 ) {
+            if (tmp2->size() == 0) {
+                tmp2->enqueue(tempPair.first);
+                break;
+            }
+            this->q_content_index->enqueue(tempPair);
+            break;
+        }
         tmp2->enqueue(tempPair.first);
     }
     // qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";
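The new block above caps each content-indexing batch at 50 MiB (52428800 bytes) instead of a fixed 30 files: an entry that would push a non-empty batch over the limit is put back on the queue, while a single file larger than the limit still forms a batch of its own. A condensed, self-contained sketch of that rule; the function name and signature are illustrative only:

    #include <QPair>
    #include <QQueue>
    #include <QString>

    // Take one batch of file paths whose total size stays within ~50 MiB.
    QQueue<QString> takeBatch(QQueue<QPair<QString, qint64>> &pending) {
        const qint64 kLimit = 52428800; // 50 MiB, provisional per the comment above
        QQueue<QString> batch;
        qint64 total = 0;
        while (!pending.isEmpty()) {
            QPair<QString, qint64> item = pending.dequeue();
            total += item.second;
            if (total > kLimit && !batch.isEmpty()) {
                pending.enqueue(item);   // push the overflowing entry back (to the rear, as the patch does)
                break;
            }
            batch.enqueue(item.first);   // fits, or is a single oversized file taken alone
            if (total > kLimit)
                break;                   // oversized single file: stop after taking it
        }
        return batch;
    }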