diff --git a/libchinese-segmentation/chinese-segmentation.cpp b/libchinese-segmentation/chinese-segmentation.cpp index 15c5207..fe4e95f 100644 --- a/libchinese-segmentation/chinese-segmentation.cpp +++ b/libchinese-segmentation/chinese-segmentation.cpp @@ -58,7 +58,7 @@ QVector ChineseSegmentation::callSegement(std::string s) { // str.squeeze(); const size_t topk = -1; - std::vector keywordres; + std::vector keywordres; ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk); std::string().swap(s); QVector vecNeeds; @@ -72,16 +72,16 @@ QVector ChineseSegmentation::callSegement(std::string s) { } -std::vector ChineseSegmentation::callSegementStd(const std::string &str) { +std::vector ChineseSegmentation::callSegementStd(const std::string &str) { const size_t topk = -1; - std::vector keywordres; + std::vector keywordres; ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk); return keywordres; } -void ChineseSegmentation::convert(std::vector &keywordres, QVector &kw) { +void ChineseSegmentation::convert(std::vector &keywordres, QVector &kw) { for(auto i : keywordres) { SKeyWord temp; temp.word = i.word; diff --git a/libchinese-segmentation/chinese-segmentation.h b/libchinese-segmentation/chinese-segmentation.h index e653f66..01e8046 100644 --- a/libchinese-segmentation/chinese-segmentation.h +++ b/libchinese-segmentation/chinese-segmentation.h @@ -50,8 +50,8 @@ public: QVector callSegement(std::string s); //新添加callSegementStd函数,修改返回值为std::vector并简化内部处理流程--jxx20210517 //修改函数入参形式为引用,去掉Qstring与std::string转换代码--jxx20210519 - std::vector callSegementStd(const std::string& str); - void convert(std::vector& keywordres, QVector& kw); + std::vector callSegementStd(const std::string& str); + void convert(std::vector& keywordres, QVector& kw); private: static QMutex m_mutex; cppjieba::Jieba *m_jieba; diff --git a/libchinese-segmentation/cppjieba/FullSegment.hpp b/libchinese-segmentation/cppjieba/FullSegment.hpp index 6eedbd9..1652b75 100644 --- a/libchinese-segmentation/cppjieba/FullSegment.hpp +++ b/libchinese-segmentation/cppjieba/FullSegment.hpp @@ -47,7 +47,10 @@ public: size_t) const override { } + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t) const override { + } private: const DictTrie* dictTrie_; }; diff --git a/libchinese-segmentation/cppjieba/HMMSegment.hpp b/libchinese-segmentation/cppjieba/HMMSegment.hpp index 1e5d08c..1a9937b 100644 --- a/libchinese-segmentation/cppjieba/HMMSegment.hpp +++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp @@ -21,7 +21,7 @@ public: RuneStrArray::const_iterator right = begin; while (right != end) { - if (right->rune < 0x80) { + if (right->rune < 0x80) { //asc码 if (left != right) { InternalCut(left, right, res); } @@ -29,13 +29,13 @@ public: left = right; do { - right = SequentialLetterRule(left, end); + right = SequentialLetterRule(left, end);//非英文字符则返回left,否则返回left后非英文字母的位置 if (right != left) { break; } - right = NumbersRule(left, end); + right = NumbersRule(left, end);//非数字则返回left,否则返回left后非数字的位置 if (right != left) { break; @@ -61,7 +61,10 @@ public: size_t) const override { } + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t) const override { + } private: // sequential letters rule RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, @@ -135,8 +138,10 @@ private: size_t now, old, stat; double tmp, endE, endS; - vector path(XYSize); - vector weight(XYSize); + //vector path(XYSize); + //vector weight(XYSize); + int path[XYSize]; + double weight[XYSize]; //start for (size_t y = 0; y < Y; y++) { diff --git a/libchinese-segmentation/cppjieba/Jieba.hpp b/libchinese-segmentation/cppjieba/Jieba.hpp index c862fd8..c017bd6 100644 --- a/libchinese-segmentation/cppjieba/Jieba.hpp +++ b/libchinese-segmentation/cppjieba/Jieba.hpp @@ -18,9 +18,9 @@ public: model_(model_path), mp_seg_(&dict_trie_), hmm_seg_(&model_), - mix_seg_(&dict_trie_, &model_), + mix_seg_(&dict_trie_, &model_, stopWordPath), full_seg_(&dict_trie_), - query_seg_(&dict_trie_, &model_), + query_seg_(&dict_trie_, &model_, stopWordPath), extractor(&dict_trie_, &model_, idfPath, stopWordPath){ } ~Jieba() { } diff --git a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp index 3bcbc54..f87ad5f 100644 --- a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp +++ b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include #include "MixSegment.hpp" namespace cppjieba { @@ -12,25 +11,24 @@ using namespace std; /*utf8*/ class KeywordExtractor { public: - struct Word { - string word; - vector offsets; - double weight; - }; // struct Word +// struct Word { +// string word; +// vector offsets; +// double weight; +// }; // struct Word KeywordExtractor(const DictTrie* dictTrie, const HMMModel* model, const string& idfPath, const string& stopWordPath) - : segment_(dictTrie, model) { + : segment_(dictTrie, model, stopWordPath) { LoadIdfDict(idfPath); - LoadStopWordDict(stopWordPath); } ~KeywordExtractor() { } void Extract(const string& sentence, vector& keywords, size_t topN) const { - vector topWords; + vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { @@ -39,7 +37,7 @@ public: } void Extract(const string& sentence, vector >& keywords, size_t topN) const { - vector topWords; + vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { @@ -47,34 +45,24 @@ public: } } - void Extract(const string& sentence, vector& keywords, size_t topN) const { - vector words; - segment_.CutToStr(sentence, words);//将字符串string分解为words放入vector + void Extract(const string& sentence, vector& keywords, size_t topN) const { - map wordmap;//插入字符串与Word的map,相同string统计词频叠加权重 - size_t offset = 0; - - for (size_t i = 0; i < words.size(); ++i) { - size_t t = offset; - offset += words[i].size(); - - if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + unordered_map wordmap;//插入字符串与Word的map,相同string统计词频叠加权重 + PreFilter pre_filter(symbols_, sentence); + RuneStrArray::const_iterator null_p; + WordRange range(null_p, null_p); + bool isNull(false); + while (pre_filter.Next(range, isNull)) { + if (isNull) { continue; } - - wordmap[words[i]].offsets.push_back(t); - wordmap[words[i]].weight += 1.0; - } - - if (offset != sentence.size()) { - XLOG(ERROR) << "words illegal"; - return; + segment_.CutToStr(sentence, range, wordmap); } keywords.clear(); keywords.reserve(wordmap.size()); - for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { + for (unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { unordered_map::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找 if (cit != idfMap_.end()) { @@ -129,22 +117,8 @@ private: idfAverage_ = idfSum / lineno; assert(idfAverage_ > 0.0); } - void LoadStopWordDict(const string& filePath) { - ifstream ifs(filePath.c_str()); - if(not ifs.is_open()){ - return ; - } - XCHECK(ifs.is_open()) << "open " << filePath << " failed"; - string line ; - while (getline(ifs, line)) { - stopWords_.insert(line); - } - - assert(stopWords_.size()); - } - - static bool Compare(const Word& lhs, const Word& rhs) { + static bool Compare(const KeyWord& lhs, const KeyWord& rhs) { return lhs.weight > rhs.weight; } @@ -152,10 +126,10 @@ private: unordered_map idfMap_; double idfAverage_; - unordered_set stopWords_; + unordered_set symbols_; }; // class KeywordExtractor -inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { +inline ostream& operator << (ostream& os, const KeyWord& word) { return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; } diff --git a/libchinese-segmentation/cppjieba/MPSegment.hpp b/libchinese-segmentation/cppjieba/MPSegment.hpp index 149af03..d615fe2 100644 --- a/libchinese-segmentation/cppjieba/MPSegment.hpp +++ b/libchinese-segmentation/cppjieba/MPSegment.hpp @@ -32,7 +32,10 @@ public: size_t) const override { } + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t) const override { + } const DictTrie* GetDictTrie() const override { return dictTrie_; } @@ -46,13 +49,14 @@ public: } private: void CalcDP(vector& dags) const { + double val(0); for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) { rit->max_next = -1; rit->max_weight = MIN_DOUBLE; for (const auto & it : rit->nexts) { const auto nextPos = it.first; - double val = dictTrie_->GetMinWeight(); + val = dictTrie_->GetMinWeight(); if (nullptr != it.second) { val = it.second->weight; diff --git a/libchinese-segmentation/cppjieba/MixSegment.hpp b/libchinese-segmentation/cppjieba/MixSegment.hpp index 489df4f..4c93748 100644 --- a/libchinese-segmentation/cppjieba/MixSegment.hpp +++ b/libchinese-segmentation/cppjieba/MixSegment.hpp @@ -9,8 +9,11 @@ namespace cppjieba { class MixSegment: public SegmentTagged { public: - MixSegment(const DictTrie* dictTrie, const HMMModel* model) + MixSegment(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) : mpSeg_(dictTrie), hmmSeg_(model) { + LoadStopWordDict(stopWordPath); } ~MixSegment() {} @@ -81,16 +84,20 @@ public: for (size_t i = 0; i < words.size(); i++) { //if mp Get a word, it's ok, put it into result - if (words[i].left != words[i].right || (words[i].left == words[i].right && - mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { + if (words[i].left != words[i].right) { + res.push_back(GetStringFromRunes(s, words[i].left, words[i].right)); + continue; + } + if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune) + || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back res.push_back(GetStringFromRunes(s, words[i].left, words[i].right)); continue; } // if mp Get a single one and it is not in userdict, collect it in sequence - size_t j = i; + size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符 - while (j < words.size() && words[j].left == words[j].right && + while (j < (words.size() - 1) && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { j++; } @@ -113,6 +120,70 @@ public: } } + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t) const override { + vector words; + assert(end >= begin); + words.reserve(end - begin); + mpSeg_.CutRuneArray(begin, end, words); + + vector hmmRes; + hmmRes.reserve(end - begin); + + for (size_t i = 0; i < words.size(); i++) { + + string str = GetStringFromRunes(s, words[i].left, words[i].right); + + if (stopWords_.find(str) != stopWords_.end()) { + continue; + } + + if (words[i].left != words[i].right) { + res[str].offsets.push_back(words[i].left->offset); + res[str].weight += 1.0; + continue; + } + if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune) + || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back + if (stopWords_.find(str) != stopWords_.end()) { + continue; + } + res[str].offsets.push_back(words[i].left->offset); + res[str].weight += 1.0; + continue; + } + + // if mp Get a single one and it is not in userdict, collect it in sequence + size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符 + + while (j < (words.size() - 1) && words[j].left == words[j].right && + !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { + j++; + } + + // Cut the sequence with hmm + assert(j - 1 >= i); + // TODO + hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes); + + //put hmm result to result + for (size_t k = 0; k < hmmRes.size(); k++) { + string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right); + if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) { + continue; + } + res[hmmStr].offsets.push_back(hmmRes[k].left->offset); + res[hmmStr].weight += 1.0; + } + + //clear tmp vars + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; + } + } + const DictTrie* GetDictTrie() const override { return mpSeg_.GetDictTrie(); } @@ -125,7 +196,23 @@ public: return tagger_.LookupTag(str, *this); } + void LoadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + if(not ifs.is_open()){ + return ; + } + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + + while (getline(ifs, line)) { + stopWords_.insert(line); + } + + assert(stopWords_.size()); + } private: + unordered_set stopWords_; + MPSegment mpSeg_; HMMSegment hmmSeg_; PosTagger tagger_; diff --git a/libchinese-segmentation/cppjieba/PreFilter.hpp b/libchinese-segmentation/cppjieba/PreFilter.hpp index 4830f2f..1a75a57 100644 --- a/libchinese-segmentation/cppjieba/PreFilter.hpp +++ b/libchinese-segmentation/cppjieba/PreFilter.hpp @@ -22,6 +22,73 @@ public: bool HasNext() const { return cursor_ != sentence_.end(); } + bool Next(WordRange& wordRange) { + + if (cursor_ == sentence_.end()) { + return false; + } + + wordRange.left = cursor_; + + while (cursor_->rune == 0x20 && cursor_ != sentence_.end()) { + cursor_++; + } + + if (cursor_ == sentence_.end()) { + wordRange.right = cursor_; + return true; + } + + while (++cursor_ != sentence_.end()) { + if (cursor_->rune == 0x20) { + wordRange.right = cursor_; + return true; + } + } + + wordRange.right = sentence_.end(); + return true; + } + + bool Next(WordRange& wordRange, bool& isNull) { + isNull = false; + if (cursor_ == sentence_.end()) { + return false; + } + + wordRange.left = cursor_; + + if (cursor_->rune == 0x20) { + while (cursor_ != sentence_.end()) { + if (cursor_->rune != 0x20) { + if (wordRange.left == cursor_) { + cursor_ ++; + } + wordRange.right = cursor_; + isNull = true; + return true; + } + cursor_ ++; + } + } + + while (cursor_ != sentence_.end()) { + if (cursor_->rune == 0x20) { + if (wordRange.left == cursor_) { + cursor_ ++; + } + + wordRange.right = cursor_; + return true; + } + + cursor_ ++; + } + + wordRange.right = sentence_.end(); + return true; + } + WordRange Next() { WordRange range(cursor_, cursor_); diff --git a/libchinese-segmentation/cppjieba/QuerySegment.hpp b/libchinese-segmentation/cppjieba/QuerySegment.hpp index 1a8db0e..9db0b97 100644 --- a/libchinese-segmentation/cppjieba/QuerySegment.hpp +++ b/libchinese-segmentation/cppjieba/QuerySegment.hpp @@ -14,8 +14,10 @@ namespace cppjieba { class QuerySegment: public SegmentBase { public: - QuerySegment(const DictTrie* dictTrie, const HMMModel* model) - : mixSeg_(dictTrie, model), trie_(dictTrie) { + QuerySegment(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) + : mixSeg_(dictTrie, model, stopWordPath), trie_(dictTrie) { } ~QuerySegment() { } @@ -59,7 +61,10 @@ public: size_t) const override { } + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t) const override { + } private: bool IsAllAscii(const RuneArray& s) const { for (size_t i = 0; i < s.size(); i++) { diff --git a/libchinese-segmentation/cppjieba/SegmentBase.hpp b/libchinese-segmentation/cppjieba/SegmentBase.hpp index eff78ea..942e0bd 100644 --- a/libchinese-segmentation/cppjieba/SegmentBase.hpp +++ b/libchinese-segmentation/cppjieba/SegmentBase.hpp @@ -23,23 +23,28 @@ public: //添加基于sentence的cut方法,减少中间变量的存储与格式转换--jxx20210517 virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm, size_t max_word_len) const = 0; + virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map& res, bool hmm, + size_t max_word_len) const = 0; //重写CutToStr函数,简化获取vector& words的流程,降低内存占用--jxx20210517 void CutToStr(const string& sentence, vector& words, bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const { -/* - vector tmp; - CutToWord(sentence, tmp, hmm, max_word_len); - GetStringsFromWords(tmp, words); -*/ PreFilter pre_filter(symbols_, sentence); words.clear(); words.reserve(sentence.size() / 2);//todo 参考源码,参数待定 - while (pre_filter.HasNext()) { - auto range = pre_filter.Next(); + RuneStrArray::const_iterator null_p; + WordRange range(null_p, null_p); + while (pre_filter.Next(range)) { CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len); } } - + void CutToStr(const string& sentence, WordRange range, vector& words, bool hmm = true, + size_t max_word_len = MAX_WORD_LENGTH) const { + CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len); + } + void CutToStr(const string& sentence, WordRange range, unordered_map& words, bool hmm = true, + size_t max_word_len = MAX_WORD_LENGTH) const { + CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len); + } void CutToWord(const string& sentence, vector& words, bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); diff --git a/libchinese-segmentation/cppjieba/Unicode.hpp b/libchinese-segmentation/cppjieba/Unicode.hpp index a4d765e..d77b5dd 100644 --- a/libchinese-segmentation/cppjieba/Unicode.hpp +++ b/libchinese-segmentation/cppjieba/Unicode.hpp @@ -15,6 +15,12 @@ using std::vector; typedef uint32_t Rune; +struct KeyWord { + string word; + vector offsets; + double weight; +}; // struct Word + struct Word { string word; uint32_t offset; @@ -63,7 +69,7 @@ struct WordRange { : left(l), right(r) { } size_t Length() const { - return right - left + 1; + return right - left; } bool IsAllAscii() const { @@ -113,11 +119,13 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { uint32_t tmp; uint32_t offset = 0; runes.clear(); - for(size_t i = 0; i < s.size();) { - if(!(s.data()[i] & 0x80)) { // 0xxxxxxx + uint32_t len(0); + for (size_t i = 0; i < s.size();) { + if (!(s.data()[i] & 0x80)) { // 0xxxxxxx // 7bit, total 7bit tmp = (uint8_t)(s.data()[i]) & 0x7f; i++; + len = 1; } else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx // 5bit, total 5bit tmp = (uint8_t)(s.data()[i]) & 0x1f; @@ -126,6 +134,7 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { tmp <<= 6; tmp |= (uint8_t)(s.data()[i+1]) & 0x3f; i += 2; + len = 2; } else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx // 4bit, total 4bit tmp = (uint8_t)(s.data()[i]) & 0x0f; @@ -139,6 +148,7 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { tmp |= (uint8_t)(s.data()[i+2]) & 0x3f; i += 3; + len = 3; } else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx // 3bit, total 3bit tmp = (uint8_t)(s.data()[i]) & 0x07; @@ -156,10 +166,10 @@ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { tmp |= (uint8_t)(s.data()[i+3]) & 0x3f; i += 4; + len = 4; } else { return false; } - uint32_t len = limonp::UnicodeToUtf8Bytes(tmp); RuneInfo x(tmp, offset, len, i, 1); runes.push_back(x); offset += len; @@ -241,9 +251,8 @@ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->offset >= left->offset); - uint32_t len = right->offset - left->offset + right->len; - uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length; - return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length).word; + //uint32_t len = right->offset - left->offset + right->len; + return s.substr(left->offset, right->offset - left->offset + right->len); } inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index e54ff84..584fddc 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -120,8 +120,7 @@ void ConstructDocumentForContent::run() { content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); // QVector term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); - //修改函数返回类型,修改入参为std::string引用--jxx20210519 - std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); + std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index f146981..e409374 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -31,9 +31,8 @@ void FileReader::getTextContent(QString path, QString &textContent) { QFileInfo file(path); QString strsfx = file.suffix(); if(name == "application/zip") { - if(strsfx.endsWith("docx")){ + if(strsfx.endsWith("docx")) FileUtils::getDocxTextContent(path, textContent); - } if(strsfx.endsWith("pptx")) FileUtils::getPptxTextContent(path, textContent); if(strsfx.endsWith("xlsx")) diff --git a/libsearch/index/first-index.cpp b/libsearch/index/first-index.cpp index f94b625..d23ba88 100644 --- a/libsearch/index/first-index.cpp +++ b/libsearch/index/first-index.cpp @@ -49,7 +49,7 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) { this->q_index->enqueue(QVector() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0")); if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) { //this->q_content_index->enqueue(fileInfo.absoluteFilePath()); - if(fileInfo.fileName().split(".").last() == "docx"){ + if (fileInfo.fileName().split(".").last() == "docx") { QuaZip file(fileInfo.absoluteFilePath()); if(!file.open(QuaZip::mdUnzip)) return; @@ -57,10 +57,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) { return; QuaZipFile fileR(&file); this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//docx解压缩后的xml文件为实际需要解析文件大小 - qDebug() << "文件路径:" <q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//pptx解压缩后的xml文件为实际需要解析文件大小 - }else if(fileInfo.fileName().split(".").last() == "xlsx"){ + } else if (fileInfo.fileName().split(".").last() == "xlsx") { QuaZip file(fileInfo.absoluteFilePath()); if(!file.open(QuaZip::mdUnzip)) return; @@ -90,10 +86,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) { return; QuaZipFile fileR(&file); this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//xlsx解压缩后的xml文件为实际解析文件大小 - qDebug() << "文件路径:" <q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); } } @@ -220,9 +214,17 @@ void FirstIndex::run() { // for (size_t i = 0; (i < this->u_send_length) && (!this->q_content_index->empty()); ++i){ qint64 fileSize = 0; //修改一次处理的数据量,从30个文件改为文件总大小为50M以下,50M为暂定值--jxx20210519 - for(size_t i = 0;/* (i < 30) && */(fileSize < 50*1024*1024) && (!this->q_content_index->empty()); ++i) { + for(size_t i = 0;/* (i < 30) && (fileSize < 52428800) && */(!this->q_content_index->empty()); ++i) { QPair tempPair = this->q_content_index->dequeue(); fileSize += tempPair.second; + if (fileSize > 52428800 ) { + if (tmp2->size() == 0) { + tmp2->enqueue(tempPair.first); + break; + } + this->q_content_index->enqueue(tempPair); + break; + } tmp2->enqueue(tempPair.first); } // qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";