diff --git a/libchinese-segmentation/chinese-segmentation.cpp b/libchinese-segmentation/chinese-segmentation.cpp index 1492ee1..e3a1207 100644 --- a/libchinese-segmentation/chinese-segmentation.cpp +++ b/libchinese-segmentation/chinese-segmentation.cpp @@ -55,14 +55,16 @@ ChineseSegmentation *ChineseSegmentation::getInstance() return global_instance_chinese_segmentation; } -QVector ChineseSegmentation::callSegement(QString& str) +QVector ChineseSegmentation::callSegement(QString str) { std::string s; s=str.toStdString(); + str.squeeze(); const size_t topk = -1; std::vector keywordres; ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk); + std::string().swap(s); QVector vecNeeds; convert(keywordres, vecNeeds); diff --git a/libchinese-segmentation/chinese-segmentation.h b/libchinese-segmentation/chinese-segmentation.h index 1ea1ecf..db054c4 100644 --- a/libchinese-segmentation/chinese-segmentation.h +++ b/libchinese-segmentation/chinese-segmentation.h @@ -48,7 +48,7 @@ class CHINESESEGMENTATION_EXPORT ChineseSegmentation public: static ChineseSegmentation *getInstance(); ~ChineseSegmentation(); - QVector callSegement(QString &str); + QVector callSegement(QString str); void convert(std::vector& keywordres,QVector& kw); private: static QMutex m_mutex; diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index 7912066..0cb80c7 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -529,6 +529,11 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) { QDomElement wt = wr.firstChildElement("w:t"); textcontent.append(wt.text().replace("\n","")); + if(textcontent.length() >= 682666) //20480000/3 + { + file.close(); + return; + } wr = wr.nextSiblingElement(); } wp = wp.nextSiblingElement(); @@ -545,7 +550,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) if(!file.open(QIODevice::ReadOnly|QIODevice::Text)) return; - QByteArray encodedString = file.readAll(); + QByteArray encodedString = file.read(20480000); uchardet_t chardet = uchardet_new(); if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0) diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index 2021fc2..3b9b780 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -118,7 +118,7 @@ void ConstructDocumentForContent::run() QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path)); QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/",0,-2,QString::SectionIncludeLeadingSep))); - QVector term = ChineseSegmentation::getInstance()->callSegement(content); + QVector term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); Document doc; doc.setData(content); @@ -135,6 +135,7 @@ void ConstructDocumentForContent::run() _doc_list_content->append(doc); _mutex_doc_list_content.unlock(); content.clear(); + content.squeeze(); term.clear(); return; } diff --git a/libsearch/parser/binary-parser.cpp b/libsearch/parser/binary-parser.cpp index b16deb4..2057ed4 100644 --- a/libsearch/parser/binary-parser.cpp +++ b/libsearch/parser/binary-parser.cpp @@ -5061,6 +5061,8 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS, ushort* usAucData = (ushort*)ptaucBytes; content.append(QString::fromUtf16(usAucData).replace("\r","")); usAucData = (ushort*)xfree((void*)usAucData); + if(content.length() >= 682666) //20480000/3 + break; } else {