Optimize XML file handling; optimize jieba escape-character handling.
This commit is contained in:
parent
bb17943e1d
commit
e01552618d
|
@ -77,10 +77,8 @@ std::vector<cppjieba::KeywordExtractor::Word> ChineseSegmentation::callSegementS
|
||||||
const size_t topk = -1;
|
const size_t topk = -1;
|
||||||
std::vector<cppjieba::KeywordExtractor::Word> keywordres;
|
std::vector<cppjieba::KeywordExtractor::Word> keywordres;
|
||||||
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
||||||
// std::string().swap(s);
|
|
||||||
|
|
||||||
return keywordres;
|
return keywordres;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
|
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
|
||||||
|
|
|
@ -103,11 +103,15 @@ public:
|
||||||
|
|
||||||
res.clear();
|
res.clear();
|
||||||
res.resize(end - begin);
|
res.resize(end - begin);
|
||||||
const string text_str = EncodeRunesToString(begin, end);
|
|
||||||
|
string text_str;
|
||||||
|
EncodeRunesToString(begin, end, text_str);
|
||||||
|
|
||||||
|
static const size_t max_num = 128;
|
||||||
|
JiebaDAT::result_pair_type result_pairs[max_num] = {};
|
||||||
|
|
||||||
for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
|
for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
|
||||||
static const size_t max_num = 128;
|
|
||||||
JiebaDAT::result_pair_type result_pairs[max_num] = {};
|
|
||||||
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
|
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
|
||||||
|
|
||||||
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
|
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
|
||||||
|
|
|
@ -26,7 +26,8 @@ public:
|
||||||
WordRange range(cursor_, cursor_);
|
WordRange range(cursor_, cursor_);
|
||||||
|
|
||||||
while (cursor_ != sentence_.end()) {
|
while (cursor_ != sentence_.end()) {
|
||||||
if (IsIn(symbols_, cursor_->rune)) {
|
//if (IsIn(symbols_, cursor_->rune)) {
|
||||||
|
if (cursor_->rune == 0x20) {
|
||||||
if (range.left == cursor_) {
|
if (range.left == cursor_) {
|
||||||
cursor_ ++;
|
cursor_ ++;
|
||||||
}
|
}
|
||||||
|
|
|
@ -195,6 +195,12 @@ inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArr
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Out-parameter overload of EncodeRunesToString.
// Wraps the rune iterators `begin` and `end` in RunePtrWrapper objects and
// forwards them, together with the caller-supplied `str`, to
// limonp::Unicode32ToUtf8. Nothing is returned; the result is delivered
// through `str`, and the trailing `return;` has no effect.
// NOTE(review): presumably `str` receives the UTF-8 encoding of the runes;
// whether existing contents of `str` are cleared or appended to depends on
// limonp::Unicode32ToUtf8 — confirm against limonp.
inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
|
||||||
|
RunePtrWrapper it_begin(begin), it_end(end);
|
||||||
|
limonp::Unicode32ToUtf8(it_begin, it_end, str);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
class Unicode32Counter {
|
class Unicode32Counter {
|
||||||
public :
|
public :
|
||||||
size_t length = 0;
|
size_t length = 0;
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include "file-utils.h"
|
#include "file-utils.h"
|
||||||
|
#include <QXmlStreamReader>
|
||||||
|
|
||||||
using namespace Zeeker;
|
using namespace Zeeker;
|
||||||
size_t FileUtils::_max_index_count = 0;
|
size_t FileUtils::_max_index_count = 0;
|
||||||
|
@ -488,6 +489,22 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
|
||||||
|
|
||||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
||||||
|
|
||||||
|
QXmlStreamReader reader(&fileR);
|
||||||
|
|
||||||
|
while (!reader.atEnd()){
|
||||||
|
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||||
|
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||||
|
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fileR.close();
|
||||||
|
file.close();
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* //原加载DOM文档方式;
|
||||||
QDomDocument doc;
|
QDomDocument doc;
|
||||||
doc.setContent(fileR.readAll());
|
doc.setContent(fileR.readAll());
|
||||||
fileR.close();
|
fileR.close();
|
||||||
|
@ -512,6 +529,7 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
|
||||||
}
|
}
|
||||||
file.close();
|
file.close();
|
||||||
return;
|
return;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
||||||
|
@ -529,6 +547,31 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
||||||
}
|
}
|
||||||
if(fileList.isEmpty())
|
if(fileList.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
for(int i = 0; i < fileList.size(); ++i){
|
||||||
|
QString name = prefix + QString::number(i + 1) + ".xml";
|
||||||
|
if(!file.setCurrentFile(name)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
QuaZipFile fileR(&file);
|
||||||
|
fileR.open(QIODevice::ReadOnly);
|
||||||
|
|
||||||
|
QXmlStreamReader reader(&fileR);
|
||||||
|
|
||||||
|
while (!reader.atEnd()){
|
||||||
|
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||||
|
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||||
|
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fileR.close();
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
QDomElement sptree;
|
QDomElement sptree;
|
||||||
QDomElement sp;
|
QDomElement sp;
|
||||||
QDomElement txbody;
|
QDomElement txbody;
|
||||||
|
@ -596,6 +639,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
||||||
}
|
}
|
||||||
file.close();
|
file.close();
|
||||||
return;
|
return;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
||||||
|
@ -610,8 +654,24 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
||||||
return;
|
return;
|
||||||
QuaZipFile fileR(&file);
|
QuaZipFile fileR(&file);
|
||||||
|
|
||||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
fileR.open(QIODevice::ReadOnly);
|
||||||
|
|
||||||
|
QXmlStreamReader reader(&fileR);
|
||||||
|
|
||||||
|
while (!reader.atEnd()){
|
||||||
|
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||||
|
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||||
|
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fileR.close();
|
||||||
|
file.close();
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
QDomDocument doc;
|
QDomDocument doc;
|
||||||
doc.setContent(fileR.readAll());
|
doc.setContent(fileR.readAll());
|
||||||
fileR.close();
|
fileR.close();
|
||||||
|
@ -641,6 +701,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
||||||
}
|
}
|
||||||
file.close();
|
file.close();
|
||||||
return;
|
return;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
|
void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
|
||||||
|
@ -650,7 +711,7 @@ void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
|
||||||
const QRectF qf;
|
const QRectF qf;
|
||||||
int pageNum = doc->numPages();
|
int pageNum = doc->numPages();
|
||||||
for(int i = 0; i < pageNum; ++i) {
|
for(int i = 0; i < pageNum; ++i) {
|
||||||
textcontent.append(doc->page(i)->text(qf).replace("\n", ""));
|
textcontent.append(doc->page(i)->text(qf).replace("\n", "").replace("\r", " "));
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
|
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -679,7 +740,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) {
|
||||||
stream.setCodec(codec);
|
stream.setCodec(codec);
|
||||||
uchardet_delete(chardet);
|
uchardet_delete(chardet);
|
||||||
|
|
||||||
textcontent = stream.readAll().replace("\n", "");
|
textcontent = stream.readAll().replace("\n", "").replace("\r", " ");
|
||||||
|
|
||||||
file.close();
|
file.close();
|
||||||
encodedString.clear();
|
encodedString.clear();
|
||||||
|
|
|
@ -110,16 +110,18 @@ void ConstructDocumentForContent::run() {
|
||||||
return;
|
return;
|
||||||
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
||||||
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||||
|
|
||||||
|
|
||||||
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
|
|
||||||
//修改函数返回类型,修改入参为std::string引用--jxx20210519
|
|
||||||
std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
|
||||||
Document doc;
|
Document doc;
|
||||||
doc.setData(content);
|
doc.setData(content);
|
||||||
doc.setUniqueTerm(uniqueterm);
|
doc.setUniqueTerm(uniqueterm);
|
||||||
doc.addTerm(upTerm);
|
doc.addTerm(upTerm);
|
||||||
doc.addValue(m_path);
|
doc.addValue(m_path);
|
||||||
|
|
||||||
|
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||||
|
|
||||||
|
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
|
||||||
|
//修改函数返回类型,修改入参为std::string引用--jxx20210519
|
||||||
|
std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
||||||
|
|
||||||
for(size_t i = 0; i < term.size(); ++i) {
|
for(size_t i = 0; i < term.size(); ++i) {
|
||||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,8 +31,9 @@ void FileReader::getTextContent(QString path, QString &textContent) {
|
||||||
QFileInfo file(path);
|
QFileInfo file(path);
|
||||||
QString strsfx = file.suffix();
|
QString strsfx = file.suffix();
|
||||||
if(name == "application/zip") {
|
if(name == "application/zip") {
|
||||||
if(strsfx.endsWith("docx"))
|
if(strsfx.endsWith("docx")){
|
||||||
FileUtils::getDocxTextContent(path, textContent);
|
FileUtils::getDocxTextContent(path, textContent);
|
||||||
|
}
|
||||||
if(strsfx.endsWith("pptx"))
|
if(strsfx.endsWith("pptx"))
|
||||||
FileUtils::getPptxTextContent(path, textContent);
|
FileUtils::getPptxTextContent(path, textContent);
|
||||||
if(strsfx.endsWith("xlsx"))
|
if(strsfx.endsWith("xlsx"))
|
||||||
|
|
|
@ -47,7 +47,53 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
|
||||||
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
|
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
|
||||||
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
|
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
|
||||||
//this->q_content_index->enqueue(fileInfo.absoluteFilePath());
|
//this->q_content_index->enqueue(fileInfo.absoluteFilePath());
|
||||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
if(fileInfo.fileName().split(".").last() == "docx"){
|
||||||
|
QuaZip file(fileInfo.absoluteFilePath());
|
||||||
|
if(!file.open(QuaZip::mdUnzip))
|
||||||
|
return;
|
||||||
|
if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive))
|
||||||
|
return;
|
||||||
|
QuaZipFile fileR(&file);
|
||||||
|
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//docx解压缩后的xml文件为实际需要解析文件大小
|
||||||
|
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||||
|
qDebug() << "文件大小:" << fileR.usize();
|
||||||
|
file.close();
|
||||||
|
}else if(fileInfo.fileName().split(".").last() == "pptx"){
|
||||||
|
QuaZip file(fileInfo.absoluteFilePath());
|
||||||
|
if(!file.open(QuaZip::mdUnzip))
|
||||||
|
return;
|
||||||
|
QString prefix("ppt/slides/slide");
|
||||||
|
qint64 fileSize(0);
|
||||||
|
qint64 fileIndex(0);
|
||||||
|
for(QString i : file.getFileNameList()) {
|
||||||
|
if(i.startsWith(prefix)){
|
||||||
|
QString name = prefix + QString::number(fileIndex + 1) + ".xml";
|
||||||
|
fileIndex++;
|
||||||
|
if(!file.setCurrentFile(name)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
QuaZipFile fileR(&file);
|
||||||
|
fileSize += fileR.usize();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||||
|
qDebug() << "文件大小:" << fileSize;
|
||||||
|
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//pptx解压缩后的xml文件为实际需要解析文件大小
|
||||||
|
}else if(fileInfo.fileName().split(".").last() == "xlsx"){
|
||||||
|
QuaZip file(fileInfo.absoluteFilePath());
|
||||||
|
if(!file.open(QuaZip::mdUnzip))
|
||||||
|
return;
|
||||||
|
if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive))
|
||||||
|
return;
|
||||||
|
QuaZipFile fileR(&file);
|
||||||
|
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//xlsx解压缩后的xml文件为实际解析文件大小
|
||||||
|
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||||
|
qDebug() << "文件大小:" << fileR.usize();
|
||||||
|
file.close();
|
||||||
|
}else{
|
||||||
|
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4963,7 +4963,7 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS,
|
||||||
|
|
||||||
if(bUsesUnicode) {
|
if(bUsesUnicode) {
|
||||||
ushort* usAucData = (ushort*)ptaucBytes;
|
ushort* usAucData = (ushort*)ptaucBytes;
|
||||||
content.append(QString::fromUtf16(usAucData).replace("\r", ""));
|
content.append(QString::fromUtf16(usAucData).replace("\n", "").replace("\r", " "));
|
||||||
usAucData = (ushort*)xfree((void*)usAucData);
|
usAucData = (ushort*)xfree((void*)usAucData);
|
||||||
ptaucBytes = NULL;
|
ptaucBytes = NULL;
|
||||||
if(content.length() >= 682666) //20480000/3
|
if(content.length() >= 682666) //20480000/3
|
||||||
|
@ -5066,7 +5066,7 @@ int KBinaryParser:: readSSTRecord(readDataParam &rdParam, ppsInfoType PPS_info,
|
||||||
} else {
|
} else {
|
||||||
ushort* usData = (ushort*)chData;
|
ushort* usData = (ushort*)chData;
|
||||||
|
|
||||||
content.append(QString::fromUtf16(usData).replace("\r", ""));
|
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
|
||||||
usData = (ushort*)xfree((void*)usData);
|
usData = (ushort*)xfree((void*)usData);
|
||||||
chData = NULL;
|
chData = NULL;
|
||||||
if(content.length() >= 682666) //20480000/3
|
if(content.length() >= 682666) //20480000/3
|
||||||
|
@ -5131,7 +5131,7 @@ ULONG KBinaryParser::readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* au
|
||||||
return -1;
|
return -1;
|
||||||
ushort* usData = (ushort*)chData;
|
ushort* usData = (ushort*)chData;
|
||||||
|
|
||||||
content.append(QString::fromUtf16(usData).replace("\r", ""));
|
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
|
||||||
|
|
||||||
usData = (ushort*)xfree((void*)usData);
|
usData = (ushort*)xfree((void*)usData);
|
||||||
chData = NULL;
|
chData = NULL;
|
||||||
|
|
Loading…
Reference in New Issue