From e01552618d6c6bb60c0865d8eaa1bad4bda4a5c3 Mon Sep 17 00:00:00 2001 From: rookie-J Date: Fri, 28 May 2021 15:55:26 +0800 Subject: [PATCH] Optimize xml-file treatment; Optimize jieba escape character; --- .../chinese-segmentation.cpp | 2 - libchinese-segmentation/cppjieba/DatTrie.hpp | 10 ++- .../cppjieba/PreFilter.hpp | 3 +- libchinese-segmentation/cppjieba/Unicode.hpp | 6 ++ libsearch/file-utils.cpp | 67 ++++++++++++++++++- libsearch/index/construct-document.cpp | 12 ++-- libsearch/index/file-reader.cpp | 3 +- libsearch/index/first-index.cpp | 48 ++++++++++++- libsearch/parser/binary-parser.cpp | 6 +- 9 files changed, 138 insertions(+), 19 deletions(-) diff --git a/libchinese-segmentation/chinese-segmentation.cpp b/libchinese-segmentation/chinese-segmentation.cpp index bc3d15a..15c5207 100644 --- a/libchinese-segmentation/chinese-segmentation.cpp +++ b/libchinese-segmentation/chinese-segmentation.cpp @@ -77,10 +77,8 @@ std::vector ChineseSegmentation::callSegementS const size_t topk = -1; std::vector keywordres; ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk); -// std::string().swap(s); return keywordres; - } void ChineseSegmentation::convert(std::vector &keywordres, QVector &kw) { diff --git a/libchinese-segmentation/cppjieba/DatTrie.hpp b/libchinese-segmentation/cppjieba/DatTrie.hpp index cf9c098..d4e64d1 100644 --- a/libchinese-segmentation/cppjieba/DatTrie.hpp +++ b/libchinese-segmentation/cppjieba/DatTrie.hpp @@ -103,11 +103,15 @@ public: res.clear(); res.resize(end - begin); - const string text_str = EncodeRunesToString(begin, end); + + string text_str; + EncodeRunesToString(begin, end, text_str); + + static const size_t max_num = 128; + JiebaDAT::result_pair_type result_pairs[max_num] = {}; for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) { - static const size_t max_num = 128; - JiebaDAT::result_pair_type result_pairs[max_num] = {}; + std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num); res[i].nexts.push_back(pair(i + 1, nullptr)); diff --git a/libchinese-segmentation/cppjieba/PreFilter.hpp b/libchinese-segmentation/cppjieba/PreFilter.hpp index ee37b93..4830f2f 100644 --- a/libchinese-segmentation/cppjieba/PreFilter.hpp +++ b/libchinese-segmentation/cppjieba/PreFilter.hpp @@ -26,7 +26,8 @@ public: WordRange range(cursor_, cursor_); while (cursor_ != sentence_.end()) { - if (IsIn(symbols_, cursor_->rune)) { + //if (IsIn(symbols_, cursor_->rune)) { + if (cursor_->rune == 0x20) { if (range.left == cursor_) { cursor_ ++; } diff --git a/libchinese-segmentation/cppjieba/Unicode.hpp b/libchinese-segmentation/cppjieba/Unicode.hpp index 9b4d2bd..a4d765e 100644 --- a/libchinese-segmentation/cppjieba/Unicode.hpp +++ b/libchinese-segmentation/cppjieba/Unicode.hpp @@ -195,6 +195,12 @@ inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArr return str; } +inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) { + RunePtrWrapper it_begin(begin), it_end(end); + limonp::Unicode32ToUtf8(it_begin, it_end, str); + return; +} + class Unicode32Counter { public : size_t length = 0; diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index 5d4d3f1..732be89 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -20,6 +20,7 @@ * */ #include "file-utils.h" +#include using namespace Zeeker; size_t FileUtils::_max_index_count = 0; @@ -488,6 +489,22 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) { fileR.open(QIODevice::ReadOnly); //读取方式打开 + QXmlStreamReader reader(&fileR); + + while (!reader.atEnd()){ + if(reader.readNextStartElement() and reader.name().toString() == "t"){ + textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); + if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ + break; + } + } + } + + fileR.close(); + file.close(); + return; + +/* //原加载DOM文档方式; QDomDocument doc; doc.setContent(fileR.readAll()); fileR.close(); @@ -512,6 +529,7 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) { } file.close(); return; +*/ } void FileUtils::getPptxTextContent(QString &path, QString &textcontent) { @@ -529,6 +547,31 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) { } if(fileList.isEmpty()) return; + + for(int i = 0; i < fileList.size(); ++i){ + QString name = prefix + QString::number(i + 1) + ".xml"; + if(!file.setCurrentFile(name)) { + continue; + } + QuaZipFile fileR(&file); + fileR.open(QIODevice::ReadOnly); + + QXmlStreamReader reader(&fileR); + + while (!reader.atEnd()){ + if(reader.readNextStartElement() and reader.name().toString() == "t"){ + textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); + if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ + break; + } + } + } + fileR.close(); + } + file.close(); + return; + +/* QDomElement sptree; QDomElement sp; QDomElement txbody; @@ -596,6 +639,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) { } file.close(); return; +*/ } void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) { @@ -610,8 +654,24 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) { return; QuaZipFile fileR(&file); - fileR.open(QIODevice::ReadOnly); //读取方式打开 + fileR.open(QIODevice::ReadOnly); + QXmlStreamReader reader(&fileR); + + while (!reader.atEnd()){ + if(reader.readNextStartElement() and reader.name().toString() == "t"){ + textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); + if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ + break; + } + } + } + + fileR.close(); + file.close(); + return; + +/* QDomDocument doc; doc.setContent(fileR.readAll()); fileR.close(); @@ -641,6 +701,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) { } file.close(); return; +*/ } void FileUtils::getPdfTextContent(QString &path, QString &textcontent) { @@ -650,7 +711,7 @@ void FileUtils::getPdfTextContent(QString &path, QString &textcontent) { const QRectF qf; int pageNum = doc->numPages(); for(int i = 0; i < pageNum; ++i) { - textcontent.append(doc->page(i)->text(qf).replace("\n", "")); + textcontent.append(doc->page(i)->text(qf).replace("\n", "").replace("\r", " ")); if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) break; } @@ -679,7 +740,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) { stream.setCodec(codec); uchardet_delete(chardet); - textcontent = stream.readAll().replace("\n", ""); + textcontent = stream.readAll().replace("\n", "").replace("\r", " "); file.close(); encodedString.clear(); diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index aa98e44..f1751a9 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -110,16 +110,18 @@ void ConstructDocumentForContent::run() { return; QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path)); QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep))); - - -// QVector term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); - //修改函数返回类型,修改入参为std::string引用--jxx20210519 - std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); Document doc; doc.setData(content); doc.setUniqueTerm(uniqueterm); doc.addTerm(upTerm); doc.addValue(m_path); + + content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); + +// QVector term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); + //修改函数返回类型,修改入参为std::string引用--jxx20210519 + std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); + for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); } diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index e409374..f146981 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -31,8 +31,9 @@ void FileReader::getTextContent(QString path, QString &textContent) { QFileInfo file(path); QString strsfx = file.suffix(); if(name == "application/zip") { - if(strsfx.endsWith("docx")) + if(strsfx.endsWith("docx")){ FileUtils::getDocxTextContent(path, textContent); + } if(strsfx.endsWith("pptx")) FileUtils::getPptxTextContent(path, textContent); if(strsfx.endsWith("xlsx")) diff --git a/libsearch/index/first-index.cpp b/libsearch/index/first-index.cpp index 8cdac42..081f76b 100644 --- a/libsearch/index/first-index.cpp +++ b/libsearch/index/first-index.cpp @@ -47,7 +47,53 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) { this->q_index->enqueue(QVector() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0")); if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) { //this->q_content_index->enqueue(fileInfo.absoluteFilePath()); - this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); + if(fileInfo.fileName().split(".").last() == "docx"){ + QuaZip file(fileInfo.absoluteFilePath()); + if(!file.open(QuaZip::mdUnzip)) + return; + if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive)) + return; + QuaZipFile fileR(&file); + this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//docx解压缩后的xml文件为实际需要解析文件大小 + qDebug() << "文件路径:" <q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//pptx解压缩后的xml文件为实际需要解析文件大小 + }else if(fileInfo.fileName().split(".").last() == "xlsx"){ + QuaZip file(fileInfo.absoluteFilePath()); + if(!file.open(QuaZip::mdUnzip)) + return; + if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive)) + return; + QuaZipFile fileR(&file); + this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//xlsx解压缩后的xml文件为实际解析文件大小 + qDebug() << "文件路径:" <q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); + } } } diff --git a/libsearch/parser/binary-parser.cpp b/libsearch/parser/binary-parser.cpp index 0f927c5..968e066 100644 --- a/libsearch/parser/binary-parser.cpp +++ b/libsearch/parser/binary-parser.cpp @@ -4963,7 +4963,7 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS, if(bUsesUnicode) { ushort* usAucData = (ushort*)ptaucBytes; - content.append(QString::fromUtf16(usAucData).replace("\r", "")); + content.append(QString::fromUtf16(usAucData).replace("\n", "").replace("\r", " ")); usAucData = (ushort*)xfree((void*)usAucData); ptaucBytes = NULL; if(content.length() >= 682666) //20480000/3 @@ -5066,7 +5066,7 @@ int KBinaryParser:: readSSTRecord(readDataParam &rdParam, ppsInfoType PPS_info, } else { ushort* usData = (ushort*)chData; - content.append(QString::fromUtf16(usData).replace("\r", "")); + content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " ")); usData = (ushort*)xfree((void*)usData); chData = NULL; if(content.length() >= 682666) //20480000/3 @@ -5131,7 +5131,7 @@ ULONG KBinaryParser::readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* au return -1; ushort* usData = (ushort*)chData; - content.append(QString::fromUtf16(usData).replace("\r", "")); + content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " ")); usData = (ushort*)xfree((void*)usData); chData = NULL;