Merge pull request #95 from iaom/0119-dev

Optimized the docx parsing method.
This commit is contained in:
Mouse Zhang 2021-01-19 21:10:40 +08:00 committed by GitHub
commit 9765bd0cd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 15 deletions

View File

@ -16,6 +16,11 @@ struct SKeyWord{
std::string word;
QVector<size_t> offsets;
double weight;
// Nothing needs to be released by hand here: `word` and `offsets` free
// their own storage when the struct is destroyed, so emptying them first
// is redundant work. The destructor is therefore defaulted.
~SKeyWord() = default;
};
class CHINESESEGMENTATION_EXPORT ChineseSegmentation

View File

@ -493,18 +493,24 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
QDomDocument doc;
doc.setContent(fileR.readAll());
QDomElement first = doc.firstChildElement("w:document");
first = first.firstChildElement().firstChildElement();
while(!first.isNull())
QDomElement body = first.firstChildElement("w:body");
while(!body.isNull())
{
QDomElement wr= first.firstChildElement("w:r");
QDomElement wp= body.firstChildElement("w:p");
while(!wp.isNull())
{
QDomElement wr= wp.firstChildElement("w:r");
while(!wr.isNull())
{
QDomElement wt = wr.firstChildElement("w:t");
textcontent.append(wt.text().replace("\n",""));
wr = wr.nextSiblingElement();
}
first = first.nextSiblingElement();
wp = wp.nextSiblingElement();
}
body = body.nextSiblingElement();
}
file.close();
return;
}

View File

@ -72,7 +72,7 @@ ConstructDocumentForContent::~ConstructDocumentForContent()
void ConstructDocumentForContent::run()
{
qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
// 构造文本索引的document
if (!_doc_list_content)
_doc_list_content = new QList<Document>;

View File

@ -97,7 +97,7 @@ void FileSearcher::onKeywordSearch(QString keyword,QQueue<QString> *searchResult
while(total<20)
{
keywordSearchContent(uniqueSymbol3,keyword,begin,num);
resultCount = keywordSearchContent(uniqueSymbol3,keyword,begin,num);
if(resultCount == 0 || resultCount == -1)
break;
total += resultCount;

View File

@ -4,6 +4,7 @@
#include <QtConcurrent>
#include <QFuture>
#include <QThreadPool>
#include <QFile>
#include "file-utils.h"
#include "index-generator.h"
#include "global-settings.h"
@ -113,6 +114,13 @@ IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent)
{
if(rebuild)
{
QDir database(QString::fromStdString(INDEX_PATH));
if(database.exists())
database.removeRecursively();
database.setPath(QString::fromStdString(CONTENT_INDEX_PATH));
if(database.exists())
database.removeRecursively();
m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE);
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE);
}
@ -198,14 +206,13 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList)
// m_doc_list_path = new QList<Document>(docList);
QThreadPool pool;
// pool.setMaxThreadCount(1);
pool.setExpiryTimeout(100);
ConstructDocumentForPath *constructer;
while(!messageList->isEmpty())
{
constructer = new ConstructDocumentForPath(messageList->dequeue());
pool.start(constructer);
}
// while(!pool.waitForDone(1))
// qDebug()<<"fuck"<<pool.waitForDone(1);
qDebug()<<"pool finish"<<pool.waitForDone(-1);
// if(constructer)
// delete constructer;
@ -231,14 +238,12 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList)
ConstructDocumentForContent *constructer;
QThreadPool pool;
// pool.setMaxThreadCount(2);
pool.setExpiryTimeout(1000);
pool.setExpiryTimeout(100);
while(!messageList->isEmpty())
{
constructer = new ConstructDocumentForContent(messageList->dequeue());
pool.start(constructer);
}
// while(!pool.waitForDone(1))
// qDebug()<<"fuck"<<pool.waitForDone(1);
qDebug()<<"pool finish"<<pool.waitForDone(-1);
// if(constructer)
// delete constructer;