Merge pull request #95 from iaom/0119-dev

Optimized the docx parsing method.
This commit is contained in:
Mouse Zhang 2021-01-19 21:10:40 +08:00 committed by GitHub
commit 9765bd0cd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 15 deletions

View File

@ -16,6 +16,11 @@ struct SKeyWord{
std::string word; std::string word;
QVector<size_t> offsets; QVector<size_t> offsets;
double weight; double weight;
// Destructor: `word` (std::string) and `offsets` (QVector) release their own
// storage when the members are destroyed, so no manual clearing or shrinking
// is needed here. The previous body assigned std::move("") (a no-op cast on a
// string literal) and cleared/shrank the vector just before it was destroyed.
// NOTE(review): a user-declared destructor, even a defaulted one, suppresses
// the implicitly-declared move operations; consider dropping this declaration
// entirely if nothing depends on it.
~SKeyWord() = default;
}; };
class CHINESESEGMENTATION_EXPORT ChineseSegmentation class CHINESESEGMENTATION_EXPORT ChineseSegmentation

View File

@ -493,18 +493,24 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
QDomDocument doc; QDomDocument doc;
doc.setContent(fileR.readAll()); doc.setContent(fileR.readAll());
QDomElement first = doc.firstChildElement("w:document"); QDomElement first = doc.firstChildElement("w:document");
first = first.firstChildElement().firstChildElement(); QDomElement body = first.firstChildElement("w:body");
while(!first.isNull()) while(!body.isNull())
{ {
QDomElement wr= first.firstChildElement("w:r"); QDomElement wp= body.firstChildElement("w:p");
while(!wr.isNull()) while(!wp.isNull())
{ {
QDomElement wt = wr.firstChildElement("w:t"); QDomElement wr= wp.firstChildElement("w:r");
textcontent.append(wt.text().replace("\n","")); while(!wr.isNull())
wr = wr.nextSiblingElement(); {
QDomElement wt = wr.firstChildElement("w:t");
textcontent.append(wt.text().replace("\n",""));
wr = wr.nextSiblingElement();
}
wp = wp.nextSiblingElement();
} }
first = first.nextSiblingElement(); body = body.nextSiblingElement();
} }
file.close();
return; return;
} }

View File

@ -72,7 +72,7 @@ ConstructDocumentForContent::~ConstructDocumentForContent()
void ConstructDocumentForContent::run() void ConstructDocumentForContent::run()
{ {
qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId(); // qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
// 构造文本索引的document // 构造文本索引的document
if (!_doc_list_content) if (!_doc_list_content)
_doc_list_content = new QList<Document>; _doc_list_content = new QList<Document>;

View File

@ -97,7 +97,7 @@ void FileSearcher::onKeywordSearch(QString keyword,QQueue<QString> *searchResult
while(total<20) while(total<20)
{ {
keywordSearchContent(uniqueSymbol3,keyword,begin,num); resultCount = keywordSearchContent(uniqueSymbol3,keyword,begin,num);
if(resultCount == 0 || resultCount == -1) if(resultCount == 0 || resultCount == -1)
break; break;
total += resultCount; total += resultCount;

View File

@ -4,6 +4,7 @@
#include <QtConcurrent> #include <QtConcurrent>
#include <QFuture> #include <QFuture>
#include <QThreadPool> #include <QThreadPool>
#include <QFile>
#include "file-utils.h" #include "file-utils.h"
#include "index-generator.h" #include "index-generator.h"
#include "global-settings.h" #include "global-settings.h"
@ -113,6 +114,13 @@ IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent)
{ {
if(rebuild) if(rebuild)
{ {
QDir database(QString::fromStdString(INDEX_PATH));
if(database.exists())
database.removeRecursively();
database.setPath(QString::fromStdString(CONTENT_INDEX_PATH));
if(database.exists())
database.removeRecursively();
m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE); m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE);
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE); m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OVERWRITE);
} }
@ -198,14 +206,13 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList)
// m_doc_list_path = new QList<Document>(docList); // m_doc_list_path = new QList<Document>(docList);
QThreadPool pool; QThreadPool pool;
// pool.setMaxThreadCount(1); // pool.setMaxThreadCount(1);
pool.setExpiryTimeout(100);
ConstructDocumentForPath *constructer; ConstructDocumentForPath *constructer;
while(!messageList->isEmpty()) while(!messageList->isEmpty())
{ {
constructer = new ConstructDocumentForPath(messageList->dequeue()); constructer = new ConstructDocumentForPath(messageList->dequeue());
pool.start(constructer); pool.start(constructer);
} }
// while(!pool.waitForDone(1))
// qDebug()<<"fuck"<<pool.waitForDone(1);
qDebug()<<"pool finish"<<pool.waitForDone(-1); qDebug()<<"pool finish"<<pool.waitForDone(-1);
// if(constructer) // if(constructer)
// delete constructer; // delete constructer;
@ -231,14 +238,12 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList)
ConstructDocumentForContent *constructer; ConstructDocumentForContent *constructer;
QThreadPool pool; QThreadPool pool;
// pool.setMaxThreadCount(2); // pool.setMaxThreadCount(2);
pool.setExpiryTimeout(1000); pool.setExpiryTimeout(100);
while(!messageList->isEmpty()) while(!messageList->isEmpty())
{ {
constructer = new ConstructDocumentForContent(messageList->dequeue()); constructer = new ConstructDocumentForContent(messageList->dequeue());
pool.start(constructer); pool.start(constructer);
} }
// while(!pool.waitForDone(1))
// qDebug()<<"fuck"<<pool.waitForDone(1);
qDebug()<<"pool finish"<<pool.waitForDone(-1); qDebug()<<"pool finish"<<pool.waitForDone(-1);
// if(constructer) // if(constructer)
// delete constructer; // delete constructer;