Delete content index if file content is empty.
This commit is contained in:
parent
a095150f09
commit
c2905bc331
|
@ -24,8 +24,8 @@
|
|||
#include <QThread>
|
||||
#include <QUrl>
|
||||
|
||||
//extern QList<Document> *_doc_list_path;
|
||||
//extern QMutex _mutex_doc_list_path;
|
||||
//extern QList<Document> *g_docListForPath;
|
||||
//extern QMutex g_mutexDocListForPath;
|
||||
using namespace Zeeker;
|
||||
ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
|
||||
this->setAutoDelete(true);
|
||||
|
@ -34,9 +34,9 @@ ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
|
|||
|
||||
void ConstructDocumentForPath::run() {
|
||||
// qDebug()<<"ConstructDocumentForPath";
|
||||
// if(!Zeeker::_doc_list_path)
|
||||
// Zeeker::_doc_list_path = new QVector<Document>;
|
||||
// qDebug()<<_doc_list_path->size();
|
||||
// if(!Zeeker::g_docListForPath)
|
||||
// Zeeker::g_docListForPath = new QVector<Document>;
|
||||
// qDebug()<<g_docListForPath->size();
|
||||
QString index_text = m_list.at(0).toLower();
|
||||
QString sourcePath = m_list.at(1);
|
||||
Document doc;
|
||||
|
@ -86,9 +86,9 @@ void ConstructDocumentForPath::run() {
|
|||
}
|
||||
|
||||
// QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc));
|
||||
IndexGenerator::_mutex_doc_list_path.lock();
|
||||
IndexGenerator::_doc_list_path.append(doc);
|
||||
IndexGenerator::_mutex_doc_list_path.unlock();
|
||||
IndexGenerator::g_mutexDocListForPath.lock();
|
||||
IndexGenerator::g_docListForPath.append(doc);
|
||||
IndexGenerator::g_mutexDocListForPath.unlock();
|
||||
// qDebug()<<"ConstructDocumentForPath finish";
|
||||
return;
|
||||
}
|
||||
|
@ -100,40 +100,33 @@ ConstructDocumentForContent::ConstructDocumentForContent(QString path) {
|
|||
|
||||
void ConstructDocumentForContent::run() {
|
||||
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
|
||||
// 构造文本索引的document
|
||||
// if(!Zeeker::_doc_list_content)
|
||||
// Zeeker::_doc_list_content = new QVector<Document>;
|
||||
//构造文本索引的document
|
||||
QString content;
|
||||
FileReader::getTextContent(m_path, content);
|
||||
if(content.isEmpty())
|
||||
return;
|
||||
//QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
||||
//QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
|
||||
Document doc;
|
||||
doc.setData(content);
|
||||
//doc.setUniqueTerm(uniqueterm);
|
||||
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
|
||||
//doc.addTerm(upTerm);
|
||||
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
doc.addValue(m_path);
|
||||
|
||||
if(content.isEmpty()) {
|
||||
doc.reuireDeleted();
|
||||
} else {
|
||||
doc.setData(content);
|
||||
//'\xEF\xBC\x8C' is "，" and "\xE3\x80\x82" is "。"; each is replaced with three " " so that the offset info stays correct.
|
||||
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||
|
||||
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
|
||||
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
||||
|
||||
for(size_t i = 0; i < term.size(); ++i) {
|
||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||
}
|
||||
|
||||
IndexGenerator::_mutex_doc_list_content.lock();
|
||||
IndexGenerator::_doc_list_content.append(doc);
|
||||
IndexGenerator::_mutex_doc_list_content.unlock();
|
||||
term.clear();
|
||||
term.shrink_to_fit();
|
||||
}
|
||||
IndexGenerator::g_mutexDocListForContent.lock();
|
||||
IndexGenerator::g_docListForContent.append(doc);
|
||||
IndexGenerator::g_mutexDocListForContent.unlock();
|
||||
content.clear();
|
||||
content.squeeze();
|
||||
|
||||
term.clear();
|
||||
term.shrink_to_fit();
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -108,3 +108,13 @@ QStringList Document::getIndexText() {
|
|||
Xapian::Document Document::getXapianDocument() {
|
||||
return m_document;
|
||||
}
|
||||
|
||||
// Marks this document as "to be deleted" by setting m_shouldDelete to true.
// Nothing is removed here; the flag is only read back through isRequiredDeleted().
// NOTE(review): the name looks like a misspelling of "requireDeleted"; it is kept
// as-is because it is the spelling callers use.
void Document::reuireDeleted()
|
||||
{
|
||||
m_shouldDelete = true;
|
||||
}
|
||||
|
||||
// Returns the current value of m_shouldDelete, i.e. true once reuireDeleted()
// has been called on this document (or on the document it was copied from).
// NOTE(review): this accessor does not modify the object and could be declared
// const; that would require changing the declaration in the header as well.
bool Document::isRequiredDeleted()
|
||||
{
|
||||
return m_shouldDelete;
|
||||
}
|
||||
|
|
|
@ -33,11 +33,13 @@ public:
|
|||
m_document = other.m_document;
|
||||
m_index_text = other.m_index_text;
|
||||
m_unique_term = other.m_unique_term;
|
||||
m_shouldDelete = other.m_shouldDelete;
|
||||
}
|
||||
// Copy assignment: copies m_document, m_index_text, m_unique_term and
// m_shouldDelete from `other` into this object, member by member.
// NOTE(review): returns void instead of the conventional Document&, so the
// result of an assignment cannot be used (e.g. a = b = c does not compile).
void operator=(const Document& other) {
|
||||
m_document = other.m_document;
|
||||
m_index_text = other.m_index_text;
|
||||
m_unique_term = other.m_unique_term;
|
||||
m_shouldDelete = other.m_shouldDelete;
|
||||
}
|
||||
void setData(QString &data);
|
||||
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
|
||||
|
@ -52,11 +54,14 @@ public:
|
|||
void setIndexText(QStringList indexText);
|
||||
QStringList getIndexText();
|
||||
Xapian::Document getXapianDocument();
|
||||
void reuireDeleted();
|
||||
bool isRequiredDeleted();
|
||||
private:
|
||||
Xapian::Document m_document;
|
||||
QStringList m_index_text;
|
||||
//QString m_unique_term;
|
||||
std::string m_unique_term;
|
||||
bool m_shouldDelete = false;
|
||||
|
||||
};
|
||||
}
|
||||
|
|
|
@ -38,14 +38,14 @@ using namespace Zeeker;
|
|||
|
||||
static IndexGenerator *global_instance = nullptr;
|
||||
QMutex IndexGenerator::m_mutex;
|
||||
//QVector<Document> *Zeeker::_doc_list_path;
|
||||
//QMutex Zeeker::_mutex_doc_list_path;
|
||||
//QVector<Document> *Zeeker::_doc_list_content;
|
||||
//QMutex Zeeker::_mutex_doc_list_content;
|
||||
QMutex IndexGenerator::_mutex_doc_list_path;
|
||||
QMutex IndexGenerator::_mutex_doc_list_content;
|
||||
QVector<Document> IndexGenerator::_doc_list_path = QVector<Document>();
|
||||
QVector<Document> IndexGenerator::_doc_list_content = QVector<Document>();
|
||||
//QVector<Document> *Zeeker::g_docListForPath;
|
||||
//QMutex Zeeker::g_mutexDocListForPath;
|
||||
//QVector<Document> *Zeeker::g_docListForContent;
|
||||
//QMutex Zeeker::g_mutexDocListForContent;
|
||||
QMutex IndexGenerator::g_mutexDocListForPath;
|
||||
QMutex IndexGenerator::g_mutexDocListForContent;
|
||||
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
|
||||
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
|
||||
|
||||
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
|
||||
QMutexLocker locker(&m_mutex);
|
||||
|
@ -65,15 +65,15 @@ bool IndexGenerator::setIndexdataPath() {
|
|||
//File name index
|
||||
bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
|
||||
HandlePathList(messageList);
|
||||
// if(_doc_list_path == NULL) {
|
||||
// if(g_docListForPath == NULL) {
|
||||
// return false;
|
||||
// }
|
||||
if(IndexGenerator::_doc_list_path.isEmpty()) {
|
||||
if(IndexGenerator::g_docListForPath.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
qDebug() << "begin creatAllIndex";
|
||||
try {
|
||||
for(auto i : IndexGenerator::_doc_list_path) {
|
||||
for(auto i : IndexGenerator::g_docListForPath) {
|
||||
|
||||
insertIntoDatabase(i);
|
||||
}
|
||||
|
@ -85,33 +85,32 @@ bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
|
|||
assert(false);
|
||||
}
|
||||
qDebug() << "finish creatAllIndex";
|
||||
IndexGenerator::_doc_list_path.clear();
|
||||
IndexGenerator::_doc_list_path.squeeze();
|
||||
QVector<Document>().swap(IndexGenerator::_doc_list_path);
|
||||
IndexGenerator::g_docListForPath.clear();
|
||||
IndexGenerator::g_docListForPath.squeeze();
|
||||
QVector<Document>().swap(IndexGenerator::g_docListForPath);
|
||||
|
||||
// delete _doc_list_path;
|
||||
// _doc_list_path = nullptr;
|
||||
// delete g_docListForPath;
|
||||
// g_docListForPath = nullptr;
|
||||
return true;
|
||||
}
|
||||
//File content index
|
||||
bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
||||
// FileUtils::_index_status |= 0x2;
|
||||
HandlePathList(messageList);
|
||||
qDebug() << "begin creatAllIndex for content";
|
||||
// if(_doc_list_content == NULL) {
|
||||
// return false;
|
||||
// }
|
||||
if(IndexGenerator::_doc_list_content.isEmpty()) {
|
||||
if(IndexGenerator::g_docListForContent.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
int size = IndexGenerator::_doc_list_content.size();
|
||||
int size = IndexGenerator::g_docListForContent.size();
|
||||
qDebug() << "begin creatAllIndex for content" << size;
|
||||
if(!size == 0) {
|
||||
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "0");
|
||||
try {
|
||||
int count = 0;
|
||||
for(auto i : IndexGenerator::_doc_list_content) {
|
||||
insertIntoContentDatabase(i);
|
||||
for(Document i : IndexGenerator::g_docListForContent) {
|
||||
if(!i.isRequiredDeleted()) {
|
||||
m_database_content->replace_document(i.getUniqueTerm(), i.getXapianDocument());
|
||||
} else {
|
||||
m_database_content->delete_document(i.getUniqueTerm());
|
||||
}
|
||||
if(++count > 999) {
|
||||
count = 0;
|
||||
m_database_content->commit();
|
||||
|
@ -121,16 +120,13 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
|||
} catch(const Xapian::Error &e) {
|
||||
qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description());
|
||||
IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
|
||||
// FileUtils::_index_status &= ~0x2;
|
||||
assert(false);
|
||||
}
|
||||
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
|
||||
// FileUtils::_index_status &= ~0x2;
|
||||
qDebug() << "finish creatAllIndex for content";
|
||||
|
||||
IndexGenerator::_doc_list_content.clear();
|
||||
IndexGenerator::_doc_list_content.squeeze();
|
||||
QVector<Document>().swap(IndexGenerator::_doc_list_content);
|
||||
IndexGenerator::g_docListForContent.clear();
|
||||
IndexGenerator::g_docListForContent.squeeze();
|
||||
QVector<Document>().swap(IndexGenerator::g_docListForContent);
|
||||
malloc_trim(0);
|
||||
}
|
||||
Q_EMIT this->transactionFinished();
|
||||
|
@ -175,12 +171,12 @@ IndexGenerator::~IndexGenerator() {
|
|||
// if(m_index_map)
|
||||
// delete m_index_map;
|
||||
// m_index_map = nullptr;
|
||||
// if(m_doc_list_path)
|
||||
// delete m_doc_list_path;
|
||||
// m_doc_list_path = nullptr;
|
||||
// if(m_doc_list_content)
|
||||
// delete m_doc_list_content;
|
||||
// m_doc_list_content = nullptr;
|
||||
// if(mg_docListForPath)
|
||||
// delete mg_docListForPath;
|
||||
// mg_docListForPath = nullptr;
|
||||
// if(mg_docListForContent)
|
||||
// delete mg_docListForContent;
|
||||
// mg_docListForContent = nullptr;
|
||||
// if(m_index_data_path)
|
||||
// delete m_index_data_path;
|
||||
// m_index_data_path = nullptr;
|
||||
|
@ -230,7 +226,7 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
|
|||
|
||||
// QList<Document> docList = future.results();
|
||||
// future.cancel();
|
||||
// m_doc_list_path = new QList<Document>(docList);
|
||||
// mg_docListForPath = new QList<Document>(docList);
|
||||
QThreadPool pool;
|
||||
pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1);
|
||||
pool.setExpiryTimeout(100);
|
||||
|
@ -244,12 +240,12 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
|
|||
// delete constructer;
|
||||
// constructer = nullptr;
|
||||
|
||||
// qDebug()<<_doc_list_path->size();
|
||||
// qWarning() << _doc_list_path;
|
||||
// qDebug()<<g_docListForPath->size();
|
||||
// qWarning() << g_docListForPath;
|
||||
// QList<Document> docList = future.results();
|
||||
// m_doc_list_path = new QList<Document>(docList);
|
||||
// m_doc_list_path = std::move(future.results());
|
||||
// qDebug()<<m_doc_list_path.size();
|
||||
// mg_docListForPath = new QList<Document>(docList);
|
||||
// mg_docListForPath = std::move(future.results());
|
||||
// qDebug()<<mg_docListForPath.size();
|
||||
|
||||
qDebug() << "Finish HandlePathList!";
|
||||
return;
|
||||
|
@ -280,13 +276,13 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
|
|||
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
|
||||
|
||||
// QList<Document> docList = future.results();
|
||||
// m_doc_list_content = new QList<Document>(docList);
|
||||
// mg_docListForContent = new QList<Document>(docList);
|
||||
|
||||
// qDebug()<<_doc_list_content->size();
|
||||
// qDebug()<<g_docListForContent->size();
|
||||
|
||||
// QList<Document> docList = future.results();
|
||||
// m_doc_list_content = new QList<Document>(docList);
|
||||
// m_doc_list_content = std::move(future.results());
|
||||
// mg_docListForContent = new QList<Document>(docList);
|
||||
// mg_docListForContent = std::move(future.results());
|
||||
// future.cancel();
|
||||
|
||||
qDebug() << "Finish HandlePathList for content index!";
|
||||
|
|
|
@ -78,10 +78,10 @@ private:
|
|||
void insertIntoDatabase(Document& doc);
|
||||
void insertIntoContentDatabase(Document& doc);
|
||||
|
||||
static QVector<Document> _doc_list_path;
|
||||
static QMutex _mutex_doc_list_path;
|
||||
static QVector<Document> _doc_list_content;
|
||||
static QMutex _mutex_doc_list_content;
|
||||
static QVector<Document> g_docListForPath;
|
||||
static QMutex g_mutexDocListForPath;
|
||||
static QVector<Document> g_docListForContent;
|
||||
static QMutex g_mutexDocListForContent;
|
||||
QMap<QString, QStringList> m_index_map;
|
||||
QString m_index_data_path;
|
||||
Xapian::WritableDatabase* m_database_path;
|
||||
|
|
Loading…
Reference in New Issue