Delete content index if file content is empty.

This commit is contained in:
iaom 2021-11-02 15:44:12 +08:00
parent a095150f09
commit c2905bc331
5 changed files with 87 additions and 83 deletions

View File

@ -24,8 +24,8 @@
#include <QThread> #include <QThread>
#include <QUrl> #include <QUrl>
//extern QList<Document> *_doc_list_path; //extern QList<Document> *g_docListForPath;
//extern QMutex _mutex_doc_list_path; //extern QMutex g_mutexDocListForPath;
using namespace Zeeker; using namespace Zeeker;
ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) { ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
this->setAutoDelete(true); this->setAutoDelete(true);
@ -34,9 +34,9 @@ ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
void ConstructDocumentForPath::run() { void ConstructDocumentForPath::run() {
// qDebug()<<"ConstructDocumentForPath"; // qDebug()<<"ConstructDocumentForPath";
// if(!Zeeker::_doc_list_path) // if(!Zeeker::g_docListForPath)
// Zeeker::_doc_list_path = new QVector<Document>; // Zeeker::g_docListForPath = new QVector<Document>;
// qDebug()<<_doc_list_path->size(); // qDebug()<<g_docListForPath->size();
QString index_text = m_list.at(0).toLower(); QString index_text = m_list.at(0).toLower();
QString sourcePath = m_list.at(1); QString sourcePath = m_list.at(1);
Document doc; Document doc;
@ -86,9 +86,9 @@ void ConstructDocumentForPath::run() {
} }
// QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc)); // QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc));
IndexGenerator::_mutex_doc_list_path.lock(); IndexGenerator::g_mutexDocListForPath.lock();
IndexGenerator::_doc_list_path.append(doc); IndexGenerator::g_docListForPath.append(doc);
IndexGenerator::_mutex_doc_list_path.unlock(); IndexGenerator::g_mutexDocListForPath.unlock();
// qDebug()<<"ConstructDocumentForPath finish"; // qDebug()<<"ConstructDocumentForPath finish";
return; return;
} }
@ -100,40 +100,33 @@ ConstructDocumentForContent::ConstructDocumentForContent(QString path) {
void ConstructDocumentForContent::run() { void ConstructDocumentForContent::run() {
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId(); // qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
// 构造文本索引的document //构造文本索引的document
// if(!Zeeker::_doc_list_content)
// Zeeker::_doc_list_content = new QVector<Document>;
QString content; QString content;
FileReader::getTextContent(m_path, content); FileReader::getTextContent(m_path, content);
if(content.isEmpty())
return;
//QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
//QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
Document doc; Document doc;
doc.setData(content);
//doc.setUniqueTerm(uniqueterm);
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path)); doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
//doc.addTerm(upTerm);
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep))); doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
doc.addValue(m_path); doc.addValue(m_path);
//'\xEF\xBC\x8C' is "" "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. if(content.isEmpty()) {
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); doc.reuireDeleted();
} else {
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); doc.setData(content);
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); //'\xEF\xBC\x8C' is "" "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
for(size_t i = 0; i < term.size(); ++i) { std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight)); for(size_t i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
}
term.clear();
term.shrink_to_fit();
} }
IndexGenerator::g_mutexDocListForContent.lock();
IndexGenerator::_mutex_doc_list_content.lock(); IndexGenerator::g_docListForContent.append(doc);
IndexGenerator::_doc_list_content.append(doc); IndexGenerator::g_mutexDocListForContent.unlock();
IndexGenerator::_mutex_doc_list_content.unlock();
content.clear(); content.clear();
content.squeeze(); content.squeeze();
term.clear();
term.shrink_to_fit();
return; return;
} }

View File

@ -108,3 +108,13 @@ QStringList Document::getIndexText() {
Xapian::Document Document::getXapianDocument() { Xapian::Document Document::getXapianDocument() {
return m_document; return m_document;
} }
void Document::reuireDeleted()
{
m_shouldDelete = true;
}
bool Document::isRequiredDeleted()
{
return m_shouldDelete;
}

View File

@ -33,11 +33,13 @@ public:
m_document = other.m_document; m_document = other.m_document;
m_index_text = other.m_index_text; m_index_text = other.m_index_text;
m_unique_term = other.m_unique_term; m_unique_term = other.m_unique_term;
m_shouldDelete = other.m_shouldDelete;
} }
void operator=(const Document& other) { void operator=(const Document& other) {
m_document = other.m_document; m_document = other.m_document;
m_index_text = other.m_index_text; m_index_text = other.m_index_text;
m_unique_term = other.m_unique_term; m_unique_term = other.m_unique_term;
m_shouldDelete = other.m_shouldDelete;
} }
void setData(QString &data); void setData(QString &data);
void addPosting(std::string term, QVector<size_t> offset, int weight = 1); void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
@ -52,11 +54,14 @@ public:
void setIndexText(QStringList indexText); void setIndexText(QStringList indexText);
QStringList getIndexText(); QStringList getIndexText();
Xapian::Document getXapianDocument(); Xapian::Document getXapianDocument();
void reuireDeleted();
bool isRequiredDeleted();
private: private:
Xapian::Document m_document; Xapian::Document m_document;
QStringList m_index_text; QStringList m_index_text;
//QString m_unique_term; //QString m_unique_term;
std::string m_unique_term; std::string m_unique_term;
bool m_shouldDelete = false;
}; };
} }

View File

@ -38,14 +38,14 @@ using namespace Zeeker;
static IndexGenerator *global_instance = nullptr; static IndexGenerator *global_instance = nullptr;
QMutex IndexGenerator::m_mutex; QMutex IndexGenerator::m_mutex;
//QVector<Document> *Zeeker::_doc_list_path; //QVector<Document> *Zeeker::g_docListForPath;
//QMutex Zeeker::_mutex_doc_list_path; //QMutex Zeeker::g_mutexDocListForPath;
//QVector<Document> *Zeeker::_doc_list_content; //QVector<Document> *Zeeker::g_docListForContent;
//QMutex Zeeker::_mutex_doc_list_content; //QMutex Zeeker::g_mutexDocListForContent;
QMutex IndexGenerator::_mutex_doc_list_path; QMutex IndexGenerator::g_mutexDocListForPath;
QMutex IndexGenerator::_mutex_doc_list_content; QMutex IndexGenerator::g_mutexDocListForContent;
QVector<Document> IndexGenerator::_doc_list_path = QVector<Document>(); QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
QVector<Document> IndexGenerator::_doc_list_content = QVector<Document>(); QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) { IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
QMutexLocker locker(&m_mutex); QMutexLocker locker(&m_mutex);
@ -65,15 +65,15 @@ bool IndexGenerator::setIndexdataPath() {
//文件名索引 //文件名索引
bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) { bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
HandlePathList(messageList); HandlePathList(messageList);
// if(_doc_list_path == NULL) { // if(g_docListForPath == NULL) {
// return false; // return false;
// } // }
if(IndexGenerator::_doc_list_path.isEmpty()) { if(IndexGenerator::g_docListForPath.isEmpty()) {
return false; return false;
} }
qDebug() << "begin creatAllIndex"; qDebug() << "begin creatAllIndex";
try { try {
for(auto i : IndexGenerator::_doc_list_path) { for(auto i : IndexGenerator::g_docListForPath) {
insertIntoDatabase(i); insertIntoDatabase(i);
} }
@ -85,33 +85,32 @@ bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
assert(false); assert(false);
} }
qDebug() << "finish creatAllIndex"; qDebug() << "finish creatAllIndex";
IndexGenerator::_doc_list_path.clear(); IndexGenerator::g_docListForPath.clear();
IndexGenerator::_doc_list_path.squeeze(); IndexGenerator::g_docListForPath.squeeze();
QVector<Document>().swap(IndexGenerator::_doc_list_path); QVector<Document>().swap(IndexGenerator::g_docListForPath);
// delete _doc_list_path; // delete g_docListForPath;
// _doc_list_path = nullptr; // g_docListForPath = nullptr;
return true; return true;
} }
//文件内容索引 //文件内容索引
bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) { bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
// FileUtils::_index_status |= 0x2;
HandlePathList(messageList); HandlePathList(messageList);
qDebug() << "begin creatAllIndex for content"; qDebug() << "begin creatAllIndex for content";
// if(_doc_list_content == NULL) { if(IndexGenerator::g_docListForContent.isEmpty()) {
// return false;
// }
if(IndexGenerator::_doc_list_content.isEmpty()) {
return false; return false;
} }
int size = IndexGenerator::_doc_list_content.size(); int size = IndexGenerator::g_docListForContent.size();
qDebug() << "begin creatAllIndex for content" << size; qDebug() << "begin creatAllIndex for content" << size;
if(!size == 0) { if(!size == 0) {
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "0");
try { try {
int count = 0; int count = 0;
for(auto i : IndexGenerator::_doc_list_content) { for(Document i : IndexGenerator::g_docListForContent) {
insertIntoContentDatabase(i); if(!i.isRequiredDeleted()) {
m_database_content->replace_document(i.getUniqueTerm(), i.getXapianDocument());
} else {
m_database_content->delete_document(i.getUniqueTerm());
}
if(++count > 999) { if(++count > 999) {
count = 0; count = 0;
m_database_content->commit(); m_database_content->commit();
@ -121,16 +120,13 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
} catch(const Xapian::Error &e) { } catch(const Xapian::Error &e) {
qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description()); qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description());
IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1"); IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
// FileUtils::_index_status &= ~0x2;
assert(false); assert(false);
} }
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
// FileUtils::_index_status &= ~0x2;
qDebug() << "finish creatAllIndex for content"; qDebug() << "finish creatAllIndex for content";
IndexGenerator::_doc_list_content.clear(); IndexGenerator::g_docListForContent.clear();
IndexGenerator::_doc_list_content.squeeze(); IndexGenerator::g_docListForContent.squeeze();
QVector<Document>().swap(IndexGenerator::_doc_list_content); QVector<Document>().swap(IndexGenerator::g_docListForContent);
malloc_trim(0); malloc_trim(0);
} }
Q_EMIT this->transactionFinished(); Q_EMIT this->transactionFinished();
@ -175,12 +171,12 @@ IndexGenerator::~IndexGenerator() {
// if(m_index_map) // if(m_index_map)
// delete m_index_map; // delete m_index_map;
// m_index_map = nullptr; // m_index_map = nullptr;
// if(m_doc_list_path) // if(mg_docListForPath)
// delete m_doc_list_path; // delete mg_docListForPath;
// m_doc_list_path = nullptr; // mg_docListForPath = nullptr;
// if(m_doc_list_content) // if(mg_docListForContent)
// delete m_doc_list_content; // delete mg_docListForContent;
// m_doc_list_content = nullptr; // mg_docListForContent = nullptr;
// if(m_index_data_path) // if(m_index_data_path)
// delete m_index_data_path; // delete m_index_data_path;
// m_index_data_path = nullptr; // m_index_data_path = nullptr;
@ -230,7 +226,7 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
// QList<Document> docList = future.results(); // QList<Document> docList = future.results();
// future.cancel(); // future.cancel();
// m_doc_list_path = new QList<Document>(docList); // mg_docListForPath = new QList<Document>(docList);
QThreadPool pool; QThreadPool pool;
pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1); pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1);
pool.setExpiryTimeout(100); pool.setExpiryTimeout(100);
@ -244,12 +240,12 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
// delete constructer; // delete constructer;
// constructer = nullptr; // constructer = nullptr;
// qDebug()<<_doc_list_path->size(); // qDebug()<<g_docListForPath->size();
// qWarning() << _doc_list_path; // qWarning() << g_docListForPath;
// QList<Document> docList = future.results(); // QList<Document> docList = future.results();
// m_doc_list_path = new QList<Document>(docList); // mg_docListForPath = new QList<Document>(docList);
// m_doc_list_path = std::move(future.results()); // mg_docListForPath = std::move(future.results());
// qDebug()<<m_doc_list_path.size(); // qDebug()<<mg_docListForPath.size();
qDebug() << "Finish HandlePathList!"; qDebug() << "Finish HandlePathList!";
return; return;
@ -280,13 +276,13 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
// ChineseSegmentation::getInstance()->~ChineseSegmentation(); // ChineseSegmentation::getInstance()->~ChineseSegmentation();
// QList<Document> docList = future.results(); // QList<Document> docList = future.results();
// m_doc_list_content = new QList<Document>(docList); // mg_docListForContent = new QList<Document>(docList);
// qDebug()<<_doc_list_content->size(); // qDebug()<<g_docListForContent->size();
// QList<Document> docList = future.results(); // QList<Document> docList = future.results();
// m_doc_list_content = new QList<Document>(docList); // mg_docListForContent = new QList<Document>(docList);
// m_doc_list_content = std::move(future.results()); // mg_docListForContent = std::move(future.results());
// future.cancel(); // future.cancel();
qDebug() << "Finish HandlePathList for content index!"; qDebug() << "Finish HandlePathList for content index!";

View File

@ -78,10 +78,10 @@ private:
void insertIntoDatabase(Document& doc); void insertIntoDatabase(Document& doc);
void insertIntoContentDatabase(Document& doc); void insertIntoContentDatabase(Document& doc);
static QVector<Document> _doc_list_path; static QVector<Document> g_docListForPath;
static QMutex _mutex_doc_list_path; static QMutex g_mutexDocListForPath;
static QVector<Document> _doc_list_content; static QVector<Document> g_docListForContent;
static QMutex _mutex_doc_list_content; static QMutex g_mutexDocListForContent;
QMap<QString, QStringList> m_index_map; QMap<QString, QStringList> m_index_map;
QString m_index_data_path; QString m_index_data_path;
Xapian::WritableDatabase* m_database_path; Xapian::WritableDatabase* m_database_path;