Delete content index if file content is empty.

This commit is contained in:
iaom 2021-11-02 15:44:12 +08:00
parent a095150f09
commit c2905bc331
5 changed files with 87 additions and 83 deletions

View File

@ -24,8 +24,8 @@
#include <QThread>
#include <QUrl>
//extern QList<Document> *_doc_list_path;
//extern QMutex _mutex_doc_list_path;
//extern QList<Document> *g_docListForPath;
//extern QMutex g_mutexDocListForPath;
using namespace Zeeker;
ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
this->setAutoDelete(true);
@ -34,9 +34,9 @@ ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
void ConstructDocumentForPath::run() {
// qDebug()<<"ConstructDocumentForPath";
// if(!Zeeker::_doc_list_path)
// Zeeker::_doc_list_path = new QVector<Document>;
// qDebug()<<_doc_list_path->size();
// if(!Zeeker::g_docListForPath)
// Zeeker::g_docListForPath = new QVector<Document>;
// qDebug()<<g_docListForPath->size();
QString index_text = m_list.at(0).toLower();
QString sourcePath = m_list.at(1);
Document doc;
@ -86,9 +86,9 @@ void ConstructDocumentForPath::run() {
}
// QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc));
IndexGenerator::_mutex_doc_list_path.lock();
IndexGenerator::_doc_list_path.append(doc);
IndexGenerator::_mutex_doc_list_path.unlock();
IndexGenerator::g_mutexDocListForPath.lock();
IndexGenerator::g_docListForPath.append(doc);
IndexGenerator::g_mutexDocListForPath.unlock();
// qDebug()<<"ConstructDocumentForPath finish";
return;
}
@ -100,40 +100,33 @@ ConstructDocumentForContent::ConstructDocumentForContent(QString path) {
// Task body: reads the text content of m_path, wraps it in a Document and
// appends that Document to IndexGenerator's content list under its mutex.
// NOTE(review): this listing comes from a commit diff whose +/- markers are
// missing, so superseded and replacement statements appear side by side
// (e.g. both the early return on empty content and the reuireDeleted() branch).
void ConstructDocumentForContent::run() {
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
// Build the document for the text (content) index
// if(!Zeeker::_doc_list_content)
// Zeeker::_doc_list_content = new QVector<Document>;
// Build the document for the text (content) index
QString content;
FileReader::getTextContent(m_path, content);
if(content.isEmpty())
return;
//QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
//QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
Document doc;
doc.setData(content);
//doc.setUniqueTerm(uniqueterm);
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
//doc.addTerm(upTerm);
// Extra term built from the path with its last "/"-separated section removed.
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
doc.addValue(m_path);
// "\xEF\xBC\x8C" is U+FF0C (fullwidth comma) and "\xE3\x80\x82" is U+3002 (ideographic full stop).
// Both are replaced with spaces to keep the offset info valid.
// NOTE(review): the original note says three spaces are used per character, but the literals shown here hold one - confirm against the real file.
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
for(size_t i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
// Empty content: only flag the document, so the content index entry is deleted rather than replaced.
if(content.isEmpty()) {
doc.reuireDeleted();
} else {
doc.setData(content);
// "\xEF\xBC\x8C" is U+FF0C (fullwidth comma) and "\xE3\x80\x82" is U+3002 (ideographic full stop).
// Both are replaced with spaces to keep the offset info valid.
// NOTE(review): the original note says three spaces are used per character, but the literals shown here hold one - confirm against the real file.
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
// Only the first 20480000 characters of the text are passed to the segmenter.
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
for(size_t i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
}
term.clear();
term.shrink_to_fit();
}
IndexGenerator::_mutex_doc_list_content.lock();
IndexGenerator::_doc_list_content.append(doc);
IndexGenerator::_mutex_doc_list_content.unlock();
// Append the finished document while holding the list's mutex.
IndexGenerator::g_mutexDocListForContent.lock();
IndexGenerator::g_docListForContent.append(doc);
IndexGenerator::g_mutexDocListForContent.unlock();
// Drop the extracted text and release its capacity.
content.clear();
content.squeeze();
term.clear();
term.shrink_to_fit();
return;
}

View File

@ -108,3 +108,13 @@ QStringList Document::getIndexText() {
Xapian::Document Document::getXapianDocument() {
return m_document;
}
// Flags this document for removal by setting m_shouldDelete, so that
// isRequiredDeleted() reports true afterwards.
// NOTE(review): "reuire" looks like a misspelling of "require"; the name is
// left as is because the class declaration and the caller use this spelling.
void Document::reuireDeleted()
{
m_shouldDelete = true;
}
// Returns the m_shouldDelete flag, i.e. whether this document has been marked
// for deletion via reuireDeleted().
bool Document::isRequiredDeleted()
{
return m_shouldDelete;
}

View File

@ -33,11 +33,13 @@ public:
m_document = other.m_document;
m_index_text = other.m_index_text;
m_unique_term = other.m_unique_term;
m_shouldDelete = other.m_shouldDelete;
}
// Copy assignment: copies the Xapian document, the index text, the unique term
// and the delete flag from other.
// Returns a reference to this object, as the built-in assignment does, so the
// result can be chained or used in an expression. Callers that ignored the
// previous void result are unaffected.
Document& operator=(const Document& other) {
m_document = other.m_document;
m_index_text = other.m_index_text;
m_unique_term = other.m_unique_term;
m_shouldDelete = other.m_shouldDelete;
return *this;
}
void setData(QString &data);
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
@ -52,11 +54,14 @@ public:
void setIndexText(QStringList indexText);
QStringList getIndexText();
Xapian::Document getXapianDocument();
void reuireDeleted();
bool isRequiredDeleted();
private:
Xapian::Document m_document;
QStringList m_index_text;
//QString m_unique_term;
std::string m_unique_term;
bool m_shouldDelete = false;
};
}

View File

@ -38,14 +38,14 @@ using namespace Zeeker;
static IndexGenerator *global_instance = nullptr;
QMutex IndexGenerator::m_mutex;
//QVector<Document> *Zeeker::_doc_list_path;
//QMutex Zeeker::_mutex_doc_list_path;
//QVector<Document> *Zeeker::_doc_list_content;
//QMutex Zeeker::_mutex_doc_list_content;
QMutex IndexGenerator::_mutex_doc_list_path;
QMutex IndexGenerator::_mutex_doc_list_content;
QVector<Document> IndexGenerator::_doc_list_path = QVector<Document>();
QVector<Document> IndexGenerator::_doc_list_content = QVector<Document>();
//QVector<Document> *Zeeker::g_docListForPath;
//QMutex Zeeker::g_mutexDocListForPath;
//QVector<Document> *Zeeker::g_docListForContent;
//QMutex Zeeker::g_mutexDocListForContent;
// Definitions of IndexGenerator's static members. The ConstructDocumentFor*
// tasks append their documents to these two lists while holding the matching
// mutex; creatAllIndex() later reads the lists and clears them.
QMutex IndexGenerator::g_mutexDocListForPath;
QMutex IndexGenerator::g_mutexDocListForContent;
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
QMutexLocker locker(&m_mutex);
@ -65,15 +65,15 @@ bool IndexGenerator::setIndexdataPath() {
// File-name index
bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
HandlePathList(messageList);
// if(_doc_list_path == NULL) {
// if(g_docListForPath == NULL) {
// return false;
// }
if(IndexGenerator::_doc_list_path.isEmpty()) {
if(IndexGenerator::g_docListForPath.isEmpty()) {
return false;
}
qDebug() << "begin creatAllIndex";
try {
for(auto i : IndexGenerator::_doc_list_path) {
for(auto i : IndexGenerator::g_docListForPath) {
insertIntoDatabase(i);
}
@ -85,33 +85,32 @@ bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
assert(false);
}
qDebug() << "finish creatAllIndex";
IndexGenerator::_doc_list_path.clear();
IndexGenerator::_doc_list_path.squeeze();
QVector<Document>().swap(IndexGenerator::_doc_list_path);
IndexGenerator::g_docListForPath.clear();
IndexGenerator::g_docListForPath.squeeze();
QVector<Document>().swap(IndexGenerator::g_docListForPath);
// delete _doc_list_path;
// _doc_list_path = nullptr;
// delete g_docListForPath;
// g_docListForPath = nullptr;
return true;
}
// File-content index
bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
// FileUtils::_index_status |= 0x2;
HandlePathList(messageList);
qDebug() << "begin creatAllIndex for content";
// if(_doc_list_content == NULL) {
// return false;
// }
if(IndexGenerator::_doc_list_content.isEmpty()) {
if(IndexGenerator::g_docListForContent.isEmpty()) {
return false;
}
int size = IndexGenerator::_doc_list_content.size();
int size = IndexGenerator::g_docListForContent.size();
qDebug() << "begin creatAllIndex for content" << size;
if(!size == 0) {
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "0");
try {
int count = 0;
for(auto i : IndexGenerator::_doc_list_content) {
insertIntoContentDatabase(i);
for(Document i : IndexGenerator::g_docListForContent) {
if(!i.isRequiredDeleted()) {
m_database_content->replace_document(i.getUniqueTerm(), i.getXapianDocument());
} else {
m_database_content->delete_document(i.getUniqueTerm());
}
if(++count > 999) {
count = 0;
m_database_content->commit();
@ -121,16 +120,13 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
} catch(const Xapian::Error &e) {
qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description());
IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
// FileUtils::_index_status &= ~0x2;
assert(false);
}
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
// FileUtils::_index_status &= ~0x2;
qDebug() << "finish creatAllIndex for content";
IndexGenerator::_doc_list_content.clear();
IndexGenerator::_doc_list_content.squeeze();
QVector<Document>().swap(IndexGenerator::_doc_list_content);
IndexGenerator::g_docListForContent.clear();
IndexGenerator::g_docListForContent.squeeze();
QVector<Document>().swap(IndexGenerator::g_docListForContent);
malloc_trim(0);
}
Q_EMIT this->transactionFinished();
@ -175,12 +171,12 @@ IndexGenerator::~IndexGenerator() {
// if(m_index_map)
// delete m_index_map;
// m_index_map = nullptr;
// if(m_doc_list_path)
// delete m_doc_list_path;
// m_doc_list_path = nullptr;
// if(m_doc_list_content)
// delete m_doc_list_content;
// m_doc_list_content = nullptr;
// if(mg_docListForPath)
// delete mg_docListForPath;
// mg_docListForPath = nullptr;
// if(mg_docListForContent)
// delete mg_docListForContent;
// mg_docListForContent = nullptr;
// if(m_index_data_path)
// delete m_index_data_path;
// m_index_data_path = nullptr;
@ -230,7 +226,7 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
// QList<Document> docList = future.results();
// future.cancel();
// m_doc_list_path = new QList<Document>(docList);
// mg_docListForPath = new QList<Document>(docList);
QThreadPool pool;
pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1);
pool.setExpiryTimeout(100);
@ -244,12 +240,12 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
// delete constructer;
// constructer = nullptr;
// qDebug()<<_doc_list_path->size();
// qWarning() << _doc_list_path;
// qDebug()<<g_docListForPath->size();
// qWarning() << g_docListForPath;
// QList<Document> docList = future.results();
// m_doc_list_path = new QList<Document>(docList);
// m_doc_list_path = std::move(future.results());
// qDebug()<<m_doc_list_path.size();
// mg_docListForPath = new QList<Document>(docList);
// mg_docListForPath = std::move(future.results());
// qDebug()<<mg_docListForPath.size();
qDebug() << "Finish HandlePathList!";
return;
@ -280,13 +276,13 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
// QList<Document> docList = future.results();
// m_doc_list_content = new QList<Document>(docList);
// mg_docListForContent = new QList<Document>(docList);
// qDebug()<<_doc_list_content->size();
// qDebug()<<g_docListForContent->size();
// QList<Document> docList = future.results();
// m_doc_list_content = new QList<Document>(docList);
// m_doc_list_content = std::move(future.results());
// mg_docListForContent = new QList<Document>(docList);
// mg_docListForContent = std::move(future.results());
// future.cancel();
qDebug() << "Finish HandlePathList for content index!";

View File

@ -78,10 +78,10 @@ private:
void insertIntoDatabase(Document& doc);
void insertIntoContentDatabase(Document& doc);
static QVector<Document> _doc_list_path;
static QMutex _mutex_doc_list_path;
static QVector<Document> _doc_list_content;
static QMutex _mutex_doc_list_content;
static QVector<Document> g_docListForPath;
static QMutex g_mutexDocListForPath;
static QVector<Document> g_docListForContent;
static QMutex g_mutexDocListForContent;
QMap<QString, QStringList> m_index_map;
QString m_index_data_path;
Xapian::WritableDatabase* m_database_path;