Delete content index if file content is empty.
This commit is contained in:
parent
a095150f09
commit
c2905bc331
|
@ -24,8 +24,8 @@
|
||||||
#include <QThread>
|
#include <QThread>
|
||||||
#include <QUrl>
|
#include <QUrl>
|
||||||
|
|
||||||
//extern QList<Document> *_doc_list_path;
|
//extern QList<Document> *g_docListForPath;
|
||||||
//extern QMutex _mutex_doc_list_path;
|
//extern QMutex g_mutexDocListForPath;
|
||||||
using namespace Zeeker;
|
using namespace Zeeker;
|
||||||
ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
|
ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
|
||||||
this->setAutoDelete(true);
|
this->setAutoDelete(true);
|
||||||
|
@ -34,9 +34,9 @@ ConstructDocumentForPath::ConstructDocumentForPath(QVector<QString> list) {
|
||||||
|
|
||||||
void ConstructDocumentForPath::run() {
|
void ConstructDocumentForPath::run() {
|
||||||
// qDebug()<<"ConstructDocumentForPath";
|
// qDebug()<<"ConstructDocumentForPath";
|
||||||
// if(!Zeeker::_doc_list_path)
|
// if(!Zeeker::g_docListForPath)
|
||||||
// Zeeker::_doc_list_path = new QVector<Document>;
|
// Zeeker::g_docListForPath = new QVector<Document>;
|
||||||
// qDebug()<<_doc_list_path->size();
|
// qDebug()<<g_docListForPath->size();
|
||||||
QString index_text = m_list.at(0).toLower();
|
QString index_text = m_list.at(0).toLower();
|
||||||
QString sourcePath = m_list.at(1);
|
QString sourcePath = m_list.at(1);
|
||||||
Document doc;
|
Document doc;
|
||||||
|
@ -86,9 +86,9 @@ void ConstructDocumentForPath::run() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc));
|
// QMetaObject::invokeMethod(m_indexGenerator,"appendDocListPath",Q_ARG(Document,doc));
|
||||||
IndexGenerator::_mutex_doc_list_path.lock();
|
IndexGenerator::g_mutexDocListForPath.lock();
|
||||||
IndexGenerator::_doc_list_path.append(doc);
|
IndexGenerator::g_docListForPath.append(doc);
|
||||||
IndexGenerator::_mutex_doc_list_path.unlock();
|
IndexGenerator::g_mutexDocListForPath.unlock();
|
||||||
// qDebug()<<"ConstructDocumentForPath finish";
|
// qDebug()<<"ConstructDocumentForPath finish";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -100,40 +100,33 @@ ConstructDocumentForContent::ConstructDocumentForContent(QString path) {
|
||||||
|
|
||||||
void ConstructDocumentForContent::run() {
|
void ConstructDocumentForContent::run() {
|
||||||
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
|
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
|
||||||
// 构造文本索引的document
|
//构造文本索引的document
|
||||||
// if(!Zeeker::_doc_list_content)
|
|
||||||
// Zeeker::_doc_list_content = new QVector<Document>;
|
|
||||||
QString content;
|
QString content;
|
||||||
FileReader::getTextContent(m_path, content);
|
FileReader::getTextContent(m_path, content);
|
||||||
if(content.isEmpty())
|
|
||||||
return;
|
|
||||||
//QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
|
||||||
//QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
|
||||||
Document doc;
|
Document doc;
|
||||||
doc.setData(content);
|
|
||||||
//doc.setUniqueTerm(uniqueterm);
|
|
||||||
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
|
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
|
||||||
//doc.addTerm(upTerm);
|
|
||||||
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||||
doc.addValue(m_path);
|
doc.addValue(m_path);
|
||||||
|
|
||||||
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
if(content.isEmpty()) {
|
||||||
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
doc.reuireDeleted();
|
||||||
|
} else {
|
||||||
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
|
doc.setData(content);
|
||||||
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||||
|
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||||
for(size_t i = 0; i < term.size(); ++i) {
|
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
||||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
for(size_t i = 0; i < term.size(); ++i) {
|
||||||
|
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||||
|
}
|
||||||
|
term.clear();
|
||||||
|
term.shrink_to_fit();
|
||||||
}
|
}
|
||||||
|
IndexGenerator::g_mutexDocListForContent.lock();
|
||||||
IndexGenerator::_mutex_doc_list_content.lock();
|
IndexGenerator::g_docListForContent.append(doc);
|
||||||
IndexGenerator::_doc_list_content.append(doc);
|
IndexGenerator::g_mutexDocListForContent.unlock();
|
||||||
IndexGenerator::_mutex_doc_list_content.unlock();
|
|
||||||
content.clear();
|
content.clear();
|
||||||
content.squeeze();
|
content.squeeze();
|
||||||
|
|
||||||
term.clear();
|
|
||||||
term.shrink_to_fit();
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -108,3 +108,13 @@ QStringList Document::getIndexText() {
|
||||||
Xapian::Document Document::getXapianDocument() {
|
Xapian::Document Document::getXapianDocument() {
|
||||||
return m_document;
|
return m_document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Document::reuireDeleted()
|
||||||
|
{
|
||||||
|
m_shouldDelete = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Document::isRequiredDeleted()
|
||||||
|
{
|
||||||
|
return m_shouldDelete;
|
||||||
|
}
|
||||||
|
|
|
@ -33,11 +33,13 @@ public:
|
||||||
m_document = other.m_document;
|
m_document = other.m_document;
|
||||||
m_index_text = other.m_index_text;
|
m_index_text = other.m_index_text;
|
||||||
m_unique_term = other.m_unique_term;
|
m_unique_term = other.m_unique_term;
|
||||||
|
m_shouldDelete = other.m_shouldDelete;
|
||||||
}
|
}
|
||||||
void operator=(const Document& other) {
|
void operator=(const Document& other) {
|
||||||
m_document = other.m_document;
|
m_document = other.m_document;
|
||||||
m_index_text = other.m_index_text;
|
m_index_text = other.m_index_text;
|
||||||
m_unique_term = other.m_unique_term;
|
m_unique_term = other.m_unique_term;
|
||||||
|
m_shouldDelete = other.m_shouldDelete;
|
||||||
}
|
}
|
||||||
void setData(QString &data);
|
void setData(QString &data);
|
||||||
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
|
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
|
||||||
|
@ -52,11 +54,14 @@ public:
|
||||||
void setIndexText(QStringList indexText);
|
void setIndexText(QStringList indexText);
|
||||||
QStringList getIndexText();
|
QStringList getIndexText();
|
||||||
Xapian::Document getXapianDocument();
|
Xapian::Document getXapianDocument();
|
||||||
|
void reuireDeleted();
|
||||||
|
bool isRequiredDeleted();
|
||||||
private:
|
private:
|
||||||
Xapian::Document m_document;
|
Xapian::Document m_document;
|
||||||
QStringList m_index_text;
|
QStringList m_index_text;
|
||||||
//QString m_unique_term;
|
//QString m_unique_term;
|
||||||
std::string m_unique_term;
|
std::string m_unique_term;
|
||||||
|
bool m_shouldDelete = false;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,14 +38,14 @@ using namespace Zeeker;
|
||||||
|
|
||||||
static IndexGenerator *global_instance = nullptr;
|
static IndexGenerator *global_instance = nullptr;
|
||||||
QMutex IndexGenerator::m_mutex;
|
QMutex IndexGenerator::m_mutex;
|
||||||
//QVector<Document> *Zeeker::_doc_list_path;
|
//QVector<Document> *Zeeker::g_docListForPath;
|
||||||
//QMutex Zeeker::_mutex_doc_list_path;
|
//QMutex Zeeker::g_mutexDocListForPath;
|
||||||
//QVector<Document> *Zeeker::_doc_list_content;
|
//QVector<Document> *Zeeker::g_docListForContent;
|
||||||
//QMutex Zeeker::_mutex_doc_list_content;
|
//QMutex Zeeker::g_mutexDocListForContent;
|
||||||
QMutex IndexGenerator::_mutex_doc_list_path;
|
QMutex IndexGenerator::g_mutexDocListForPath;
|
||||||
QMutex IndexGenerator::_mutex_doc_list_content;
|
QMutex IndexGenerator::g_mutexDocListForContent;
|
||||||
QVector<Document> IndexGenerator::_doc_list_path = QVector<Document>();
|
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
|
||||||
QVector<Document> IndexGenerator::_doc_list_content = QVector<Document>();
|
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
|
||||||
|
|
||||||
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
|
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
|
||||||
QMutexLocker locker(&m_mutex);
|
QMutexLocker locker(&m_mutex);
|
||||||
|
@ -65,15 +65,15 @@ bool IndexGenerator::setIndexdataPath() {
|
||||||
//文件名索引
|
//文件名索引
|
||||||
bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
|
bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
|
||||||
HandlePathList(messageList);
|
HandlePathList(messageList);
|
||||||
// if(_doc_list_path == NULL) {
|
// if(g_docListForPath == NULL) {
|
||||||
// return false;
|
// return false;
|
||||||
// }
|
// }
|
||||||
if(IndexGenerator::_doc_list_path.isEmpty()) {
|
if(IndexGenerator::g_docListForPath.isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
qDebug() << "begin creatAllIndex";
|
qDebug() << "begin creatAllIndex";
|
||||||
try {
|
try {
|
||||||
for(auto i : IndexGenerator::_doc_list_path) {
|
for(auto i : IndexGenerator::g_docListForPath) {
|
||||||
|
|
||||||
insertIntoDatabase(i);
|
insertIntoDatabase(i);
|
||||||
}
|
}
|
||||||
|
@ -85,33 +85,32 @@ bool IndexGenerator::creatAllIndex(QQueue<QVector<QString> > *messageList) {
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
qDebug() << "finish creatAllIndex";
|
qDebug() << "finish creatAllIndex";
|
||||||
IndexGenerator::_doc_list_path.clear();
|
IndexGenerator::g_docListForPath.clear();
|
||||||
IndexGenerator::_doc_list_path.squeeze();
|
IndexGenerator::g_docListForPath.squeeze();
|
||||||
QVector<Document>().swap(IndexGenerator::_doc_list_path);
|
QVector<Document>().swap(IndexGenerator::g_docListForPath);
|
||||||
|
|
||||||
// delete _doc_list_path;
|
// delete g_docListForPath;
|
||||||
// _doc_list_path = nullptr;
|
// g_docListForPath = nullptr;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
//文件内容索引
|
//文件内容索引
|
||||||
bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
||||||
// FileUtils::_index_status |= 0x2;
|
|
||||||
HandlePathList(messageList);
|
HandlePathList(messageList);
|
||||||
qDebug() << "begin creatAllIndex for content";
|
qDebug() << "begin creatAllIndex for content";
|
||||||
// if(_doc_list_content == NULL) {
|
if(IndexGenerator::g_docListForContent.isEmpty()) {
|
||||||
// return false;
|
|
||||||
// }
|
|
||||||
if(IndexGenerator::_doc_list_content.isEmpty()) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int size = IndexGenerator::_doc_list_content.size();
|
int size = IndexGenerator::g_docListForContent.size();
|
||||||
qDebug() << "begin creatAllIndex for content" << size;
|
qDebug() << "begin creatAllIndex for content" << size;
|
||||||
if(!size == 0) {
|
if(!size == 0) {
|
||||||
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "0");
|
|
||||||
try {
|
try {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for(auto i : IndexGenerator::_doc_list_content) {
|
for(Document i : IndexGenerator::g_docListForContent) {
|
||||||
insertIntoContentDatabase(i);
|
if(!i.isRequiredDeleted()) {
|
||||||
|
m_database_content->replace_document(i.getUniqueTerm(), i.getXapianDocument());
|
||||||
|
} else {
|
||||||
|
m_database_content->delete_document(i.getUniqueTerm());
|
||||||
|
}
|
||||||
if(++count > 999) {
|
if(++count > 999) {
|
||||||
count = 0;
|
count = 0;
|
||||||
m_database_content->commit();
|
m_database_content->commit();
|
||||||
|
@ -121,16 +120,13 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
||||||
} catch(const Xapian::Error &e) {
|
} catch(const Xapian::Error &e) {
|
||||||
qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description());
|
qWarning() << "creat content Index fail!" << QString::fromStdString(e.get_description());
|
||||||
IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
|
IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
|
||||||
// FileUtils::_index_status &= ~0x2;
|
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
|
|
||||||
// FileUtils::_index_status &= ~0x2;
|
|
||||||
qDebug() << "finish creatAllIndex for content";
|
qDebug() << "finish creatAllIndex for content";
|
||||||
|
|
||||||
IndexGenerator::_doc_list_content.clear();
|
IndexGenerator::g_docListForContent.clear();
|
||||||
IndexGenerator::_doc_list_content.squeeze();
|
IndexGenerator::g_docListForContent.squeeze();
|
||||||
QVector<Document>().swap(IndexGenerator::_doc_list_content);
|
QVector<Document>().swap(IndexGenerator::g_docListForContent);
|
||||||
malloc_trim(0);
|
malloc_trim(0);
|
||||||
}
|
}
|
||||||
Q_EMIT this->transactionFinished();
|
Q_EMIT this->transactionFinished();
|
||||||
|
@ -175,12 +171,12 @@ IndexGenerator::~IndexGenerator() {
|
||||||
// if(m_index_map)
|
// if(m_index_map)
|
||||||
// delete m_index_map;
|
// delete m_index_map;
|
||||||
// m_index_map = nullptr;
|
// m_index_map = nullptr;
|
||||||
// if(m_doc_list_path)
|
// if(mg_docListForPath)
|
||||||
// delete m_doc_list_path;
|
// delete mg_docListForPath;
|
||||||
// m_doc_list_path = nullptr;
|
// mg_docListForPath = nullptr;
|
||||||
// if(m_doc_list_content)
|
// if(mg_docListForContent)
|
||||||
// delete m_doc_list_content;
|
// delete mg_docListForContent;
|
||||||
// m_doc_list_content = nullptr;
|
// mg_docListForContent = nullptr;
|
||||||
// if(m_index_data_path)
|
// if(m_index_data_path)
|
||||||
// delete m_index_data_path;
|
// delete m_index_data_path;
|
||||||
// m_index_data_path = nullptr;
|
// m_index_data_path = nullptr;
|
||||||
|
@ -230,7 +226,7 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
|
||||||
|
|
||||||
// QList<Document> docList = future.results();
|
// QList<Document> docList = future.results();
|
||||||
// future.cancel();
|
// future.cancel();
|
||||||
// m_doc_list_path = new QList<Document>(docList);
|
// mg_docListForPath = new QList<Document>(docList);
|
||||||
QThreadPool pool;
|
QThreadPool pool;
|
||||||
pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1);
|
pool.setMaxThreadCount(((QThread::idealThreadCount() - 1) / 2) + 1);
|
||||||
pool.setExpiryTimeout(100);
|
pool.setExpiryTimeout(100);
|
||||||
|
@ -244,12 +240,12 @@ void IndexGenerator::HandlePathList(QQueue<QVector<QString>> *messageList) {
|
||||||
// delete constructer;
|
// delete constructer;
|
||||||
// constructer = nullptr;
|
// constructer = nullptr;
|
||||||
|
|
||||||
// qDebug()<<_doc_list_path->size();
|
// qDebug()<<g_docListForPath->size();
|
||||||
// qWarning() << _doc_list_path;
|
// qWarning() << g_docListForPath;
|
||||||
// QList<Document> docList = future.results();
|
// QList<Document> docList = future.results();
|
||||||
// m_doc_list_path = new QList<Document>(docList);
|
// mg_docListForPath = new QList<Document>(docList);
|
||||||
// m_doc_list_path = std::move(future.results());
|
// mg_docListForPath = std::move(future.results());
|
||||||
// qDebug()<<m_doc_list_path.size();
|
// qDebug()<<mg_docListForPath.size();
|
||||||
|
|
||||||
qDebug() << "Finish HandlePathList!";
|
qDebug() << "Finish HandlePathList!";
|
||||||
return;
|
return;
|
||||||
|
@ -280,13 +276,13 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
|
||||||
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
|
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
|
||||||
|
|
||||||
// QList<Document> docList = future.results();
|
// QList<Document> docList = future.results();
|
||||||
// m_doc_list_content = new QList<Document>(docList);
|
// mg_docListForContent = new QList<Document>(docList);
|
||||||
|
|
||||||
// qDebug()<<_doc_list_content->size();
|
// qDebug()<<g_docListForContent->size();
|
||||||
|
|
||||||
// QList<Document> docList = future.results();
|
// QList<Document> docList = future.results();
|
||||||
// m_doc_list_content = new QList<Document>(docList);
|
// mg_docListForContent = new QList<Document>(docList);
|
||||||
// m_doc_list_content = std::move(future.results());
|
// mg_docListForContent = std::move(future.results());
|
||||||
// future.cancel();
|
// future.cancel();
|
||||||
|
|
||||||
qDebug() << "Finish HandlePathList for content index!";
|
qDebug() << "Finish HandlePathList for content index!";
|
||||||
|
|
|
@ -78,10 +78,10 @@ private:
|
||||||
void insertIntoDatabase(Document& doc);
|
void insertIntoDatabase(Document& doc);
|
||||||
void insertIntoContentDatabase(Document& doc);
|
void insertIntoContentDatabase(Document& doc);
|
||||||
|
|
||||||
static QVector<Document> _doc_list_path;
|
static QVector<Document> g_docListForPath;
|
||||||
static QMutex _mutex_doc_list_path;
|
static QMutex g_mutexDocListForPath;
|
||||||
static QVector<Document> _doc_list_content;
|
static QVector<Document> g_docListForContent;
|
||||||
static QMutex _mutex_doc_list_content;
|
static QMutex g_mutexDocListForContent;
|
||||||
QMap<QString, QStringList> m_index_map;
|
QMap<QString, QStringList> m_index_map;
|
||||||
QString m_index_data_path;
|
QString m_index_data_path;
|
||||||
Xapian::WritableDatabase* m_database_path;
|
Xapian::WritableDatabase* m_database_path;
|
||||||
|
|
Loading…
Reference in New Issue