新增OCR功能后端;修复内存泄漏一处;
This commit is contained in:
parent
607e021bfc
commit
2e668d374a
|
@ -20,7 +20,9 @@ Build-Depends: debhelper (>=9.0.0),
|
||||||
libpoppler-qt5-dev,
|
libpoppler-qt5-dev,
|
||||||
libukui-log4qt-dev,
|
libukui-log4qt-dev,
|
||||||
libqt5xdg-dev,
|
libqt5xdg-dev,
|
||||||
libukcc-dev
|
libukcc-dev,
|
||||||
|
libopencv-dev,
|
||||||
|
libtesseract-dev
|
||||||
Standards-Version: 4.5.0
|
Standards-Version: 4.5.0
|
||||||
Homepage: https://www.ukui.org/
|
Homepage: https://www.ukui.org/
|
||||||
Vcs-Git: https://github.com/ukui/ukui-search.git
|
Vcs-Git: https://github.com/ukui/ukui-search.git
|
||||||
|
|
|
@ -27,5 +27,11 @@ static const QMap<QString, bool> targetFileTypeMap = {
|
||||||
std::map<QString, bool>::value_type("et", true),
|
std::map<QString, bool>::value_type("et", true),
|
||||||
std::map<QString, bool>::value_type("pdf", true)
|
std::map<QString, bool>::value_type("pdf", true)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Image file suffixes that are routed to OCR text extraction.
// TODO: this list is incomplete; move it into a configuration file later.
static const QMap<QString, bool> targetPhotographTypeMap = {
    {"png", true},
    {"jpg", true},
    {"jpeg", true}
};
|
||||||
//TODO Put things that needed to be put here here.
|
//TODO Put things that needed to be put here here.
|
||||||
#endif // COMMON_H
|
#endif // COMMON_H
|
||||||
|
|
|
@ -130,3 +130,40 @@ void ConstructDocumentForContent::run() {
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a thread-pool task that builds the OCR index document for one file.
 *
 * @param path Path of the file to process; taken by value and moved into the task.
 */
ConstructDocumentForOcr::ConstructDocumentForOcr(QString path)
    : m_path(std::move(path))
{
    // Have the owning QThreadPool delete this runnable after run() returns.
    setAutoDelete(true);
}
|
||||||
|
|
||||||
|
void ConstructDocumentForOcr::run()
|
||||||
|
{
|
||||||
|
QString content;
|
||||||
|
FileReader::getTextContent(m_path, content);
|
||||||
|
|
||||||
|
Document doc;
|
||||||
|
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
|
||||||
|
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||||
|
doc.addValue(1, m_path);
|
||||||
|
|
||||||
|
if(content.isEmpty()) {
|
||||||
|
doc.reuireDeleted();
|
||||||
|
} else {
|
||||||
|
doc.setData(content);
|
||||||
|
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||||
|
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||||
|
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString());
|
||||||
|
for(size_t i = 0; i < term.size(); ++i) {
|
||||||
|
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||||
|
}
|
||||||
|
term.clear();
|
||||||
|
term.shrink_to_fit();
|
||||||
|
}
|
||||||
|
IndexGenerator::g_mutexDocListForOcr.lock();
|
||||||
|
IndexGenerator::g_docListForOcr.append(doc);
|
||||||
|
IndexGenerator::g_mutexDocListForOcr.unlock();
|
||||||
|
content.clear();
|
||||||
|
content.squeeze();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,16 @@ protected:
|
||||||
private:
|
private:
|
||||||
QString m_path;
|
QString m_path;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class ConstructDocumentForOcr : public QRunnable {
|
||||||
|
public:
|
||||||
|
explicit ConstructDocumentForOcr(QString path);
|
||||||
|
~ConstructDocumentForOcr() = default;
|
||||||
|
protected:
|
||||||
|
void run();
|
||||||
|
private:
|
||||||
|
QString m_path;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CONSTRUCTDOCUMENT_H
|
#endif // CONSTRUCTDOCUMENT_H
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include "file-reader.h"
|
#include "file-reader.h"
|
||||||
#include "file-utils.h"
|
#include "file-utils.h"
|
||||||
#include "binary-parser.h"
|
#include "binary-parser.h"
|
||||||
|
#include "ocrobject.h"
|
||||||
using namespace UkuiSearch;
|
using namespace UkuiSearch;
|
||||||
FileReader::FileReader(QObject *parent) : QObject(parent) {
|
FileReader::FileReader(QObject *parent) : QObject(parent) {
|
||||||
|
|
||||||
|
@ -41,6 +42,8 @@ void FileReader::getTextContent(QString path, QString &textContent) {
|
||||||
searchdata.RunParser(path, textContent);
|
searchdata.RunParser(path, textContent);
|
||||||
} else if (strsfx == "pdf") {
|
} else if (strsfx == "pdf") {
|
||||||
FileUtils::getPdfTextContent(path, textContent);
|
FileUtils::getPdfTextContent(path, textContent);
|
||||||
|
} else if (strsfx == "png" || strsfx == "jpg" || strsfx == "jpeg"){
|
||||||
|
OcrObject::getInstance()->getTxtContent(path, textContent);;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,9 @@ FirstIndex::~FirstIndex() {
|
||||||
if(this->q_content_index)
|
if(this->q_content_index)
|
||||||
delete this->q_content_index;
|
delete this->q_content_index;
|
||||||
this->q_content_index = nullptr;
|
this->q_content_index = nullptr;
|
||||||
|
if(this->m_ocr_index)
|
||||||
|
delete this->m_ocr_index;
|
||||||
|
this->m_ocr_index = nullptr;
|
||||||
if(this->p_indexGenerator)
|
if(this->p_indexGenerator)
|
||||||
delete this->p_indexGenerator;
|
delete this->p_indexGenerator;
|
||||||
this->p_indexGenerator = nullptr;
|
this->p_indexGenerator = nullptr;
|
||||||
|
@ -48,10 +51,10 @@ FirstIndex::~FirstIndex() {
|
||||||
void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
|
void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
|
||||||
// qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
|
// qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
|
||||||
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
|
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
|
||||||
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1)
|
if (fileInfo.fileName().split(".", QString::SkipEmptyParts).length() < 2)
|
||||||
&& (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])
|
return;
|
||||||
&& (!FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath()))) {
|
if (true == targetFileTypeMap[fileInfo.fileName().split(".").last()]
|
||||||
//this->q_content_index->enqueue(fileInfo.absoluteFilePath());
|
and false == FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath())) {
|
||||||
if (fileInfo.fileName().split(".").last() == "docx") {
|
if (fileInfo.fileName().split(".").last() == "docx") {
|
||||||
QuaZip file(fileInfo.absoluteFilePath());
|
QuaZip file(fileInfo.absoluteFilePath());
|
||||||
if(!file.open(QuaZip::mdUnzip))
|
if(!file.open(QuaZip::mdUnzip))
|
||||||
|
@ -93,6 +96,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
|
||||||
} else {
|
} else {
|
||||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
||||||
}
|
}
|
||||||
|
} else if (true == targetPhotographTypeMap[fileInfo.fileName().split(".").last()]) {
|
||||||
|
this->m_ocr_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -120,6 +125,7 @@ void FirstIndex::run() {
|
||||||
|
|
||||||
this->q_index = new QQueue<QVector<QString>>();
|
this->q_index = new QQueue<QVector<QString>>();
|
||||||
this->q_content_index = new QQueue<QPair<QString,qint64>>();
|
this->q_content_index = new QQueue<QPair<QString,qint64>>();
|
||||||
|
this->m_ocr_index = new QQueue<QPair<QString,qint64>>();
|
||||||
|
|
||||||
int fifo_fd;
|
int fifo_fd;
|
||||||
char buffer[2];
|
char buffer[2];
|
||||||
|
@ -214,7 +220,34 @@ void FirstIndex::run() {
|
||||||
qDebug() << "content index end;";
|
qDebug() << "content index end;";
|
||||||
sem.release(2);
|
sem.release(2);
|
||||||
});
|
});
|
||||||
|
//OCR功能暂时屏蔽
|
||||||
|
// QtConcurrent::run(&m_pool,[&]() {
|
||||||
|
// sem.acquire(5);
|
||||||
|
// QQueue<QString>* tmpOcr = new QQueue<QString>();
|
||||||
|
// qDebug() << "m_ocr_index:" << m_ocr_index->size();
|
||||||
|
// while(!this->m_ocr_index->empty()) {
|
||||||
|
// qint64 fileSize = 0;
|
||||||
|
// //一次处理的数据量文件总大小为50M以下,50M为暂定值
|
||||||
|
// for(size_t i = 0;/* (i < 30) && (fileSize < 52428800) && */(!this->m_ocr_index->empty()); ++i) {
|
||||||
|
// QPair<QString,qint64> tempPair = this->m_ocr_index->dequeue();
|
||||||
|
// fileSize += tempPair.second;
|
||||||
|
// if (fileSize > 52428800) {
|
||||||
|
// if (tmpOcr->size() == 0) {
|
||||||
|
// tmpOcr->enqueue(tempPair.first);
|
||||||
|
// break;
|
||||||
|
// }
|
||||||
|
// this->m_ocr_index->enqueue(tempPair);
|
||||||
|
// break;
|
||||||
|
// }
|
||||||
|
// tmpOcr->enqueue(tempPair.first);
|
||||||
|
// }
|
||||||
|
// this->p_indexGenerator->creatOcrIndex(tmpOcr);
|
||||||
|
// tmpOcr->clear();
|
||||||
|
// }
|
||||||
|
// delete tmpOcr;
|
||||||
|
// qDebug() << "OCR index end;";
|
||||||
|
// sem.release(5);
|
||||||
|
// });
|
||||||
mutex1.lock();
|
mutex1.lock();
|
||||||
mutex2.lock();
|
mutex2.lock();
|
||||||
mutex3.lock();
|
mutex3.lock();
|
||||||
|
@ -223,14 +256,15 @@ void FirstIndex::run() {
|
||||||
mutex2.unlock();
|
mutex2.unlock();
|
||||||
mutex3.unlock();
|
mutex3.unlock();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if(this->q_index)
|
if(this->q_index)
|
||||||
delete this->q_index;
|
delete this->q_index;
|
||||||
this->q_index = nullptr;
|
this->q_index = nullptr;
|
||||||
if(this->q_content_index)
|
if(this->q_content_index)
|
||||||
delete this->q_content_index;
|
delete this->q_content_index;
|
||||||
this->q_content_index = nullptr;
|
this->q_content_index = nullptr;
|
||||||
|
if(this->m_ocr_index)
|
||||||
|
delete this->m_ocr_index;
|
||||||
|
this->m_ocr_index = nullptr;
|
||||||
if(p_indexGenerator)
|
if(p_indexGenerator)
|
||||||
delete p_indexGenerator;
|
delete p_indexGenerator;
|
||||||
p_indexGenerator = nullptr;
|
p_indexGenerator = nullptr;
|
||||||
|
|
|
@ -60,7 +60,8 @@ private:
|
||||||
// QQueue<QString>* q_content_index;
|
// QQueue<QString>* q_content_index;
|
||||||
//修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519
|
//修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519
|
||||||
QQueue<QPair<QString,qint64>>* q_content_index;
|
QQueue<QPair<QString,qint64>>* q_content_index;
|
||||||
|
//新增ocr队列存储ocr可识别处理的图片信息及大小;
|
||||||
|
QQueue<QPair<QString,qint64>>* m_ocr_index;
|
||||||
//xapian will auto commit per 10,000 changes, donnot change it!!!
|
//xapian will auto commit per 10,000 changes, donnot change it!!!
|
||||||
const size_t u_send_length = 8192;
|
const size_t u_send_length = 8192;
|
||||||
};
|
};
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
|
|
||||||
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
|
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
|
||||||
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
|
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
|
||||||
|
#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString()
|
||||||
|
|
||||||
using namespace UkuiSearch;
|
using namespace UkuiSearch;
|
||||||
|
|
||||||
|
@ -44,8 +45,11 @@ QMutex IndexGenerator::m_mutex;
|
||||||
//QMutex UkuiSearch::g_mutexDocListForContent;
|
//QMutex UkuiSearch::g_mutexDocListForContent;
|
||||||
QMutex IndexGenerator::g_mutexDocListForPath;
|
QMutex IndexGenerator::g_mutexDocListForPath;
|
||||||
QMutex IndexGenerator::g_mutexDocListForContent;
|
QMutex IndexGenerator::g_mutexDocListForContent;
|
||||||
|
QMutex IndexGenerator::g_mutexDocListForOcr;
|
||||||
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
|
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
|
||||||
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
|
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
|
||||||
|
QVector<Document> IndexGenerator::g_docListForOcr = QVector<Document>();
|
||||||
|
|
||||||
|
|
||||||
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
|
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
|
||||||
QMutexLocker locker(&m_mutex);
|
QMutexLocker locker(&m_mutex);
|
||||||
|
@ -134,6 +138,44 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds documents for the queued paths and writes them to the OCR database.
 *
 * @param messageList Paths to process; drained by HandleOcrPathList().
 * @return false when no document was produced, true otherwise. A Xapian
 *         failure is logged and asserted on, but still yields true in
 *         builds where assert() is compiled out.
 */
bool IndexGenerator::creatOcrIndex(QQueue<QString> *messageList)
{
    HandleOcrPathList(messageList);
    if (IndexGenerator::g_docListForOcr.isEmpty()) {
        return false;
    }

    // The list is known to be non-empty from here on, so no further size check
    // is needed (the former `!size == 0` parsed as `(!size) == 0`).
    const int size = IndexGenerator::g_docListForOcr.size();
    qDebug() << "begin creatAllIndex for ocr" << size;

    try {
        int pending = 0;
        // Iterate by reference: copying every Document is unnecessary.
        for (Document &doc : IndexGenerator::g_docListForOcr) {
            if (doc.isRequiredDeleted()) {
                m_database_ocr->delete_document(doc.getUniqueTerm());
            } else {
                m_database_ocr->replace_document(doc.getUniqueTerm(), doc.getXapianDocument());
            }
            // Commit in batches of 1000 changes.
            if (++pending > 999) {
                pending = 0;
                m_database_ocr->commit();
            }
        }
        m_database_ocr->commit();
    } catch (const Xapian::Error &e) {
        qWarning() << "creat ocr Index fail!" << QString::fromStdString(e.get_description());
        // NOTE(review): the failure is recorded under the *content* index state
        // key; confirm whether a dedicated OCR state key is intended.
        IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
        assert(false);
    }
    qDebug() << "finish creatAllIndex for ocr";

    // Swapping with an empty vector both clears the list and frees its storage;
    // malloc_trim() then asks the allocator to return free memory to the system.
    QVector<Document>().swap(IndexGenerator::g_docListForOcr);
    malloc_trim(0);
    return true;
}
|
||||||
|
|
||||||
IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) {
|
IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) {
|
||||||
QDir database(QString::fromStdString(INDEX_PATH));
|
QDir database(QString::fromStdString(INDEX_PATH));
|
||||||
|
|
||||||
|
@ -153,6 +195,7 @@ IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent)
|
||||||
|
|
||||||
m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||||
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||||
|
m_database_ocr = new Xapian::WritableDatabase(OCR_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||||
}
|
}
|
||||||
|
|
||||||
IndexGenerator::~IndexGenerator() {
|
IndexGenerator::~IndexGenerator() {
|
||||||
|
@ -165,8 +208,11 @@ IndexGenerator::~IndexGenerator() {
|
||||||
if(m_database_content)
|
if(m_database_content)
|
||||||
m_database_content->~WritableDatabase();
|
m_database_content->~WritableDatabase();
|
||||||
// delete m_database_content;
|
// delete m_database_content;
|
||||||
|
if(m_database_ocr)
|
||||||
|
m_database_ocr->~WritableDatabase();
|
||||||
m_database_path = nullptr;
|
m_database_path = nullptr;
|
||||||
m_database_content = nullptr;
|
m_database_content = nullptr;
|
||||||
|
m_database_ocr = nullptr;
|
||||||
global_instance = nullptr;
|
global_instance = nullptr;
|
||||||
// if(m_index_map)
|
// if(m_index_map)
|
||||||
// delete m_index_map;
|
// delete m_index_map;
|
||||||
|
@ -266,28 +312,25 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
|
||||||
pool.start(constructer);
|
pool.start(constructer);
|
||||||
}
|
}
|
||||||
qDebug() << "pool finish" << pool.waitForDone(-1);
|
qDebug() << "pool finish" << pool.waitForDone(-1);
|
||||||
// if(constructer)
|
|
||||||
// delete constructer;
|
|
||||||
// constructer = nullptr;
|
|
||||||
|
|
||||||
// QFuture<Document> future = QtConcurrent::mapped(*messageList,&IndexGenerator::GenerateContentDocument);
|
|
||||||
|
|
||||||
// future.waitForFinished();
|
|
||||||
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
|
|
||||||
|
|
||||||
// QList<Document> docList = future.results();
|
|
||||||
// mg_docListForContent = new QList<Document>(docList);
|
|
||||||
|
|
||||||
// qDebug()<<g_docListForContent->size();
|
|
||||||
|
|
||||||
// QList<Document> docList = future.results();
|
|
||||||
// mg_docListForContent = new QList<Document>(docList);
|
|
||||||
// mg_docListForContent = std::move(future.results());
|
|
||||||
// future.cancel();
|
|
||||||
|
|
||||||
qDebug() << "Finish HandlePathList for content index!";
|
qDebug() << "Finish HandlePathList for content index!";
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Runs one ConstructDocumentForOcr task per queued path and waits for all of
 * them to finish. The tasks append their results to g_docListForOcr.
 *
 * @param messageList Paths to process; emptied by this call.
 */
void IndexGenerator::HandleOcrPathList(QQueue<QString> *messageList)
{
    qDebug() << "Begin HandlePathList for ocr index!";
    qDebug() << messageList->size();

    QThreadPool pool;
    // A single worker, so the tasks run one at a time (presumably because
    // OcrObject shares one Tesseract handle - confirm before raising this).
    pool.setMaxThreadCount(1);
    pool.setExpiryTimeout(100);

    while (!messageList->isEmpty()) {
        // The task is auto-deleted by the pool once it has run.
        pool.start(new ConstructDocumentForOcr(messageList->dequeue()));
    }

    qDebug() << "pool finish" << pool.waitForDone(-1);
    // This message previously said "content index" (copied from the content handler).
    qDebug() << "Finish HandlePathList for ocr index!";
}
|
||||||
//deprecated
|
//deprecated
|
||||||
Document IndexGenerator::GenerateDocument(const QVector<QString> &list) {
|
Document IndexGenerator::GenerateDocument(const QVector<QString> &list) {
|
||||||
|
@ -460,10 +503,13 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
|
||||||
|
|
||||||
m_database_path->delete_document(uniqueterm);
|
m_database_path->delete_document(uniqueterm);
|
||||||
m_database_content->delete_document(uniqueterm);
|
m_database_content->delete_document(uniqueterm);
|
||||||
|
m_database_ocr->delete_document(uniqueterm);
|
||||||
|
|
||||||
//delete all files under it if it's a dir.
|
//delete all files under it if it's a dir.
|
||||||
m_database_path->delete_document(upterm);
|
m_database_path->delete_document(upterm);
|
||||||
m_database_content->delete_document(upterm);
|
m_database_content->delete_document(upterm);
|
||||||
|
m_database_ocr->delete_document(upterm);
|
||||||
|
|
||||||
qDebug() << "delete path" << doc;
|
qDebug() << "delete path" << doc;
|
||||||
// qDebug() << "delete md5" << QString::fromStdString(uniqueterm);
|
// qDebug() << "delete md5" << QString::fromStdString(uniqueterm);
|
||||||
|
|
||||||
|
@ -472,6 +518,7 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
|
||||||
}
|
}
|
||||||
m_database_path->commit();
|
m_database_path->commit();
|
||||||
m_database_content->commit();
|
m_database_content->commit();
|
||||||
|
m_database_ocr->commit();
|
||||||
qDebug() << "--delete finish--";
|
qDebug() << "--delete finish--";
|
||||||
} catch(const Xapian::Error &e) {
|
} catch(const Xapian::Error &e) {
|
||||||
qWarning() << QString::fromStdString(e.get_description());
|
qWarning() << QString::fromStdString(e.get_description());
|
||||||
|
@ -503,43 +550,85 @@ bool IndexGenerator::deleteContentIndex(QStringList *pathlist)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes the OCR index entries of the given paths and commits the change.
 *
 * @param pathlist Paths whose documents should be deleted.
 * @return true on success or when the list is empty; false if Xapian reports
 *         an error.
 */
bool IndexGenerator::deleteOcrIndex(QStringList *pathlist)
{
    const QStringList &paths = *pathlist;
    if (paths.isEmpty()) {
        return true;
    }

    try {
        qDebug() << "--delete start--";
        for (const QString &path : paths) {
            // Documents are addressed by the unique term derived from their path.
            const std::string uniqueTerm = FileUtils::makeDocUterm(path);
            m_database_ocr->delete_document(uniqueTerm);
            qDebug() << "delete path" << path;
        }
        m_database_ocr->commit();
        qDebug() << "--delete finish--";
    } catch (const Xapian::Error &e) {
        qWarning() << QString::fromStdString(e.get_description());
        return false;
    }
    return true;
}
|
||||||
|
|
||||||
/**
 * Applies a batch of pending file changes to the indexes.
 *
 * Each pending file is either scheduled for removal from all indexes or
 * (re)indexed by path; files with a supported suffix are additionally
 * content-indexed, or have their content entry removed when they are
 * encrypted or unreadable.
 *
 * @param pendingFiles Changes to apply.
 * @return Always true.
 */
bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
{
    // Work lists live on the stack: they are only used inside this call, so
    // automatic storage removes the need for matching delete statements.
    QQueue<QVector<QString>> fileIndexInfo;
    QQueue<QString> fileContentIndexInfo;
    QQueue<QString> fileOcrIndexInfo;
    QStringList deleteList;
    QStringList contentDeleteList;

    for (PendingFile file : *pendingFiles) {
        if (file.shouldRemoveIndex()) {
            deleteList.append(file.path());
            continue;
        }

        // Path index entry: file name, full path, and "1"/"0" for directory or not.
        fileIndexInfo.append(QVector<QString>() << file.path().section("/" , -1) << file.path() << QString(file.isDir() ? "1" : "0"));

        if ((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) {
            if (!FileUtils::isEncrypedOrUnreadable(file.path())) {
                fileContentIndexInfo.append(file.path());
            } else {
                contentDeleteList.append(file.path());
            }
        }
    }

    if (!deleteList.isEmpty()) {
        deleteAllIndex(&deleteList);
    }
    if (!contentDeleteList.isEmpty()) {
        deleteContentIndex(&contentDeleteList);
    }
    if (!fileIndexInfo.isEmpty()) {
        creatAllIndex(&fileIndexInfo);
    }
    if (!fileContentIndexInfo.isEmpty()) {
        creatAllIndex(&fileContentIndexInfo);
    }
    // NOTE(review): nothing in this function appends to fileOcrIndexInfo, so
    // this branch is currently never taken - confirm whether image files are
    // meant to be queued here.
    if (!fileOcrIndexInfo.isEmpty()) {
        creatOcrIndex(&fileOcrIndexInfo);
    }

    return true;
}
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,7 @@ namespace UkuiSearch {
|
||||||
class IndexGenerator : public QObject {
|
class IndexGenerator : public QObject {
|
||||||
friend class ConstructDocumentForPath;
|
friend class ConstructDocumentForPath;
|
||||||
friend class ConstructDocumentForContent;
|
friend class ConstructDocumentForContent;
|
||||||
|
friend class ConstructDocumentForOcr;
|
||||||
Q_OBJECT
|
Q_OBJECT
|
||||||
public:
|
public:
|
||||||
static IndexGenerator *getInstance(bool rebuild = false, QObject *parent = nullptr);
|
static IndexGenerator *getInstance(bool rebuild = false, QObject *parent = nullptr);
|
||||||
|
@ -61,8 +62,10 @@ Q_SIGNALS:
|
||||||
public Q_SLOTS:
|
public Q_SLOTS:
|
||||||
bool creatAllIndex(QQueue<QVector<QString>> *messageList);
|
bool creatAllIndex(QQueue<QVector<QString>> *messageList);
|
||||||
bool creatAllIndex(QQueue<QString> *messageList);
|
bool creatAllIndex(QQueue<QString> *messageList);
|
||||||
|
bool creatOcrIndex(QQueue<QString> *messageList);
|
||||||
bool deleteAllIndex(QStringList *pathlist);
|
bool deleteAllIndex(QStringList *pathlist);
|
||||||
bool deleteContentIndex(QStringList *pathlist);
|
bool deleteContentIndex(QStringList *pathlist);
|
||||||
|
bool deleteOcrIndex(QStringList *pathlist);
|
||||||
bool updateIndex(QVector<PendingFile> *pendingFiles);
|
bool updateIndex(QVector<PendingFile> *pendingFiles);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -72,6 +75,8 @@ private:
|
||||||
void HandlePathList(QQueue<QVector<QString> > *messageList);
|
void HandlePathList(QQueue<QVector<QString> > *messageList);
|
||||||
//For file content index
|
//For file content index
|
||||||
void HandlePathList(QQueue<QString> *messageList);
|
void HandlePathList(QQueue<QString> *messageList);
|
||||||
|
//For ocr index
|
||||||
|
void HandleOcrPathList(QQueue<QString> *messageList);
|
||||||
static Document GenerateDocument(const QVector<QString> &list);
|
static Document GenerateDocument(const QVector<QString> &list);
|
||||||
static Document GenerateContentDocument(const QString &list);
|
static Document GenerateContentDocument(const QString &list);
|
||||||
//add one data in database
|
//add one data in database
|
||||||
|
@ -82,10 +87,13 @@ private:
|
||||||
static QMutex g_mutexDocListForPath;
|
static QMutex g_mutexDocListForPath;
|
||||||
static QVector<Document> g_docListForContent;
|
static QVector<Document> g_docListForContent;
|
||||||
static QMutex g_mutexDocListForContent;
|
static QMutex g_mutexDocListForContent;
|
||||||
|
static QVector<Document> g_docListForOcr;
|
||||||
|
static QMutex g_mutexDocListForOcr;
|
||||||
QMap<QString, QStringList> m_index_map;
|
QMap<QString, QStringList> m_index_map;
|
||||||
QString m_index_data_path;
|
QString m_index_data_path;
|
||||||
Xapian::WritableDatabase* m_database_path;
|
Xapian::WritableDatabase* m_database_path;
|
||||||
Xapian::WritableDatabase* m_database_content;
|
Xapian::WritableDatabase* m_database_content;
|
||||||
|
Xapian::WritableDatabase* m_database_ocr;
|
||||||
std::string m_docstr;
|
std::string m_docstr;
|
||||||
std::string m_index_text_str;
|
std::string m_index_text_str;
|
||||||
Xapian::TermGenerator m_indexer;
|
Xapian::TermGenerator m_indexer;
|
||||||
|
|
|
@ -9,6 +9,7 @@ HEADERS += \
|
||||||
$$PWD/index-generator.h \
|
$$PWD/index-generator.h \
|
||||||
$$PWD/index-status-recorder.h \
|
$$PWD/index-status-recorder.h \
|
||||||
$$PWD/inotify-watch.h \
|
$$PWD/inotify-watch.h \
|
||||||
|
$$PWD/ocrobject.h \
|
||||||
$$PWD/pending-file-queue.h \
|
$$PWD/pending-file-queue.h \
|
||||||
$$PWD/pending-file.h \
|
$$PWD/pending-file.h \
|
||||||
$$PWD/search-manager.h \
|
$$PWD/search-manager.h \
|
||||||
|
@ -25,6 +26,7 @@ SOURCES += \
|
||||||
$$PWD/index-generator.cpp \
|
$$PWD/index-generator.cpp \
|
||||||
$$PWD/index-status-recorder.cpp \
|
$$PWD/index-status-recorder.cpp \
|
||||||
$$PWD/inotify-watch.cpp \
|
$$PWD/inotify-watch.cpp \
|
||||||
|
$$PWD/ocrobject.cpp \
|
||||||
$$PWD/pending-file-queue.cpp \
|
$$PWD/pending-file-queue.cpp \
|
||||||
$$PWD/pending-file.cpp \
|
$$PWD/pending-file.cpp \
|
||||||
$$PWD/search-manager.cpp \
|
$$PWD/search-manager.cpp \
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
#include "ocrobject.h"
|
||||||
|
|
||||||
|
// Storage for the singleton; populated on the first getInstance() call.
OcrObject *OcrObject::m_instance = nullptr;
// Ensures the singleton is constructed only once.
std::once_flag g_instanceFlag;
|
||||||
|
|
||||||
|
/**
 * Returns the process-wide OcrObject, creating it on first use.
 *
 * std::call_once guarantees the object is constructed exactly once, even when
 * several threads make the first call concurrently.
 */
OcrObject *OcrObject::getInstance()
{
    const auto createInstance = [] {
        m_instance = new OcrObject;
    };
    std::call_once(g_instanceFlag, createInstance);
    return m_instance;
}
|
||||||
|
|
||||||
|
void OcrObject::getTxtContent(QString &path, QString &textcontent)
|
||||||
|
{
|
||||||
|
m_api = new tesseract::TessBaseAPI();
|
||||||
|
if (m_api->Init(NULL, "chi_sim")) {
|
||||||
|
qDebug() << "Could not initialize tesseract.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
||||||
|
|
||||||
|
Pix *image = pixRead(path.toStdString().data());
|
||||||
|
if (!image) {
|
||||||
|
qDebug() << "path:" << path <<" pixRead error!";
|
||||||
|
if (m_api) {
|
||||||
|
m_api->End();
|
||||||
|
delete m_api;
|
||||||
|
m_api = nullptr;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
m_api->SetImage(image);
|
||||||
|
textcontent = m_api->GetUTF8Text();
|
||||||
|
qDebug() << "path:" << path << " Text:" << textcontent;
|
||||||
|
pixDestroy(&image);
|
||||||
|
m_api->Clear();
|
||||||
|
|
||||||
|
if (m_api) {
|
||||||
|
m_api->End();
|
||||||
|
delete m_api;
|
||||||
|
m_api = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
//多进程版本
|
||||||
|
// tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
|
||||||
|
// if (api->Init(NULL, "chi_sim")) {
|
||||||
|
// qDebug() << "Could not initialize tesseract.\n";
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
||||||
|
|
||||||
|
// Pix *image = pixRead(path.toStdString().data());
|
||||||
|
// if (!image) {
|
||||||
|
// qDebug() << "path:" << path <<" pixRead error!";
|
||||||
|
// if (api) {
|
||||||
|
// api->End();
|
||||||
|
// delete api;
|
||||||
|
// api = nullptr;
|
||||||
|
// }
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// api->SetImage(image);
|
||||||
|
// textcontent = api->GetUTF8Text();
|
||||||
|
// qDebug() << "path:" << path << " Text:" << textcontent;
|
||||||
|
// pixDestroy(&image);
|
||||||
|
// api->Clear();
|
||||||
|
|
||||||
|
// if (api) {
|
||||||
|
// api->End();
|
||||||
|
// delete api;
|
||||||
|
// api = nullptr;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Constructs the OCR helper and prepares the Tesseract engine through init().
 *
 * @param parent Optional QObject parent.
 */
OcrObject::OcrObject(QObject *parent)
    : QObject(parent)
{
    this->init();
}
|
||||||
|
|
||||||
|
/** Shuts down and frees the Tesseract engine if one is still held. */
OcrObject::~OcrObject()
{
    if (!m_api) {
        return; // Nothing to release.
    }
    m_api->End();
    delete m_api;
    m_api = nullptr;
}
|
||||||
|
|
||||||
|
void OcrObject::init()
|
||||||
|
{
|
||||||
|
m_api = new tesseract::TessBaseAPI();
|
||||||
|
if (m_api->Init(NULL, "chi_sim")) {
|
||||||
|
qDebug() << "Could not initialize tesseract.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
#ifndef OCROBJECT_H
|
||||||
|
#define OCROBJECT_H
|
||||||
|
|
||||||
|
#include <QObject>
|
||||||
|
#include <mutex>
|
||||||
|
#include <tesseract/baseapi.h>
|
||||||
|
#include <leptonica/allheaders.h>
|
||||||
|
#include <QDebug>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
class OcrObject : public QObject
|
||||||
|
{
|
||||||
|
Q_OBJECT
|
||||||
|
public:
|
||||||
|
static OcrObject* getInstance();
|
||||||
|
|
||||||
|
void getTxtContent(QString &path, QString &textcontent);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
explicit OcrObject(QObject *parent = nullptr);
|
||||||
|
~OcrObject();
|
||||||
|
|
||||||
|
private:
|
||||||
|
static OcrObject *m_instance;
|
||||||
|
|
||||||
|
tesseract::TessBaseAPI *m_api = nullptr;
|
||||||
|
void init();
|
||||||
|
|
||||||
|
class Garbo
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
~Garbo() {
|
||||||
|
if (OcrObject::m_instance)
|
||||||
|
delete OcrObject::m_instance;
|
||||||
|
}
|
||||||
|
static Garbo g_garbo;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // OCROBJECT_H
|
|
@ -19,15 +19,17 @@
|
||||||
*/
|
*/
|
||||||
#include "search-manager.h"
|
#include "search-manager.h"
|
||||||
using namespace UkuiSearch;
|
using namespace UkuiSearch;
|
||||||
|
|
||||||
size_t SearchManager::uniqueSymbolFile = 0;
|
size_t SearchManager::uniqueSymbolFile = 0;
|
||||||
size_t SearchManager::uniqueSymbolDir = 0;
|
size_t SearchManager::uniqueSymbolDir = 0;
|
||||||
size_t SearchManager::uniqueSymbolContent = 0;
|
size_t SearchManager::uniqueSymbolContent = 0;
|
||||||
|
size_t SearchManager::uniqueSymbolOcr = 0;
|
||||||
QMutex SearchManager::m_mutexFile;
|
QMutex SearchManager::m_mutexFile;
|
||||||
QMutex SearchManager::m_mutexDir;
|
QMutex SearchManager::m_mutexDir;
|
||||||
QMutex SearchManager::m_mutexContent;
|
QMutex SearchManager::m_mutexContent;
|
||||||
|
QMutex SearchManager::m_mutexOcr;
|
||||||
|
|
||||||
SearchManager::SearchManager(QObject *parent) : QObject(parent) {
|
SearchManager::SearchManager(QObject *parent) : QObject(parent) {
|
||||||
m_pool.setMaxThreadCount(3);
|
|
||||||
m_pool.setExpiryTimeout(1000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SearchManager::~SearchManager() {
|
SearchManager::~SearchManager() {
|
||||||
|
@ -43,39 +45,6 @@ int SearchManager::getCurrentIndexCount() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Legacy keyword-search slot; it currently does nothing.
 *
 * The former body is disabled. It incremented the file/dir/content unique
 * symbols under their mutexes and then, depending on FileUtils::searchMethod,
 * started either a DirectSearch runnable or FileSearch (files and dirs) plus
 * FileContentSearch runnables on m_pool.
 *
 * @param keyword             Ignored.
 * @param searchResultFile    Ignored.
 * @param searchResultDir     Ignored.
 * @param searchResultContent Ignored.
 */
void SearchManager::onKeywordSearch(QString keyword, QQueue<QString> *searchResultFile, QQueue<QString> *searchResultDir,
                                    QQueue<QPair<QString, QStringList>> *searchResultContent) {
    // Intentionally empty.
}
|
|
||||||
|
|
||||||
bool SearchManager::isBlocked(QString &path) {
|
bool SearchManager::isBlocked(QString &path) {
|
||||||
QStringList blockList = GlobalSettings::getInstance()->getBlockDirs();
|
QStringList blockList = GlobalSettings::getInstance()->getBlockDirs();
|
||||||
for(QString i : blockList) {
|
for(QString i : blockList) {
|
||||||
|
@ -101,6 +70,7 @@ bool SearchManager::creatResultInfo(SearchPluginIface::ResultInfo &ri, QString p
|
||||||
ri.type = 0;
|
ri.type = 0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
FileSearch::FileSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, QString value, unsigned slot, int begin, int num) {
|
FileSearch::FileSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, QString value, unsigned slot, int begin, int num) {
|
||||||
this->setAutoDelete(true);
|
this->setAutoDelete(true);
|
||||||
m_search_result = searchResult;
|
m_search_result = searchResult;
|
||||||
|
@ -428,6 +398,121 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an auto-deleting runnable that searches the OCR index.
 *
 * @param searchResult Queue that receives the hits; only the pointer is stored.
 * @param uniqueSymbol Value later compared against SearchManager::uniqueSymbolOcr
 *                     before each hit is enqueued.
 * @param keyword      Text to search for.
 * @param begin        Initial paging offset (run() resets it).
 * @param num          Initial page size (run() resets it).
 */
OcrSearch::OcrSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, int begin, int num)
    : m_search_result(searchResult),
      m_matchDecider(new OcrMatchDecider()),
      m_uniqueSymbol(uniqueSymbol),
      m_keyword(keyword),
      m_begin(begin),
      m_num(num) {
    this->setAutoDelete(true);
}
|
||||||
|
|
||||||
|
/**
 * Frees the match decider created in the constructor. The result queue is
 * only detached (pointer reset), never deleted here.
 */
OcrSearch::~OcrSearch() {
    // delete on a null pointer is a no-op, so no guard is required.
    delete m_matchDecider;
    m_search_result = nullptr;
}
|
||||||
|
|
||||||
|
void OcrSearch::run() {
|
||||||
|
SearchManager::m_mutexOcr.lock();
|
||||||
|
if(!m_search_result->isEmpty()) {
|
||||||
|
m_search_result->clear();
|
||||||
|
}
|
||||||
|
SearchManager::m_mutexOcr.unlock();
|
||||||
|
|
||||||
|
//这里同文件搜索,待优化。
|
||||||
|
m_begin = 0;
|
||||||
|
m_num = 100;
|
||||||
|
int resultCount = 1;
|
||||||
|
int totalCount = 0;
|
||||||
|
while(resultCount > 0) {
|
||||||
|
resultCount = keywordSearchOcr();
|
||||||
|
m_begin += m_num;
|
||||||
|
totalCount += resultCount;
|
||||||
|
}
|
||||||
|
qDebug() << "Total count:" << totalCount;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int OcrSearch::keywordSearchOcr() {
|
||||||
|
try {
|
||||||
|
qDebug() << "--keywordSearch OCR search start--";
|
||||||
|
Xapian::Database db(OCR_INDEX_PATH);
|
||||||
|
Xapian::Enquire enquire(db);
|
||||||
|
Xapian::QueryParser qp;
|
||||||
|
qp.set_default_op(Xapian::Query::OP_AND);
|
||||||
|
qp.set_database(db);
|
||||||
|
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
|
||||||
|
//Creat a query
|
||||||
|
std::string words;
|
||||||
|
for(int i = 0; i < sKeyWord.size(); i++) {
|
||||||
|
words.append(sKeyWord.at(i).word).append(" ");
|
||||||
|
}
|
||||||
|
std::vector<Xapian::Query> v;
|
||||||
|
for(int i=0; i<sKeyWord.size(); i++) {
|
||||||
|
v.push_back(Xapian::Query(sKeyWord.at(i).word));
|
||||||
|
qDebug() << QString::fromStdString(sKeyWord.at(i).word);
|
||||||
|
}
|
||||||
|
Xapian::Query query = Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
|
||||||
|
|
||||||
|
qDebug() << "keywordSearch OCR:" << QString::fromStdString(query.get_description());
|
||||||
|
|
||||||
|
enquire.set_query(query);
|
||||||
|
|
||||||
|
Xapian::MSet result = enquire.get_mset(m_begin, m_num, 0, m_matchDecider);
|
||||||
|
int resultCount = result.size();
|
||||||
|
if(result.size() == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
qDebug() << "keywordSearch OCR results count=" << resultCount;
|
||||||
|
|
||||||
|
if(getResult(result, words) == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
qDebug() << "--keywordSearch OCR search finish--";
|
||||||
|
return resultCount;
|
||||||
|
} catch(const Xapian::Error &e) {
|
||||||
|
qWarning() << QString::fromStdString(e.get_description());
|
||||||
|
qDebug() << "--keywordSearch OCR search finish--";
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Converts every hit in @p result into a ResultInfo with a text snippet and
 * enqueues it on m_search_result.
 *
 * The snippet starts at the first recorded position of the first keyword in
 * the document. If the document does not contain exactly that term, or the
 * term has no position data, the snippet starts at position 0 instead of
 * dereferencing an invalid iterator.
 *
 * @param result  Match set returned by Xapian for the current page.
 * @param keyWord Space-separated keyword segments; only the first is used.
 * @return 0 when all hits were processed, -1 as soon as m_uniqueSymbol no
 *         longer equals SearchManager::uniqueSymbolOcr.
 */
int OcrSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
    // The first keyword is the same for every hit, so compute it once.
    const std::string wordTobeFound = QString::fromStdString(keyWord).section(" ", 0, 0).toStdString();

    for(auto it = result.begin(); it != result.end(); ++it) {
        Xapian::Document doc = it.get_document();
        std::string data = doc.get_data();
        QString path = QString::fromStdString(doc.get_value(1));

        SearchPluginIface::ResultInfo ri;
        if(!SearchManager::creatResultInfo(ri, path)) {
            continue;
        }

        // Construct a snippet containing the keyword.
        // skip_to() stops at the first term >= wordTobeFound, which may be a
        // different term or the end of the list, so verify before using it.
        Xapian::termpos snippetStart = 0;
        auto term = doc.termlist_begin();
        term.skip_to(wordTobeFound);
        if(term != doc.termlist_end() && *term == wordTobeFound) {
            auto pos = term.positionlist_begin();
            if(pos != term.positionlist_end()) {
                snippetStart = *pos;
            }
        }
        //fix me: make a snippet without cut cjk char.
        QString snippet = FileUtils::chineseSubString(data, snippetStart, 120);

        ri.description.prepend(SearchPluginIface::DescriptionInfo{"",snippet});

        // Enqueue only while this search is still the current one.
        SearchManager::m_mutexOcr.lock();
        const bool stillCurrent = (m_uniqueSymbol == SearchManager::uniqueSymbolOcr);
        if(stillCurrent) {
            m_search_result->enqueue(ri);
        }
        SearchManager::m_mutexOcr.unlock();

        if(!stillCurrent) {
            return -1;
        }
    }
    return 0;
}
|
||||||
|
|
||||||
DirectSearch::DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol) {
|
DirectSearch::DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol) {
|
||||||
this->setAutoDelete(true);
|
this->setAutoDelete(true);
|
||||||
m_keyword = keyword;
|
m_keyword = keyword;
|
||||||
|
@ -521,3 +606,12 @@ bool FileContentMatchDecider::operator ()(const Xapian::Document &doc) const
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Match filter for OCR searches.
 *
 * @param doc Candidate document; its value slot 1 is read as the file path.
 * @return false when SearchManager::isBlocked() reports the path as blocked,
 *         true otherwise.
 */
bool OcrMatchDecider::operator ()(const Xapian::Document &doc) const
{
    // isBlocked() takes a non-const reference, so a named local is required.
    QString filePath = QString::fromStdString(doc.get_value(1));
    return !SearchManager::isBlocked(filePath);
}
|
||||||
|
|
|
@ -45,15 +45,19 @@
|
||||||
|
|
||||||
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
|
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
|
||||||
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
|
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
|
||||||
|
#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString()
|
||||||
namespace UkuiSearch {
|
namespace UkuiSearch {
|
||||||
class FileMatchDecider;
|
class FileMatchDecider;
|
||||||
class FileContentMatchDecider;
|
class FileContentMatchDecider;
|
||||||
|
class OcrMatchDecider;
|
||||||
class LIBSEARCH_EXPORT SearchManager : public QObject {
|
class LIBSEARCH_EXPORT SearchManager : public QObject {
|
||||||
friend class FileSearch;
|
friend class FileSearch;
|
||||||
friend class FileContentSearch;
|
friend class FileContentSearch;
|
||||||
|
friend class OcrSearch;
|
||||||
friend class DirectSearch;
|
friend class DirectSearch;
|
||||||
friend class FileMatchDecider;
|
friend class FileMatchDecider;
|
||||||
friend class FileContentMatchDecider;
|
friend class FileContentMatchDecider;
|
||||||
|
friend class OcrMatchDecider;
|
||||||
Q_OBJECT
|
Q_OBJECT
|
||||||
public:
|
public:
|
||||||
explicit SearchManager(QObject *parent = nullptr);
|
explicit SearchManager(QObject *parent = nullptr);
|
||||||
|
@ -64,22 +68,15 @@ public:
|
||||||
static size_t uniqueSymbolFile;
|
static size_t uniqueSymbolFile;
|
||||||
static size_t uniqueSymbolDir;
|
static size_t uniqueSymbolDir;
|
||||||
static size_t uniqueSymbolContent;
|
static size_t uniqueSymbolContent;
|
||||||
|
static size_t uniqueSymbolOcr;
|
||||||
static QMutex m_mutexFile;
|
static QMutex m_mutexFile;
|
||||||
static QMutex m_mutexDir;
|
static QMutex m_mutexDir;
|
||||||
static QMutex m_mutexContent;
|
static QMutex m_mutexContent;
|
||||||
|
static QMutex m_mutexOcr;
|
||||||
|
|
||||||
public Q_SLOTS:
|
|
||||||
void onKeywordSearch(QString keyword, QQueue<QString> *searchResultFile, QQueue<QString> *searchResultDir, QQueue<QPair<QString, QStringList>> *searchResultContent);
|
|
||||||
|
|
||||||
Q_SIGNALS:
|
|
||||||
void resultFile(QQueue<QString> *);
|
|
||||||
void resultDir(QQueue<QString> *);
|
|
||||||
void resultContent(QQueue<QPair<QString, QStringList>> *);
|
|
||||||
private:
|
private:
|
||||||
static bool isBlocked(QString &path);
|
static bool isBlocked(QString &path);
|
||||||
static bool creatResultInfo(UkuiSearch::SearchPluginIface::ResultInfo &ri, QString path);
|
static bool creatResultInfo(UkuiSearch::SearchPluginIface::ResultInfo &ri, QString path);
|
||||||
|
|
||||||
QThreadPool m_pool;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class FileSearch : public QRunnable {
|
class FileSearch : public QRunnable {
|
||||||
|
@ -121,6 +118,24 @@ private:
|
||||||
int m_num = 20;
|
int m_num = 20;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class OcrSearch : public QRunnable {
|
||||||
|
public:
|
||||||
|
explicit OcrSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, int begin = 0, int num = 20);
|
||||||
|
~OcrSearch();
|
||||||
|
protected:
|
||||||
|
void run();
|
||||||
|
private:
|
||||||
|
int keywordSearchOcr();
|
||||||
|
int getResult(Xapian::MSet &result, std::string &keyWord);
|
||||||
|
|
||||||
|
DataQueue<SearchPluginIface::ResultInfo> *m_search_result = nullptr;
|
||||||
|
OcrMatchDecider *m_matchDecider;
|
||||||
|
size_t m_uniqueSymbol;
|
||||||
|
QString m_keyword;
|
||||||
|
int m_begin = 0;
|
||||||
|
int m_num = 20;
|
||||||
|
};
|
||||||
|
|
||||||
class DirectSearch : public QRunnable {
|
class DirectSearch : public QRunnable {
|
||||||
public:
|
public:
|
||||||
explicit DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol);
|
explicit DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol);
|
||||||
|
@ -133,11 +148,15 @@ private:
|
||||||
QString m_value;
|
QString m_value;
|
||||||
};
|
};
|
||||||
|
|
||||||
class FileMatchDecider :public Xapian::MatchDecider {
|
class FileMatchDecider : public Xapian::MatchDecider {
|
||||||
public:
|
public:
|
||||||
bool operator ()(const Xapian::Document &doc) const;
|
bool operator ()(const Xapian::Document &doc) const;
|
||||||
};
|
};
|
||||||
class FileContentMatchDecider :public Xapian::MatchDecider {
|
class FileContentMatchDecider : public Xapian::MatchDecider {
|
||||||
|
public:
|
||||||
|
bool operator ()(const Xapian::Document &doc) const;
|
||||||
|
};
|
||||||
|
class OcrMatchDecider : public Xapian::MatchDecider {
|
||||||
public:
|
public:
|
||||||
bool operator ()(const Xapian::Document &doc) const;
|
bool operator ()(const Xapian::Document &doc) const;
|
||||||
};
|
};
|
||||||
|
|
|
@ -41,7 +41,7 @@ include(dirwatcher/dirwatcher.pri)
|
||||||
include(mailsearch/mailsearch.pri)
|
include(mailsearch/mailsearch.pri)
|
||||||
|
|
||||||
LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
|
LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
|
||||||
LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg#-L/usr/local/lib/libjemalloc -ljemalloc
|
LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg -ltesseract #-L/usr/local/lib/libjemalloc -ljemalloc
|
||||||
|
|
||||||
SOURCES += \
|
SOURCES += \
|
||||||
file-utils.cpp \
|
file-utils.cpp \
|
||||||
|
|
Loading…
Reference in New Issue