新增OCR功能后端;修复内存泄漏一处;

This commit is contained in:
jixiaoxu 2022-01-21 16:53:19 +08:00 committed by iaom
parent 607e021bfc
commit 2e668d374a
15 changed files with 531 additions and 87 deletions

4
debian/control vendored
View File

@ -20,7 +20,9 @@ Build-Depends: debhelper (>=9.0.0),
libpoppler-qt5-dev, libpoppler-qt5-dev,
libukui-log4qt-dev, libukui-log4qt-dev,
libqt5xdg-dev, libqt5xdg-dev,
libukcc-dev libukcc-dev,
libopencv-dev,
libtesseract-dev
Standards-Version: 4.5.0 Standards-Version: 4.5.0
Homepage: https://www.ukui.org/ Homepage: https://www.ukui.org/
Vcs-Git: https://github.com/ukui/ukui-search.git Vcs-Git: https://github.com/ukui/ukui-search.git

View File

@ -27,5 +27,11 @@ static const QMap<QString, bool> targetFileTypeMap = {
std::map<QString, bool>::value_type("et", true), std::map<QString, bool>::value_type("et", true),
std::map<QString, bool>::value_type("pdf", true) std::map<QString, bool>::value_type("pdf", true)
}; };
// File suffixes treated as images; FirstIndex queues matching files for OCR.
// TODO: incomplete list, to be moved into a configuration file later.
static const QMap<QString, bool> targetPhotographTypeMap = {
    {"png", true},
    {"jpg", true},
    {"jpeg", true}
};
//TODO Put things that needed to be put here here. //TODO Put things that needed to be put here here.
#endif // COMMON_H #endif // COMMON_H

View File

@ -130,3 +130,40 @@ void ConstructDocumentForContent::run() {
return; return;
} }
/**
 * Creates a runnable that will build the OCR document for one file.
 *
 * @param path  Path of the image file; taken by value and moved into the task.
 *
 * The task is flagged auto-delete, so the thread pool that runs it owns it.
 */
ConstructDocumentForOcr::ConstructDocumentForOcr(QString path)
    : m_path(std::move(path))
{
    setAutoDelete(true);
}
void ConstructDocumentForOcr::run()
{
QString content;
FileReader::getTextContent(m_path, content);
Document doc;
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
doc.addValue(1, m_path);
if(content.isEmpty()) {
doc.reuireDeleted();
} else {
doc.setData(content);
//'\xEF\xBC\x8C' is "" "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString());
for(size_t i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
}
term.clear();
term.shrink_to_fit();
}
IndexGenerator::g_mutexDocListForOcr.lock();
IndexGenerator::g_docListForOcr.append(doc);
IndexGenerator::g_mutexDocListForOcr.unlock();
content.clear();
content.squeeze();
}

View File

@ -48,6 +48,16 @@ protected:
private: private:
QString m_path; QString m_path;
}; };
class ConstructDocumentForOcr : public QRunnable {
public:
explicit ConstructDocumentForOcr(QString path);
~ConstructDocumentForOcr() = default;
protected:
void run();
private:
QString m_path;
};
} }
#endif // CONSTRUCTDOCUMENT_H #endif // CONSTRUCTDOCUMENT_H

View File

@ -20,6 +20,7 @@
#include "file-reader.h" #include "file-reader.h"
#include "file-utils.h" #include "file-utils.h"
#include "binary-parser.h" #include "binary-parser.h"
#include "ocrobject.h"
using namespace UkuiSearch; using namespace UkuiSearch;
FileReader::FileReader(QObject *parent) : QObject(parent) { FileReader::FileReader(QObject *parent) : QObject(parent) {
@ -41,6 +42,8 @@ void FileReader::getTextContent(QString path, QString &textContent) {
searchdata.RunParser(path, textContent); searchdata.RunParser(path, textContent);
} else if (strsfx == "pdf") { } else if (strsfx == "pdf") {
FileUtils::getPdfTextContent(path, textContent); FileUtils::getPdfTextContent(path, textContent);
} else if (strsfx == "png" || strsfx == "jpg" || strsfx == "jpeg"){
OcrObject::getInstance()->getTxtContent(path, textContent);;
} }
return; return;
} }

View File

@ -39,6 +39,9 @@ FirstIndex::~FirstIndex() {
if(this->q_content_index) if(this->q_content_index)
delete this->q_content_index; delete this->q_content_index;
this->q_content_index = nullptr; this->q_content_index = nullptr;
if(this->m_ocr_index)
delete this->m_ocr_index;
this->m_ocr_index = nullptr;
if(this->p_indexGenerator) if(this->p_indexGenerator)
delete this->p_indexGenerator; delete this->p_indexGenerator;
this->p_indexGenerator = nullptr; this->p_indexGenerator = nullptr;
@ -48,10 +51,10 @@ FirstIndex::~FirstIndex() {
void FirstIndex::DoSomething(const QFileInfo& fileInfo) { void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
// qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0"); // qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0")); this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) if (fileInfo.fileName().split(".", QString::SkipEmptyParts).length() < 2)
&& (true == targetFileTypeMap[fileInfo.fileName().split(".").last()]) return;
&& (!FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath()))) { if (true == targetFileTypeMap[fileInfo.fileName().split(".").last()]
//this->q_content_index->enqueue(fileInfo.absoluteFilePath()); and false == FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath())) {
if (fileInfo.fileName().split(".").last() == "docx") { if (fileInfo.fileName().split(".").last() == "docx") {
QuaZip file(fileInfo.absoluteFilePath()); QuaZip file(fileInfo.absoluteFilePath());
if(!file.open(QuaZip::mdUnzip)) if(!file.open(QuaZip::mdUnzip))
@ -93,6 +96,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
} else { } else {
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
} }
} else if (true == targetPhotographTypeMap[fileInfo.fileName().split(".").last()]) {
this->m_ocr_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
} }
} }
@ -120,6 +125,7 @@ void FirstIndex::run() {
this->q_index = new QQueue<QVector<QString>>(); this->q_index = new QQueue<QVector<QString>>();
this->q_content_index = new QQueue<QPair<QString,qint64>>(); this->q_content_index = new QQueue<QPair<QString,qint64>>();
this->m_ocr_index = new QQueue<QPair<QString,qint64>>();
int fifo_fd; int fifo_fd;
char buffer[2]; char buffer[2];
@ -214,7 +220,34 @@ void FirstIndex::run() {
qDebug() << "content index end;"; qDebug() << "content index end;";
sem.release(2); sem.release(2);
}); });
//OCR功能暂时屏蔽
// QtConcurrent::run(&m_pool,[&]() {
// sem.acquire(5);
// QQueue<QString>* tmpOcr = new QQueue<QString>();
// qDebug() << "m_ocr_index:" << m_ocr_index->size();
// while(!this->m_ocr_index->empty()) {
// qint64 fileSize = 0;
// //一次处理的数据量文件总大小为50M以下50M为暂定值
// for(size_t i = 0;/* (i < 30) && (fileSize < 52428800) && */(!this->m_ocr_index->empty()); ++i) {
// QPair<QString,qint64> tempPair = this->m_ocr_index->dequeue();
// fileSize += tempPair.second;
// if (fileSize > 52428800) {
// if (tmpOcr->size() == 0) {
// tmpOcr->enqueue(tempPair.first);
// break;
// }
// this->m_ocr_index->enqueue(tempPair);
// break;
// }
// tmpOcr->enqueue(tempPair.first);
// }
// this->p_indexGenerator->creatOcrIndex(tmpOcr);
// tmpOcr->clear();
// }
// delete tmpOcr;
// qDebug() << "OCR index end;";
// sem.release(5);
// });
mutex1.lock(); mutex1.lock();
mutex2.lock(); mutex2.lock();
mutex3.lock(); mutex3.lock();
@ -223,14 +256,15 @@ void FirstIndex::run() {
mutex2.unlock(); mutex2.unlock();
mutex3.unlock(); mutex3.unlock();
if(this->q_index) if(this->q_index)
delete this->q_index; delete this->q_index;
this->q_index = nullptr; this->q_index = nullptr;
if(this->q_content_index) if(this->q_content_index)
delete this->q_content_index; delete this->q_content_index;
this->q_content_index = nullptr; this->q_content_index = nullptr;
if(this->m_ocr_index)
delete this->m_ocr_index;
this->m_ocr_index = nullptr;
if(p_indexGenerator) if(p_indexGenerator)
delete p_indexGenerator; delete p_indexGenerator;
p_indexGenerator = nullptr; p_indexGenerator = nullptr;

View File

@ -60,7 +60,8 @@ private:
// QQueue<QString>* q_content_index; // QQueue<QString>* q_content_index;
//修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519 //修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519
QQueue<QPair<QString,qint64>>* q_content_index; QQueue<QPair<QString,qint64>>* q_content_index;
//新增ocr队列存储ocr可识别处理的图片信息及大小
QQueue<QPair<QString,qint64>>* m_ocr_index;
//xapian will auto commit per 10,000 changes, donnot change it!!! //xapian will auto commit per 10,000 changes, donnot change it!!!
const size_t u_send_length = 8192; const size_t u_send_length = 8192;
}; };

View File

@ -33,6 +33,7 @@
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString() #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString() #define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString()
using namespace UkuiSearch; using namespace UkuiSearch;
@ -44,8 +45,11 @@ QMutex IndexGenerator::m_mutex;
//QMutex UkuiSearch::g_mutexDocListForContent; //QMutex UkuiSearch::g_mutexDocListForContent;
QMutex IndexGenerator::g_mutexDocListForPath; QMutex IndexGenerator::g_mutexDocListForPath;
QMutex IndexGenerator::g_mutexDocListForContent; QMutex IndexGenerator::g_mutexDocListForContent;
QMutex IndexGenerator::g_mutexDocListForOcr;
QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>(); QVector<Document> IndexGenerator::g_docListForPath = QVector<Document>();
QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>(); QVector<Document> IndexGenerator::g_docListForContent = QVector<Document>();
QVector<Document> IndexGenerator::g_docListForOcr = QVector<Document>();
IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) { IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) {
QMutexLocker locker(&m_mutex); QMutexLocker locker(&m_mutex);
@ -134,6 +138,44 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
} }
/**
 * Builds OCR documents for every path in @p messageList and writes them to
 * the OCR database.
 *
 * @param messageList  Queue of file paths; it is drained by HandleOcrPathList().
 * @return false when no document was produced or when the database write
 *         failed, true otherwise.
 *
 * Documents flagged as "required deleted" are removed from the database,
 * all others are inserted or replaced. The shared document list is always
 * emptied before returning so that stale entries are never written twice.
 */
bool IndexGenerator::creatOcrIndex(QQueue<QString> *messageList)
{
    HandleOcrPathList(messageList);
    if (IndexGenerator::g_docListForOcr.isEmpty()) {
        return false;
    }
    qDebug() << "begin creatAllIndex for ocr" << IndexGenerator::g_docListForOcr.size();

    bool succeeded = true;
    try {
        int pending = 0;
        // Iterate by reference: the list is about to be discarded, no copies needed.
        for (Document &doc : IndexGenerator::g_docListForOcr) {
            if (doc.isRequiredDeleted()) {
                m_database_ocr->delete_document(doc.getUniqueTerm());
            } else {
                m_database_ocr->replace_document(doc.getUniqueTerm(), doc.getXapianDocument());
            }
            // Commit in batches of 1000 changes.
            if (++pending > 999) {
                pending = 0;
                m_database_ocr->commit();
            }
        }
        m_database_ocr->commit();
    } catch (const Xapian::Error &e) {
        qWarning() << "creat ocr Index fail!" << QString::fromStdString(e.get_description());
        // NOTE(review): this records the failure under the *content* index state key;
        // confirm whether the OCR index should have its own status entry.
        IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1");
        assert(false);
        // Reached only when assertions are compiled out: report the failure to the caller.
        succeeded = false;
    }
    qDebug() << "finish creatAllIndex for ocr";

    // Swapping with an empty vector both clears the list and releases its storage.
    QVector<Document>().swap(IndexGenerator::g_docListForOcr);
    malloc_trim(0);
    return succeeded;
}
IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) { IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) {
QDir database(QString::fromStdString(INDEX_PATH)); QDir database(QString::fromStdString(INDEX_PATH));
@ -153,6 +195,7 @@ IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent)
m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
m_database_ocr = new Xapian::WritableDatabase(OCR_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
} }
IndexGenerator::~IndexGenerator() { IndexGenerator::~IndexGenerator() {
@ -165,8 +208,11 @@ IndexGenerator::~IndexGenerator() {
if(m_database_content) if(m_database_content)
m_database_content->~WritableDatabase(); m_database_content->~WritableDatabase();
// delete m_database_content; // delete m_database_content;
if(m_database_ocr)
m_database_ocr->~WritableDatabase();
m_database_path = nullptr; m_database_path = nullptr;
m_database_content = nullptr; m_database_content = nullptr;
m_database_ocr = nullptr;
global_instance = nullptr; global_instance = nullptr;
// if(m_index_map) // if(m_index_map)
// delete m_index_map; // delete m_index_map;
@ -266,28 +312,25 @@ void IndexGenerator::HandlePathList(QQueue<QString> *messageList) {
pool.start(constructer); pool.start(constructer);
} }
qDebug() << "pool finish" << pool.waitForDone(-1); qDebug() << "pool finish" << pool.waitForDone(-1);
// if(constructer)
// delete constructer;
// constructer = nullptr;
// QFuture<Document> future = QtConcurrent::mapped(*messageList,&IndexGenerator::GenerateContentDocument);
// future.waitForFinished();
// ChineseSegmentation::getInstance()->~ChineseSegmentation();
// QList<Document> docList = future.results();
// mg_docListForContent = new QList<Document>(docList);
// qDebug()<<g_docListForContent->size();
// QList<Document> docList = future.results();
// mg_docListForContent = new QList<Document>(docList);
// mg_docListForContent = std::move(future.results());
// future.cancel();
qDebug() << "Finish HandlePathList for content index!"; qDebug() << "Finish HandlePathList for content index!";
return; return;
}
/**
 * Runs one ConstructDocumentForOcr task per queued path and blocks until all
 * of them have finished.
 *
 * @param messageList  Queue of file paths; emptied by this call.
 *
 * The pool is limited to a single worker thread, so the tasks run one after
 * another (OcrObject keeps its engine in a member of a shared singleton).
 */
void IndexGenerator::HandleOcrPathList(QQueue<QString> *messageList)
{
    qDebug() << "Begin HandlePathList for ocr index!";
    qDebug() << messageList->size();

    QThreadPool pool;
    pool.setMaxThreadCount(1);
    pool.setExpiryTimeout(100);
    while (!messageList->isEmpty()) {
        // The task is auto-delete: the pool takes ownership.
        pool.start(new ConstructDocumentForOcr(messageList->dequeue()));
    }
    qDebug() << "pool finish" << pool.waitForDone(-1);
    qDebug() << "Finish HandlePathList for ocr index!";
}
//deprecated //deprecated
Document IndexGenerator::GenerateDocument(const QVector<QString> &list) { Document IndexGenerator::GenerateDocument(const QVector<QString> &list) {
@ -460,10 +503,13 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
m_database_path->delete_document(uniqueterm); m_database_path->delete_document(uniqueterm);
m_database_content->delete_document(uniqueterm); m_database_content->delete_document(uniqueterm);
m_database_ocr->delete_document(uniqueterm);
//delete all files under it if it's a dir. //delete all files under it if it's a dir.
m_database_path->delete_document(upterm); m_database_path->delete_document(upterm);
m_database_content->delete_document(upterm); m_database_content->delete_document(upterm);
m_database_ocr->delete_document(upterm);
qDebug() << "delete path" << doc; qDebug() << "delete path" << doc;
// qDebug() << "delete md5" << QString::fromStdString(uniqueterm); // qDebug() << "delete md5" << QString::fromStdString(uniqueterm);
@ -472,6 +518,7 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
} }
m_database_path->commit(); m_database_path->commit();
m_database_content->commit(); m_database_content->commit();
m_database_ocr->commit();
qDebug() << "--delete finish--"; qDebug() << "--delete finish--";
} catch(const Xapian::Error &e) { } catch(const Xapian::Error &e) {
qWarning() << QString::fromStdString(e.get_description()); qWarning() << QString::fromStdString(e.get_description());
@ -503,43 +550,85 @@ bool IndexGenerator::deleteContentIndex(QStringList *pathlist)
return true; return true;
} }
/**
 * Removes the OCR index entries of the given paths.
 *
 * @param pathlist  Paths whose documents should be deleted.
 * @return true on success (including an empty list), false if Xapian
 *         reported an error.
 */
bool IndexGenerator::deleteOcrIndex(QStringList *pathlist)
{
    if (pathlist->isEmpty()) {
        return true;
    }

    bool deleted = true;
    try {
        qDebug() << "--delete start--";
        const QStringList &paths = *pathlist;
        for (const QString &path : paths) {
            const std::string uniqueterm = FileUtils::makeDocUterm(path);
            m_database_ocr->delete_document(uniqueterm);
            qDebug() << "delete path" << path;
        }
        m_database_ocr->commit();
        qDebug() << "--delete finish--";
    } catch (const Xapian::Error &e) {
        qWarning() << QString::fromStdString(e.get_description());
        deleted = false;
    }
    return deleted;
}
bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles) bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
{ {
QQueue<QVector<QString>> *fileIndexInfo = new QQueue<QVector<QString>>; QQueue<QVector<QString>> *fileIndexInfo = new QQueue<QVector<QString>>;
QQueue<QString> *fileContentIndexInfo = new QQueue<QString>; QQueue<QString> *fileContentIndexInfo = new QQueue<QString>;
QQueue<QString> *fileOcrIndexInfo = new QQueue<QString>;
QStringList *deleteList = new QStringList; QStringList *deleteList = new QStringList;
QStringList *contentDeleteList = new QStringList; QStringList *contentDeleteList = new QStringList;
for(PendingFile file : *pendingFiles) { for (PendingFile file : *pendingFiles) {
if(file.shouldRemoveIndex()) { if (file.shouldRemoveIndex()) {
deleteList->append(file.path()); deleteList->append(file.path());
continue; continue;
} }
fileIndexInfo->append(QVector<QString>() << file.path().section("/" , -1) << file.path() << QString(file.isDir() ? "1" : "0")); fileIndexInfo->append(QVector<QString>() << file.path().section("/" , -1) << file.path() << QString(file.isDir() ? "1" : "0"));
if((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) { if ((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) {
if(!FileUtils::isEncrypedOrUnreadable(file.path())) { if (!FileUtils::isEncrypedOrUnreadable(file.path())) {
fileContentIndexInfo->append(file.path()); fileContentIndexInfo->append(file.path());
} else { } else {
contentDeleteList->append(file.path()); contentDeleteList->append(file.path());
} }
} }
} }
if(!deleteList->isEmpty()) { if (!deleteList->isEmpty()) {
deleteAllIndex(deleteList); deleteAllIndex(deleteList);
} }
if(!contentDeleteList->isEmpty()) { if (!contentDeleteList->isEmpty()) {
deleteContentIndex(contentDeleteList); deleteContentIndex(contentDeleteList);
} }
if(!fileIndexInfo->isEmpty()) { if (!fileIndexInfo->isEmpty()) {
creatAllIndex(fileIndexInfo); creatAllIndex(fileIndexInfo);
} }
if(!fileContentIndexInfo->isEmpty()) { if (!fileContentIndexInfo->isEmpty()) {
creatAllIndex(fileContentIndexInfo); creatAllIndex(fileContentIndexInfo);
} }
if (!fileOcrIndexInfo->isEmpty()) {
creatOcrIndex(fileOcrIndexInfo);
}
if (fileIndexInfo) {
delete fileIndexInfo; delete fileIndexInfo;
fileIndexInfo = nullptr;
}
if (fileContentIndexInfo) {
delete fileContentIndexInfo; delete fileContentIndexInfo;
fileContentIndexInfo = nullptr;
}
if (fileOcrIndexInfo) {
delete fileOcrIndexInfo;
fileOcrIndexInfo = nullptr;
}
if (deleteList) {
delete deleteList;
deleteList = nullptr;
}
if (contentDeleteList) {
delete contentDeleteList;
contentDeleteList = nullptr;
}
return true; return true;
} }

View File

@ -45,6 +45,7 @@ namespace UkuiSearch {
class IndexGenerator : public QObject { class IndexGenerator : public QObject {
friend class ConstructDocumentForPath; friend class ConstructDocumentForPath;
friend class ConstructDocumentForContent; friend class ConstructDocumentForContent;
friend class ConstructDocumentForOcr;
Q_OBJECT Q_OBJECT
public: public:
static IndexGenerator *getInstance(bool rebuild = false, QObject *parent = nullptr); static IndexGenerator *getInstance(bool rebuild = false, QObject *parent = nullptr);
@ -61,8 +62,10 @@ Q_SIGNALS:
public Q_SLOTS: public Q_SLOTS:
bool creatAllIndex(QQueue<QVector<QString>> *messageList); bool creatAllIndex(QQueue<QVector<QString>> *messageList);
bool creatAllIndex(QQueue<QString> *messageList); bool creatAllIndex(QQueue<QString> *messageList);
bool creatOcrIndex(QQueue<QString> *messageList);
bool deleteAllIndex(QStringList *pathlist); bool deleteAllIndex(QStringList *pathlist);
bool deleteContentIndex(QStringList *pathlist); bool deleteContentIndex(QStringList *pathlist);
bool deleteOcrIndex(QStringList *pathlist);
bool updateIndex(QVector<PendingFile> *pendingFiles); bool updateIndex(QVector<PendingFile> *pendingFiles);
private: private:
@ -72,6 +75,8 @@ private:
void HandlePathList(QQueue<QVector<QString> > *messageList); void HandlePathList(QQueue<QVector<QString> > *messageList);
//For file content index //For file content index
void HandlePathList(QQueue<QString> *messageList); void HandlePathList(QQueue<QString> *messageList);
//For ocr index
void HandleOcrPathList(QQueue<QString> *messageList);
static Document GenerateDocument(const QVector<QString> &list); static Document GenerateDocument(const QVector<QString> &list);
static Document GenerateContentDocument(const QString &list); static Document GenerateContentDocument(const QString &list);
//add one data in database //add one data in database
@ -82,10 +87,13 @@ private:
static QMutex g_mutexDocListForPath; static QMutex g_mutexDocListForPath;
static QVector<Document> g_docListForContent; static QVector<Document> g_docListForContent;
static QMutex g_mutexDocListForContent; static QMutex g_mutexDocListForContent;
static QVector<Document> g_docListForOcr;
static QMutex g_mutexDocListForOcr;
QMap<QString, QStringList> m_index_map; QMap<QString, QStringList> m_index_map;
QString m_index_data_path; QString m_index_data_path;
Xapian::WritableDatabase* m_database_path; Xapian::WritableDatabase* m_database_path;
Xapian::WritableDatabase* m_database_content; Xapian::WritableDatabase* m_database_content;
Xapian::WritableDatabase* m_database_ocr;
std::string m_docstr; std::string m_docstr;
std::string m_index_text_str; std::string m_index_text_str;
Xapian::TermGenerator m_indexer; Xapian::TermGenerator m_indexer;

View File

@ -9,6 +9,7 @@ HEADERS += \
$$PWD/index-generator.h \ $$PWD/index-generator.h \
$$PWD/index-status-recorder.h \ $$PWD/index-status-recorder.h \
$$PWD/inotify-watch.h \ $$PWD/inotify-watch.h \
$$PWD/ocrobject.h \
$$PWD/pending-file-queue.h \ $$PWD/pending-file-queue.h \
$$PWD/pending-file.h \ $$PWD/pending-file.h \
$$PWD/search-manager.h \ $$PWD/search-manager.h \
@ -25,6 +26,7 @@ SOURCES += \
$$PWD/index-generator.cpp \ $$PWD/index-generator.cpp \
$$PWD/index-status-recorder.cpp \ $$PWD/index-status-recorder.cpp \
$$PWD/inotify-watch.cpp \ $$PWD/inotify-watch.cpp \
$$PWD/ocrobject.cpp \
$$PWD/pending-file-queue.cpp \ $$PWD/pending-file-queue.cpp \
$$PWD/pending-file.cpp \ $$PWD/pending-file.cpp \
$$PWD/search-manager.cpp \ $$PWD/search-manager.cpp \

View File

@ -0,0 +1,98 @@
#include "ocrobject.h"
// Singleton storage and the flag that guards its one-time construction.
OcrObject *OcrObject::m_instance = nullptr;
once_flag g_instanceFlag;

/**
 * Returns the process-wide OcrObject, creating it on the first call.
 * Construction is funnelled through std::call_once.
 */
OcrObject *OcrObject::getInstance()
{
    const auto construct = [] () {
        m_instance = new OcrObject;
    };
    std::call_once(g_instanceFlag, construct);
    return m_instance;
}
/**
 * Runs OCR (language data "chi_sim") on the image at @p path.
 *
 * @param path         Image file to read.
 * @param textcontent  Receives the recognised UTF-8 text. It is left
 *                     untouched when the engine cannot be initialised or
 *                     the image cannot be read.
 *
 * A fresh engine is created for every call and released before returning.
 * The engine lives in the member m_api, so concurrent calls on the
 * singleton are not safe.
 */
void OcrObject::getTxtContent(QString &path, QString &textcontent)
{
    // Shuts down and frees whatever engine m_api currently points to.
    const auto releaseApi = [this] () {
        if (m_api) {
            m_api->End();
            delete m_api;
            m_api = nullptr;
        }
    };

    // init() (or an earlier call) may have left an engine behind; overwriting
    // the pointer without releasing it would leak that engine.
    releaseApi();

    m_api = new tesseract::TessBaseAPI();
    if (m_api->Init(NULL, "chi_sim")) {
        qDebug() << "Could not initialize tesseract.\n";
        releaseApi();
        return;
    }
    // DPI assumed for images that do not declare their own resolution.
    m_api->SetVariable("user_defined_dpi", "1080");

    Pix *image = pixRead(path.toStdString().data());
    if (!image) {
        qDebug() << "path:" << path <<" pixRead error!";
        releaseApi();
        return;
    }

    m_api->SetImage(image);
    // GetUTF8Text() hands over a heap buffer that the caller must delete[].
    char *recognized = m_api->GetUTF8Text();
    textcontent = recognized;
    delete[] recognized;
    qDebug() << "path:" << path << " Text:" << textcontent;

    pixDestroy(&image);
    m_api->Clear();
    releaseApi();
}
/**
 * Constructs the object and immediately prepares an OCR engine via init().
 *
 * @param parent  Forwarded to QObject.
 */
OcrObject::OcrObject(QObject *parent)
    : QObject(parent)
{
    this->init();
}
/**
 * Shuts down and frees the engine held in m_api, if there is one.
 */
OcrObject::~OcrObject()
{
    if (!m_api) {
        return;
    }
    m_api->End();
    delete m_api;
    m_api = nullptr;
}
/**
 * Allocates the engine stored in m_api and initialises it with the
 * "chi_sim" language data. On failure only a debug message is emitted and
 * the (uninitialised) engine object stays in m_api.
 */
void OcrObject::init()
{
    m_api = new tesseract::TessBaseAPI();
    const int initStatus = m_api->Init(NULL, "chi_sim");
    if (initStatus != 0) {
        qDebug() << "Could not initialize tesseract.\n";
    } else {
        // DPI assumed for images that do not declare their own resolution.
        m_api->SetVariable("user_defined_dpi", "1080");
    }
}

View File

@ -0,0 +1,41 @@
#ifndef OCROBJECT_H
#define OCROBJECT_H
#include <QObject>
#include <mutex>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
#include <QDebug>
using namespace std;
// Singleton wrapper around a tesseract::TessBaseAPI engine that extracts
// text from image files. Obtain the instance through getInstance(); the
// constructor and destructor are protected.
class OcrObject : public QObject
{
Q_OBJECT
public:
// Returns the shared instance, creating it on first use.
static OcrObject* getInstance();
// Recognises the text in the image at `path` and stores it in `textcontent`.
void getTxtContent(QString &path, QString &textcontent);
protected:
explicit OcrObject(QObject *parent = nullptr);
~OcrObject();
private:
// The one shared instance (nullptr until getInstance() has run).
static OcrObject *m_instance;
// OCR engine owned by this object; may be nullptr.
tesseract::TessBaseAPI *m_api = nullptr;
// Allocates and initialises m_api.
void init();
// Helper whose destructor deletes the singleton instance.
// NOTE(review): g_garbo is only declared here; no out-of-class definition is
// visible in this view, so this destructor may never actually run -- confirm.
class Garbo
{
public:
~Garbo() {
if (OcrObject::m_instance)
delete OcrObject::m_instance;
}
static Garbo g_garbo;
};
};
#endif // OCROBJECT_H

View File

@ -19,15 +19,17 @@
*/ */
#include "search-manager.h" #include "search-manager.h"
using namespace UkuiSearch; using namespace UkuiSearch;
size_t SearchManager::uniqueSymbolFile = 0; size_t SearchManager::uniqueSymbolFile = 0;
size_t SearchManager::uniqueSymbolDir = 0; size_t SearchManager::uniqueSymbolDir = 0;
size_t SearchManager::uniqueSymbolContent = 0; size_t SearchManager::uniqueSymbolContent = 0;
size_t SearchManager::uniqueSymbolOcr = 0;
QMutex SearchManager::m_mutexFile; QMutex SearchManager::m_mutexFile;
QMutex SearchManager::m_mutexDir; QMutex SearchManager::m_mutexDir;
QMutex SearchManager::m_mutexContent; QMutex SearchManager::m_mutexContent;
QMutex SearchManager::m_mutexOcr;
SearchManager::SearchManager(QObject *parent) : QObject(parent) { SearchManager::SearchManager(QObject *parent) : QObject(parent) {
m_pool.setMaxThreadCount(3);
m_pool.setExpiryTimeout(1000);
} }
SearchManager::~SearchManager() { SearchManager::~SearchManager() {
@ -43,39 +45,6 @@ int SearchManager::getCurrentIndexCount() {
} }
} }
void SearchManager::onKeywordSearch(QString keyword, QQueue<QString> *searchResultFile, QQueue<QString> *searchResultDir,
QQueue<QPair<QString, QStringList>> *searchResultContent) {
// m_mutexFile.lock();
// ++uniqueSymbolFile;
// m_mutexFile.unlock();
// m_mutexDir.lock();
// ++uniqueSymbolDir;
// m_mutexDir.unlock();
// m_mutexContent.lock();
// ++uniqueSymbolContent;
// m_mutexContent.unlock();
// if(FileUtils::SearchMethod::DIRECTSEARCH == FileUtils::searchMethod) {
// DirectSearch *directSearch;
// directSearch = new DirectSearch(keyword, searchResultFile, searchResultDir, uniqueSymbolFile);
// m_pool.start(directSearch);
// } else if(FileUtils::SearchMethod::INDEXSEARCH == FileUtils::searchMethod) {
// FileSearch *filesearch;
// filesearch = new FileSearch(searchResultFile, uniqueSymbolFile, keyword, "0", 1, 0, 5);
// m_pool.start(filesearch);
// FileSearch *dirsearch;
// dirsearch = new FileSearch(searchResultDir, uniqueSymbolDir, keyword, "1", 1, 0, 5);
// m_pool.start(dirsearch);
// FileContentSearch *contentSearch;
// contentSearch = new FileContentSearch(searchResultContent, uniqueSymbolContent, keyword, 0, 5);
// m_pool.start(contentSearch);
// } else {
// qWarning() << "Unknown search method! FileUtils::searchMethod: " << static_cast<int>(FileUtils::searchMethod);
// }
return;
}
bool SearchManager::isBlocked(QString &path) { bool SearchManager::isBlocked(QString &path) {
QStringList blockList = GlobalSettings::getInstance()->getBlockDirs(); QStringList blockList = GlobalSettings::getInstance()->getBlockDirs();
for(QString i : blockList) { for(QString i : blockList) {
@ -101,6 +70,7 @@ bool SearchManager::creatResultInfo(SearchPluginIface::ResultInfo &ri, QString p
ri.type = 0; ri.type = 0;
return true; return true;
} }
FileSearch::FileSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, QString value, unsigned slot, int begin, int num) { FileSearch::FileSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, QString value, unsigned slot, int begin, int num) {
this->setAutoDelete(true); this->setAutoDelete(true);
m_search_result = searchResult; m_search_result = searchResult;
@ -428,6 +398,121 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
return 0; return 0;
} }
/**
 * Prepares an OCR index search task.
 *
 * @param searchResult  Queue that receives the hits (not owned).
 * @param uniqueSymbol  Token compared against SearchManager::uniqueSymbolOcr
 *                      before each hit is published.
 * @param keyword       Text to search for.
 * @param begin         Offset of the first result to fetch.
 * @param num           Number of results per fetch.
 *
 * The task is auto-delete and owns the match decider it allocates here.
 */
OcrSearch::OcrSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, int begin, int num) {
    this->m_search_result = searchResult;
    this->m_uniqueSymbol = uniqueSymbol;
    this->m_keyword = keyword;
    this->m_begin = begin;
    this->m_num = num;
    this->m_matchDecider = new OcrMatchDecider();
    this->setAutoDelete(true);
}
/**
 * Frees the match decider; the result queue is not owned and is only
 * detached.
 */
OcrSearch::~OcrSearch() {
    // delete on a null pointer is a no-op, so no explicit check is needed.
    delete m_matchDecider;
    m_search_result = nullptr;
}
void OcrSearch::run() {
SearchManager::m_mutexOcr.lock();
if(!m_search_result->isEmpty()) {
m_search_result->clear();
}
SearchManager::m_mutexOcr.unlock();
//这里同文件搜索,待优化。
m_begin = 0;
m_num = 100;
int resultCount = 1;
int totalCount = 0;
while(resultCount > 0) {
resultCount = keywordSearchOcr();
m_begin += m_num;
totalCount += resultCount;
}
qDebug() << "Total count:" << totalCount;
return;
}
int OcrSearch::keywordSearchOcr() {
try {
qDebug() << "--keywordSearch OCR search start--";
Xapian::Database db(OCR_INDEX_PATH);
Xapian::Enquire enquire(db);
Xapian::QueryParser qp;
qp.set_default_op(Xapian::Query::OP_AND);
qp.set_database(db);
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
//Creat a query
std::string words;
for(int i = 0; i < sKeyWord.size(); i++) {
words.append(sKeyWord.at(i).word).append(" ");
}
std::vector<Xapian::Query> v;
for(int i=0; i<sKeyWord.size(); i++) {
v.push_back(Xapian::Query(sKeyWord.at(i).word));
qDebug() << QString::fromStdString(sKeyWord.at(i).word);
}
Xapian::Query query = Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
qDebug() << "keywordSearch OCR:" << QString::fromStdString(query.get_description());
enquire.set_query(query);
Xapian::MSet result = enquire.get_mset(m_begin, m_num, 0, m_matchDecider);
int resultCount = result.size();
if(result.size() == 0) {
return 0;
}
qDebug() << "keywordSearch OCR results count=" << resultCount;
if(getResult(result, words) == -1) {
return -1;
}
qDebug() << "--keywordSearch OCR search finish--";
return resultCount;
} catch(const Xapian::Error &e) {
qWarning() << QString::fromStdString(e.get_description());
qDebug() << "--keywordSearch OCR search finish--";
return -1;
}
}
/**
 * Converts every hit in @p result into a ResultInfo with a text snippet and
 * appends it to the shared result queue.
 *
 * @param result   Matches returned by Xapian for the current page.
 * @param keyWord  Space-separated keywords; the first one locates the snippet.
 * @return 0 when all hits were processed, -1 when this search has been
 *         superseded (m_uniqueSymbol no longer equals
 *         SearchManager::uniqueSymbolOcr).
 */
int OcrSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
    // The word used to position the snippet is the same for every hit.
    const std::string wordTobeFound = QString::fromStdString(keyWord).section(" ", 0, 0).toStdString();

    for (auto it = result.begin(); it != result.end(); ++it) {
        Xapian::Document doc = it.get_document();
        std::string data = doc.get_data();
        QString path = QString::fromStdString(doc.get_value(1));

        SearchPluginIface::ResultInfo ri;
        if (!SearchManager::creatResultInfo(ri, path)) {
            continue;
        }

        // Construct a snippet around the first position of the keyword.
        // Both iterators are checked against their end before being
        // dereferenced; if nothing is found the snippet starts at 0.
        Xapian::termpos snippetStart = 0;
        auto term = doc.termlist_begin();
        term.skip_to(wordTobeFound);
        if (term != doc.termlist_end()) {
            auto pos = term.positionlist_begin();
            if (pos != term.positionlist_end()) {
                snippetStart = *pos;
            }
        }
        // FIXME: make a snippet without cutting CJK characters.
        QString snippet = FileUtils::chineseSubString(data, snippetStart, 120);
        ri.description.prepend(SearchPluginIface::DescriptionInfo{"",snippet});

        // Publish only while this task is still the current search.
        QMutexLocker locker(&SearchManager::m_mutexOcr);
        if (m_uniqueSymbol != SearchManager::uniqueSymbolOcr) {
            return -1;
        }
        m_search_result->enqueue(ri);
    }
    return 0;
}
DirectSearch::DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol) { DirectSearch::DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol) {
this->setAutoDelete(true); this->setAutoDelete(true);
m_keyword = keyword; m_keyword = keyword;
@ -521,3 +606,12 @@ bool FileContentMatchDecider::operator ()(const Xapian::Document &doc) const
} }
return true; return true;
} }
/**
 * @brief Match filter for OCR searches.
 * @param doc candidate document; its value slot 1 is read as the file path.
 * @return false when SearchManager::isBlocked() rejects the path, true otherwise.
 */
bool OcrMatchDecider::operator ()(const Xapian::Document &doc) const
{
    // isBlocked() takes a non-const reference, so a named local is required.
    QString filePath = QString::fromStdString(doc.get_value(1));
    return !SearchManager::isBlocked(filePath);
}

View File

@ -45,15 +45,19 @@
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString() #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString() #define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
// Location of the OCR index database under the user's home directory
// (<home>/.config/org.ukui/ukui-search/ocr_index_data), as a std::string.
#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString()
namespace UkuiSearch { namespace UkuiSearch {
class FileMatchDecider; class FileMatchDecider;
class FileContentMatchDecider; class FileContentMatchDecider;
class OcrMatchDecider;
class LIBSEARCH_EXPORT SearchManager : public QObject { class LIBSEARCH_EXPORT SearchManager : public QObject {
friend class FileSearch; friend class FileSearch;
friend class FileContentSearch; friend class FileContentSearch;
friend class OcrSearch;
friend class DirectSearch; friend class DirectSearch;
friend class FileMatchDecider; friend class FileMatchDecider;
friend class FileContentMatchDecider; friend class FileContentMatchDecider;
friend class OcrMatchDecider;
Q_OBJECT Q_OBJECT
public: public:
explicit SearchManager(QObject *parent = nullptr); explicit SearchManager(QObject *parent = nullptr);
@ -64,22 +68,15 @@ public:
static size_t uniqueSymbolFile; static size_t uniqueSymbolFile;
static size_t uniqueSymbolDir; static size_t uniqueSymbolDir;
static size_t uniqueSymbolContent; static size_t uniqueSymbolContent;
static size_t uniqueSymbolOcr;
static QMutex m_mutexFile; static QMutex m_mutexFile;
static QMutex m_mutexDir; static QMutex m_mutexDir;
static QMutex m_mutexContent; static QMutex m_mutexContent;
static QMutex m_mutexOcr;
public Q_SLOTS:
void onKeywordSearch(QString keyword, QQueue<QString> *searchResultFile, QQueue<QString> *searchResultDir, QQueue<QPair<QString, QStringList>> *searchResultContent);
Q_SIGNALS:
void resultFile(QQueue<QString> *);
void resultDir(QQueue<QString> *);
void resultContent(QQueue<QPair<QString, QStringList>> *);
private: private:
static bool isBlocked(QString &path); static bool isBlocked(QString &path);
static bool creatResultInfo(UkuiSearch::SearchPluginIface::ResultInfo &ri, QString path); static bool creatResultInfo(UkuiSearch::SearchPluginIface::ResultInfo &ri, QString path);
QThreadPool m_pool;
}; };
class FileSearch : public QRunnable { class FileSearch : public QRunnable {
@ -121,6 +118,24 @@ private:
int m_num = 20; int m_num = 20;
}; };
class OcrSearch : public QRunnable {
public:
explicit OcrSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, size_t uniqueSymbol, QString keyword, int begin = 0, int num = 20);
~OcrSearch();
protected:
void run();
private:
int keywordSearchOcr();
int getResult(Xapian::MSet &result, std::string &keyWord);
DataQueue<SearchPluginIface::ResultInfo> *m_search_result = nullptr;
OcrMatchDecider *m_matchDecider;
size_t m_uniqueSymbol;
QString m_keyword;
int m_begin = 0;
int m_num = 20;
};
class DirectSearch : public QRunnable { class DirectSearch : public QRunnable {
public: public:
explicit DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol); explicit DirectSearch(QString keyword, DataQueue<SearchPluginIface::ResultInfo> *searchResult, QString value, size_t uniqueSymbol);
@ -133,11 +148,15 @@ private:
QString m_value; QString m_value;
}; };
class FileMatchDecider :public Xapian::MatchDecider { class FileMatchDecider : public Xapian::MatchDecider {
public: public:
bool operator ()(const Xapian::Document &doc) const; bool operator ()(const Xapian::Document &doc) const;
}; };
class FileContentMatchDecider :public Xapian::MatchDecider { class FileContentMatchDecider : public Xapian::MatchDecider {
public:
bool operator ()(const Xapian::Document &doc) const;
};
class OcrMatchDecider : public Xapian::MatchDecider {
public: public:
bool operator ()(const Xapian::Document &doc) const; bool operator ()(const Xapian::Document &doc) const;
}; };

View File

@ -41,7 +41,7 @@ include(dirwatcher/dirwatcher.pri)
include(mailsearch/mailsearch.pri) include(mailsearch/mailsearch.pri)
LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg#-L/usr/local/lib/libjemalloc -ljemalloc LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg -ltesseract #-L/usr/local/lib/libjemalloc -ljemalloc
SOURCES += \ SOURCES += \
file-utils.cpp \ file-utils.cpp \