From 2e668d374a06a2eeede986a8f9de0c105a83d7ad Mon Sep 17 00:00:00 2001 From: jixiaoxu Date: Fri, 21 Jan 2022 16:53:19 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9EOCR=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E5=90=8E=E7=AB=AF=EF=BC=9B=E4=BF=AE=E5=A4=8D=E5=86=85=E5=AD=98?= =?UTF-8?q?=E6=B3=84=E6=BC=8F=E4=B8=80=E5=A4=84=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debian/control | 4 +- libsearch/common.h | 6 + libsearch/index/construct-document.cpp | 37 ++++++ libsearch/index/construct-document.h | 10 ++ libsearch/index/file-reader.cpp | 3 + libsearch/index/first-index.cpp | 48 ++++++-- libsearch/index/first-index.h | 3 +- libsearch/index/index-generator.cpp | 151 ++++++++++++++++++----- libsearch/index/index-generator.h | 8 ++ libsearch/index/index.pri | 2 + libsearch/index/ocrobject.cpp | 98 +++++++++++++++ libsearch/index/ocrobject.h | 41 +++++++ libsearch/index/search-manager.cpp | 164 +++++++++++++++++++------ libsearch/index/search-manager.h | 41 +++++-- libsearch/libsearch.pro | 2 +- 15 files changed, 531 insertions(+), 87 deletions(-) create mode 100644 libsearch/index/ocrobject.cpp create mode 100644 libsearch/index/ocrobject.h diff --git a/debian/control b/debian/control index a9c0826..3d3d383 100644 --- a/debian/control +++ b/debian/control @@ -20,7 +20,9 @@ Build-Depends: debhelper (>=9.0.0), libpoppler-qt5-dev, libukui-log4qt-dev, libqt5xdg-dev, - libukcc-dev + libukcc-dev, + libopencv-dev, + libtesseract-dev Standards-Version: 4.5.0 Homepage: https://www.ukui.org/ Vcs-Git: https://github.com/ukui/ukui-search.git diff --git a/libsearch/common.h b/libsearch/common.h index 810996c..2aabf21 100644 --- a/libsearch/common.h +++ b/libsearch/common.h @@ -27,5 +27,11 @@ static const QMap targetFileTypeMap = { std::map::value_type("et", true), std::map::value_type("pdf", true) }; + +static const QMap targetPhotographTypeMap = { + std::map::value_type("png", true), + std::map::value_type("jpg", true), + std::map::value_type("jpeg", true)//TODO 待完善,后续改为配置文件 +}; //TODO Put things that needed to be put here here. #endif // COMMON_H diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index 0a31189..c50f7a8 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -130,3 +130,40 @@ void ConstructDocumentForContent::run() { return; } + +ConstructDocumentForOcr::ConstructDocumentForOcr(QString path) +{ + this->setAutoDelete(true); + m_path = std::move(path); +} + +void ConstructDocumentForOcr::run() +{ + QString content; + FileReader::getTextContent(m_path, content); + + Document doc; + doc.setUniqueTerm(FileUtils::makeDocUterm(m_path)); + doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep))); + doc.addValue(1, m_path); + + if(content.isEmpty()) { + doc.reuireDeleted(); + } else { + doc.setData(content); + //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. + content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); + std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString()); + for(size_t i = 0; i < term.size(); ++i) { + doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); + } + term.clear(); + term.shrink_to_fit(); + } + IndexGenerator::g_mutexDocListForOcr.lock(); + IndexGenerator::g_docListForOcr.append(doc); + IndexGenerator::g_mutexDocListForOcr.unlock(); + content.clear(); + content.squeeze(); +} + diff --git a/libsearch/index/construct-document.h b/libsearch/index/construct-document.h index 99c776e..dbe0486 100644 --- a/libsearch/index/construct-document.h +++ b/libsearch/index/construct-document.h @@ -48,6 +48,16 @@ protected: private: QString m_path; }; + +class ConstructDocumentForOcr : public QRunnable { +public: + explicit ConstructDocumentForOcr(QString path); + ~ConstructDocumentForOcr() = default; +protected: + void run(); +private: + QString m_path; +}; } #endif // CONSTRUCTDOCUMENT_H diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index 6a05962..1bf2897 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -20,6 +20,7 @@ #include "file-reader.h" #include "file-utils.h" #include "binary-parser.h" +#include "ocrobject.h" using namespace UkuiSearch; FileReader::FileReader(QObject *parent) : QObject(parent) { @@ -41,6 +42,8 @@ void FileReader::getTextContent(QString path, QString &textContent) { searchdata.RunParser(path, textContent); } else if (strsfx == "pdf") { FileUtils::getPdfTextContent(path, textContent); + } else if (strsfx == "png" || strsfx == "jpg" || strsfx == "jpeg"){ + OcrObject::getInstance()->getTxtContent(path, textContent);; } return; } diff --git a/libsearch/index/first-index.cpp b/libsearch/index/first-index.cpp index 51c0e35..41ff695 100644 --- a/libsearch/index/first-index.cpp +++ b/libsearch/index/first-index.cpp @@ -39,6 +39,9 @@ FirstIndex::~FirstIndex() { if(this->q_content_index) delete this->q_content_index; this->q_content_index = nullptr; + if(this->m_ocr_index) + delete this->m_ocr_index; + this->m_ocr_index = nullptr; if(this->p_indexGenerator) delete this->p_indexGenerator; this->p_indexGenerator = nullptr; @@ -48,10 +51,10 @@ FirstIndex::~FirstIndex() { void FirstIndex::DoSomething(const QFileInfo& fileInfo) { // qDebug() << "there are some shit here"<q_index->enqueue(QVector() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0")); - if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) - && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()]) - && (!FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath()))) { - //this->q_content_index->enqueue(fileInfo.absoluteFilePath()); + if (fileInfo.fileName().split(".", QString::SkipEmptyParts).length() < 2) + return; + if (true == targetFileTypeMap[fileInfo.fileName().split(".").last()] + and false == FileUtils::isEncrypedOrUnreadable(fileInfo.absoluteFilePath())) { if (fileInfo.fileName().split(".").last() == "docx") { QuaZip file(fileInfo.absoluteFilePath()); if(!file.open(QuaZip::mdUnzip)) @@ -93,6 +96,8 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) { } else { this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); } + } else if (true == targetPhotographTypeMap[fileInfo.fileName().split(".").last()]) { + this->m_ocr_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); } } @@ -120,6 +125,7 @@ void FirstIndex::run() { this->q_index = new QQueue>(); this->q_content_index = new QQueue>(); + this->m_ocr_index = new QQueue>(); int fifo_fd; char buffer[2]; @@ -214,7 +220,34 @@ void FirstIndex::run() { qDebug() << "content index end;"; sem.release(2); }); - + //OCR功能暂时屏蔽 +// QtConcurrent::run(&m_pool,[&]() { +// sem.acquire(5); +// QQueue* tmpOcr = new QQueue(); +// qDebug() << "m_ocr_index:" << m_ocr_index->size(); +// while(!this->m_ocr_index->empty()) { +// qint64 fileSize = 0; +// //一次处理的数据量文件总大小为50M以下,50M为暂定值 +// for(size_t i = 0;/* (i < 30) && (fileSize < 52428800) && */(!this->m_ocr_index->empty()); ++i) { +// QPair tempPair = this->m_ocr_index->dequeue(); +// fileSize += tempPair.second; +// if (fileSize > 52428800) { +// if (tmpOcr->size() == 0) { +// tmpOcr->enqueue(tempPair.first); +// break; +// } +// this->m_ocr_index->enqueue(tempPair); +// break; +// } +// tmpOcr->enqueue(tempPair.first); +// } +// this->p_indexGenerator->creatOcrIndex(tmpOcr); +// tmpOcr->clear(); +// } +// delete tmpOcr; +// qDebug() << "OCR index end;"; +// sem.release(5); +// }); mutex1.lock(); mutex2.lock(); mutex3.lock(); @@ -223,14 +256,15 @@ void FirstIndex::run() { mutex2.unlock(); mutex3.unlock(); - - if(this->q_index) delete this->q_index; this->q_index = nullptr; if(this->q_content_index) delete this->q_content_index; this->q_content_index = nullptr; + if(this->m_ocr_index) + delete this->m_ocr_index; + this->m_ocr_index = nullptr; if(p_indexGenerator) delete p_indexGenerator; p_indexGenerator = nullptr; diff --git a/libsearch/index/first-index.h b/libsearch/index/first-index.h index 83b9262..635836d 100644 --- a/libsearch/index/first-index.h +++ b/libsearch/index/first-index.h @@ -60,7 +60,8 @@ private: // QQueue* q_content_index; //修改QQueue存储数据为QPair,增加存储文件大小数据便于处理时统计--jxx20210519 QQueue>* q_content_index; - + //新增ocr队列存储ocr可识别处理的图片信息及大小; + QQueue>* m_ocr_index; //xapian will auto commit per 10,000 changes, donnot change it!!! const size_t u_send_length = 8192; }; diff --git a/libsearch/index/index-generator.cpp b/libsearch/index/index-generator.cpp index fb93174..ea7f7d8 100644 --- a/libsearch/index/index-generator.cpp +++ b/libsearch/index/index-generator.cpp @@ -33,6 +33,7 @@ #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString() #define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString() +#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString() using namespace UkuiSearch; @@ -44,8 +45,11 @@ QMutex IndexGenerator::m_mutex; //QMutex UkuiSearch::g_mutexDocListForContent; QMutex IndexGenerator::g_mutexDocListForPath; QMutex IndexGenerator::g_mutexDocListForContent; +QMutex IndexGenerator::g_mutexDocListForOcr; QVector IndexGenerator::g_docListForPath = QVector(); QVector IndexGenerator::g_docListForContent = QVector(); +QVector IndexGenerator::g_docListForOcr = QVector(); + IndexGenerator *IndexGenerator::getInstance(bool rebuild, QObject *parent) { QMutexLocker locker(&m_mutex); @@ -134,6 +138,44 @@ bool IndexGenerator::creatAllIndex(QQueue *messageList) { } +bool IndexGenerator::creatOcrIndex(QQueue *messageList) +{ + HandleOcrPathList(messageList); + if(IndexGenerator::g_docListForOcr.isEmpty()) { + return false; + } + int size = IndexGenerator::g_docListForOcr.size(); + qDebug() << "begin creatAllIndex for ocr" << size; + if(!size == 0) { + try { + int count = 0; + for(Document i : IndexGenerator::g_docListForOcr) { + if(!i.isRequiredDeleted()) { + m_database_ocr->replace_document(i.getUniqueTerm(), i.getXapianDocument()); + } else { + m_database_ocr->delete_document(i.getUniqueTerm()); + } + if(++count > 999) { + count = 0; + m_database_ocr->commit(); + } + } + m_database_ocr->commit(); + } catch(const Xapian::Error &e) { + qWarning() << "creat ocr Index fail!" << QString::fromStdString(e.get_description()); + IndexStatusRecorder::getInstance()->setStatus(CONTENT_INDEX_DATABASE_STATE, "1"); + assert(false); + } + qDebug() << "finish creatAllIndex for ocr"; + + IndexGenerator::g_docListForOcr.clear(); + IndexGenerator::g_docListForOcr.squeeze(); + QVector().swap(IndexGenerator::g_docListForOcr); + malloc_trim(0); + } + return true; +} + IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) { QDir database(QString::fromStdString(INDEX_PATH)); @@ -153,6 +195,7 @@ IndexGenerator::IndexGenerator(bool rebuild, QObject *parent) : QObject(parent) m_database_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); + m_database_ocr = new Xapian::WritableDatabase(OCR_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); } IndexGenerator::~IndexGenerator() { @@ -165,8 +208,11 @@ IndexGenerator::~IndexGenerator() { if(m_database_content) m_database_content->~WritableDatabase(); // delete m_database_content; + if(m_database_ocr) + m_database_ocr->~WritableDatabase(); m_database_path = nullptr; m_database_content = nullptr; + m_database_ocr = nullptr; global_instance = nullptr; // if(m_index_map) // delete m_index_map; @@ -266,28 +312,25 @@ void IndexGenerator::HandlePathList(QQueue *messageList) { pool.start(constructer); } qDebug() << "pool finish" << pool.waitForDone(-1); -// if(constructer) -// delete constructer; -// constructer = nullptr; - -// QFuture future = QtConcurrent::mapped(*messageList,&IndexGenerator::GenerateContentDocument); - -// future.waitForFinished(); -// ChineseSegmentation::getInstance()->~ChineseSegmentation(); - -// QList docList = future.results(); -// mg_docListForContent = new QList(docList); - -// qDebug()<size(); - -// QList docList = future.results(); -// mg_docListForContent = new QList(docList); -// mg_docListForContent = std::move(future.results()); -// future.cancel(); - qDebug() << "Finish HandlePathList for content index!"; return; +} +void IndexGenerator::HandleOcrPathList(QQueue *messageList) +{ + qDebug() << "Begin HandlePathList for ocr index!"; + qDebug() << messageList->size(); + ConstructDocumentForOcr *constructer; + QThreadPool pool; + pool.setMaxThreadCount(1); + pool.setExpiryTimeout(100); + while(!messageList->isEmpty()) { + constructer = new ConstructDocumentForOcr(messageList->dequeue()); + pool.start(constructer); + } + qDebug() << "pool finish" << pool.waitForDone(-1); + qDebug() << "Finish HandlePathList for content index!"; + return; } //deprecated Document IndexGenerator::GenerateDocument(const QVector &list) { @@ -460,10 +503,13 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) { m_database_path->delete_document(uniqueterm); m_database_content->delete_document(uniqueterm); + m_database_ocr->delete_document(uniqueterm); //delete all files under it if it's a dir. m_database_path->delete_document(upterm); m_database_content->delete_document(upterm); + m_database_ocr->delete_document(upterm); + qDebug() << "delete path" << doc; // qDebug() << "delete md5" << QString::fromStdString(uniqueterm); @@ -472,6 +518,7 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) { } m_database_path->commit(); m_database_content->commit(); + m_database_ocr->commit(); qDebug() << "--delete finish--"; } catch(const Xapian::Error &e) { qWarning() << QString::fromStdString(e.get_description()); @@ -503,43 +550,85 @@ bool IndexGenerator::deleteContentIndex(QStringList *pathlist) return true; } +bool IndexGenerator::deleteOcrIndex(QStringList *pathlist) +{ + if(pathlist->isEmpty()) + return true; + try { + qDebug() << "--delete start--"; + for(int i = 0; i < pathlist->size(); i++) { + QString doc = pathlist->at(i); + std::string uniqueterm = FileUtils::makeDocUterm(doc); + m_database_ocr->delete_document(uniqueterm); + qDebug() << "delete path" << doc; + } + m_database_ocr->commit(); + qDebug() << "--delete finish--"; + } catch(const Xapian::Error &e) { + qWarning() << QString::fromStdString(e.get_description()); + return false; + } + return true; +} + bool IndexGenerator::updateIndex(QVector *pendingFiles) { QQueue> *fileIndexInfo = new QQueue>; QQueue *fileContentIndexInfo = new QQueue; + QQueue *fileOcrIndexInfo = new QQueue; QStringList *deleteList = new QStringList; QStringList *contentDeleteList = new QStringList; - for(PendingFile file : *pendingFiles) { - if(file.shouldRemoveIndex()) { - + for (PendingFile file : *pendingFiles) { + if (file.shouldRemoveIndex()) { deleteList->append(file.path()); continue; } fileIndexInfo->append(QVector() << file.path().section("/" , -1) << file.path() << QString(file.isDir() ? "1" : "0")); - if((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) { - if(!FileUtils::isEncrypedOrUnreadable(file.path())) { + if ((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) { + if (!FileUtils::isEncrypedOrUnreadable(file.path())) { fileContentIndexInfo->append(file.path()); } else { contentDeleteList->append(file.path()); } } - } - if(!deleteList->isEmpty()) { + if (!deleteList->isEmpty()) { deleteAllIndex(deleteList); } - if(!contentDeleteList->isEmpty()) { + if (!contentDeleteList->isEmpty()) { deleteContentIndex(contentDeleteList); } - if(!fileIndexInfo->isEmpty()) { + if (!fileIndexInfo->isEmpty()) { creatAllIndex(fileIndexInfo); } - if(!fileContentIndexInfo->isEmpty()) { + if (!fileContentIndexInfo->isEmpty()) { creatAllIndex(fileContentIndexInfo); } - delete fileIndexInfo; - delete fileContentIndexInfo; + if (!fileOcrIndexInfo->isEmpty()) { + creatOcrIndex(fileOcrIndexInfo); + } + if (fileIndexInfo) { + delete fileIndexInfo; + fileIndexInfo = nullptr; + } + if (fileContentIndexInfo) { + delete fileContentIndexInfo; + fileContentIndexInfo = nullptr; + } + if (fileOcrIndexInfo) { + delete fileOcrIndexInfo; + fileOcrIndexInfo = nullptr; + } + if (deleteList) { + delete deleteList; + deleteList = nullptr; + } + if (contentDeleteList) { + delete contentDeleteList; + contentDeleteList = nullptr; + } + return true; } diff --git a/libsearch/index/index-generator.h b/libsearch/index/index-generator.h index 252af64..d7ecdce 100644 --- a/libsearch/index/index-generator.h +++ b/libsearch/index/index-generator.h @@ -45,6 +45,7 @@ namespace UkuiSearch { class IndexGenerator : public QObject { friend class ConstructDocumentForPath; friend class ConstructDocumentForContent; + friend class ConstructDocumentForOcr; Q_OBJECT public: static IndexGenerator *getInstance(bool rebuild = false, QObject *parent = nullptr); @@ -61,8 +62,10 @@ Q_SIGNALS: public Q_SLOTS: bool creatAllIndex(QQueue> *messageList); bool creatAllIndex(QQueue *messageList); + bool creatOcrIndex(QQueue *messageList); bool deleteAllIndex(QStringList *pathlist); bool deleteContentIndex(QStringList *pathlist); + bool deleteOcrIndex(QStringList *pathlist); bool updateIndex(QVector *pendingFiles); private: @@ -72,6 +75,8 @@ private: void HandlePathList(QQueue > *messageList); //For file content index void HandlePathList(QQueue *messageList); + //For ocr index + void HandleOcrPathList(QQueue *messageList); static Document GenerateDocument(const QVector &list); static Document GenerateContentDocument(const QString &list); //add one data in database @@ -82,10 +87,13 @@ private: static QMutex g_mutexDocListForPath; static QVector g_docListForContent; static QMutex g_mutexDocListForContent; + static QVector g_docListForOcr; + static QMutex g_mutexDocListForOcr; QMap m_index_map; QString m_index_data_path; Xapian::WritableDatabase* m_database_path; Xapian::WritableDatabase* m_database_content; + Xapian::WritableDatabase* m_database_ocr; std::string m_docstr; std::string m_index_text_str; Xapian::TermGenerator m_indexer; diff --git a/libsearch/index/index.pri b/libsearch/index/index.pri index 1b41386..cfae419 100644 --- a/libsearch/index/index.pri +++ b/libsearch/index/index.pri @@ -9,6 +9,7 @@ HEADERS += \ $$PWD/index-generator.h \ $$PWD/index-status-recorder.h \ $$PWD/inotify-watch.h \ + $$PWD/ocrobject.h \ $$PWD/pending-file-queue.h \ $$PWD/pending-file.h \ $$PWD/search-manager.h \ @@ -25,6 +26,7 @@ SOURCES += \ $$PWD/index-generator.cpp \ $$PWD/index-status-recorder.cpp \ $$PWD/inotify-watch.cpp \ + $$PWD/ocrobject.cpp \ $$PWD/pending-file-queue.cpp \ $$PWD/pending-file.cpp \ $$PWD/search-manager.cpp \ diff --git a/libsearch/index/ocrobject.cpp b/libsearch/index/ocrobject.cpp new file mode 100644 index 0000000..4ecbcc6 --- /dev/null +++ b/libsearch/index/ocrobject.cpp @@ -0,0 +1,98 @@ +#include "ocrobject.h" + +OcrObject *OcrObject::m_instance = nullptr; +once_flag g_instanceFlag; + +OcrObject *OcrObject::getInstance() +{ + std::call_once(g_instanceFlag, [] () { + m_instance = new OcrObject; + }); + return m_instance; +} + +void OcrObject::getTxtContent(QString &path, QString &textcontent) +{ + m_api = new tesseract::TessBaseAPI(); + if (m_api->Init(NULL, "chi_sim")) { + qDebug() << "Could not initialize tesseract.\n"; + return; + } + m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 + + Pix *image = pixRead(path.toStdString().data()); + if (!image) { + qDebug() << "path:" << path <<" pixRead error!"; + if (m_api) { + m_api->End(); + delete m_api; + m_api = nullptr; + } + return; + } + m_api->SetImage(image); + textcontent = m_api->GetUTF8Text(); + qDebug() << "path:" << path << " Text:" << textcontent; + pixDestroy(&image); + m_api->Clear(); + + if (m_api) { + m_api->End(); + delete m_api; + m_api = nullptr; + } + +//多进程版本 +// tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI(); +// if (api->Init(NULL, "chi_sim")) { +// qDebug() << "Could not initialize tesseract.\n"; +// return; +// } +// api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 + +// Pix *image = pixRead(path.toStdString().data()); +// if (!image) { +// qDebug() << "path:" << path <<" pixRead error!"; +// if (api) { +// api->End(); +// delete api; +// api = nullptr; +// } +// return; +// } +// api->SetImage(image); +// textcontent = api->GetUTF8Text(); +// qDebug() << "path:" << path << " Text:" << textcontent; +// pixDestroy(&image); +// api->Clear(); + +// if (api) { +// api->End(); +// delete api; +// api = nullptr; +// } +} + +OcrObject::OcrObject(QObject *parent) : QObject(parent) +{ + init(); +} + +OcrObject::~OcrObject() +{ + if (m_api) { + m_api->End(); + delete m_api; + m_api = nullptr; + } +} + +void OcrObject::init() +{ + m_api = new tesseract::TessBaseAPI(); + if (m_api->Init(NULL, "chi_sim")) { + qDebug() << "Could not initialize tesseract.\n"; + return; + } + m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 +} diff --git a/libsearch/index/ocrobject.h b/libsearch/index/ocrobject.h new file mode 100644 index 0000000..b686352 --- /dev/null +++ b/libsearch/index/ocrobject.h @@ -0,0 +1,41 @@ +#ifndef OCROBJECT_H +#define OCROBJECT_H + +#include +#include +#include +#include +#include + +using namespace std; +class OcrObject : public QObject +{ + Q_OBJECT +public: + static OcrObject* getInstance(); + + void getTxtContent(QString &path, QString &textcontent); + +protected: + explicit OcrObject(QObject *parent = nullptr); + ~OcrObject(); + +private: + static OcrObject *m_instance; + + tesseract::TessBaseAPI *m_api = nullptr; + void init(); + + class Garbo + { + public: + ~Garbo() { + if (OcrObject::m_instance) + delete OcrObject::m_instance; + } + static Garbo g_garbo; + }; + +}; + +#endif // OCROBJECT_H diff --git a/libsearch/index/search-manager.cpp b/libsearch/index/search-manager.cpp index c7d5114..7024c92 100644 --- a/libsearch/index/search-manager.cpp +++ b/libsearch/index/search-manager.cpp @@ -19,15 +19,17 @@ */ #include "search-manager.h" using namespace UkuiSearch; + size_t SearchManager::uniqueSymbolFile = 0; size_t SearchManager::uniqueSymbolDir = 0; size_t SearchManager::uniqueSymbolContent = 0; +size_t SearchManager::uniqueSymbolOcr = 0; QMutex SearchManager::m_mutexFile; QMutex SearchManager::m_mutexDir; QMutex SearchManager::m_mutexContent; +QMutex SearchManager::m_mutexOcr; + SearchManager::SearchManager(QObject *parent) : QObject(parent) { - m_pool.setMaxThreadCount(3); - m_pool.setExpiryTimeout(1000); } SearchManager::~SearchManager() { @@ -43,39 +45,6 @@ int SearchManager::getCurrentIndexCount() { } } -void SearchManager::onKeywordSearch(QString keyword, QQueue *searchResultFile, QQueue *searchResultDir, - QQueue> *searchResultContent) { -// m_mutexFile.lock(); -// ++uniqueSymbolFile; -// m_mutexFile.unlock(); -// m_mutexDir.lock(); -// ++uniqueSymbolDir; -// m_mutexDir.unlock(); -// m_mutexContent.lock(); -// ++uniqueSymbolContent; -// m_mutexContent.unlock(); -// if(FileUtils::SearchMethod::DIRECTSEARCH == FileUtils::searchMethod) { -// DirectSearch *directSearch; -// directSearch = new DirectSearch(keyword, searchResultFile, searchResultDir, uniqueSymbolFile); -// m_pool.start(directSearch); -// } else if(FileUtils::SearchMethod::INDEXSEARCH == FileUtils::searchMethod) { -// FileSearch *filesearch; -// filesearch = new FileSearch(searchResultFile, uniqueSymbolFile, keyword, "0", 1, 0, 5); -// m_pool.start(filesearch); - -// FileSearch *dirsearch; -// dirsearch = new FileSearch(searchResultDir, uniqueSymbolDir, keyword, "1", 1, 0, 5); -// m_pool.start(dirsearch); - -// FileContentSearch *contentSearch; -// contentSearch = new FileContentSearch(searchResultContent, uniqueSymbolContent, keyword, 0, 5); -// m_pool.start(contentSearch); -// } else { -// qWarning() << "Unknown search method! FileUtils::searchMethod: " << static_cast(FileUtils::searchMethod); -// } - return; -} - bool SearchManager::isBlocked(QString &path) { QStringList blockList = GlobalSettings::getInstance()->getBlockDirs(); for(QString i : blockList) { @@ -101,6 +70,7 @@ bool SearchManager::creatResultInfo(SearchPluginIface::ResultInfo &ri, QString p ri.type = 0; return true; } + FileSearch::FileSearch(DataQueue *searchResult, size_t uniqueSymbol, QString keyword, QString value, unsigned slot, int begin, int num) { this->setAutoDelete(true); m_search_result = searchResult; @@ -428,6 +398,121 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) { return 0; } +OcrSearch::OcrSearch(DataQueue *searchResult, size_t uniqueSymbol, QString keyword, int begin, int num) { + this->setAutoDelete(true); + m_search_result = searchResult; + m_uniqueSymbol = uniqueSymbol; + m_keyword = keyword; + m_begin = begin; + m_num = num; + m_matchDecider = new OcrMatchDecider(); +} + +OcrSearch::~OcrSearch() { + m_search_result = nullptr; + if(m_matchDecider) + delete m_matchDecider; +} + +void OcrSearch::run() { + SearchManager::m_mutexOcr.lock(); + if(!m_search_result->isEmpty()) { + m_search_result->clear(); + } + SearchManager::m_mutexOcr.unlock(); + + //这里同文件搜索,待优化。 + m_begin = 0; + m_num = 100; + int resultCount = 1; + int totalCount = 0; + while(resultCount > 0) { + resultCount = keywordSearchOcr(); + m_begin += m_num; + totalCount += resultCount; + } + qDebug() << "Total count:" << totalCount; + return; +} + +int OcrSearch::keywordSearchOcr() { + try { + qDebug() << "--keywordSearch OCR search start--"; + Xapian::Database db(OCR_INDEX_PATH); + Xapian::Enquire enquire(db); + Xapian::QueryParser qp; + qp.set_default_op(Xapian::Query::OP_AND); + qp.set_database(db); + QVector sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString()); + //Creat a query + std::string words; + for(int i = 0; i < sKeyWord.size(); i++) { + words.append(sKeyWord.at(i).word).append(" "); + } + std::vector v; + for(int i=0; ienqueue(ri); + SearchManager::m_mutexOcr.unlock(); + } else { + SearchManager::m_mutexOcr.unlock(); + return -1; + } + } + return 0; +} + DirectSearch::DirectSearch(QString keyword, DataQueue *searchResult, QString value, size_t uniqueSymbol) { this->setAutoDelete(true); m_keyword = keyword; @@ -521,3 +606,12 @@ bool FileContentMatchDecider::operator ()(const Xapian::Document &doc) const } return true; } + +bool OcrMatchDecider::operator ()(const Xapian::Document &doc) const +{ + QString path = QString::fromStdString(doc.get_value(1)); + if(SearchManager::isBlocked(path)) { + return false; + } + return true; +} diff --git a/libsearch/index/search-manager.h b/libsearch/index/search-manager.h index 36d5bf2..006ae0e 100644 --- a/libsearch/index/search-manager.h +++ b/libsearch/index/search-manager.h @@ -45,15 +45,19 @@ #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString() #define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString() +#define OCR_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/ocr_index_data").toStdString() namespace UkuiSearch { class FileMatchDecider; class FileContentMatchDecider; +class OcrMatchDecider; class LIBSEARCH_EXPORT SearchManager : public QObject { friend class FileSearch; friend class FileContentSearch; + friend class OcrSearch; friend class DirectSearch; friend class FileMatchDecider; friend class FileContentMatchDecider; + friend class OcrMatchDecider; Q_OBJECT public: explicit SearchManager(QObject *parent = nullptr); @@ -64,22 +68,15 @@ public: static size_t uniqueSymbolFile; static size_t uniqueSymbolDir; static size_t uniqueSymbolContent; + static size_t uniqueSymbolOcr; static QMutex m_mutexFile; static QMutex m_mutexDir; static QMutex m_mutexContent; + static QMutex m_mutexOcr; -public Q_SLOTS: - void onKeywordSearch(QString keyword, QQueue *searchResultFile, QQueue *searchResultDir, QQueue> *searchResultContent); - -Q_SIGNALS: - void resultFile(QQueue *); - void resultDir(QQueue *); - void resultContent(QQueue> *); private: static bool isBlocked(QString &path); static bool creatResultInfo(UkuiSearch::SearchPluginIface::ResultInfo &ri, QString path); - - QThreadPool m_pool; }; class FileSearch : public QRunnable { @@ -121,6 +118,24 @@ private: int m_num = 20; }; +class OcrSearch : public QRunnable { +public: + explicit OcrSearch(DataQueue *searchResult, size_t uniqueSymbol, QString keyword, int begin = 0, int num = 20); + ~OcrSearch(); +protected: + void run(); +private: + int keywordSearchOcr(); + int getResult(Xapian::MSet &result, std::string &keyWord); + + DataQueue *m_search_result = nullptr; + OcrMatchDecider *m_matchDecider; + size_t m_uniqueSymbol; + QString m_keyword; + int m_begin = 0; + int m_num = 20; +}; + class DirectSearch : public QRunnable { public: explicit DirectSearch(QString keyword, DataQueue *searchResult, QString value, size_t uniqueSymbol); @@ -133,11 +148,15 @@ private: QString m_value; }; -class FileMatchDecider :public Xapian::MatchDecider { +class FileMatchDecider : public Xapian::MatchDecider { public: bool operator ()(const Xapian::Document &doc) const; }; -class FileContentMatchDecider :public Xapian::MatchDecider { +class FileContentMatchDecider : public Xapian::MatchDecider { +public: + bool operator ()(const Xapian::Document &doc) const; +}; +class OcrMatchDecider : public Xapian::MatchDecider { public: bool operator ()(const Xapian::Document &doc) const; }; diff --git a/libsearch/libsearch.pro b/libsearch/libsearch.pro index 6f3f9a1..97bb8f0 100644 --- a/libsearch/libsearch.pro +++ b/libsearch/libsearch.pro @@ -41,7 +41,7 @@ include(dirwatcher/dirwatcher.pri) include(mailsearch/mailsearch.pri) LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation -LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg#-L/usr/local/lib/libjemalloc -ljemalloc +LIBS += -lxapian -lquazip5 -luchardet -lQt5Xdg -ltesseract #-L/usr/local/lib/libjemalloc -ljemalloc SOURCES += \ file-utils.cpp \