Merge branch 'ukss' into 'ukss-dev'

文本内容搜索扩展OCR功能;

See merge request kylin-desktop/ukui-search!288
This commit is contained in:
PengfeiZhang 2022-04-14 01:17:21 +00:00
commit 324da7cb11
12 changed files with 175 additions and 109 deletions

View File

@ -13,6 +13,7 @@ static const QString OCR_INDEX_PATH = HOME_PATH + QStringLiteral("/.config/org.u
static const QString FILE_SEARCH_VALUE = QStringLiteral("0");
static const QString DIR_SEARCH_VALUE = QStringLiteral("1");
static const QString INDEX_SEM = QStringLiteral("ukui-search-index-sem");
// Minimum pixel dimension an image must have, in both height and width, for
// FileUtils::isOcrSupportSize() to accept it for OCR indexing.
static const int OCR_MIN_SIZE = 200;
static const QStringList allAppPath = {
{HOME_PATH + "/.local/share/applications/"},
@ -37,6 +38,20 @@ static const QMap<QString, bool> targetFileTypeMap = {
// Image file suffixes known to the OCR pipeline. Callers test the mapped value,
// so only suffixes mapped to true are sent to OCR; suffixes mapped to false are
// listed but currently disabled.
// TODO: incomplete; to be moved into a configuration file later.
static const QMap<QString, bool> targetPhotographTypeMap = {
    // Enabled formats.
    {"bmp", true},
    {"gif", true},
    {"jpe", true},
    {"jpeg", true},
    {"jpg", true},
    {"png", true},
    {"tif", true},
    {"tiff", true},
    {"webp", true},

    // Disabled formats.
    {"dib", false},
    {"hdr", false},
    {"heif", false},
    {"j2k", false},
    {"jng", false},
    {"jp2", false},
    {"psd", false},
    {"xpm", false} // pix read error.
};

View File

@ -961,6 +961,28 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
}
}
/**
 * Tells whether the image at path is large enough to be OCR-indexed.
 *
 * The image is loaded through QImage and accepted only when both its height
 * and its width are at least OCR_MIN_SIZE pixels. A rejection is logged.
 * NOTE(review): an image QImage cannot load presumably reports 0x0 and is
 * therefore rejected as well - confirm against the QImage documentation.
 *
 * A Leptonica (pixRead) based variant of this check used to sit here as
 * commented-out code; the QImage version is the one in use.
 *
 * @param path Path of the image file to inspect.
 * @return true when the image meets the minimum size, false otherwise.
 */
bool FileUtils::isOcrSupportSize(QString path)
{
    const QImage image(path);
    // Reject images that are too small in either dimension.
    const bool largeEnough = image.height() >= OCR_MIN_SIZE && image.width() >= OCR_MIN_SIZE;
    if (!largeEnough) {
        qDebug() << "file:" << path << "is not right size.";
    }
    return largeEnough;
}
QString FileUtils::getHtmlText(const QString &text, const QString &keyword)
{
QString htmlString;

View File

@ -54,6 +54,8 @@
#include "libsearch_global.h"
#include "common.h"
#include <leptonica/allheaders.h>
//#define INITIAL_STATE 0
//#define CREATING_INDEX 1
//#define FINISH_CREATING_INDEX 2
@ -97,6 +99,7 @@ public:
static QIcon iconFromTheme(const QString &name, const QIcon &iconDefault);
static bool isOpenXMLFileEncrypted(QString &path);
static bool isEncrypedOrUnreadable(QString path);
static bool isOcrSupportSize(QString path);
static size_t maxIndexCount;
static unsigned short indexStatus;

View File

@ -102,12 +102,14 @@ void ConstructDocumentForContent::run() {
// qDebug() << "ConstructDocumentForContent currentThreadId()" << QThread::currentThreadId();
//构造文本索引的document
QString content;
FileReader::getTextContent(m_path, content);
QString suffix;
FileReader::getTextContent(m_path, content, suffix);
Document doc;
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
doc.addValue(1, m_path);
doc.addValue(2, suffix);
if(content.isEmpty()) {
doc.reuireDeleted();
@ -140,12 +142,14 @@ ConstructDocumentForOcr::ConstructDocumentForOcr(QString path)
void ConstructDocumentForOcr::run()
{
QString content;
FileReader::getTextContent(m_path, content);
QString suffix;
FileReader::getTextContent(m_path, content, suffix);
Document doc;
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
doc.addValue(1, m_path);
doc.addValue(2, suffix);
if(content.isEmpty()) {
doc.reuireDeleted();

View File

@ -25,25 +25,26 @@ using namespace UkuiSearch;
// Constructs a FileReader; parent is forwarded to QObject and nothing else is set up.
FileReader::FileReader(QObject *parent)
    : QObject(parent)
{
}
void FileReader::getTextContent(QString path, QString &textContent) {
void FileReader::getTextContent(QString path, QString &textContent, QString &suffix) {
QFileInfo file(path);
QString strsfx = file.suffix();
if (strsfx == "docx") {
suffix = file.suffix();
if (suffix == "docx") {
FileUtils::getDocxTextContent(path, textContent);
} else if (strsfx == "pptx") {
} else if (suffix == "pptx") {
FileUtils::getPptxTextContent(path, textContent);
} else if (strsfx == "xlsx") {
} else if (suffix == "xlsx") {
FileUtils::getXlsxTextContent(path, textContent);
} else if (strsfx == "txt") {
} else if (suffix == "txt") {
FileUtils::getTxtContent(path, textContent);
} else if (strsfx == "doc" || strsfx == "dot" || strsfx == "wps" || strsfx == "ppt" ||
strsfx == "pps" || strsfx == "dps" || strsfx == "et" || strsfx == "xls") {
} else if (suffix == "doc" || suffix == "dot" || suffix == "wps" || suffix == "ppt" ||
suffix == "pps" || suffix == "dps" || suffix == "et" || suffix == "xls") {
KBinaryParser searchdata;
searchdata.RunParser(path, textContent);
} else if (strsfx == "pdf") {
} else if (suffix == "pdf") {
FileUtils::getPdfTextContent(path, textContent);
} else if (strsfx == "png" || strsfx == "jpg" || strsfx == "jpeg"){
OcrObject::getInstance()->getTxtContent(path, textContent);;
} else if (true == targetPhotographTypeMap[suffix]){
OcrObject::getInstance()->getTxtContent(path, textContent);
}
return;
}

View File

@ -28,7 +28,7 @@ class FileReader : public QObject {
public:
explicit FileReader(QObject *parent = nullptr);
~FileReader() = default;
static void getTextContent(QString path, QString &textContent);
static void getTextContent(QString path, QString &textContent, QString &suffix);
};
}

View File

@ -4,6 +4,9 @@
#include <QLabel>
#include <QHBoxLayout>
#include <QMessageBox>
#define OCR_ICONLABLE_WITH 352
#define OCR_ICONLABLE_HEIGHT 256
using namespace UkuiSearch;
FileSearchPlugin::FileSearchPlugin(QObject *parent) : QObject(parent)
@ -471,10 +474,27 @@ void FileContengSearchPlugin::openAction(int actionkey, QString key, int type)
QWidget *FileContengSearchPlugin::detailPage(const ResultInfo &ri)
{
if (1 == ri.type) {
QPixmap pixmap;
pixmap.load(ri.actionKey);
if (pixmap.width()/OCR_ICONLABLE_WITH > pixmap.height()/OCR_ICONLABLE_HEIGHT) {
pixmap = pixmap.scaled(OCR_ICONLABLE_WITH, (pixmap.height()*OCR_ICONLABLE_WITH)/pixmap.width(), Qt::KeepAspectRatio, Qt::SmoothTransformation);
} else {
pixmap = pixmap.scaled((pixmap.width()*OCR_ICONLABLE_HEIGHT)/pixmap.height(), OCR_ICONLABLE_HEIGHT, Qt::KeepAspectRatio, Qt::SmoothTransformation);
}
m_iconLabel->setPixmap(pixmap);
m_pluginLabel->setText(tr("OCR"));
m_detailLyt->setContentsMargins(8, (OCR_ICONLABLE_HEIGHT-pixmap.height())/2+8, 16, 0);
m_snippetLabel->hide();
} else {
m_iconLabel->setPixmap(ri.icon.pixmap(120, 120));
m_pluginLabel->setText(tr("File"));
m_snippetLabel->setText(getHtmlText(wrapData(m_snippetLabel,ri.description.at(0).value), m_keyWord));
m_snippetLabel->show();
m_detailLyt->setContentsMargins(8, 50, 16, 0);
}
m_currentActionKey = ri.actionKey;
m_iconLabel->setPixmap(ri.icon.pixmap(120, 120));
m_pluginLabel->setText(tr("File"));
QFontMetrics fontMetrics = m_nameLabel->fontMetrics();
QString showname = fontMetrics.elidedText(ri.name, Qt::ElideRight, 215); //当字体长度超过215时显示为省略号
m_nameLabel->setText(FileUtils::setAllTextBold(showname));
@ -484,7 +504,6 @@ QWidget *FileContengSearchPlugin::detailPage(const ResultInfo &ri)
m_nameLabel->setToolTip("");
}
m_snippetLabel->setText(getHtmlText(wrapData(m_snippetLabel,ri.description.at(0).value), m_keyWord));
m_pathLabel2->setText(m_pathLabel2->fontMetrics().elidedText(m_currentActionKey, Qt::ElideRight, m_pathLabel2->width()));
m_pathLabel2->setToolTip(m_currentActionKey);
m_timeLabel2->setText(ri.description.at(2).value);
@ -553,10 +572,10 @@ void FileContengSearchPlugin::initDetailPage()
m_detailPage->setFixedWidth(360);
m_detailPage->setAttribute(Qt::WA_TranslucentBackground);
m_detailLyt = new QVBoxLayout(m_detailPage);
m_detailLyt->setContentsMargins(8, 0, 16, 0);
m_detailLyt->setContentsMargins(8, 50, 16, 0);
m_iconLabel = new QLabel(m_detailPage);
m_iconLabel->setAlignment(Qt::AlignCenter);
m_iconLabel->setFixedHeight(128);
//m_iconLabel->setFixedHeight(128);
m_nameFrame = new QFrame(m_detailPage);
m_nameFrameLyt = new QHBoxLayout(m_nameFrame);
@ -611,7 +630,7 @@ void FileContengSearchPlugin::initDetailPage()
m_actionFrameLyt->addWidget(m_actionLabel3);
m_actionFrame->setLayout(m_actionFrameLyt);
m_detailLyt->addSpacing(50);
// m_detailLyt->addSpacing(50);
m_detailLyt->addWidget(m_iconLabel);
m_detailLyt->addWidget(m_nameFrame);
m_detailLyt->addWidget(m_line_1);

View File

@ -102,9 +102,12 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
} else {
this->m_contentIndexData->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
}
} /*else if (true == targetPhotographTypeMap[fileInfo.fileName().split(".").last()]) {
this->m_ocrIndexData->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
}*/
} else if (true == targetPhotographTypeMap[fileInfo.fileName().split(".").last()]) {
if (FileUtils::isOcrSupportSize(fileInfo.absoluteFilePath())) {
this->m_contentIndexData->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
//this->m_ocrIndexData->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
}
}
}
void FirstIndex::run() {
@ -211,7 +214,6 @@ void FirstIndex::run() {
}
tmp2->enqueue(tempPair.first);
}
// qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";
if(!IndexGenerator::getInstance()->creatAllIndex(tmp2)) {
sucess = false;
break;
@ -225,7 +227,7 @@ void FirstIndex::run() {
}
sem.release(2);
});
// OCR功能暂时屏蔽
// OCR功能目前合到内容搜索分类中
// QtConcurrent::run(&m_pool,[&]() {
// sem.acquire(5);
// mutex3.unlock();

View File

@ -175,15 +175,15 @@ IndexGenerator::IndexGenerator(QObject *parent) : QObject(parent)
if(!database.exists()) {
qDebug() << "create content index path" << CONTENT_INDEX_PATH << database.mkpath(CONTENT_INDEX_PATH);
}
database.setPath(OCR_INDEX_PATH);
if(!database.exists()) {
qDebug() << "create ocr index path" << OCR_INDEX_PATH << database.mkpath(OCR_INDEX_PATH);
}
// database.setPath(OCR_INDEX_PATH);
// if(!database.exists()) {
// qDebug() << "create ocr index path" << OCR_INDEX_PATH << database.mkpath(OCR_INDEX_PATH);
// }
try {
m_database_path = new Xapian::WritableDatabase(INDEX_PATH.toStdString(), Xapian::DB_CREATE_OR_OPEN);
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH.toStdString(), Xapian::DB_CREATE_OR_OPEN);
m_database_ocr = new Xapian::WritableDatabase(OCR_INDEX_PATH.toStdString(), Xapian::DB_CREATE_OR_OPEN);
// m_database_ocr = new Xapian::WritableDatabase(OCR_INDEX_PATH.toStdString(), Xapian::DB_CREATE_OR_OPEN);
} catch(const Xapian::Error &e) {
qWarning() << "creat Index fail!" << QString::fromStdString(e.get_description());
IndexStatusRecorder::getInstance()->setStatus(INOTIFY_NORMAL_EXIT, "1");
@ -419,7 +419,8 @@ Document IndexGenerator::GenerateContentDocument(const QString &path) {
Document doc;
QString uniqueterm;
QString upTerm;
FileReader::getTextContent(path, content);
QString suffix;
FileReader::getTextContent(path, content, suffix);
term = ChineseSegmentation::getInstance()->callSegement(content.toStdString());
// QStringList term = content.split("");
@ -428,9 +429,9 @@ Document IndexGenerator::GenerateContentDocument(const QString &path) {
doc.setUniqueTerm(uniqueterm);
doc.addTerm(upTerm);
doc.addValue(1, path);
doc.addValue(2, suffix);
for(int i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
}
// Document doc;
@ -528,12 +529,12 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
m_database_path->delete_document(uniqueterm);
m_database_content->delete_document(uniqueterm);
m_database_ocr->delete_document(uniqueterm);
// m_database_ocr->delete_document(uniqueterm);
//delete all files under it if it's a dir.
m_database_path->delete_document(upterm);
m_database_content->delete_document(upterm);
m_database_ocr->delete_document(upterm);
// m_database_ocr->delete_document(upterm);
qDebug() << "delete path" << doc;
// qDebug() << "delete md5" << QString::fromStdString(uniqueterm);
@ -543,7 +544,7 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) {
}
m_database_path->commit();
m_database_content->commit();
m_database_ocr->commit();
// m_database_ocr->commit();
qDebug() << "--delete finish--";
} catch(const Xapian::Error &e) {
qWarning() << QString::fromStdString(e.get_description());
@ -601,7 +602,7 @@ bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
QQueue<QVector<QString>> *fileIndexInfo = new QQueue<QVector<QString>>;
QQueue<QString> *fileContentIndexInfo = new QQueue<QString>;
QQueue<QString> *fileOcrIndexInfo = new QQueue<QString>;
//QQueue<QString> *fileOcrIndexInfo = new QQueue<QString>;
QStringList *deleteList = new QStringList;
QStringList *contentDeleteList = new QStringList;
for (PendingFile file : *pendingFiles) {
@ -610,12 +611,19 @@ bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
continue;
}
fileIndexInfo->append(QVector<QString>() << file.path().section("/" , -1) << file.path() << QString(file.isDir() ? "1" : "0"));
if ((!file.path().split(".").isEmpty()) && (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()])) {
if (file.path().split(".").isEmpty()){
continue;
}
if (true == targetFileTypeMap[file.path().section("/" , -1) .split(".").last()]) {
if (!FileUtils::isEncrypedOrUnreadable(file.path())) {
fileContentIndexInfo->append(file.path());
} else {
contentDeleteList->append(file.path());
}
} else if (true == targetPhotographTypeMap[file.path().section("/" , -1) .split(".").last()]) {
if (FileUtils::isOcrSupportSize(file.path())) {
fileContentIndexInfo->append(file.path());
}
}
}
if (!deleteList->isEmpty()) {
@ -630,9 +638,9 @@ bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
if (!fileContentIndexInfo->isEmpty()) {
creatAllIndex(fileContentIndexInfo);
}
if (!fileOcrIndexInfo->isEmpty()) {
creatOcrIndex(fileOcrIndexInfo);
}
//if (!fileOcrIndexInfo->isEmpty()) {
// creatOcrIndex(fileOcrIndexInfo);
//}
if (fileIndexInfo) {
delete fileIndexInfo;
fileIndexInfo = nullptr;
@ -641,10 +649,10 @@ bool IndexGenerator::updateIndex(QVector<PendingFile> *pendingFiles)
delete fileContentIndexInfo;
fileContentIndexInfo = nullptr;
}
if (fileOcrIndexInfo) {
delete fileOcrIndexInfo;
fileOcrIndexInfo = nullptr;
}
//if (fileOcrIndexInfo) {
// delete fileOcrIndexInfo;
// fileOcrIndexInfo = nullptr;
//}
if (deleteList) {
delete deleteList;
deleteList = nullptr;

View File

@ -35,7 +35,6 @@
#include "file-reader.h"
#include "common.h"
#include "pending-file.h"
#include "common.h"
namespace UkuiSearch {
//extern QVector<Document> *_doc_list_path;

View File

@ -13,78 +13,79 @@ OcrObject *OcrObject::getInstance()
/**
 * Runs Tesseract OCR (language data "chi_sim") on the image at path and
 * stores the recognised text in textcontent.
 *
 * A TessBaseAPI instance is created and destroyed on every call; the m_api
 * member is not used by this implementation. textcontent is left untouched
 * when Tesseract cannot be initialised or when the image cannot be read.
 *
 * @param path        Path of the image file to recognise.
 * @param textcontent Receives the recognised text, decoded as UTF-8.
 */
void OcrObject::getTxtContent(QString &path, QString &textcontent)
{
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    if (api->Init(nullptr, "chi_sim")) {
        qDebug() << "Could not initialize tesseract.\n";
        delete api; // release the engine on the failure path as well
        return;
    }
    // DPI assumed for images that do not declare a resolution.
    api->SetVariable("user_defined_dpi", "1080");

    Pix *image = pixRead(path.toStdString().data());
    if (!image) {
        qDebug() << "path:" << path <<" pixRead error!";
        api->End();
        delete api;
        return;
    }

    api->SetImage(image);
    // GetUTF8Text() returns a heap buffer owned by the caller; it has to be
    // released with delete[] once the text has been copied into the QString.
    char *utf8Text = api->GetUTF8Text();
    textcontent = QString::fromUtf8(utf8Text);
    delete [] utf8Text;

    pixDestroy(&image);
    api->Clear();
    api->End();
    delete api;
}
// Constructs the OCR helper; parent is forwarded to QObject.
// The init() call is disabled: getTxtContent() builds its own Tesseract
// engine per call, so no shared engine is prepared here.
OcrObject::OcrObject(QObject *parent) : QObject(parent)
{
}

// Destroys the OCR helper.
// The m_api teardown (End() + delete) is disabled together with init().
// NOTE(review): restore it if a shared m_api engine is ever created again.
OcrObject::~OcrObject()
{
}
void OcrObject::init()

View File

@ -68,7 +68,11 @@ bool SearchManager::creatResultInfo(SearchPluginIface::ResultInfo &ri, QString p
<< SearchPluginIface::DescriptionInfo{tr("Path:"), path} \
<< SearchPluginIface::DescriptionInfo{tr("Modified time:"), info.lastModified().toString("yyyy/MM/dd hh:mm:ss")};
ri.actionKey = path;
ri.type = 0;
if (true == targetPhotographTypeMap[info.suffix()]) {
ri.type = 1;//1为ocr图片文件
} else {
ri.type = 0;//0为默认文本文件
}
return true;
}
@ -257,20 +261,7 @@ int FileContentSearch::keywordSearchContent() {
Xapian::QueryParser qp;
qp.set_default_op(Xapian::Query::OP_AND);
qp.set_database(db);
/*
::friso::ResultMap ret;
::friso::FrisoSegmentation::getInstance()->callSegement(ret, keyword.toLocal8Bit().data());
for (::friso::ResultMap::iterator it_map = ret.begin(); it_map != ret.end(); ++it_map){
target_str += it_map->first;
target_str += " ";
it_map->second.first.clear();
::std::vector<size_t>().swap(it_map->second.first);
}
ret.clear();
ret.erase(ret.begin(), ret.end());
::friso::ResultMap().swap(ret);
*/
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
//Creat a query
std::string words;
@ -318,6 +309,7 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
double docScoreWeight = it.get_weight();
Xapian::percent docScorePercent = it.get_percent();
QString path = QString::fromStdString(doc.get_value(1));
QString suffix = QString::fromStdString(doc.get_value(2));
SearchPluginIface::ResultInfo ri;
if(!SearchManager::creatResultInfo(ri, path)) {