From 8b2ab37f910ce5966bfafcaa29fc318c6095bca7 Mon Sep 17 00:00:00 2001 From: iaom Date: Wed, 8 Nov 2023 10:08:04 +0800 Subject: [PATCH] =?UTF-8?q?perf(file-index):=E4=BD=BF=E7=94=A8ukui-file-me?= =?UTF-8?q?tadata=E6=8F=90=E4=BE=9B=E7=9A=84=E6=96=87=E4=BB=B6=E5=86=85?= =?UTF-8?q?=E5=AE=B9=E8=A7=A3=E6=9E=90=E6=8E=A5=E5=8F=A3=E6=9B=BF=E6=8D=A2?= =?UTF-8?q?=E5=8E=9F=E6=9C=89=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libsearch/CMakeLists.txt | 8 +- libsearch/file-utils.cpp | 737 +-------------------- libsearch/file-utils.h | 16 +- libsearch/index/file-content-indexer.cpp | 7 +- libsearch/index/file-extraction-result.cpp | 82 +++ libsearch/index/file-extraction-result.h | 46 ++ libsearch/index/file-reader.cpp | 72 +- libsearch/index/file-reader.h | 14 +- libsearch/index/ocrobject.cpp | 119 ---- libsearch/index/ocrobject.h | 60 -- ukui-search-service/monitor.cpp | 4 +- 11 files changed, 181 insertions(+), 984 deletions(-) create mode 100644 libsearch/index/file-extraction-result.cpp create mode 100644 libsearch/index/file-extraction-result.h delete mode 100644 libsearch/index/ocrobject.cpp delete mode 100644 libsearch/index/ocrobject.h diff --git a/libsearch/CMakeLists.txt b/libsearch/CMakeLists.txt index 372e144..263cfaa 100644 --- a/libsearch/CMakeLists.txt +++ b/libsearch/CMakeLists.txt @@ -13,6 +13,7 @@ find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core DBus Widgets Xml Concurrent S find_package(PkgConfig REQUIRED) find_package(KF5WindowSystem) find_package(qt5xdg) +find_package(ukui-file-metadata) set(LIBUKUI_SEARCH_EXTERNAL_LIBS "") set(LIBUKUI_SEARCH_PC_PKGS @@ -23,9 +24,7 @@ set(LIBUKUI_SEARCH_PC_PKGS gsettings-qt poppler-qt5 kysdk-qtwidgets - lept uchardet - tesseract kysdk-systime kysdk-datacollect) @@ -67,7 +66,6 @@ set(LIBUKUI_SEARCH_SRC index/index-scheduler.cpp index/index-scheduler.h index/index-status-recorder.cpp index/index-status-recorder.h index/index-updater.cpp index/index-updater.h - index/ocrobject.cpp index/ocrobject.h index/pending-file.cpp index/pending-file.h index/pending-file-queue.cpp index/pending-file-queue.h index/search-manager.cpp index/search-manager.h @@ -103,6 +101,8 @@ set(LIBUKUI_SEARCH_SRC icon-loader.cpp icon-loader.h data-collecter.cpp data-collecter.h + index/file-extraction-result.cpp + index/file-extraction-result.h ) set(QRC_FILES resource1.qrc) file(GLOB TS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../translations/libukui-search/*.ts) @@ -169,9 +169,9 @@ target_link_libraries(libukui-search PUBLIC Qt${QT_VERSION_MAJOR}::Xml chinese-segmentation quazip5 - tesseract uchardet xapian + ukui-file-metadata ${LIBUKUI_SEARCH_EXTERNAL_LIBS} ) diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index f11c5fe..679a6c5 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -20,38 +20,22 @@ * */ #include "file-utils.h" -#include #include #include #include #include -#include #include #include #include #include #include -#include #include #include -#include #include #include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "gobject-template.h" #include "hanzi-to-pinyin.h" #include "common.h" @@ -59,155 +43,10 @@ using namespace UkuiSearch; -#define MAX_CONTENT_LENGTH 20480000 -/** - * @brief 查找elem的子节点 - * @param elem 起始节点 - * @param names 名称链 - * @param nodes 查找到的全部结果 - */ -void findNodes(const QDomElement &elem, QQueue &names, QList &nodes) -{ - QString targetName = names.dequeue(); - QDomNode node = elem.firstChild(); - while (!node.isNull()) { - QDomElement e = node.toElement(); - if (!e.isNull() && e.tagName() == targetName) { - if (names.empty()) { - nodes.append(e); - - } else { - findNodes(e, names, nodes); - break; - } - } - node = node.nextSibling(); - } -} - -void findNodesByAttr(const QDomElement &elem, QQueue &names, QList &nodes, const QString &attr, const QStringList &values) -{ - findNodes(elem, names, nodes); - - QList::iterator it = nodes.begin(); - while (it != nodes.end()) { - if ((*it).hasAttribute(attr) && values.contains((*it).attribute(attr))) { - it++; - } else { - it = nodes.erase(it); - } - } -} - -bool findNodeText(const QDomElement &elem, QQueue &names, QString &content) -{ - QList nodes; - findNodes(elem, names, nodes); - - for (const auto &node : nodes) { - content.append(node.text()); - if (content.length() >= MAX_CONTENT_LENGTH / 3) { - return true; - } - } - return false; -} - -void findNodeAttr(const QDomElement &elem, QQueue &names, const QString &attr, QStringList &attrs) -{ - QList nodes; - findNodes(elem, names, nodes); - - for (const auto &node : nodes) { - if (node.hasAttribute(attr)) { - attrs.append(node.attribute(attr)); - } - } -} - -void processUOFPPT(const QDomDocument &doc, QString &content) -{ - QDomElement rootElem = doc.documentElement(); - QList nodes; - QQueue names; //每个节点的名称 - names << "uof:演示文稿" << "演:主体" << "演:幻灯片集" << "演:幻灯片"; - - findNodes(rootElem, names, nodes); - - if (nodes.empty()) { - //TODO 在uof-ppt不存在锚点节点时,直接查找文本节点? - return; - } - - QStringList objs; - //每一个 演:幻灯片 -> 锚点 - for (const auto &node : nodes) { - names.clear(); - names << "uof:锚点"; - findNodeAttr(node, names, "uof:图形引用", objs); - } - - nodes.clear(); - names.clear(); - names << "uof:对象集" << "图:图形"; - findNodesByAttr(rootElem, names, nodes, "图:标识符", objs); - - if (nodes.empty()) { - return; - } - - QList paraNodes; //全部段落节点 - for (const auto &node : nodes) { - names.clear(); - names << "图:文本内容" << "字:段落"; - findNodes(node, names, paraNodes); - } - - nodes.clear(); - for (const auto &node : paraNodes) { - names.clear(); - names << "字:句"; - findNodes(node, names, nodes); //全部段落下的全部句节点 - } - - for (const auto &node : nodes) { - names.clear(); - names << "字:文本串"; - if (findNodeText(node, names, content)) { - break; - } - } -} - -bool loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName) -{ - if (!zipFile.isOpen() && !zipFile.open(QuaZip::mdUnzip)) { - return false; - } - - if (!zipFile.setCurrentFile(fileName)) { - return false; - } - - QuaZipFile file(&zipFile); - if (!file.open(QIODevice::ReadOnly)) { - return false; - } - - doc.clear(); - if (!doc.setContent(&file)) { - file.close(); - return false; - } - file.close(); - - return true; -} - FileUtils::FileUtils() { } -std::string FileUtils::makeDocUterm(QString path) { +std::string FileUtils::makeDocUterm(const QString& path) { return QCryptographicHash::hash(path.toUtf8(), QCryptographicHash::Md5).toHex().toStdString(); } @@ -253,54 +92,7 @@ QIcon FileUtils::getSettingIcon() { // 返回控制面板应用图标 } -/** - * @brief FileUtils::getFileName 获取文件名 - * @param uri 格式为"file:///home/xxx/xxx/xxxx.txt" - * @return - */ -QString FileUtils::getFileName(const QString &uri) { - QFileInfo info(uri); - if(info.exists()) { - return info.fileName(); - } else { - return "Unknown File"; - } -// QUrl url = uri; -// if (url.fileName().isEmpty()) { -// return "Unknown File"; -// } -// return url.fileName(); -} - -/** - * @brief FileUtils::getAppName 获取应用名 - * @param path .destop文件的完整路径 - * @return - */ -QString FileUtils::getAppName(const QString &path) { - QByteArray ba; - ba = path.toUtf8(); - GKeyFile * keyfile; - keyfile = g_key_file_new(); - if(!g_key_file_load_from_file(keyfile, ba.data(), G_KEY_FILE_NONE, NULL)) { - g_key_file_free(keyfile); - return "Unknown App"; - } - QString name = QString(g_key_file_get_locale_string(keyfile, G_KEY_FILE_DESKTOP_GROUP, G_KEY_FILE_DESKTOP_KEY_NAME, NULL, NULL)); - g_key_file_free(keyfile); - return name; -} - -/** - * @brief FileUtils::getSettingName 获取设置项名 - * @param setting 设置项传入参数,格式为 About/About->Properties - * @return - */ -QString FileUtils::getSettingName(const QString &setting) { - return setting.right(setting.length() - setting.lastIndexOf("/") - 1); -} - -bool FileUtils::isOrUnder(QString pathA, QString pathB) +bool FileUtils::isOrUnder(const QString& pathA, const QString& pathB) { if (pathB == "/") { return true; @@ -334,301 +126,6 @@ QStringList FileUtils::findMultiToneWords(const QString &hanzi) { return output << oneResult << firstLetter; } -/** - * @brief FileUtils::getDocxTextContent - * @param path: abs path - * @return docx to QString - */ -void FileUtils::getDocxTextContent(const QString &path, QString &textcontent) { - //fix me :optimized by xpath?? - QFileInfo info = QFileInfo(path); - if(!info.exists() || info.isDir()) - return; - QuaZip file(path); - if(!file.open(QuaZip::mdUnzip)) - return; - - if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive)) { - file.close(); - return; - } - QuaZipFile fileR(&file); - - fileR.open(QIODevice::ReadOnly); //读取方式打开 - - QXmlStreamReader reader(&fileR); - - while (!reader.atEnd()){ - if(reader.readNextStartElement() and reader.name().toString() == "t"){ - textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ - break; - } - } - } - - fileR.close(); - file.close(); - return; - -/* //原加载DOM文档方式; - QDomDocument doc; - doc.setContent(fileR.readAll()); - fileR.close(); - QDomElement first = doc.firstChildElement("w:document"); - QDomElement body = first.firstChildElement("w:body"); - while(!body.isNull()) { - QDomElement wp = body.firstChildElement("w:p"); - while(!wp.isNull()) { - QDomElement wr = wp.firstChildElement("w:r"); - while(!wr.isNull()) { - QDomElement wt = wr.firstChildElement("w:t"); - textcontent.append(wt.text().replace("\n", "")).replace("\r", " "); - if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) { - file.close(); - return; - } - wr = wr.nextSiblingElement(); - } - wp = wp.nextSiblingElement(); - } - body = body.nextSiblingElement(); - } - file.close(); - return; -*/ -} - -void FileUtils::getPptxTextContent(const QString &path, QString &textcontent) { - QFileInfo info = QFileInfo(path); - if(!info.exists() || info.isDir()) - return; - QuaZip file(path); - if(!file.open(QuaZip::mdUnzip)) - return; - QString prefix("ppt/slides/slide"); - QStringList fileList; - for(QString i : file.getFileNameList()) { - if(i.startsWith(prefix)) - fileList << i; - } - if(fileList.isEmpty()) { - file.close(); - return; - } - - for(int i = 0; i < fileList.size(); ++i){ - QString name = prefix + QString::number(i + 1) + ".xml"; - if(!file.setCurrentFile(name)) { - continue; - } - QuaZipFile fileR(&file); - fileR.open(QIODevice::ReadOnly); - - QXmlStreamReader reader(&fileR); - - while (!reader.atEnd()){ - if(reader.readNextStartElement() and reader.name().toString() == "t"){ - textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ - break; - } - } - } - fileR.close(); - } - file.close(); - return; - -/* - QDomElement sptree; - QDomElement sp; - QDomElement txbody; - QDomElement ap; - QDomElement ar; - QDomDocument doc; - QDomElement at; -// QDomNodeList atList; - for(int i = 0; i < fileList.size(); ++i) { - QString name = prefix + QString::number(i + 1) + ".xml"; - if(!file.setCurrentFile(name)) { - continue; - } - QuaZipFile fileR(&file); - fileR.open(QIODevice::ReadOnly); - doc.clear(); - doc.setContent(fileR.readAll()); - fileR.close(); - - //fix me :optimized by xpath?? - //This method looks better but slower, - //If xml file is very large with many useless node,this method will take a lot of time. - -// atList = doc.elementsByTagName("a:t"); -// for(int i = 0; i= MAX_CONTENT_LENGTH/3) -// { -// file.close(); -// return; -// } -// } -// } - //This is ugly but seems more efficient when handel a large file. - sptree = doc.firstChildElement("p:sld").firstChildElement("p:cSld").firstChildElement("p:spTree"); - while(!sptree.isNull()) { - sp = sptree.firstChildElement("p:sp"); - while(!sp.isNull()) { - txbody = sp.firstChildElement("p:txBody"); - while(!txbody.isNull()) { - ap = txbody.firstChildElement("a:p"); - while(!ap.isNull()) { - ar = ap.firstChildElement("a:r"); - while(!ar.isNull()) { - at = ar.firstChildElement("a:t"); - textcontent.append(at.text().replace("\r", "")).replace("\t", ""); - if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) { - file.close(); - return; - } - ar = ar.nextSiblingElement(); - } - ap = ap.nextSiblingElement(); - } - txbody = txbody.nextSiblingElement(); - } - sp = sp.nextSiblingElement(); - } - sptree = sptree.nextSiblingElement(); - } - } - file.close(); - return; -*/ -} - -void FileUtils::getXlsxTextContent(const QString &path, QString &textcontent) { - QFileInfo info = QFileInfo(path); - if(!info.exists() || info.isDir()) - return; - QuaZip file(path); - if(!file.open(QuaZip::mdUnzip)) - return; - - if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive)) { - file.close(); - return; - } - QuaZipFile fileR(&file); - - fileR.open(QIODevice::ReadOnly); - - QXmlStreamReader reader(&fileR); - - while (!reader.atEnd()){ - if(reader.readNextStartElement() and reader.name().toString() == "t"){ - textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ - break; - } - } - } - - fileR.close(); - file.close(); - return; - -/* - QDomDocument doc; - doc.setContent(fileR.readAll()); - fileR.close(); - QDomElement sst = doc.firstChildElement("sst"); - QDomElement si; - QDomElement r; - QDomElement t; - while(!sst.isNull()) { - si = sst.firstChildElement("si"); - while(!si.isNull()) { - r = si.firstChildElement("r"); - if(r.isNull()) { - t = si.firstChildElement("t"); - } else { - t = r.firstChildElement("t"); - } - if(t.isNull()) - continue; - textcontent.append(t.text().replace("\r", "").replace("\n", "")); - if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) { - file.close(); - return; - } - si = si.nextSiblingElement(); - } - sst = sst.nextSiblingElement(); - } - file.close(); - return; -*/ -} - -void FileUtils::getPdfTextContent(const QString &path, QString &textcontent) { - Poppler::Document *doc = Poppler::Document::load(path); - if(doc->isLocked()) { - delete doc; - return; - } - const QRectF qf; - int pageNum = doc->numPages(); - for(int i = 0; i < pageNum; ++i) { - Poppler::Page *page = doc->page(i); - if(page) { - textcontent.append(page->text(qf).replace("\n", "").replace("\r", " ")); - delete page; - if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) - break; - } - } - delete doc; - return; -} - -void FileUtils::getTxtContent(const QString &path, QString &textcontent) { - QFile file(path); - if(!file.open(QIODevice::ReadOnly | QIODevice::Text)) - return; - - QByteArray encodedString = file.read(MAX_CONTENT_LENGTH); - - uchardet_t chardet = uchardet_new(); - if(uchardet_handle_data(chardet, encodedString.constData(), encodedString.size()) != 0) - qWarning() << "Txt file encoding format detect fail!" << path; - - uchardet_data_end(chardet); - const char *codec = uchardet_get_charset(chardet); - - if(QTextCodec::codecForName(codec) == nullptr) { - qWarning() << "Unsupported Text encoding format" << path << QString::fromLocal8Bit(codec); - return; - } - - QTextStream stream(encodedString, QIODevice::ReadOnly); - stream.setCodec(codec); - uchardet_delete(chardet); - - textcontent = stream.readAll().replace("\n", "").replace("\r", " "); - - file.close(); - encodedString.clear(); - chardet = NULL; - stream.flush(); - - return; -} - int FileUtils::openFile(QString &path, bool openInDir) { int res = -1; @@ -710,7 +207,7 @@ int FileUtils::openFile(QString &path, bool openInDir) delete appLaunchInterface; } appLaunchInterface = nullptr; - + if (!isSuccess){ QDesktopServices::openUrl(QUrl::fromLocalFile(path)); } @@ -1140,234 +637,6 @@ qreal FileUtils::horizontalAdvanceContainsKeyword(const QString &content, const return contentSize; } -/** - * uof1.0解析 - * 参考规范:GB/T 20916-2007 - * 1.文字处理 - * 2.电子表格 - * 3.演示文稿 - * ppt的内容存放在对象集中, - * 可以通过演示文稿-主体-幻灯片集-幻灯片下的锚点属性获取引用了哪些内容: - * - * 目标:文本串 - */ -void FileUtils::getUOFTextContent(const QString &path, QString &textContent) -{ - QFileInfo info(path); - if (!info.exists() || info.isDir()) { - return; - } - - QFile file(path); - if (!file.open(QIODevice::ReadOnly)) { - return; - } - - QDomDocument doc; - if (!doc.setContent(&file)) { - file.close(); - return; - } - file.close(); - - bool isPPT = false; - QDomElement rootElem = doc.documentElement(); - QDomNode node = rootElem.firstChild(); - while (!node.isNull()) { - QDomElement e = node.toElement(); - if (!e.isNull() && e.tagName() == "uof:演示文稿") { - isPPT = true; - break; - } - node = node.nextSibling(); - } - - //单独处理pdf文档 - if (isPPT) { - qDebug() << path << "is PPT"; - processUOFPPT(doc, textContent); - return; - } - - file.open(QIODevice::ReadOnly); - QXmlStreamReader reader(&file); - while (!reader.atEnd()) { - //适用于文字处理与电子表格 - if (reader.readNextStartElement() && reader.name().toString() == "文本串") { - textContent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if (textContent.length() >= MAX_CONTENT_LENGTH / 3) { - break; - } - } - } - - file.close(); -} - - - - - -/** - * uof2.0解析 - * @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012 - * ppt文档的内容存放在graphics.xml中,需要先解析content中的引用再解析graphics内容 - * @param path - * @param textContent - */ -void FileUtils::getUOF2TextContent(const QString &path, QString &textContent) -{ - QFileInfo info = QFileInfo(path); - if (!info.exists() || info.isDir()) - return; - - QuaZip file(path); - if (!file.open(QuaZip::mdUnzip)) - return; - - if (!file.setCurrentFile("content.xml")) { - return; - } - - QuaZipFile fileR(&file); - if (!fileR.open(QIODevice::ReadOnly)) { - return; - } - - QXmlStreamReader reader(&fileR); - - while (!reader.atEnd()) { - if (reader.readNextStartElement() && reader.name().toString() == "文本串_415B") { - textContent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if (textContent.length() >= MAX_CONTENT_LENGTH / 3) { - break; - } - } - } - - fileR.close(); - file.close(); -} - -void FileUtils::getUOF2PPTContent(const QString &path, QString &textContent) -{ - QFileInfo info = QFileInfo(path); - if (!info.exists() || info.isDir()) - return; - - QuaZip zipFile(path); - QDomDocument doc; - if (!loadZipFileToDoc(zipFile, doc, "content.xml")) { - return; - } - - QDomElement rootElem = doc.documentElement(); - QList nodes; - QQueue names; //每个节点的名称 - names << "演:幻灯片集_6C0E" << "演:幻灯片_6C0F"; - findNodes(rootElem, names, nodes); - - if (nodes.empty()) { - return; - } - - QStringList attrs; - for (const auto &node : nodes) { - names.clear(); - names << "uof:锚点_C644"; - findNodeAttr(node, names, "图形引用_C62E", attrs); - } - - if (attrs.empty()) { - return; - } - - if (!loadZipFileToDoc(zipFile, doc, "graphics.xml")) { - return; - } - - nodes.clear(); - names.clear(); - names << "图:图形_8062"; - rootElem = doc.documentElement(); - findNodesByAttr(rootElem, names, nodes, "标识符_804B", attrs); - - QList nodes416B; //字:段落_416B - for (const auto &node : nodes) { - names.clear(); - names << "图:文本_803C" << "图:内容_8043" << "字:段落_416B"; - findNodes(node, names, nodes416B); - } - - nodes.clear(); - for (const auto &node : nodes416B) { - names.clear(); - names << "字:句_419D"; - findNodes(node, names, nodes); //所有的 字:句_419D - } - - for (const auto &node : nodes) { - names.clear(); - names << "字:文本串_415B"; - if (findNodeText(node, names, textContent)) { - break; - } - } -} - - -/** - * OFD文件解析 - * @brief 参考: GB/T 33190-2016 - * @param path - * @param textContent - */ -void FileUtils::getOFDTextContent(const QString &path, QString &textContent) -{ - QFileInfo info = QFileInfo(path); - if (!info.exists() || info.isDir()) - return; - - QuaZip zipfile(path); - if (!zipfile.open(QuaZip::mdUnzip)) - return; - - // GB/T 33190-2016规范定义可以存在多个Doc_x目录,暂时只取第一个目录的内容 - QString prefix("Doc_0/Pages/"); - QStringList fileList; - for (const auto &file: zipfile.getFileNameList()) { - if (file.startsWith(prefix)) { - fileList << file; - } - } - - for (int i = 0; i < fileList.count(); ++i) { - QString filename = prefix + "Page_" + QString::number(i) + "/Content.xml"; - if (!zipfile.setCurrentFile(filename)) { - continue; - } - - QuaZipFile fileR(&zipfile); - fileR.open(QIODevice::ReadOnly); - QXmlStreamReader reader(&fileR); - - while (!reader.atEnd()) { - if (reader.readNextStartElement() && reader.name().toString() == "TextCode") { - textContent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); - if (textContent.length() >= MAX_CONTENT_LENGTH / 3) { - fileR.close(); - zipfile.close(); - return; - } - } - } - - fileR.close(); - } - - zipfile.close(); -} - QString FileUtils::getSnippetWithoutKeyword(const QString &content, int lineCount) { QString snippet; int numOfLine = 0; diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h index 49d6460..859491e 100644 --- a/libsearch/file-utils.h +++ b/libsearch/file-utils.h @@ -35,28 +35,16 @@ public: static QString setAllTextBold(const QString &name); static QString wrapData(QLabel *p_label, const QString &text); static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword); - static std::string makeDocUterm(QString path); + static std::string makeDocUterm(const QString& path); static QIcon getFileIcon(const QString &uri, bool checkValid = true); static QIcon getSettingIcon(); - static QString getFileName(const QString &uri); - static QString getAppName(const QString &path); - static QString getSettingName(const QString &setting); //A is or under B - static bool isOrUnder(QString pathA, QString pathB); + static bool isOrUnder(const QString& pathA, const QString& pathB); static QStringList findMultiToneWords(const QString &hanzi); //parse text,docx..... static QMimeType getMimetype(const QString &path); - static void getDocxTextContent(const QString &path, QString &textcontent); - static void getPptxTextContent(const QString &path, QString &textcontent); - static void getXlsxTextContent(const QString &path, QString &textcontent); - static void getPdfTextContent(const QString &path, QString &textcontent); - static void getTxtContent(const QString &path, QString &textcontent); - static void getUOFTextContent(const QString &path, QString &textContent); - static void getUOF2TextContent(const QString &path, QString &textContent); - static void getUOF2PPTContent(const QString &path, QString &textContent); - static void getOFDTextContent(const QString &path, QString &textContent); static int openFile(QString &path, bool openInDir = false); static bool copyPath(QString &path); diff --git a/libsearch/index/file-content-indexer.cpp b/libsearch/index/file-content-indexer.cpp index b889e14..a93d811 100644 --- a/libsearch/index/file-content-indexer.cpp +++ b/libsearch/index/file-content-indexer.cpp @@ -19,6 +19,7 @@ */ #include "file-content-indexer.h" #include +#include #include "file-reader.h" #include "file-utils.h" #include "chinese-segmentation.h" @@ -36,7 +37,7 @@ bool fileContentIndexer::index() return false; } QString suffix = info.suffix(); - FileReader::getTextContent(m_filePath, content, suffix); + FileReader::getInstance()->getTextContent(m_filePath, content, suffix); if(content.isEmpty()) { return false; } @@ -47,8 +48,8 @@ bool fileContentIndexer::index() content.clear(); content.squeeze(); - for(size_t i = 0; i < term.size(); ++i) { - m_document.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); + for(auto & i : term) { + m_document.addPosting(i.word, i.offsets, static_cast(i.weight)); } term.clear(); term.shrink_to_fit(); diff --git a/libsearch/index/file-extraction-result.cpp b/libsearch/index/file-extraction-result.cpp new file mode 100644 index 0000000..fadd14a --- /dev/null +++ b/libsearch/index/file-extraction-result.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022, KylinSoft Co., Ltd. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Authors: iaom + * + */ + +#include "file-extraction-result.h" +namespace UkuiSearch { +class FileExtractionResultPrivate +{ +public: + UkuiFileMetadata::PropertyMultiMap m_properties; + QString m_text; + QVector m_types; +}; + +UkuiSearch::FileExtractionResult::FileExtractionResult(const QString &url, const QString &mimetype, + const UkuiFileMetadata::ExtractionResult::Flags &flags) + : ExtractionResult(url, mimetype, flags) + , d(new FileExtractionResultPrivate) +{ +} + +FileExtractionResult::~FileExtractionResult() = default; + + +FileExtractionResult::FileExtractionResult(const FileExtractionResult &rhs): ExtractionResult(*this) + , d(new FileExtractionResultPrivate(*rhs.d)) +{ +} + +FileExtractionResult &FileExtractionResult::operator=(const FileExtractionResult &rhs) +{ + *d = *rhs.d; + return *this; +} + +void FileExtractionResult::add(UkuiFileMetadata::Property::Property property, const QVariant &value) +{ + d->m_properties.insert(property, value); +} + +void FileExtractionResult::addType(UkuiFileMetadata::Type::Type type) +{ + d->m_types << type; +} + +void FileExtractionResult::append(const QString &text) +{ + QString tmp = text; + d->m_text.append(tmp.replace("\n", "").replace("\r", " ")); +} + +UkuiFileMetadata::PropertyMultiMap FileExtractionResult::properties() const +{ + return d->m_properties; +} + +QString FileExtractionResult::text() const +{ + return d->m_text; +} + +QVector FileExtractionResult::types() const +{ + return d->m_types; +} +} \ No newline at end of file diff --git a/libsearch/index/file-extraction-result.h b/libsearch/index/file-extraction-result.h new file mode 100644 index 0000000..84b7192 --- /dev/null +++ b/libsearch/index/file-extraction-result.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2022, KylinSoft Co., Ltd. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Authors: iaom + * + */ +#ifndef UKUI_SEARCH_FILE_EXTRACTION_RESULT_H +#define UKUI_SEARCH_FILE_EXTRACTION_RESULT_H + +#include + +namespace UkuiSearch { +class FileExtractionResultPrivate; +class FileExtractionResult : public UkuiFileMetadata::ExtractionResult +{ +public: + explicit FileExtractionResult(const QString& url, const QString& mimetype = QString(), const Flags& flags = Flags{ExtractPlainText | ExtractMetaData}); + FileExtractionResult(const FileExtractionResult& rhs); + ~FileExtractionResult() override; + FileExtractionResult& operator=(const FileExtractionResult& rhs); + + void add(UkuiFileMetadata::Property::Property property, const QVariant& value) override; + void addType(UkuiFileMetadata::Type::Type type) override; + void append(const QString& text) override; + + UkuiFileMetadata::PropertyMultiMap properties() const; + QString text() const; + QVector types() const; +private: + const std::unique_ptr d; +}; +} +#endif //UKUI_SEARCH_FILE_EXTRACTION_RESULT_H diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index 30c62e1..6ab614f 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -18,51 +18,39 @@ * */ #include "file-reader.h" -#include "file-utils.h" -#include "binary-parser.h" -#include "ocrobject.h" +#include +#include +#include +#include "file-extraction-result.h" #include "common.h" -using namespace UkuiSearch; -FileReader::FileReader(QObject *parent) : QObject(parent) -{ +using namespace UkuiSearch; +FileReader *g_instance = nullptr; +std::once_flag g_instanceFlag; +FileReader *FileReader::getInstance() +{ + std::call_once(g_instanceFlag, [] () { + g_instance = new FileReader; + }); + return g_instance; } + +FileReader::FileReader() += default; void FileReader::getTextContent(const QString &path, QString &textContent, const QString &suffix) { - if (suffix == "docx") { - FileUtils::getDocxTextContent(path, textContent); - } else if (suffix == "pptx") { - FileUtils::getPptxTextContent(path, textContent); - } else if (suffix == "xlsx") { - FileUtils::getXlsxTextContent(path, textContent); - } else if (suffix == "txt" or suffix == "html") { - FileUtils::getTxtContent(path, textContent); - } else if (suffix == "doc" || suffix == "dot" || suffix == "wps" || suffix == "ppt" || - suffix == "pps" || suffix == "dps" || suffix == "et" || suffix == "xls") { - KBinaryParser searchdata; - searchdata.RunParser(path, textContent); - } else if (suffix == "pdf") { - FileUtils::getPdfTextContent(path, textContent); - } else if (true == targetPhotographTypeMap[suffix]){ - OcrObject::getInstance()->getTxtContent(path, textContent); - } else if (suffix == "uof") { - QString mimeName = FileUtils::getMimetype(path).name(); - if (mimeName == "application/xml" || mimeName == "application/uof") { - FileUtils::getUOFTextContent(path, textContent); - - } else if (mimeName == "application/x-ole-storage") { - //uof的ppt文档不支持修改母版。一旦进行这些操作,uof文档可能会被wps存为doc文件 - KBinaryParser searchdata; - searchdata.RunParser(path, textContent); - } - } else if (suffix == "uot" || suffix == "uos") { - FileUtils::getUOF2TextContent(path, textContent); - - } else if (suffix == "uop") { - FileUtils::getUOF2PPTContent(path, textContent); - - } else if (suffix == "ofd") { - FileUtils::getOFDTextContent(path, textContent); + if(targetPhotographTypeMap[suffix]) { + textContent = UkuiFileMetadata::OcrUtils::getTextInPicture(path); + return; } - return; -} + QString mimeType = UkuiFileMetadata::MimeUtils::strictMimeType(path, {}).name(); + QList extractors = m_extractorManager.fetchExtractors(mimeType); + FileExtractionResult result(path, mimeType, UkuiFileMetadata::ExtractionResult::Flag::ExtractPlainText); + for(auto extractor : extractors) { + extractor->extract(&result); + if(!result.text().isEmpty()) { + textContent = result.text(); + break; + } + } +} \ No newline at end of file diff --git a/libsearch/index/file-reader.h b/libsearch/index/file-reader.h index 6514446..ef4865f 100644 --- a/libsearch/index/file-reader.h +++ b/libsearch/index/file-reader.h @@ -20,15 +20,17 @@ #ifndef FILEREADER_H #define FILEREADER_H -#include -#include +#include namespace UkuiSearch { -class FileReader : public QObject { - Q_OBJECT +class FileReader{ public: - explicit FileReader(QObject *parent = nullptr); + static FileReader* getInstance(); ~FileReader() = default; - static void getTextContent(const QString &path, QString &textContent, const QString &suffix); + void getTextContent(const QString &path, QString &textContent, const QString &suffix); + +private: + FileReader(); + UkuiFileMetadata::ExtractorManager m_extractorManager; }; } diff --git a/libsearch/index/ocrobject.cpp b/libsearch/index/ocrobject.cpp deleted file mode 100644 index ab4cd9a..0000000 --- a/libsearch/index/ocrobject.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * - * Copyright (C) 2023, KylinSoft Co., Ltd. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - */ -#include "ocrobject.h" - -OcrObject *OcrObject::m_instance = nullptr; -once_flag g_instanceFlag; - -OcrObject *OcrObject::getInstance() -{ - std::call_once(g_instanceFlag, [] () { - m_instance = new OcrObject; - }); - return m_instance; -} - -void OcrObject::getTxtContent(const QString &path, QString &textcontent) -{ -// m_api = new tesseract::TessBaseAPI(); -// if (m_api->Init(NULL, "chi_sim")) { -// qDebug() << "Could not initialize tesseract.\n"; -// return; -// } -// m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 - -// Pix *image = pixRead(path.toStdString().data()); -// if (!image) { -// qDebug() << "path:" << path <<" pixRead error!"; -// if (m_api) { -// m_api->End(); -// delete m_api; -// m_api = nullptr; -// } -// return; -// } -// m_api->SetImage(image); -// textcontent = m_api->GetUTF8Text(); -// qDebug() << "path:" << path << " Text:" << textcontent; -// pixDestroy(&image); -// m_api->Clear(); - -// if (m_api) { -// m_api->End(); -// delete m_api; -// m_api = nullptr; -// } - -//多进程版本 - //qDebug() << "path:" << path; - tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI(); - if (api->Init(NULL, "chi_sim")) { - qDebug() << "Could not initialize tesseract.\n"; - return; - } - api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 - - Pix *image = pixRead(path.toStdString().data()); - if (!image) { -// qDebug() << "path:" << path <<" pixRead error!"; - if (api) { - api->End(); - delete api; - api = nullptr; - } - return; - } - api->SetImage(image); - char *tmp = api->GetUTF8Text(); - textcontent = QString::fromLocal8Bit(tmp); - delete [] tmp; - //qDebug() << " Text:" << textcontent; - pixDestroy(&image); - api->Clear(); - - if (api) { - api->End(); - delete api; - api = nullptr; - } -} - -OcrObject::OcrObject(QObject *parent) : QObject(parent) -{ -// init(); -} - -OcrObject::~OcrObject() -{ -// if (m_api) { -// m_api->End(); -// delete m_api; -// m_api = nullptr; -// } -} - -void OcrObject::init() -{ - m_api = new tesseract::TessBaseAPI(); - if (m_api->Init(NULL, "chi_sim")) { - qDebug() << "Could not initialize tesseract.\n"; - return; - } - m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080 -} diff --git a/libsearch/index/ocrobject.h b/libsearch/index/ocrobject.h deleted file mode 100644 index bf9cf29..0000000 --- a/libsearch/index/ocrobject.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * - * Copyright (C) 2023, KylinSoft Co., Ltd. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Authors: iaom - */ -#ifndef OCROBJECT_H -#define OCROBJECT_H - -#include -#include -#include -#include -#include - -using namespace std; -class OcrObject : public QObject -{ - Q_OBJECT -public: - static OcrObject* getInstance(); - - void getTxtContent(const QString &path, QString &textcontent); - -protected: - explicit OcrObject(QObject *parent = nullptr); - ~OcrObject(); - -private: - static OcrObject *m_instance; - - tesseract::TessBaseAPI *m_api = nullptr; - void init(); - - class Garbo - { - public: - ~Garbo() { - if (OcrObject::m_instance) - delete OcrObject::m_instance; - } - static Garbo g_garbo; - }; - -}; - -#endif // OCROBJECT_H diff --git a/ukui-search-service/monitor.cpp b/ukui-search-service/monitor.cpp index 79d1939..31ca871 100644 --- a/ukui-search-service/monitor.cpp +++ b/ukui-search-service/monitor.cpp @@ -133,8 +133,8 @@ void Monitor::processUpdate(IndexType type, uint all, uint finished) case IndexType::OCR: m_ocrIndexSize = all; Q_EMIT ocrIndexSizeChanged(m_ocrIndexSize); - m_contentIndexProgress = finished; - Q_EMIT ocrIndexProgressChanged(m_contentIndexProgress); + m_ocrIndexProgress = finished; + Q_EMIT ocrIndexProgressChanged(m_ocrIndexProgress); m_ocrContentIndexDocNum = m_ocrContentDatabase.getIndexDocCount(); Q_EMIT ocrContentIndexDocNumChanged(m_ocrContentDatabase.getIndexDocCount()); break;