perf(file-index):使用ukui-file-metadata提供的文件内容解析接口替换原有功能
This commit is contained in:
parent
e18bb390a7
commit
8b2ab37f91
|
@ -13,6 +13,7 @@ find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core DBus Widgets Xml Concurrent S
|
||||||
find_package(PkgConfig REQUIRED)
|
find_package(PkgConfig REQUIRED)
|
||||||
find_package(KF5WindowSystem)
|
find_package(KF5WindowSystem)
|
||||||
find_package(qt5xdg)
|
find_package(qt5xdg)
|
||||||
|
find_package(ukui-file-metadata)
|
||||||
|
|
||||||
set(LIBUKUI_SEARCH_EXTERNAL_LIBS "")
|
set(LIBUKUI_SEARCH_EXTERNAL_LIBS "")
|
||||||
set(LIBUKUI_SEARCH_PC_PKGS
|
set(LIBUKUI_SEARCH_PC_PKGS
|
||||||
|
@ -23,9 +24,7 @@ set(LIBUKUI_SEARCH_PC_PKGS
|
||||||
gsettings-qt
|
gsettings-qt
|
||||||
poppler-qt5
|
poppler-qt5
|
||||||
kysdk-qtwidgets
|
kysdk-qtwidgets
|
||||||
lept
|
|
||||||
uchardet
|
uchardet
|
||||||
tesseract
|
|
||||||
kysdk-systime
|
kysdk-systime
|
||||||
kysdk-datacollect)
|
kysdk-datacollect)
|
||||||
|
|
||||||
|
@ -67,7 +66,6 @@ set(LIBUKUI_SEARCH_SRC
|
||||||
index/index-scheduler.cpp index/index-scheduler.h
|
index/index-scheduler.cpp index/index-scheduler.h
|
||||||
index/index-status-recorder.cpp index/index-status-recorder.h
|
index/index-status-recorder.cpp index/index-status-recorder.h
|
||||||
index/index-updater.cpp index/index-updater.h
|
index/index-updater.cpp index/index-updater.h
|
||||||
index/ocrobject.cpp index/ocrobject.h
|
|
||||||
index/pending-file.cpp index/pending-file.h
|
index/pending-file.cpp index/pending-file.h
|
||||||
index/pending-file-queue.cpp index/pending-file-queue.h
|
index/pending-file-queue.cpp index/pending-file-queue.h
|
||||||
index/search-manager.cpp index/search-manager.h
|
index/search-manager.cpp index/search-manager.h
|
||||||
|
@ -103,6 +101,8 @@ set(LIBUKUI_SEARCH_SRC
|
||||||
icon-loader.cpp icon-loader.h
|
icon-loader.cpp icon-loader.h
|
||||||
data-collecter.cpp
|
data-collecter.cpp
|
||||||
data-collecter.h
|
data-collecter.h
|
||||||
|
index/file-extraction-result.cpp
|
||||||
|
index/file-extraction-result.h
|
||||||
)
|
)
|
||||||
set(QRC_FILES resource1.qrc)
|
set(QRC_FILES resource1.qrc)
|
||||||
file(GLOB TS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../translations/libukui-search/*.ts)
|
file(GLOB TS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../translations/libukui-search/*.ts)
|
||||||
|
@ -169,9 +169,9 @@ target_link_libraries(libukui-search PUBLIC
|
||||||
Qt${QT_VERSION_MAJOR}::Xml
|
Qt${QT_VERSION_MAJOR}::Xml
|
||||||
chinese-segmentation
|
chinese-segmentation
|
||||||
quazip5
|
quazip5
|
||||||
tesseract
|
|
||||||
uchardet
|
uchardet
|
||||||
xapian
|
xapian
|
||||||
|
ukui-file-metadata
|
||||||
${LIBUKUI_SEARCH_EXTERNAL_LIBS}
|
${LIBUKUI_SEARCH_EXTERNAL_LIBS}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -20,38 +20,22 @@
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include "file-utils.h"
|
#include "file-utils.h"
|
||||||
#include <QXmlStreamReader>
|
|
||||||
#include <QMutexLocker>
|
#include <QMutexLocker>
|
||||||
#include <gio/gdesktopappinfo.h>
|
#include <gio/gdesktopappinfo.h>
|
||||||
#include <QDBusMessage>
|
#include <QDBusMessage>
|
||||||
#include <QDBusConnection>
|
#include <QDBusConnection>
|
||||||
#include <QDomDocument>
|
|
||||||
#include <QDBusInterface>
|
#include <QDBusInterface>
|
||||||
#include <QDBusReply>
|
#include <QDBusReply>
|
||||||
#include <QDesktopServices>
|
#include <QDesktopServices>
|
||||||
#include <QMimeDatabase>
|
#include <QMimeDatabase>
|
||||||
#include <QCryptographicHash>
|
#include <QCryptographicHash>
|
||||||
#include <QFileInfo>
|
|
||||||
#include <QFile>
|
#include <QFile>
|
||||||
#include <QApplication>
|
#include <QApplication>
|
||||||
#include <QDir>
|
|
||||||
#include <QDebug>
|
#include <QDebug>
|
||||||
#include <QUrl>
|
#include <QUrl>
|
||||||
#include <QDomElement>
|
|
||||||
#include <QClipboard>
|
#include <QClipboard>
|
||||||
#include <QQueue>
|
|
||||||
#include <QFontMetrics>
|
#include <QFontMetrics>
|
||||||
#include <QTextBoundaryFinder>
|
#include <QTextBoundaryFinder>
|
||||||
#include <quazip5/quazipfile.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <quazip5/quazip.h>
|
|
||||||
#include <uchardet/uchardet.h>
|
|
||||||
#include <poppler/qt5/poppler-qt5.h>
|
|
||||||
#include "gobject-template.h"
|
#include "gobject-template.h"
|
||||||
#include "hanzi-to-pinyin.h"
|
#include "hanzi-to-pinyin.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
@ -59,155 +43,10 @@
|
||||||
|
|
||||||
using namespace UkuiSearch;
|
using namespace UkuiSearch;
|
||||||
|
|
||||||
#define MAX_CONTENT_LENGTH 20480000
|
|
||||||
/**
|
|
||||||
* @brief 查找elem的子节点
|
|
||||||
* @param elem 起始节点
|
|
||||||
* @param names 名称链
|
|
||||||
* @param nodes 查找到的全部结果
|
|
||||||
*/
|
|
||||||
void findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes)
|
|
||||||
{
|
|
||||||
QString targetName = names.dequeue();
|
|
||||||
QDomNode node = elem.firstChild();
|
|
||||||
while (!node.isNull()) {
|
|
||||||
QDomElement e = node.toElement();
|
|
||||||
if (!e.isNull() && e.tagName() == targetName) {
|
|
||||||
if (names.empty()) {
|
|
||||||
nodes.append(e);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
findNodes(e, names, nodes);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
node = node.nextSibling();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void findNodesByAttr(const QDomElement &elem, QQueue <QString> &names, QList <QDomElement> &nodes, const QString &attr, const QStringList &values)
|
|
||||||
{
|
|
||||||
findNodes(elem, names, nodes);
|
|
||||||
|
|
||||||
QList<QDomElement>::iterator it = nodes.begin();
|
|
||||||
while (it != nodes.end()) {
|
|
||||||
if ((*it).hasAttribute(attr) && values.contains((*it).attribute(attr))) {
|
|
||||||
it++;
|
|
||||||
} else {
|
|
||||||
it = nodes.erase(it);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content)
|
|
||||||
{
|
|
||||||
QList<QDomElement> nodes;
|
|
||||||
findNodes(elem, names, nodes);
|
|
||||||
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
content.append(node.text());
|
|
||||||
if (content.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs)
|
|
||||||
{
|
|
||||||
QList<QDomElement> nodes;
|
|
||||||
findNodes(elem, names, nodes);
|
|
||||||
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
if (node.hasAttribute(attr)) {
|
|
||||||
attrs.append(node.attribute(attr));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void processUOFPPT(const QDomDocument &doc, QString &content)
|
|
||||||
{
|
|
||||||
QDomElement rootElem = doc.documentElement();
|
|
||||||
QList<QDomElement> nodes;
|
|
||||||
QQueue<QString> names; //每个节点的名称
|
|
||||||
names << "uof:演示文稿" << "演:主体" << "演:幻灯片集" << "演:幻灯片";
|
|
||||||
|
|
||||||
findNodes(rootElem, names, nodes);
|
|
||||||
|
|
||||||
if (nodes.empty()) {
|
|
||||||
//TODO 在uof-ppt不存在锚点节点时,直接查找文本节点?
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QStringList objs;
|
|
||||||
//每一个 演:幻灯片 -> 锚点
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "uof:锚点";
|
|
||||||
findNodeAttr(node, names, "uof:图形引用", objs);
|
|
||||||
}
|
|
||||||
|
|
||||||
nodes.clear();
|
|
||||||
names.clear();
|
|
||||||
names << "uof:对象集" << "图:图形";
|
|
||||||
findNodesByAttr(rootElem, names, nodes, "图:标识符", objs);
|
|
||||||
|
|
||||||
if (nodes.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QList<QDomElement> paraNodes; //全部段落节点
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "图:文本内容" << "字:段落";
|
|
||||||
findNodes(node, names, paraNodes);
|
|
||||||
}
|
|
||||||
|
|
||||||
nodes.clear();
|
|
||||||
for (const auto &node : paraNodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "字:句";
|
|
||||||
findNodes(node, names, nodes); //全部段落下的全部句节点
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "字:文本串";
|
|
||||||
if (findNodeText(node, names, content)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * @brief Parse one entry of a zip archive into a DOM document.
 *
 * The archive is opened for unzipping if it is not open yet; it is left open
 * on return so that further entries can be loaded from it.
 *
 * @param zipFile  archive to read from
 * @param doc      cleared and filled with the parsed entry
 * @param fileName name of the entry inside the archive
 * @return true if the entry was found, opened and parsed; false otherwise
 */
bool loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName)
{
    const bool archiveReady = zipFile.isOpen() || zipFile.open(QuaZip::mdUnzip);
    if (!archiveReady) {
        return false;
    }
    if (!zipFile.setCurrentFile(fileName)) {
        return false;
    }

    QuaZipFile entry(&zipFile);
    if (!entry.open(QIODevice::ReadOnly)) {
        return false;
    }

    doc.clear();
    const bool parsed = doc.setContent(&entry);
    entry.close();
    return parsed;
}
|
|
||||||
|
|
||||||
FileUtils::FileUtils() {
|
FileUtils::FileUtils() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string FileUtils::makeDocUterm(QString path) {
|
std::string FileUtils::makeDocUterm(const QString& path) {
|
||||||
return QCryptographicHash::hash(path.toUtf8(), QCryptographicHash::Md5).toHex().toStdString();
|
return QCryptographicHash::hash(path.toUtf8(), QCryptographicHash::Md5).toHex().toStdString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,54 +92,7 @@ QIcon FileUtils::getSettingIcon() {
|
||||||
// 返回控制面板应用图标
|
// 返回控制面板应用图标
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
bool FileUtils::isOrUnder(const QString& pathA, const QString& pathB)
|
||||||
* @brief FileUtils::getFileName 获取文件名
|
|
||||||
* @param uri 格式为"file:///home/xxx/xxx/xxxx.txt"
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
// Returns the file-name component of uri when QFileInfo reports that it exists,
// otherwise the placeholder "Unknown File".
// NOTE(review): uri is handed to QFileInfo unchanged — confirm callers pass a
// local path rather than a "file://" URI.
QString FileUtils::getFileName(const QString &uri) {
    const QFileInfo fileInfo(uri);
    if (!fileInfo.exists()) {
        return "Unknown File";
    }
    return fileInfo.fileName();
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief FileUtils::getAppName 获取应用名
|
|
||||||
* @param path .destop文件的完整路径
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
QString FileUtils::getAppName(const QString &path) {
|
|
||||||
QByteArray ba;
|
|
||||||
ba = path.toUtf8();
|
|
||||||
GKeyFile * keyfile;
|
|
||||||
keyfile = g_key_file_new();
|
|
||||||
if(!g_key_file_load_from_file(keyfile, ba.data(), G_KEY_FILE_NONE, NULL)) {
|
|
||||||
g_key_file_free(keyfile);
|
|
||||||
return "Unknown App";
|
|
||||||
}
|
|
||||||
QString name = QString(g_key_file_get_locale_string(keyfile, G_KEY_FILE_DESKTOP_GROUP, G_KEY_FILE_DESKTOP_KEY_NAME, NULL, NULL));
|
|
||||||
g_key_file_free(keyfile);
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief FileUtils::getSettingName 获取设置项名
|
|
||||||
* @param setting 设置项传入参数,格式为 About/About->Properties
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
QString FileUtils::getSettingName(const QString &setting) {
|
|
||||||
return setting.right(setting.length() - setting.lastIndexOf("/") - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool FileUtils::isOrUnder(QString pathA, QString pathB)
|
|
||||||
{
|
{
|
||||||
if (pathB == "/") {
|
if (pathB == "/") {
|
||||||
return true;
|
return true;
|
||||||
|
@ -334,301 +126,6 @@ QStringList FileUtils::findMultiToneWords(const QString &hanzi) {
|
||||||
return output << oneResult << firstLetter;
|
return output << oneResult << firstLetter;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief FileUtils::getDocxTextContent
|
|
||||||
* @param path: abs path
|
|
||||||
* @return docx to QString
|
|
||||||
*/
|
|
||||||
void FileUtils::getDocxTextContent(const QString &path, QString &textcontent) {
|
|
||||||
//fix me :optimized by xpath??
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if(!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
QuaZip file(path);
|
|
||||||
if(!file.open(QuaZip::mdUnzip))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive)) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
QuaZipFile fileR(&file);
|
|
||||||
|
|
||||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
|
||||||
|
|
||||||
QXmlStreamReader reader(&fileR);
|
|
||||||
|
|
||||||
while (!reader.atEnd()){
|
|
||||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
|
||||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileR.close();
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* //原加载DOM文档方式;
|
|
||||||
QDomDocument doc;
|
|
||||||
doc.setContent(fileR.readAll());
|
|
||||||
fileR.close();
|
|
||||||
QDomElement first = doc.firstChildElement("w:document");
|
|
||||||
QDomElement body = first.firstChildElement("w:body");
|
|
||||||
while(!body.isNull()) {
|
|
||||||
QDomElement wp = body.firstChildElement("w:p");
|
|
||||||
while(!wp.isNull()) {
|
|
||||||
QDomElement wr = wp.firstChildElement("w:r");
|
|
||||||
while(!wr.isNull()) {
|
|
||||||
QDomElement wt = wr.firstChildElement("w:t");
|
|
||||||
textcontent.append(wt.text().replace("\n", "")).replace("\r", " ");
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
wr = wr.nextSiblingElement();
|
|
||||||
}
|
|
||||||
wp = wp.nextSiblingElement();
|
|
||||||
}
|
|
||||||
body = body.nextSiblingElement();
|
|
||||||
}
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileUtils::getPptxTextContent(const QString &path, QString &textcontent) {
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if(!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
QuaZip file(path);
|
|
||||||
if(!file.open(QuaZip::mdUnzip))
|
|
||||||
return;
|
|
||||||
QString prefix("ppt/slides/slide");
|
|
||||||
QStringList fileList;
|
|
||||||
for(QString i : file.getFileNameList()) {
|
|
||||||
if(i.startsWith(prefix))
|
|
||||||
fileList << i;
|
|
||||||
}
|
|
||||||
if(fileList.isEmpty()) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int i = 0; i < fileList.size(); ++i){
|
|
||||||
QString name = prefix + QString::number(i + 1) + ".xml";
|
|
||||||
if(!file.setCurrentFile(name)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
QuaZipFile fileR(&file);
|
|
||||||
fileR.open(QIODevice::ReadOnly);
|
|
||||||
|
|
||||||
QXmlStreamReader reader(&fileR);
|
|
||||||
|
|
||||||
while (!reader.atEnd()){
|
|
||||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
|
||||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fileR.close();
|
|
||||||
}
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
QDomElement sptree;
|
|
||||||
QDomElement sp;
|
|
||||||
QDomElement txbody;
|
|
||||||
QDomElement ap;
|
|
||||||
QDomElement ar;
|
|
||||||
QDomDocument doc;
|
|
||||||
QDomElement at;
|
|
||||||
// QDomNodeList atList;
|
|
||||||
for(int i = 0; i < fileList.size(); ++i) {
|
|
||||||
QString name = prefix + QString::number(i + 1) + ".xml";
|
|
||||||
if(!file.setCurrentFile(name)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
QuaZipFile fileR(&file);
|
|
||||||
fileR.open(QIODevice::ReadOnly);
|
|
||||||
doc.clear();
|
|
||||||
doc.setContent(fileR.readAll());
|
|
||||||
fileR.close();
|
|
||||||
|
|
||||||
//fix me :optimized by xpath??
|
|
||||||
//This method looks better but slower,
|
|
||||||
//If xml file is very large with many useless node,this method will take a lot of time.
|
|
||||||
|
|
||||||
// atList = doc.elementsByTagName("a:t");
|
|
||||||
// for(int i = 0; i<atList.size(); ++i)
|
|
||||||
// {
|
|
||||||
// at = atList.at(i).toElement();
|
|
||||||
// if(!at.isNull())
|
|
||||||
// {
|
|
||||||
// textcontent.append(at.text().replace("\r","")).replace("\t"," ");
|
|
||||||
// if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
|
|
||||||
// {
|
|
||||||
// file.close();
|
|
||||||
// return;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//This is ugly but seems more efficient when handel a large file.
|
|
||||||
sptree = doc.firstChildElement("p:sld").firstChildElement("p:cSld").firstChildElement("p:spTree");
|
|
||||||
while(!sptree.isNull()) {
|
|
||||||
sp = sptree.firstChildElement("p:sp");
|
|
||||||
while(!sp.isNull()) {
|
|
||||||
txbody = sp.firstChildElement("p:txBody");
|
|
||||||
while(!txbody.isNull()) {
|
|
||||||
ap = txbody.firstChildElement("a:p");
|
|
||||||
while(!ap.isNull()) {
|
|
||||||
ar = ap.firstChildElement("a:r");
|
|
||||||
while(!ar.isNull()) {
|
|
||||||
at = ar.firstChildElement("a:t");
|
|
||||||
textcontent.append(at.text().replace("\r", "")).replace("\t", "");
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ar = ar.nextSiblingElement();
|
|
||||||
}
|
|
||||||
ap = ap.nextSiblingElement();
|
|
||||||
}
|
|
||||||
txbody = txbody.nextSiblingElement();
|
|
||||||
}
|
|
||||||
sp = sp.nextSiblingElement();
|
|
||||||
}
|
|
||||||
sptree = sptree.nextSiblingElement();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileUtils::getXlsxTextContent(const QString &path, QString &textcontent) {
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if(!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
QuaZip file(path);
|
|
||||||
if(!file.open(QuaZip::mdUnzip))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive)) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
QuaZipFile fileR(&file);
|
|
||||||
|
|
||||||
fileR.open(QIODevice::ReadOnly);
|
|
||||||
|
|
||||||
QXmlStreamReader reader(&fileR);
|
|
||||||
|
|
||||||
while (!reader.atEnd()){
|
|
||||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
|
||||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileR.close();
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
QDomDocument doc;
|
|
||||||
doc.setContent(fileR.readAll());
|
|
||||||
fileR.close();
|
|
||||||
QDomElement sst = doc.firstChildElement("sst");
|
|
||||||
QDomElement si;
|
|
||||||
QDomElement r;
|
|
||||||
QDomElement t;
|
|
||||||
while(!sst.isNull()) {
|
|
||||||
si = sst.firstChildElement("si");
|
|
||||||
while(!si.isNull()) {
|
|
||||||
r = si.firstChildElement("r");
|
|
||||||
if(r.isNull()) {
|
|
||||||
t = si.firstChildElement("t");
|
|
||||||
} else {
|
|
||||||
t = r.firstChildElement("t");
|
|
||||||
}
|
|
||||||
if(t.isNull())
|
|
||||||
continue;
|
|
||||||
textcontent.append(t.text().replace("\r", "").replace("\n", ""));
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
si = si.nextSiblingElement();
|
|
||||||
}
|
|
||||||
sst = sst.nextSiblingElement();
|
|
||||||
}
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileUtils::getPdfTextContent(const QString &path, QString &textcontent) {
|
|
||||||
Poppler::Document *doc = Poppler::Document::load(path);
|
|
||||||
if(doc->isLocked()) {
|
|
||||||
delete doc;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const QRectF qf;
|
|
||||||
int pageNum = doc->numPages();
|
|
||||||
for(int i = 0; i < pageNum; ++i) {
|
|
||||||
Poppler::Page *page = doc->page(i);
|
|
||||||
if(page) {
|
|
||||||
textcontent.append(page->text(qf).replace("\n", "").replace("\r", " "));
|
|
||||||
delete page;
|
|
||||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
delete doc;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileUtils::getTxtContent(const QString &path, QString &textcontent) {
|
|
||||||
QFile file(path);
|
|
||||||
if(!file.open(QIODevice::ReadOnly | QIODevice::Text))
|
|
||||||
return;
|
|
||||||
|
|
||||||
QByteArray encodedString = file.read(MAX_CONTENT_LENGTH);
|
|
||||||
|
|
||||||
uchardet_t chardet = uchardet_new();
|
|
||||||
if(uchardet_handle_data(chardet, encodedString.constData(), encodedString.size()) != 0)
|
|
||||||
qWarning() << "Txt file encoding format detect fail!" << path;
|
|
||||||
|
|
||||||
uchardet_data_end(chardet);
|
|
||||||
const char *codec = uchardet_get_charset(chardet);
|
|
||||||
|
|
||||||
if(QTextCodec::codecForName(codec) == nullptr) {
|
|
||||||
qWarning() << "Unsupported Text encoding format" << path << QString::fromLocal8Bit(codec);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QTextStream stream(encodedString, QIODevice::ReadOnly);
|
|
||||||
stream.setCodec(codec);
|
|
||||||
uchardet_delete(chardet);
|
|
||||||
|
|
||||||
textcontent = stream.readAll().replace("\n", "").replace("\r", " ");
|
|
||||||
|
|
||||||
file.close();
|
|
||||||
encodedString.clear();
|
|
||||||
chardet = NULL;
|
|
||||||
stream.flush();
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int FileUtils::openFile(QString &path, bool openInDir)
|
int FileUtils::openFile(QString &path, bool openInDir)
|
||||||
{
|
{
|
||||||
int res = -1;
|
int res = -1;
|
||||||
|
@ -1140,234 +637,6 @@ qreal FileUtils::horizontalAdvanceContainsKeyword(const QString &content, const
|
||||||
return contentSize;
|
return contentSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* uof1.0解析
|
|
||||||
* 参考规范:GB/T 20916-2007
|
|
||||||
* 1.文字处理
|
|
||||||
* 2.电子表格
|
|
||||||
* 3.演示文稿
|
|
||||||
* ppt的内容存放在对象集中,
|
|
||||||
* 可以通过演示文稿-主体-幻灯片集-幻灯片下的锚点属性获取引用了哪些内容:
|
|
||||||
* <uof:锚点 uof:图形引用="OBJ16"/>
|
|
||||||
* 目标:文本串
|
|
||||||
*/
|
|
||||||
void FileUtils::getUOFTextContent(const QString &path, QString &textContent)
|
|
||||||
{
|
|
||||||
QFileInfo info(path);
|
|
||||||
if (!info.exists() || info.isDir()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QFile file(path);
|
|
||||||
if (!file.open(QIODevice::ReadOnly)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QDomDocument doc;
|
|
||||||
if (!doc.setContent(&file)) {
|
|
||||||
file.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
file.close();
|
|
||||||
|
|
||||||
bool isPPT = false;
|
|
||||||
QDomElement rootElem = doc.documentElement();
|
|
||||||
QDomNode node = rootElem.firstChild();
|
|
||||||
while (!node.isNull()) {
|
|
||||||
QDomElement e = node.toElement();
|
|
||||||
if (!e.isNull() && e.tagName() == "uof:演示文稿") {
|
|
||||||
isPPT = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
node = node.nextSibling();
|
|
||||||
}
|
|
||||||
|
|
||||||
//单独处理pdf文档
|
|
||||||
if (isPPT) {
|
|
||||||
qDebug() << path << "is PPT";
|
|
||||||
processUOFPPT(doc, textContent);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
file.open(QIODevice::ReadOnly);
|
|
||||||
QXmlStreamReader reader(&file);
|
|
||||||
while (!reader.atEnd()) {
|
|
||||||
//适用于文字处理与电子表格
|
|
||||||
if (reader.readNextStartElement() && reader.name().toString() == "文本串") {
|
|
||||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
file.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* uof2.0解析
|
|
||||||
* @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012
|
|
||||||
* ppt文档的内容存放在graphics.xml中,需要先解析content中的引用再解析graphics内容
|
|
||||||
* @param path
|
|
||||||
* @param textContent
|
|
||||||
*/
|
|
||||||
void FileUtils::getUOF2TextContent(const QString &path, QString &textContent)
|
|
||||||
{
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if (!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
|
|
||||||
QuaZip file(path);
|
|
||||||
if (!file.open(QuaZip::mdUnzip))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (!file.setCurrentFile("content.xml")) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QuaZipFile fileR(&file);
|
|
||||||
if (!fileR.open(QIODevice::ReadOnly)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QXmlStreamReader reader(&fileR);
|
|
||||||
|
|
||||||
while (!reader.atEnd()) {
|
|
||||||
if (reader.readNextStartElement() && reader.name().toString() == "文本串_415B") {
|
|
||||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileR.close();
|
|
||||||
file.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileUtils::getUOF2PPTContent(const QString &path, QString &textContent)
|
|
||||||
{
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if (!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
|
|
||||||
QuaZip zipFile(path);
|
|
||||||
QDomDocument doc;
|
|
||||||
if (!loadZipFileToDoc(zipFile, doc, "content.xml")) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QDomElement rootElem = doc.documentElement();
|
|
||||||
QList<QDomElement> nodes;
|
|
||||||
QQueue<QString> names; //每个节点的名称
|
|
||||||
names << "演:幻灯片集_6C0E" << "演:幻灯片_6C0F";
|
|
||||||
findNodes(rootElem, names, nodes);
|
|
||||||
|
|
||||||
if (nodes.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
QStringList attrs;
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "uof:锚点_C644";
|
|
||||||
findNodeAttr(node, names, "图形引用_C62E", attrs);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (attrs.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!loadZipFileToDoc(zipFile, doc, "graphics.xml")) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
nodes.clear();
|
|
||||||
names.clear();
|
|
||||||
names << "图:图形_8062";
|
|
||||||
rootElem = doc.documentElement();
|
|
||||||
findNodesByAttr(rootElem, names, nodes, "标识符_804B", attrs);
|
|
||||||
|
|
||||||
QList<QDomElement> nodes416B; //字:段落_416B
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "图:文本_803C" << "图:内容_8043" << "字:段落_416B";
|
|
||||||
findNodes(node, names, nodes416B);
|
|
||||||
}
|
|
||||||
|
|
||||||
nodes.clear();
|
|
||||||
for (const auto &node : nodes416B) {
|
|
||||||
names.clear();
|
|
||||||
names << "字:句_419D";
|
|
||||||
findNodes(node, names, nodes); //所有的 字:句_419D
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
names.clear();
|
|
||||||
names << "字:文本串_415B";
|
|
||||||
if (findNodeText(node, names, textContent)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* OFD文件解析
|
|
||||||
* @brief 参考: GB/T 33190-2016
|
|
||||||
* @param path
|
|
||||||
* @param textContent
|
|
||||||
*/
|
|
||||||
void FileUtils::getOFDTextContent(const QString &path, QString &textContent)
|
|
||||||
{
|
|
||||||
QFileInfo info = QFileInfo(path);
|
|
||||||
if (!info.exists() || info.isDir())
|
|
||||||
return;
|
|
||||||
|
|
||||||
QuaZip zipfile(path);
|
|
||||||
if (!zipfile.open(QuaZip::mdUnzip))
|
|
||||||
return;
|
|
||||||
|
|
||||||
// GB/T 33190-2016规范定义可以存在多个Doc_x目录,暂时只取第一个目录的内容
|
|
||||||
QString prefix("Doc_0/Pages/");
|
|
||||||
QStringList fileList;
|
|
||||||
for (const auto &file: zipfile.getFileNameList()) {
|
|
||||||
if (file.startsWith(prefix)) {
|
|
||||||
fileList << file;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < fileList.count(); ++i) {
|
|
||||||
QString filename = prefix + "Page_" + QString::number(i) + "/Content.xml";
|
|
||||||
if (!zipfile.setCurrentFile(filename)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
QuaZipFile fileR(&zipfile);
|
|
||||||
fileR.open(QIODevice::ReadOnly);
|
|
||||||
QXmlStreamReader reader(&fileR);
|
|
||||||
|
|
||||||
while (!reader.atEnd()) {
|
|
||||||
if (reader.readNextStartElement() && reader.name().toString() == "TextCode") {
|
|
||||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
|
||||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
|
||||||
fileR.close();
|
|
||||||
zipfile.close();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileR.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
zipfile.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
QString FileUtils::getSnippetWithoutKeyword(const QString &content, int lineCount) {
|
QString FileUtils::getSnippetWithoutKeyword(const QString &content, int lineCount) {
|
||||||
QString snippet;
|
QString snippet;
|
||||||
int numOfLine = 0;
|
int numOfLine = 0;
|
||||||
|
|
|
@ -35,28 +35,16 @@ public:
|
||||||
static QString setAllTextBold(const QString &name);
|
static QString setAllTextBold(const QString &name);
|
||||||
static QString wrapData(QLabel *p_label, const QString &text);
|
static QString wrapData(QLabel *p_label, const QString &text);
|
||||||
static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword);
|
static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword);
|
||||||
static std::string makeDocUterm(QString path);
|
static std::string makeDocUterm(const QString& path);
|
||||||
static QIcon getFileIcon(const QString &uri, bool checkValid = true);
|
static QIcon getFileIcon(const QString &uri, bool checkValid = true);
|
||||||
static QIcon getSettingIcon();
|
static QIcon getSettingIcon();
|
||||||
|
|
||||||
static QString getFileName(const QString &uri);
|
|
||||||
static QString getAppName(const QString &path);
|
|
||||||
static QString getSettingName(const QString &setting);
|
|
||||||
//A is or under B
|
//A is or under B
|
||||||
static bool isOrUnder(QString pathA, QString pathB);
|
static bool isOrUnder(const QString& pathA, const QString& pathB);
|
||||||
static QStringList findMultiToneWords(const QString &hanzi);
|
static QStringList findMultiToneWords(const QString &hanzi);
|
||||||
|
|
||||||
//parse text,docx.....
|
//parse text,docx.....
|
||||||
static QMimeType getMimetype(const QString &path);
|
static QMimeType getMimetype(const QString &path);
|
||||||
static void getDocxTextContent(const QString &path, QString &textcontent);
|
|
||||||
static void getPptxTextContent(const QString &path, QString &textcontent);
|
|
||||||
static void getXlsxTextContent(const QString &path, QString &textcontent);
|
|
||||||
static void getPdfTextContent(const QString &path, QString &textcontent);
|
|
||||||
static void getTxtContent(const QString &path, QString &textcontent);
|
|
||||||
static void getUOFTextContent(const QString &path, QString &textContent);
|
|
||||||
static void getUOF2TextContent(const QString &path, QString &textContent);
|
|
||||||
static void getUOF2PPTContent(const QString &path, QString &textContent);
|
|
||||||
static void getOFDTextContent(const QString &path, QString &textContent);
|
|
||||||
|
|
||||||
static int openFile(QString &path, bool openInDir = false);
|
static int openFile(QString &path, bool openInDir = false);
|
||||||
static bool copyPath(QString &path);
|
static bool copyPath(QString &path);
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
*/
|
*/
|
||||||
#include "file-content-indexer.h"
|
#include "file-content-indexer.h"
|
||||||
#include <QDateTime>
|
#include <QDateTime>
|
||||||
|
#include <QFileInfo>
|
||||||
#include "file-reader.h"
|
#include "file-reader.h"
|
||||||
#include "file-utils.h"
|
#include "file-utils.h"
|
||||||
#include "chinese-segmentation.h"
|
#include "chinese-segmentation.h"
|
||||||
|
@ -36,7 +37,7 @@ bool fileContentIndexer::index()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
QString suffix = info.suffix();
|
QString suffix = info.suffix();
|
||||||
FileReader::getTextContent(m_filePath, content, suffix);
|
FileReader::getInstance()->getTextContent(m_filePath, content, suffix);
|
||||||
if(content.isEmpty()) {
|
if(content.isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -47,8 +48,8 @@ bool fileContentIndexer::index()
|
||||||
content.clear();
|
content.clear();
|
||||||
content.squeeze();
|
content.squeeze();
|
||||||
|
|
||||||
for(size_t i = 0; i < term.size(); ++i) {
|
for(auto & i : term) {
|
||||||
m_document.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
m_document.addPosting(i.word, i.offsets, static_cast<int>(i.weight));
|
||||||
}
|
}
|
||||||
term.clear();
|
term.clear();
|
||||||
term.shrink_to_fit();
|
term.shrink_to_fit();
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* Authors: iaom <zhangpengfei@kylinos.cn>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "file-extraction-result.h"
|
||||||
|
namespace UkuiSearch {
|
||||||
|
// Private data holder for FileExtractionResult; the public class owns one
// instance through its `d` pointer.
class FileExtractionResultPrivate
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Property/value pairs collected by FileExtractionResult::add().
UkuiFileMetadata::PropertyMultiMap m_properties;
|
||||||
|
// Text accumulated by FileExtractionResult::append().
QString m_text;
|
||||||
|
// Types collected by FileExtractionResult::addType(), in insertion order.
QVector<UkuiFileMetadata::Type::Type> m_types;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Constructs an empty result: url, mimetype and flags are forwarded to the
// ExtractionResult base class and a fresh, empty private data object is allocated.
UkuiSearch::FileExtractionResult::FileExtractionResult(const QString &url, const QString &mimetype,
|
||||||
|
const UkuiFileMetadata::ExtractionResult::Flags &flags)
|
||||||
|
: ExtractionResult(url, mimetype, flags)
|
||||||
|
, d(new FileExtractionResultPrivate)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// Defaulted here rather than in the header: FileExtractionResultPrivate is only
// forward-declared there, and the std::unique_ptr member needs the complete type
// to destroy it.
FileExtractionResult::~FileExtractionResult() = default;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Copy constructor.
 *
 * The ExtractionResult base sub-object is copy-constructed from @p rhs.
 * It must be rhs and not *this: at this point the new object's own base has
 * not been constructed yet, so copying from *this would read an
 * uninitialised object.
 *
 * The private data (properties, text, types) is copied into a newly allocated
 * FileExtractionResultPrivate so that both objects own separate data objects.
 *
 * @param rhs the result to copy from.
 */
FileExtractionResult::FileExtractionResult(const FileExtractionResult &rhs)
    : ExtractionResult(rhs)
    , d(new FileExtractionResultPrivate(*rhs.d))
{
}
|
||||||
|
|
||||||
|
/**
 * Copy assignment.
 *
 * Overwrites this object's private data (properties, text, types) with a copy
 * of the data held by @p rhs. Only the private data object is assigned here;
 * the ExtractionResult base sub-object is not touched by this operator.
 *
 * @param rhs the result to copy from.
 * @return reference to this object.
 */
FileExtractionResult &FileExtractionResult::operator=(const FileExtractionResult &rhs)
{
    const FileExtractionResultPrivate &source = *rhs.d;
    FileExtractionResultPrivate &target = *d;
    target = source;
    return *this;
}
|
||||||
|
|
||||||
|
/**
 * Records one extracted property/value pair.
 *
 * @param property the property identifier.
 * @param value    the value to store for it.
 */
void FileExtractionResult::add(UkuiFileMetadata::Property::Property property, const QVariant &value)
{
    UkuiFileMetadata::PropertyMultiMap &collected = d->m_properties;
    collected.insert(property, value);
}
|
||||||
|
|
||||||
|
/**
 * Appends a type to the list of types collected for the file.
 *
 * @param type the type to record; duplicates are not filtered out.
 */
void FileExtractionResult::addType(UkuiFileMetadata::Type::Type type)
{
    d->m_types.append(type);
}
|
||||||
|
|
||||||
|
void FileExtractionResult::append(const QString &text)
|
||||||
|
{
|
||||||
|
QString tmp = text;
|
||||||
|
d->m_text.append(tmp.replace("\n", "").replace("\r", " "));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns, by value, the property/value pairs recorded so far through add().
UkuiFileMetadata::PropertyMultiMap FileExtractionResult::properties() const
|
||||||
|
{
|
||||||
|
return d->m_properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns, by value, the text accumulated so far through append().
QString FileExtractionResult::text() const
|
||||||
|
{
|
||||||
|
return d->m_text;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns, by value, the types recorded so far through addType().
QVector<UkuiFileMetadata::Type::Type> FileExtractionResult::types() const
|
||||||
|
{
|
||||||
|
return d->m_types;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* Authors: iaom <zhangpengfei@kylinos.cn>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
#ifndef UKUI_SEARCH_FILE_EXTRACTION_RESULT_H
|
||||||
|
#define UKUI_SEARCH_FILE_EXTRACTION_RESULT_H
|
||||||
|
|
||||||
|
#include <memory>

#include <extraction-result.h>
|
||||||
|
|
||||||
|
namespace UkuiSearch {
|
||||||
|
class FileExtractionResultPrivate;
|
||||||
|
// ExtractionResult implementation that simply stores everything an extractor
// reports (properties, types and plain text) so it can be read back afterwards.
class FileExtractionResult : public UkuiFileMetadata::ExtractionResult
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Creates an empty result for the file at `url`; by default both plain text
// and metadata extraction are requested.
explicit FileExtractionResult(const QString& url, const QString& mimetype = QString(), const Flags& flags = Flags{ExtractPlainText | ExtractMetaData});
|
||||||
|
// Copies rhs, allocating a separate private data object for the new instance.
FileExtractionResult(const FileExtractionResult& rhs);
|
||||||
|
// Declared here and defined in the .cpp file, where the private class is complete.
~FileExtractionResult() override;
|
||||||
|
// Replaces the stored properties, text and types with copies of those in rhs.
FileExtractionResult& operator=(const FileExtractionResult& rhs);
|
||||||
|
|
||||||
|
// Stores one property/value pair.
void add(UkuiFileMetadata::Property::Property property, const QVariant& value) override;
|
||||||
|
// Appends one type to the stored list of types.
void addType(UkuiFileMetadata::Type::Type type) override;
|
||||||
|
// Appends text to the stored text; line feeds are removed and carriage
// returns are replaced by spaces before appending.
void append(const QString& text) override;
|
||||||
|
|
||||||
|
// Accessors returning copies of the stored data.
UkuiFileMetadata::PropertyMultiMap properties() const;
|
||||||
|
QString text() const;
|
||||||
|
QVector<UkuiFileMetadata::Type::Type> types() const;
|
||||||
|
private:
|
||||||
|
// Owning pointer to the private data; the pointer itself is const and is
// never reseated after construction.
const std::unique_ptr<FileExtractionResultPrivate> d;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif //UKUI_SEARCH_FILE_EXTRACTION_RESULT_H
|
|
@ -18,51 +18,39 @@
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include "file-reader.h"
|
#include "file-reader.h"
|
||||||
#include "file-utils.h"
|
#include <ocr-utils.h>
|
||||||
#include "binary-parser.h"
|
#include <mime-utils.h>
|
||||||
#include "ocrobject.h"
|
#include <mutex>
|
||||||
|
#include "file-extraction-result.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
using namespace UkuiSearch;
|
|
||||||
FileReader::FileReader(QObject *parent) : QObject(parent)
|
|
||||||
{
|
|
||||||
|
|
||||||
|
using namespace UkuiSearch;
|
||||||
|
// Singleton state for FileReader. Both variables are given internal linkage
// (static) so these generic names cannot collide with identically named
// globals defined in other translation units.
static FileReader *g_instance = nullptr;
static std::once_flag g_instanceFlag;

/**
 * Returns the process-wide FileReader instance.
 *
 * The instance is created on the first call; std::call_once guarantees the
 * creation runs exactly once even when several threads call this function
 * concurrently. The instance is never deleted.
 *
 * @return pointer to the shared FileReader; never null after the first call.
 */
FileReader *FileReader::getInstance()
{
    std::call_once(g_instanceFlag, [] () {
        g_instance = new FileReader;
    });
    return g_instance;
}
|
||||||
|
|
||||||
|
FileReader::FileReader()
|
||||||
|
= default;
|
||||||
void FileReader::getTextContent(const QString &path, QString &textContent, const QString &suffix)
|
void FileReader::getTextContent(const QString &path, QString &textContent, const QString &suffix)
|
||||||
{
|
{
|
||||||
if (suffix == "docx") {
|
if(targetPhotographTypeMap[suffix]) {
|
||||||
FileUtils::getDocxTextContent(path, textContent);
|
textContent = UkuiFileMetadata::OcrUtils::getTextInPicture(path);
|
||||||
} else if (suffix == "pptx") {
|
|
||||||
FileUtils::getPptxTextContent(path, textContent);
|
|
||||||
} else if (suffix == "xlsx") {
|
|
||||||
FileUtils::getXlsxTextContent(path, textContent);
|
|
||||||
} else if (suffix == "txt" or suffix == "html") {
|
|
||||||
FileUtils::getTxtContent(path, textContent);
|
|
||||||
} else if (suffix == "doc" || suffix == "dot" || suffix == "wps" || suffix == "ppt" ||
|
|
||||||
suffix == "pps" || suffix == "dps" || suffix == "et" || suffix == "xls") {
|
|
||||||
KBinaryParser searchdata;
|
|
||||||
searchdata.RunParser(path, textContent);
|
|
||||||
} else if (suffix == "pdf") {
|
|
||||||
FileUtils::getPdfTextContent(path, textContent);
|
|
||||||
} else if (true == targetPhotographTypeMap[suffix]){
|
|
||||||
OcrObject::getInstance()->getTxtContent(path, textContent);
|
|
||||||
} else if (suffix == "uof") {
|
|
||||||
QString mimeName = FileUtils::getMimetype(path).name();
|
|
||||||
if (mimeName == "application/xml" || mimeName == "application/uof") {
|
|
||||||
FileUtils::getUOFTextContent(path, textContent);
|
|
||||||
|
|
||||||
} else if (mimeName == "application/x-ole-storage") {
|
|
||||||
//uof的ppt文档不支持修改母版。一旦进行这些操作,uof文档可能会被wps存为doc文件
|
|
||||||
KBinaryParser searchdata;
|
|
||||||
searchdata.RunParser(path, textContent);
|
|
||||||
}
|
|
||||||
} else if (suffix == "uot" || suffix == "uos") {
|
|
||||||
FileUtils::getUOF2TextContent(path, textContent);
|
|
||||||
|
|
||||||
} else if (suffix == "uop") {
|
|
||||||
FileUtils::getUOF2PPTContent(path, textContent);
|
|
||||||
|
|
||||||
} else if (suffix == "ofd") {
|
|
||||||
FileUtils::getOFDTextContent(path, textContent);
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
QString mimeType = UkuiFileMetadata::MimeUtils::strictMimeType(path, {}).name();
|
||||||
|
QList<UkuiFileMetadata::Extractor*> extractors = m_extractorManager.fetchExtractors(mimeType);
|
||||||
|
FileExtractionResult result(path, mimeType, UkuiFileMetadata::ExtractionResult::Flag::ExtractPlainText);
|
||||||
|
for(auto extractor : extractors) {
|
||||||
|
extractor->extract(&result);
|
||||||
|
if(!result.text().isEmpty()) {
|
||||||
|
textContent = result.text();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -20,15 +20,17 @@
|
||||||
#ifndef FILEREADER_H
|
#ifndef FILEREADER_H
|
||||||
#define FILEREADER_H
|
#define FILEREADER_H
|
||||||
|
|
||||||
#include <QObject>
|
#include <extractor-manager.h>
|
||||||
#include <QFileInfo>
|
|
||||||
namespace UkuiSearch {
|
namespace UkuiSearch {
|
||||||
class FileReader : public QObject {
|
class FileReader{
|
||||||
Q_OBJECT
|
|
||||||
public:
|
public:
|
||||||
explicit FileReader(QObject *parent = nullptr);
|
static FileReader* getInstance();
|
||||||
~FileReader() = default;
|
~FileReader() = default;
|
||||||
static void getTextContent(const QString &path, QString &textContent, const QString &suffix);
|
void getTextContent(const QString &path, QString &textContent, const QString &suffix);
|
||||||
|
|
||||||
|
private:
|
||||||
|
FileReader();
|
||||||
|
UkuiFileMetadata::ExtractorManager m_extractorManager;
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,119 +0,0 @@
|
||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2023, KylinSoft Co., Ltd.
|
|
||||||
*
|
|
||||||
* This program is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation, either version 3 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
#include "ocrobject.h"
|
|
||||||
|
|
||||||
OcrObject *OcrObject::m_instance = nullptr;
|
|
||||||
once_flag g_instanceFlag;
|
|
||||||
|
|
||||||
OcrObject *OcrObject::getInstance()
|
|
||||||
{
|
|
||||||
std::call_once(g_instanceFlag, [] () {
|
|
||||||
m_instance = new OcrObject;
|
|
||||||
});
|
|
||||||
return m_instance;
|
|
||||||
}
|
|
||||||
|
|
||||||
void OcrObject::getTxtContent(const QString &path, QString &textcontent)
|
|
||||||
{
|
|
||||||
// m_api = new tesseract::TessBaseAPI();
|
|
||||||
// if (m_api->Init(NULL, "chi_sim")) {
|
|
||||||
// qDebug() << "Could not initialize tesseract.\n";
|
|
||||||
// return;
|
|
||||||
// }
|
|
||||||
// m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
|
||||||
|
|
||||||
// Pix *image = pixRead(path.toStdString().data());
|
|
||||||
// if (!image) {
|
|
||||||
// qDebug() << "path:" << path <<" pixRead error!";
|
|
||||||
// if (m_api) {
|
|
||||||
// m_api->End();
|
|
||||||
// delete m_api;
|
|
||||||
// m_api = nullptr;
|
|
||||||
// }
|
|
||||||
// return;
|
|
||||||
// }
|
|
||||||
// m_api->SetImage(image);
|
|
||||||
// textcontent = m_api->GetUTF8Text();
|
|
||||||
// qDebug() << "path:" << path << " Text:" << textcontent;
|
|
||||||
// pixDestroy(&image);
|
|
||||||
// m_api->Clear();
|
|
||||||
|
|
||||||
// if (m_api) {
|
|
||||||
// m_api->End();
|
|
||||||
// delete m_api;
|
|
||||||
// m_api = nullptr;
|
|
||||||
// }
|
|
||||||
|
|
||||||
//多进程版本
|
|
||||||
//qDebug() << "path:" << path;
|
|
||||||
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
|
|
||||||
if (api->Init(NULL, "chi_sim")) {
|
|
||||||
qDebug() << "Could not initialize tesseract.\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
|
||||||
|
|
||||||
Pix *image = pixRead(path.toStdString().data());
|
|
||||||
if (!image) {
|
|
||||||
// qDebug() << "path:" << path <<" pixRead error!";
|
|
||||||
if (api) {
|
|
||||||
api->End();
|
|
||||||
delete api;
|
|
||||||
api = nullptr;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
api->SetImage(image);
|
|
||||||
char *tmp = api->GetUTF8Text();
|
|
||||||
textcontent = QString::fromLocal8Bit(tmp);
|
|
||||||
delete [] tmp;
|
|
||||||
//qDebug() << " Text:" << textcontent;
|
|
||||||
pixDestroy(&image);
|
|
||||||
api->Clear();
|
|
||||||
|
|
||||||
if (api) {
|
|
||||||
api->End();
|
|
||||||
delete api;
|
|
||||||
api = nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
OcrObject::OcrObject(QObject *parent) : QObject(parent)
|
|
||||||
{
|
|
||||||
// init();
|
|
||||||
}
|
|
||||||
|
|
||||||
OcrObject::~OcrObject()
|
|
||||||
{
|
|
||||||
// if (m_api) {
|
|
||||||
// m_api->End();
|
|
||||||
// delete m_api;
|
|
||||||
// m_api = nullptr;
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
void OcrObject::init()
|
|
||||||
{
|
|
||||||
m_api = new tesseract::TessBaseAPI();
|
|
||||||
if (m_api->Init(NULL, "chi_sim")) {
|
|
||||||
qDebug() << "Could not initialize tesseract.\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
m_api->SetVariable("user_defined_dpi", "1080");//图片中未标明分辨率的默认设置为1080
|
|
||||||
}
|
|
|
@ -1,60 +0,0 @@
|
||||||
/*
|
|
||||||
*
|
|
||||||
* Copyright (C) 2023, KylinSoft Co., Ltd.
|
|
||||||
*
|
|
||||||
* This program is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation, either version 3 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
||||||
*
|
|
||||||
* Authors: iaom <zhangpengfei@kylinos.cn>
|
|
||||||
*/
|
|
||||||
#ifndef OCROBJECT_H
|
|
||||||
#define OCROBJECT_H
|
|
||||||
|
|
||||||
#include <QObject>
|
|
||||||
#include <mutex>
|
|
||||||
#include <tesseract/baseapi.h>
|
|
||||||
#include <leptonica/allheaders.h>
|
|
||||||
#include <QDebug>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
class OcrObject : public QObject
|
|
||||||
{
|
|
||||||
Q_OBJECT
|
|
||||||
public:
|
|
||||||
static OcrObject* getInstance();
|
|
||||||
|
|
||||||
void getTxtContent(const QString &path, QString &textcontent);
|
|
||||||
|
|
||||||
protected:
|
|
||||||
explicit OcrObject(QObject *parent = nullptr);
|
|
||||||
~OcrObject();
|
|
||||||
|
|
||||||
private:
|
|
||||||
static OcrObject *m_instance;
|
|
||||||
|
|
||||||
tesseract::TessBaseAPI *m_api = nullptr;
|
|
||||||
void init();
|
|
||||||
|
|
||||||
class Garbo
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
~Garbo() {
|
|
||||||
if (OcrObject::m_instance)
|
|
||||||
delete OcrObject::m_instance;
|
|
||||||
}
|
|
||||||
static Garbo g_garbo;
|
|
||||||
};
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // OCROBJECT_H
|
|
|
@ -133,8 +133,8 @@ void Monitor::processUpdate(IndexType type, uint all, uint finished)
|
||||||
case IndexType::OCR:
|
case IndexType::OCR:
|
||||||
m_ocrIndexSize = all;
|
m_ocrIndexSize = all;
|
||||||
Q_EMIT ocrIndexSizeChanged(m_ocrIndexSize);
|
Q_EMIT ocrIndexSizeChanged(m_ocrIndexSize);
|
||||||
m_contentIndexProgress = finished;
|
m_ocrIndexProgress = finished;
|
||||||
Q_EMIT ocrIndexProgressChanged(m_contentIndexProgress);
|
Q_EMIT ocrIndexProgressChanged(m_ocrIndexProgress);
|
||||||
m_ocrContentIndexDocNum = m_ocrContentDatabase.getIndexDocCount();
|
m_ocrContentIndexDocNum = m_ocrContentDatabase.getIndexDocCount();
|
||||||
Q_EMIT ocrContentIndexDocNumChanged(m_ocrContentDatabase.getIndexDocCount());
|
Q_EMIT ocrContentIndexDocNumChanged(m_ocrContentDatabase.getIndexDocCount());
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Reference in New Issue