From 4294019e28c7fce1acd5fdbfbce130c0df336251 Mon Sep 17 00:00:00 2001 From: hewenfei Date: Wed, 11 May 2022 09:08:23 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=A7=A3=E6=9E=90uop?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E6=96=87=E6=A1=A3,=E5=A2=9E=E5=8A=A0uof=20pp?= =?UTF-8?q?t=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E7=B2=92=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libsearch/common.h | 2 +- libsearch/file-utils.cpp | 115 ++++++++++++++++++++++++++++++-- libsearch/file-utils.h | 2 + libsearch/index/file-reader.cpp | 6 +- 4 files changed, 117 insertions(+), 8 deletions(-) diff --git a/libsearch/common.h b/libsearch/common.h index 08bd864..b7ccb42 100644 --- a/libsearch/common.h +++ b/libsearch/common.h @@ -43,7 +43,7 @@ static const QMap targetFileTypeMap = { {"uof", true}, {"uot", true}, {"uos", true}, - {"uop", false}, + {"uop", true}, {"ofd", true} }; diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index fcfbb49..28d2f95 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -1092,21 +1092,21 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent) } file.close(); - bool isPDF = false; + bool isPPT = false; QDomElement rootElem = doc.documentElement(); QDomNode node = rootElem.firstChild(); while (!node.isNull()) { QDomElement e = node.toElement(); if (!e.isNull() && e.tagName() == "uof:演示文稿") { - isPDF = true; + isPPT = true; break; } node = node.nextSibling(); } //单独处理pdf文档 - if (isPDF) { - qDebug() << path << "is PDF"; + if (isPPT) { + qDebug() << path << "is PPT"; processUOFPPT(doc, textContent); return; } @@ -1157,9 +1157,23 @@ void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content) return; } + QList paraNodes; //全部段落节点 for (const auto &node : nodes) { names.clear(); - names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串"; + names << "图:文本内容" << "字:段落"; + findNodes(node, names, paraNodes); + } + + nodes.clear(); + for (const auto &node : paraNodes) { + names.clear(); + names << "字:句"; + findNodes(node, names, nodes); //全部段落下的全部句节点 + } + + for (const auto &node : nodes) { + names.clear(); + names << "字:文本串"; if (findNodeText(node, names, content)) { break; } @@ -1272,6 +1286,97 @@ void FileUtils::getUOF2TextContent(QString &path, QString &textContent) file.close(); } +void FileUtils::getUOF2PPTContent(QString &path, QString &textContent) +{ + QFileInfo info = QFileInfo(path); + if (!info.exists() || info.isDir()) + return; + + QuaZip zipFile(path); + QDomDocument doc; + if (!loadZipFileToDoc(zipFile, doc, "content.xml")) { + return; + } + + QDomElement rootElem = doc.documentElement(); + QList nodes; + QQueue names; //每个节点的名称 + names << "演:幻灯片集_6C0E" << "演:幻灯片_6C0F"; + findNodes(rootElem, names, nodes); + + if (nodes.empty()) { + return; + } + + QStringList attrs; + for (const auto &node : nodes) { + names.clear(); + names << "uof:锚点_C644"; + findNodeAttr(node, names, "图形引用_C62E", attrs); + } + + if (attrs.empty()) { + return; + } + + if (!loadZipFileToDoc(zipFile, doc, "graphics.xml")) { + return; + } + + nodes.clear(); + names.clear(); + names << "图:图形_8062"; + rootElem = doc.documentElement(); + findNodesByAttr(rootElem, names, nodes, "标识符_804B", attrs); + + QList nodes416B; //字:段落_416B + for (const auto &node : nodes) { + names.clear(); + names << "图:文本_803C" << "图:内容_8043" << "字:段落_416B"; + findNodes(node, names, nodes416B); + } + + nodes.clear(); + for (const auto &node : nodes416B) { + names.clear(); + names << "字:句_419D"; + findNodes(node, names, nodes); //所有的 字:句_419D + } + + for (const auto &node : nodes) { + names.clear(); + names << "字:文本串_415B"; + if (findNodeText(node, names, textContent)) { + break; + } + } +} + +inline bool FileUtils::loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName) +{ + if (!zipFile.isOpen() && !zipFile.open(QuaZip::mdUnzip)) { + return false; + } + + if (!zipFile.setCurrentFile(fileName)) { + return false; + } + + QuaZipFile file(&zipFile); + if (!file.open(QIODevice::ReadOnly)) { + return false; + } + + doc.clear(); + if (!doc.setContent(&file)) { + file.close(); + return false; + } + file.close(); + + return true; +} + /** * OFD文件解析 * @brief 参考: GB/T 33190-2016 diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h index b49ef23..d141f50 100644 --- a/libsearch/file-utils.h +++ b/libsearch/file-utils.h @@ -93,6 +93,7 @@ public: static void getTxtContent(QString &path, QString &textcontent); static void getUOFTextContent(QString &path, QString &textContent); static void getUOF2TextContent(QString &path, QString &textContent); + static void getUOF2PPTContent(QString &path, QString &textContent); static void getOFDTextContent(QString &path, QString &textContent); static int openFile(QString &path, bool openInDir = false); @@ -115,6 +116,7 @@ private: static inline bool findNodeText(const QDomElement &elem, QQueue &names, QString &content); static inline void findNodeAttr(const QDomElement &elem, QQueue &names, const QString &attr, QStringList &attrs); static void processUOFPPT(const QDomDocument &doc, QString &content); + static inline bool loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName); private: FileUtils(); diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index 45a74d6..401e6fd 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -55,10 +55,12 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf KBinaryParser searchdata; searchdata.RunParser(path, textContent); } - - } else if (suffix == "uot" || suffix == "uos" || suffix == "uop") { + } else if (suffix == "uot" || suffix == "uos") { FileUtils::getUOF2TextContent(path, textContent); + } else if (suffix == "uop") { + FileUtils::getUOF2PPTContent(path, textContent); + } else if (suffix == "ofd") { FileUtils::getOFDTextContent(path, textContent); }