diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp
index ccf305f..fcfbb49 100644
--- a/libsearch/file-utils.cpp
+++ b/libsearch/file-utils.cpp
@@ -1107,6 +1107,7 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
     //单独处理pdf文档
     if (isPDF) {
         qDebug() << path << "is PDF";
+        processUOFPPT(doc, textContent);
         return;
     }
 
@@ -1125,6 +1126,141 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
     file.close();
 }
 
+/**
+ * @brief Extract the text content of a UOF presentation document.
+ *
+ * Reads the "uof:图形引用" attribute of the "uof:锚点" children of every
+ * matched "演:幻灯片" element, keeps the "图:图形" elements under
+ * "uof:对象集" whose "图:标识符" attribute is one of those values, and
+ * appends the text of their "字:文本串" descendants to content.
+ * @param doc     parsed document to read from
+ * @param content output; extracted text is appended to it
+ */
+void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content)
+{
+    QDomElement rootElem = doc.documentElement();
+    QList<QDomElement> nodes;
+    QQueue<QString> names; // tag-name chain for the next lookup
+    names << "uof:演示文稿" << "演:主体" << "演:幻灯片集" << "演:幻灯片";
+
+    findNodes(rootElem, names, nodes);
+
+    if (nodes.empty()) {
+        // TODO: when the uof-ppt has no anchor nodes, search for text nodes directly?
+        return;
+    }
+
+    QStringList objs;
+    // each 演:幻灯片 -> its anchors
+    for (const auto &node : nodes) {
+        names.clear();
+        names << "uof:锚点";
+        findNodeAttr(node, names, "uof:图形引用", objs);
+    }
+
+    nodes.clear();
+    names.clear();
+    names << "uof:对象集" << "图:图形";
+    findNodesByAttr(rootElem, names, nodes, "图:标识符", objs);
+
+    if (nodes.empty()) {
+        return;
+    }
+
+    for (const auto &node : nodes) {
+        names.clear();
+        names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串";
+        // findNodeText returns true once enough text has been collected
+        if (findNodeText(node, names, content)) {
+            break;
+        }
+    }
+}
+
+/**
+ * @brief Find descendants of elem by following a chain of tag names.
+ *
+ * The queue is consumed while descending. For every name except the last,
+ * only the first matching child is followed; for the last name, all
+ * matching siblings are collected.
+ * @param elem  starting element
+ * @param names chain of tag names, outermost first (consumed by the call)
+ * @param nodes receives every element found
+ */
+void FileUtils::findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes)
+{
+    // dequeue() must not be called on an empty queue
+    if (names.empty()) {
+        return;
+    }
+
+    QString targetName = names.dequeue();
+    QDomNode node = elem.firstChild();
+    while (!node.isNull()) {
+        QDomElement e = node.toElement();
+        if (!e.isNull() && e.tagName() == targetName) {
+            if (names.empty()) {
+                nodes.append(e);
+            } else {
+                findNodes(e, names, nodes);
+                // names has been consumed by the recursion, so stop here
+                break;
+            }
+        }
+        node = node.nextSibling();
+    }
+}
+
+/**
+ * @brief Like findNodes, but keep only elements whose attribute attr has one of the given values.
+ */
+inline void FileUtils::findNodesByAttr(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes, const QString &attr, const QStringList &values)
+{
+    findNodes(elem, names, nodes);
+
+    QList<QDomElement>::iterator it = nodes.begin();
+    while (it != nodes.end()) {
+        if (it->hasAttribute(attr) && values.contains(it->attribute(attr))) {
+            ++it;
+        } else {
+            it = nodes.erase(it);
+        }
+    }
+}
+
+/**
+ * @brief Append the text of every element matched by names to content.
+ * @return true once content.length() reaches MAX_CONTENT_LENGTH / 3
+ */
+inline bool FileUtils::findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content)
+{
+    QList<QDomElement> nodes;
+    findNodes(elem, names, nodes);
+
+    for (const auto &node : nodes) {
+        content.append(node.text());
+        if (content.length() >= MAX_CONTENT_LENGTH / 3) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/**
+ * @brief Append the value of attr from every matched element that has it to attrs.
+ */
+inline void FileUtils::findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs)
+{
+    QList<QDomElement> nodes;
+    findNodes(elem, names, nodes);
+
+    for (const auto &node : nodes) {
+        if (node.hasAttribute(attr)) {
+            attrs.append(node.attribute(attr));
+        }
+    }
+}
+
 /**
  * uof2.0解析
  * @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012
diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h
index 13a0a0f..b49ef23 100644
--- a/libsearch/file-utils.h
+++ b/libsearch/file-utils.h
@@ -109,6 +109,14 @@ public:
     enum class SearchMethod { DIRECTSEARCH = 0, INDEXSEARCH = 1};
     static SearchMethod searchMethod;
 
+private:
+    // UOF presentation parsing helpers; definitions are in file-utils.cpp
+    static void findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes);
+    static inline void findNodesByAttr(const QDomElement&, QQueue<QString>&, QList<QDomElement>&, const QString &, const QStringList&);
+    static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
+    static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
+    static void processUOFPPT(const QDomDocument &doc, QString &content);
+
 private:
     FileUtils();
 };