新增解析uof格式ppt文档

This commit is contained in:
hewenfei 2022-05-09 14:47:40 +08:00 committed by iaom
parent a71a14ffbf
commit 04267253a6
2 changed files with 113 additions and 0 deletions

View File

@ -1107,6 +1107,7 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
//单独处理pdf文档 //单独处理pdf文档
if (isPDF) { if (isPDF) {
qDebug() << path << "is PDF"; qDebug() << path << "is PDF";
processUOFPPT(doc, textContent);
return; return;
} }
@ -1125,6 +1126,111 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
file.close(); file.close();
} }
void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content)
{
QDomElement rootElem = doc.documentElement();
QList<QDomElement> nodes;
QQueue<QString> names; //每个节点的名称
names << "uof:演示文稿" << "演:主体" << "演:幻灯片集" << "演:幻灯片";
findNodes(rootElem, names, nodes);
if (nodes.empty()) {
//TODO 在uof-ppt不存在锚点节点时直接查找文本节点
return;
}
QStringList objs;
//每一个 演:幻灯片 -> 锚点
for (const auto &node : nodes) {
names.clear();
names << "uof:锚点";
findNodeAttr(node, names, "uof:图形引用", objs);
}
nodes.clear();
names.clear();
names << "uof:对象集" << "图:图形";
findNodesByAttr(rootElem, names, nodes, "图:标识符", objs);
if (nodes.empty()) {
return;
}
for (const auto &node : nodes) {
names.clear();
names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串";
if (findNodeText(node, names, content)) {
break;
}
}
}
/**
* @brief elem的子节点
* @param elem
* @param names
* @param nodes
*/
void FileUtils::findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes)
{
QString targetName = names.dequeue();
QDomNode node = elem.firstChild();
while (!node.isNull()) {
QDomElement e = node.toElement();
if (!e.isNull() && e.tagName() == targetName) {
if (names.empty()) {
nodes.append(e);
} else {
findNodes(e, names, nodes);
break;
}
}
node = node.nextSibling();
}
}
/**
 * @brief Finds elements along a tag-name path and keeps only those whose
 *        attribute value is listed in values.
 *
 * The elements found by findNodes() are appended to nodes; afterwards every
 * entry of nodes that lacks attr, or whose attr value is not in values, is
 * removed. The relative order of the surviving entries is unchanged.
 *
 * @param elem   element the path search starts from
 * @param names  tag-name path; consumed by the search
 * @param nodes  in/out list of elements, filtered in place
 * @param attr   name of the attribute to test
 * @param values accepted attribute values
 */
inline void FileUtils::findNodesByAttr(const QDomElement &elem, QQueue <QString> &names, QList <QDomElement> &nodes, const QString &attr, const QStringList &values)
{
    findNodes(elem, names, nodes);

    for (auto cur = nodes.begin(); cur != nodes.end(); ) {
        const bool rejected = !cur->hasAttribute(attr) || !values.contains(cur->attribute(attr));
        if (rejected) {
            // erase() hands back the iterator to the element after the removed one.
            cur = nodes.erase(cur);
            continue;
        }
        ++cur;
    }
}
inline bool FileUtils::findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content)
{
QList<QDomElement> nodes;
findNodes(elem, names, nodes);
for (const auto &node : nodes) {
content.append(node.text());
if (content.length() >= MAX_CONTENT_LENGTH / 3) {
return true;
}
}
return false;
}
inline void FileUtils::findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs)
{
QList<QDomElement> nodes;
findNodes(elem, names, nodes);
for (const auto &node : nodes) {
if (node.hasAttribute(attr)) {
attrs.append(node.attribute(attr));
}
}
}
/** /**
* uof2.0 * uof2.0
* @brief https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012 * @brief https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012

View File

@ -109,6 +109,13 @@ public:
enum class SearchMethod { DIRECTSEARCH = 0, INDEXSEARCH = 1}; enum class SearchMethod { DIRECTSEARCH = 0, INDEXSEARCH = 1};
static SearchMethod searchMethod; static SearchMethod searchMethod;
private:
// Helpers that walk a QDomElement tree along a path of tag names.
// In every helper the `names` queue is consumed by the search.

// Appends to `nodes` the elements reached from `elem` by following the tag-name path in `names`.
static void findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes);
// Runs findNodes(), then removes from the list every element whose given attribute
// is missing or whose value is not in the given string list.
static inline void findNodesByAttr(const QDomElement&, QQueue<QString>&, QList<QDomElement>&, const QString &, const QStringList&);
// Appends the text of the found elements to `content`; returns true once
// `content` has reached MAX_CONTENT_LENGTH / 3 characters.
static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
// Appends to `attrs` the value of attribute `attr` of each found element that has it.
static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
// Extracts text from a UOF presentation document and appends it to `content`.
static void processUOFPPT(const QDomDocument &doc, QString &content);
private: private:
FileUtils(); FileUtils();
}; };