forked from openkylin/ukui-search
新增解析uof格式ppt文档
This commit is contained in:
parent
a71a14ffbf
commit
04267253a6
|
@ -1107,6 +1107,7 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
|
||||||
//单独处理pdf文档
|
//单独处理pdf文档
|
||||||
if (isPDF) {
|
if (isPDF) {
|
||||||
qDebug() << path << "is PDF";
|
qDebug() << path << "is PDF";
|
||||||
|
processUOFPPT(doc, textContent);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1125,6 +1126,111 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
|
||||||
file.close();
|
file.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content)
|
||||||
|
{
|
||||||
|
QDomElement rootElem = doc.documentElement();
|
||||||
|
QList<QDomElement> nodes;
|
||||||
|
QQueue<QString> names; //每个节点的名称
|
||||||
|
names << "uof:演示文稿" << "演:主体" << "演:幻灯片集" << "演:幻灯片";
|
||||||
|
|
||||||
|
findNodes(rootElem, names, nodes);
|
||||||
|
|
||||||
|
if (nodes.empty()) {
|
||||||
|
//TODO 在uof-ppt不存在锚点节点时,直接查找文本节点?
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
QStringList objs;
|
||||||
|
//每一个 演:幻灯片 -> 锚点
|
||||||
|
for (const auto &node : nodes) {
|
||||||
|
names.clear();
|
||||||
|
names << "uof:锚点";
|
||||||
|
findNodeAttr(node, names, "uof:图形引用", objs);
|
||||||
|
}
|
||||||
|
|
||||||
|
nodes.clear();
|
||||||
|
names.clear();
|
||||||
|
names << "uof:对象集" << "图:图形";
|
||||||
|
findNodesByAttr(rootElem, names, nodes, "图:标识符", objs);
|
||||||
|
|
||||||
|
if (nodes.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto &node : nodes) {
|
||||||
|
names.clear();
|
||||||
|
names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串";
|
||||||
|
if (findNodeText(node, names, content)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief 查找elem的子节点
|
||||||
|
* @param elem 起始节点
|
||||||
|
* @param names 名称链
|
||||||
|
* @param nodes 查找到的全部结果
|
||||||
|
*/
|
||||||
|
void FileUtils::findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes)
|
||||||
|
{
|
||||||
|
QString targetName = names.dequeue();
|
||||||
|
QDomNode node = elem.firstChild();
|
||||||
|
while (!node.isNull()) {
|
||||||
|
QDomElement e = node.toElement();
|
||||||
|
if (!e.isNull() && e.tagName() == targetName) {
|
||||||
|
if (names.empty()) {
|
||||||
|
nodes.append(e);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
findNodes(e, names, nodes);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
node = node.nextSibling();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void FileUtils::findNodesByAttr(const QDomElement &elem, QQueue <QString> &names, QList <QDomElement> &nodes, const QString &attr, const QStringList &values)
|
||||||
|
{
|
||||||
|
findNodes(elem, names, nodes);
|
||||||
|
|
||||||
|
QList<QDomElement>::iterator it = nodes.begin();
|
||||||
|
while (it != nodes.end()) {
|
||||||
|
if ((*it).hasAttribute(attr) && values.contains((*it).attribute(attr))) {
|
||||||
|
it++;
|
||||||
|
} else {
|
||||||
|
it = nodes.erase(it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool FileUtils::findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content)
|
||||||
|
{
|
||||||
|
QList<QDomElement> nodes;
|
||||||
|
findNodes(elem, names, nodes);
|
||||||
|
|
||||||
|
for (const auto &node : nodes) {
|
||||||
|
content.append(node.text());
|
||||||
|
if (content.length() >= MAX_CONTENT_LENGTH / 3) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void FileUtils::findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs)
|
||||||
|
{
|
||||||
|
QList<QDomElement> nodes;
|
||||||
|
findNodes(elem, names, nodes);
|
||||||
|
|
||||||
|
for (const auto &node : nodes) {
|
||||||
|
if (node.hasAttribute(attr)) {
|
||||||
|
attrs.append(node.attribute(attr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* uof2.0解析
|
* uof2.0解析
|
||||||
* @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012
|
* @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012
|
||||||
|
|
|
@ -109,6 +109,13 @@ public:
|
||||||
enum class SearchMethod { DIRECTSEARCH = 0, INDEXSEARCH = 1};
|
enum class SearchMethod { DIRECTSEARCH = 0, INDEXSEARCH = 1};
|
||||||
static SearchMethod searchMethod;
|
static SearchMethod searchMethod;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static void findNodes(const QDomElement &elem, QQueue<QString> &names, QList<QDomElement> &nodes);
|
||||||
|
static inline void findNodesByAttr(const QDomElement&, QQueue<QString>&, QList<QDomElement>&, const QString &, const QStringList&);
|
||||||
|
static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
|
||||||
|
static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
|
||||||
|
static void processUOFPPT(const QDomDocument &doc, QString &content);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
FileUtils();
|
FileUtils();
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue