新增解析uop格式文档,增加uof ppt文档解析粒度

This commit is contained in:
hewenfei 2022-05-11 09:08:23 +08:00 committed by iaom
parent 04267253a6
commit 4294019e28
4 changed files with 117 additions and 8 deletions

View File

@ -43,7 +43,7 @@ static const QMap<QString, bool> targetFileTypeMap = {
{"uof", true},
{"uot", true},
{"uos", true},
{"uop", false},
{"uop", true},
{"ofd", true}
};

View File

@ -1092,21 +1092,21 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
}
file.close();
bool isPDF = false;
bool isPPT = false;
QDomElement rootElem = doc.documentElement();
QDomNode node = rootElem.firstChild();
while (!node.isNull()) {
QDomElement e = node.toElement();
if (!e.isNull() && e.tagName() == "uof:演示文稿") {
isPDF = true;
isPPT = true;
break;
}
node = node.nextSibling();
}
// Presentation (PPT) documents are handled separately
if (isPDF) {
qDebug() << path << "is PDF";
if (isPPT) {
qDebug() << path << "is PPT";
processUOFPPT(doc, textContent);
return;
}
@ -1157,9 +1157,23 @@ void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content)
return;
}
QList<QDomElement> paraNodes; //全部段落节点
for (const auto &node : nodes) {
names.clear();
names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串";
names << "图:文本内容" << "字:段落";
findNodes(node, names, paraNodes);
}
nodes.clear();
for (const auto &node : paraNodes) {
names.clear();
names << "字:句";
findNodes(node, names, nodes); //全部段落下的全部句节点
}
for (const auto &node : nodes) {
names.clear();
names << "字:文本串";
if (findNodeText(node, names, content)) {
break;
}
@ -1272,6 +1286,97 @@ void FileUtils::getUOF2TextContent(QString &path, QString &textContent)
file.close();
}
/**
 * @brief Gather the text of a zipped UOF 2.0 presentation (.uop) file.
 *
 * Works in two passes over the archive: content.xml supplies the graphic
 * references anchored on every slide, then graphics.xml is searched for the
 * graphics carrying those identifiers. Below each matching graphic the
 * paragraph, sentence and text-string elements are visited in that order.
 *
 * @param path        Location of the archive. The function returns without
 *                    touching @p textContent when the path does not exist or
 *                    names a directory.
 * @param textContent Output buffer handed to findNodeText() for every sentence.
 */
void FileUtils::getUOF2PPTContent(QString &path, QString &textContent)
{
    const QFileInfo fileInfo(path);
    if (fileInfo.isDir() || !fileInfo.exists()) {
        return;
    }

    QuaZip archive(path);
    QDomDocument xmlDoc;
    if (!loadZipFileToDoc(archive, xmlDoc, "content.xml")) {
        return;
    }

    // Element-name path handed to the find* helpers. It is refilled before every
    // call; presumably the helpers consume the queue (they take it by non-const
    // reference) -- confirm against findNodes().
    QQueue<QString> tagPath;

    // Pass 1: every slide of the presentation.
    QDomElement contentRoot = xmlDoc.documentElement();
    QList<QDomElement> slides;
    tagPath << "演:幻灯片集_6C0E" << "演:幻灯片_6C0F";
    findNodes(contentRoot, tagPath, slides);
    if (slides.isEmpty()) {
        return;
    }

    // Graphic references found on the slide anchors.
    QStringList graphicIds;
    for (const QDomElement &slide : slides) {
        tagPath.clear();
        tagPath << "uof:锚点_C644";
        findNodeAttr(slide, tagPath, "图形引用_C62E", graphicIds);
    }
    if (graphicIds.isEmpty()) {
        return;
    }

    // Pass 2: the same document object is reloaded with the graphics definitions.
    if (!loadZipFileToDoc(archive, xmlDoc, "graphics.xml")) {
        return;
    }

    QList<QDomElement> graphics;
    tagPath.clear();
    tagPath << "图:图形_8062";
    QDomElement graphicsRoot = xmlDoc.documentElement();
    findNodesByAttr(graphicsRoot, tagPath, graphics, "标识符_804B", graphicIds);

    // All paragraph elements below the selected graphics.
    QList<QDomElement> paragraphs;
    for (const QDomElement &graphic : graphics) {
        tagPath.clear();
        tagPath << "图:文本_803C" << "图:内容_8043" << "字:段落_416B";
        findNodes(graphic, tagPath, paragraphs);
    }

    // All sentence elements below those paragraphs.
    QList<QDomElement> sentences;
    for (const QDomElement &paragraph : paragraphs) {
        tagPath.clear();
        tagPath << "字:句_419D";
        findNodes(paragraph, tagPath, sentences);
    }

    // Feed the text strings of each sentence to findNodeText(); a true return
    // ends the walk early (presumably an output limit was reached -- confirm
    // against findNodeText()).
    for (const QDomElement &sentence : sentences) {
        tagPath.clear();
        tagPath << "字:文本串_415B";
        if (findNodeText(sentence, tagPath, textContent)) {
            break;
        }
    }
}
/**
 * @brief Parse a single entry of a zip archive into a DOM document.
 *
 * The archive is opened in unzip mode if it is not open yet and is left open on
 * return, so the same QuaZip object can be passed in again for another entry.
 *
 * @param zipFile  Archive to read from.
 * @param doc      Document to fill; it is cleared right before parsing.
 * @param fileName Name of the entry inside the archive.
 * @return true when the entry was selected, opened and parsed successfully;
 *         false otherwise.
 */
inline bool FileUtils::loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName)
{
    // open() is attempted only when the archive is not already open.
    const bool archiveReady = zipFile.isOpen() || zipFile.open(QuaZip::mdUnzip);
    if (!archiveReady || !zipFile.setCurrentFile(fileName)) {
        return false;
    }

    QuaZipFile entry(&zipFile);
    if (!entry.open(QIODevice::ReadOnly)) {
        return false;
    }

    doc.clear();
    const bool parsed = doc.setContent(&entry);
    entry.close();
    return parsed;
}
/**
* OFD文件解析
* @brief GB/T 33190-2016

View File

@ -93,6 +93,7 @@ public:
static void getTxtContent(QString &path, QString &textcontent);
static void getUOFTextContent(QString &path, QString &textContent);
static void getUOF2TextContent(QString &path, QString &textContent);
static void getUOF2PPTContent(QString &path, QString &textContent);
static void getOFDTextContent(QString &path, QString &textContent);
static int openFile(QString &path, bool openInDir = false);
@ -115,6 +116,7 @@ private:
static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
static void processUOFPPT(const QDomDocument &doc, QString &content);
static inline bool loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName);
private:
FileUtils();

View File

@ -55,10 +55,12 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
KBinaryParser searchdata;
searchdata.RunParser(path, textContent);
}
} else if (suffix == "uot" || suffix == "uos" || suffix == "uop") {
} else if (suffix == "uot" || suffix == "uos") {
FileUtils::getUOF2TextContent(path, textContent);
} else if (suffix == "uop") {
FileUtils::getUOF2PPTContent(path, textContent);
} else if (suffix == "ofd") {
FileUtils::getOFDTextContent(path, textContent);
}