forked from openkylin/ukui-search
新增解析uop格式文档,增加uof ppt文档解析粒度
This commit is contained in:
parent
04267253a6
commit
4294019e28
|
@ -43,7 +43,7 @@ static const QMap<QString, bool> targetFileTypeMap = {
|
||||||
{"uof", true},
|
{"uof", true},
|
||||||
{"uot", true},
|
{"uot", true},
|
||||||
{"uos", true},
|
{"uos", true},
|
||||||
{"uop", false},
|
{"uop", true},
|
||||||
{"ofd", true}
|
{"ofd", true}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1092,21 +1092,21 @@ void FileUtils::getUOFTextContent(QString &path, QString &textContent)
|
||||||
}
|
}
|
||||||
file.close();
|
file.close();
|
||||||
|
|
||||||
bool isPDF = false;
|
bool isPPT = false;
|
||||||
QDomElement rootElem = doc.documentElement();
|
QDomElement rootElem = doc.documentElement();
|
||||||
QDomNode node = rootElem.firstChild();
|
QDomNode node = rootElem.firstChild();
|
||||||
while (!node.isNull()) {
|
while (!node.isNull()) {
|
||||||
QDomElement e = node.toElement();
|
QDomElement e = node.toElement();
|
||||||
if (!e.isNull() && e.tagName() == "uof:演示文稿") {
|
if (!e.isNull() && e.tagName() == "uof:演示文稿") {
|
||||||
isPDF = true;
|
isPPT = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
node = node.nextSibling();
|
node = node.nextSibling();
|
||||||
}
|
}
|
||||||
|
|
||||||
//单独处理pdf文档
|
// Handle PPT (presentation) documents separately
|
||||||
if (isPDF) {
|
if (isPPT) {
|
||||||
qDebug() << path << "is PDF";
|
qDebug() << path << "is PPT";
|
||||||
processUOFPPT(doc, textContent);
|
processUOFPPT(doc, textContent);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1157,9 +1157,23 @@ void FileUtils::processUOFPPT(const QDomDocument &doc, QString &content)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
QList<QDomElement> paraNodes; //全部段落节点
|
||||||
for (const auto &node : nodes) {
|
for (const auto &node : nodes) {
|
||||||
names.clear();
|
names.clear();
|
||||||
names << "图:文本内容" << "字:段落" << "字:句" << "字:文本串";
|
names << "图:文本内容" << "字:段落";
|
||||||
|
findNodes(node, names, paraNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
nodes.clear();
|
||||||
|
for (const auto &node : paraNodes) {
|
||||||
|
names.clear();
|
||||||
|
names << "字:句";
|
||||||
|
findNodes(node, names, nodes); //全部段落下的全部句节点
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto &node : nodes) {
|
||||||
|
names.clear();
|
||||||
|
names << "字:文本串";
|
||||||
if (findNodeText(node, names, content)) {
|
if (findNodeText(node, names, content)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1272,6 +1286,97 @@ void FileUtils::getUOF2TextContent(QString &path, QString &textContent)
|
||||||
file.close();
|
file.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @brief Extracts text from a zipped UOF 2.0 presentation (.uop) into textContent.
 *
 * Processing order, as implemented below:
 *  1. Load "content.xml" and collect the slide elements
 *     (演:幻灯片集_6C0E -> 演:幻灯片_6C0F).
 *  2. For every slide, gather the 图形引用_C62E attribute values found under
 *     its uof:锚点_C644 elements.
 *  3. Load "graphics.xml" and pick 图:图形_8062 elements via findNodesByAttr
 *     using the 标识符_804B attribute and the values gathered in step 2
 *     (presumably an identifier match - confirm against findNodesByAttr).
 *  4. Descend paragraph (字:段落_416B) -> sentence (字:句_419D) -> text string
 *     (字:文本串_415B) and hand each sentence to findNodeText.
 *
 * The function returns without touching textContent when the path does not
 * exist or is a directory, when either XML part cannot be loaded, or when no
 * slides / graphic references are found.
 *
 * @param path        Path of the .uop file.
 * @param textContent Output string that findNodeText writes the text into.
 */
void FileUtils::getUOF2PPTContent(QString &path, QString &textContent)
{
    const QFileInfo fileInfo(path);
    const bool usable = fileInfo.exists() && !fileInfo.isDir();
    if (!usable) {
        return;
    }

    QuaZip archive(path);
    QDomDocument xml;
    if (!loadZipFileToDoc(archive, xml, "content.xml")) {
        return;
    }

    // Step 1: every slide element of the presentation.
    QDomElement contentRoot = xml.documentElement();
    QList<QDomElement> slides;
    QQueue<QString> slidePath; // tag names to follow, one per level
    slidePath << "演:幻灯片集_6C0E" << "演:幻灯片_6C0F";
    findNodes(contentRoot, slidePath, slides);
    if (slides.isEmpty()) {
        return;
    }

    // Step 2: graphic references carried by the anchors of each slide.
    // A fresh queue is built per call, mirroring the clear-and-refill the
    // helpers are given everywhere else in this file.
    QStringList graphicIds;
    for (const QDomElement &slide : slides) {
        QQueue<QString> anchorPath;
        anchorPath << "uof:锚点_C644";
        findNodeAttr(slide, anchorPath, "图形引用_C62E", graphicIds);
    }
    if (graphicIds.isEmpty()) {
        return;
    }

    // Step 3: switch to the graphics part of the archive.
    if (!loadZipFileToDoc(archive, xml, "graphics.xml")) {
        return;
    }

    QDomElement graphicsRoot = xml.documentElement();
    QList<QDomElement> shapes;
    QQueue<QString> shapePath;
    shapePath << "图:图形_8062";
    findNodesByAttr(graphicsRoot, shapePath, shapes, "标识符_804B", graphicIds);

    // Step 4a: all paragraph elements (字:段落_416B) inside the selected shapes.
    QList<QDomElement> paragraphs;
    for (const QDomElement &shape : shapes) {
        QQueue<QString> paragraphPath;
        paragraphPath << "图:文本_803C" << "图:内容_8043" << "字:段落_416B";
        findNodes(shape, paragraphPath, paragraphs);
    }

    // Step 4b: all sentence elements (字:句_419D) of those paragraphs.
    QList<QDomElement> sentences;
    for (const QDomElement &paragraph : paragraphs) {
        QQueue<QString> sentencePath;
        sentencePath << "字:句_419D";
        findNodes(paragraph, sentencePath, sentences);
    }

    // Step 4c: text strings of each sentence. A true result from findNodeText
    // stops the walk (presumably the content limit was reached - confirm
    // against findNodeText).
    for (const QDomElement &sentence : sentences) {
        QQueue<QString> textPath;
        textPath << "字:文本串_415B";
        if (findNodeText(sentence, textPath, textContent)) {
            break;
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * @brief Parses one entry of a zip archive into a DOM document.
 *
 * The archive is opened in unzip mode only if it is not open yet, so the same
 * QuaZip object can be passed repeatedly to read several entries. The archive
 * is left open on return. doc is cleared only after the entry has been opened
 * successfully, so earlier failures leave doc as it was.
 *
 * @param zipFile  Archive to read from.
 * @param doc      Document that receives the parsed XML.
 * @param fileName Name of the entry inside the archive.
 * @return true if the entry was found, opened and parsed; false otherwise.
 */
inline bool FileUtils::loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName)
{
    if (!zipFile.isOpen()) {
        if (!zipFile.open(QuaZip::mdUnzip)) {
            return false;
        }
    }

    if (!zipFile.setCurrentFile(fileName)) {
        return false;
    }

    QuaZipFile entry(&zipFile);
    if (!entry.open(QIODevice::ReadOnly)) {
        return false;
    }

    // Drop previous content before parsing; the entry is closed on both the
    // success and the failure path.
    doc.clear();
    bool parsed = false;
    if (doc.setContent(&entry)) {
        parsed = true;
    }
    entry.close();

    return parsed;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* OFD文件解析
|
* OFD文件解析
|
||||||
* @brief 参考: GB/T 33190-2016
|
* @brief 参考: GB/T 33190-2016
|
||||||
|
|
|
@ -93,6 +93,7 @@ public:
|
||||||
static void getTxtContent(QString &path, QString &textcontent);
|
static void getTxtContent(QString &path, QString &textcontent);
|
||||||
static void getUOFTextContent(QString &path, QString &textContent);
|
static void getUOFTextContent(QString &path, QString &textContent);
|
||||||
static void getUOF2TextContent(QString &path, QString &textContent);
|
static void getUOF2TextContent(QString &path, QString &textContent);
|
||||||
|
static void getUOF2PPTContent(QString &path, QString &textContent);
|
||||||
static void getOFDTextContent(QString &path, QString &textContent);
|
static void getOFDTextContent(QString &path, QString &textContent);
|
||||||
|
|
||||||
static int openFile(QString &path, bool openInDir = false);
|
static int openFile(QString &path, bool openInDir = false);
|
||||||
|
@ -115,6 +116,7 @@ private:
|
||||||
static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
|
static inline bool findNodeText(const QDomElement &elem, QQueue<QString> &names, QString &content);
|
||||||
static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
|
static inline void findNodeAttr(const QDomElement &elem, QQueue<QString> &names, const QString &attr, QStringList &attrs);
|
||||||
static void processUOFPPT(const QDomDocument &doc, QString &content);
|
static void processUOFPPT(const QDomDocument &doc, QString &content);
|
||||||
|
static inline bool loadZipFileToDoc(QuaZip &zipFile, QDomDocument &doc, const QString &fileName);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
FileUtils();
|
FileUtils();
|
||||||
|
|
|
@ -55,10 +55,12 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
|
||||||
KBinaryParser searchdata;
|
KBinaryParser searchdata;
|
||||||
searchdata.RunParser(path, textContent);
|
searchdata.RunParser(path, textContent);
|
||||||
}
|
}
|
||||||
|
} else if (suffix == "uot" || suffix == "uos") {
|
||||||
} else if (suffix == "uot" || suffix == "uos" || suffix == "uop") {
|
|
||||||
FileUtils::getUOF2TextContent(path, textContent);
|
FileUtils::getUOF2TextContent(path, textContent);
|
||||||
|
|
||||||
|
} else if (suffix == "uop") {
|
||||||
|
FileUtils::getUOF2PPTContent(path, textContent);
|
||||||
|
|
||||||
} else if (suffix == "ofd") {
|
} else if (suffix == "ofd") {
|
||||||
FileUtils::getOFDTextContent(path, textContent);
|
FileUtils::getOFDTextContent(path, textContent);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue