新增uof,uot,uos,ofd格式文件内容解析(不支持演示文稿).

This commit is contained in:
iaom 2022-05-27 16:07:09 +08:00
parent a34a781d3d
commit a71a14ffbf
4 changed files with 196 additions and 3 deletions

View File

@ -39,7 +39,12 @@ static const QMap<QString, bool> targetFileTypeMap = {
{"dps", true},
{"et", true},
{"pdf", true},
{"html", true}
{"html", true},
{"uof", true},
{"uot", true},
{"uos", true},
{"uop", false},
{"ofd", true}
};
static const QMap<QString, bool> targetPhotographTypeMap = {

View File

@ -938,6 +938,12 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
if (strsfx == "docx" || strsfx == "pptx" || strsfx == "xlsx") {
return FileUtils::isOpenXMLFileEncrypted(path);
} else if (strsfx == "uot" || strsfx == "uos" || strsfx == "uop") {
return false;
} else if (strsfx == "ofd") {
return false;
} else {
return true;
}
@ -951,7 +957,7 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
return true;
} else if(type.inherits("application/msword") || type.name() == "application/x-ole-storage") {
if(strsfx == "doc" || strsfx == "dot" || strsfx == "wps" || strsfx == "ppt" ||
strsfx == "pps" || strsfx == "dps" || strsfx == "et" || strsfx == "xls") {
strsfx == "pps" || strsfx == "dps" || strsfx == "et" || strsfx == "xls" || strsfx == "uof") {
return false;
}
return true;
@ -959,6 +965,13 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
if(strsfx == "pdf")
return false;
return true;
} else if(name == "application/xml" || name == "application/uof") {
if(strsfx == "uof") {
return false;
}
return true;
} else {
qWarning() << "Unsupport format:[" << path << "][" << type.name() << "]";
return true;
@ -1048,3 +1061,159 @@ QString FileUtils::wrapData(QLabel *p_label, const QString &text)
// p_label->setText(wrapText);
return wrapText;
}
/**
 * @brief Extracts plain text from a UOF 1.0 document (single XML file).
 *
 * Reference specification: GB/T 20916-2007.
 *
 * The file is parsed twice: once as a DOM to check whether a direct child of
 * the root element is "uof:演示文稿" (presentation), and once with a stream
 * reader that collects the text of every "文本串" element. Presentations are
 * not handled here; for them the function returns without touching
 * textContent. (Note carried over from the original comment: presentation
 * content is stored in the object set — not verified in this function.)
 *
 * @param path        Path of the file to read. Nothing is done if it does not
 *                    exist, is a directory, cannot be opened, or is not
 *                    well-formed XML.
 * @param textContent Output buffer; extracted text is appended to it.
 *                    Extraction stops once its length reaches
 *                    MAX_CONTENT_LENGTH / 3.
 */
void FileUtils::getUOFTextContent(QString &path, QString &textContent)
{
    QFileInfo info(path);
    if (!info.exists() || info.isDir()) {
        return;
    }

    QFile file(path);
    if (!file.open(QIODevice::ReadOnly)) {
        return;
    }

    QDomDocument doc;
    if (!doc.setContent(&file)) {
        file.close();
        return;
    }

    // Look for a presentation element among the direct children of the root.
    bool isPresentation = false;
    for (QDomNode node = doc.documentElement().firstChild(); !node.isNull(); node = node.nextSibling()) {
        const QDomElement element = node.toElement();
        if (!element.isNull() && element.tagName() == "uof:演示文稿") {
            isPresentation = true;
            break;
        }
    }

    // Presentations are handled separately; nothing is extracted for them here.
    if (isPresentation) {
        qDebug() << path << "is a UOF presentation (not supported)";
        file.close();
        return;
    }

    // Rewind the already-open file for the second pass instead of reopening
    // it, and bail out if the rewind fails rather than parsing a bad device.
    if (!file.seek(0)) {
        file.close();
        return;
    }

    QXmlStreamReader reader(&file);
    while (!reader.atEnd()) {
        // Applies to word-processing and spreadsheet documents.
        if (reader.readNextStartElement() && reader.name().toString() == "文本串") {
            textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
            if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
                break;
            }
        }
    }
    file.close();
}
/**
 * @brief Extracts plain text from a UOF 2.0 document (zip package).
 *
 * Reference: https://www.doc88.com/p-9089133923912.html or GJB/Z 165-2012.
 *
 * Opens the archive, reads its "content.xml" entry and appends the text of
 * every "文本串_415B" element to textContent. (Note carried over from the
 * original comment: for presentation documents the content lives in
 * graphics.xml and would have to be resolved through the references in
 * content.xml; that is not done here.)
 *
 * @param path        Path of the package. Nothing is done if it does not
 *                    exist, is a directory, or the archive / entry cannot be
 *                    opened.
 * @param textContent Output buffer; extracted text is appended to it.
 *                    Extraction stops once its length reaches
 *                    MAX_CONTENT_LENGTH / 3.
 */
void FileUtils::getUOF2TextContent(QString &path, QString &textContent)
{
    const QFileInfo fileInfo(path);
    if (!fileInfo.exists() || fileInfo.isDir()) {
        return;
    }

    QuaZip archive(path);
    if (!archive.open(QuaZip::mdUnzip)) {
        return;
    }
    if (!archive.setCurrentFile("content.xml")) {
        return;
    }

    QuaZipFile contentEntry(&archive);
    if (!contentEntry.open(QIODevice::ReadOnly)) {
        return;
    }

    QXmlStreamReader xml(&contentEntry);
    for (;;) {
        if (xml.atEnd()) {
            break;
        }
        // Only start elements carrying the text-run tag are of interest.
        if (!xml.readNextStartElement()) {
            continue;
        }
        if (xml.name().toString() != "文本串_415B") {
            continue;
        }

        // Drop line feeds and turn carriage returns into spaces.
        QString run = xml.readElementText();
        run.replace("\n", "");
        run.replace("\r", " ");
        textContent.append(run);

        if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
            break;
        }
    }

    contentEntry.close();
    archive.close();
}
/**
 * @brief Extracts plain text from an OFD document (zip package).
 *
 * Reference specification: GB/T 33190-2016.
 *
 * Collects every archive entry named "Doc_0/Pages/Page_<n>/Content.xml",
 * visits them in ascending page number and appends the text of every
 * "TextCode" element to textContent.
 *
 * @param path        Path of the package. Nothing is done if it does not
 *                    exist, is a directory, or the archive cannot be opened.
 * @param textContent Output buffer; extracted text is appended to it.
 *                    Extraction stops once its length reaches
 *                    MAX_CONTENT_LENGTH / 3.
 */
void FileUtils::getOFDTextContent(QString &path, QString &textContent)
{
    QFileInfo info(path);
    if (!info.exists() || info.isDir()) {
        return;
    }

    QuaZip zipfile(path);
    if (!zipfile.open(QuaZip::mdUnzip)) {
        return;
    }

    // GB/T 33190-2016 allows several Doc_x directories; for now only the
    // content of the first one is read.
    const QString prefix("Doc_0/Pages/");
    const QString pageDirPrefix("Page_");
    const QString contentSuffix("/Content.xml");

    // Map page number -> entry name. Using the entries that really exist
    // (instead of counting every entry under the prefix and guessing indices
    // 0..count-1) keeps pages whose numbering has gaps; QMap iterates in key
    // order, so pages are still visited in ascending page number.
    QMap<int, QString> pageEntries;
    const QStringList entryNames = zipfile.getFileNameList();
    for (const QString &entry : entryNames) {
        if (!entry.startsWith(prefix) || !entry.endsWith(contentSuffix)) {
            continue;
        }
        const auto dirLength = entry.length() - prefix.length() - contentSuffix.length();
        if (dirLength <= 0) {
            continue;
        }
        const QString dirName = entry.mid(prefix.length(), dirLength);
        if (!dirName.startsWith(pageDirPrefix)) {
            continue;
        }
        const QString indexText = dirName.mid(pageDirPrefix.length());
        bool ok = false;
        const int pageIndex = indexText.toInt(&ok);
        // Accept only the canonical decimal spelling ("Page_7", not "Page_07"
        // or "Page_+7") so each page number maps to exactly one entry.
        if (!ok || pageIndex < 0 || QString::number(pageIndex) != indexText) {
            continue;
        }
        pageEntries.insert(pageIndex, entry);
    }

    for (auto it = pageEntries.constBegin(); it != pageEntries.constEnd(); ++it) {
        if (!zipfile.setCurrentFile(it.value())) {
            continue;
        }
        QuaZipFile fileR(&zipfile);
        // Skip pages that cannot be opened instead of parsing a closed device.
        if (!fileR.open(QIODevice::ReadOnly)) {
            continue;
        }
        QXmlStreamReader reader(&fileR);
        while (!reader.atEnd()) {
            if (reader.readNextStartElement() && reader.name().toString() == "TextCode") {
                textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
                if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
                    fileR.close();
                    zipfile.close();
                    return;
                }
            }
        }
        fileR.close();
    }
    zipfile.close();
}

View File

@ -91,6 +91,9 @@ public:
static void getXlsxTextContent(QString &path, QString &textcontent);
static void getPdfTextContent(QString &path, QString &textcontent);
static void getTxtContent(QString &path, QString &textcontent);
static void getUOFTextContent(QString &path, QString &textContent);
static void getUOF2TextContent(QString &path, QString &textContent);
static void getOFDTextContent(QString &path, QString &textContent);
static int openFile(QString &path, bool openInDir = false);
static bool copyPath(QString &path);

View File

@ -35,7 +35,7 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
FileUtils::getPptxTextContent(path, textContent);
} else if (suffix == "xlsx") {
FileUtils::getXlsxTextContent(path, textContent);
} else if (strsfx == "txt" or strsfx == "html") {
} else if (suffix == "txt" or suffix == "html") {
FileUtils::getTxtContent(path, textContent);
} else if (suffix == "doc" || suffix == "dot" || suffix == "wps" || suffix == "ppt" ||
suffix == "pps" || suffix == "dps" || suffix == "et" || suffix == "xls") {
@ -45,6 +45,22 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
FileUtils::getPdfTextContent(path, textContent);
} else if (true == targetPhotographTypeMap[suffix]){
OcrObject::getInstance()->getTxtContent(path, textContent);
} else if (suffix == "uof") {
QString mimeName = FileUtils::getMimetype(path).name();
if (mimeName == "application/xml" || mimeName == "application/uof") {
FileUtils::getUOFTextContent(path, textContent);
} else if (mimeName == "application/x-ole-storage") {
//uof的ppt文档不支持修改母版。一旦进行这些操作uof文档可能会被wps存为doc文件
KBinaryParser searchdata;
searchdata.RunParser(path, textContent);
}
} else if (suffix == "uot" || suffix == "uos" || suffix == "uop") {
FileUtils::getUOF2TextContent(path, textContent);
} else if (suffix == "ofd") {
FileUtils::getOFDTextContent(path, textContent);
}
return;
}