新增uof,uot,uos,ofd格式文件内容解析(不支持演示文稿).
This commit is contained in:
parent
a34a781d3d
commit
a71a14ffbf
|
@ -39,7 +39,12 @@ static const QMap<QString, bool> targetFileTypeMap = {
|
|||
{"dps", true},
|
||||
{"et", true},
|
||||
{"pdf", true},
|
||||
{"html", true}
|
||||
{"html", true},
|
||||
{"uof", true},
|
||||
{"uot", true},
|
||||
{"uos", true},
|
||||
{"uop", false},
|
||||
{"ofd", true}
|
||||
};
|
||||
|
||||
static const QMap<QString, bool> targetPhotographTypeMap = {
|
||||
|
|
|
@ -938,6 +938,12 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
|
|||
if (strsfx == "docx" || strsfx == "pptx" || strsfx == "xlsx") {
|
||||
|
||||
return FileUtils::isOpenXMLFileEncrypted(path);
|
||||
} else if (strsfx == "uot" || strsfx == "uos" || strsfx == "uop") {
|
||||
return false;
|
||||
|
||||
} else if (strsfx == "ofd") {
|
||||
return false;
|
||||
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
|
@ -951,7 +957,7 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
|
|||
return true;
|
||||
} else if(type.inherits("application/msword") || type.name() == "application/x-ole-storage") {
|
||||
if(strsfx == "doc" || strsfx == "dot" || strsfx == "wps" || strsfx == "ppt" ||
|
||||
strsfx == "pps" || strsfx == "dps" || strsfx == "et" || strsfx == "xls") {
|
||||
strsfx == "pps" || strsfx == "dps" || strsfx == "et" || strsfx == "xls" || strsfx == "uof") {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -959,6 +965,13 @@ bool FileUtils::isEncrypedOrUnreadable(QString path)
|
|||
if(strsfx == "pdf")
|
||||
return false;
|
||||
return true;
|
||||
|
||||
} else if(name == "application/xml" || name == "application/uof") {
|
||||
if(strsfx == "uof") {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
||||
} else {
|
||||
qWarning() << "Unsupport format:[" << path << "][" << type.name() << "]";
|
||||
return true;
|
||||
|
@ -1048,3 +1061,159 @@ QString FileUtils::wrapData(QLabel *p_label, const QString &text)
|
|||
// p_label->setText(wrapText);
|
||||
return wrapText;
|
||||
}
|
||||
|
||||
/**
|
||||
* uof1.0解析
|
||||
* 参考规范:GB/T 20916-2007
|
||||
* 1.文字处理
|
||||
* 2.电子表格
|
||||
* 3.演示文稿
|
||||
* ppt的内容存放在对象集中,
|
||||
* 可以通过演示文稿-主体-幻灯片集-幻灯片下的锚点属性获取引用了哪些内容:
|
||||
* <uof:锚点 uof:图形引用="OBJ16"/>
|
||||
* 目标:文本串
|
||||
*/
|
||||
void FileUtils::getUOFTextContent(QString &path, QString &textContent)
|
||||
{
|
||||
QFileInfo info(path);
|
||||
if (!info.exists() || info.isDir()) {
|
||||
return;
|
||||
}
|
||||
|
||||
QFile file(path);
|
||||
if (!file.open(QIODevice::ReadOnly)) {
|
||||
return;
|
||||
}
|
||||
|
||||
QDomDocument doc;
|
||||
if (!doc.setContent(&file)) {
|
||||
file.close();
|
||||
return;
|
||||
}
|
||||
file.close();
|
||||
|
||||
bool isPDF = false;
|
||||
QDomElement rootElem = doc.documentElement();
|
||||
QDomNode node = rootElem.firstChild();
|
||||
while (!node.isNull()) {
|
||||
QDomElement e = node.toElement();
|
||||
if (!e.isNull() && e.tagName() == "uof:演示文稿") {
|
||||
isPDF = true;
|
||||
break;
|
||||
}
|
||||
node = node.nextSibling();
|
||||
}
|
||||
|
||||
//单独处理pdf文档
|
||||
if (isPDF) {
|
||||
qDebug() << path << "is PDF";
|
||||
return;
|
||||
}
|
||||
|
||||
file.open(QIODevice::ReadOnly);
|
||||
QXmlStreamReader reader(&file);
|
||||
while (!reader.atEnd()) {
|
||||
//适用于文字处理与电子表格
|
||||
if (reader.readNextStartElement() && reader.name().toString() == "文本串") {
|
||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* uof2.0解析
|
||||
* @brief 参考规范文档 https://www.doc88.com/p-9089133923912.html 或 GJB/Z 165-2012
|
||||
* ppt文档的内容存放在graphics.xml中,需要先解析content中的引用再解析graphics内容
|
||||
* @param path
|
||||
* @param textContent
|
||||
*/
|
||||
void FileUtils::getUOF2TextContent(QString &path, QString &textContent)
|
||||
{
|
||||
QFileInfo info = QFileInfo(path);
|
||||
if (!info.exists() || info.isDir())
|
||||
return;
|
||||
|
||||
QuaZip file(path);
|
||||
if (!file.open(QuaZip::mdUnzip))
|
||||
return;
|
||||
|
||||
if (!file.setCurrentFile("content.xml")) {
|
||||
return;
|
||||
}
|
||||
|
||||
QuaZipFile fileR(&file);
|
||||
if (!fileR.open(QIODevice::ReadOnly)) {
|
||||
return;
|
||||
}
|
||||
|
||||
QXmlStreamReader reader(&fileR);
|
||||
|
||||
while (!reader.atEnd()) {
|
||||
if (reader.readNextStartElement() && reader.name().toString() == "文本串_415B") {
|
||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fileR.close();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* OFD文件解析
|
||||
* @brief 参考: GB/T 33190-2016
|
||||
* @param path
|
||||
* @param textContent
|
||||
*/
|
||||
void FileUtils::getOFDTextContent(QString &path, QString &textContent)
|
||||
{
|
||||
QFileInfo info = QFileInfo(path);
|
||||
if (!info.exists() || info.isDir())
|
||||
return;
|
||||
|
||||
QuaZip zipfile(path);
|
||||
if (!zipfile.open(QuaZip::mdUnzip))
|
||||
return;
|
||||
|
||||
// GB/T 33190-2016规范定义可以存在多个Doc_x目录,暂时只取第一个目录的内容
|
||||
QString prefix("Doc_0/Pages/");
|
||||
QStringList fileList;
|
||||
for (const auto &file: zipfile.getFileNameList()) {
|
||||
if (file.startsWith(prefix)) {
|
||||
fileList << file;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < fileList.count(); ++i) {
|
||||
QString filename = prefix + "Page_" + QString::number(i) + "/Content.xml";
|
||||
if (!zipfile.setCurrentFile(filename)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
QuaZipFile fileR(&zipfile);
|
||||
fileR.open(QIODevice::ReadOnly);
|
||||
QXmlStreamReader reader(&fileR);
|
||||
|
||||
while (!reader.atEnd()) {
|
||||
if (reader.readNextStartElement() && reader.name().toString() == "TextCode") {
|
||||
textContent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if (textContent.length() >= MAX_CONTENT_LENGTH / 3) {
|
||||
fileR.close();
|
||||
zipfile.close();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fileR.close();
|
||||
}
|
||||
|
||||
zipfile.close();
|
||||
}
|
||||
|
|
|
@ -91,6 +91,9 @@ public:
|
|||
static void getXlsxTextContent(QString &path, QString &textcontent);
|
||||
static void getPdfTextContent(QString &path, QString &textcontent);
|
||||
static void getTxtContent(QString &path, QString &textcontent);
|
||||
static void getUOFTextContent(QString &path, QString &textContent);
|
||||
static void getUOF2TextContent(QString &path, QString &textContent);
|
||||
static void getOFDTextContent(QString &path, QString &textContent);
|
||||
|
||||
static int openFile(QString &path, bool openInDir = false);
|
||||
static bool copyPath(QString &path);
|
||||
|
|
|
@ -35,7 +35,7 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
|
|||
FileUtils::getPptxTextContent(path, textContent);
|
||||
} else if (suffix == "xlsx") {
|
||||
FileUtils::getXlsxTextContent(path, textContent);
|
||||
} else if (strsfx == "txt" or strsfx == "html") {
|
||||
} else if (suffix == "txt" or suffix == "html") {
|
||||
FileUtils::getTxtContent(path, textContent);
|
||||
} else if (suffix == "doc" || suffix == "dot" || suffix == "wps" || suffix == "ppt" ||
|
||||
suffix == "pps" || suffix == "dps" || suffix == "et" || suffix == "xls") {
|
||||
|
@ -45,6 +45,22 @@ void FileReader::getTextContent(QString path, QString &textContent, QString &suf
|
|||
FileUtils::getPdfTextContent(path, textContent);
|
||||
} else if (true == targetPhotographTypeMap[suffix]){
|
||||
OcrObject::getInstance()->getTxtContent(path, textContent);
|
||||
} else if (suffix == "uof") {
|
||||
QString mimeName = FileUtils::getMimetype(path).name();
|
||||
if (mimeName == "application/xml" || mimeName == "application/uof") {
|
||||
FileUtils::getUOFTextContent(path, textContent);
|
||||
|
||||
} else if (mimeName == "application/x-ole-storage") {
|
||||
//uof的ppt文档不支持修改母版。一旦进行这些操作,uof文档可能会被wps存为doc文件
|
||||
KBinaryParser searchdata;
|
||||
searchdata.RunParser(path, textContent);
|
||||
}
|
||||
|
||||
} else if (suffix == "uot" || suffix == "uos" || suffix == "uop") {
|
||||
FileUtils::getUOF2TextContent(path, textContent);
|
||||
|
||||
} else if (suffix == "ofd") {
|
||||
FileUtils::getOFDTextContent(path, textContent);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue