Optimized file content parser.

This commit is contained in:
iaom 2021-03-04 14:10:00 +08:00
parent 4a8076122e
commit 3daf5ceab9
4 changed files with 23 additions and 14 deletions

View File

@ -28,8 +28,6 @@
#include "quazip/quazip.h"
#include <quazip/quazipfile.h>
#include <QDomDocument>
#include <QMimeDatabase>
#include <QMimeType>
#include <QQueue>
#include "uchardet/uchardet.h"
@ -211,14 +209,12 @@ void FileUtils::loadHanziTable(const QString &fileName)
return;
}
QString FileUtils::getMimetype(QString &path, bool getsuffix)
QMimeType FileUtils::getMimetype(QString &path)
{
QMimeDatabase mdb;
QMimeType type = mdb.mimeTypeForFile(path,QMimeDatabase::MatchContent);
if(getsuffix)
return type.name();
else
return type.preferredSuffix();
return type;
}
//aborted
@ -559,7 +555,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent)
const char *codec = uchardet_get_charset(chardet);
if(QTextCodec::codecForName(codec) == 0)
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec)<<"zpf666";
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec);
QTextStream stream(encodedString,QIODevice::ReadOnly);
stream.setCodec(codec);

View File

@ -32,6 +32,8 @@
#include <QCryptographicHash>
#include <QIcon>
#include <QMap>
#include <QMimeDatabase>
#include <QMimeType>
#include "libsearch_global.h"
//#define INITIAL_STATE 0
//#define CREATING_INDEX 1
@ -59,7 +61,7 @@ public:
static void loadHanziTable(const QString&);
//parse text,docx.....
static QString getMimetype(QString &path, bool getsuffix = false);
static QMimeType getMimetype(QString &path);
static void getDocxTextContent(QString &path, QString &textcontent);
static void getTxtContent(QString &path, QString &textcontent);
static size_t _max_index_count;

View File

@ -28,20 +28,21 @@ FileReader::FileReader(QObject *parent) : QObject(parent)
void FileReader::getTextContent(QString path, QString &textContent)
{
QString type =FileUtils::getMimetype(path,true);
QMimeType type = FileUtils::getMimetype(path);
QString name = type.name();
QFileInfo file(path);
QString strsfx = file.suffix();
if(type == "application/zip")
if(name== "application/zip")
{
if(strsfx.endsWith( "docx"))
FileUtils::getDocxTextContent(path,textContent);
}
else if(type == "text/plain")
else if(name == "text/plain")
{
if(strsfx.endsWith( "txt"))
FileUtils::getTxtContent(path,textContent);
}
else if(type == "application/x-ole-storage")
else if(type.inherits("application/msword"))
{
if (strsfx.endsWith("doc"))
{
@ -49,6 +50,10 @@ void FileReader::getTextContent(QString path, QString &textContent)
searchdata.RunParser(path,textContent);
}
}
else
{
qWarning()<<"Unsupport format:["<<path<<"]["<<type.name()<<"]";
}
return;
}

View File

@ -5060,11 +5060,13 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS,
{
ushort* usAucData = (ushort*)ptaucBytes;
content.append(QString::fromUtf16(usAucData).replace("\r",""));
usAucData = (ushort*)xfree((void*)usAucData);
}
else
{
//need more format document
qWarning()<<"Parser error:";
ptaucBytes = (UCHAR*)xfree((void*)ptaucBytes);
qWarning()<<"Parser error:"<<m_strFileName;
}
}
}
@ -5393,6 +5395,10 @@ int KBinaryParser::InitDocOle(FILE* pFile,long lFilesize,QString &content)
aulBBD, tBBDLen, aulSBD, tSBDLen,
aucHeader,content);
}
else
{
qWarning()<<"Unsupport doc type:"<<m_strFileName;
}
// else if (PPS_info.type == Excel)
// {
// readParam.ulStBlk = PPS_info.tWorkBook.ulSB;