Merge pull request #156 from iaom/0301-dev

Add support for '.doc' files in file content search.
This commit is contained in:
Mouse Zhang 2021-03-04 18:31:04 +08:00 committed by GitHub
commit c79998301f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 5672 additions and 18 deletions

View File

@ -28,8 +28,6 @@
#include "quazip/quazip.h"
#include <quazip/quazipfile.h>
#include <QDomDocument>
#include <QMimeDatabase>
#include <QMimeType>
#include <QQueue>
#include "uchardet/uchardet.h"
@ -211,14 +209,12 @@ void FileUtils::loadHanziTable(const QString &fileName)
return;
}
// Looks up the MIME type of the file at `path` with QMimeDatabase, using
// QMimeDatabase::MatchContent.
// NOTE(review): this span is a rendered diff hunk, so the old and the new
// version appear together. The QString signature with `getsuffix` and the
// if/else returning type.name() / type.preferredSuffix() look like the removed
// lines; the QMimeType signature and `return type;` look like the added ones.
// Confirm against the actual file.
QString FileUtils::getMimetype(QString &path, bool getsuffix)
QMimeType FileUtils::getMimetype(QString &path)
{
QMimeDatabase mdb;
QMimeType type = mdb.mimeTypeForFile(path,QMimeDatabase::MatchContent);
// In this branch a true `getsuffix` returns the MIME name and a false one
// returns the preferred suffix, i.e. the flag reads inverted to its name.
if(getsuffix)
return type.name();
else
return type.preferredSuffix();
// Hands the whole QMimeType back so the caller can query name() or inherits().
return type;
}
//aborted
@ -559,7 +555,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent)
const char *codec = uchardet_get_charset(chardet);
if(QTextCodec::codecForName(codec) == 0)
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec)<<"zpf666";
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec);
QTextStream stream(encodedString,QIODevice::ReadOnly);
stream.setCodec(codec);

View File

@ -32,6 +32,8 @@
#include <QCryptographicHash>
#include <QIcon>
#include <QMap>
#include <QMimeDatabase>
#include <QMimeType>
#include "libsearch_global.h"
//#define INITIAL_STATE 0
//#define CREATING_INDEX 1
@ -59,7 +61,7 @@ public:
static void loadHanziTable(const QString&);
//parse text,docx.....
static QString getMimetype(QString &path, bool getsuffix = false);
static QMimeType getMimetype(QString &path);
static void getDocxTextContent(QString &path, QString &textcontent);
static void getTxtContent(QString &path, QString &textcontent);
static size_t _max_index_count;

View File

@ -122,6 +122,8 @@ void ConstructDocumentForContent::run()
_doc_list_content = new QList<Document>;
QString content;
FileReader::getTextContent(m_path,content);
if(content.isEmpty())
return;
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/",0,-2,QString::SectionIncludeLeadingSep)));

View File

@ -19,6 +19,7 @@
*/
#include "file-reader.h"
#include "file-utils.h"
#include "binary-parser.h"
FileReader::FileReader(QObject *parent) : QObject(parent)
{
@ -27,13 +28,32 @@ FileReader::FileReader(QObject *parent) : QObject(parent)
// Extracts the text of the file at `path` into `textContent`, choosing the
// extractor from the detected MIME type and the file suffix. When no branch
// matches, `textContent` is left untouched.
// NOTE(review): this span is a rendered diff hunk, so removed and added lines
// appear together; confirm against the actual file.
void FileReader::getTextContent(QString path, QString &textContent)
{
// Get the content of all files
// Classify by type first
// NOTE(review): the next five lines (QString `type` from
// getMimetype(path,true)) look like the removed pre-change dispatch.
QString type =FileUtils::getMimetype(path,true);
if(type == "application/zip")
FileUtils::getDocxTextContent(path,textContent);
else if(type == "text/plain")
FileUtils::getTxtContent(path,textContent);
// Dispatch on the MIME name first, then require a matching file suffix.
QMimeType type = FileUtils::getMimetype(path);
QString name = type.name();
QFileInfo file(path);
QString strsfx = file.suffix();
if(name== "application/zip")
{
// Only zip containers whose suffix ends with "docx" are parsed; any other
// zip file is skipped without a warning.
if(strsfx.endsWith( "docx"))
FileUtils::getDocxTextContent(path,textContent);
}
else if(name == "text/plain")
{
// Plain text is only read when the suffix ends with "txt".
if(strsfx.endsWith( "txt"))
FileUtils::getTxtContent(path,textContent);
}
// inherits() is used here instead of a name comparison, so MIME types derived
// from application/msword take this branch as well.
else if(type.inherits("application/msword"))
{
if (strsfx.endsWith("doc"))
{
KBinaryParser searchdata;
searchdata.RunParser(path,textContent);
}
}
else
{
// Any other MIME type is only logged.
qWarning()<<"Unsupport format:["<<path<<"]["<<type.name()<<"]";
}
return;
}

View File

@ -21,6 +21,7 @@
#define FILEREADER_H
#include <QObject>
#include <QFileInfo>
class FileReader : public QObject
{

View File

@ -63,7 +63,7 @@ private:
QQueue<QVector<QString>>* q_index;
QQueue<QString>* q_content_index;
const QVector<QString> targetFileTypeVec ={
// QString(".doc"),
QString(".doc"),
QString(".docx"),
// QString(".ppt"),
// QString(".pptx"),

View File

@ -61,7 +61,7 @@ private:
QMap<int, QString> currentPath;
const QVector<QString> targetFileTypeVec ={
// QString(".doc"),
QString(".doc"),
QString(".docx"),
// QString(".ppt"),
// QString(".pptx"),

View File

@ -22,6 +22,7 @@ DEFINES += QT_DEPRECATED_WARNINGS
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
# Per-directory project fragments of libsearch.
# Each include() takes exactly one balanced pair of parentheses.
include(index/index.pri)
include(parser/parser.pri)
include(appsearch/appsearch.pri)
include(settingsearch/settingsearch.pri)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,116 @@
#ifndef SEARCHHELPER_H
#define SEARCHHELPER_H
#include <QtCore>
#include <QtConcurrent/QtConcurrent>
/* Integer aliases used by the declarations in this header.
 * These are real typedefs rather than macros: a macro would textually rewrite
 * every identifier named ULONG/UCHAR/USHORT in any file that includes this
 * header, whereas a typedef obeys normal C++ scoping rules. */
typedef unsigned long ULONG;
typedef unsigned char UCHAR;
typedef unsigned short USHORT;
/* Document kinds; stored in ppsInfoType::type. */
enum TYPE
{
    Word = 0,
    Excel = 1,
    Ppt = 2
};
/* Property Set Storage descriptor for a single stream */
struct pps_tag
{
    ULONG ulSB;   /* NOTE(review): presumably the stream's start block - confirm */
    ULONG ulSize; /* NOTE(review): presumably the stream's size - confirm */
};
typedef struct pps_tag ppsTag;
/* One ppsTag per named stream, plus the document kind */
struct pps_info_tag
{
    ppsTag tWordDocument;   /* Text stream */
    ppsTag tWorkBook;
    ppsTag tPPTDocument;
    ppsTag tData;           /* Data stream */
    ppsTag tTable;          /* Table stream */
    ppsTag tSummaryInfo;    /* Summary Information */
    ppsTag tDocSummaryInfo; /* Document Summary Information */
    ppsTag t0Table;         /* Table 0 stream */
    ppsTag t1Table;         /* Table 1 stream */
    ppsTag tCurrentUser;
    TYPE type;              /* one of Word / Excel / Ppt */
};
typedef struct pps_info_tag ppsInfoType;
/* Private record describing a single Property Set Storage entry */
struct pps_entry_tag
{
    /* NOTE(review): ulNext / ulPrevious / ulDir look like links to other
     * entries - confirm against the parser implementation */
    ULONG ulNext;
    ULONG ulPrevious;
    ULONG ulDir;
    ULONG ulSB;
    ULONG ulSize;
    int iLevel;
    char szName[32];
    UCHAR ucType;
};
typedef struct pps_entry_tag ppsEntryType;
/* Excel Record Struct*/
//typedef struct excelRecord
//{
// excelRecord()
// {
// usLen = 0;
// usRichLen = 0;
// ulWLen = 0;
// bUni = false;
// }
// ushort usLen;
// ushort usRichLen;
// ulong ulWLen;
// bool bUni;
//} excelRecord;
/* Argument bundle taken by KBinaryParser::readData.
 * A default-constructed instance has every counter at 0 and both pointers NULL. */
struct readDataParam
{
    readDataParam()
        : ulStBlk(0),
          pFile(NULL),
          ulBBd(NULL),
          tBBdLen(0),
          usBlkSize(0)
    {
    }

    ulong ulStBlk;
    FILE *pFile;
    ulong *ulBBd;
    size_t tBBdLen;
    ushort usBlkSize;
};
typedef struct readDataParam rdPara;
/*
 * Parser for binary Office documents. FileReader::getTextContent uses it for
 * files whose MIME type inherits application/msword and whose suffix ends with
 * "doc".
 *
 * Only the declarations are visible here; the definitions live in
 * binary-parser.cpp.
 */
class KBinaryParser : public QObject
{
    Q_OBJECT
public:
    /* explicit: a bare QObject* must not convert silently into a parser. */
    explicit KBinaryParser(QObject *parent = nullptr);
    ~KBinaryParser();

    /*
     * Entry point: parses the file named by strFile and writes into content.
     * NOTE(review): the meaning of the bool result (presumably success) is not
     * visible from this header - confirm in binary-parser.cpp.
     */
    bool RunParser(QString strFile, QString &content);

private:
    /* NOTE(review): the helpers below follow the parameter naming of the
     * structures declared above (ppsInfoType, rdPara); their exact contracts
     * are defined in binary-parser.cpp - confirm there before relying on them. */
    bool bGetPPS(FILE *pFile,
                 const ULONG *aulRootList, size_t tRootListLen, ppsInfoType *pPPS);
    int readData(rdPara &readParam, uchar *aucBuffer, ulong ulOffset, size_t tToRead);
    int InitDocOle(FILE *pFile, long lFilesize, QString &content);
    bool read8DocText(FILE *pFile, const ppsInfoType *pPPS,
                      const ULONG *aulBBD, size_t tBBDLen,
                      const ULONG *aulSBD, size_t tSBDLen,
                      const UCHAR *aucHeader, QString &content);
    // int readSSTRecord(readDataParam &rdParam, ppsInfoType, ulong &ulOff, ushort usPartLen);
    // int read8BiffRecord(uchar uFlag, ulong ulOff, ulong &ulNext, readDataParam &rdParam, excelRecord &eR);
    // ULONG readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* aulBBD,
    // size_t tBBDLen, ULONG ulPos);

    QString m_strFileName;
};
#endif // SEARCHHELPER_H

25
libsearch/parser/common.h Normal file
View File

@ -0,0 +1,25 @@
#ifndef COMMON_H
#define COMMON_H
#include <QtCore>
#include <QtConcurrent/QtConcurrent>
/* NOTE(review): no use of SERVER is visible from this header - confirm where it is consumed. */
#define SERVER "Everything"
/* Logging hook, currently disabled: LOG(a) expands to nothing, so its
 * argument is never evaluated. The disabled body was:  qWarning() << a;  */
#define LOG(a)
/*
 * Updates the running hash held in `hashHaystack`: subtracts (a) << sl_minus_1
 * when that shift count is smaller than the bit width of int, then shifts the
 * hash left by one.
 *
 * Both `hashHaystack` and `sl_minus_1` must be in scope where the macro is
 * expanded. The body is wrapped in do { } while (0) so that REHASH(x); is a
 * single statement: it stays correct under an unbraced if/else or loop,
 * where the bare two-statement form would only guard its first statement.
 */
#define REHASH(a) \
    do { \
        if (sl_minus_1 < (int)sizeof(int) * CHAR_BIT) \
            hashHaystack -= (a) << sl_minus_1; \
        hashHaystack <<= 1; \
    } while (0)
/* Allocation helpers; only the prototypes are declared here.
 * NOTE(review): presumably wrappers around malloc / calloc / realloc / free
 * that handle allocation failure themselves - confirm in the definitions. */
void* xmalloc(size_t tSize);
void* xcalloc(size_t tNmemb, size_t tSize);
void* xrealloc(void *pvArg, size_t tSize);
/* NOTE(review): returns void* rather than void; presumably a null pointer so
 * callers can write `p = xfree(p);` - confirm in the definition. */
void* xfree(void *pvArg);
#endif // COMMON_H

View File

@ -0,0 +1,9 @@
# Parser module: put this directory on the include path and register its files.
INCLUDEPATH += $$PWD

HEADERS += $$PWD/common.h $$PWD/binary-parser.h
SOURCES += $$PWD/binary-parser.cpp