Merge pull request #27 from iaom/1229-dev

Add docx parser.
This commit is contained in:
张佳萍 2020-12-29 20:27:41 +08:00 committed by GitHub
commit 0523977347
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 176 additions and 30 deletions

View File

@ -1,8 +1,15 @@
#include "file-utils.h"
#include <QDebug>
#include <QFile>
#include <QFileInfo>
#include <QUrl>
#include <QMap>
#include "quazip/quazip.h"
#include <quazip/quazipfile.h>
#include <QDomDocument>
#include <QMimeDatabase>
#include <QMimeType>
QMap<QString, QStringList> FileUtils::map_chinese2pinyin = QMap<QString, QStringList>();
FileUtils::FileUtils()
@ -167,6 +174,16 @@ void FileUtils::loadHanziTable(const QString &fileName)
return;
}
/**
 * @brief FileUtils::getMimetype
 * Detects the MIME type of a file from its content.
 * @param path: path of the file to inspect
 * @param getsuffix: true to get the preferred file suffix of the detected type,
 *        false (the default) to get the MIME type name
 * @return the preferred suffix or the MIME type name, depending on getsuffix
 */
QString FileUtils::getMimetype(QString &path, bool getsuffix)
{
    QMimeDatabase mdb;
    const QMimeType type = mdb.mimeTypeForFile(path, QMimeDatabase::MatchContent);
    // Callers relying on the default compare the result against MIME names such
    // as "text/plain", so the name is returned unless the suffix is requested.
    if (getsuffix)
        return type.preferredSuffix();
    return type.name();
}
QString FileUtils::find(const QString &hanzi)
{
// static QMap<QString, QStringList> map = loadHanziTable("://index/pinyinWithoutTone.txt");
@ -211,4 +228,51 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi)
stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output);
// qDebug() << output;
return output;
/**
 * @brief FileUtils::getDocxTextContent
 * Reads word/document.xml out of the docx (zip) archive and concatenates the
 * text of the w:t elements found in the w:r runs of each body-level element.
 * @param path: abs path
 * @return newly allocated QString with the extracted text, or nullptr when the
 *         path does not exist, is a directory, or the archive / XML cannot be read
 */
QString *FileUtils::getDocxTextContent(QString &path)
{
    QFileInfo info = QFileInfo(path);
    if(!info.exists()||info.isDir())
        return nullptr;

    // Open the archive named by the argument, not the literal string "path".
    QuaZip file(path);
    // QuaZip::open() and setCurrentFile() return true on success.
    if(!file.open(QuaZip::mdUnzip))
        return nullptr;
    if(!file.setCurrentFile("word/document.xml",QuaZip::csSensitive))
        return nullptr;

    QuaZipFile fileR(&file);
    if(!fileR.open(QIODevice::ReadOnly)) // open read-only
        return nullptr;

    QDomDocument doc;
    if(!doc.setContent(fileR.readAll()))
        return nullptr;

    // Allocate only after every failure path has been passed, so nothing leaks.
    QString *allText = new QString();

    // w:document -> first child -> its first child element, then walk the siblings.
    QDomElement first = doc.firstChildElement("w:document");
    first = first.firstChildElement().firstChildElement();
    while(!first.isNull())
    {
        QDomElement wr = first.firstChildElement("w:r");
        while(!wr.isNull())
        {
            QDomElement wt = wr.firstChildElement("w:t");
            allText->append(wt.text());
            wr = wr.nextSiblingElement();
        }
        first = first.nextSiblingElement();
    }
    qDebug()<<"size!!!"<<allText->size();
    return allText;
}
/**
 * @brief FileUtils::getTxtContent
 * Reads the whole file in text mode.
 * @param path: path of the file to read
 * @return newly allocated QString built from the file bytes, or nullptr when
 *         the file cannot be opened for reading
 */
QString *FileUtils::getTxtContent(QString &path)
{
    QFile txtFile(path);
    const bool opened = txtFile.open(QIODevice::ReadOnly | QIODevice::Text);
    if (opened) {
        return new QString(txtFile.readAll());
    }
    return nullptr;
}

View File

@ -13,7 +13,6 @@ class FileUtils
{
public:
static std::string makeDocUterm(QString );
static QIcon getFileIcon(const QString &, bool checkValid = true);
static QIcon getAppIcon(const QString &);
static QIcon getSettingIcon(const QString &, const bool&);
@ -22,13 +21,17 @@ public:
static QString getAppName(const QString &);
static QString getSettingName(const QString &);
static QMap<QString, QStringList> map_chinese2pinyin;
//chinese character to pinyin
static QMap<QString, QStringList> map_chinese2pinyin;
static QString find(const QString&);
static QStringList findMultiToneWords(const QString&);
static void loadHanziTable(const QString&);
//parse text,docx.....
static QString getMimetype(QString &path, bool getsuffix = false);
static QString * getDocxTextContent(QString &path);
static QString * getTxtContent(QString &path);
private:
FileUtils();
};

20
index/file-reader.cpp Normal file
View File

@ -0,0 +1,20 @@
#include "file-reader.h"
#include "file-utils.h"
/**
 * @brief FileReader::FileReader
 * @param parent: forwarded to the QObject base class
 */
FileReader::FileReader(QObject *parent)
    : QObject(parent)
{
}
/**
 * @brief FileReader::getTextContent
 * Chooses a text extractor from the value reported by FileUtils::getMimetype.
 * @param path: path of the file to read
 * @return whatever the matching FileUtils extractor returns (may be nullptr), or
 *         a new empty QString when the reported type is not one of the handled ones
 */
QString *FileReader::getTextContent(QString path)
{
    // Classify the file first, then fetch its content with the matching parser.
    const QString mime = FileUtils::getMimetype(path);
    if (mime == "text/plain")
        return FileUtils::getTxtContent(path);
    if (mime == "application/zip")
        return FileUtils::getDocxTextContent(path);
    return new QString();
}

15
index/file-reader.h Normal file
View File

@ -0,0 +1,15 @@
#ifndef FILEREADER_H
#define FILEREADER_H
#include <QObject>
/**
 * @brief QObject-derived class exposing a static helper that returns the text
 * content of a file.
 */
class FileReader : public QObject
{
    Q_OBJECT

public:
    explicit FileReader(QObject *parent = nullptr);

    /**
     * @param path: path of the file to read
     * @return pointer to a QString holding the text content
     */
    static QString *getTextContent(QString path);
};
#endif // FILEREADER_H

View File

@ -11,6 +11,8 @@
using namespace std;
// Directory under the user's home location that is passed to Xapian as the path-index database.
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/index_data").toStdString()
// Directory under the user's home location that is passed to Xapian as the content-index database.
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/content_index_data").toStdString()
// File-local instance pointer; presumably handed out by IndexGenerator::getInstance() - confirm.
static IndexGenerator *global_instance = nullptr;
IndexGenerator *IndexGenerator::getInstance()
@ -32,43 +34,46 @@ bool IndexGenerator::creatAllIndex(QList<QVector<QString> > *messageList)
try
{
m_indexer = new Xapian::TermGenerator();
m_indexer->set_database(*m_datebase);
m_indexer->set_database(*m_datebase_path);
//可以实现拼写纠正
// m_indexer->set_flags(Xapian::TermGenerator::FLAG_SPELLING);
m_indexer->set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
int count =0;
for(int i = 0;i < m_doc_list->size(); i++)
for(int i = 0;i < m_doc_list_path->size(); i++)
{
insertIntoDatabase(m_doc_list->at(i));
insertIntoDatabase(m_doc_list_path->at(i));
if(++count == 9999)
{
count = 0;
m_datebase->commit();
m_datebase_path->commit();
}
}
m_datebase->commit();
m_datebase_path->commit();
}
catch(const Xapian::Error &e)
{
qDebug()<<"creatAllIndex fail!"<<QString::fromStdString(e.get_description());
return false;
}
m_doc_list->clear();
m_doc_list_path->clear();
Q_EMIT this->transactionFinished();
return true;
}
/**
 * Content-index overload: passes the list on to HandlePathList(QVector<QString> *).
 * @param messageList: list handed through unchanged to HandlePathList
 * @return always true
 */
bool IndexGenerator::creatAllIndex(QVector<QString> *messageList)
{
    this->HandlePathList(messageList);
    return true;
}
// Opens (creating if needed) the Xapian databases and allocates the MD5 hasher.
// NOTE(review): m_datebase and m_datebase_path are both opened on INDEX_PATH here;
// this looks like the removed and added lines of a diff shown together - confirm
// which of the two members is current before relying on this constructor.
IndexGenerator::IndexGenerator(QObject *parent) : QObject(parent)
{
m_datebase = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
m_cryp = new QCryptographicHash(QCryptographicHash::Md5);
// Database for the path index.
m_datebase_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
// Database for the text-content index.
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
}
IndexGenerator::~IndexGenerator()
@ -86,7 +91,7 @@ void IndexGenerator::insertIntoDatabase(Document doc)
m_indexer->index_text(i.toStdString());
}
Xapian::docid innerId= m_datebase->replace_document(doc.getUniqueTerm(),document);
Xapian::docid innerId= m_datebase_path->replace_document(doc.getUniqueTerm(),document);
// qDebug()<<"replace doc docid="<<static_cast<int>(innerId);
// qDebug()<< "--index finish--";
return;
@ -102,13 +107,31 @@ void IndexGenerator::HandlePathList(QList<QVector<QString>> *messageList)
future.waitForFinished();
QList<Document> docList = future.results();
m_doc_list = new QList<Document>(docList);
qDebug()<<m_doc_list;
m_doc_list_path = new QList<Document>(docList);
qDebug()<<m_doc_list_path->size();
qDebug()<<"Finish HandlePathList!";
return;
}
/**
 * Content-index overload: maps every entry of messageList through
 * GenerateContentDocument with QtConcurrent, waits for completion and stores
 * the resulting documents in a newly allocated list (m_doc_list_content).
 * @param messageList: entries passed one by one to GenerateContentDocument
 */
void IndexGenerator::HandlePathList(QVector<QString> *messageList)
{
    qDebug() << "Begin HandlePathList for content index!";
    qDebug() << messageList->size();

    QFuture<Document> pending =
        QtConcurrent::mapped(*messageList, &IndexGenerator::GenerateContentDocument);
    // Block until every entry has been converted.
    pending.waitForFinished();

    QList<Document> contentDocs = pending.results();
    m_doc_list_content = new QList<Document>(contentDocs);

    qDebug() << m_doc_list_content->size();
    qDebug() << "Finish HandlePathList for content index!";
}
Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
{
// qDebug()<<QString::number(quintptr(QThread::currentThreadId()));
@ -157,6 +180,19 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
}
/**
 * Builds the Document for the text-content index of one file.
 * The file's text is read through FileReader::getTextContent, but it is not
 * stored in the returned Document; only the path (as data) and the unique term
 * derived from the path are set.
 * @param path: path of the file
 * @return Document carrying the path as data and its unique term
 */
Document IndexGenerator::GenerateContentDocument(const QString &path)
{
    // Build the document for the text-content index.
    // getTextContent() returns a heap-allocated QString (or nullptr). Nothing
    // keeps the text yet, so free it here rather than leaking one buffer per
    // indexed file; deleting a null pointer is a no-op.
    QString *content = FileReader::getTextContent(path);
    delete content;

    QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(path));
    Document doc;
    doc.setData(path);
    doc.setUniqueTerm(uniqueterm);
    return doc;
}
bool IndexGenerator::isIndexdataExist()
{
@ -249,9 +285,9 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist)
try
{
qDebug()<<"--delete start--";
m_datebase->delete_document(uniqueterm);
m_datebase_path->delete_document(uniqueterm);
qDebug()<<"delete md5"<<QString::fromStdString(uniqueterm);
m_datebase->commit();
m_datebase_path->commit();
qDebug()<< "--delete finish--";
}
catch(const Xapian::Error &e)

View File

@ -8,6 +8,7 @@
#include <QMap>
#include <QCryptographicHash>
#include "document.h"
#include "file-reader.h"
class IndexGenerator : public QObject
{
@ -22,27 +23,30 @@ Q_SIGNALS:
void searchFinish();
public Q_SLOTS:
bool creatAllIndex(QList<QVector<QString>> *messageList);
bool creatAllIndex(QVector<QString> *messageList);
bool deleteAllIndex(QStringList *pathlist);
private:
explicit IndexGenerator(QObject *parent = nullptr);
//For file name index
void HandlePathList(QList<QVector<QString>> *messageList);
//For file content index
void HandlePathList(QVector<QString> *messageList);
static Document GenerateDocument(const QVector<QString> &list);
static Document GenerateContentDocument(const QString &list);
//add one data in database
void insertIntoDatabase(Document doc);
~IndexGenerator();
QMap<QString,QStringList> *m_index_map;
QList<Document> *m_doc_list;
QCryptographicHash *m_cryp;
QList<Document> *m_doc_list_path; //for path index
QList<Document> *m_doc_list_content; // for text content index
QString *m_index_data_path;
Xapian::WritableDatabase *m_datebase;
Xapian::WritableDatabase *m_datebase_path;
Xapian::WritableDatabase *m_database_content;
std::string m_docstr;
std::string m_index_text_str;
Xapian::TermGenerator *m_indexer;
};
#endif // INDEXGENERATOR_H

View File

@ -5,6 +5,7 @@ HEADERS += \
$$PWD/blockdirs.h \
$$PWD/document.h \
$$PWD/filetypefilter.h \
$$PWD/file-reader.h \
$$PWD/index-generator.h \
# $$PWD/inotify-manager.h \
$$PWD/inotify.h \
@ -19,6 +20,7 @@ SOURCES += \
$$PWD/blockdirs.cpp \
$$PWD/document.cpp \
$$PWD/filetypefilter.cpp \
$$PWD/file-reader.cpp \
$$PWD/index-generator.cpp \
# $$PWD/inotify-manager.cpp \
$$PWD/inotify.cpp \

View File

@ -95,8 +95,8 @@ int main(int argc, char *argv[])
w->activateWindow();
// w->loadMainWindow();
app.setActivationWindow(w);
if(arguments.size()>1)
w->searchContent(arguments.at(1));
// if(arguments.size()>1)
// w->searchContent(arguments.at(1));
QObject::connect(&app, SIGNAL(messageReceived(const QString&)),w, SLOT(bootOptionsFilter(const QString&)));

View File

@ -147,7 +147,9 @@ void MainWindow::initUi()
m_contentFrame->setCurrentIndex(0);
} else {
m_contentFrame->setCurrentIndex(1);
QTimer::singleShot(50,this,[=](){
searchContent(text);
});
}
});

View File

@ -23,7 +23,7 @@ include(appsearch/appsearch.pri)
include(singleapplication/qt-single-application.pri)
include(settingsearch/settingsearch.pri)
LIBS = -lxapian -lgsettings-qt
LIBS = -lxapian -lgsettings-qt -lquazip5
# Default rules for deployment.
qnx: target.path = /tmp/$${TARGET}/bin
else: unix:!android: target.path = /opt/$${TARGET}/bin