Add docx parser.
This commit is contained in:
parent
c34d0dbdce
commit
3d4c4df712
|
@ -1,8 +1,15 @@
|
|||
#include "file-utils.h"
|
||||
#include <QDebug>
|
||||
#include <QFile>
|
||||
#include <QFileInfo>
|
||||
#include <QUrl>
|
||||
#include <QMap>
|
||||
#include "quazip/quazip.h"
|
||||
#include <quazip/quazipfile.h>
|
||||
#include <QDomDocument>
|
||||
#include <QMimeDatabase>
|
||||
#include <QMimeType>
|
||||
|
||||
QMap<QString, QStringList> FileUtils::map_chinese2pinyin = QMap<QString, QStringList>();
|
||||
|
||||
FileUtils::FileUtils()
|
||||
|
@ -167,6 +174,16 @@ void FileUtils::loadHanziTable(const QString &fileName)
|
|||
return;
|
||||
}
|
||||
|
||||
/**
 * @brief FileUtils::getMimetype
 * Detects the MIME type of a file with QMimeDatabase, matching on the file
 * content (QMimeDatabase::MatchContent) rather than on the file name.
 * @param path: path of the file to inspect
 * @param getsuffix: true to get the preferred file suffix of the detected
 *        type, false (the declared default) to get the MIME type name
 * @return the preferred suffix when getsuffix is true, otherwise the MIME
 *         type name
 */
QString FileUtils::getMimetype(QString &path, bool getsuffix)
{
    QMimeDatabase mdb;
    QMimeType type = mdb.mimeTypeForFile(path, QMimeDatabase::MatchContent);

    // The two branches used to be swapped: asking for the suffix returned the
    // type name and the default call returned the suffix, so callers that
    // compare the default result against MIME names never matched.
    if(getsuffix)
        return type.preferredSuffix();

    return type.name();
}
|
||||
|
||||
QString FileUtils::find(const QString &hanzi)
|
||||
{
|
||||
// static QMap<QString, QStringList> map = loadHanziTable("://index/pinyinWithoutTone.txt");
|
||||
|
@ -211,4 +228,51 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi)
|
|||
stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output);
|
||||
// qDebug() << output;
|
||||
return output;
|
||||
/**
|
||||
* @brief FileUtils::getDocxTextContent
|
||||
* @param path: abs path
|
||||
* @return docx to QString
|
||||
*/
|
||||
QString *FileUtils::getDocxTextContent(QString &path)
|
||||
{
|
||||
QFileInfo info = QFileInfo(path);
|
||||
if(!info.exists()||info.isDir())
|
||||
return nullptr;
|
||||
QuaZip file("path");
|
||||
if(file.open(QuaZip::mdUnzip))
|
||||
return nullptr;
|
||||
|
||||
if(file.setCurrentFile("word/document.xml",QuaZip::csSensitive))
|
||||
return nullptr;
|
||||
QuaZipFile fileR(&file);
|
||||
|
||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
||||
|
||||
QString *allText = new QString();
|
||||
QDomDocument doc;
|
||||
doc.setContent(fileR.readAll());
|
||||
QDomElement first = doc.firstChildElement("w:document");
|
||||
first = first.firstChildElement().firstChildElement();
|
||||
while(!first.isNull())
|
||||
{
|
||||
QDomElement wr= first.firstChildElement("w:r");
|
||||
while(!wr.isNull())
|
||||
{
|
||||
QDomElement wt = wr.firstChildElement("w:t");
|
||||
allText->append(wt.text());
|
||||
wr = wr.nextSiblingElement();
|
||||
}
|
||||
first = first.nextSiblingElement();
|
||||
}
|
||||
qDebug()<<"size!!!"<<allText->size();
|
||||
return allText;
|
||||
}
|
||||
|
||||
/**
 * @brief FileUtils::getTxtContent
 * Reads the whole file in text mode and returns its content.
 * @param path: path of the file to read
 * @return a heap-allocated QString built from the file's bytes (the caller
 *         is responsible for deleting it), or nullptr when the file cannot
 *         be opened for reading
 */
QString *FileUtils::getTxtContent(QString &path)
{
    QFile txtFile(path);
    const bool opened = txtFile.open(QIODevice::ReadOnly|QIODevice::Text);
    if(!opened)
        return nullptr;

    // NOTE(review): the bytes are converted with QString's QByteArray
    // constructor; confirm the expected file encoding against the callers.
    return new QString(txtFile.readAll());
}
|
||||
|
|
|
@ -13,7 +13,6 @@ class FileUtils
|
|||
{
|
||||
public:
|
||||
static std::string makeDocUterm(QString );
|
||||
|
||||
static QIcon getFileIcon(const QString &, bool checkValid = true);
|
||||
static QIcon getAppIcon(const QString &);
|
||||
static QIcon getSettingIcon(const QString &, const bool&);
|
||||
|
@ -22,13 +21,17 @@ public:
|
|||
static QString getAppName(const QString &);
|
||||
static QString getSettingName(const QString &);
|
||||
|
||||
static QMap<QString, QStringList> map_chinese2pinyin;
|
||||
|
||||
//chinese character to pinyin
|
||||
static QMap<QString, QStringList> map_chinese2pinyin;
|
||||
static QString find(const QString&);
|
||||
static QStringList findMultiToneWords(const QString&);
|
||||
static void loadHanziTable(const QString&);
|
||||
|
||||
//parse text,docx.....
|
||||
static QString getMimetype(QString &path, bool getsuffix = false);
|
||||
static QString * getDocxTextContent(QString &path);
|
||||
static QString * getTxtContent(QString &path);
|
||||
|
||||
private:
|
||||
FileUtils();
|
||||
};
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
#include "file-reader.h"
|
||||
#include "file-utils.h"
|
||||
|
||||
/**
 * @brief FileReader::FileReader
 * Forwards the parent to QObject; no other initialisation is performed.
 * @param parent: QObject parent
 */
FileReader::FileReader(QObject *parent)
    : QObject(parent)
{
}
|
||||
|
||||
/**
 * @brief FileReader::getTextContent
 * Classifies the file first, then extracts its text content with the parser
 * that matches the classification.
 * @param path: path of the file to read
 * @return the pointer returned by FileUtils::getDocxTextContent when the type
 *         string is "application/zip", the pointer returned by
 *         FileUtils::getTxtContent when it is "text/plain" (either may be
 *         nullptr), and a newly allocated empty QString for any other type
 */
QString *FileReader::getTextContent(QString path)
{
    // Classify the file using FileUtils::getMimetype with its default
    // arguments.
    const QString detected = FileUtils::getMimetype(path);

    if(detected == "application/zip")
        return FileUtils::getDocxTextContent(path);

    if(detected == "text/plain")
        return FileUtils::getTxtContent(path);

    // Unsupported type: hand back an empty string.
    return new QString();
}
|
|
@ -0,0 +1,15 @@
|
|||
#ifndef FILEREADER_H
|
||||
#define FILEREADER_H
|
||||
|
||||
#include <QObject>
|
||||
|
||||
class FileReader : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
explicit FileReader(QObject *parent = nullptr);
|
||||
static QString* getTextContent(QString path);
|
||||
|
||||
};
|
||||
|
||||
#endif // FILEREADER_H
|
|
@ -11,6 +11,8 @@
|
|||
using namespace std;
|
||||
|
||||
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/index_data").toStdString()
|
||||
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/content_index_data").toStdString()
|
||||
|
||||
static IndexGenerator *global_instance = nullptr;
|
||||
|
||||
IndexGenerator *IndexGenerator::getInstance()
|
||||
|
@ -32,43 +34,46 @@ bool IndexGenerator::creatAllIndex(QList<QVector<QString> > *messageList)
|
|||
try
|
||||
{
|
||||
m_indexer = new Xapian::TermGenerator();
|
||||
m_indexer->set_database(*m_datebase);
|
||||
m_indexer->set_database(*m_datebase_path);
|
||||
//可以实现拼写纠正
|
||||
// m_indexer->set_flags(Xapian::TermGenerator::FLAG_SPELLING);
|
||||
m_indexer->set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
|
||||
|
||||
|
||||
|
||||
int count =0;
|
||||
for(int i = 0;i < m_doc_list->size(); i++)
|
||||
for(int i = 0;i < m_doc_list_path->size(); i++)
|
||||
{
|
||||
insertIntoDatabase(m_doc_list->at(i));
|
||||
insertIntoDatabase(m_doc_list_path->at(i));
|
||||
|
||||
if(++count == 9999)
|
||||
{
|
||||
count = 0;
|
||||
m_datebase->commit();
|
||||
m_datebase_path->commit();
|
||||
}
|
||||
}
|
||||
m_datebase->commit();
|
||||
|
||||
|
||||
m_datebase_path->commit();
|
||||
}
|
||||
catch(const Xapian::Error &e)
|
||||
{
|
||||
qDebug()<<"creatAllIndex fail!"<<QString::fromStdString(e.get_description());
|
||||
return false;
|
||||
}
|
||||
m_doc_list->clear();
|
||||
m_doc_list_path->clear();
|
||||
Q_EMIT this->transactionFinished();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * @brief IndexGenerator::creatAllIndex
 * Content-index overload: hands the path list to
 * HandlePathList(QVector<QString>*) to build the content documents.
 * @param messageList: file paths to process
 * @return always true
 */
bool IndexGenerator::creatAllIndex(QVector<QString> *messageList)
{
    this->HandlePathList(messageList);

    return true;
}
|
||||
|
||||
IndexGenerator::IndexGenerator(QObject *parent) : QObject(parent)
|
||||
{
|
||||
m_datebase = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||
m_cryp = new QCryptographicHash(QCryptographicHash::Md5);
|
||||
m_datebase_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||
m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN);
|
||||
}
|
||||
|
||||
IndexGenerator::~IndexGenerator()
|
||||
|
@ -86,7 +91,7 @@ void IndexGenerator::insertIntoDatabase(Document doc)
|
|||
m_indexer->index_text(i.toStdString());
|
||||
}
|
||||
|
||||
Xapian::docid innerId= m_datebase->replace_document(doc.getUniqueTerm(),document);
|
||||
Xapian::docid innerId= m_datebase_path->replace_document(doc.getUniqueTerm(),document);
|
||||
// qDebug()<<"replace doc docid="<<static_cast<int>(innerId);
|
||||
// qDebug()<< "--index finish--";
|
||||
return;
|
||||
|
@ -102,13 +107,31 @@ void IndexGenerator::HandlePathList(QList<QVector<QString>> *messageList)
|
|||
future.waitForFinished();
|
||||
|
||||
QList<Document> docList = future.results();
|
||||
m_doc_list = new QList<Document>(docList);
|
||||
qDebug()<<m_doc_list;
|
||||
m_doc_list_path = new QList<Document>(docList);
|
||||
qDebug()<<m_doc_list_path->size();
|
||||
|
||||
qDebug()<<"Finish HandlePathList!";
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * @brief IndexGenerator::HandlePathList
 * Content-index overload: maps every path through GenerateContentDocument
 * with QtConcurrent::mapped, waits for the mapping to finish and stores the
 * resulting documents in a newly allocated list assigned to
 * m_doc_list_content.
 * @param messageList: file paths to turn into content documents
 */
void IndexGenerator::HandlePathList(QVector<QString> *messageList)
{
    qDebug()<<"Begin HandlePathList for content index!";
    qDebug()<<messageList->size();

    QFuture<Document> pending = QtConcurrent::mapped(*messageList,&IndexGenerator::GenerateContentDocument);
    pending.waitForFinished();

    // Keep the generated documents for the content index.
    m_doc_list_content = new QList<Document>(pending.results());
    qDebug()<<m_doc_list_content->size();

    qDebug()<<"Finish HandlePathList for content index!";
}
|
||||
|
||||
Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
|
||||
{
|
||||
// qDebug()<<QString::number(quintptr(QThread::currentThreadId()));
|
||||
|
@ -157,6 +180,19 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
|
|||
|
||||
}
|
||||
|
||||
/**
 * @brief IndexGenerator::GenerateContentDocument
 * Builds the Document used for the text-content index of one file.
 * @param path: path of the file
 * @return a Document whose data is the path and whose unique term comes from
 *         FileUtils::makeDocUterm(path)
 */
Document IndexGenerator::GenerateContentDocument(const QString &path)
{
    // FileReader::getTextContent returns a heap-allocated QString (or
    // nullptr). The text is not stored in the document here, so release it
    // instead of leaking one string per file; deleting nullptr is a no-op.
    QString *content = FileReader::getTextContent(path);
    delete content;

    QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(path));
    Document doc;
    doc.setData(path);
    doc.setUniqueTerm(uniqueterm);
    return doc;
}
|
||||
|
||||
bool IndexGenerator::isIndexdataExist()
|
||||
{
|
||||
|
||||
|
@ -249,9 +285,9 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist)
|
|||
try
|
||||
{
|
||||
qDebug()<<"--delete start--";
|
||||
m_datebase->delete_document(uniqueterm);
|
||||
m_datebase_path->delete_document(uniqueterm);
|
||||
qDebug()<<"delete md5"<<QString::fromStdString(uniqueterm);
|
||||
m_datebase->commit();
|
||||
m_datebase_path->commit();
|
||||
qDebug()<< "--delete finish--";
|
||||
}
|
||||
catch(const Xapian::Error &e)
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <QMap>
|
||||
#include <QCryptographicHash>
|
||||
#include "document.h"
|
||||
#include "file-reader.h"
|
||||
|
||||
class IndexGenerator : public QObject
|
||||
{
|
||||
|
@ -22,27 +23,30 @@ Q_SIGNALS:
|
|||
void searchFinish();
|
||||
public Q_SLOTS:
|
||||
bool creatAllIndex(QList<QVector<QString>> *messageList);
|
||||
bool creatAllIndex(QVector<QString> *messageList);
|
||||
bool deleteAllIndex(QStringList *pathlist);
|
||||
|
||||
private:
|
||||
explicit IndexGenerator(QObject *parent = nullptr);
|
||||
//For file name index
|
||||
void HandlePathList(QList<QVector<QString>> *messageList);
|
||||
//For file content index
|
||||
void HandlePathList(QVector<QString> *messageList);
|
||||
static Document GenerateDocument(const QVector<QString> &list);
|
||||
static Document GenerateContentDocument(const QString &list);
|
||||
//add one data in database
|
||||
void insertIntoDatabase(Document doc);
|
||||
~IndexGenerator();
|
||||
|
||||
QMap<QString,QStringList> *m_index_map;
|
||||
QList<Document> *m_doc_list;
|
||||
|
||||
QCryptographicHash *m_cryp;
|
||||
QList<Document> *m_doc_list_path; //for path index
|
||||
QList<Document> *m_doc_list_content; // for text content index
|
||||
QString *m_index_data_path;
|
||||
Xapian::WritableDatabase *m_datebase;
|
||||
Xapian::WritableDatabase *m_datebase_path;
|
||||
Xapian::WritableDatabase *m_database_content;
|
||||
std::string m_docstr;
|
||||
std::string m_index_text_str;
|
||||
Xapian::TermGenerator *m_indexer;
|
||||
|
||||
|
||||
};
|
||||
|
||||
#endif // INDEXGENERATOR_H
|
||||
|
|
|
@ -5,6 +5,7 @@ HEADERS += \
|
|||
$$PWD/blockdirs.h \
|
||||
$$PWD/document.h \
|
||||
$$PWD/filetypefilter.h \
|
||||
$$PWD/file-reader.h \
|
||||
$$PWD/index-generator.h \
|
||||
# $$PWD/inotify-manager.h \
|
||||
$$PWD/inotify.h \
|
||||
|
@ -19,6 +20,7 @@ SOURCES += \
|
|||
$$PWD/blockdirs.cpp \
|
||||
$$PWD/document.cpp \
|
||||
$$PWD/filetypefilter.cpp \
|
||||
$$PWD/file-reader.cpp \
|
||||
$$PWD/index-generator.cpp \
|
||||
# $$PWD/inotify-manager.cpp \
|
||||
$$PWD/inotify.cpp \
|
||||
|
|
|
@ -95,8 +95,8 @@ int main(int argc, char *argv[])
|
|||
w->activateWindow();
|
||||
// w->loadMainWindow();
|
||||
app.setActivationWindow(w);
|
||||
if(arguments.size()>1)
|
||||
w->searchContent(arguments.at(1));
|
||||
// if(arguments.size()>1)
|
||||
// w->searchContent(arguments.at(1));
|
||||
QObject::connect(&app, SIGNAL(messageReceived(const QString&)),w, SLOT(bootOptionsFilter(const QString&)));
|
||||
|
||||
|
||||
|
|
|
@ -147,7 +147,9 @@ void MainWindow::initUi()
|
|||
m_contentFrame->setCurrentIndex(0);
|
||||
} else {
|
||||
m_contentFrame->setCurrentIndex(1);
|
||||
searchContent(text);
|
||||
QTimer::singleShot(50,this,[=](){
|
||||
searchContent(text);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ include(appsearch/appsearch.pri)
|
|||
include(singleapplication/qt-single-application.pri)
|
||||
include(settingsearch/settingsearch.pri)
|
||||
|
||||
LIBS = -lxapian -lgsettings-qt
|
||||
LIBS = -lxapian -lgsettings-qt -lquazip5
|
||||
# Default rules for deployment.
|
||||
qnx: target.path = /tmp/$${TARGET}/bin
|
||||
else: unix:!android: target.path = /opt/$${TARGET}/bin
|
||||
|
|
Loading…
Reference in New Issue