From 3d4c4df712dde2539abec4ef04424f41ea2879d4 Mon Sep 17 00:00:00 2001 From: zhangpengfei Date: Tue, 29 Dec 2020 20:18:36 +0800 Subject: [PATCH] Add docx parser. --- file-utils.cpp | 64 +++++++++++++++++++++++++++++++++++ file-utils.h | 9 +++-- index/file-reader.cpp | 20 +++++++++++ index/file-reader.h | 15 +++++++++ index/index-generator.cpp | 70 +++++++++++++++++++++++++++++---------- index/index-generator.h | 16 +++++---- index/index.pri | 2 ++ src/main.cpp | 4 +-- src/mainwindow.cpp | 4 ++- ukui-search.pro | 2 +- 10 files changed, 176 insertions(+), 30 deletions(-) create mode 100644 index/file-reader.cpp create mode 100644 index/file-reader.h diff --git a/file-utils.cpp b/file-utils.cpp index 14b8797..9635f6e 100644 --- a/file-utils.cpp +++ b/file-utils.cpp @@ -1,8 +1,15 @@ #include "file-utils.h" #include #include +#include #include #include +#include "quazip/quazip.h" +#include +#include +#include +#include + QMap FileUtils::map_chinese2pinyin = QMap(); FileUtils::FileUtils() @@ -167,6 +174,16 @@ void FileUtils::loadHanziTable(const QString &fileName) return; } +QString FileUtils::getMimetype(QString &path, bool getsuffix) +{ + QMimeDatabase mdb; + QMimeType type = mdb.mimeTypeForFile(path,QMimeDatabase::MatchContent); + if(getsuffix) + return type.name(); + else + return type.preferredSuffix(); +} + QString FileUtils::find(const QString &hanzi) { // static QMap map = loadHanziTable("://index/pinyinWithoutTone.txt"); @@ -211,4 +228,51 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi) stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output); // qDebug() << output; return output; +/** + * @brief FileUtils::getDocxTextContent + * @param path: abs path + * @return docx to QString + */ +QString *FileUtils::getDocxTextContent(QString &path) +{ + QFileInfo info = QFileInfo(path); + if(!info.exists()||info.isDir()) + return nullptr; + QuaZip file("path"); + if(file.open(QuaZip::mdUnzip)) + return nullptr; + + if(file.setCurrentFile("word/document.xml",QuaZip::csSensitive)) + return nullptr; + QuaZipFile fileR(&file); + + fileR.open(QIODevice::ReadOnly); //读取方式打开 + + QString *allText = new QString(); + QDomDocument doc; + doc.setContent(fileR.readAll()); + QDomElement first = doc.firstChildElement("w:document"); + first = first.firstChildElement().firstChildElement(); + while(!first.isNull()) + { + QDomElement wr= first.firstChildElement("w:r"); + while(!wr.isNull()) + { + QDomElement wt = wr.firstChildElement("w:t"); + allText->append(wt.text()); + wr = wr.nextSiblingElement(); + } + first = first.nextSiblingElement(); + } + qDebug()<<"size!!!"<size(); + return allText; +} + +QString *FileUtils::getTxtContent(QString &path) +{ + QFile file(path); + if(!file.open(QIODevice::ReadOnly|QIODevice::Text)) + return nullptr; + QString *allText = new QString(file.readAll()); + return allText; } diff --git a/file-utils.h b/file-utils.h index af7fe8e..3a3ecd0 100644 --- a/file-utils.h +++ b/file-utils.h @@ -13,7 +13,6 @@ class FileUtils { public: static std::string makeDocUterm(QString ); - static QIcon getFileIcon(const QString &, bool checkValid = true); static QIcon getAppIcon(const QString &); static QIcon getSettingIcon(const QString &, const bool&); @@ -22,13 +21,17 @@ public: static QString getAppName(const QString &); static QString getSettingName(const QString &); - static QMap map_chinese2pinyin; - //chinese character to pinyin + static QMap map_chinese2pinyin; static QString find(const QString&); static QStringList findMultiToneWords(const QString&); static void loadHanziTable(const QString&); + //parse text,docx..... + static QString getMimetype(QString &path, bool getsuffix = false); + static QString * getDocxTextContent(QString &path); + static QString * getTxtContent(QString &path); + private: FileUtils(); }; diff --git a/index/file-reader.cpp b/index/file-reader.cpp new file mode 100644 index 0000000..bed96a0 --- /dev/null +++ b/index/file-reader.cpp @@ -0,0 +1,20 @@ +#include "file-reader.h" +#include "file-utils.h" + +FileReader::FileReader(QObject *parent) : QObject(parent) +{ + +} + +QString *FileReader::getTextContent(QString path) +{ + //获取所有文件内容 + //先分类 + QString type =FileUtils::getMimetype(path); + if(type == "application/zip") + return FileUtils::getDocxTextContent(path); + else if(type == "text/plain") + return FileUtils::getTxtContent(path); + + return new QString(); +} diff --git a/index/file-reader.h b/index/file-reader.h new file mode 100644 index 0000000..69256c9 --- /dev/null +++ b/index/file-reader.h @@ -0,0 +1,15 @@ +#ifndef FILEREADER_H +#define FILEREADER_H + +#include + +class FileReader : public QObject +{ + Q_OBJECT +public: + explicit FileReader(QObject *parent = nullptr); + static QString* getTextContent(QString path); + +}; + +#endif // FILEREADER_H diff --git a/index/index-generator.cpp b/index/index-generator.cpp index 0e29ebc..4ddb7ea 100644 --- a/index/index-generator.cpp +++ b/index/index-generator.cpp @@ -11,6 +11,8 @@ using namespace std; #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/index_data").toStdString() +#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/content_index_data").toStdString() + static IndexGenerator *global_instance = nullptr; IndexGenerator *IndexGenerator::getInstance() @@ -32,43 +34,46 @@ bool IndexGenerator::creatAllIndex(QList > *messageList) try { m_indexer = new Xapian::TermGenerator(); - m_indexer->set_database(*m_datebase); + m_indexer->set_database(*m_datebase_path); //可以实现拼写纠正 // m_indexer->set_flags(Xapian::TermGenerator::FLAG_SPELLING); m_indexer->set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); - - int count =0; - for(int i = 0;i < m_doc_list->size(); i++) + for(int i = 0;i < m_doc_list_path->size(); i++) { - insertIntoDatabase(m_doc_list->at(i)); + insertIntoDatabase(m_doc_list_path->at(i)); if(++count == 9999) { count = 0; - m_datebase->commit(); + m_datebase_path->commit(); } } - m_datebase->commit(); - - + m_datebase_path->commit(); } catch(const Xapian::Error &e) { qDebug()<<"creatAllIndex fail!"<clear(); + m_doc_list_path->clear(); Q_EMIT this->transactionFinished(); return true; } +bool IndexGenerator::creatAllIndex(QVector *messageList) +{ + HandlePathList(messageList); + return true; + +} + IndexGenerator::IndexGenerator(QObject *parent) : QObject(parent) { - m_datebase = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); - m_cryp = new QCryptographicHash(QCryptographicHash::Md5); + m_datebase_path = new Xapian::WritableDatabase(INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); + m_database_content = new Xapian::WritableDatabase(CONTENT_INDEX_PATH, Xapian::DB_CREATE_OR_OPEN); } IndexGenerator::~IndexGenerator() @@ -86,7 +91,7 @@ void IndexGenerator::insertIntoDatabase(Document doc) m_indexer->index_text(i.toStdString()); } - Xapian::docid innerId= m_datebase->replace_document(doc.getUniqueTerm(),document); + Xapian::docid innerId= m_datebase_path->replace_document(doc.getUniqueTerm(),document); // qDebug()<<"replace doc docid="<(innerId); // qDebug()<< "--index finish--"; return; @@ -102,13 +107,31 @@ void IndexGenerator::HandlePathList(QList> *messageList) future.waitForFinished(); QList docList = future.results(); - m_doc_list = new QList(docList); - qDebug()<(docList); + qDebug()<size(); qDebug()<<"Finish HandlePathList!"; return; } +void IndexGenerator::HandlePathList(QVector *messageList) +{ + qDebug()<<"Begin HandlePathList for content index!"; + qDebug()<size(); +// qDebug()< future = QtConcurrent::mapped(*messageList,&IndexGenerator::GenerateContentDocument); + + future.waitForFinished(); + + QList docList = future.results(); + m_doc_list_content = new QList(docList); + qDebug()<size(); + + qDebug()<<"Finish HandlePathList for content index!"; + return; + +} + Document IndexGenerator::GenerateDocument(const QVector &list) { // qDebug()< &list) } +Document IndexGenerator::GenerateContentDocument(const QString &path) +{ + //构造文本索引的document + FileReader::getTextContent(path); + QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(path)); + Document doc; + doc.setData(path); + doc.setUniqueTerm(uniqueterm); + return doc; + + +} + bool IndexGenerator::isIndexdataExist() { @@ -249,9 +285,9 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist) try { qDebug()<<"--delete start--"; - m_datebase->delete_document(uniqueterm); + m_datebase_path->delete_document(uniqueterm); qDebug()<<"delete md5"<commit(); + m_datebase_path->commit(); qDebug()<< "--delete finish--"; } catch(const Xapian::Error &e) diff --git a/index/index-generator.h b/index/index-generator.h index 8da03fe..065c44d 100644 --- a/index/index-generator.h +++ b/index/index-generator.h @@ -8,6 +8,7 @@ #include #include #include "document.h" +#include "file-reader.h" class IndexGenerator : public QObject { @@ -22,27 +23,30 @@ Q_SIGNALS: void searchFinish(); public Q_SLOTS: bool creatAllIndex(QList> *messageList); + bool creatAllIndex(QVector *messageList); bool deleteAllIndex(QStringList *pathlist); private: explicit IndexGenerator(QObject *parent = nullptr); + //For file name index void HandlePathList(QList> *messageList); + //For file content index + void HandlePathList(QVector *messageList); static Document GenerateDocument(const QVector &list); + static Document GenerateContentDocument(const QString &list); //add one data in database void insertIntoDatabase(Document doc); ~IndexGenerator(); QMap *m_index_map; - QList *m_doc_list; - - QCryptographicHash *m_cryp; + QList *m_doc_list_path; //for path index + QList *m_doc_list_content; // for text content index QString *m_index_data_path; - Xapian::WritableDatabase *m_datebase; + Xapian::WritableDatabase *m_datebase_path; + Xapian::WritableDatabase *m_database_content; std::string m_docstr; std::string m_index_text_str; Xapian::TermGenerator *m_indexer; - - }; #endif // INDEXGENERATOR_H diff --git a/index/index.pri b/index/index.pri index 96b2aba..d800e45 100644 --- a/index/index.pri +++ b/index/index.pri @@ -5,6 +5,7 @@ HEADERS += \ $$PWD/blockdirs.h \ $$PWD/document.h \ $$PWD/filetypefilter.h \ + $$PWD/file-reader.h \ $$PWD/index-generator.h \ # $$PWD/inotify-manager.h \ $$PWD/inotify.h \ @@ -19,6 +20,7 @@ SOURCES += \ $$PWD/blockdirs.cpp \ $$PWD/document.cpp \ $$PWD/filetypefilter.cpp \ + $$PWD/file-reader.cpp \ $$PWD/index-generator.cpp \ # $$PWD/inotify-manager.cpp \ $$PWD/inotify.cpp \ diff --git a/src/main.cpp b/src/main.cpp index cb8a9e2..c4753c2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) w->activateWindow(); // w->loadMainWindow(); app.setActivationWindow(w); - if(arguments.size()>1) - w->searchContent(arguments.at(1)); +// if(arguments.size()>1) +// w->searchContent(arguments.at(1)); QObject::connect(&app, SIGNAL(messageReceived(const QString&)),w, SLOT(bootOptionsFilter(const QString&))); diff --git a/src/mainwindow.cpp b/src/mainwindow.cpp index 667df67..c199b80 100644 --- a/src/mainwindow.cpp +++ b/src/mainwindow.cpp @@ -147,7 +147,9 @@ void MainWindow::initUi() m_contentFrame->setCurrentIndex(0); } else { m_contentFrame->setCurrentIndex(1); - searchContent(text); + QTimer::singleShot(50,this,[=](){ + searchContent(text); + }); } }); diff --git a/ukui-search.pro b/ukui-search.pro index 43272df..20cbc34 100644 --- a/ukui-search.pro +++ b/ukui-search.pro @@ -23,7 +23,7 @@ include(appsearch/appsearch.pri) include(singleapplication/qt-single-application.pri) include(settingsearch/settingsearch.pri) -LIBS = -lxapian -lgsettings-qt +LIBS = -lxapian -lgsettings-qt -lquazip5 # Default rules for deployment. qnx: target.path = /tmp/$${TARGET}/bin else: unix:!android: target.path = /opt/$${TARGET}/bin