From f1485deeac782dbdc8972bd52273a667050eb386 Mon Sep 17 00:00:00 2001 From: iaom <18504285112@163.com> Date: Thu, 8 Apr 2021 16:11:58 +0800 Subject: [PATCH 1/3] Add support for 'pptx'. --- libsearch/file-utils.cpp | 79 ++++++++++++++++++++++++++++++++- libsearch/file-utils.h | 2 + libsearch/index/file-reader.cpp | 2 + libsearch/index/first-index.h | 4 +- libsearch/index/inotify-index.h | 4 +- 5 files changed, 85 insertions(+), 6 deletions(-) diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index 0cb80c7..73dc8f5 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -502,6 +502,7 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi) */ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) { + //fix me :optimized by xpath?? QFileInfo info = QFileInfo(path); if(!info.exists()||info.isDir()) return; @@ -517,6 +518,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) QDomDocument doc; doc.setContent(fileR.readAll()); + fileR.close(); QDomElement first = doc.firstChildElement("w:document"); QDomElement body = first.firstChildElement("w:body"); while(!body.isNull()) @@ -529,7 +531,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) { QDomElement wt = wr.firstChildElement("w:t"); textcontent.append(wt.text().replace("\n","")); - if(textcontent.length() >= 682666) //20480000/3 + if(textcontent.length() >= MAX_CONTENT_LENGTH/3) { file.close(); return; @@ -544,13 +546,86 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) return; } +void FileUtils::getPptxTextContent(QString &path, QString &textcontent) +{ + //fix me :optimized by xpath?? 
+ QFileInfo info = QFileInfo(path); + if(!info.exists()||info.isDir()) + return; + QuaZip file(path); + if(!file.open(QuaZip::mdUnzip)) + return; + QString prefix("ppt/slides/slide"); + QStringList fileList; + for(QString i : file.getFileNameList()) + { + if(i.startsWith(prefix)) + fileList<= MAX_CONTENT_LENGTH/3) + { + file.close(); + return; + } + ar = ar.nextSiblingElement(); + } + ap = ap.nextSiblingElement(); + } + txbody = txbody.nextSiblingElement(); + } + sp = sp.nextSiblingElement(); + } + sptree = sptree.nextSiblingElement(); + } + } + file.close(); + return; +} + void FileUtils::getTxtContent(QString &path, QString &textcontent) { QFile file(path); if(!file.open(QIODevice::ReadOnly|QIODevice::Text)) return; - QByteArray encodedString = file.read(20480000); + QByteArray encodedString = file.read(MAX_CONTENT_LENGTH); uchardet_t chardet = uchardet_new(); if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0) diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h index fca37a0..d04a02b 100644 --- a/libsearch/file-utils.h +++ b/libsearch/file-utils.h @@ -39,6 +39,7 @@ //#define INITIAL_STATE 0 //#define CREATING_INDEX 1 //#define FINISH_CREATING_INDEX 2 +#define MAX_CONTENT_LENGTH 20480000 #define UKUI_SEARCH_PIPE_PATH (QDir::homePath()+"/.config/org.ukui/ukui-search/ukuisearch").toLocal8Bit().constData() @@ -64,6 +65,7 @@ public: //parse text,docx..... static QMimeType getMimetype(QString &path); static void getDocxTextContent(QString &path, QString &textcontent); + static void getPptxTextContent(QString &path, QString &textcontent); static void getTxtContent(QString &path, QString &textcontent); static size_t _max_index_count; static size_t _current_index_count; //this one has been Abandoned,do not use it. 
diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index e4fed6a..e6babca 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -36,6 +36,8 @@ void FileReader::getTextContent(QString path, QString &textContent) { if(strsfx.endsWith( "docx")) FileUtils::getDocxTextContent(path,textContent); + if(strsfx.endsWith( "pptx")) + FileUtils::getPptxTextContent(path,textContent); } else if(name == "text/plain") { diff --git a/libsearch/index/first-index.h b/libsearch/index/first-index.h index d3368fb..1523192 100644 --- a/libsearch/index/first-index.h +++ b/libsearch/index/first-index.h @@ -67,9 +67,9 @@ private: std::map::value_type("doc", true), std::map::value_type("docx", true), std::map::value_type("ppt", true), -// std::map::value_type(".pptx", true), + std::map::value_type("pptx", true), std::map::value_type("xls", true), -// std::map::value_type(".xlsx", true), +// std::map::value_type("xlsx", true), std::map::value_type("txt", true), std::map::value_type("dot", true), std::map::value_type("wps", true), diff --git a/libsearch/index/inotify-index.h b/libsearch/index/inotify-index.h index fb2c4ad..e93cfb7 100644 --- a/libsearch/index/inotify-index.h +++ b/libsearch/index/inotify-index.h @@ -65,9 +65,9 @@ private: std::map::value_type("doc", true), std::map::value_type("docx", true), std::map::value_type("ppt", true), -// std::map::value_type(".pptx", true), + std::map::value_type("pptx", true), std::map::value_type("xls", true), -// std::map::value_type(".xlsx", true), +// std::map::value_type("xlsx", true), std::map::value_type("txt", true), std::map::value_type("dot", true), std::map::value_type("wps", true), From 270f959c38b47eee778a03ecfe18ef3bb1909634 Mon Sep 17 00:00:00 2001 From: iaom <18504285112@163.com> Date: Tue, 13 Apr 2021 13:57:02 +0800 Subject: [PATCH 2/3] Trying to optimize xml parser. 
--- libsearch/file-utils.cpp | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index 73dc8f5..6860ffd 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -548,7 +548,6 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent) void FileUtils::getPptxTextContent(QString &path, QString &textcontent) { - //fix me :optimized by xpath?? QFileInfo info = QFileInfo(path); if(!info.exists()||info.isDir()) return; @@ -569,7 +568,9 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) QDomElement txbody; QDomElement ap; QDomElement ar; + QDomDocument doc; QDomElement at; +// QDomNodeList atList; for(int i =0;i= MAX_CONTENT_LENGTH/3) +// { +// file.close(); +// return; +// } +// } +// } + //This is ugly but seems more efficient when handling a large file. sptree = doc.firstChildElement("p:sld").firstChildElement("p:cSld").firstChildElement("p:spTree"); while(!sptree.isNull()) { From 5a73c03d9005592aa7370d0a7beb40de554a097d Mon Sep 17 00:00:00 2001 From: iaom <18504285112@163.com> Date: Tue, 13 Apr 2021 14:53:55 +0800 Subject: [PATCH 3/3] Add support for xlsx parsing.
--- libsearch/file-utils.cpp | 52 +++++++++++++++++++++++++++++++++ libsearch/file-utils.h | 1 + libsearch/index/file-reader.cpp | 2 ++ libsearch/index/first-index.h | 2 +- libsearch/index/inotify-index.h | 2 +- 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index 6860ffd..eb827ae 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -640,6 +640,58 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) return; } +void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) +{ + QFileInfo info = QFileInfo(path); + if(!info.exists()||info.isDir()) + return; + QuaZip file(path); + if(!file.open(QuaZip::mdUnzip)) + return; + //Only xl/sharedStrings.xml is read; its text is appended to textcontent up to MAX_CONTENT_LENGTH/3 characters. + if(!file.setCurrentFile("xl/sharedStrings.xml",QuaZip::csSensitive)) + return; + QuaZipFile fileR(&file); + + fileR.open(QIODevice::ReadOnly); //open read-only + + QDomDocument doc; + doc.setContent(fileR.readAll()); + fileR.close(); + QDomElement sst = doc.firstChildElement("sst"); + QDomElement si; + QDomElement r; + QDomElement t; + while(!sst.isNull()) + { + si= sst.firstChildElement("si"); + while(!si.isNull()) + { + r= si.firstChildElement("r"); + if(r.isNull()) + { + t= si.firstChildElement("t"); + } + else + { + t = r.firstChildElement("t"); + } + //Entries without text are skipped, but si must still advance below; a bare continue here would never terminate. + if(!t.isNull()) + textcontent.append(t.text().replace("\r","").replace("\n"," ")); + if(textcontent.length() >= MAX_CONTENT_LENGTH/3) + { + file.close(); + return; + } + si = si.nextSiblingElement(); + } + sst = sst.nextSiblingElement(); + } + file.close(); + return; +} + void FileUtils::getTxtContent(QString &path, QString &textcontent) { QFile file(path); diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h index d04a02b..172bc9a 100644 --- a/libsearch/file-utils.h +++ b/libsearch/file-utils.h @@ -66,6 +66,7 @@ public: static QMimeType getMimetype(QString &path); static void getDocxTextContent(QString &path, QString &textcontent); static void 
getPptxTextContent(QString &path, QString &textcontent); + static void getXlsxTextContent(QString &path, QString &textcontent); static void getTxtContent(QString &path, QString &textcontent); static size_t _max_index_count; static size_t _current_index_count; //this one has been Abandoned,do not use it. diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp index e6babca..b3dc833 100644 --- a/libsearch/index/file-reader.cpp +++ b/libsearch/index/file-reader.cpp @@ -38,6 +38,8 @@ void FileReader::getTextContent(QString path, QString &textContent) FileUtils::getDocxTextContent(path,textContent); if(strsfx.endsWith( "pptx")) FileUtils::getPptxTextContent(path,textContent); + if(strsfx.endsWith( "xlsx")) + FileUtils::getXlsxTextContent(path,textContent); } else if(name == "text/plain") { diff --git a/libsearch/index/first-index.h b/libsearch/index/first-index.h index 1523192..9bedf1c 100644 --- a/libsearch/index/first-index.h +++ b/libsearch/index/first-index.h @@ -69,7 +69,7 @@ private: std::map::value_type("ppt", true), std::map::value_type("pptx", true), std::map::value_type("xls", true), -// std::map::value_type("xlsx", true), + std::map::value_type("xlsx", true), std::map::value_type("txt", true), std::map::value_type("dot", true), std::map::value_type("wps", true), diff --git a/libsearch/index/inotify-index.h b/libsearch/index/inotify-index.h index e93cfb7..d50dc99 100644 --- a/libsearch/index/inotify-index.h +++ b/libsearch/index/inotify-index.h @@ -67,7 +67,7 @@ private: std::map::value_type("ppt", true), std::map::value_type("pptx", true), std::map::value_type("xls", true), -// std::map::value_type("xlsx", true), + std::map::value_type("xlsx", true), std::map::value_type("txt", true), std::map::value_type("dot", true), std::map::value_type("wps", true),