From d3caad4f56e481a5f58b5bb6e6e802a8aa3e4f25 Mon Sep 17 00:00:00 2001
From: iaom <18504285112@163.com>
Date: Thu, 15 Apr 2021 09:19:36 +0800
Subject: [PATCH] Add support for pdf in file parser.

getPdfTextContent() returns early when the document cannot be loaded or is
locked, and keeps the Poppler document and page objects in QScopedPointer so
they are released on every exit path.
---
Review notes (not part of the commit message):
 - The original getPdfTextContent() dereferenced the result of
   Poppler::Document::load() without a null check, leaked the document when
   it was locked, and never deleted the pages returned by Document::page().
 - The commit id and blob ids below are those of the original patch; they
   were not regenerated after the change.
 - Several context lines appear to have lost their `<...>` spans (bare
   `#include`, `std::map::value_type`) and the diff/hunk headers of
   libsearch/index/first-index.h are missing; those spans are kept as found.
   Indentation and blank context lines were restored by hand.

 debian/control                  |  3 ++-
 libsearch/file-utils.cpp        | 37 +++++++++++++++++++++++++++++++++++--
 libsearch/file-utils.h          |  2 ++
 libsearch/index/file-reader.cpp |  5 +++++
 libsearch/index/first-index.h   |  3 ++-
 libsearch/index/inotify-index.h |  3 ++-
 libsearch/libsearch.pro         |  2 +-
 7 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/debian/control b/debian/control
index d58c01b..411e22f 100644
--- a/debian/control
+++ b/debian/control
@@ -16,7 +16,8 @@ Build-Depends: debhelper (>=9.0.0),
                libkf5windowsystem-dev,
                libgsettings-qt-dev,
                libqt5x11extras5-dev,
-               libuchardet-dev
+               libuchardet-dev,
+               libpoppler-qt5-dev
 Standards-Version: 4.5.0
 Homepage: https://www.ukui.org/
 Vcs-Git: https://github.com/ukui/ukui-search.git
diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp
index eb827ae..41c5110 100644
--- a/libsearch/file-utils.cpp
+++ b/libsearch/file-utils.cpp
@@ -30,6 +30,8 @@
 #include
 #include
+#include <QScopedPointer>
 #include "uchardet/uchardet.h"
+#include "poppler-qt5.h"
 
 size_t FileUtils::_max_index_count = 0;
 
@@ -619,7 +621,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent)
                         while(!ar.isNull())
                         {
                             at = ar.firstChildElement("a:t");
-                            textcontent.append(at.text().replace("\r","")).replace("\t"," ");
+                            textcontent.append(at.text().replace("\r","")).replace("\t","");
                             if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
                             {
                                 file.close();
@@ -678,7 +680,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
                 }
                 if(t.isNull())
                     continue;
-                textcontent.append(t.text().replace("\r","").replace("\n"," "));
+                textcontent.append(t.text().replace("\r","").replace("\n",""));
                 if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
                 {
                     file.close();
@@ -692,6 +694,37 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
     return;
 }
 
+/**
+ * @brief Append the plain text of a PDF file to textcontent.
+ *
+ * Pages are read in order and newlines are stripped; extraction stops once
+ * textcontent.length() reaches MAX_CONTENT_LENGTH/3. Nothing is appended
+ * when the file cannot be loaded as a PDF or the document is locked.
+ *
+ * @param path         Path of the PDF file to read.
+ * @param textcontent  Output buffer the extracted text is appended to.
+ */
+void FileUtils::getPdfTextContent(QString &path, QString &textcontent)
+{
+    // load() yields a null pointer for unreadable files; the scoped pointer
+    // releases the document on every exit path.
+    QScopedPointer<Poppler::Document> doc(Poppler::Document::load(path));
+    if(doc.isNull() || doc->isLocked())
+        return;
+    const QRectF qf; // null rect: text() returns the text of the whole page
+    const int pageNum = doc->numPages();
+    for(int i = 0; i < pageNum; ++i)
+    {
+        // page() returns a caller-owned object and may fail for a damaged page.
+        QScopedPointer<Poppler::Page> page(doc->page(i));
+        if(page.isNull())
+            continue;
+        textcontent.append(page->text(qf).replace("\n",""));
+        if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
+            break;
+    }
+}
+
 void FileUtils::getTxtContent(QString &path, QString &textcontent)
 {
     QFile file(path);
diff --git a/libsearch/file-utils.h b/libsearch/file-utils.h
index 172bc9a..76b093e 100644
--- a/libsearch/file-utils.h
+++ b/libsearch/file-utils.h
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+
 #include "libsearch_global.h"
 //#define INITIAL_STATE 0
 //#define CREATING_INDEX 1
@@ -67,6 +68,7 @@ public:
     static void getDocxTextContent(QString &path, QString &textcontent);
     static void getPptxTextContent(QString &path, QString &textcontent);
     static void getXlsxTextContent(QString &path, QString &textcontent);
+    static void getPdfTextContent(QString &path, QString &textcontent);
     static void getTxtContent(QString &path, QString &textcontent);
     static size_t _max_index_count;
     static size_t _current_index_count; //this one has been Abandoned,do not use it.
diff --git a/libsearch/index/file-reader.cpp b/libsearch/index/file-reader.cpp
index b3dc833..35afefa 100644
--- a/libsearch/index/file-reader.cpp
+++ b/libsearch/index/file-reader.cpp
@@ -55,6 +55,11 @@ void FileReader::getTextContent(QString path, QString &textContent)
             searchdata.RunParser(path,textContent);
         }
     }
+    else if(name == "application/pdf")
+    {
+        if(strsfx.endsWith( "pdf"))
+            FileUtils::getPdfTextContent(path,textContent);
+    }
     else
     {
         qWarning()<<"Unsupport format:["<::value_type("wps", true),
         std::map::value_type("pps", true),
         std::map::value_type("dps", true),
-        std::map::value_type("et", true)
+        std::map::value_type("et", true),
+        std::map::value_type("pdf", true)
     };
     //xapian will auto commit per 10,000 changes, donnot change it!!!
diff --git a/libsearch/index/inotify-index.h b/libsearch/index/inotify-index.h
index d50dc99..0bbb7c9 100644
--- a/libsearch/index/inotify-index.h
+++ b/libsearch/index/inotify-index.h
@@ -73,7 +73,8 @@ private:
         std::map::value_type("wps", true),
         std::map::value_type("pps", true),
         std::map::value_type("dps", true),
-        std::map::value_type("et", true)
+        std::map::value_type("et", true),
+        std::map::value_type("pdf", true)
     };
 };
 
diff --git a/libsearch/libsearch.pro b/libsearch/libsearch.pro
index 2c584c5..7a11e61 100644
--- a/libsearch/libsearch.pro
+++ b/libsearch/libsearch.pro
@@ -5,7 +5,7 @@ TARGET = ukui-search
 TEMPLATE = lib
 DEFINES += LIBSEARCH_LIBRARY
 
-PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 gsettings-qt
+PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 gsettings-qt poppler-qt5
 
 CONFIG += c++11 link_pkgconfig no_keywords lrelease
 