Add support for pdf in file parser.

This commit is contained in:
iaom 2021-04-15 09:19:36 +08:00
parent 9ec8f32725
commit d3caad4f56
7 changed files with 34 additions and 6 deletions

3
debian/control vendored
View File

@ -16,7 +16,8 @@ Build-Depends: debhelper (>=9.0.0),
libkf5windowsystem-dev,
libgsettings-qt-dev,
libqt5x11extras5-dev,
libuchardet-dev
libuchardet-dev,
libpoppler-qt5-dev
Standards-Version: 4.5.0
Homepage: https://www.ukui.org/
Vcs-Git: https://github.com/ukui/ukui-search.git

View File

@ -30,6 +30,7 @@
#include <QDomDocument>
#include <QQueue>
#include "uchardet/uchardet.h"
#include "poppler-qt5.h"
size_t FileUtils::_max_index_count = 0;
@ -619,7 +620,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent)
while(!ar.isNull())
{
at = ar.firstChildElement("a:t");
textcontent.append(at.text().replace("\r","")).replace("\t"," ");
textcontent.append(at.text().replace("\r","")).replace("\t","");
if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
{
file.close();
@ -678,7 +679,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
}
if(t.isNull())
continue;
textcontent.append(t.text().replace("\r","").replace("\n"," "));
textcontent.append(t.text().replace("\r","").replace("\n",""));
if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
{
file.close();
@ -692,6 +693,23 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
return;
}
/**
 * @brief Append the text of a PDF file to @p textcontent.
 *
 * Pages are read in order; newline characters are stripped from each page's
 * text before it is appended. Extraction stops once @p textcontent reaches
 * MAX_CONTENT_LENGTH/3 characters.
 *
 * Nothing is appended when the file cannot be loaded by Poppler or when the
 * document is locked (password protected).
 *
 * @param path        Path of the PDF file to read.
 * @param textcontent Output buffer; extracted text is appended to it.
 */
void FileUtils::getPdfTextContent(QString &path, QString &textcontent)
{
    Poppler::Document *doc = Poppler::Document::load(path);
    // load() yields a null pointer when the file is missing or not a valid PDF.
    if(!doc)
        return;
    if(doc->isLocked())
    {
        // The document is owned by us even when it is locked; release it.
        delete doc;
        return;
    }
    // A null rectangle asks Poppler for all the text on the page.
    const QRectF qf;
    const int pageNum = doc->numPages();
    for(int i = 0; i < pageNum; ++i)
    {
        // page() hands ownership of the returned object to the caller.
        Poppler::Page *page = doc->page(i);
        if(!page)
            continue;
        textcontent.append(page->text(qf).replace("\n",""));
        delete page;
        if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
            break;
    }
    delete doc;
    return;
}
void FileUtils::getTxtContent(QString &path, QString &textcontent)
{
QFile file(path);

View File

@ -35,6 +35,7 @@
#include <QMimeDatabase>
#include <QMimeType>
#include <QDir>
#include "libsearch_global.h"
//#define INITIAL_STATE 0
//#define CREATING_INDEX 1
@ -67,6 +68,7 @@ public:
static void getDocxTextContent(QString &path, QString &textcontent);
static void getPptxTextContent(QString &path, QString &textcontent);
static void getXlsxTextContent(QString &path, QString &textcontent);
static void getPdfTextContent(QString &path, QString &textcontent);
static void getTxtContent(QString &path, QString &textcontent);
static size_t _max_index_count;
static size_t _current_index_count; //this one has been Abandoned,do not use it.

View File

@ -55,6 +55,11 @@ void FileReader::getTextContent(QString path, QString &textContent)
searchdata.RunParser(path,textContent);
}
}
else if(name == "application/pdf")
{
if(strsfx.endsWith( "pdf"))
FileUtils::getPdfTextContent(path,textContent);
}
else
{
qWarning()<<"Unsupport format:["<<path<<"]["<<type.name()<<"]";

View File

@ -75,7 +75,8 @@ private:
std::map<QString, bool>::value_type("wps", true),
std::map<QString, bool>::value_type("pps", true),
std::map<QString, bool>::value_type("dps", true),
std::map<QString, bool>::value_type("et", true)
std::map<QString, bool>::value_type("et", true),
std::map<QString, bool>::value_type("pdf", true)
};
//xapian will auto commit per 10,000 changes, do not change it!!!

View File

@ -73,7 +73,8 @@ private:
std::map<QString, bool>::value_type("wps", true),
std::map<QString, bool>::value_type("pps", true),
std::map<QString, bool>::value_type("dps", true),
std::map<QString, bool>::value_type("et", true)
std::map<QString, bool>::value_type("et", true),
std::map<QString, bool>::value_type("pdf", true)
};
};

View File

@ -5,7 +5,7 @@ TARGET = ukui-search
TEMPLATE = lib
DEFINES += LIBSEARCH_LIBRARY
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 gsettings-qt
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 gsettings-qt poppler-qt5
CONFIG += c++11 link_pkgconfig no_keywords lrelease