Merge branch '0330-dev' into 'main'

Add support for 'pptx'.

See merge request kylin-desktop/ukui-search!11
This commit is contained in:
Zihao Zhang 2021-04-13 07:04:22 +00:00
commit c069f96fa9
5 changed files with 161 additions and 6 deletions

View File

@ -502,6 +502,7 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi)
*/ */
void FileUtils::getDocxTextContent(QString &path,QString &textcontent) void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
{ {
//fix me :optimized by xpath??
QFileInfo info = QFileInfo(path); QFileInfo info = QFileInfo(path);
if(!info.exists()||info.isDir()) if(!info.exists()||info.isDir())
return; return;
@ -517,6 +518,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
QDomDocument doc; QDomDocument doc;
doc.setContent(fileR.readAll()); doc.setContent(fileR.readAll());
fileR.close();
QDomElement first = doc.firstChildElement("w:document"); QDomElement first = doc.firstChildElement("w:document");
QDomElement body = first.firstChildElement("w:body"); QDomElement body = first.firstChildElement("w:body");
while(!body.isNull()) while(!body.isNull())
@ -529,7 +531,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
{ {
QDomElement wt = wr.firstChildElement("w:t"); QDomElement wt = wr.firstChildElement("w:t");
textcontent.append(wt.text().replace("\n","")); textcontent.append(wt.text().replace("\n",""));
if(textcontent.length() >= 682666) //20480000/3 if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
{ {
file.close(); file.close();
return; return;
@ -544,13 +546,159 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
return; return;
} }
/**
 * Extracts the text of a .pptx presentation and appends it to textcontent.
 *
 * The archive at path is opened with QuaZip. Every entry named
 * "ppt/slides/slide*.xml" is parsed as XML, in the order the archive lists it,
 * and the tree p:sld > p:cSld > p:spTree is walked: starting at the first
 * p:sp / p:txBody / a:p / a:r child of each level and continuing through the
 * following sibling elements, the text of each run's a:t child is appended
 * with "\r" removed and "\t" replaced by a space.
 *
 * Nothing is appended when path does not exist, is a directory, cannot be
 * opened as a zip archive, or contains no slide entry. Extraction stops as soon
 * as textcontent holds at least MAX_CONTENT_LENGTH/3 characters.
 *
 * @param path        path of the .pptx file
 * @param textcontent output buffer; extracted text is appended to it
 */
void FileUtils::getPptxTextContent(QString &path, QString &textcontent)
{
    QFileInfo info = QFileInfo(path);
    if(!info.exists()||info.isDir())
        return;
    QuaZip file(path);
    if(!file.open(QuaZip::mdUnzip))
        return;

    // Collect the slide entries that really exist in the archive. Slide numbers
    // are not guaranteed to be consecutive, so the names must not be
    // reconstructed from the entry count.
    const QString prefix("ppt/slides/slide");
    const QString suffix(".xml");
    const QStringList entries = file.getFileNameList();
    QStringList fileList;
    for(const QString &entry : entries)
    {
        if(entry.startsWith(prefix) && entry.endsWith(suffix))
            fileList<<entry;
    }
    if(fileList.isEmpty())
    {
        // Close explicitly, like every other exit path taken after a successful open.
        file.close();
        return;
    }

    QDomDocument doc;
    for(const QString &name : fileList)
    {
        if(!file.setCurrentFile(name))
            continue;
        QuaZipFile fileR(&file);
        if(!fileR.open(QIODevice::ReadOnly))
            continue;
        doc.clear();
        doc.setContent(fileR.readAll());
        fileR.close();

        //fix me :optimized by xpath??
        // A flat doc.elementsByTagName("a:t") scan would be shorter, but the
        // original author noted it is slower when the slide XML is large and
        // full of unrelated nodes, hence the explicit descent below.
        for(QDomElement sptree = doc.firstChildElement("p:sld").firstChildElement("p:cSld").firstChildElement("p:spTree");
            !sptree.isNull(); sptree = sptree.nextSiblingElement())
        {
            for(QDomElement sp = sptree.firstChildElement("p:sp"); !sp.isNull(); sp = sp.nextSiblingElement())
            {
                for(QDomElement txbody = sp.firstChildElement("p:txBody"); !txbody.isNull(); txbody = txbody.nextSiblingElement())
                {
                    for(QDomElement ap = txbody.firstChildElement("a:p"); !ap.isNull(); ap = ap.nextSiblingElement())
                    {
                        for(QDomElement ar = ap.firstChildElement("a:r"); !ar.isNull(); ar = ar.nextSiblingElement())
                        {
                            // Clean only the new fragment; running replace() on the
                            // whole accumulated buffer for every run is quadratic.
                            QString text = ar.firstChildElement("a:t").text();
                            text.replace("\r","");
                            text.replace("\t"," ");
                            textcontent.append(text);
                            if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
                            {
                                file.close();
                                return;
                            }
                        }
                    }
                }
            }
        }
    }
    file.close();
    return;
}
/**
 * Extracts the shared strings of a .xlsx workbook and appends them to textcontent.
 *
 * The archive at path is opened with QuaZip and "xl/sharedStrings.xml" is
 * parsed as XML. For every si element under the sst root, the text is taken
 * from its direct t child, or, when the item is made of r runs, from the t
 * child of each run. "\r" is removed and "\n" is replaced by a space before
 * appending.
 *
 * Nothing is appended when path does not exist, is a directory, cannot be
 * opened as a zip archive, or has no readable shared-strings part. Extraction
 * stops as soon as textcontent holds at least MAX_CONTENT_LENGTH/3 characters.
 *
 * @param path        path of the .xlsx file
 * @param textcontent output buffer; extracted text is appended to it
 */
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
{
    QFileInfo info = QFileInfo(path);
    if(!info.exists()||info.isDir())
        return;
    QuaZip file(path);
    if(!file.open(QuaZip::mdUnzip))
        return;
    if(!file.setCurrentFile("xl/sharedStrings.xml",QuaZip::csSensitive))
    {
        file.close();
        return;
    }
    QuaZipFile fileR(&file);
    if(!fileR.open(QIODevice::ReadOnly)) // open for reading
    {
        file.close();
        return;
    }
    QDomDocument doc;
    doc.setContent(fileR.readAll());
    fileR.close();

    // Appends the cleaned text of one t element; returns true once the size limit is reached.
    auto appendText = [&textcontent](const QDomElement &t) -> bool
    {
        if(t.isNull())
            return false;
        textcontent.append(t.text().replace("\r","").replace("\n"," "));
        return textcontent.length() >= MAX_CONTENT_LENGTH/3;
    };

    for(QDomElement sst = doc.firstChildElement("sst"); !sst.isNull(); sst = sst.nextSiblingElement())
    {
        // The loop header advances si on every iteration, so an item without
        // any text can no longer stall the loop.
        for(QDomElement si = sst.firstChildElement("si"); !si.isNull(); si = si.nextSiblingElement())
        {
            QDomElement r = si.firstChildElement("r");
            if(r.isNull())
            {
                // Plain string item: the text sits directly in si/t.
                if(appendText(si.firstChildElement("t")))
                {
                    file.close();
                    return;
                }
                continue;
            }
            // Rich-text item: collect every run, not only the first one.
            for(; !r.isNull(); r = r.nextSiblingElement("r"))
            {
                if(appendText(r.firstChildElement("t")))
                {
                    file.close();
                    return;
                }
            }
        }
    }
    file.close();
    return;
}
void FileUtils::getTxtContent(QString &path, QString &textcontent) void FileUtils::getTxtContent(QString &path, QString &textcontent)
{ {
QFile file(path); QFile file(path);
if(!file.open(QIODevice::ReadOnly|QIODevice::Text)) if(!file.open(QIODevice::ReadOnly|QIODevice::Text))
return; return;
QByteArray encodedString = file.read(20480000); QByteArray encodedString = file.read(MAX_CONTENT_LENGTH);
uchardet_t chardet = uchardet_new(); uchardet_t chardet = uchardet_new();
if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0) if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0)

View File

@ -39,6 +39,7 @@
//#define INITIAL_STATE 0 //#define INITIAL_STATE 0
//#define CREATING_INDEX 1 //#define CREATING_INDEX 1
//#define FINISH_CREATING_INDEX 2 //#define FINISH_CREATING_INDEX 2
// Size limit shared by the content extractors in FileUtils: getTxtContent reads at most
// this many bytes from a file, and the docx/pptx/xlsx extractors stop once the collected
// text holds at least MAX_CONTENT_LENGTH/3 characters.
#define MAX_CONTENT_LENGTH 20480000
#define UKUI_SEARCH_PIPE_PATH (QDir::homePath()+"/.config/org.ukui/ukui-search/ukuisearch").toLocal8Bit().constData() #define UKUI_SEARCH_PIPE_PATH (QDir::homePath()+"/.config/org.ukui/ukui-search/ukuisearch").toLocal8Bit().constData()
@ -64,6 +65,8 @@ public:
//parse text,docx..... //parse text,docx.....
static QMimeType getMimetype(QString &path); static QMimeType getMimetype(QString &path);
static void getDocxTextContent(QString &path, QString &textcontent); static void getDocxTextContent(QString &path, QString &textcontent);
// Appends the text found in the slides of the .pptx file at path to textcontent.
static void getPptxTextContent(QString &path, QString &textcontent);
// Appends the shared strings of the .xlsx file at path to textcontent.
static void getXlsxTextContent(QString &path, QString &textcontent);
static void getTxtContent(QString &path, QString &textcontent); static void getTxtContent(QString &path, QString &textcontent);
static size_t _max_index_count; static size_t _max_index_count;
static size_t _current_index_count; //this one has been Abandoned,do not use it. static size_t _current_index_count; //this one has been Abandoned,do not use it.

View File

@ -36,6 +36,10 @@ void FileReader::getTextContent(QString path, QString &textContent)
{ {
if(strsfx.endsWith( "docx")) if(strsfx.endsWith( "docx"))
FileUtils::getDocxTextContent(path,textContent); FileUtils::getDocxTextContent(path,textContent);
if(strsfx.endsWith( "pptx"))
FileUtils::getPptxTextContent(path,textContent);
if(strsfx.endsWith( "xlsx"))
FileUtils::getXlsxTextContent(path,textContent);
} }
else if(name == "text/plain") else if(name == "text/plain")
{ {

View File

@ -67,9 +67,9 @@ private:
std::map<QString, bool>::value_type("doc", true), std::map<QString, bool>::value_type("doc", true),
std::map<QString, bool>::value_type("docx", true), std::map<QString, bool>::value_type("docx", true),
std::map<QString, bool>::value_type("ppt", true), std::map<QString, bool>::value_type("ppt", true),
// std::map<QString, bool>::value_type("pptx", true), std::map<QString, bool>::value_type("pptx", true),
std::map<QString, bool>::value_type("xls", true), std::map<QString, bool>::value_type("xls", true),
// std::map<QString, bool>::value_type("xlsx", true), std::map<QString, bool>::value_type("xlsx", true),
std::map<QString, bool>::value_type("txt", true), std::map<QString, bool>::value_type("txt", true),
std::map<QString, bool>::value_type("dot", true), std::map<QString, bool>::value_type("dot", true),
std::map<QString, bool>::value_type("wps", true), std::map<QString, bool>::value_type("wps", true),

View File

@ -65,9 +65,9 @@ private:
std::map<QString, bool>::value_type("doc", true), std::map<QString, bool>::value_type("doc", true),
std::map<QString, bool>::value_type("docx", true), std::map<QString, bool>::value_type("docx", true),
std::map<QString, bool>::value_type("ppt", true), std::map<QString, bool>::value_type("ppt", true),
// std::map<QString, bool>::value_type("pptx", true), std::map<QString, bool>::value_type("pptx", true),
std::map<QString, bool>::value_type("xls", true), std::map<QString, bool>::value_type("xls", true),
// std::map<QString, bool>::value_type("xlsx", true), std::map<QString, bool>::value_type("xlsx", true),
std::map<QString, bool>::value_type("txt", true), std::map<QString, bool>::value_type("txt", true),
std::map<QString, bool>::value_type("dot", true), std::map<QString, bool>::value_type("dot", true),
std::map<QString, bool>::value_type("wps", true), std::map<QString, bool>::value_type("wps", true),