Merge branch '0330-dev' into 'main'
Add support for 'pptx'. See merge request kylin-desktop/ukui-search!11
This commit is contained in:
commit
c069f96fa9
|
@ -502,6 +502,7 @@ QStringList FileUtils::findMultiToneWords(const QString& hanzi)
|
||||||
*/
|
*/
|
||||||
void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
|
void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
|
||||||
{
|
{
|
||||||
|
//fix me :optimized by xpath??
|
||||||
QFileInfo info = QFileInfo(path);
|
QFileInfo info = QFileInfo(path);
|
||||||
if(!info.exists()||info.isDir())
|
if(!info.exists()||info.isDir())
|
||||||
return;
|
return;
|
||||||
|
@ -517,6 +518,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
|
||||||
|
|
||||||
QDomDocument doc;
|
QDomDocument doc;
|
||||||
doc.setContent(fileR.readAll());
|
doc.setContent(fileR.readAll());
|
||||||
|
fileR.close();
|
||||||
QDomElement first = doc.firstChildElement("w:document");
|
QDomElement first = doc.firstChildElement("w:document");
|
||||||
QDomElement body = first.firstChildElement("w:body");
|
QDomElement body = first.firstChildElement("w:body");
|
||||||
while(!body.isNull())
|
while(!body.isNull())
|
||||||
|
@ -529,7 +531,7 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
|
||||||
{
|
{
|
||||||
QDomElement wt = wr.firstChildElement("w:t");
|
QDomElement wt = wr.firstChildElement("w:t");
|
||||||
textcontent.append(wt.text().replace("\n",""));
|
textcontent.append(wt.text().replace("\n",""));
|
||||||
if(textcontent.length() >= 682666) //20480000/3
|
if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
|
||||||
{
|
{
|
||||||
file.close();
|
file.close();
|
||||||
return;
|
return;
|
||||||
|
@ -544,13 +546,159 @@ void FileUtils::getDocxTextContent(QString &path,QString &textcontent)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Append the visible text of a .pptx presentation to @p textcontent.
 *
 * The archive at @p path is opened with QuaZip and every "ppt/slides/slide*.xml"
 * entry is parsed. For each slide the text of the <a:t> elements found under
 * p:sld / p:cSld / p:spTree / p:sp / p:txBody / a:p / a:r is appended, with
 * "\r" removed and "\t" turned into a single space.
 *
 * Extraction stops as soon as @p textcontent holds MAX_CONTENT_LENGTH/3
 * characters or more. If the path does not exist, is a directory, or cannot
 * be opened as a zip archive, @p textcontent is left untouched.
 *
 * @param path        location of the .pptx file on disk
 * @param textcontent output buffer; extracted text is appended to it
 */
void FileUtils::getPptxTextContent(QString &path, QString &textcontent)
{
    QFileInfo info = QFileInfo(path);
    if(!info.exists()||info.isDir())
        return;
    QuaZip file(path);
    if(!file.open(QuaZip::mdUnzip))
        return;

    const QString prefix("ppt/slides/slide");
    const QString suffix(".xml");

    // Collect every slide part that really exists in the archive.
    QStringList candidates;
    const QStringList entries = file.getFileNameList();
    for(const QString &entry : entries)
    {
        if(entry.startsWith(prefix) && entry.endsWith(suffix))
            candidates << entry;
    }
    if(candidates.isEmpty())
    {
        file.close();
        return;
    }

    // Read slide1.xml, slide2.xml, ... first so the usual contiguous deck is
    // processed in slide order; whatever is left over (numbering with gaps)
    // is read afterwards instead of being silently dropped.
    QStringList slideNames;
    const int candidateCount = candidates.size();
    for(int number = 1; number <= candidateCount; ++number)
    {
        const QString name = prefix + QString::number(number) + suffix;
        if(candidates.removeOne(name))
            slideNames << name;
    }
    slideNames += candidates;

    QDomDocument doc;
    for(int index = 0; index < slideNames.size(); ++index)
    {
        if(!file.setCurrentFile(slideNames.at(index)))
            continue;
        QuaZipFile fileR(&file);
        if(!fileR.open(QIODevice::ReadOnly))
            continue;
        const QByteArray xml = fileR.readAll();
        fileR.close();
        doc.clear();
        if(!doc.setContent(xml))
            continue;

        //fix me :optimized by xpath??
        // doc.elementsByTagName("a:t") would be shorter, but the original
        // author noted it is slower on large slides with many unrelated
        // nodes, so the tree is walked level by level instead.
        for(QDomElement sptree = doc.firstChildElement("p:sld").firstChildElement("p:cSld").firstChildElement("p:spTree");
            !sptree.isNull(); sptree = sptree.nextSiblingElement())
        {
            for(QDomElement sp = sptree.firstChildElement("p:sp"); !sp.isNull(); sp = sp.nextSiblingElement())
            {
                for(QDomElement txbody = sp.firstChildElement("p:txBody"); !txbody.isNull(); txbody = txbody.nextSiblingElement())
                {
                    for(QDomElement ap = txbody.firstChildElement("a:p"); !ap.isNull(); ap = ap.nextSiblingElement())
                    {
                        for(QDomElement ar = ap.firstChildElement("a:r"); !ar.isNull(); ar = ar.nextSiblingElement())
                        {
                            const QDomElement at = ar.firstChildElement("a:t");
                            // Clean only the new run; cleaning the whole
                            // buffer on every append is quadratic and would
                            // also alter text the caller already put there.
                            textcontent.append(at.text().replace("\r","").replace("\t"," "));
                            if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
                            {
                                file.close();
                                return;
                            }
                        }
                    }
                }
            }
        }
    }
    file.close();
    return;
}
|
||||||
|
|
||||||
|
/**
 * Append the shared-string text of an .xlsx workbook to @p textcontent.
 *
 * The archive at @p path is opened with QuaZip and "xl/sharedStrings.xml" is
 * parsed. For every <si> item under <sst> the text is taken either from its
 * direct <t> child (plain string) or from the <t> of each <r> child (rich-text
 * runs). "\r" is removed and "\n" is turned into a space before appending.
 *
 * Extraction stops as soon as @p textcontent holds MAX_CONTENT_LENGTH/3
 * characters or more. If the path does not exist, is a directory, is not a
 * readable zip archive, or has no shared-string part, @p textcontent is left
 * untouched.
 *
 * @param path        location of the .xlsx file on disk
 * @param textcontent output buffer; extracted text is appended to it
 */
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent)
{
    QFileInfo info = QFileInfo(path);
    if(!info.exists()||info.isDir())
        return;
    QuaZip file(path);
    if(!file.open(QuaZip::mdUnzip))
        return;

    if(!file.setCurrentFile("xl/sharedStrings.xml",QuaZip::csSensitive))
    {
        file.close();
        return;
    }
    QuaZipFile fileR(&file);

    // Open the shared-string part read-only.
    if(!fileR.open(QIODevice::ReadOnly))
    {
        file.close();
        return;
    }

    QDomDocument doc;
    doc.setContent(fileR.readAll());
    fileR.close();

    // The for-loops always advance to the next sibling, so an <si> without
    // any <t> can no longer stall the traversal.
    for(QDomElement sst = doc.firstChildElement("sst"); !sst.isNull(); sst = sst.nextSiblingElement())
    {
        for(QDomElement si = sst.firstChildElement("si"); !si.isNull(); si = si.nextSiblingElement())
        {
            QString itemText;
            QDomElement r = si.firstChildElement("r");
            if(r.isNull())
            {
                // Plain string: <si><t>...</t></si>. text() of a null
                // element is empty, so a missing <t> simply adds nothing.
                itemText = si.firstChildElement("t").text();
            }
            else
            {
                // Rich text: concatenate the <t> of every <r> run.
                for(; !r.isNull(); r = r.nextSiblingElement("r"))
                    itemText.append(r.firstChildElement("t").text());
            }
            textcontent.append(itemText.replace("\r","").replace("\n"," "));
            if(textcontent.length() >= MAX_CONTENT_LENGTH/3)
            {
                file.close();
                return;
            }
        }
    }
    file.close();
    return;
}
|
||||||
|
|
||||||
void FileUtils::getTxtContent(QString &path, QString &textcontent)
|
void FileUtils::getTxtContent(QString &path, QString &textcontent)
|
||||||
{
|
{
|
||||||
QFile file(path);
|
QFile file(path);
|
||||||
if(!file.open(QIODevice::ReadOnly|QIODevice::Text))
|
if(!file.open(QIODevice::ReadOnly|QIODevice::Text))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
QByteArray encodedString = file.read(20480000);
|
QByteArray encodedString = file.read(MAX_CONTENT_LENGTH);
|
||||||
|
|
||||||
uchardet_t chardet = uchardet_new();
|
uchardet_t chardet = uchardet_new();
|
||||||
if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0)
|
if(uchardet_handle_data(chardet,encodedString.constData(),encodedString.size()) !=0)
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
//#define INITIAL_STATE 0
|
//#define INITIAL_STATE 0
|
||||||
//#define CREATING_INDEX 1
|
//#define CREATING_INDEX 1
|
||||||
//#define FINISH_CREATING_INDEX 2
|
//#define FINISH_CREATING_INDEX 2
|
||||||
|
#define MAX_CONTENT_LENGTH 20480000
|
||||||
|
|
||||||
#define UKUI_SEARCH_PIPE_PATH (QDir::homePath()+"/.config/org.ukui/ukui-search/ukuisearch").toLocal8Bit().constData()
|
#define UKUI_SEARCH_PIPE_PATH (QDir::homePath()+"/.config/org.ukui/ukui-search/ukuisearch").toLocal8Bit().constData()
|
||||||
|
|
||||||
|
@ -64,6 +65,8 @@ public:
|
||||||
//parse text,docx.....
|
//parse text,docx.....
|
||||||
static QMimeType getMimetype(QString &path);
|
static QMimeType getMimetype(QString &path);
|
||||||
static void getDocxTextContent(QString &path, QString &textcontent);
|
static void getDocxTextContent(QString &path, QString &textcontent);
|
||||||
|
static void getPptxTextContent(QString &path, QString &textcontent);
|
||||||
|
static void getXlsxTextContent(QString &path, QString &textcontent);
|
||||||
static void getTxtContent(QString &path, QString &textcontent);
|
static void getTxtContent(QString &path, QString &textcontent);
|
||||||
static size_t _max_index_count;
|
static size_t _max_index_count;
|
||||||
static size_t _current_index_count; //this one has been Abandoned,do not use it.
|
static size_t _current_index_count; //this one has been Abandoned,do not use it.
|
||||||
|
|
|
@ -36,6 +36,10 @@ void FileReader::getTextContent(QString path, QString &textContent)
|
||||||
{
|
{
|
||||||
if(strsfx.endsWith( "docx"))
|
if(strsfx.endsWith( "docx"))
|
||||||
FileUtils::getDocxTextContent(path,textContent);
|
FileUtils::getDocxTextContent(path,textContent);
|
||||||
|
if(strsfx.endsWith( "pptx"))
|
||||||
|
FileUtils::getPptxTextContent(path,textContent);
|
||||||
|
if(strsfx.endsWith( "xlsx"))
|
||||||
|
FileUtils::getXlsxTextContent(path,textContent);
|
||||||
}
|
}
|
||||||
else if(name == "text/plain")
|
else if(name == "text/plain")
|
||||||
{
|
{
|
||||||
|
|
|
@ -67,9 +67,9 @@ private:
|
||||||
std::map<QString, bool>::value_type("doc", true),
|
std::map<QString, bool>::value_type("doc", true),
|
||||||
std::map<QString, bool>::value_type("docx", true),
|
std::map<QString, bool>::value_type("docx", true),
|
||||||
std::map<QString, bool>::value_type("ppt", true),
|
std::map<QString, bool>::value_type("ppt", true),
|
||||||
// std::map<QString, bool>::value_type("pptx", true),
|
std::map<QString, bool>::value_type("pptx", true),
|
||||||
std::map<QString, bool>::value_type("xls", true),
|
std::map<QString, bool>::value_type("xls", true),
|
||||||
// std::map<QString, bool>::value_type("xlsx", true),
|
std::map<QString, bool>::value_type("xlsx", true),
|
||||||
std::map<QString, bool>::value_type("txt", true),
|
std::map<QString, bool>::value_type("txt", true),
|
||||||
std::map<QString, bool>::value_type("dot", true),
|
std::map<QString, bool>::value_type("dot", true),
|
||||||
std::map<QString, bool>::value_type("wps", true),
|
std::map<QString, bool>::value_type("wps", true),
|
||||||
|
|
|
@ -65,9 +65,9 @@ private:
|
||||||
std::map<QString, bool>::value_type("doc", true),
|
std::map<QString, bool>::value_type("doc", true),
|
||||||
std::map<QString, bool>::value_type("docx", true),
|
std::map<QString, bool>::value_type("docx", true),
|
||||||
std::map<QString, bool>::value_type("ppt", true),
|
std::map<QString, bool>::value_type("ppt", true),
|
||||||
// std::map<QString, bool>::value_type("pptx", true),
|
std::map<QString, bool>::value_type("pptx", true),
|
||||||
std::map<QString, bool>::value_type("xls", true),
|
std::map<QString, bool>::value_type("xls", true),
|
||||||
// std::map<QString, bool>::value_type("xlsx", true),
|
std::map<QString, bool>::value_type("xlsx", true),
|
||||||
std::map<QString, bool>::value_type("txt", true),
|
std::map<QString, bool>::value_type("txt", true),
|
||||||
std::map<QString, bool>::value_type("dot", true),
|
std::map<QString, bool>::value_type("dot", true),
|
||||||
std::map<QString, bool>::value_type("wps", true),
|
std::map<QString, bool>::value_type("wps", true),
|
||||||
|
|
Loading…
Reference in New Issue