Store the content text for search and generate snippets when performing text content search.

This commit is contained in:
zhangpengfei 2021-01-02 17:21:38 +08:00
parent 5c4a9ba230
commit 9ee1b37693
8 changed files with 77 additions and 25 deletions

View File

@ -23,11 +23,14 @@ void Document::setData(QString data)
m_document->set_data(data.toStdString());
}
void Document::addterm(std::string term, int weight)
void Document::addterm(std::string term,QVector<size_t> offset, int weight)
{
if(term == "")
if(term == ""||term.length() > 240)
return;
m_document->add_term(term,weight);
for(size_t i : offset)
{
m_document->add_posting(term,i,weight);
}
}
void Document::addValue(QString value)

View File

@ -4,6 +4,7 @@
#include <xapian.h>
#include <QString>
#include <QStringList>
#include <QVector>
class Document
{
@ -11,7 +12,7 @@ public:
Document();
~Document();
void setData(QString data);
void addterm(std::string term,int weight =1);
void addterm(std::string term, QVector<size_t> offset, int weight =1);
void addValue(QString value);
void setUniqueTerm(QString term);
std::string getUniqueTerm();

View File

@ -75,7 +75,7 @@ void FileSearcher::onKeywordSearch(QString keyword, int begin, int num)
void FileSearcher::onKeywordSearchContent(QString keyword, int begin, int num)
{
QStringList searchResult;
QMap<QString,QStringList> searchResult = QMap<QString,QStringList>();
try
{
qDebug()<<"--content search start--";
@ -83,11 +83,11 @@ void FileSearcher::onKeywordSearchContent(QString keyword, int begin, int num)
Xapian::Database db(CONTENT_INDEX_PATH);
Xapian::Enquire enquire(db);
Xapian::QueryParser qp;
qp.set_default_op(Xapian::Query::OP_PHRASE);
// qp.set_default_op(Xapian::Query::OP_PHRASE);
qp.set_database(db);
//Create a query
Xapian::Query queryPhrase = qp.parse_query(keyword.toStdString(),Xapian::QueryParser::FLAG_PHRASE);
Xapian::Query queryPhrase = qp.parse_query(keyword.toStdString());
qDebug()<<QString::fromStdString(queryPhrase.get_description());
@ -95,7 +95,7 @@ void FileSearcher::onKeywordSearchContent(QString keyword, int begin, int num)
//dir result
Xapian::MSet result = enquire.get_mset(begin, begin+num);
qDebug()<< "find results count=" <<static_cast<int>(result.get_matches_estimated());
searchResult = getResult(result);
searchResult = getContentResult(result,keyword);
qDebug()<< "--content search finish--";
}
@ -118,6 +118,7 @@ QStringList FileSearcher::getResult(Xapian::MSet &result)
QStringList searchResult = QStringList();
if(result.size() == 0)
return searchResult;
for (auto it = result.begin(); it != result.end(); ++it)
{
Xapian::Document doc = it.get_document();
@ -135,10 +136,53 @@ QStringList FileSearcher::getResult(Xapian::MSet &result)
{
searchResult.append(QString::fromStdString(data));
}
qDebug()<< "doc="<< QString::fromStdString(data) << ",weight=" <<docScoreWeight << ",percent=" << docScorePercent;
}
// if(!pathTobeDelete->isEmpty())
// deleteAllIndex(pathTobeDelete)
return searchResult;
}
/**
 * @brief Build per-file keyword snippets from a content-index match set.
 *
 * For every matched document whose stored path (value slot 1) still exists on
 * disk, the keyword is looked up in the document's term list and one snippet
 * is cut out of the stored document data for each recorded position.
 *
 * @param result  Match set produced by the content-index query.
 * @param keyWord Keyword whose indexed positions anchor the snippets.
 * @return Map from file path to the snippets found in that file. A matched
 *         file in which the keyword is not itself an indexed term is still
 *         reported, with an empty snippet list.
 */
QMap<QString,QStringList> FileSearcher::getContentResult(Xapian::MSet &result, QString &keyWord)
{
    QMap<QString,QStringList> searchResult;
    if(result.size() == 0)
        return searchResult;

    // Loop-invariant values: context width around each hit and the term to look up.
    const int contextLen = keyWord.size() + 5;
    const std::string keyTerm = keyWord.toStdString();

    for (auto it = result.begin(); it != result.end(); ++it)
    {
        Xapian::Document doc = it.get_document();
        const double docScoreWeight = it.get_weight();
        const Xapian::percent docScorePercent = it.get_percent();
        const QString path = QString::fromStdString(doc.get_value(1));

        // Stack-allocated so nothing is leaked per result.
        // TODO: collect paths that no longer exist and remove them from the index.
        const QFileInfo info(path);
        if(!info.exists())
        {
            qDebug()<<path<<"is not exist!!";
            continue;
        }

        // Construct snippets containing keyword.
        QStringList snippets;
        auto term = doc.termlist_begin();
        term.skip_to(keyTerm);
        // skip_to() stops at the first term >= keyTerm, which may be the end of
        // the list or an unrelated term; only read positions on an exact match.
        if(term != doc.termlist_end() && *term == keyTerm)
        {
            // Convert the stored text once per document rather than once per hit.
            const QByteArray contentBytes = QByteArray::fromStdString(doc.get_data());
            for(auto pos = term.positionlist_begin(); pos != term.positionlist_end(); ++pos)
            {
                // NOTE(review): positions are used as byte offsets into the stored
                // data — confirm they always fall on UTF-8 character boundaries.
                const int offset = static_cast<int>(*pos);
                const QString before = QString(contentBytes.left(offset)).right(contextLen);
                const QString after = QString(contentBytes.mid(offset,-1)).left(contextLen);
                snippets.append("..." + before + after + "...");
            }
        }

        searchResult.insert(path,snippets);
        qDebug()<< "path="<< path << ",weight=" <<docScoreWeight << ",percent=" << docScorePercent;
    }
    return searchResult;
}

View File

@ -5,6 +5,7 @@
#include <xapian.h>
#include <QStandardPaths>
#include <QVector>
#include <QMap>
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/index_data").toStdString()
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/content_index_data").toStdString()
@ -20,10 +21,11 @@ public Q_SLOTS:
void onKeywordSearchContent(QString keyword, int begin = 0, int num = 20);
Q_SIGNALS:
void result(QVector<QStringList> resultV);
void contentResult(QStringList resultL);
void result(QVector<QStringList> resultP);
void contentResult(QMap<QString,QStringList> resultC);
private:
QStringList getResult(Xapian::MSet &result);
QMap<QString,QStringList> getContentResult(Xapian::MSet &result,QString &keyWord);
};
#endif // FILESEARCHER_H

View File

@ -21,7 +21,7 @@ void FileTypeFilter::DoSomething(const QFileInfo& fileInfo){
// qDebug() << qmt.preferredSuffix();
for (auto i : this->targetFileTypeVec){
if (fileInfo.fileName().endsWith(i)){
// qDebug() << fileInfo.fileName();
qDebug() << fileInfo.fileName();
this->result->append(fileInfo.absoluteFilePath());
}
}
@ -36,7 +36,7 @@ QList<QString>* FileTypeFilter::getTargetFileAbsolutePath(){
void FileTypeFilter::Test(){
IndexGenerator* ig = IndexGenerator::getInstance();
// this->result = new QList<QString>();
// this->result->append(QString("/home/zpf/桌面/DOCX 文档.docx"));
// this->result->append(QString("/home/zpf/桌面/DOCX 文档(1).docx"));
ig->creatAllIndex(this->result);
}

View File

@ -18,9 +18,10 @@ public:
void Test();
Q_SIGNALS:
private:
const QVector<QString> targetFileTypeVec ={ /*QString(".doc"),*/
QString(".docx")/*,*/
/*QString(".ppt"),
const QVector<QString> targetFileTypeVec ={
// QString(".doc"),
QString(".docx"),
/* QString(".ppt"),
QString(".pptx"),
QString(".xls"),
QString(".xlsx"),

View File

@ -201,7 +201,6 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
doc.setData(sourcePath);
doc.setUniqueTerm(uniqueterm);
doc.addValue(list.at(2));
if(list.at(2) == QString("1"))
QStringList temp;
temp.append(index_text);
// temp.append(pinyin_text_list);
@ -219,11 +218,12 @@ Document IndexGenerator::GenerateContentDocument(const QString &path)
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(path));
QVector<SKeyWord> term = ChineseSegmentation::callSegement(content);
Document doc;
doc.setData(path);
doc.setData(*content);
doc.setUniqueTerm(uniqueterm);
doc.addValue(path);
for(int i = 0;i<term.size();++i)
{
doc.addterm(term.at(i).word,static_cast<int>(term.at(i).weight));
doc.addterm(term.at(i).word,term.at(i).offsets,static_cast<int>(term.at(i).weight));
}
return doc;
@ -322,6 +322,7 @@ bool IndexGenerator::deleteAllIndex(QStringList *pathlist)
{
qDebug()<<"--delete start--";
m_datebase_path->delete_document(uniqueterm);
m_database_content->delete_document(uniqueterm);
qDebug()<<"delete md5"<<QString::fromStdString(uniqueterm);
m_datebase_path->commit();
qDebug()<< "--delete finish--";

View File

@ -69,8 +69,8 @@ int main(int argc, char *argv[])
/*-------------InotyifyRefact Test End-----------------*/
/*-------------文本搜索 Test start-----------------*/
// FileSearcher *search = new FileSearcher();
// search->onKeywordSearchContent("麒麟");
FileSearcher *search = new FileSearcher();
search->onKeywordSearchContent("测试");
/*-------------文本搜索 Test End-----------------*/
qRegisterMetaType<QVector<QStringList>>("QVector<QStringList>");