ukui-search/libsearch/index/batch-indexer.cpp

320 lines
10 KiB
C++
Raw Permalink Normal View History

2022-10-26 18:01:40 +08:00
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: iaom <zhangpengfei@kylinos.cn>
*
*/
#include "batch-indexer.h"
2022-10-26 18:01:40 +08:00
#include <QFileInfo>
#include <QElapsedTimer>
#include <QDebug>
2022-10-26 18:01:40 +08:00
#include <malloc.h>
#include <QQueue>
#include <QDateTime>
2022-10-26 18:01:40 +08:00
#include "file-utils.h"
#include "basic-indexer.h"
#include "file-indexer-config.h"
#include "file-content-indexer.h"
#include "writable-database.h"
#include "compatible-define.h"
2022-10-26 18:01:40 +08:00
using namespace UkuiSearch;
/**
 * @brief Create a batch indexing job.
 *
 * @param folders   Root folders to scan; copied into m_folders.
 * @param blackList Paths to exclude from the scan; copied into m_blackList.
 * @param stop      Stop flag polled while indexing. Only its address is kept,
 *                  so the referenced object must outlive this indexer.
 * @param mode      Work mode; run() reports it back through done(), and the
 *                  index routines branch on Rebuild / Add / Update.
 * @param target    Which indexes to build; run() tests it against
 *                  Target::Basic and Target::Content.
 */
BatchIndexer::BatchIndexer(const QStringList &folders,
                           const QStringList &blackList,
                           QAtomicInt &stop,
                           WorkMode mode,
                           Targets target)
    : m_folders(folders)
    , m_blackList(blackList)
    , m_stop(&stop)
    , m_mode(mode)
    , m_target(target)
{
}
void BatchIndexer::run()
2022-10-26 18:01:40 +08:00
{
QElapsedTimer timer;
timer.start();
if(m_target == Target::None || m_stop->LOAD) {
Q_EMIT done(m_mode);
2022-10-26 18:01:40 +08:00
return;
}
fetch();
if(m_target & Target::Basic) {
basicIndex();
}
if(m_target & Target::Content) {
contentIndex();
}
m_cache.clear();
malloc_trim(0);
qDebug() << "FirstRunIndexer: time :" << timer.elapsed() << "milliseconds";
Q_EMIT done(m_mode);
2022-10-26 18:01:40 +08:00
}
void BatchIndexer::fetch()
2022-10-26 18:01:40 +08:00
{
qDebug() << "Now begin fetching files to be indexed...";
qDebug() << "Index folders:" << m_folders << "blacklist :" << m_blackList;
QQueue<QString> bfs;
for(QString blockPath : m_blackList) {
for(QString path : m_folders) {
if(FileUtils::isOrUnder(path, blockPath)) {
m_folders.removeOne(path);
}
}
}
m_cache.append(m_folders);
for(QString path : m_folders) {
bfs.enqueue(path);
}
QFileInfoList list;
QDir dir;
QStringList tmpList = m_blackList;
// QDir::Hidden
dir.setFilter(QDir::Dirs | QDir::Files | QDir::NoDotAndDotDot);
dir.setSorting(QDir::DirsFirst);
while(!bfs.empty()) {
dir.setPath(bfs.dequeue());
list = dir.entryInfoList();
for(auto i : list) {
bool isBlocked = false;
for(QString path : tmpList) {
if(i.absoluteFilePath() == path) {
isBlocked = true;
tmpList.removeOne(path);
break;
}
}
if(isBlocked)
continue;
if(i.isDir() && (!(i.isSymLink()))) {
bfs.enqueue(i.absoluteFilePath());
}
m_cache.append(i.absoluteFilePath());
}
}
qDebug() << m_cache.size() << "files founded, start index...";
}
void BatchIndexer::basicIndex()
2022-10-26 18:01:40 +08:00
{
qDebug() << "Begin basic index";
WritableDatabase basicDb(DataBaseType::Basic);
if(!basicDb.open()) {
qWarning() << "Basic db open failed, fail to run basic index!";
return;
}
QStringList filesNeedIndex;
if(m_mode == WorkMode::Rebuild) {
basicDb.rebuild();
if(!basicDb.open()) {
qWarning() << "basicDb db open failed, fail to run basic index!";
return;
}
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if(m_mode == WorkMode::Add) {
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if (m_mode == WorkMode::Update) {
2022-10-26 18:01:40 +08:00
QFileInfo info;
QMap<std::string, std::string> indexTimes = basicDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(const QString& path : m_cache) {
info.setFile(path);
if(indexTimes.take(FileUtils::makeDocUterm(path)) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
filesNeedIndex.append(path);
}
}
if(!indexTimes.isEmpty()) {
2022-10-26 18:01:40 +08:00
qDebug() << indexTimes.size() << "documents need remove.";
for(std::string uniqueTerm : indexTimes.keys()) {
basicDb.removeDocument(uniqueTerm);
}
basicDb.commit();
}
qDebug() << filesNeedIndex.size() << "files need update.";
2022-10-26 18:01:40 +08:00
}
uint allSize = filesNeedIndex.size();
Q_EMIT progress(IndexType::Basic, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (const QString& path: filesNeedIndex) {
BasicIndexer indexer(path);
if(indexer.index()) {
basicDb.addDocument(indexer.document());
++batchSize;
++finishNum;
}
if(batchSize >= 8192) {
qDebug() << "8192 finished.";
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
//文件名索引很快
if(m_stop->LOAD) {
qDebug() << "Index stopped, abort basic index.";
filesNeedIndex.clear();
return;
}
2022-10-26 18:01:40 +08:00
batchSize = 0;
}
}
//TODO:xapian默认10000条自动commit一次需要根据内存占用情况调整。
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
Q_EMIT basicIndexDone(finishNum);
filesNeedIndex.clear();
qDebug() << "Finish basic index";
}
void BatchIndexer::contentIndex()
2022-10-26 18:01:40 +08:00
{
qDebug() << "Begin content index";
if(m_stop->LOAD) {
2022-10-26 18:01:40 +08:00
qDebug() << "Index stopped, abort content index.";
return;
}
WritableDatabase contentDb(DataBaseType::Content);
if(!contentDb.open()) {
qWarning() << "Content db open failed, fail to run content index!";
return;
}
QStringList filesNeedIndex;
QStringList filesNeedOCRIndex;
QMap<QString, bool> suffixMap = targetFileTypeMap;
QFileInfo info;
// ocr
// bool ocrEnable = FileIndexerConfig::getInstance()->isOCREnable();
if(FileIndexerConfig::getInstance()->isOCREnable()) {
qDebug() << "OCR enabled.";
2023-04-24 18:40:47 +08:00
suffixMap.INSERT(targetPhotographTypeMap);
2022-10-26 18:01:40 +08:00
}
if(m_mode == WorkMode::Rebuild) {
contentDb.rebuild();
if(!contentDb.open()) {
return;
}
}
if(m_mode == WorkMode::Rebuild || m_mode == WorkMode::Add) {
2022-10-26 18:01:40 +08:00
for(QString path : m_cache) {
info.setFile(path);
if(true == suffixMap[info.suffix()] && info.isFile()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
}
}
}
} else if(m_mode == WorkMode::Update) {
2022-10-26 18:01:40 +08:00
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(QString path : m_cache) {
info.setFile(path);
if(true == suffixMap[info.suffix()] && info.isFile()) {
std::string uterm = FileUtils::makeDocUterm(path);
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
indexTimes.remove(uterm);
}
} else {
indexTimes.remove(uterm);
}
}
}
if(!indexTimes.isEmpty()) {
2022-10-26 18:01:40 +08:00
qDebug() << indexTimes.size() << "documents need remove";
for(std::string uniqueTerm : indexTimes.keys()) {
contentDb.removeDocument(uniqueTerm);
}
contentDb.commit();
}
}
uint allSize = filesNeedIndex.size();
qDebug() << allSize << "files need content index.";
Q_EMIT progress(IndexType::Contents, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (QString path : filesNeedIndex) {
if(m_stop->LOAD) {
qDebug() << "Index stopped, interrupt content index.";
filesNeedIndex.clear();
filesNeedOCRIndex.clear();
return;
}
2022-10-26 18:01:40 +08:00
info.setFile(path);
if(true == targetPhotographTypeMap[info.suffix()]) {
filesNeedOCRIndex.append(path);
filesNeedIndex.removeOne(path);
continue;
}
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++finishNum;
} else {
2022-12-23 11:11:06 +08:00
// qDebug() << "Extract fail===" << path;
2022-10-26 18:01:40 +08:00
}
if(batchSize >= 30) {
contentDb.commit();
qDebug() << "30 finished.";
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
filesNeedIndex.clear();
qDebug() << "Content index for normal files finished, now begin OCR index";
int ocrSize = filesNeedOCRIndex.size();
qDebug() << ocrSize << "pictures need OCR index.";
batchSize = 0;
int ocrFinishNum = 0;
for(QString path : filesNeedOCRIndex) {
if(m_stop->LOAD) {
qDebug() << "Index stopped, interrupt content index.";
filesNeedOCRIndex.clear();
return;
}
2022-10-26 18:01:40 +08:00
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++ocrFinishNum;
} else {
2022-12-23 11:11:06 +08:00
// qDebug() << "Extract fail===" << path;
2022-10-26 18:01:40 +08:00
}
if(batchSize >= 30) {
contentDb.commit();
qDebug() << "30 finished.";
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
filesNeedOCRIndex.clear();
qDebug() << "Finish OCR index.";
Q_EMIT contentIndexDone(finishNum + ocrFinishNum);
qDebug() << "Finish content index";
}