ukui-search/libsearch/index/batch-indexer.cpp

320 lines
10 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: iaom <zhangpengfei@kylinos.cn>
*
*/
#include "batch-indexer.h"
#include <QFileInfo>
#include <QElapsedTimer>
#include <QDebug>
#include <malloc.h>
#include <QQueue>
#include <QDateTime>
#include "file-utils.h"
#include "basic-indexer.h"
#include "file-indexer-config.h"
#include "file-content-indexer.h"
#include "writable-database.h"
#include "compatible-define.h"
using namespace UkuiSearch;
/**
 * @brief Create a batch indexing job.
 *
 * @param folders   Root folders to scan; used to initialise m_folders.
 * @param blackList Paths excluded from scanning; used to initialise m_blackList.
 * @param stop      Stop flag. Only its address is stored (m_stop), and the
 *                  pointer is dereferenced later by run(), basicIndex() and
 *                  contentIndex(), so the flag has to outlive this object.
 * @param mode      Work mode (Rebuild / Add / Update are the values tested in
 *                  this file).
 * @param target    Which indexes to build (Basic and/or Content).
 */
BatchIndexer::BatchIndexer(const QStringList &folders,
                           const QStringList &blackList,
                           QAtomicInt &stop,
                           WorkMode mode,
                           Targets target)
    : m_folders(folders)
    , m_blackList(blackList)
    , m_stop(&stop)
    , m_mode(mode)
    , m_target(target)
{
}
void BatchIndexer::run()
{
QElapsedTimer timer;
timer.start();
if(m_target == Target::None || m_stop->LOAD) {
Q_EMIT done(m_mode);
return;
}
fetch();
if(m_target & Target::Basic) {
basicIndex();
}
if(m_target & Target::Content) {
contentIndex();
}
m_cache.clear();
malloc_trim(0);
qDebug() << "FirstRunIndexer: time :" << timer.elapsed() << "milliseconds";
Q_EMIT done(m_mode);
}
void BatchIndexer::fetch()
{
qDebug() << "Now begin fetching files to be indexed...";
qDebug() << "Index folders:" << m_folders << "blacklist :" << m_blackList;
QQueue<QString> bfs;
for(QString blockPath : m_blackList) {
for(QString path : m_folders) {
if(FileUtils::isOrUnder(path, blockPath)) {
m_folders.removeOne(path);
}
}
}
m_cache.append(m_folders);
for(QString path : m_folders) {
bfs.enqueue(path);
}
QFileInfoList list;
QDir dir;
QStringList tmpList = m_blackList;
// QDir::Hidden
dir.setFilter(QDir::Dirs | QDir::Files | QDir::NoDotAndDotDot);
dir.setSorting(QDir::DirsFirst);
while(!bfs.empty()) {
dir.setPath(bfs.dequeue());
list = dir.entryInfoList();
for(auto i : list) {
bool isBlocked = false;
for(QString path : tmpList) {
if(i.absoluteFilePath() == path) {
isBlocked = true;
tmpList.removeOne(path);
break;
}
}
if(isBlocked)
continue;
if(i.isDir() && (!(i.isSymLink()))) {
bfs.enqueue(i.absoluteFilePath());
}
m_cache.append(i.absoluteFilePath());
}
}
qDebug() << m_cache.size() << "files founded, start index...";
}
void BatchIndexer::basicIndex()
{
qDebug() << "Begin basic index";
WritableDatabase basicDb(DataBaseType::Basic);
if(!basicDb.open()) {
qWarning() << "Basic db open failed, fail to run basic index!";
return;
}
QStringList filesNeedIndex;
if(m_mode == WorkMode::Rebuild) {
basicDb.rebuild();
if(!basicDb.open()) {
qWarning() << "basicDb db open failed, fail to run basic index!";
return;
}
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if(m_mode == WorkMode::Add) {
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if (m_mode == WorkMode::Update) {
QFileInfo info;
QMap<std::string, std::string> indexTimes = basicDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(const QString& path : m_cache) {
info.setFile(path);
if(indexTimes.take(FileUtils::makeDocUterm(path)) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
filesNeedIndex.append(path);
}
}
if(!indexTimes.isEmpty()) {
qDebug() << indexTimes.size() << "documents need remove.";
for(std::string uniqueTerm : indexTimes.keys()) {
basicDb.removeDocument(uniqueTerm);
}
basicDb.commit();
}
qDebug() << filesNeedIndex.size() << "files need update.";
}
uint allSize = filesNeedIndex.size();
Q_EMIT progress(IndexType::Basic, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (const QString& path: filesNeedIndex) {
BasicIndexer indexer(path);
if(indexer.index()) {
basicDb.addDocument(indexer.document());
++batchSize;
++finishNum;
}
if(batchSize >= 8192) {
qDebug() << "8192 finished.";
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
//文件名索引很快
if(m_stop->LOAD) {
qDebug() << "Index stopped, abort basic index.";
filesNeedIndex.clear();
return;
}
batchSize = 0;
}
}
//TODO:xapian默认10000条自动commit一次需要根据内存占用情况调整。
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
Q_EMIT basicIndexDone(finishNum);
filesNeedIndex.clear();
qDebug() << "Finish basic index";
}
void BatchIndexer::contentIndex()
{
qDebug() << "Begin content index";
if(m_stop->LOAD) {
qDebug() << "Index stopped, abort content index.";
return;
}
WritableDatabase contentDb(DataBaseType::Content);
if(!contentDb.open()) {
qWarning() << "Content db open failed, fail to run content index!";
return;
}
QStringList filesNeedIndex;
QStringList filesNeedOCRIndex;
QMap<QString, bool> suffixMap = targetFileTypeMap;
QFileInfo info;
// ocr
// bool ocrEnable = FileIndexerConfig::getInstance()->isOCREnable();
if(FileIndexerConfig::getInstance()->isOCREnable()) {
qDebug() << "OCR enabled.";
suffixMap.INSERT(targetPhotographTypeMap);
}
if(m_mode == WorkMode::Rebuild) {
contentDb.rebuild();
if(!contentDb.open()) {
return;
}
}
if(m_mode == WorkMode::Rebuild || m_mode == WorkMode::Add) {
for(QString path : m_cache) {
info.setFile(path);
if(true == suffixMap[info.suffix()] && info.isFile()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
}
}
}
} else if(m_mode == WorkMode::Update) {
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(QString path : m_cache) {
info.setFile(path);
if(true == suffixMap[info.suffix()] && info.isFile()) {
std::string uterm = FileUtils::makeDocUterm(path);
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
indexTimes.remove(uterm);
}
} else {
indexTimes.remove(uterm);
}
}
}
if(!indexTimes.isEmpty()) {
qDebug() << indexTimes.size() << "documents need remove";
for(std::string uniqueTerm : indexTimes.keys()) {
contentDb.removeDocument(uniqueTerm);
}
contentDb.commit();
}
}
uint allSize = filesNeedIndex.size();
qDebug() << allSize << "files need content index.";
Q_EMIT progress(IndexType::Contents, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (QString path : filesNeedIndex) {
if(m_stop->LOAD) {
qDebug() << "Index stopped, interrupt content index.";
filesNeedIndex.clear();
filesNeedOCRIndex.clear();
return;
}
info.setFile(path);
if(true == targetPhotographTypeMap[info.suffix()]) {
filesNeedOCRIndex.append(path);
filesNeedIndex.removeOne(path);
continue;
}
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++finishNum;
} else {
// qDebug() << "Extract fail===" << path;
}
if(batchSize >= 30) {
contentDb.commit();
qDebug() << "30 finished.";
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
filesNeedIndex.clear();
qDebug() << "Content index for normal files finished, now begin OCR index";
int ocrSize = filesNeedOCRIndex.size();
qDebug() << ocrSize << "pictures need OCR index.";
batchSize = 0;
int ocrFinishNum = 0;
for(QString path : filesNeedOCRIndex) {
if(m_stop->LOAD) {
qDebug() << "Index stopped, interrupt content index.";
filesNeedOCRIndex.clear();
return;
}
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++ocrFinishNum;
} else {
// qDebug() << "Extract fail===" << path;
}
if(batchSize >= 30) {
contentDb.commit();
qDebug() << "30 finished.";
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
filesNeedOCRIndex.clear();
qDebug() << "Finish OCR index.";
Q_EMIT contentIndexDone(finishNum + ocrFinishNum);
qDebug() << "Finish content index";
}