ukui-search/libsearch/index/batch-indexer.cpp

373 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: iaom <zhangpengfei@kylinos.cn>
*
*/
#include "batch-indexer.h"
#include <QFileInfo>
#include <QElapsedTimer>
#include <QDebug>
#include <malloc.h>
#include <QQueue>
#include <QDateTime>
#include "file-utils.h"
#include "basic-indexer.h"
#include "file-indexer-config.h"
#include "file-content-indexer.h"
#include "writable-database.h"
#include "compatible-define.h"
using namespace UkuiSearch;
/**
 * @brief Creates a batch indexer for the given folders.
 *
 * @param folders             Root folders whose contents are to be indexed.
 * @param blackList           Paths that must be left out of the index.
 * @param indexStop           Stop flag consulted by the basic (file name) index pass.
 * @param contentIndexStop    Stop flag consulted by the content index pass.
 * @param contentIndexOcrStop Stop flag consulted by the OCR content index pass.
 * @param mode                Work mode (Rebuild / Add / Update) applied by every pass.
 * @param target              Set of index passes that run() should execute.
 *
 * The three stop flags are stored by address, so the caller's objects have to
 * stay alive for as long as this indexer is in use.
 */
BatchIndexer::BatchIndexer(const QStringList &folders,
                           const QStringList &blackList,
                           QAtomicInt &indexStop,
                           QAtomicInt &contentIndexStop,
                           QAtomicInt &contentIndexOcrStop,
                           WorkMode mode,
                           Targets target)
    : m_folders(folders)
    , m_blackList(blackList)
    , m_indexStop(&indexStop)
    , m_contentIndexStop(&contentIndexStop)
    , m_contentIndexOcrStop(&contentIndexOcrStop)
    , m_mode(mode)
    , m_target(target)
{
}
void BatchIndexer::run()
{
QElapsedTimer timer;
timer.start();
if(m_target == Target::None || (m_indexStop->LOAD && m_contentIndexStop->LOAD)) {
Q_EMIT done(m_mode, m_target);
return;
}
fetch();
if(m_target & Target::Basic) {
basicIndex();
Q_EMIT basicIndexDone(m_mode);
}
if(m_target & Target::Content) {
contentIndex();
Q_EMIT contentIndexDone(m_mode);
}
if(m_target & Target::Ocr) {
ocrIndex();
Q_EMIT ocrContentIndexDone(m_mode);
}
m_cache.clear();
malloc_trim(0);
qDebug() << "FirstRunIndexer: time :" << timer.elapsed() << "milliseconds";
Q_EMIT done(m_mode, m_target);
}
void BatchIndexer::fetch()
{
qDebug() << "Now begin fetching files to be indexed...";
qDebug() << "Index folders:" << m_folders << "blacklist :" << m_blackList;
QQueue<QString> bfs;
for(const QString& blockPath : m_blackList) {
for(const QString& path : m_folders) {
if(FileUtils::isOrUnder(path, blockPath)) {
m_folders.removeOne(path);
}
}
}
m_cache.append(m_folders);
for(const QString &path : m_folders) {
bfs.enqueue(path);
}
QFileInfoList list;
QDir dir;
QStringList tmpList = m_blackList;
// QDir::Hidden
dir.setFilter(QDir::Dirs | QDir::Files | QDir::NoDotAndDotDot);
dir.setSorting(QDir::DirsFirst);
while(!bfs.empty()) {
dir.setPath(bfs.dequeue());
list = dir.entryInfoList();
for(const auto& i : list) {
bool isBlocked = false;
for(const QString &path : tmpList) {
if(i.absoluteFilePath() == path) {
isBlocked = true;
tmpList.removeOne(path);
break;
}
}
if(isBlocked)
continue;
if(i.isDir() && (!(i.isSymLink()))) {
bfs.enqueue(i.absoluteFilePath());
}
m_cache.append(i.absoluteFilePath());
}
}
qDebug() << m_cache.size() << "files founded, start index...";
}
void BatchIndexer::basicIndex()
{
qDebug() << "Begin basic index";
WritableDatabase basicDb(DataBaseType::Basic);
if(!basicDb.open()) {
qWarning() << "Basic db open failed, fail to run basic index!";
return;
}
QStringList filesNeedIndex;
if(m_mode == WorkMode::Rebuild) {
basicDb.rebuild();
if(!basicDb.open()) {
qWarning() << "basicDb db open failed, fail to run basic index!";
return;
}
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if(m_mode == WorkMode::Add) {
filesNeedIndex = m_cache;
qDebug() <<filesNeedIndex.size() << "files need index.";
} else if (m_mode == WorkMode::Update) {
QFileInfo info;
QMap<std::string, std::string> indexTimes = basicDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(const QString& path : m_cache) {
info.setFile(path);
if(indexTimes.take(FileUtils::makeDocUterm(path)) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
filesNeedIndex.append(path);
}
}
if(!indexTimes.isEmpty()) {
qDebug() << indexTimes.size() << "documents need remove.";
for(const std::string& uniqueTerm : indexTimes.keys()) {
basicDb.removeDocument(uniqueTerm);
}
basicDb.commit();
}
qDebug() << filesNeedIndex.size() << "files need update.";
}
uint allSize = filesNeedIndex.size();
Q_EMIT progress(IndexType::Basic, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (const QString& path: filesNeedIndex) {
BasicIndexer indexer(path);
if(indexer.index()) {
basicDb.addDocument(indexer.document());
++batchSize;
++finishNum;
}
if(batchSize >= 8192) {
qDebug() << finishNum << "of" << allSize <<"finished.";
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
//文件名索引很快
if(m_indexStop->LOAD) {
qDebug() << "Index stopped, abort basic index.";
filesNeedIndex.clear();
return;
}
batchSize = 0;
}
}
//TODO:xapian默认10000条自动commit一次需要根据内存占用情况调整。
basicDb.commit();
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
filesNeedIndex.clear();
qDebug() << "Finish basic index";
}
void BatchIndexer::contentIndex()
{
qDebug() << "Begin content index";
if(m_contentIndexStop->LOAD) {
qDebug() << "Index stopped, abort content index.";
return;
}
WritableDatabase contentDb(DataBaseType::Content);
if(!contentDb.open()) {
qWarning() << "Content db open failed, fail to run content index!";
return;
}
QStringList filesNeedIndex;
QFileInfo info;
if(m_mode == WorkMode::Rebuild) {
contentDb.rebuild();
if(!contentDb.open()) {
return;
}
}
if(m_mode == WorkMode::Rebuild || m_mode == WorkMode::Add) {
for(const QString& path : m_cache) {
info.setFile(path);
if(targetFileTypeMap[info.suffix()] && info.isFile()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
}
}
}
} else if(m_mode == WorkMode::Update) {
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(const QString& path : m_cache) {
info.setFile(path);
if(targetFileTypeMap[info.suffix()] && info.isFile()) {
std::string uterm = FileUtils::makeDocUterm(path);
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedIndex.append(path);
indexTimes.remove(uterm);
}
} else {
indexTimes.remove(uterm);
}
}
}
if(!indexTimes.isEmpty()) {
qDebug() << indexTimes.size() << "documents need remove";
for(const std::string& uniqueTerm : indexTimes.keys()) {
contentDb.removeDocument(uniqueTerm);
}
contentDb.commit();
}
}
uint allSize = filesNeedIndex.size();
qDebug() << allSize << "files need content index.";
Q_EMIT progress(IndexType::Contents, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (const QString& path : filesNeedIndex) {
if(m_contentIndexStop->LOAD) {
qDebug() << "Index stopped, interrupt content index.";
filesNeedIndex.clear();
return;
}
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++finishNum;
} else {
// qDebug() << "Extract fail===" << path;
}
if(batchSize >= 30) {
contentDb.commit();
qDebug() << finishNum << "of" << allSize <<"finished.";
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
filesNeedIndex.clear();
qDebug() << "Finish content index";
}
void BatchIndexer::ocrIndex()
{
qDebug() << "Begin ocr content index";
if(m_contentIndexOcrStop->LOAD) {
qDebug() << "Index stopped, abort ocr content index.";
return;
}
WritableDatabase contentDb(DataBaseType::OcrContent);
if(!contentDb.open()) {
qWarning() << "Content db open failed, fail to run ocr content index!";
return;
}
QStringList filesNeedOCRIndex;
QFileInfo info;
if(m_mode == WorkMode::Rebuild) {
contentDb.rebuild();
if(!contentDb.open()) {
return;
}
}
if(m_mode == WorkMode::Rebuild || m_mode == WorkMode::Add) {
for(const QString &path : m_cache) {
info.setFile(path);
if(targetPhotographTypeMap[info.suffix()] && info.isFile()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedOCRIndex.append(path);
}
}
}
} else {
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
qDebug() << indexTimes.size() << "documents recorded";
for(const QString& path : m_cache) {
info.setFile(path);
if(targetPhotographTypeMap[info.suffix()] && info.isFile()) {
std::string uterm = FileUtils::makeDocUterm(path);
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
filesNeedOCRIndex.append(path);
indexTimes.remove(uterm);
}
} else {
indexTimes.remove(uterm);
}
}
}
if(!indexTimes.isEmpty()) {
qDebug() << indexTimes.size() << "documents need remove";
for(const std::string& uniqueTerm : indexTimes.keys()) {
contentDb.removeDocument(uniqueTerm);
}
contentDb.commit();
}
}
uint allSize = filesNeedOCRIndex.size();
qDebug() << allSize << "pictures need ocr content index.";
Q_EMIT progress(IndexType::OCR, allSize, 0);
uint batchSize = 0;
uint finishNum = 0;
for (const QString &path : filesNeedOCRIndex) {
if(m_contentIndexOcrStop->LOAD) {
qDebug() << "Index stopped, interrupt ocr content index.";
filesNeedOCRIndex.clear();
return;
}
fileContentIndexer indexer(path);
if(indexer.index()) {
contentDb.addDocument(indexer.document());
++batchSize;
++finishNum;
} else {
// qDebug() << "Extract fail===" << path;
}
if(batchSize >= 10) {
contentDb.commit();
qDebug() << finishNum << "of" << allSize <<"finished.";
Q_EMIT progress(IndexType::OCR, allSize, finishNum);
batchSize = 0;
}
}
contentDb.commit();
Q_EMIT progress(IndexType::OCR, allSize, finishNum);
filesNeedOCRIndex.clear();
qDebug() << "Ocr content index finished,";
}