321 lines
11 KiB
C++
321 lines
11 KiB
C++
/*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: iaom <zhangpengfei@kylinos.cn>
 *
 */
|
||
#include "batch-indexer.h"
|
||
#include <QFileInfo>
|
||
#include <QElapsedTimer>
|
||
#include <QDebug>
|
||
#include <malloc.h>
|
||
#include <QQueue>
|
||
#include <QDateTime>
|
||
|
||
#include "file-utils.h"
|
||
#include "basic-indexer.h"
|
||
#include "file-indexer-config.h"
|
||
#include "file-content-indexer.h"
|
||
#include "writable-database.h"
|
||
#include "compatible-define.h"
|
||
using namespace UkuiSearch;
|
||
/**
 * @brief Creates a batch indexing job.
 *
 * @param folders          Root folders to be scanned.
 * @param blackList        Paths that must be excluded from the scan.
 * @param indexStop        Stop flag consulted by the basic index; only its
 *                         address is stored, so it must outlive this object.
 * @param contentIndexStop Stop flag consulted by the content index; only its
 *                         address is stored, so it must outlive this object.
 * @param mode             Work mode (Rebuild / Add / Update are handled by
 *                         the indexing methods).
 * @param target           Which indexes (basic and/or content) to build.
 */
BatchIndexer::BatchIndexer(const QStringList &folders,
                           const QStringList &blackList,
                           QAtomicInt &indexStop,
                           QAtomicInt &contentIndexStop,
                           WorkMode mode,
                           Targets target)
    : m_folders(folders)
    , m_blackList(blackList)
    , m_indexStop(&indexStop)
    , m_contentIndexStop(&contentIndexStop)
    , m_mode(mode)
    , m_target(target)
{
}
|
||
|
||
void BatchIndexer::run()
|
||
{
|
||
QElapsedTimer timer;
|
||
timer.start();
|
||
if(m_target == Target::None || (m_indexStop->LOAD && m_contentIndexStop->LOAD)) {
|
||
Q_EMIT done(m_mode, m_target);
|
||
return;
|
||
}
|
||
fetch();
|
||
|
||
if(m_target & Target::Basic) {
|
||
basicIndex();
|
||
Q_EMIT basicIndexDone(m_mode);
|
||
}
|
||
if(m_target & Target::Content) {
|
||
contentIndex();
|
||
Q_EMIT contentIndexDone(m_mode);
|
||
}
|
||
m_cache.clear();
|
||
malloc_trim(0);
|
||
qDebug() << "FirstRunIndexer: time :" << timer.elapsed() << "milliseconds";
|
||
Q_EMIT done(m_mode, m_target);
|
||
}
|
||
|
||
void BatchIndexer::fetch()
|
||
{
|
||
qDebug() << "Now begin fetching files to be indexed...";
|
||
qDebug() << "Index folders:" << m_folders << "blacklist :" << m_blackList;
|
||
QQueue<QString> bfs;
|
||
for(QString blockPath : m_blackList) {
|
||
for(QString path : m_folders) {
|
||
if(FileUtils::isOrUnder(path, blockPath)) {
|
||
m_folders.removeOne(path);
|
||
}
|
||
}
|
||
}
|
||
m_cache.append(m_folders);
|
||
for(QString path : m_folders) {
|
||
bfs.enqueue(path);
|
||
}
|
||
QFileInfoList list;
|
||
QDir dir;
|
||
QStringList tmpList = m_blackList;
|
||
// QDir::Hidden
|
||
dir.setFilter(QDir::Dirs | QDir::Files | QDir::NoDotAndDotDot);
|
||
dir.setSorting(QDir::DirsFirst);
|
||
while(!bfs.empty()) {
|
||
dir.setPath(bfs.dequeue());
|
||
list = dir.entryInfoList();
|
||
for(auto i : list) {
|
||
bool isBlocked = false;
|
||
for(QString path : tmpList) {
|
||
if(i.absoluteFilePath() == path) {
|
||
isBlocked = true;
|
||
tmpList.removeOne(path);
|
||
break;
|
||
}
|
||
}
|
||
if(isBlocked)
|
||
continue;
|
||
|
||
if(i.isDir() && (!(i.isSymLink()))) {
|
||
bfs.enqueue(i.absoluteFilePath());
|
||
}
|
||
m_cache.append(i.absoluteFilePath());
|
||
}
|
||
}
|
||
qDebug() << m_cache.size() << "files founded, start index...";
|
||
}
|
||
|
||
void BatchIndexer::basicIndex()
|
||
{
|
||
qDebug() << "Begin basic index";
|
||
WritableDatabase basicDb(DataBaseType::Basic);
|
||
if(!basicDb.open()) {
|
||
qWarning() << "Basic db open failed, fail to run basic index!";
|
||
return;
|
||
}
|
||
QStringList filesNeedIndex;
|
||
if(m_mode == WorkMode::Rebuild) {
|
||
basicDb.rebuild();
|
||
if(!basicDb.open()) {
|
||
qWarning() << "basicDb db open failed, fail to run basic index!";
|
||
return;
|
||
}
|
||
filesNeedIndex = m_cache;
|
||
qDebug() <<filesNeedIndex.size() << "files need index.";
|
||
} else if(m_mode == WorkMode::Add) {
|
||
filesNeedIndex = m_cache;
|
||
qDebug() <<filesNeedIndex.size() << "files need index.";
|
||
} else if (m_mode == WorkMode::Update) {
|
||
QFileInfo info;
|
||
QMap<std::string, std::string> indexTimes = basicDb.getIndexTimes();
|
||
qDebug() << indexTimes.size() << "documents recorded";
|
||
for(const QString& path : m_cache) {
|
||
info.setFile(path);
|
||
if(indexTimes.take(FileUtils::makeDocUterm(path)) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
|
||
filesNeedIndex.append(path);
|
||
}
|
||
}
|
||
if(!indexTimes.isEmpty()) {
|
||
qDebug() << indexTimes.size() << "documents need remove.";
|
||
for(std::string uniqueTerm : indexTimes.keys()) {
|
||
basicDb.removeDocument(uniqueTerm);
|
||
}
|
||
basicDb.commit();
|
||
}
|
||
qDebug() << filesNeedIndex.size() << "files need update.";
|
||
}
|
||
uint allSize = filesNeedIndex.size();
|
||
Q_EMIT progress(IndexType::Basic, allSize, 0);
|
||
uint batchSize = 0;
|
||
uint finishNum = 0;
|
||
for (const QString& path: filesNeedIndex) {
|
||
BasicIndexer indexer(path);
|
||
if(indexer.index()) {
|
||
basicDb.addDocument(indexer.document());
|
||
++batchSize;
|
||
++finishNum;
|
||
}
|
||
if(batchSize >= 8192) {
|
||
qDebug() << "8192 finished.";
|
||
basicDb.commit();
|
||
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
|
||
//文件名索引很快
|
||
if(m_indexStop->LOAD) {
|
||
qDebug() << "Index stopped, abort basic index.";
|
||
filesNeedIndex.clear();
|
||
return;
|
||
}
|
||
batchSize = 0;
|
||
}
|
||
}
|
||
//TODO:xapian默认10000条自动commit一次,需要根据内存占用情况调整。
|
||
basicDb.commit();
|
||
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
|
||
filesNeedIndex.clear();
|
||
qDebug() << "Finish basic index";
|
||
}
|
||
|
||
void BatchIndexer::contentIndex()
|
||
{
|
||
qDebug() << "Begin content index";
|
||
if(m_contentIndexStop->LOAD) {
|
||
qDebug() << "Index stopped, abort content index.";
|
||
return;
|
||
}
|
||
WritableDatabase contentDb(DataBaseType::Content);
|
||
if(!contentDb.open()) {
|
||
qWarning() << "Content db open failed, fail to run content index!";
|
||
return;
|
||
}
|
||
QStringList filesNeedIndex;
|
||
QStringList filesNeedOCRIndex;
|
||
QMap<QString, bool> suffixMap = targetFileTypeMap;
|
||
QFileInfo info;
|
||
// ocr
|
||
// bool ocrEnable = FileIndexerConfig::getInstance()->isOCREnable();
|
||
if(FileIndexerConfig::getInstance()->isOCREnable()) {
|
||
qDebug() << "OCR enabled.";
|
||
suffixMap.INSERT(targetPhotographTypeMap);
|
||
}
|
||
if(m_mode == WorkMode::Rebuild) {
|
||
contentDb.rebuild();
|
||
if(!contentDb.open()) {
|
||
return;
|
||
}
|
||
}
|
||
if(m_mode == WorkMode::Rebuild || m_mode == WorkMode::Add) {
|
||
for(QString path : m_cache) {
|
||
info.setFile(path);
|
||
if(true == suffixMap[info.suffix()] && info.isFile()) {
|
||
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
|
||
filesNeedIndex.append(path);
|
||
}
|
||
}
|
||
}
|
||
} else if(m_mode == WorkMode::Update) {
|
||
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
|
||
qDebug() << indexTimes.size() << "documents recorded";
|
||
for(QString path : m_cache) {
|
||
info.setFile(path);
|
||
if(true == suffixMap[info.suffix()] && info.isFile()) {
|
||
std::string uterm = FileUtils::makeDocUterm(path);
|
||
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
|
||
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
|
||
filesNeedIndex.append(path);
|
||
indexTimes.remove(uterm);
|
||
}
|
||
} else {
|
||
indexTimes.remove(uterm);
|
||
}
|
||
}
|
||
}
|
||
if(!indexTimes.isEmpty()) {
|
||
qDebug() << indexTimes.size() << "documents need remove";
|
||
for(std::string uniqueTerm : indexTimes.keys()) {
|
||
contentDb.removeDocument(uniqueTerm);
|
||
}
|
||
contentDb.commit();
|
||
}
|
||
}
|
||
|
||
uint allSize = filesNeedIndex.size();
|
||
qDebug() << allSize << "files need content index.";
|
||
Q_EMIT progress(IndexType::Contents, allSize, 0);
|
||
|
||
uint batchSize = 0;
|
||
uint finishNum = 0;
|
||
for (QString path : filesNeedIndex) {
|
||
if(m_contentIndexStop->LOAD) {
|
||
qDebug() << "Index stopped, interrupt content index.";
|
||
filesNeedIndex.clear();
|
||
filesNeedOCRIndex.clear();
|
||
return;
|
||
}
|
||
info.setFile(path);
|
||
if(true == targetPhotographTypeMap[info.suffix()]) {
|
||
filesNeedOCRIndex.append(path);
|
||
filesNeedIndex.removeOne(path);
|
||
continue;
|
||
}
|
||
fileContentIndexer indexer(path);
|
||
if(indexer.index()) {
|
||
contentDb.addDocument(indexer.document());
|
||
++batchSize;
|
||
++finishNum;
|
||
} else {
|
||
// qDebug() << "Extract fail===" << path;
|
||
}
|
||
if(batchSize >= 30) {
|
||
contentDb.commit();
|
||
qDebug() << "30 finished.";
|
||
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
|
||
batchSize = 0;
|
||
}
|
||
}
|
||
contentDb.commit();
|
||
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
|
||
|
||
filesNeedIndex.clear();
|
||
qDebug() << "Content index for normal files finished, now begin OCR index";
|
||
int ocrSize = filesNeedOCRIndex.size();
|
||
qDebug() << ocrSize << "pictures need OCR index.";
|
||
|
||
batchSize = 0;
|
||
int ocrFinishNum = 0;
|
||
for(QString path : filesNeedOCRIndex) {
|
||
if(m_contentIndexStop->LOAD) {
|
||
qDebug() << "Index stopped, interrupt content index.";
|
||
filesNeedOCRIndex.clear();
|
||
return;
|
||
}
|
||
fileContentIndexer indexer(path);
|
||
if(indexer.index()) {
|
||
contentDb.addDocument(indexer.document());
|
||
++batchSize;
|
||
++ocrFinishNum;
|
||
} else {
|
||
// qDebug() << "Extract fail===" << path;
|
||
}
|
||
if(batchSize >= 30) {
|
||
contentDb.commit();
|
||
qDebug() << "30 finished.";
|
||
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
|
||
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
|
||
batchSize = 0;
|
||
}
|
||
}
|
||
contentDb.commit();
|
||
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
|
||
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
|
||
filesNeedOCRIndex.clear();
|
||
qDebug() << "Finish OCR index.";
|
||
qDebug() << "Finish content index";
|
||
}
|