2022-10-26 18:01:40 +08:00
|
|
|
|
/*
|
|
|
|
|
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
*
|
|
|
|
|
* Authors: iaom <zhangpengfei@kylinos.cn>
|
|
|
|
|
*
|
|
|
|
|
*/
|
2022-12-15 16:13:16 +08:00
|
|
|
|
#include "batch-indexer.h"
|
2022-10-26 18:01:40 +08:00
|
|
|
|
#include <QFileInfo>
|
|
|
|
|
#include <QTime>
|
|
|
|
|
#include <malloc.h>
|
|
|
|
|
|
|
|
|
|
#include "file-utils.h"
|
|
|
|
|
#include "basic-indexer.h"
|
|
|
|
|
#include "file-indexer-config.h"
|
|
|
|
|
#include "file-content-indexer.h"
|
|
|
|
|
#include "writable-database.h"
|
2022-12-15 16:13:16 +08:00
|
|
|
|
#include "compatible-define.h"
|
2022-10-26 18:01:40 +08:00
|
|
|
|
using namespace UkuiSearch;
|
2022-12-15 16:13:16 +08:00
|
|
|
|
/**
 * @brief Construct a batch indexer; only stores its arguments.
 *
 * No file system or database work is done here; everything happens in run().
 *
 * @param folders   Root folders whose contents are to be indexed.
 * @param blackList Paths that must be excluded from indexing.
 * @param stop      Stop flag polled while indexing. Only its address is kept,
 *                  so the caller must keep the object alive while this
 *                  indexer is in use.
 * @param mode      Work mode (Rebuild / Update / Add are the values handled
 *                  by the indexing routines).
 * @param target    Flags selecting which indexes (basic and/or content) to build.
 */
BatchIndexer::BatchIndexer(const QStringList &folders,
                           const QStringList &blackList,
                           QAtomicInt &stop,
                           WorkMode mode,
                           Targets target)
    : m_folders(folders)
    , m_blackList(blackList)
    , m_stop(&stop)
    , m_mode(mode)
    , m_target(target)
{
}
|
|
|
|
|
|
2022-12-15 16:13:16 +08:00
|
|
|
|
void BatchIndexer::run()
|
2022-10-26 18:01:40 +08:00
|
|
|
|
{
|
|
|
|
|
QTime t = QTime::currentTime();
|
2022-12-15 16:13:16 +08:00
|
|
|
|
if(m_target == Target::None || m_stop->LOAD) {
|
|
|
|
|
Q_EMIT done(m_mode);
|
2022-10-26 18:01:40 +08:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
fetch();
|
|
|
|
|
|
|
|
|
|
if(m_target & Target::Basic) {
|
|
|
|
|
basicIndex();
|
|
|
|
|
}
|
|
|
|
|
if(m_target & Target::Content) {
|
|
|
|
|
contentIndex();
|
|
|
|
|
}
|
|
|
|
|
m_cache.clear();
|
|
|
|
|
malloc_trim(0);
|
|
|
|
|
qDebug() << "FirstRunIndexer: time :" << t.elapsed();
|
2022-12-09 16:30:58 +08:00
|
|
|
|
Q_EMIT done(m_mode);
|
2022-10-26 18:01:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
2022-12-15 16:13:16 +08:00
|
|
|
|
void BatchIndexer::fetch()
|
2022-10-26 18:01:40 +08:00
|
|
|
|
{
|
|
|
|
|
qDebug() << "Now begin fetching files to be indexed...";
|
|
|
|
|
qDebug() << "Index folders:" << m_folders << "blacklist :" << m_blackList;
|
|
|
|
|
QQueue<QString> bfs;
|
|
|
|
|
for(QString blockPath : m_blackList) {
|
|
|
|
|
for(QString path : m_folders) {
|
|
|
|
|
if(FileUtils::isOrUnder(path, blockPath)) {
|
|
|
|
|
m_folders.removeOne(path);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
m_cache.append(m_folders);
|
|
|
|
|
for(QString path : m_folders) {
|
|
|
|
|
bfs.enqueue(path);
|
|
|
|
|
}
|
|
|
|
|
QFileInfoList list;
|
|
|
|
|
QDir dir;
|
|
|
|
|
QStringList tmpList = m_blackList;
|
|
|
|
|
// QDir::Hidden
|
|
|
|
|
dir.setFilter(QDir::Dirs | QDir::Files | QDir::NoDotAndDotDot);
|
|
|
|
|
dir.setSorting(QDir::DirsFirst);
|
|
|
|
|
while(!bfs.empty()) {
|
|
|
|
|
dir.setPath(bfs.dequeue());
|
|
|
|
|
list = dir.entryInfoList();
|
|
|
|
|
for(auto i : list) {
|
|
|
|
|
bool isBlocked = false;
|
|
|
|
|
for(QString path : tmpList) {
|
|
|
|
|
if(i.absoluteFilePath() == path) {
|
|
|
|
|
isBlocked = true;
|
|
|
|
|
tmpList.removeOne(path);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if(isBlocked)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if(i.isDir() && (!(i.isSymLink()))) {
|
|
|
|
|
bfs.enqueue(i.absoluteFilePath());
|
|
|
|
|
}
|
|
|
|
|
m_cache.append(i.absoluteFilePath());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
qDebug() << m_cache.size() << "files founded, start index...";
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-15 16:13:16 +08:00
|
|
|
|
void BatchIndexer::basicIndex()
|
2022-10-26 18:01:40 +08:00
|
|
|
|
{
|
|
|
|
|
qDebug() << "Begin basic index";
|
|
|
|
|
WritableDatabase basicDb(DataBaseType::Basic);
|
|
|
|
|
if(!basicDb.open()) {
|
|
|
|
|
qWarning() << "Basic db open failed, fail to run basic index!";
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
QStringList filesNeedIndex;
|
|
|
|
|
if(m_mode == WorkMode::Rebuild) {
|
|
|
|
|
basicDb.rebuild();
|
|
|
|
|
if(!basicDb.open()) {
|
|
|
|
|
qWarning() << "basicDb db open failed, fail to run basic index!";
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
filesNeedIndex = m_cache;
|
|
|
|
|
qDebug() <<filesNeedIndex.size() << "files need index.";
|
2023-04-04 16:54:52 +08:00
|
|
|
|
} else if (m_mode == WorkMode::Update || m_mode == WorkMode::Add) {
|
2022-10-26 18:01:40 +08:00
|
|
|
|
QFileInfo info;
|
|
|
|
|
QMap<std::string, std::string> indexTimes = basicDb.getIndexTimes();
|
|
|
|
|
qDebug() << indexTimes.size() << "documents recorded";
|
|
|
|
|
for(const QString& path : m_cache) {
|
|
|
|
|
info.setFile(path);
|
|
|
|
|
if(indexTimes.take(FileUtils::makeDocUterm(path)) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
|
|
|
|
|
filesNeedIndex.append(path);
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-04-04 16:54:52 +08:00
|
|
|
|
if(m_mode == WorkMode::Update && !indexTimes.isEmpty()) {
|
2022-10-26 18:01:40 +08:00
|
|
|
|
qDebug() << indexTimes.size() << "documents need remove.";
|
|
|
|
|
for(std::string uniqueTerm : indexTimes.keys()) {
|
|
|
|
|
basicDb.removeDocument(uniqueTerm);
|
|
|
|
|
}
|
|
|
|
|
basicDb.commit();
|
|
|
|
|
}
|
2023-04-04 16:54:52 +08:00
|
|
|
|
|
|
|
|
|
qDebug() << filesNeedIndex.size() << "files need index.";
|
2022-10-26 18:01:40 +08:00
|
|
|
|
}
|
|
|
|
|
uint allSize = filesNeedIndex.size();
|
|
|
|
|
Q_EMIT progress(IndexType::Basic, allSize, 0);
|
|
|
|
|
uint batchSize = 0;
|
|
|
|
|
uint finishNum = 0;
|
|
|
|
|
for (const QString& path: filesNeedIndex) {
|
|
|
|
|
BasicIndexer indexer(path);
|
|
|
|
|
if(indexer.index()) {
|
|
|
|
|
basicDb.addDocument(indexer.document());
|
|
|
|
|
++batchSize;
|
|
|
|
|
++finishNum;
|
|
|
|
|
}
|
|
|
|
|
if(batchSize >= 8192) {
|
|
|
|
|
qDebug() << "8192 finished.";
|
|
|
|
|
basicDb.commit();
|
|
|
|
|
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
|
2022-12-15 16:13:16 +08:00
|
|
|
|
//文件名索引很快
|
|
|
|
|
if(m_stop->LOAD) {
|
|
|
|
|
qDebug() << "Index stopped, abort basic index.";
|
|
|
|
|
filesNeedIndex.clear();
|
|
|
|
|
return;
|
|
|
|
|
}
|
2022-10-26 18:01:40 +08:00
|
|
|
|
batchSize = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
//TODO:xapian默认10000条自动commit一次,需要根据内存占用情况调整。
|
|
|
|
|
basicDb.commit();
|
|
|
|
|
Q_EMIT progress(IndexType::Basic, allSize, finishNum);
|
|
|
|
|
Q_EMIT basicIndexDone(finishNum);
|
|
|
|
|
filesNeedIndex.clear();
|
|
|
|
|
qDebug() << "Finish basic index";
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-15 16:13:16 +08:00
|
|
|
|
void BatchIndexer::contentIndex()
|
2022-10-26 18:01:40 +08:00
|
|
|
|
{
|
|
|
|
|
qDebug() << "Begin content index";
|
2022-12-15 16:13:16 +08:00
|
|
|
|
if(m_stop->LOAD) {
|
2022-10-26 18:01:40 +08:00
|
|
|
|
qDebug() << "Index stopped, abort content index.";
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
WritableDatabase contentDb(DataBaseType::Content);
|
|
|
|
|
if(!contentDb.open()) {
|
|
|
|
|
qWarning() << "Content db open failed, fail to run content index!";
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
QStringList filesNeedIndex;
|
|
|
|
|
QStringList filesNeedOCRIndex;
|
|
|
|
|
QMap<QString, bool> suffixMap = targetFileTypeMap;
|
|
|
|
|
QFileInfo info;
|
|
|
|
|
// ocr
|
|
|
|
|
// bool ocrEnable = FileIndexerConfig::getInstance()->isOCREnable();
|
|
|
|
|
if(FileIndexerConfig::getInstance()->isOCREnable()) {
|
|
|
|
|
qDebug() << "OCR enabled.";
|
|
|
|
|
suffixMap.unite(targetPhotographTypeMap);
|
|
|
|
|
}
|
|
|
|
|
if(m_mode == WorkMode::Rebuild) {
|
|
|
|
|
contentDb.rebuild();
|
|
|
|
|
if(!contentDb.open()) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
for(QString path : m_cache) {
|
|
|
|
|
info.setFile(path);
|
|
|
|
|
if(true == suffixMap[info.suffix()] && info.isFile()) {
|
|
|
|
|
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
|
|
|
|
|
filesNeedIndex.append(path);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-04-04 16:54:52 +08:00
|
|
|
|
} else if(m_mode == WorkMode::Update || m_mode == WorkMode::Add) {
|
2022-10-26 18:01:40 +08:00
|
|
|
|
QMap<std::string, std::string> indexTimes = contentDb.getIndexTimes();
|
|
|
|
|
qDebug() << indexTimes.size() << "documents recorded";
|
|
|
|
|
for(QString path : m_cache) {
|
|
|
|
|
info.setFile(path);
|
|
|
|
|
if(true == suffixMap[info.suffix()] && info.isFile()) {
|
|
|
|
|
std::string uterm = FileUtils::makeDocUterm(path);
|
|
|
|
|
if(indexTimes.value(uterm) != info.lastModified().toString("yyyyMMddHHmmsszzz").toStdString()) {
|
|
|
|
|
if(!FileUtils::isEncrypedOrUnsupport(path, info.suffix())) {
|
|
|
|
|
filesNeedIndex.append(path);
|
|
|
|
|
indexTimes.remove(uterm);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
indexTimes.remove(uterm);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-04-04 16:54:52 +08:00
|
|
|
|
if(m_mode == WorkMode::Update && !indexTimes.isEmpty()) {
|
2022-10-26 18:01:40 +08:00
|
|
|
|
qDebug() << indexTimes.size() << "documents need remove";
|
|
|
|
|
for(std::string uniqueTerm : indexTimes.keys()) {
|
|
|
|
|
contentDb.removeDocument(uniqueTerm);
|
|
|
|
|
}
|
|
|
|
|
contentDb.commit();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-04 16:54:52 +08:00
|
|
|
|
|
2022-10-26 18:01:40 +08:00
|
|
|
|
uint allSize = filesNeedIndex.size();
|
|
|
|
|
qDebug() << allSize << "files need content index.";
|
|
|
|
|
Q_EMIT progress(IndexType::Contents, allSize, 0);
|
|
|
|
|
|
|
|
|
|
uint batchSize = 0;
|
|
|
|
|
uint finishNum = 0;
|
|
|
|
|
for (QString path : filesNeedIndex) {
|
2022-12-15 16:13:16 +08:00
|
|
|
|
if(m_stop->LOAD) {
|
|
|
|
|
qDebug() << "Index stopped, interrupt content index.";
|
|
|
|
|
filesNeedIndex.clear();
|
|
|
|
|
filesNeedOCRIndex.clear();
|
|
|
|
|
return;
|
|
|
|
|
}
|
2022-10-26 18:01:40 +08:00
|
|
|
|
info.setFile(path);
|
|
|
|
|
if(true == targetPhotographTypeMap[info.suffix()]) {
|
|
|
|
|
filesNeedOCRIndex.append(path);
|
|
|
|
|
filesNeedIndex.removeOne(path);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
fileContentIndexer indexer(path);
|
|
|
|
|
if(indexer.index()) {
|
|
|
|
|
contentDb.addDocument(indexer.document());
|
|
|
|
|
++batchSize;
|
|
|
|
|
++finishNum;
|
|
|
|
|
} else {
|
2022-12-23 11:11:06 +08:00
|
|
|
|
// qDebug() << "Extract fail===" << path;
|
2022-10-26 18:01:40 +08:00
|
|
|
|
}
|
|
|
|
|
if(batchSize >= 30) {
|
|
|
|
|
contentDb.commit();
|
|
|
|
|
qDebug() << "30 finished.";
|
|
|
|
|
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
|
|
|
|
|
batchSize = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
contentDb.commit();
|
|
|
|
|
Q_EMIT progress(IndexType::Contents, allSize, finishNum);
|
|
|
|
|
|
|
|
|
|
filesNeedIndex.clear();
|
|
|
|
|
qDebug() << "Content index for normal files finished, now begin OCR index";
|
|
|
|
|
int ocrSize = filesNeedOCRIndex.size();
|
|
|
|
|
qDebug() << ocrSize << "pictures need OCR index.";
|
|
|
|
|
|
|
|
|
|
batchSize = 0;
|
|
|
|
|
int ocrFinishNum = 0;
|
|
|
|
|
for(QString path : filesNeedOCRIndex) {
|
2022-12-15 16:13:16 +08:00
|
|
|
|
if(m_stop->LOAD) {
|
|
|
|
|
qDebug() << "Index stopped, interrupt content index.";
|
|
|
|
|
filesNeedOCRIndex.clear();
|
|
|
|
|
return;
|
|
|
|
|
}
|
2022-10-26 18:01:40 +08:00
|
|
|
|
fileContentIndexer indexer(path);
|
|
|
|
|
if(indexer.index()) {
|
|
|
|
|
contentDb.addDocument(indexer.document());
|
|
|
|
|
++batchSize;
|
|
|
|
|
++ocrFinishNum;
|
|
|
|
|
} else {
|
2022-12-23 11:11:06 +08:00
|
|
|
|
// qDebug() << "Extract fail===" << path;
|
2022-10-26 18:01:40 +08:00
|
|
|
|
}
|
|
|
|
|
if(batchSize >= 30) {
|
|
|
|
|
contentDb.commit();
|
|
|
|
|
qDebug() << "30 finished.";
|
|
|
|
|
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
|
|
|
|
|
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
|
|
|
|
|
batchSize = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
contentDb.commit();
|
|
|
|
|
Q_EMIT progress(IndexType::OCR, ocrSize, ocrFinishNum);
|
|
|
|
|
Q_EMIT progress(IndexType::Contents, allSize, finishNum + ocrFinishNum);
|
|
|
|
|
filesNeedOCRIndex.clear();
|
|
|
|
|
qDebug() << "Finish OCR index.";
|
|
|
|
|
Q_EMIT contentIndexDone(finishNum + ocrFinishNum);
|
|
|
|
|
qDebug() << "Finish content index";
|
|
|
|
|
}
|