Merge from dev-unity.

This commit is contained in:
iaom 2021-07-20 11:28:36 +08:00
commit 0894cd56a8
16 changed files with 267 additions and 73 deletions

View File

@ -66,8 +66,6 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
keywordres.clear();
// keywordres.shrink_to_fit();
return vecNeeds;
}

View File

@ -167,6 +167,121 @@ public:
}
}
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<struct DatDag>&res, size_t max_word_len) const {
res.clear();
res.resize(end - begin);
string text_str;
EncodeRunesToString(begin, end, text_str);
static const size_t max_num = 128;
JiebaDAT::result_pair_type result_pairs[max_num] = {};
size_t str_size = end - begin;
for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
begin_pos -= (end - i - 1)->len;
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - i, nullptr));
for (std::size_t idx = 0; idx < num_results; ++idx) {
auto & match = result_pairs[idx];
if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
continue;
}
auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
if (char_num > max_word_len) {
continue;
}
auto pValue = &elements_ptr_[match.value];
if (1 == char_num) {
res[str_size - i - 1].nexts[0].second = pValue;
continue;
}
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
}
}
}
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<WordRange>& words, size_t max_word_len) const {
string text_str;
EncodeRunesToString(begin, end, text_str);
static const size_t max_num = 128;
JiebaDAT::result_pair_type result_pairs[max_num] = {};//存放字典查询结果
size_t str_size = end - begin;
double max_weight[str_size];//存放逆向路径最大weight
for (size_t i = 0; i<str_size; i++) {
max_weight[i] = -3.14e+100;
}
int max_next[str_size];//存放动态规划后的分词结果
memset(max_next,-1,str_size);
double val(0);
for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
size_t nextPos = str_size - i;//逆向计算
begin_pos -= (end - i - 1)->len;
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
if (0 == num_results) {//字典不存在则单独分词
val = min_weight_;
if (nextPos < str_size) {
val += max_weight[nextPos];
}
if ((nextPos <= str_size) && (val > max_weight[nextPos - 1])) {
max_weight[nextPos - 1] = val;
max_next[nextPos - 1] = nextPos;
}
} else {//字典存在则根据查询结果数量计算最大概率路径
for (std::size_t idx = 0; idx < num_results; ++idx) {
auto & match = result_pairs[idx];
if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
continue;
}
auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
if (char_num > max_word_len) {
continue;
}
auto pValue = &elements_ptr_[match.value];
val = pValue->weight;
if (1 == char_num) {
if (nextPos < str_size) {
val += max_weight[nextPos];
}
if ((nextPos <= str_size) && (val > max_weight[nextPos - 1])) {
max_weight[nextPos - 1] = val;
max_next[nextPos - 1] = nextPos;
}
} else {
if (nextPos - 1 + char_num < str_size) {
val += max_weight[nextPos - 1 + char_num];
}
if ((nextPos - 1 + char_num <= str_size) && (val > max_weight[nextPos - 1])) {
max_weight[nextPos - 1] = val;
max_next[nextPos - 1] = nextPos - 1 + char_num;
}
}
}
}
}
for (size_t i = 0; i < str_size;) {//统计动态规划结果
assert(max_next[i] > i);
assert(max_next[i] <= str_size);
WordRange wr(begin + i, begin + max_next[i] - 1);
words.push_back(wr);
i = max_next[i];
}
}
double GetMinWeight() const {
return min_weight_;
}
@ -284,7 +399,7 @@ private:
//const int fd =::mkstemp(&tmp_filepath[0]);
//原mkstemp用法有误已修复--jxx20210519
const int fd =::mkstemp((char *)tmp_filepath.data());
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
qDebug() << "mkstemp :" << errno << tmp_filepath.data();
assert(fd >= 0);
::fchmod(fd, 0644);

View File

@ -49,6 +49,13 @@ public:
dat_.Find(begin, end, res, max_word_len);
}
void Find(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
dat_.Find(begin, end, words, max_word_len);
}
bool IsUserDictSingleChineseWord(const Rune& word) const {
return IsIn(user_dict_single_chinese_word_, word);
}

View File

@ -138,10 +138,10 @@ private:
size_t now, old, stat;
double tmp, endE, endS;
vector<int> path(XYSize);
vector<double> weight(XYSize);
//int path[XYSize];
//double weight[XYSize];
//vector<int> path(XYSize);
//vector<double> weight(XYSize);
int path[XYSize];
double weight[XYSize];
//start
for (size_t y = 0; y < Y; y++) {

View File

@ -22,10 +22,11 @@ public:
RuneStrArray::const_iterator end,
vector<WordRange>& words,
bool, size_t max_word_len) const override {
vector<DatDag> dags;
dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
CalcDP(dags);//动态规划Dynamic ProgrammingDP根据DAG计算最优动态规划路径--jxx
CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
// vector<DatDag> dags;
// dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
// CalcDP(dags);//动态规划Dynamic ProgrammingDP根据DAG计算最优动态规划路径--jxx
// CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
dictTrie_->Find(begin, end, words, max_word_len);
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
@ -48,6 +49,7 @@ public:
return dictTrie_->IsUserDictSingleChineseWord(value);
}
private:
/*
void CalcDP(vector<DatDag>& dags) const {
double val(0);
for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
@ -73,6 +75,35 @@ private:
}
}
}
*/
/* 倒叙方式重写CalcDP函数初步测试未发现问题*/
void CalcDP(vector<DatDag>& dags) const {
double val(0);
size_t size = dags.size();
for (size_t i = 0; i < size; i++) {
dags[size - 1 - i].max_next = -1;
dags[size - 1 - i].max_weight = MIN_DOUBLE;
for (const auto & it : dags[size - 1 - i].nexts) {
const auto nextPos = it.first;
val = dictTrie_->GetMinWeight();
if (nullptr != it.second) {
val = it.second->weight;
}
if (nextPos < dags.size()) {
val += dags[nextPos].max_weight;
}
if ((nextPos <= dags.size()) && (val > dags[size - 1 - i].max_weight)) {
dags[size - 1 - i].max_weight = val;
dags[size - 1 - i].max_next = nextPos;
}
}
}
}
void CutByDag(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator,

View File

@ -123,65 +123,76 @@ public:
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
vector<WordRange> words;
assert(end >= begin);
words.reserve(end - begin);
mpSeg_.CutRuneArray(begin, end, words);
vector<WordRange> hmmRes;
hmmRes.reserve(end - begin);
assert(end >= begin);
if (3 == begin->len or 4 == begin->len) {
words.reserve(end - begin);
mpSeg_.CutRuneArray(begin, end, words);
hmmRes.reserve(words.size());
} else {
hmmRes.reserve(end - begin);
}
for (size_t i = 0; i < words.size(); i++) {
if (words.size() != 0) {//存在中文分词结果
for (size_t i = 0; i < words.size(); i++) {
string str = GetStringFromRunes(s, words[i].left, words[i].right);
string str = GetStringFromRunes(s, words[i].left, words[i].right);
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
if (words[i].left != words[i].right) {
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
}
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
}
// if mp Get a single one and it is not in userdict, collect it in sequence
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里i字符不是最后一个字符直接判定j字符
while (j < (words.size() - 1)
&& words[j].left == words[j].right
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
j++;
}
// Cut the sequence with hmm
assert(j - 1 >= i);
// TODO
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
//put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) {
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
if (words[i].left != words[i].right) {
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
}
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
res[hmmStr].weight += 1.0;
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
}
// if mp Get a single one and it is not in userdict, collect it in sequence
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里i字符不是最后一个字符直接判定j字符
while (j < (words.size() - 1)
&& words[j].left == words[j].right
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
j++;
}
// Cut the sequence with hmm
assert(j - 1 >= i);
// TODO
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
//put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) {
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
continue;
}
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
res[hmmStr].weight += 1.0;
}
//clear tmp vars
hmmRes.clear();
//let i jump over this piece
i = j - 1;
}
} else {//不存在中文分词结果
for (size_t i = 0; i < (size_t)(end - begin); i++) {
string str = s.substr((begin+i)->offset, (begin+i)->len);
res[str].offsets.push_back((begin+i)->offset);
res[str].weight += 1.0;
}
//clear tmp vars
hmmRes.clear();
//let i jump over this piece
i = j - 1;
}
}

View File

@ -57,7 +57,6 @@ public:
}
wordRange.left = cursor_;
if (cursor_->rune == 0x20) {
while (cursor_ != sentence_.end()) {
if (cursor_->rune != 0x20) {
@ -71,7 +70,10 @@ public:
cursor_ ++;
}
}
int num = 0;
int max_num = 0;
uint32_t utf8_num = cursor_->len;
while (cursor_ != sentence_.end()) {
if (cursor_->rune == 0x20) {
if (wordRange.left == cursor_) {
@ -83,8 +85,8 @@ public:
}
cursor_ ++;
num++;
if (num >= 1024) { //todo 防止一次性传入过多字节暂定限制为1024个字
max_num++;
if (max_num >= 1024 or cursor_->len != utf8_num) { //todo 防止一次性传入过多字节暂定限制为1024个字
wordRange.right = cursor_;
return true;
}

View File

@ -178,6 +178,22 @@ QString FileUtils::getSettingName(const QString& setting) {
return setting.right(setting.length() - setting.lastIndexOf("/") - 1);
}
bool FileUtils::isOrUnder(QString pathA, QString pathB)
{
if(pathA[0] != "/")
pathA.prepend("/");
if(pathB[0] != "/")
pathB.prepend("/");
if(pathA.length() < pathB.length())
return false;
if(pathA == pathB || pathA.startsWith(pathB + "/"))
return true;
return false;
}
void FileUtils::loadHanziTable(const QString &fileName) {
QFile file(fileName);

View File

@ -70,6 +70,8 @@ public:
static QString getFileName(const QString &);
static QString getAppName(const QString &);
static QString getSettingName(const QString &);
//A is or under B
static bool isOrUnder(QString pathA, QString pathB);
//chinese character to pinyin
static QMap<QString, QStringList> map_chinese2pinyin;

View File

@ -155,15 +155,19 @@ bool GlobalSettings::setBlockDirs(const QString &path, int &returnCode, bool rem
//why QSetting's key can't start with "/"??
QString pathKey = path.right(path.length() - 1);
if (pathKey.endsWith(QLatin1Char('/'))) {
pathKey = pathKey.mid(0, pathKey.length() - 1);
}
QStringList blockDirs = m_block_dirs_settings->allKeys();
for(QString i : blockDirs) {
if(pathKey.startsWith(i)) {
if(FileUtils::isOrUnder(pathKey, i)) {
// returnCode = QString(tr("My parent folder has been blocked!"));
returnCode = PATH_PARENT_BLOCKED;
return false;
}
if(i.startsWith(pathKey))
if(FileUtils::isOrUnder(i, pathKey))
m_block_dirs_settings->remove(i);
}
m_block_dirs_settings->setValue(pathKey, "0");

View File

@ -36,6 +36,7 @@
#include <QDBusInterface>
#include <QApplication>
#include "libsearch_global.h"
#include "file-utils.h"
#define CONTROL_CENTER_PERSONALISE_GSETTINGS_ID "org.ukui.control-center.personalise"
#define TRANSPARENCY_KEY "transparency"

View File

@ -49,7 +49,7 @@ bool InotifyWatch::removeWatch(const QString &path, bool removeFromDatabase)
for(QMap<int, QString>::Iterator i = currentPath.begin(); i != currentPath.end();) {
// qDebug() << i.value();
// if(i.value().length() > path.length()) {
if(i.value().startsWith(path)) {
if(FileUtils::isOrUnder(i.value(), path)) {
qDebug() << "remove path: " << i.value();
inotify_rm_watch(m_inotifyFd, currentPath.key(path));
PendingFile f(i.value());
@ -65,7 +65,8 @@ bool InotifyWatch::removeWatch(const QString &path, bool removeFromDatabase)
for(QMap<int, QString>::Iterator i = currentPath.begin(); i != currentPath.end();) {
// qDebug() << i.value();
if(i.value().length() > path.length()) {
if(i.value().startsWith(path)) {
if(FileUtils::isOrUnder(i.value(), path)) {
// if(i.value().startsWith(path + "/")) {
// qDebug() << "remove path: " << i.value();
inotify_rm_watch(m_inotifyFd, currentPath.key(path));
currentPath.erase(i++);

View File

@ -18,6 +18,7 @@
*
*/
#include "pending-file-queue.h"
#include "file-utils.h"
#include <malloc.h>
using namespace Zeeker;
static PendingFileQueue *global_instance_pending_file_queue = nullptr;
@ -88,7 +89,7 @@ void PendingFileQueue::enqueue(const PendingFile &file)
// Because our datebase need to delete those indexs one by one.
if(file.shouldRemoveIndex() && file.isDir()) {
const auto keepFile = [&file](const PendingFile& pending) {
return (!pending.path().startsWith(file.path()) || pending.shouldRemoveIndex());
return (!FileUtils::isOrUnder(pending.path(), file.path()) || pending.shouldRemoveIndex());
};
const auto end = m_cache.end();
const auto droppedFilesBegin = std::stable_partition(m_cache.begin(), end, keepFile);

View File

@ -81,7 +81,7 @@ void SearchManager::onKeywordSearch(QString keyword, QQueue<QString> *searchResu
bool SearchManager::isBlocked(QString &path) {
QStringList blockList = GlobalSettings::getInstance()->getBlockDirs();
for(QString i : blockList) {
if(path.startsWith(i.prepend("/")))
if(FileUtils::isOrUnder(path, i))
return true;
}
return false;
@ -429,7 +429,7 @@ void DirectSearch::run() {
bool findIndex = false;
for (QString j : blockList) {
if (i.absoluteFilePath().startsWith(j.prepend("/"))) {
if (FileUtils::isOrUnder(i.absoluteFilePath(), j)) {
findIndex = true;
break;
}

View File

@ -5205,8 +5205,10 @@ int KBinaryParser::InitDocOle(FILE* pFile, long lFilesize, QString &content) {
for(iIndex = 0, ulTmp = ulSbdStartblock;
iIndex < (int)tBBDLen && ulTmp != END_OF_CHAIN;
iIndex++, ulTmp = aulBBD[ulTmp]) {
if(ulTmp >= (ULONG)tBBDLen)
if(ulTmp >= (ULONG)tBBDLen) {
qWarning("The Big Block Depot is damaged");
return -1;
}
aulSbdList[iIndex] = ulTmp;
}
@ -5349,7 +5351,10 @@ bool KBinaryParser::RunParser(QString strFile, QString &content) {
(void)fclose(pFile);
return false;
}
InitDocOle(pFile, lFileSize, content);
// If InitDocOle failed, -1 will be returned.
if(InitDocOle(pFile, lFileSize, content)) {
qWarning() << "InitDocOle failed!" << strFile;
}
fclose(pFile);
return true;
}

View File

@ -9,7 +9,7 @@ TEMPLATE = app
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
CONFIG += c++11 link_pkgconfig no_keywords lrelease
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
#LIBS += -lukui-log4qt -L/usr/local/lib/libjemalloc -ljemalloc
LIBS += -lukui-log4qt #-L/usr/local/lib/libjemalloc -ljemalloc
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the