Optimize how the IDF dictionary is loaded; limit the maximum number of words handled in a single segmentation pass; other optimizations.
This commit is contained in:
parent
660e81ccc0
commit
bbf27b9d5f
|
@ -33,6 +33,19 @@ struct DatElement {
|
|||
}
|
||||
};
|
||||
|
||||
/**
 * One IDF dictionary entry: a word together with its IDF value.
 */
struct IdfElement {
    string word;
    double idf = 0;

    /**
     * Sort order for entries: ascending by word; entries carrying the same
     * word are ordered by descending idf.
     */
    bool operator < (const IdfElement & b) const {
        if (word != b.word) {
            return word < b.word;
        }

        return idf > b.idf;
    }
};
|
||||
|
||||
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
||||
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
||||
}
|
||||
|
@ -91,13 +104,24 @@ public:
|
|||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result);
|
||||
|
||||
if ((0 == find_result.length) || (find_result.value < 0) || (find_result.value >= elements_num_)) {
|
||||
if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return &elements_ptr_[ find_result.value ];
|
||||
}
|
||||
|
||||
const double Find(const string & key, std::size_t length, std::size_t node_pos) const {
|
||||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
|
||||
|
||||
if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return idf_elements_ptr_[ find_result.value ];
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||
|
||||
|
@ -119,7 +143,7 @@ public:
|
|||
for (std::size_t idx = 0; idx < num_results; ++idx) {
|
||||
auto & match = result_pairs[idx];
|
||||
|
||||
if ((match.value < 0) || (match.value >= elements_num_)) {
|
||||
if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -156,6 +180,11 @@ public:
|
|||
return InitAttachDat(dat_cache_file, md5);
|
||||
}
|
||||
|
||||
/**
 * Writes an IDF cache file for `elements` and then attaches this trie to it.
 *
 * @return the result of InitIdfAttachDat() on the freshly written file.
 */
bool InitBuildDat(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
    BuildDatCache(elements, dat_cache_file, md5);

    const bool attached = InitIdfAttachDat(dat_cache_file, md5);
    return attached;
}
|
||||
|
||||
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
|
||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
|
@ -187,6 +216,37 @@ public:
|
|||
return true;
|
||||
}
|
||||
|
||||
bool InitIdfAttachDat(const string & dat_cache_file, const string & md5) {
|
||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (mmap_fd_ < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
|
||||
assert(seek_off >= 0);
|
||||
mmap_length_ = seek_off;
|
||||
|
||||
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
|
||||
assert(MAP_FAILED != mmap_addr_);
|
||||
|
||||
assert(mmap_length_ >= sizeof(CacheFileHeader));
|
||||
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
|
||||
elements_num_ = header.elements_num;
|
||||
min_weight_ = header.min_weight;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(double) + header.dat_size * dat_.unit_size());
|
||||
idf_elements_ptr_ = (const double *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
||||
std::sort(elements.begin(), elements.end());
|
||||
|
@ -240,12 +300,62 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the double-array trie for an IDF dictionary and writes it to
 * `dat_cache_file` as: CacheFileHeader, the IDF values (one double per
 * element, in sorted order), then the trie array.
 *
 * The data is written to a mkstemp() temporary next to the target and
 * renamed over it only after every byte was written, so an existing cache is
 * never replaced by a partial one. Any failure is logged, the temporary file
 * is removed, and the function returns without creating the cache; the
 * caller sees that as a failed attach.
 *
 * @param elements       dictionary entries; sorted in place.
 * @param dat_cache_file destination path of the cache file.
 * @param md5            digest stored in the header; its length must equal
 *                       sizeof(header.md5_hex).
 */
void BuildDatCache(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
    CacheFileHeader header;
    header.min_weight = min_weight_;

    // Checked at run time so the memcpy below cannot overrun md5_hex when
    // assert() is compiled out.
    if (sizeof(header.md5_hex) != md5.size()) {
        qDebug() << "idf cache not written: md5 length does not match header field";
        return;
    }

    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    if (elements.empty()) {
        qDebug() << "idf cache not written: no dictionary entries";
        return;
    }

    std::sort(elements.begin(), elements.end());

    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    vector<double> mem_elem_vec;

    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());
    mem_elem_vec.reserve(elements.size());

    // The value stored in the trie for each key is its index into the IDF table.
    for (size_t i = 0; i < elements.size(); ++i) {
        keys_ptr_vec.push_back(elements[i].word.data());
        values_vec.push_back(static_cast<int>(i));
        mem_elem_vec.push_back(elements[i].idf);
    }

    auto const ret = dat_.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());

    if (0 != ret) {
        qDebug() << "idf cache not written: trie build failed";
        return;
    }

    header.elements_num = mem_elem_vec.size();
    header.dat_size = dat_.size();

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";

    // umask() is process-wide: restore the previous mask right after the
    // temporary file has been created.
    const mode_t previous_umask = ::umask(S_IWGRP | S_IWOTH);
    const int fd = ::mkstemp(&tmp_filepath[0]);
    const int mkstemp_errno = errno;
    ::umask(previous_umask);

    if (fd < 0) {
        // Only reported on failure; errno has no meaning after a successful call.
        qDebug() << "mkstemp error:" << mkstemp_errno << tmp_filepath.data();
        return;
    }

    ::fchmod(fd, 0644);

    // write() may transfer fewer bytes than requested or be interrupted;
    // loop until the whole buffer is written or a real error occurs.
    const auto write_all = [fd](const void * data, size_t size) -> bool {
        const char * cursor = static_cast<const char *>(data);

        while (size > 0) {
            const ssize_t written = ::write(fd, cursor, size);

            if (written < 0 && EINTR == errno) {
                continue;
            }

            if (written <= 0) {
                return false;
            }

            cursor += written;
            size -= static_cast<size_t>(written);
        }

        return true;
    };

    bool ok = write_all(&header, sizeof(header))
              && write_all(mem_elem_vec.data(), sizeof(double) * mem_elem_vec.size())
              && write_all(dat_.array(), dat_.total_size());

    if (0 != ::close(fd)) {
        ok = false;
    }

    if (!ok) {
        qDebug() << "idf cache not written: write failed for" << tmp_filepath.data();
        ::unlink(tmp_filepath.c_str());
        return;
    }

    if (0 != ::rename(tmp_filepath.c_str(), dat_cache_file.c_str())) {
        qDebug() << "idf cache not written: rename failed for" << tmp_filepath.data();
        ::unlink(tmp_filepath.c_str());
    }
}
|
||||
|
||||
DatTrie(const DatTrie &);
|
||||
DatTrie &operator=(const DatTrie &);
|
||||
|
||||
private:
|
||||
JiebaDAT dat_;
|
||||
const DatMemElem * elements_ptr_ = nullptr;
|
||||
const double * idf_elements_ptr_= nullptr;
|
||||
size_t elements_num_ = 0;
|
||||
double min_weight_ = 0;
|
||||
|
||||
|
|
|
@ -130,7 +130,7 @@ private:
|
|||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
}
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########path:" << path;
|
||||
qDebug() << "#########Dict path:" << path;
|
||||
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
||||
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
||||
total_dict_size_ = file_size_sum;
|
||||
|
|
|
@ -138,10 +138,10 @@ private:
|
|||
size_t now, old, stat;
|
||||
double tmp, endE, endS;
|
||||
|
||||
//vector<int> path(XYSize);
|
||||
//vector<double> weight(XYSize);
|
||||
int path[XYSize];
|
||||
double weight[XYSize];
|
||||
vector<int> path(XYSize);
|
||||
vector<double> weight(XYSize);
|
||||
//int path[XYSize];
|
||||
//double weight[XYSize];
|
||||
|
||||
//start
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
|
|
|
@ -0,0 +1,134 @@
|
|||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "DatTrie.hpp"
|
||||
#include <QDebug>
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
|
||||
const size_t IDF_COLUMN_NUM = 2;
|
||||
|
||||
class IdfTrie {
|
||||
public:
|
||||
enum UserWordWeightOption {
|
||||
WordWeightMin,
|
||||
WordWeightMedian,
|
||||
WordWeightMax,
|
||||
}; // enum UserWordWeightOption
|
||||
|
||||
IdfTrie(const string& dict_path, const string & dat_cache_path = "",
|
||||
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||
Init(dict_path, dat_cache_path, user_word_weight_opt);
|
||||
}
|
||||
|
||||
~IdfTrie() {}
|
||||
|
||||
double Find(const string & word, std::size_t length = 0, std::size_t node_pos = 0) const {
|
||||
return dat_.Find(word, length, node_pos);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
dat_.Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
||||
return IsIn(user_dict_single_chinese_word_, word);
|
||||
}
|
||||
|
||||
double GetMinWeight() const {
|
||||
return dat_.GetMinWeight();
|
||||
}
|
||||
|
||||
size_t GetTotalDictSize() const {
|
||||
return total_dict_size_;
|
||||
}
|
||||
|
||||
private:
|
||||
void Init(const string& dict_path, string dat_cache_path,
|
||||
UserWordWeightOption user_word_weight_opt) {
|
||||
size_t file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
}
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########Idf path:" << path;
|
||||
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
|
||||
total_dict_size_ = file_size_sum;
|
||||
return;
|
||||
}
|
||||
|
||||
LoadDefaultIdf(dict_path);
|
||||
double idf_sum_ = CalcIdfSum(static_node_infos_);
|
||||
assert(static_node_infos_.size());
|
||||
idfAverage_ = idf_sum_ / static_node_infos_.size();
|
||||
assert(idfAverage_ > 0.0);
|
||||
double min_weight = 0;
|
||||
dat_.SetMinWeight(min_weight);
|
||||
|
||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
total_dict_size_ = file_size_sum;
|
||||
vector<IdfElement>().swap(static_node_infos_);
|
||||
}
|
||||
|
||||
void LoadDefaultIdf(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(not ifs.is_open()){
|
||||
return ;
|
||||
}
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
||||
string line;
|
||||
vector<string> buf;
|
||||
size_t lineno = 0;
|
||||
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
if (line.empty()) {
|
||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
Split(line, buf, " ");
|
||||
XCHECK(buf.size() == IDF_COLUMN_NUM) << "split result illegal, line:" << line;
|
||||
IdfElement node_info;
|
||||
node_info.word = buf[0];
|
||||
node_info.idf = atof(buf[1].c_str());
|
||||
static_node_infos_.push_back(node_info);
|
||||
}
|
||||
}
|
||||
|
||||
double CalcIdfSum(const vector<IdfElement>& node_infos) const {
|
||||
double sum = 0.0;
|
||||
|
||||
for (size_t i = 0; i < node_infos.size(); i++) {
|
||||
sum += node_infos[i].idf;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
public:
|
||||
double idfAverage_;
|
||||
private:
|
||||
vector<IdfElement> static_node_infos_;
|
||||
size_t total_dict_size_ = 0;
|
||||
DatTrie dat_;
|
||||
unordered_set<Rune> user_dict_single_chinese_word_;
|
||||
};
|
||||
}
|
||||
|
|
@ -21,7 +21,7 @@ public:
|
|||
mix_seg_(&dict_trie_, &model_, stopWordPath),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_, stopWordPath),
|
||||
extractor(&dict_trie_, &model_, idfPath, stopWordPath){ }
|
||||
extractor(&dict_trie_, &model_, idfPath, dat_cache_path,stopWordPath){ }
|
||||
~Jieba() { }
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include <cmath>
|
||||
#include "MixSegment.hpp"
|
||||
#include "IdfTrie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -11,18 +12,14 @@ using namespace std;
|
|||
/*utf8*/
|
||||
class KeywordExtractor {
|
||||
public:
|
||||
// struct Word {
|
||||
// string word;
|
||||
// vector<size_t> offsets;
|
||||
// double weight;
|
||||
// }; // struct Word
|
||||
|
||||
KeywordExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& idfPath,
|
||||
const string& dat_cache_path,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model, stopWordPath) {
|
||||
LoadIdfDict(idfPath);
|
||||
: segment_(dictTrie, model, stopWordPath),
|
||||
idf_trie_(idfPath,dat_cache_path){
|
||||
}
|
||||
~KeywordExtractor() {
|
||||
}
|
||||
|
@ -63,12 +60,11 @@ public:
|
|||
keywords.reserve(wordmap.size());
|
||||
|
||||
for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找
|
||||
|
||||
if (cit != idfMap_.end()) {
|
||||
itr->second.weight *= cit->second;
|
||||
double idf = idf_trie_.Find(itr->first);
|
||||
if (-1 != idf) {//IDF词典查找
|
||||
itr->second.weight *= idf;
|
||||
} else {
|
||||
itr->second.weight *= idfAverage_;
|
||||
itr->second.weight *= idf_trie_.idfAverage_;
|
||||
}
|
||||
|
||||
itr->second.word = itr->first;
|
||||
|
@ -80,51 +76,13 @@ public:
|
|||
keywords.resize(topN);
|
||||
}
|
||||
private:
|
||||
void LoadIdfDict(const string& idfPath) {
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(not ifs.is_open()){
|
||||
return ;
|
||||
}
|
||||
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
double idf = 0.0;
|
||||
double idfSum = 0.0;
|
||||
size_t lineno = 0;
|
||||
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
buf.clear();
|
||||
|
||||
if (line.empty()) {
|
||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
|
||||
Split(line, buf, " ");
|
||||
|
||||
if (buf.size() != 2) {
|
||||
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
|
||||
idf = atof(buf[1].c_str());
|
||||
idfMap_[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
||||
}
|
||||
|
||||
assert(lineno);
|
||||
idfAverage_ = idfSum / lineno;
|
||||
assert(idfAverage_ > 0.0);
|
||||
}
|
||||
|
||||
static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
|
||||
return lhs.weight > rhs.weight;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
unordered_map<string, double> idfMap_;
|
||||
double idfAverage_;
|
||||
IdfTrie idf_trie_;
|
||||
|
||||
unordered_set<Rune> symbols_;
|
||||
}; // class KeywordExtractor
|
||||
|
|
|
@ -156,8 +156,9 @@ public:
|
|||
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||
size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里(i字符不是最后一个字符),直接判定j字符
|
||||
|
||||
while (j < (words.size() - 1) && words[j].left == words[j].right &&
|
||||
!mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
while (j < (words.size() - 1)
|
||||
&& words[j].left == words[j].right
|
||||
&& !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
j++;
|
||||
}
|
||||
|
||||
|
|
|
@ -71,7 +71,7 @@ public:
|
|||
cursor_ ++;
|
||||
}
|
||||
}
|
||||
|
||||
int num = 0;
|
||||
while (cursor_ != sentence_.end()) {
|
||||
if (cursor_->rune == 0x20) {
|
||||
if (wordRange.left == cursor_) {
|
||||
|
@ -83,6 +83,11 @@ public:
|
|||
}
|
||||
|
||||
cursor_ ++;
|
||||
num++;
|
||||
if (num >= 1024) { //todo 防止一次性传入过多字节,暂定限制为1024个字
|
||||
wordRange.right = cursor_;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
wordRange.right = sentence_.end();
|
||||
|
|
|
@ -97,24 +97,6 @@ inline RuneArray DecodeRunesInString(const string& s) {
|
|||
|
||||
//重写DecodeRunesInString函数,将实现放入函数中降低内存占用加快处理流程--jxx20210518
|
||||
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
||||
/*
|
||||
RuneArray arr;
|
||||
|
||||
if (not DecodeRunesInString(s, arr)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
runes.clear();
|
||||
|
||||
uint32_t offset = 0;
|
||||
|
||||
for (uint32_t i = 0; i < arr.size(); ++i) {
|
||||
const uint32_t len = limonp::UnicodeToUtf8Bytes(arr[i]);
|
||||
RuneInfo x(arr[i], offset, len, i, 1);
|
||||
runes.push_back(x);
|
||||
offset += len;
|
||||
}
|
||||
*/
|
||||
|
||||
uint32_t tmp;
|
||||
uint32_t offset = 0;
|
||||
|
|
|
@ -2,6 +2,7 @@ INCLUDEPATH += $$PWD
|
|||
|
||||
HEADERS += \
|
||||
$$PWD/DictTrie.hpp \
|
||||
$$PWD/IdfTrie.hpp \
|
||||
$$PWD/FullSegment.hpp \
|
||||
$$PWD/HMMModel.hpp \
|
||||
$$PWD/HMMSegment.hpp \
|
||||
|
@ -17,5 +18,4 @@ HEADERS += \
|
|||
$$PWD/TextRankExtractor.hpp \
|
||||
$$PWD/Trie.hpp \
|
||||
$$PWD/Unicode.hpp
|
||||
|
||||
include(limonp/limonp.pri)
|
||||
|
|
|
@ -19,6 +19,8 @@ DEFINES += QT_DEPRECATED_WARNINGS
|
|||
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
|
||||
include(cppjieba/cppjieba.pri)
|
||||
|
||||
#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
|
||||
|
||||
SOURCES += \
|
||||
chinese-segmentation.cpp \
|
||||
|
||||
|
|
|
@ -108,12 +108,14 @@ void ConstructDocumentForContent::run() {
|
|||
FileReader::getTextContent(m_path, content);
|
||||
if(content.isEmpty())
|
||||
return;
|
||||
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
||||
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
//QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
||||
//QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
Document doc;
|
||||
doc.setData(content);
|
||||
doc.setUniqueTerm(uniqueterm);
|
||||
doc.addTerm(upTerm);
|
||||
//doc.setUniqueTerm(uniqueterm);
|
||||
doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
|
||||
//doc.addTerm(upTerm);
|
||||
doc.addTerm(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
doc.addValue(m_path);
|
||||
|
||||
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||
|
@ -131,6 +133,7 @@ void ConstructDocumentForContent::run() {
|
|||
IndexGenerator::_mutex_doc_list_content.unlock();
|
||||
content.clear();
|
||||
content.squeeze();
|
||||
|
||||
term.clear();
|
||||
term.shrink_to_fit();
|
||||
return;
|
||||
|
|
|
@ -37,7 +37,7 @@ void Document::addPosting(std::string term, QVector<size_t> offset, int weight)
|
|||
}
|
||||
}
|
||||
|
||||
void Document::addPosting(std::string term, std::vector<size_t> offset, int weight) {
|
||||
void Document::addPosting(std::string &term, std::vector<size_t> &offset, int weight) {
|
||||
if(term == "")
|
||||
return;
|
||||
if(term.length() > 240)
|
||||
|
@ -63,6 +63,12 @@ void Document::addTerm(QString term) {
|
|||
m_document.add_term(term.toStdString());
|
||||
}
|
||||
|
||||
/**
 * Adds `term` to the underlying Xapian document; an empty term is ignored.
 */
void Document::addTerm(std::string term) {
    if(!term.empty()) {
        m_document.add_term(term);
    }
}
|
||||
|
||||
/**
 * Stores `value`, converted to std::string, in value slot 1 of the
 * underlying Xapian document.
 */
void Document::addValue(QString value) {
    const std::string converted = value.toStdString();
    m_document.add_value(1, converted);
}
|
||||
|
@ -73,12 +79,20 @@ void Document::setUniqueTerm(QString term) {
|
|||
m_document.add_term(term.toStdString());
|
||||
|
||||
// m_unique_term = new QString(term);
|
||||
m_unique_term = std::move(term);
|
||||
m_unique_term = std::move(term.toStdString());
|
||||
}
|
||||
|
||||
/**
 * Adds `term` to the underlying Xapian document and remembers it as this
 * document's unique term. An empty term is ignored.
 */
void Document::setUniqueTerm(std::string term) {
    if(term.empty())
        return;
    m_document.add_term(term);
    // `term` is this call's own copy and is not used afterwards: move it into
    // the member instead of copying the string a second time.
    m_unique_term = std::move(term);
}
|
||||
|
||||
std::string Document::getUniqueTerm() {
|
||||
// qDebug()<<"m_unique_term!"<<*m_unique_term;
|
||||
// qDebug() << QString::fromStdString(m_unique_term.toStdString());
|
||||
return m_unique_term.toStdString();
|
||||
return m_unique_term;//.toStdString();
|
||||
}
|
||||
|
||||
void Document::setIndexText(QStringList indexText) {
|
||||
|
|
|
@ -41,11 +41,13 @@ public:
|
|||
}
|
||||
void setData(QString &data);
|
||||
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
|
||||
void addPosting(std::string term, std::vector<size_t> offset, int weight = 1);
|
||||
void addPosting(std::string &term, std::vector<size_t> &offset, int weight = 1);
|
||||
void addPosting(std::string term, unsigned int offset, int weight = 1);
|
||||
void addTerm(QString term);
|
||||
void addTerm(std::string term);
|
||||
void addValue(QString value);
|
||||
void setUniqueTerm(QString term);
|
||||
void setUniqueTerm(std::string term);
|
||||
std::string getUniqueTerm();
|
||||
void setIndexText(QStringList indexText);
|
||||
QStringList getIndexText();
|
||||
|
@ -53,7 +55,8 @@ public:
|
|||
private:
|
||||
Xapian::Document m_document;
|
||||
QStringList m_index_text;
|
||||
QString m_unique_term;
|
||||
//QString m_unique_term;
|
||||
std::string m_unique_term;
|
||||
|
||||
};
|
||||
}
|
||||
|
|
|
@ -154,7 +154,6 @@ void FirstIndex::run() {
|
|||
|
||||
|
||||
++FileUtils::_index_status;
|
||||
|
||||
pid_t pid;
|
||||
pid = fork();
|
||||
if(pid == 0) {
|
||||
|
@ -235,6 +234,7 @@ void FirstIndex::run() {
|
|||
qDebug() << "content index end;";
|
||||
sem.release(2);
|
||||
});
|
||||
|
||||
mutex1.lock();
|
||||
mutex2.lock();
|
||||
mutex3.lock();
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
#include "index-generator.h"
|
||||
#include "chinese-segmentation.h"
|
||||
#include <QStandardPaths>
|
||||
|
||||
#include <malloc.h>
|
||||
|
||||
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
|
||||
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
|
||||
|
@ -127,11 +127,11 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
|
|||
// GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
|
||||
// FileUtils::_index_status &= ~0x2;
|
||||
qDebug() << "finish creatAllIndex for content";
|
||||
|
||||
IndexGenerator::_doc_list_content.clear();
|
||||
IndexGenerator::_doc_list_content.squeeze();
|
||||
QVector<Document>().swap(IndexGenerator::_doc_list_content);
|
||||
// delete _doc_list_content;
|
||||
// _doc_list_content = nullptr;
|
||||
malloc_trim(0);
|
||||
}
|
||||
Q_EMIT this->transactionFinished();
|
||||
return true;
|
||||
|
|
|
@ -33,7 +33,7 @@ include(plugininterface/plugin-interface.pri)
|
|||
include(pluginmanage/plugin-manager.pri)
|
||||
|
||||
LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
|
||||
LIBS += -lxapian -lquazip5 -luchardet
|
||||
LIBS += -lxapian -lquazip5 -luchardet #-L/usr/local/lib/libjemalloc -ljemalloc
|
||||
|
||||
SOURCES += \
|
||||
file-utils.cpp \
|
||||
|
|
|
@ -9,7 +9,7 @@ TEMPLATE = app
|
|||
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
|
||||
CONFIG += c++11 link_pkgconfig no_keywords lrelease
|
||||
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
|
||||
LIBS += -lukui-log4qt
|
||||
#LIBS += -lukui-log4qt -L/usr/local/lib/libjemalloc -ljemalloc
|
||||
# The following define makes your compiler emit warnings if you use
|
||||
# any Qt feature that has been marked deprecated (the exact warnings
|
||||
# depend on your compiler). Please consult the documentation of the
|
||||
|
|
|
@ -19,3 +19,4 @@ src.depends = libsearch
|
|||
|
||||
CONFIG += ordered
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue