Merge pull request #250 from ukui/main

merge from main
This commit is contained in:
iaom 2021-05-31 18:47:43 +08:00 committed by GitHub
commit 1117d75025
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
49 changed files with 4347 additions and 2773 deletions

View File

@ -30,12 +30,12 @@ ChineseSegmentation::ChineseSegmentation() {
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
m_jieba = new cppjieba::Jieba(DICT_PATH,
HMM_PATH,
USER_DICT_PATH,
IDF_PATH,
STOP_WORD_PATH);
STOP_WORD_PATH,
"");
}
ChineseSegmentation::~ChineseSegmentation() {
@ -72,6 +72,15 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
}
std::vector<cppjieba::KeywordExtractor::Word> ChineseSegmentation::callSegementStd(const std::string &str) {
const size_t topk = -1;
std::vector<cppjieba::KeywordExtractor::Word> keywordres;
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
return keywordres;
}
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
for(auto i : keywordres) {
SKeyWord temp;

View File

@ -48,6 +48,9 @@ public:
static ChineseSegmentation *getInstance();
~ChineseSegmentation();
QVector<SKeyWord> callSegement(std::string s);
//新添加callSegementStd函数修改返回值为stdvector<cppjieba::KeywordExtractor::Word>并简化内部处理流程--jxx20210517
//修改函数入参形式为引用去掉Qstring与std::string转换代码--jxx20210519
std::vector<cppjieba::KeywordExtractor::Word> callSegementStd(const std::string& str);
void convert(std::vector<cppjieba::KeywordExtractor::Word>& keywordres, QVector<SKeyWord>& kw);
private:
static QMutex m_mutex;

View File

@ -0,0 +1,286 @@
#pragma once
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <QDebug>
#include <algorithm>
#include <utility>
#include "limonp/Md5.hpp"
#include "Unicode.hpp"
#include "darts.h"
namespace cppjieba {
using std::pair;
struct DatElement {
string word;
string tag;
double weight = 0;
bool operator < (const DatElement & b) const {
if (word == b.word) {
return this->weight > b.weight;
}
return this->word < b.word;
}
};
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
}
struct DatMemElem {
double weight = 0.0;
char tag[8] = {};
void SetTag(const string & str) {
memset(&tag[0], 0, sizeof(tag));
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
}
string GetTag() const {
return &tag[0];
}
};
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
}
struct DatDag {
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
double max_weight;
int max_next;
};
typedef Darts::DoubleArray JiebaDAT;
struct CacheFileHeader {
char md5_hex[32] = {};
double min_weight = 0;
uint32_t elements_num = 0;
uint32_t dat_size = 0;
};
static_assert(sizeof(DatMemElem) == 16, "DatMemElem length invalid");
static_assert((sizeof(CacheFileHeader) % sizeof(DatMemElem)) == 0, "DatMemElem CacheFileHeader length equal");
class DatTrie {
public:
DatTrie() {}
~DatTrie() {
::munmap(mmap_addr_, mmap_length_);
mmap_addr_ = nullptr;
mmap_length_ = 0;
::close(mmap_fd_);
mmap_fd_ = -1;
}
const DatMemElem * Find(const string & key) const {
JiebaDAT::result_pair_type find_result;
dat_.exactMatchSearch(key.c_str(), find_result);
if ((0 == find_result.length) || (find_result.value < 0) || (find_result.value >= elements_num_)) {
return nullptr;
}
return &elements_ptr_[ find_result.value ];
}
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<struct DatDag>&res, size_t max_word_len) const {
res.clear();
res.resize(end - begin);
string text_str;
EncodeRunesToString(begin, end, text_str);
static const size_t max_num = 128;
JiebaDAT::result_pair_type result_pairs[max_num] = {};
for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
for (std::size_t idx = 0; idx < num_results; ++idx) {
auto & match = result_pairs[idx];
if ((match.value < 0) || (match.value >= elements_num_)) {
continue;
}
auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
if (char_num > max_word_len) {
continue;
}
auto pValue = &elements_ptr_[match.value];
if (1 == char_num) {
res[i].nexts[0].second = pValue;
continue;
}
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
}
begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
}
}
double GetMinWeight() const {
return min_weight_;
}
void SetMinWeight(double d) {
min_weight_ = d ;
}
bool InitBuildDat(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
BuildDatCache(elements, dat_cache_file, md5);
return InitAttachDat(dat_cache_file, md5);
}
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
if (mmap_fd_ < 0) {
return false;
}
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
assert(seek_off >= 0);
mmap_length_ = seek_off;
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
assert(MAP_FAILED != mmap_addr_);
assert(mmap_length_ >= sizeof(CacheFileHeader));
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
elements_num_ = header.elements_num;
min_weight_ = header.min_weight;
assert(sizeof(header.md5_hex) == md5.size());
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
return false;
}
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(DatMemElem) + header.dat_size * dat_.unit_size());
elements_ptr_ = (const DatMemElem *)(mmap_addr_ + sizeof(header));
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
dat_.set_array(dat_ptr, header.dat_size);
return true;
}
private:
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
std::sort(elements.begin(), elements.end());
vector<const char*> keys_ptr_vec;
vector<int> values_vec;
vector<DatMemElem> mem_elem_vec;
keys_ptr_vec.reserve(elements.size());
values_vec.reserve(elements.size());
mem_elem_vec.reserve(elements.size());
CacheFileHeader header;
header.min_weight = min_weight_;
assert(sizeof(header.md5_hex) == md5.size());
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
for (size_t i = 0; i < elements.size(); ++i) {
keys_ptr_vec.push_back(elements[i].word.data());
values_vec.push_back(i);
mem_elem_vec.push_back(DatMemElem());
auto & mem_elem = mem_elem_vec.back();
mem_elem.weight = elements[i].weight;
mem_elem.SetTag(elements[i].tag);
}
auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
assert(0 == ret);
header.elements_num = mem_elem_vec.size();
header.dat_size = dat_.size();
{
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
::umask(S_IWGRP | S_IWOTH);
//const int fd =::mkstemp(&tmp_filepath[0]);
//原mkstemp用法有误已修复--jxx20210519
const int fd =::mkstemp((char *)tmp_filepath.data());
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
assert(fd >= 0);
::fchmod(fd, 0644);
auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(mem_elem_vec[0]) * mem_elem_vec.size());
write_bytes += ::write(fd, dat_.array(), dat_.total_size());
assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(mem_elem_vec[0]) + dat_.total_size());
::close(fd);
const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
assert(0 == rename_ret);
}
}
DatTrie(const DatTrie &);
DatTrie &operator=(const DatTrie &);
private:
JiebaDAT dat_;
const DatMemElem * elements_ptr_ = nullptr;
size_t elements_num_ = 0;
double min_weight_ = 0;
int mmap_fd_ = -1;
size_t mmap_length_ = 0;
char * mmap_addr_ = nullptr;
};
inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum) {
limonp::MD5 md5;
const auto files = limonp::Split(files_list, "|;");
file_size_sum = 0;
for (auto const & local_path : files) {
const int fd = ::open(local_path.c_str(), O_RDONLY);
if( fd < 0){
continue;
}
auto const len = ::lseek(fd, 0, SEEK_END);
if (len > 0) {
void * addr = ::mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
assert(MAP_FAILED != addr);
md5.Update((unsigned char *) addr, len);
file_size_sum += len;
::munmap(addr, len);
}
::close(fd);
}
md5.Final();
return string(md5.digestChars);
}
}

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_DICT_TRIE_HPP
#define CPPJIEBA_DICT_TRIE_HPP
#pragma once
#include <iostream>
#include <fstream>
@ -31,8 +12,8 @@
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "Trie.hpp"
#include "DatTrie.hpp"
#include <QDebug>
namespace cppjieba {
using namespace limonp;
@ -50,58 +31,22 @@ public:
WordWeightMax,
}; // enum UserWordWeightOption
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
Init(dict_path, user_dict_paths, user_word_weight_opt);
DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
}
~DictTrie() {
delete trie_;
}
~DictTrie() {}
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
DictUnit node_info;
if(!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
return false;
}
active_node_infos_.push_back(node_info);
trie_->InsertNode(node_info.word, &active_node_infos_.back());
return true;
}
bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
DictUnit node_info;
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
if(!MakeNodeInfo(node_info, word, weight, tag)) {
return false;
}
active_node_infos_.push_back(node_info);
trie_->InsertNode(node_info.word, &active_node_infos_.back());
return true;
}
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
return trie_->Find(begin, end);
const DatMemElem* Find(const string & word) const {
return dat_.Find(word);
}
void Find(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<struct Dag>&res,
vector<struct DatDag>&res,
size_t max_word_len = MAX_WORD_LENGTH) const {
trie_->Find(begin, end, res, max_word_len);
}
bool Find(const string& word) {
const DictUnit *tmp = NULL;
RuneStrArray runes;
if(!DecodeRunesInString(word, runes)) {
XLOG(ERROR) << "Decode failed.";
}
tmp = Find(runes.begin(), runes.end());
if(tmp == NULL) {
return false;
} else {
return true;
}
dat_.Find(begin, end, res, max_word_len);
}
bool IsUserDictSingleChineseWord(const Rune& word) const {
@ -109,182 +54,176 @@ public:
}
double GetMinWeight() const {
return min_weight_;
return dat_.GetMinWeight();
}
void InserUserDictNode(const string& line) {
size_t GetTotalDictSize() const {
return total_dict_size_;
}
void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
vector<string> buf;
DictUnit node_info;
DatElement node_info;
Split(line, buf, " ");
if(buf.size() == 1) {
MakeNodeInfo(node_info,
buf[0],
user_word_default_weight_,
UNKNOWN_TAG);
} else if(buf.size() == 2) {
MakeNodeInfo(node_info,
buf[0],
user_word_default_weight_,
buf[1]);
} else if(buf.size() == 3) {
int freq = atoi(buf[1].c_str());
assert(freq_sum_ > 0.0);
double weight = log(1.0 * freq / freq_sum_);
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
if (buf.size() == 0) {
return;
}
static_node_infos_.push_back(node_info);
if(node_info.word.size() == 1) {
user_dict_single_chinese_word_.insert(node_info.word[0]);
node_info.word = buf[0];
node_info.weight = user_word_default_weight_;
node_info.tag = UNKNOWN_TAG;
if (buf.size() == 2) {
node_info.tag = buf[1];
} else if (buf.size() == 3) {
if (freq_sum_ > 0.0) {
const int freq = atoi(buf[1].c_str());
node_info.weight = log(1.0 * freq / freq_sum_);
node_info.tag = buf[2];
}
}
if (saveNodeInfo) {
static_node_infos_.push_back(node_info);
}
if (Utf8CharNum(node_info.word) == 1) {
RuneArray word;
if (DecodeRunesInString(node_info.word, word)) {
user_dict_single_chinese_word_.insert(word[0]);
} else {
XLOG(ERROR) << "Decode " << node_info.word << " failed.";
}
}
}
void LoadUserDict(const vector<string>& buf) {
for(size_t i = 0; i < buf.size(); i++) {
InserUserDictNode(buf[i]);
}
}
void LoadUserDict(const set<string>& buf) {
std::set<string>::const_iterator iter;
for(iter = buf.begin(); iter != buf.end(); iter++) {
InserUserDictNode(*iter);
}
}
void LoadUserDict(const string& filePaths) {
void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
vector<string> files = limonp::Split(filePaths, "|;");
size_t lineno = 0;
for(size_t i = 0; i < files.size(); i++) {
for (size_t i = 0; i < files.size(); i++) {
ifstream ifs(files[i].c_str());
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
string line;
for(; getline(ifs, line); lineno++) {
if(line.size() == 0) {
for (; getline(ifs, line);) {
if (line.size() == 0) {
continue;
}
InserUserDictNode(line);
InserUserDictNode(line, saveNodeInfo);
}
}
}
private:
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
LoadDict(dict_path);
void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
UserWordWeightOption user_word_weight_opt) {
const auto dict_list = dict_path + "|" + user_dict_paths;
size_t file_size_sum = 0;
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
}
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########path:" << path;
if (dat_.InitAttachDat(dat_cache_path, md5)) {
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
total_dict_size_ = file_size_sum;
return;
}
LoadDefaultDict(dict_path);
freq_sum_ = CalcFreqSum(static_node_infos_);
CalculateWeight(static_node_infos_, freq_sum_);
SetStaticWordWeights(user_word_weight_opt);
double min_weight = 0;
SetStaticWordWeights(user_word_weight_opt, min_weight);
dat_.SetMinWeight(min_weight);
if(user_dict_paths.size()) {
LoadUserDict(user_dict_paths);
}
Shrink(static_node_infos_);
CreateTrie(static_node_infos_);
LoadUserDict(user_dict_paths);
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
assert(build_ret);
total_dict_size_ = file_size_sum;
vector<DatElement>().swap(static_node_infos_);
}
void CreateTrie(const vector<DictUnit>& dictUnits) {
assert(dictUnits.size());
vector<Unicode> words;
vector<const DictUnit*> valuePointers;
for(size_t i = 0 ; i < dictUnits.size(); i ++) {
words.push_back(dictUnits[i].word);
valuePointers.push_back(&dictUnits[i]);
}
trie_ = new Trie(words, valuePointers);
}
bool MakeNodeInfo(DictUnit& node_info,
const string& word,
double weight,
const string& tag) {
if(!DecodeRunesInString(word, node_info.word)) {
XLOG(ERROR) << "Decode " << word << " failed.";
return false;
}
node_info.weight = weight;
node_info.tag = tag;
return true;
}
void LoadDict(const string& filePath) {
void LoadDefaultDict(const string& filePath) {
ifstream ifs(filePath.c_str());
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
string line;
vector<string> buf;
DictUnit node_info;
for(size_t lineno = 0; getline(ifs, line); lineno++) {
for (; getline(ifs, line);) {
Split(line, buf, " ");
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
MakeNodeInfo(node_info,
buf[0],
atof(buf[1].c_str()),
buf[2]);
DatElement node_info;
node_info.word = buf[0];
node_info.weight = atof(buf[1].c_str());
node_info.tag = buf[2];
static_node_infos_.push_back(node_info);
}
}
static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
return lhs.weight < rhs.weight;
}
void SetStaticWordWeights(UserWordWeightOption option) {
void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
XCHECK(!static_node_infos_.empty());
vector<DictUnit> x = static_node_infos_;
vector<DatElement> x = static_node_infos_;
sort(x.begin(), x.end(), WeightCompare);
min_weight_ = x[0].weight;
max_weight_ = x[x.size() - 1].weight;
median_weight_ = x[x.size() / 2].weight;
switch(option) {
case WordWeightMin:
user_word_default_weight_ = min_weight_;
break;
case WordWeightMedian:
user_word_default_weight_ = median_weight_;
break;
default:
user_word_default_weight_ = max_weight_;
break;
if(x.empty()){
return;
}
min_weight = x[0].weight;
const double max_weight_ = x[x.size() - 1].weight;
const double median_weight_ = x[x.size() / 2].weight;
switch (option) {
case WordWeightMin:
user_word_default_weight_ = min_weight;
break;
case WordWeightMedian:
user_word_default_weight_ = median_weight_;
break;
default:
user_word_default_weight_ = max_weight_;
break;
}
}
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
double CalcFreqSum(const vector<DatElement>& node_infos) const {
double sum = 0.0;
for(size_t i = 0; i < node_infos.size(); i++) {
for (size_t i = 0; i < node_infos.size(); i++) {
sum += node_infos[i].weight;
}
return sum;
}
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
assert(sum > 0.0);
for(size_t i = 0; i < node_infos.size(); i++) {
DictUnit& node_info = node_infos[i];
void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
for (size_t i = 0; i < node_infos.size(); i++) {
DatElement& node_info = node_infos[i];
assert(node_info.weight > 0.0);
node_info.weight = log(double(node_info.weight) / sum);
}
}
void Shrink(vector<DictUnit>& units) const {
vector<DictUnit>(units.begin(), units.end()).swap(units);
}
vector<DictUnit> static_node_infos_;
deque<DictUnit> active_node_infos_; // must not be vector
Trie * trie_;
private:
vector<DatElement> static_node_infos_;
size_t total_dict_size_ = 0;
DatTrie dat_;
double freq_sum_;
double min_weight_;
double max_weight_;
double median_weight_;
double user_word_default_weight_;
unordered_set<Rune> user_dict_single_chinese_word_;
};
}
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_FULLSEGMENT_H
#define CPPJIEBA_FULLSEGMENT_H
#pragma once
#include <algorithm>
#include <set>
@ -30,82 +11,45 @@
namespace cppjieba {
class FullSegment: public SegmentBase {
public:
FullSegment(const string& dictPath) {
dictTrie_ = new DictTrie(dictPath);
isNeedDestroy_ = true;
}
FullSegment(const DictTrie* dictTrie)
: dictTrie_(dictTrie), isNeedDestroy_(false) {
: dictTrie_(dictTrie) {
assert(dictTrie_);
}
~FullSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
}
void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence,
vector<Word>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while(pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& res) const {
// result of searching in trie tree
LocalVector<pair<size_t, const DictUnit*> > tRes;
~FullSegment() { }
// max index of res's words
size_t maxIdx = 0;
// always equals to (uItr - begin)
size_t uIdx = 0;
// tmp variables
size_t wordLen = 0;
virtual void Cut(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& res, bool, size_t) const override {
assert(dictTrie_);
vector<struct Dag> dags;
vector<struct DatDag> dags;
dictTrie_->Find(begin, end, dags);
for(size_t i = 0; i < dags.size(); i++) {
for(size_t j = 0; j < dags[i].nexts.size(); j++) {
size_t nextoffset = dags[i].nexts[j].first;
size_t max_word_end_pos = 0;
for (size_t i = 0; i < dags.size(); i++) {
for (const auto & kv : dags[i].nexts) {
const size_t nextoffset = kv.first - 1;
assert(nextoffset < dags.size());
const DictUnit* du = dags[i].nexts[j].second;
if(du == NULL) {
if(dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
WordRange wr(begin + i, begin + nextoffset);
res.push_back(wr);
}
} else {
wordLen = du->word.size();
if(wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
WordRange wr(begin + i, begin + nextoffset);
res.push_back(wr);
}
const auto wordLen = nextoffset - i + 1;
const bool is_not_covered_single_word = ((dags[i].nexts.size() == 1) && (max_word_end_pos <= i));
const bool is_oov = (nullptr == kv.second); //Out-of-Vocabulary
if ((is_not_covered_single_word) || ((not is_oov) && (wordLen >= 2))) {
WordRange wr(begin + i, begin + nextoffset);
res.push_back(wr);
}
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
max_word_end_pos = max(max_word_end_pos, nextoffset + 1);
}
uIdx++;
}
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
}
private:
const DictTrie* dictTrie_;
bool isNeedDestroy_;
};
}
#endif

View File

@ -1,26 +1,6 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_HMMMODEL_H
#define CPPJIEBA_HMMMODEL_H
#pragma once
#include "limonp/StringUtil.hpp"
#include "Trie.hpp"
namespace cppjieba {
@ -59,16 +39,18 @@ struct HMMModel {
XCHECK(GetLine(ifile, line));
Split(line, tmp, " ");
XCHECK(tmp.size() == STATUS_SUM);
for(size_t j = 0; j < tmp.size(); j++) {
for (size_t j = 0; j < tmp.size(); j++) {
startProb[j] = atof(tmp[j].c_str());
}
//Load transProb
for(size_t i = 0; i < STATUS_SUM; i++) {
for (size_t i = 0; i < STATUS_SUM; i++) {
XCHECK(GetLine(ifile, line));
Split(line, tmp, " ");
XCHECK(tmp.size() == STATUS_SUM);
for(size_t j = 0; j < STATUS_SUM; j++) {
for (size_t j = 0; j < tmp.size(); j++) {
transProb[i][j] = atof(tmp[j].c_str());
}
}
@ -92,43 +74,55 @@ struct HMMModel {
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
double defVal)const {
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) {
if (cit == ptMp->end()) {
return defVal;
}
return cit->second;
}
bool GetLine(ifstream& ifile, string& line) {
while(getline(ifile, line)) {
while (getline(ifile, line)) {
Trim(line);
if(line.empty()) {
if (line.empty()) {
continue;
}
if(StartsWith(line, "#")) {
if (StartsWith(line, "#")) {
continue;
}
return true;
}
return false;
}
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
if(line.empty()) {
if (line.empty()) {
return false;
}
vector<string> tmp, tmp2;
Unicode unicode;
RuneArray unicode;
Split(line, tmp, ",");
for(size_t i = 0; i < tmp.size(); i++) {
for (size_t i = 0; i < tmp.size(); i++) {
Split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) {
if (2 != tmp2.size()) {
XLOG(ERROR) << "emitProb illegal.";
return false;
}
if(!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
XLOG(ERROR) << "TransCode failed.";
return false;
}
mp[unicode[0]] = atof(tmp2[1].c_str());
}
return true;
}
@ -144,4 +138,3 @@ struct HMMModel {
} // namespace cppjieba
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIBEA_HMMSEGMENT_H
#define CPPJIBEA_HMMSEGMENT_H
#pragma once
#include <iostream>
#include <fstream>
@ -29,58 +10,40 @@
namespace cppjieba {
class HMMSegment: public SegmentBase {
public:
HMMSegment(const string& filePath)
: model_(new HMMModel(filePath)), isNeedDestroy_(true) {
}
HMMSegment(const HMMModel* model)
: model_(model), isNeedDestroy_(false) {
}
~HMMSegment() {
if(isNeedDestroy_) {
delete model_;
}
: model_(model) {
}
~HMMSegment() { }
void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence,
vector<Word>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while(pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
size_t) const override {
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right = begin;
while(right != end) {
if(right->rune < 0x80) {
if(left != right) {
while (right != end) {
if (right->rune < 0x80) {
if (left != right) {
InternalCut(left, right, res);
}
left = right;
do {
right = SequentialLetterRule(left, end);
if(right != left) {
if (right != left) {
break;
}
right = NumbersRule(left, end);
if(right != left) {
if (right != left) {
break;
}
right ++;
} while(false);
} while (false);
WordRange wr(left, right - 1);
res.push_back(wr);
left = right;
@ -88,45 +51,61 @@ public:
right++;
}
}
if(left != right) {
if (left != right) {
InternalCut(left, right, res);
}
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
}
private:
// sequential letters rule
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end) const {
Rune x = begin->rune;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
begin ++;
} else {
return begin;
}
while(begin != end) {
while (begin != end) {
x = begin->rune;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
begin ++;
} else {
break;
}
}
return begin;
}
//
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
Rune x = begin->rune;
if('0' <= x && x <= '9') {
if ('0' <= x && x <= '9') {
begin ++;
} else {
return begin;
}
while(begin != end) {
while (begin != end) {
x = begin->rune;
if(('0' <= x && x <= '9') || x == '.') {
if (('0' <= x && x <= '9') || x == '.') {
begin++;
} else {
break;
}
}
return begin;
}
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
@ -135,8 +114,9 @@ private:
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right;
for(size_t i = 0; i < status.size(); i++) {
if(status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
for (size_t i = 0; i < status.size(); i++) {
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
right = begin + i + 1;
WordRange wr(left, right - 1);
res.push_back(wr);
@ -159,23 +139,25 @@ private:
vector<double> weight(XYSize);
//start
for(size_t y = 0; y < Y; y++) {
for (size_t y = 0; y < Y; y++) {
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
path[0 + y * X] = -1;
}
double emitProb;
for(size_t x = 1; x < X; x++) {
for(size_t y = 0; y < Y; y++) {
for (size_t x = 1; x < X; x++) {
for (size_t y = 0; y < Y; y++) {
now = x + y * X;
weight[now] = MIN_DOUBLE;
path[now] = HMMModel::E; // warning
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) {
for (size_t preY = 0; preY < Y; preY++) {
old = x - 1 + preY * X;
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
if(tmp > weight[now]) {
if (tmp > weight[now]) {
weight[now] = tmp;
path[now] = preY;
}
@ -186,23 +168,23 @@ private:
endE = weight[X - 1 + HMMModel::E * X];
endS = weight[X - 1 + HMMModel::S * X];
stat = 0;
if(endE >= endS) {
if (endE >= endS) {
stat = HMMModel::E;
} else {
stat = HMMModel::S;
}
status.resize(X);
for(int x = X - 1 ; x >= 0; x--) {
for (int x = X - 1 ; x >= 0; x--) {
status[x] = stat;
stat = path[x + stat * X];
}
}
const HMMModel* model_;
bool isNeedDestroy_;
}; // class HMMSegment
} // namespace cppjieba
#endif

View File

@ -1,24 +1,6 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEAB_JIEBA_H
#define CPPJIEAB_JIEBA_H
#pragma once
#include <memory>
#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
@ -29,56 +11,48 @@ public:
Jieba(const string& dict_path,
const string& model_path,
const string& user_dict_path,
const string& idfPath,
const string& stopWordPath)
: dict_trie_(dict_path, user_dict_path),
const string& idfPath = "",
const string& stopWordPath = "",
const string& dat_cache_path = "")
: dict_trie_(dict_path, user_dict_path, dat_cache_path),
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
}
~Jieba() {
}
struct LocWord {
string word;
size_t begin;
size_t end;
}; // struct LocWord
extractor(&dict_trie_, &model_, idfPath, stopWordPath){ }
~Jieba() { }
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
mix_seg_.CutToStr(sentence, words, hmm);
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
mix_seg_.CutToWord(sentence, words, hmm);
}
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.Cut(sentence, words);
full_seg_.CutToStr(sentence, words);
}
void CutAll(const string& sentence, vector<Word>& words) const {
full_seg_.Cut(sentence, words);
full_seg_.CutToWord(sentence, words);
}
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
query_seg_.CutToStr(sentence, words, hmm);
}
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
query_seg_.CutToWord(sentence, words, hmm);
}
void CutHMM(const string& sentence, vector<string>& words) const {
hmm_seg_.Cut(sentence, words);
hmm_seg_.CutToStr(sentence, words);
}
void CutHMM(const string& sentence, vector<Word>& words) const {
hmm_seg_.Cut(sentence, words);
hmm_seg_.CutToWord(sentence, words);
}
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
mp_seg_.CutToStr(sentence, words, false, max_word_len);
}
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
mp_seg_.CutToWord(sentence, words, false, max_word_len);
}
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
@ -87,16 +61,8 @@ public:
string LookupTag(const string &str) const {
return mix_seg_.LookupTag(str);
}
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dict_trie_.InsertUserWord(word, tag);
}
bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
return dict_trie_.InsertUserWord(word, freq, tag);
}
bool Find(const string& word) {
return dict_trie_.Find(word);
return nullptr != dict_trie_.Find(word);
}
void ResetSeparators(const string& s) {
@ -116,18 +82,6 @@ public:
return &model_;
}
void LoadUserDict(const vector<string>& buf) {
dict_trie_.LoadUserDict(buf);
}
void LoadUserDict(const set<string>& buf) {
dict_trie_.LoadUserDict(buf);
}
void LoadUserDict(const string& path) {
dict_trie_.LoadUserDict(path);
}
private:
DictTrie dict_trie_;
HMMModel model_;
@ -145,4 +99,3 @@ public:
} // namespace cppjieba
#endif // CPPJIEAB_JIEBA_H

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
#pragma once
#include <cmath>
#include <set>
@ -37,15 +18,6 @@ public:
double weight;
}; // struct Word
KeywordExtractor(const string& dictPath,
const string& hmmFilePath,
const string& idfPath,
const string& stopWordPath,
const string& userDict = "")
: segment_(dictPath, hmmFilePath, userDict) {
LoadIdfDict(idfPath);
LoadStopWordDict(stopWordPath);
}
KeywordExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& idfPath,
@ -60,7 +32,8 @@ public:
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for(size_t i = 0; i < topWords.size(); i++) {
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].word);
}
}
@ -68,43 +41,52 @@ public:
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for(size_t i = 0; i < topWords.size(); i++) {
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
vector<string> words;
segment_.Cut(sentence, words);
segment_.CutToStr(sentence, words);//将字符串string分解为words放入vector
map<string, Word> wordmap;
map<string, Word> wordmap;//插入字符串与Word的map相同string统计词频叠加权重
size_t offset = 0;
for(size_t i = 0; i < words.size(); ++i) {
for (size_t i = 0; i < words.size(); ++i) {
size_t t = offset;
offset += words[i].size();
if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
continue;
}
wordmap[words[i]].offsets.push_back(t);
wordmap[words[i]].weight += 1.0;
}
if(offset != sentence.size()) {
if (offset != sentence.size()) {
XLOG(ERROR) << "words illegal";
return;
}
keywords.clear();
keywords.reserve(wordmap.size());
for(map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
if(cit != idfMap_.end()) {
for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找
if (cit != idfMap_.end()) {
itr->second.weight *= cit->second;
} else {
itr->second.weight *= idfAverage_;
}
itr->second.word = itr->first;
keywords.push_back(itr->second);
}
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
@ -112,23 +94,31 @@ public:
private:
void LoadIdfDict(const string& idfPath) {
ifstream ifs(idfPath.c_str());
if(not ifs.is_open()){
return ;
}
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
string line ;
vector<string> buf;
double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(; getline(ifs, line); lineno++) {
for (; getline(ifs, line); lineno++) {
buf.clear();
if(line.empty()) {
if (line.empty()) {
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
continue;
}
Split(line, buf, " ");
if(buf.size() != 2) {
if (buf.size() != 2) {
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
continue;
}
idf = atof(buf[1].c_str());
idfMap_[buf[0]] = idf;
idfSum += idf;
@ -141,11 +131,16 @@ private:
}
void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str());
if(not ifs.is_open()){
return ;
}
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
string line ;
while(getline(ifs, line)) {
while (getline(ifs, line)) {
stopWords_.insert(line);
}
assert(stopWords_.size());
}
@ -161,11 +156,11 @@ private:
}; // class KeywordExtractor
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
"}";
}
} // namespace cppjieba
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_MPSEGMENT_H
#define CPPJIEBA_MPSEGMENT_H
#pragma once
#include <algorithm>
#include <set>
@ -31,63 +12,32 @@ namespace cppjieba {
class MPSegment: public SegmentTagged {
public:
MPSegment(const string& dictPath, const string& userDictPath = "")
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
}
MPSegment(const DictTrie* dictTrie)
: dictTrie_(dictTrie), isNeedDestroy_(false) {
: dictTrie_(dictTrie) {
assert(dictTrie_);
}
~MPSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
~MPSegment() { }
virtual void Cut(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& words,
bool, size_t max_word_len) const override {
vector<DatDag> dags;
dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
CalcDP(dags);//动态规划Dynamic ProgrammingDP根据DAG计算最优动态规划路径--jxx
CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
}
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, MAX_WORD_LENGTH);
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
}
void Cut(const string& sentence,
vector<string>& words,
size_t max_word_len) const {
vector<Word> tmp;
Cut(sentence, tmp, max_word_len);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence,
vector<Word>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while(pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs, max_word_len);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
vector<Dag> dags;
dictTrie_->Find(begin,
end,
dags,
max_word_len);
CalcDP(dags);
CutByDag(begin, end, dags, words);
}
const DictTrie* GetDictTrie() const {
const DictTrie* GetDictTrie() const override {
return dictTrie_;
}
bool Tag(const string& src, vector<pair<string, string> >& res) const {
bool Tag(const string& src, vector<pair<string, string> >& res) const override {
return tagger_.Tag(src, res, *this);
}
@ -95,61 +45,50 @@ public:
return dictTrie_->IsUserDictSingleChineseWord(value);
}
private:
void CalcDP(vector<Dag>& dags) const {
size_t nextPos;
const DictUnit* p;
double val;
void CalcDP(vector<DatDag>& dags) const {
for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
rit->max_next = -1;
rit->max_weight = MIN_DOUBLE;
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
rit->pInfo = NULL;
rit->weight = MIN_DOUBLE;
assert(!rit->nexts.empty());
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
nextPos = it->first;
p = it->second;
val = 0.0;
if(nextPos + 1 < dags.size()) {
val += dags[nextPos + 1].weight;
for (const auto & it : rit->nexts) {
const auto nextPos = it.first;
double val = dictTrie_->GetMinWeight();
if (nullptr != it.second) {
val = it.second->weight;
}
if(p) {
val += p->weight;
} else {
val += dictTrie_->GetMinWeight();
if (nextPos < dags.size()) {
val += dags[nextPos].max_weight;
}
if(val > rit->weight) {
rit->pInfo = p;
rit->weight = val;
if ((nextPos <= dags.size()) && (val > rit->max_weight)) {
rit->max_weight = val;
rit->max_next = nextPos;
}
}
}
}
void CutByDag(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
const vector<Dag>& dags,
RuneStrArray::const_iterator,
const vector<DatDag>& dags,
vector<WordRange>& words) const {
size_t i = 0;
while(i < dags.size()) {
const DictUnit* p = dags[i].pInfo;
if(p) {
assert(p->word.size() >= 1);
WordRange wr(begin + i, begin + i + p->word.size() - 1);
words.push_back(wr);
i += p->word.size();
} else { //single chinese word
WordRange wr(begin + i, begin + i);
words.push_back(wr);
i++;
}
for (size_t i = 0; i < dags.size();) {
const auto next = dags[i].max_next;
assert(next > i);
assert(next <= dags.size());
WordRange wr(begin + i, begin + next - 1);
words.push_back(wr);
i = next;
}
}
const DictTrie* dictTrie_;
bool isNeedDestroy_;
PosTagger tagger_;
}; // class MPSegment
} // namespace cppjieba
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#pragma once
#include <cassert>
#include "MPSegment.hpp"
@ -28,70 +9,49 @@
namespace cppjieba {
class MixSegment: public SegmentTagged {
public:
MixSegment(const string& mpSegDict, const string& hmmSegDict,
const string& userDict = "")
: mpSeg_(mpSegDict, userDict),
hmmSeg_(hmmSegDict) {
}
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
: mpSeg_(dictTrie), hmmSeg_(model) {
}
~MixSegment() {
}
~MixSegment() {}
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while(pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs, hmm);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
if(!hmm) {
mpSeg_.Cut(begin, end, res);
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
size_t) const override {
if (!hmm) {
mpSeg_.CutRuneArray(begin, end, res);
return;
}
vector<WordRange> words;
assert(end >= begin);
words.reserve(end - begin);
mpSeg_.Cut(begin, end, words);
mpSeg_.CutRuneArray(begin, end, words);
vector<WordRange> hmmRes;
hmmRes.reserve(end - begin);
for(size_t i = 0; i < words.size(); i++) {
for (size_t i = 0; i < words.size(); i++) {
//if mp Get a word, it's ok, put it into result
if(words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
if (words[i].left != words[i].right || (words[i].left == words[i].right &&
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
res.push_back(words[i]);
continue;
}
// if mp Get a single one and it is not in userdict, collect it in sequence
size_t j = i;
while(j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
while (j < words.size() && words[j].left == words[j].right &&
!mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
j++;
}
// Cut the sequence with hmm
assert(j - 1 >= i);
// TODO
hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
//put hmm result to result
for(size_t k = 0; k < hmmRes.size(); k++) {
for (size_t k = 0; k < hmmRes.size(); k++) {
res.push_back(hmmRes[k]);
}
@ -103,11 +63,61 @@ public:
}
}
const DictTrie* GetDictTrie() const {
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
//目前hmm默认开启后期如有需要关闭再修改--jxx20210519
// if (!hmm) {
// mpSeg_.CutRuneArray(begin, end, res);
// return;
// }
vector<WordRange> words;
assert(end >= begin);
words.reserve(end - begin);
mpSeg_.CutRuneArray(begin, end, words);
vector<WordRange> hmmRes;
hmmRes.reserve(end - begin);
for (size_t i = 0; i < words.size(); i++) {
//if mp Get a word, it's ok, put it into result
if (words[i].left != words[i].right || (words[i].left == words[i].right &&
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
continue;
}
// if mp Get a single one and it is not in userdict, collect it in sequence
size_t j = i;
while (j < words.size() && words[j].left == words[j].right &&
!mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
j++;
}
// Cut the sequence with hmm
assert(j - 1 >= i);
// TODO
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
//put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) {
res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
}
//clear tmp vars
hmmRes.clear();
//let i jump over this piece
i = j - 1;
}
}
const DictTrie* GetDictTrie() const override {
return mpSeg_.GetDictTrie();
}
bool Tag(const string& src, vector<pair<string, string> >& res) const {
bool Tag(const string& src, vector<pair<string, string> >& res) const override {
return tagger_.Tag(src, res, *this);
}
@ -124,4 +134,3 @@ private:
} // namespace cppjieba
#endif

View File

@ -1,27 +1,8 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_POS_TAGGING_H
#define CPPJIEBA_POS_TAGGING_H
#pragma once
#include "limonp/StringUtil.hpp"
#include "SegmentTagged.hpp"
#include "DictTrie.hpp"
#include "SegmentTagged.hpp"
namespace cppjieba {
using namespace limonp;
@ -39,28 +20,31 @@ public:
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
vector<string> CutRes;
segment.Cut(src, CutRes);
segment.CutToStr(src, CutRes);
for(vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
}
return !res.empty();
}
string LookupTag(const string &str, const SegmentTagged& segment) const {
const DictUnit *tmp = NULL;
RuneStrArray runes;
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
if(!DecodeRunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed.";
return POS_X;
}
tmp = dict->Find(runes.begin(), runes.end());
if(tmp == NULL || tmp->tag.empty()) {
const auto tmp = dict->Find(str);
if (tmp == NULL || tmp->GetTag().empty()) {
RuneStrArray runes;
if (!DecodeRunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed.";
return POS_X;
}
return SpecialRule(runes);
} else {
return tmp->tag;
return tmp->GetTag();
}
}
@ -68,22 +52,27 @@ private:
const char* SpecialRule(const RuneStrArray& unicode) const {
size_t m = 0;
size_t eng = 0;
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
if(unicode[i].rune < 0x80) {
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
if (unicode[i].rune < 0x80) {
eng ++;
if('0' <= unicode[i].rune && unicode[i].rune <= '9') {
if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
m++;
}
}
}
// ascii char is not found
if(eng == 0) {
if (eng == 0) {
return POS_X;
}
// all the ascii is number char
if(m == eng) {
if (m == eng) {
return POS_M;
}
// the ascii chars contain english letter
return POS_ENG;
}
@ -92,4 +81,3 @@ private:
} // namespace cppjieba
#endif

View File

@ -1,43 +1,20 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_PRE_FILTER_H
#define CPPJIEBA_PRE_FILTER_H
#pragma once
#include "Trie.hpp"
#include "limonp/Logging.hpp"
#include <unordered_set>
#include "Unicode.hpp"
namespace cppjieba {
class PreFilter {
public:
//TODO use WordRange instead of Range
struct Range {
RuneStrArray::const_iterator begin;
RuneStrArray::const_iterator end;
}; // struct Range
PreFilter(const unordered_set<Rune>& symbols,
PreFilter(const std::unordered_set<Rune>& symbols,
const string& sentence)
: symbols_(symbols) {
if(!DecodeRunesInString(sentence, sentence_)) {
XLOG(ERROR) << "decode failed. ";
if (!DecodeRunesInString(sentence, sentence_)) {
XLOG(ERROR) << "decode failed. "<<sentence;
}
cursor_ = sentence_.begin();
}
~PreFilter() {
@ -45,28 +22,31 @@ public:
bool HasNext() const {
return cursor_ != sentence_.end();
}
Range Next() {
Range range;
range.begin = cursor_;
while(cursor_ != sentence_.end()) {
if(IsIn(symbols_, cursor_->rune)) {
if(range.begin == cursor_) {
WordRange Next() {
WordRange range(cursor_, cursor_);
while (cursor_ != sentence_.end()) {
//if (IsIn(symbols_, cursor_->rune)) {
if (cursor_->rune == 0x20) {
if (range.left == cursor_) {
cursor_ ++;
}
range.end = cursor_;
range.right = cursor_;
return range;
}
cursor_ ++;
}
range.end = sentence_.end();
range.right = sentence_.end();
return range;
}
private:
RuneStrArray::const_iterator cursor_;
RuneStrArray sentence_;
const unordered_set<Rune>& symbols_;
const std::unordered_set<Rune>& symbols_;
}; // class PreFilter
} // namespace cppjieba
#endif // CPPJIEBA_PRE_FILTER_H

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_QUERYSEGMENT_H
#define CPPJIEBA_QUERYSEGMENT_H
#pragma once
#include <algorithm>
#include <set>
@ -28,74 +9,65 @@
#include "FullSegment.hpp"
#include "MixSegment.hpp"
#include "Unicode.hpp"
#include "DictTrie.hpp"
namespace cppjieba {
class QuerySegment: public SegmentBase {
public:
QuerySegment(const string& dict, const string& model, const string& userDict = "")
: mixSeg_(dict, model, userDict),
trie_(mixSeg_.GetDictTrie()) {
}
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
: mixSeg_(dictTrie, model), trie_(dictTrie) {
}
~QuerySegment() {
}
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while(pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs, hmm);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
size_t) const override {
//use mix Cut first
vector<WordRange> mixRes;
mixSeg_.Cut(begin, end, mixRes, hmm);
mixSeg_.CutRuneArray(begin, end, mixRes, hmm);
vector<WordRange> fullRes;
for(vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
if(mixResItr->Length() > 2) {
for(size_t i = 0; i + 1 < mixResItr->Length(); i++) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
if(trie_->Find(wr.left, wr.right + 1) != NULL) {
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
if (mixResItr->Length() > 2) {
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 2);
if (trie_->Find(text) != NULL) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
res.push_back(wr);
}
}
}
if(mixResItr->Length() > 3) {
for(size_t i = 0; i + 2 < mixResItr->Length(); i++) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
if(trie_->Find(wr.left, wr.right + 1) != NULL) {
if (mixResItr->Length() > 3) {
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 3);
if (trie_->Find(text) != NULL) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
res.push_back(wr);
}
}
}
res.push_back(*mixResItr);
}
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
}
private:
bool IsAllAscii(const Unicode& s) const {
for(size_t i = 0; i < s.size(); i++) {
if(s[i] >= 0x80) {
bool IsAllAscii(const RuneArray& s) const {
for (size_t i = 0; i < s.size(); i++) {
if (s[i] >= 0x80) {
return false;
}
}
return true;
}
MixSegment mixSeg_;
@ -104,4 +76,3 @@ private:
} // namespace cppjieba
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_SEGMENTBASE_H
#define CPPJIEBA_SEGMENTBASE_H
#pragma once
#include "limonp/Logging.hpp"
#include "PreFilter.hpp"
@ -35,24 +16,69 @@ public:
SegmentBase() {
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
}
virtual ~SegmentBase() {
virtual ~SegmentBase() { }
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
size_t max_word_len) const = 0;
//添加基于sentence的cut方法减少中间变量的存储与格式转换--jxx20210517
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t max_word_len) const = 0;
//重写CutToStr函数简化获取vector<string>& words的流程降低内存占用--jxx20210517
void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
size_t max_word_len = MAX_WORD_LENGTH) const {
/*
vector<Word> tmp;
CutToWord(sentence, tmp, hmm, max_word_len);
GetStringsFromWords(tmp, words);
*/
PreFilter pre_filter(symbols_, sentence);
words.clear();
words.reserve(sentence.size() / 2);//todo 参考源码,参数待定
while (pre_filter.HasNext()) {
auto range = pre_filter.Next();
CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
}
}
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
size_t max_word_len = MAX_WORD_LENGTH) const {
PreFilter pre_filter(symbols_, sentence);
vector<WordRange> wrs;
wrs.reserve(sentence.size() / 2);
while (pre_filter.HasNext()) {
auto range = pre_filter.Next();
Cut(range.left, range.right, wrs, hmm, max_word_len);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
wrs.clear();
vector<WordRange>().swap(wrs);
}
void CutRuneArray(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res,
bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const {
Cut(begin, end, res, hmm, max_word_len);
}
bool ResetSeparators(const string& s) {
symbols_.clear();
RuneStrArray runes;
if(!DecodeRunesInString(s, runes)) {
if (!DecodeRunesInString(s, runes)) {
XLOG(ERROR) << "decode " << s << " failed";
return false;
}
for(size_t i = 0; i < runes.size(); i++) {
if(!symbols_.insert(runes[i].rune).second) {
for (size_t i = 0; i < runes.size(); i++) {
if (!symbols_.insert(runes[i].rune).second) {
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
return false;
}
}
return true;
}
protected:
@ -61,4 +87,3 @@ protected:
} // cppjieba
#endif

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_SEGMENTTAGGED_H
#define CPPJIEBA_SEGMENTTAGGED_H
#pragma once
#include "SegmentBase.hpp"
@ -38,4 +19,3 @@ public:
} // cppjieba
#endif

View File

@ -1,212 +1,205 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
#include <cmath>
#include "Jieba.hpp"
namespace cppjieba {
using namespace limonp;
using namespace std;
class TextRankExtractor {
public:
typedef struct _Word {
string word;
vector<size_t> offsets;
double weight;
} Word; // struct Word
private:
typedef std::map<string, Word> WordMap;
class WordGraph {
private:
typedef double Score;
typedef string Node;
typedef std::set<Node> NodeSet;
typedef std::map<Node, double> Edges;
typedef std::map<Node, Edges> Graph;
//typedef std::unordered_map<Node,double> Edges;
//typedef std::unordered_map<Node,Edges> Graph;
double d;
Graph graph;
NodeSet nodeSet;
public:
WordGraph(): d(0.85) {};
WordGraph(double in_d): d(in_d) {};
void addEdge(Node start, Node end, double weight) {
Edges temp;
Edges::iterator gotEdges;
nodeSet.insert(start);
nodeSet.insert(end);
graph[start][end] += weight;
graph[end][start] += weight;
}
void rank(WordMap &ws, size_t rankTime = 10) {
WordMap outSum;
Score wsdef, min_rank, max_rank;
if(graph.size() == 0)
return;
wsdef = 1.0 / graph.size();
for(Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
// edges->first start节点edge->first end节点edge->second 权重
ws[edges->first].word = edges->first;
ws[edges->first].weight = wsdef;
outSum[edges->first].weight = 0;
for(Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
outSum[edges->first].weight += edge->second;
}
}
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
for(size_t i = 0; i < rankTime; i++) {
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
double s = 0;
for(Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
// edge->first end节点edge->second 权重
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
ws[*node].weight = (1 - d) + d * s;
}
}
min_rank = max_rank = ws.begin()->second.weight;
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
if(i->second.weight < min_rank) {
min_rank = i->second.weight;
}
if(i->second.weight > max_rank) {
max_rank = i->second.weight;
}
}
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
}
}
};
public:
TextRankExtractor(const string& dictPath,
const string& hmmFilePath,
const string& stopWordPath,
const string& userDict = "")
: segment_(dictPath, hmmFilePath, userDict) {
LoadStopWordDict(stopWordPath);
}
TextRankExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& stopWordPath)
: segment_(dictTrie, model) {
LoadStopWordDict(stopWordPath);
}
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadStopWordDict(stopWordPath);
}
~TextRankExtractor() {
}
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for(size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].word);
}
}
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for(size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
vector<string> words;
segment_.Cut(sentence, words);
TextRankExtractor::WordGraph graph;
WordMap wordmap;
size_t offset = 0;
for(size_t i = 0; i < words.size(); i++) {
size_t t = offset;
offset += words[i].size();
if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
continue;
}
for(size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
if(IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
skip++;
continue;
}
graph.addEdge(words[i], words[j], 1);
}
wordmap[words[i]].offsets.push_back(t);
}
if(offset != sentence.size()) {
XLOG(ERROR) << "words illegal";
return;
}
graph.rank(wordmap, rankTime);
keywords.clear();
keywords.reserve(wordmap.size());
for(WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
keywords.push_back(itr->second);
}
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
}
private:
void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str());
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
string line ;
while(getline(ifs, line)) {
stopWords_.insert(line);
}
assert(stopWords_.size());
}
static bool Compare(const Word &x, const Word &y) {
return x.weight > y.weight;
}
MixSegment segment_;
unordered_set<string> stopWords_;
}; // class TextRankExtractor
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
}
} // namespace cppjieba
#endif
#include <cmath>
#include "Jieba.hpp"
namespace cppjieba {
using namespace limonp;
using namespace std;
class TextRankExtractor {
public:
typedef struct _Word {
string word;
vector<size_t> offsets;
double weight;
} Word; // struct Word
private:
typedef std::map<string, Word> WordMap;
class WordGraph {
private:
typedef double Score;
typedef string Node;
typedef std::set<Node> NodeSet;
typedef std::map<Node, double> Edges;
typedef std::map<Node, Edges> Graph;
//typedef std::unordered_map<Node,double> Edges;
//typedef std::unordered_map<Node,Edges> Graph;
double d;
Graph graph;
NodeSet nodeSet;
public:
WordGraph(): d(0.85) {};
WordGraph(double in_d): d(in_d) {};
void addEdge(Node start, Node end, double weight) {
Edges temp;
Edges::iterator gotEdges;
nodeSet.insert(start);
nodeSet.insert(end);
graph[start][end] += weight;
graph[end][start] += weight;
}
void rank(WordMap &ws, size_t rankTime = 10) {
WordMap outSum;
Score wsdef, min_rank, max_rank;
if (graph.size() == 0) {
return;
}
wsdef = 1.0 / graph.size();
for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
// edges->first start节点edge->first end节点edge->second 权重
ws[edges->first].word = edges->first;
ws[edges->first].weight = wsdef;
outSum[edges->first].weight = 0;
for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
outSum[edges->first].weight += edge->second;
}
}
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
for (size_t i = 0; i < rankTime; i++) {
for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
double s = 0;
for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
// edge->first end节点edge->second 权重
{
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
}
ws[*node].weight = (1 - d) + d * s;
}
}
min_rank = max_rank = ws.begin()->second.weight;
for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
if (i->second.weight < min_rank) {
min_rank = i->second.weight;
}
if (i->second.weight > max_rank) {
max_rank = i->second.weight;
}
}
for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
}
}
};
public:
TextRankExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& stopWordPath)
: segment_(dictTrie, model) {
LoadStopWordDict(stopWordPath);
}
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadStopWordDict(stopWordPath);
}
~TextRankExtractor() {
}
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].word);
}
}
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
vector<string> words;
segment_.CutToStr(sentence, words);
TextRankExtractor::WordGraph graph;
WordMap wordmap;
size_t offset = 0;
for (size_t i = 0; i < words.size(); i++) {
size_t t = offset;
offset += words[i].size();
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
continue;
}
for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
skip++;
continue;
}
graph.addEdge(words[i], words[j], 1);
}
wordmap[words[i]].offsets.push_back(t);
}
if (offset != sentence.size()) {
XLOG(ERROR) << "words illegal";
return;
}
graph.rank(wordmap, rankTime);
keywords.clear();
keywords.reserve(wordmap.size());
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
keywords.push_back(itr->second);
}
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
}
private:
void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str());
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
string line ;
while (getline(ifs, line)) {
stopWords_.insert(line);
}
assert(stopWords_.size());
}
static bool Compare(const Word &x, const Word &y) {
return x.weight > y.weight;
}
MixSegment segment_;
unordered_set<string> stopWords_;
}; // class TextRankExtractor
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
"}";
}
} // namespace cppjieba

View File

@ -1,192 +0,0 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_TRIE_HPP
#define CPPJIEBA_TRIE_HPP
#include <vector>
#include <queue>
#include "limonp/StdExtension.hpp"
#include "Unicode.hpp"
namespace cppjieba {
using namespace std;
const size_t MAX_WORD_LENGTH = 512;
struct DictUnit {
Unicode word;
double weight;
string tag;
}; // struct DictUnit
// for debugging
// inline ostream & operator << (ostream& os, const DictUnit& unit) {
// string s;
// s << unit.word;
// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
// }
struct Dag {
RuneStr runestr;
// [offset, nexts.first]
limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
const DictUnit * pInfo;
double weight;
size_t nextPos; // TODO
Dag(): runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
}
}; // struct Dag
typedef Rune TrieKey;
class TrieNode {
public :
TrieNode(): next(NULL), ptValue(NULL) {
}
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
NextMap *next;
const DictUnit *ptValue;
};
class Trie {
public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
: root_(new TrieNode) {
CreateTrie(keys, valuePointers);
}
~Trie() {
DeleteNode(root_);
}
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
if(begin == end) {
return NULL;
}
const TrieNode* ptNode = root_;
TrieNode::NextMap::const_iterator citer;
for(RuneStrArray::const_iterator it = begin; it != end; it++) {
if(NULL == ptNode->next) {
return NULL;
}
citer = ptNode->next->find(it->rune);
if(ptNode->next->end() == citer) {
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
void Find(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<struct Dag>&res,
size_t max_word_len = MAX_WORD_LENGTH) const {
assert(root_ != NULL);
res.resize(end - begin);
const TrieNode *ptNode = NULL;
TrieNode::NextMap::const_iterator citer;
for(size_t i = 0; i < size_t(end - begin); i++) {
res[i].runestr = *(begin + i);
if(root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
ptNode = citer->second;
} else {
ptNode = NULL;
}
if(ptNode != NULL) {
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
} else {
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
}
for(size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
if(ptNode == NULL || ptNode->next == NULL) {
break;
}
citer = ptNode->next->find((begin + j)->rune);
if(ptNode->next->end() == citer) {
break;
}
ptNode = citer->second;
if(NULL != ptNode->ptValue) {
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
}
}
}
}
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
if(key.begin() == key.end()) {
return;
}
TrieNode::NextMap::const_iterator kmIter;
TrieNode *ptNode = root_;
for(Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
if(NULL == ptNode->next) {
ptNode->next = new TrieNode::NextMap;
}
kmIter = ptNode->next->find(*citer);
if(ptNode->next->end() == kmIter) {
TrieNode *nextNode = new TrieNode;
ptNode->next->insert(make_pair(*citer, nextNode));
ptNode = nextNode;
} else {
ptNode = kmIter->second;
}
}
assert(ptNode != NULL);
ptNode->ptValue = ptValue;
}
private:
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
if(valuePointers.empty() || keys.empty()) {
return;
}
assert(keys.size() == valuePointers.size());
for(size_t i = 0; i < keys.size(); i++) {
InsertNode(keys[i], valuePointers[i]);
}
}
void DeleteNode(TrieNode* node) {
if(NULL == node) {
return;
}
if(NULL != node->next) {
for(TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
DeleteNode(it->second);
}
delete node->next;
}
delete node;
}
TrieNode* root_;
}; // class Trie
} // namespace cppjieba
#endif // CPPJIEBA_TRIE_HPP

View File

@ -1,23 +1,4 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef CPPJIEBA_UNICODE_H
#define CPPJIEBA_UNICODE_H
#pragma once
#include <stdint.h>
#include <stdlib.h>
@ -25,6 +6,7 @@
#include <vector>
#include <ostream>
#include "limonp/LocalVector.hpp"
#include "limonp/StringUtil.hpp"
namespace cppjieba {
@ -50,28 +32,28 @@ inline std::ostream& operator << (std::ostream& os, const Word& w) {
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
}
struct RuneStr {
struct RuneInfo {
Rune rune;
uint32_t offset;
uint32_t len;
uint32_t unicode_offset;
uint32_t unicode_length;
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
uint32_t unicode_offset = 0;
uint32_t unicode_length = 0;
RuneInfo(): rune(0), offset(0), len(0) {
}
RuneStr(Rune r, uint32_t o, uint32_t l)
: rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
RuneInfo(Rune r, uint32_t o, uint32_t l)
: rune(r), offset(o), len(l) {
}
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
RuneInfo(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
}
}; // struct RuneStr
}; // struct RuneInfo
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
inline std::ostream& operator << (std::ostream& os, const RuneInfo& r) {
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
}
typedef limonp::LocalVector<Rune> Unicode;
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
typedef limonp::LocalVector<Rune> RuneArray;
typedef limonp::LocalVector<struct RuneInfo> RuneStrArray;
// [left, right]
struct WordRange {
@ -83,127 +65,169 @@ struct WordRange {
size_t Length() const {
return right - left + 1;
}
bool IsAllAscii() const {
for(RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
if(iter->rune >= 0x80) {
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
if (iter->rune >= 0x80) {
return false;
}
}
return true;
}
}; // struct WordRange
struct RuneStrLite {
uint32_t rune;
uint32_t len;
RuneStrLite(): rune(0), len(0) {
}
RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
}
}; // struct RuneStrLite
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
RuneStrLite rp(0, 0);
if(str == NULL || len == 0) {
return rp;
}
if(!(str[0] & 0x80)) { // 0xxxxxxx
// 7bit, total 7bit
rp.rune = (uint8_t)(str[0]) & 0x7f;
rp.len = 1;
} else if((uint8_t)str[0] <= 0xdf && 1 < len) {
// 110xxxxxx
// 5bit, total 5bit
rp.rune = (uint8_t)(str[0]) & 0x1f;
// 6bit, total 11bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[1]) & 0x3f;
rp.len = 2;
} else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx
// 4bit, total 4bit
rp.rune = (uint8_t)(str[0]) & 0x0f;
// 6bit, total 10bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[1]) & 0x3f;
// 6bit, total 16bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[2]) & 0x3f;
rp.len = 3;
} else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx
// 3bit, total 3bit
rp.rune = (uint8_t)(str[0]) & 0x07;
// 6bit, total 9bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[1]) & 0x3f;
// 6bit, total 15bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[2]) & 0x3f;
// 6bit, total 21bit
rp.rune <<= 6;
rp.rune |= (uint8_t)(str[3]) & 0x3f;
rp.len = 4;
} else {
rp.rune = 0;
rp.len = 0;
}
return rp;
inline bool DecodeRunesInString(const string& s, RuneArray& arr) {
arr.clear();
return limonp::Utf8ToUnicode32(s, arr);
}
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear();
runes.reserve(len / 2);
for(uint32_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
if(rp.len == 0) {
runes.clear();
return false;
}
RuneStr x(rp.rune, i, rp.len, j, 1);
runes.push_back(x);
i += rp.len;
++j;
}
return true;
inline RuneArray DecodeRunesInString(const string& s) {
RuneArray result;
DecodeRunesInString(s, result);
return result;
}
//重写DecodeRunesInString函数将实现放入函数中降低内存占用加快处理流程--jxx20210518
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
return DecodeRunesInString(s.c_str(), s.size(), runes);
}
/*
RuneArray arr;
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
unicode.clear();
RuneStrArray runes;
if(!DecodeRunesInString(s, len, runes)) {
if (not DecodeRunesInString(s, arr)) {
return false;
}
unicode.reserve(runes.size());
for(size_t i = 0; i < runes.size(); i++) {
unicode.push_back(runes[i].rune);
runes.clear();
uint32_t offset = 0;
for (uint32_t i = 0; i < arr.size(); ++i) {
const uint32_t len = limonp::UnicodeToUtf8Bytes(arr[i]);
RuneInfo x(arr[i], offset, len, i, 1);
runes.push_back(x);
offset += len;
}
*/
uint32_t tmp;
uint32_t offset = 0;
runes.clear();
for(size_t i = 0; i < s.size();) {
if(!(s.data()[i] & 0x80)) { // 0xxxxxxx
// 7bit, total 7bit
tmp = (uint8_t)(s.data()[i]) & 0x7f;
i++;
} else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx
// 5bit, total 5bit
tmp = (uint8_t)(s.data()[i]) & 0x1f;
// 6bit, total 11bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
i += 2;
} else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx
// 4bit, total 4bit
tmp = (uint8_t)(s.data()[i]) & 0x0f;
// 6bit, total 10bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
// 6bit, total 16bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
i += 3;
} else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx
// 3bit, total 3bit
tmp = (uint8_t)(s.data()[i]) & 0x07;
// 6bit, total 9bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
// 6bit, total 15bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
// 6bit, total 21bit
tmp <<= 6;
tmp |= (uint8_t)(s.data()[i+3]) & 0x3f;
i += 4;
} else {
return false;
}
uint32_t len = limonp::UnicodeToUtf8Bytes(tmp);
RuneInfo x(tmp, offset, len, i, 1);
runes.push_back(x);
offset += len;
}
return true;
}
class RunePtrWrapper {
public:
const RuneInfo * m_ptr = nullptr;
public:
explicit RunePtrWrapper(const RuneInfo * p) : m_ptr(p) {}
uint32_t operator *() {
return m_ptr->rune;
}
RunePtrWrapper operator ++(int) {
m_ptr ++;
return RunePtrWrapper(m_ptr);
}
bool operator !=(const RunePtrWrapper & b) const {
return this->m_ptr != b.m_ptr;
}
};
inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) {
string str;
RunePtrWrapper it_begin(begin), it_end(end);
limonp::Unicode32ToUtf8(it_begin, it_end, str);
return str;
}
inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
RunePtrWrapper it_begin(begin), it_end(end);
limonp::Unicode32ToUtf8(it_begin, it_end, str);
return;
}
class Unicode32Counter {
public :
size_t length = 0;
void clear() {
length = 0;
}
void push_back(uint32_t) {
++length;
}
};
inline size_t Utf8CharNum(const char * str, size_t length) {
Unicode32Counter c;
if (limonp::Utf8ToUnicode32(str, length, c)) {
return c.length;
}
return 0;
}
inline size_t Utf8CharNum(const string & str) {
return Utf8CharNum(str.data(), str.size());
}
inline bool IsSingleWord(const string& str) {
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
return rp.len == str.size();
}
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
return DecodeRunesInString(s.c_str(), s.size(), unicode);
}
inline Unicode DecodeRunesInString(const string& s) {
Unicode result;
DecodeRunesInString(s, result);
return result;
return Utf8CharNum(str) == 1;
}
@ -218,28 +242,31 @@ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left,
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
assert(right->offset >= left->offset);
uint32_t len = right->offset - left->offset + right->len;
return s.substr(left->offset, len);
uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length).word;
}
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
for(size_t i = 0; i < wrs.size(); i++) {
for (size_t i = 0; i < wrs.size(); i++) {
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
}
}
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
vector<Word> result;
GetWordsFromWordRanges(s, wrs, result);
return result;
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<string>& words) {
for (size_t i = 0; i < wrs.size(); i++) {
words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right));
}
}
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
strs.resize(words.size());
for(size_t i = 0; i < words.size(); ++i) {
for (size_t i = 0; i < words.size(); ++i) {
strs[i] = words[i].word;
}
}
const size_t MAX_WORD_LENGTH = 512;
} // namespace cppjieba
#endif // CPPJIEBA_UNICODE_H

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
/************************************
* file enc : ascii
* author : wuyanyi09@gmail.com
@ -33,54 +15,54 @@ namespace limonp {
using namespace std;
class ArgvContext {
public :
ArgvContext(int argc, const char* const * argv) {
for(int i = 0; i < argc; i++) {
if(StartsWith(argv[i], "-")) {
if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
mpss_[argv[i]] = argv[i + 1];
i++;
} else {
sset_.insert(argv[i]);
}
} else {
args_.push_back(argv[i]);
}
public :
ArgvContext(int argc, const char* const * argv) {
for(int i = 0; i < argc; i++) {
if(StartsWith(argv[i], "-")) {
if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
mpss_[argv[i]] = argv[i+1];
i++;
} else {
sset_.insert(argv[i]);
}
} else {
args_.push_back(argv[i]);
}
}
~ArgvContext() {
}
}
~ArgvContext() {
}
friend ostream& operator << (ostream& os, const ArgvContext& args);
string operator [](size_t i) const {
if(i < args_.size()) {
return args_[i];
}
return "";
friend ostream& operator << (ostream& os, const ArgvContext& args);
string operator [](size_t i) const {
if(i < args_.size()) {
return args_[i];
}
string operator [](const string& key) const {
map<string, string>::const_iterator it = mpss_.find(key);
if(it != mpss_.end()) {
return it->second;
}
return "";
return "";
}
string operator [](const string& key) const {
map<string, string>::const_iterator it = mpss_.find(key);
if(it != mpss_.end()) {
return it->second;
}
return "";
}
bool HasKey(const string& key) const {
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
return true;
}
return false;
bool HasKey(const string& key) const {
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
return true;
}
return false;
}
private:
vector<string> args_;
map<string, string> mpss_;
set<string> sset_;
private:
vector<string> args_;
map<string, string> mpss_;
set<string> sset_;
}; // class ArgvContext
inline ostream& operator << (ostream& os, const ArgvContext& args) {
return os << args.args_ << args.mpss_ << args.sset_;
return os<<args.args_<<args.mpss_<<args.sset_;
}
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_BLOCKINGQUEUE_HPP
#define LIMONP_BLOCKINGQUEUE_HPP
@ -25,41 +7,41 @@
namespace limonp {
template<class T>
class BlockingQueue: NonCopyable {
public:
BlockingQueue()
: mutex_(), notEmpty_(mutex_), queue_() {
}
public:
BlockingQueue()
: mutex_(), notEmpty_(mutex_), queue_() {
}
void Push(const T& x) {
MutexLockGuard lock(mutex_);
queue_.push(x);
notEmpty_.Notify(); // Wait morphing saves us
}
void Push(const T& x) {
MutexLockGuard lock(mutex_);
queue_.push(x);
notEmpty_.Notify(); // Wait morphing saves us
}
T Pop() {
MutexLockGuard lock(mutex_);
// always use a while-loop, due to spurious wakeup
while(queue_.empty()) {
notEmpty_.Wait();
}
assert(!queue_.empty());
T front(queue_.front());
queue_.pop();
return front;
T Pop() {
MutexLockGuard lock(mutex_);
// always use a while-loop, due to spurious wakeup
while (queue_.empty()) {
notEmpty_.Wait();
}
assert(!queue_.empty());
T front(queue_.front());
queue_.pop();
return front;
}
size_t Size() const {
MutexLockGuard lock(mutex_);
return queue_.size();
}
bool Empty() const {
return Size() == 0;
}
size_t Size() const {
MutexLockGuard lock(mutex_);
return queue_.size();
}
bool Empty() const {
return Size() == 0;
}
private:
mutable MutexLock mutex_;
Condition notEmpty_;
std::queue<T> queue_;
private:
mutable MutexLock mutex_;
Condition notEmpty_;
std::queue<T> queue_;
}; // class BlockingQueue
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
@ -25,59 +7,59 @@ namespace limonp {
template<typename T>
class BoundedBlockingQueue : NonCopyable {
public:
explicit BoundedBlockingQueue(size_t maxSize)
: mutex_(),
notEmpty_(mutex_),
notFull_(mutex_),
queue_(maxSize) {
}
public:
explicit BoundedBlockingQueue(size_t maxSize)
: mutex_(),
notEmpty_(mutex_),
notFull_(mutex_),
queue_(maxSize) {
}
void Push(const T& x) {
MutexLockGuard lock(mutex_);
while(queue_.Full()) {
notFull_.Wait();
}
assert(!queue_.Full());
queue_.Push(x);
notEmpty_.Notify();
void Push(const T& x) {
MutexLockGuard lock(mutex_);
while (queue_.Full()) {
notFull_.Wait();
}
assert(!queue_.Full());
queue_.Push(x);
notEmpty_.Notify();
}
T Pop() {
MutexLockGuard lock(mutex_);
while(queue_.Empty()) {
notEmpty_.Wait();
}
assert(!queue_.Empty());
T res = queue_.Pop();
notFull_.Notify();
return res;
T Pop() {
MutexLockGuard lock(mutex_);
while (queue_.Empty()) {
notEmpty_.Wait();
}
assert(!queue_.Empty());
T res = queue_.Pop();
notFull_.Notify();
return res;
}
bool Empty() const {
MutexLockGuard lock(mutex_);
return queue_.Empty();
}
bool Empty() const {
MutexLockGuard lock(mutex_);
return queue_.Empty();
}
bool Full() const {
MutexLockGuard lock(mutex_);
return queue_.Full();
}
bool Full() const {
MutexLockGuard lock(mutex_);
return queue_.Full();
}
size_t size() const {
MutexLockGuard lock(mutex_);
return queue_.size();
}
size_t size() const {
MutexLockGuard lock(mutex_);
return queue_.size();
}
size_t capacity() const {
return queue_.capacity();
}
size_t capacity() const {
return queue_.capacity();
}
private:
mutable MutexLock mutex_;
Condition notEmpty_;
Condition notFull_;
BoundedQueue<T> queue_;
private:
mutable MutexLock mutex_;
Condition notEmpty_;
Condition notFull_;
BoundedQueue<T> queue_;
}; // class BoundedBlockingQueue
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_BOUNDED_QUEUE_HPP
#define LIMONP_BOUNDED_QUEUE_HPP
@ -27,55 +9,55 @@ namespace limonp {
using namespace std;
template<class T>
class BoundedQueue {
public:
explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) {
head_ = 0;
tail_ = 0;
size_ = 0;
assert(capacity_);
}
~BoundedQueue() {
}
public:
explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) {
head_ = 0;
tail_ = 0;
size_ = 0;
assert(capacity_);
}
~BoundedQueue() {
}
void Clear() {
head_ = 0;
tail_ = 0;
size_ = 0;
}
bool Empty() const {
return !size_;
}
bool Full() const {
return capacity_ == size_;
}
size_t Size() const {
return size_;
}
size_t Capacity() const {
return capacity_;
}
void Clear() {
head_ = 0;
tail_ = 0;
size_ = 0;
}
bool Empty() const {
return !size_;
}
bool Full() const {
return capacity_ == size_;
}
size_t Size() const {
return size_;
}
size_t Capacity() const {
return capacity_;
}
void Push(const T& t) {
assert(!Full());
circular_buffer_[tail_] = t;
tail_ = (tail_ + 1) % capacity_;
size_ ++;
}
void Push(const T& t) {
assert(!Full());
circular_buffer_[tail_] = t;
tail_ = (tail_ + 1) % capacity_;
size_ ++;
}
T Pop() {
assert(!Empty());
size_t oldPos = head_;
head_ = (head_ + 1) % capacity_;
size_ --;
return circular_buffer_[oldPos];
}
T Pop() {
assert(!Empty());
size_t oldPos = head_;
head_ = (head_ + 1) % capacity_;
size_ --;
return circular_buffer_[oldPos];
}
private:
size_t head_;
size_t tail_;
size_t size_;
const size_t capacity_;
vector<T> circular_buffer_;
private:
size_t head_;
size_t tail_;
size_t size_;
const size_t capacity_;
vector<T> circular_buffer_;
}; // class BoundedQueue
} // namespace limonp

View File

@ -1,222 +1,204 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_CLOSURE_HPP
#define LIMONP_CLOSURE_HPP
namespace limonp {
class ClosureInterface {
public:
virtual ~ClosureInterface() {
}
virtual void Run() = 0;
public:
virtual ~ClosureInterface() {
}
virtual void Run() = 0;
};
template <class Funct>
class Closure0: public ClosureInterface {
public:
Closure0(Funct fun) {
fun_ = fun;
}
virtual ~Closure0() {
}
virtual void Run() {
(*fun_)();
}
private:
Funct fun_;
};
public:
Closure0(Funct fun) {
fun_ = fun;
}
virtual ~Closure0() {
}
virtual void Run() {
(*fun_)();
}
private:
Funct fun_;
};
template <class Funct, class Arg1>
class Closure1: public ClosureInterface {
public:
Closure1(Funct fun, Arg1 arg1) {
fun_ = fun;
arg1_ = arg1;
}
virtual ~Closure1() {
}
virtual void Run() {
(*fun_)(arg1_);
}
private:
Funct fun_;
Arg1 arg1_;
};
public:
Closure1(Funct fun, Arg1 arg1) {
fun_ = fun;
arg1_ = arg1;
}
virtual ~Closure1() {
}
virtual void Run() {
(*fun_)(arg1_);
}
private:
Funct fun_;
Arg1 arg1_;
};
template <class Funct, class Arg1, class Arg2>
class Closure2: public ClosureInterface {
public:
Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
}
virtual ~Closure2() {
}
virtual void Run() {
(*fun_)(arg1_, arg2_);
}
private:
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
};
public:
Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
}
virtual ~Closure2() {
}
virtual void Run() {
(*fun_)(arg1_, arg2_);
}
private:
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
};
template <class Funct, class Arg1, class Arg2, class Arg3>
class Closure3: public ClosureInterface {
public:
Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
arg3_ = arg3;
}
virtual ~Closure3() {
}
virtual void Run() {
(*fun_)(arg1_, arg2_, arg3_);
}
private:
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
Arg3 arg3_;
};
public:
Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
arg3_ = arg3;
}
virtual ~Closure3() {
}
virtual void Run() {
(*fun_)(arg1_, arg2_, arg3_);
}
private:
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
Arg3 arg3_;
};
template <class Obj, class Funct>
template <class Obj, class Funct>
class ObjClosure0: public ClosureInterface {
public:
ObjClosure0(Obj* p, Funct fun) {
p_ = p;
fun_ = fun;
}
virtual ~ObjClosure0() {
}
virtual void Run() {
(p_->*fun_)();
}
private:
Obj* p_;
Funct fun_;
};
public:
ObjClosure0(Obj* p, Funct fun) {
p_ = p;
fun_ = fun;
}
virtual ~ObjClosure0() {
}
virtual void Run() {
(p_->*fun_)();
}
private:
Obj* p_;
Funct fun_;
};
template <class Obj, class Funct, class Arg1>
template <class Obj, class Funct, class Arg1>
class ObjClosure1: public ClosureInterface {
public:
ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
}
virtual ~ObjClosure1() {
}
virtual void Run() {
(p_->*fun_)(arg1_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
};
public:
ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
}
virtual ~ObjClosure1() {
}
virtual void Run() {
(p_->*fun_)(arg1_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
};
template <class Obj, class Funct, class Arg1, class Arg2>
template <class Obj, class Funct, class Arg1, class Arg2>
class ObjClosure2: public ClosureInterface {
public:
ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
}
virtual ~ObjClosure2() {
}
virtual void Run() {
(p_->*fun_)(arg1_, arg2_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
};
template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
public:
ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
}
virtual ~ObjClosure2() {
}
virtual void Run() {
(p_->*fun_)(arg1_, arg2_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
};
template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
class ObjClosure3: public ClosureInterface {
public:
ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
arg3_ = arg3;
}
virtual ~ObjClosure3() {
}
virtual void Run() {
(p_->*fun_)(arg1_, arg2_, arg3_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
Arg3 arg3_;
};
public:
ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
p_ = p;
fun_ = fun;
arg1_ = arg1;
arg2_ = arg2;
arg3_ = arg3;
}
virtual ~ObjClosure3() {
}
virtual void Run() {
(p_->*fun_)(arg1_, arg2_, arg3_);
}
private:
Obj* p_;
Funct fun_;
Arg1 arg1_;
Arg2 arg2_;
Arg3 arg3_;
};
template<class R>
ClosureInterface* NewClosure(R(*fun)()) {
return new Closure0<R(*)()>(fun);
ClosureInterface* NewClosure(R (*fun)()) {
return new Closure0<R (*)()>(fun);
}
template<class R, class Arg1>
ClosureInterface* NewClosure(R(*fun)(Arg1), Arg1 arg1) {
return new Closure1<R(*)(Arg1), Arg1>(fun, arg1);
ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
}
template<class R, class Arg1, class Arg2>
ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
return new Closure2<R(*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
}
template<class R, class Arg1, class Arg2, class Arg3>
ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
return new Closure3<R(*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
}
template<class R, class Obj>
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)()) {
return new ObjClosure0<Obj, R(Obj::*)()>(obj, fun);
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
}
template<class R, class Obj, class Arg1>
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1), Arg1 arg1) {
return new ObjClosure1<Obj, R(Obj::*)(Arg1), Arg1>(obj, fun, arg1);
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
}
template<class R, class Obj, class Arg1, class Arg2>
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
return new ObjClosure2<Obj, R(Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
}
template<class R, class Obj, class Arg1, class Arg2, class Arg3>
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
return new ObjClosure3<Obj, R(Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
}
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_COLOR_PRINT_HPP
#define LIMONP_COLOR_PRINT_HPP
@ -27,21 +9,21 @@ namespace limonp {
using std::string;
enum Color {
BLACK = 30,
RED,
GREEN,
YELLOW,
BLUE,
PURPLE
BLACK = 30,
RED,
GREEN,
YELLOW,
BLUE,
PURPLE
}; // enum Color
static void ColorPrintln(enum Color color, const char * fmt, ...) {
va_list ap;
printf("\033[0;%dm", color);
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly
va_list ap;
printf("\033[0;%dm", color);
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly
}
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_CONDITION_HPP
#define LIMONP_CONDITION_HPP
@ -24,31 +6,31 @@
namespace limonp {
class Condition : NonCopyable {
public:
explicit Condition(MutexLock& mutex)
: mutex_(mutex) {
XCHECK(!pthread_cond_init(&pcond_, NULL));
}
public:
explicit Condition(MutexLock& mutex)
: mutex_(mutex) {
XCHECK(!pthread_cond_init(&pcond_, NULL));
}
~Condition() {
XCHECK(!pthread_cond_destroy(&pcond_));
}
~Condition() {
XCHECK(!pthread_cond_destroy(&pcond_));
}
void Wait() {
XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
}
void Wait() {
XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
}
void Notify() {
XCHECK(!pthread_cond_signal(&pcond_));
}
void Notify() {
XCHECK(!pthread_cond_signal(&pcond_));
}
void NotifyAll() {
XCHECK(!pthread_cond_broadcast(&pcond_));
}
void NotifyAll() {
XCHECK(!pthread_cond_broadcast(&pcond_));
}
private:
MutexLock& mutex_;
pthread_cond_t pcond_;
private:
MutexLock& mutex_;
pthread_cond_t pcond_;
}; // class Condition
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
/************************************
* file enc : utf8
* author : wuyanyi09@gmail.com
@ -34,86 +16,86 @@ namespace limonp {
using namespace std;
class Config {
public:
explicit Config(const string& filePath) {
LoadFile(filePath);
}
public:
explicit Config(const string& filePath) {
LoadFile(filePath);
}
operator bool () {
return !map_.empty();
}
operator bool () {
return !map_.empty();
}
string Get(const string& key, const string& defaultvalue) const {
map<string, string>::const_iterator it = map_.find(key);
if(map_.end() != it) {
return it->second;
}
return defaultvalue;
string Get(const string& key, const string& defaultvalue) const {
map<string, string>::const_iterator it = map_.find(key);
if(map_.end() != it) {
return it->second;
}
int Get(const string& key, int defaultvalue) const {
string str = Get(key, "");
if("" == str) {
return defaultvalue;
}
return atoi(str.c_str());
return defaultvalue;
}
int Get(const string& key, int defaultvalue) const {
string str = Get(key, "");
if("" == str) {
return defaultvalue;
}
const char* operator [](const char* key) const {
if(NULL == key) {
return NULL;
}
map<string, string>::const_iterator it = map_.find(key);
if(map_.end() != it) {
return it->second.c_str();
}
return NULL;
return atoi(str.c_str());
}
const char* operator [] (const char* key) const {
if(NULL == key) {
return NULL;
}
map<string, string>::const_iterator it = map_.find(key);
if(map_.end() != it) {
return it->second.c_str();
}
return NULL;
}
string GetConfigInfo() const {
string res;
res << *this;
return res;
string GetConfigInfo() const {
string res;
res << *this;
return res;
}
private:
void LoadFile(const string& filePath) {
ifstream ifs(filePath.c_str());
assert(ifs);
string line;
vector<string> vecBuf;
size_t lineno = 0;
while(getline(ifs, line)) {
lineno ++;
Trim(line);
if(line.empty() || StartsWith(line, "#")) {
continue;
}
vecBuf.clear();
Split(line, vecBuf, "=");
if(2 != vecBuf.size()) {
fprintf(stderr, "line[%s] illegal.\n", line.c_str());
assert(false);
continue;
}
string& key = vecBuf[0];
string& value = vecBuf[1];
Trim(key);
Trim(value);
if(!map_.insert(make_pair(key, value)).second) {
fprintf(stderr, "key[%s] already exits.\n", key.c_str());
assert(false);
continue;
}
}
ifs.close();
}
private:
void LoadFile(const string& filePath) {
ifstream ifs(filePath.c_str());
assert(ifs);
string line;
vector<string> vecBuf;
size_t lineno = 0;
while(getline(ifs, line)) {
lineno ++;
Trim(line);
if(line.empty() || StartsWith(line, "#")) {
continue;
}
vecBuf.clear();
Split(line, vecBuf, "=");
if(2 != vecBuf.size()) {
fprintf(stderr, "line[%s] illegal.\n", line.c_str());
assert(false);
continue;
}
string& key = vecBuf[0];
string& value = vecBuf[1];
Trim(key);
Trim(value);
if(!map_.insert(make_pair(key, value)).second) {
fprintf(stderr, "key[%s] already exits.\n", key.c_str());
assert(false);
continue;
}
}
ifs.close();
}
friend ostream& operator << (ostream& os, const Config& config);
friend ostream& operator << (ostream& os, const Config& config);
map<string, string> map_;
map<string, string> map_;
}; // class Config
inline ostream& operator << (ostream& os, const Config& config) {
return os << config.map_;
return os << config.map_;
}
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_FILELOCK_HPP
#define LIMONP_FILELOCK_HPP
@ -33,58 +15,58 @@ namespace limonp {
using std::string;
class FileLock {
public:
FileLock() : fd_(-1), ok_(true) {
public:
FileLock() : fd_(-1), ok_(true) {
}
~FileLock() {
if(fd_ > 0) {
Close();
}
~FileLock() {
if(fd_ > 0) {
Close();
}
}
void Open(const string& fname) {
assert(fd_ == -1);
fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
if(fd_ < 0) {
ok_ = false;
err_ = strerror(errno);
}
void Open(const string& fname) {
assert(fd_ == -1);
fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
if(fd_ < 0) {
ok_ = false;
err_ = strerror(errno);
}
}
void Close() {
::close(fd_);
}
void Lock() {
if(LockOrUnlock(fd_, true) < 0) {
ok_ = false;
err_ = strerror(errno);
}
void Close() {
::close(fd_);
}
void Lock() {
if(LockOrUnlock(fd_, true) < 0) {
ok_ = false;
err_ = strerror(errno);
}
}
void UnLock() {
if(LockOrUnlock(fd_, false) < 0) {
ok_ = false;
err_ = strerror(errno);
}
}
bool Ok() const {
return ok_;
}
string Error() const {
return err_;
}
private:
static int LockOrUnlock(int fd, bool lock) {
errno = 0;
struct flock f;
memset(&f, 0, sizeof(f));
f.l_type = (lock ? F_WRLCK : F_UNLCK);
f.l_whence = SEEK_SET;
f.l_start = 0;
f.l_len = 0; // Lock/unlock entire file
return fcntl(fd, F_SETLK, &f);
}
void UnLock() {
if(LockOrUnlock(fd_, false) < 0) {
ok_ = false;
err_ = strerror(errno);
}
}
bool Ok() const {
return ok_;
}
string Error() const {
return err_;
}
private:
static int LockOrUnlock(int fd, bool lock) {
errno = 0;
struct flock f;
memset(&f, 0, sizeof(f));
f.l_type = (lock ? F_WRLCK : F_UNLCK);
f.l_whence = SEEK_SET;
f.l_start = 0;
f.l_len = 0; // Lock/unlock entire file
return fcntl(fd, F_SETLK, &f);
}
int fd_;
bool ok_;
string err_;
int fd_;
bool ok_;
string err_;
}; // class FileLock
}// namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_FORCE_PUBLIC_H
#define LIMONP_FORCE_PUBLIC_H

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_LOCAL_VECTOR_HPP
#define LIMONP_LOCAL_VECTOR_HPP
@ -33,123 +15,126 @@ using namespace std;
const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
template <class T>
class LocalVector {
public:
typedef const T* const_iterator ;
typedef T value_type;
typedef size_t size_type;
private:
T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
T * ptr_;
size_t size_;
size_t capacity_;
public:
LocalVector() {
init_();
};
LocalVector(const LocalVector<T>& vec) {
init_();
*this = vec;
public:
typedef const T* const_iterator ;
typedef T value_type;
typedef size_t size_type;
private:
T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
T * ptr_;
size_t size_;
size_t capacity_;
public:
LocalVector() {
init_();
};
LocalVector(const LocalVector<T>& vec) {
init_();
*this = vec;
}
LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
init_();
while(begin != end) {
push_back(*begin++);
}
LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
init_();
while(begin != end) {
push_back(*begin++);
}
}
LocalVector(size_t size, const T& t) { // TODO: make it faster
init_();
while(size--) {
push_back(t);
}
LocalVector(size_t size, const T& t) { // TODO: make it faster
init_();
while(size--) {
push_back(t);
}
}
~LocalVector() {
if(ptr_ != buffer_) {
free(ptr_);
}
~LocalVector() {
if(ptr_ != buffer_) {
free(ptr_);
}
};
public:
LocalVector<T>& operator = (const LocalVector<T>& vec) {
clear();
size_ = vec.size();
capacity_ = vec.capacity();
if(vec.buffer_ == vec.ptr_) {
memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
ptr_ = buffer_;
} else {
ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
assert(ptr_);
memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
}
return *this;
};
public:
LocalVector<T>& operator = (const LocalVector<T>& vec) {
if(this == &vec){
return *this;
}
clear();
size_ = vec.size();
capacity_ = vec.capacity();
if(vec.buffer_ == vec.ptr_) {
memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
ptr_ = buffer_;
} else {
ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
assert(ptr_);
memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
}
private:
void init_() {
ptr_ = buffer_;
size_ = 0;
capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
return *this;
}
private:
void init_() {
ptr_ = buffer_;
size_ = 0;
capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
}
public:
T& operator [] (size_t i) {
return ptr_[i];
}
const T& operator [] (size_t i) const {
return ptr_[i];
}
void push_back(const T& t) {
if(size_ == capacity_) {
assert(capacity_);
reserve(capacity_ * 2);
}
public:
T& operator [](size_t i) {
return ptr_[i];
ptr_[size_ ++ ] = t;
}
void reserve(size_t size) {
if(size <= capacity_) {
return;
}
const T& operator [](size_t i) const {
return ptr_[i];
T * next = (T*)malloc(sizeof(T) * size);
assert(next);
T * old = ptr_;
ptr_ = next;
memcpy(ptr_, old, sizeof(T) * capacity_);
capacity_ = size;
if(old != buffer_) {
free(old);
}
void push_back(const T& t) {
if(size_ == capacity_) {
assert(capacity_);
reserve(capacity_ * 2);
}
ptr_[size_ ++ ] = t;
}
void reserve(size_t size) {
if(size <= capacity_) {
return;
}
T * next = (T*)malloc(sizeof(T) * size);
assert(next);
T * old = ptr_;
ptr_ = next;
memcpy(ptr_, old, sizeof(T) * capacity_);
capacity_ = size;
if(old != buffer_) {
free(old);
}
}
bool empty() const {
return 0 == size();
}
size_t size() const {
return size_;
}
size_t capacity() const {
return capacity_;
}
const_iterator begin() const {
return ptr_;
}
const_iterator end() const {
return ptr_ + size_;
}
void clear() {
if(ptr_ != buffer_) {
free(ptr_);
}
init_();
}
bool empty() const {
return 0 == size();
}
size_t size() const {
return size_;
}
size_t capacity() const {
return capacity_;
}
const_iterator begin() const {
return ptr_;
}
const_iterator end() const {
return ptr_ + size_;
}
void clear() {
if(ptr_ != buffer_) {
free(ptr_);
}
init_();
}
};
template <class T>
ostream & operator << (ostream& os, const LocalVector<T>& vec) {
if(vec.empty()) {
return os << "[]";
}
os << "[\"" << vec[0];
for(size_t i = 1; i < vec.size(); i++) {
os << "\", \"" << vec[i];
}
os << "\"]";
return os;
if(vec.empty()) {
return os << "[]";
}
os<<"[\""<<vec[0];
for(size_t i = 1; i < vec.size(); i++) {
os<<"\", \""<<vec[i];
}
os<<"\"]";
return os;
}
}

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_LOGGING_HPP
#define LIMONP_LOGGING_HPP
@ -38,55 +20,56 @@
namespace limonp {
enum {
LL_DEBUG = 0,
LL_INFO = 1,
LL_WARNING = 2,
LL_ERROR = 3,
LL_FATAL = 4,
LL_DEBUG = 0,
LL_INFO = 1,
LL_WARNING = 2,
LL_ERROR = 3,
LL_FATAL = 4,
}; // enum
static const char * LOG_LEVEL_ARRAY[] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"};
static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";
static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
class Logger {
public:
Logger(size_t level, const char* filename, int lineno)
: level_(level) {
public:
Logger(size_t level, const char* filename, int lineno)
: level_(level) {
#ifdef LOGGING_LEVEL
if(level_ < LOGGING_LEVEL) {
return;
}
if (level_ < LOGGING_LEVEL) {
return;
}
#endif
assert(level_ <= sizeof(LOG_LEVEL_ARRAY) / sizeof(*LOG_LEVEL_ARRAY));
char buf[32];
time_t now;
time(&now);
strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now));
stream_ << buf
<< " " << filename
<< ":" << lineno
<< " " << LOG_LEVEL_ARRAY[level_]
<< " ";
}
~Logger() {
assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
char buf[32];
time_t now;
time(&now);
struct tm result;
localtime_r(&now, &result);
strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &result);
stream_ << buf
<< " " << filename
<< ":" << lineno
<< " " << LOG_LEVEL_ARRAY[level_]
<< " ";
}
~Logger() {
#ifdef LOGGING_LEVEL
if(level_ < LOGGING_LEVEL) {
return;
}
if (level_ < LOGGING_LEVEL) {
return;
}
#endif
std::cerr << stream_.str() << std::endl;
if(level_ == LL_FATAL) {
abort();
}
std::cerr << stream_.str() << std::endl;
if (level_ == LL_FATAL) {
abort();
}
}
std::ostream& Stream() {
return stream_;
}
std::ostream& Stream() {
return stream_;
}
private:
std::ostringstream stream_;
size_t level_;
private:
std::ostringstream stream_;
size_t level_;
}; // class Logger
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_MUTEX_LOCK_HPP
#define LIMONP_MUTEX_LOCK_HPP
@ -26,40 +8,40 @@
namespace limonp {
class MutexLock: NonCopyable {
public:
MutexLock() {
XCHECK(!pthread_mutex_init(&mutex_, NULL));
}
~MutexLock() {
XCHECK(!pthread_mutex_destroy(&mutex_));
}
pthread_mutex_t* GetPthreadMutex() {
return &mutex_;
}
public:
MutexLock() {
XCHECK(!pthread_mutex_init(&mutex_, NULL));
}
~MutexLock() {
XCHECK(!pthread_mutex_destroy(&mutex_));
}
pthread_mutex_t* GetPthreadMutex() {
return &mutex_;
}
private:
void Lock() {
XCHECK(!pthread_mutex_lock(&mutex_));
}
void Unlock() {
XCHECK(!pthread_mutex_unlock(&mutex_));
}
friend class MutexLockGuard;
private:
void Lock() {
XCHECK(!pthread_mutex_lock(&mutex_));
}
void Unlock() {
XCHECK(!pthread_mutex_unlock(&mutex_));
}
friend class MutexLockGuard;
pthread_mutex_t mutex_;
pthread_mutex_t mutex_;
}; // class MutexLock
class MutexLockGuard: NonCopyable {
public:
explicit MutexLockGuard(MutexLock & mutex)
: mutex_(mutex) {
mutex_.Lock();
}
~MutexLockGuard() {
mutex_.Unlock();
}
private:
MutexLock & mutex_;
public:
explicit MutexLockGuard(MutexLock & mutex)
: mutex_(mutex) {
mutex_.Lock();
}
~MutexLockGuard() {
mutex_.Unlock();
}
private:
MutexLock & mutex_;
}; // class MutexLockGuard
#define MutexLockGuard(x) XCHECK(false);

View File

@ -1,35 +1,19 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
/************************************
************************************/
#ifndef LIMONP_NONCOPYABLE_H
#define LIMONP_NONCOPYABLE_H
namespace limonp {
class NonCopyable {
protected:
NonCopyable() {
}
~NonCopyable() {
}
private:
NonCopyable(const NonCopyable&);
const NonCopyable& operator=(const NonCopyable&);
protected:
NonCopyable() {
}
~NonCopyable() {
}
private:
NonCopyable(const NonCopyable& );
const NonCopyable& operator=(const NonCopyable& );
}; // class NonCopyable
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_STD_EXTEMSION_HPP
#define LIMONP_STD_EXTEMSION_HPP
@ -51,123 +33,123 @@ namespace std {
template<typename T>
ostream& operator << (ostream& os, const vector<T>& v) {
if(v.empty()) {
return os << "[]";
}
os << "[" << v[0];
for(size_t i = 1; i < v.size(); i++) {
os << ", " << v[i];
}
os << "]";
return os;
if(v.empty()) {
return os << "[]";
}
os<<"["<<v[0];
for(size_t i = 1; i < v.size(); i++) {
os<<", "<<v[i];
}
os<<"]";
return os;
}
template<>
inline ostream& operator << (ostream& os, const vector<string>& v) {
if(v.empty()) {
return os << "[]";
}
os << "[\"" << v[0];
for(size_t i = 1; i < v.size(); i++) {
os << "\", \"" << v[i];
}
os << "\"]";
return os;
if(v.empty()) {
return os << "[]";
}
os<<"[\""<<v[0];
for(size_t i = 1; i < v.size(); i++) {
os<<"\", \""<<v[i];
}
os<<"\"]";
return os;
}
template<typename T>
ostream& operator << (ostream& os, const deque<T>& dq) {
if(dq.empty()) {
return os << "[]";
}
os << "[\"" << dq[0];
for(size_t i = 1; i < dq.size(); i++) {
os << "\", \"" << dq[i];
}
os << "\"]";
return os;
if(dq.empty()) {
return os << "[]";
}
os<<"[\""<<dq[0];
for(size_t i = 1; i < dq.size(); i++) {
os<<"\", \""<<dq[i];
}
os<<"\"]";
return os;
}
template<class T1, class T2>
ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
os << pr.first << ":" << pr.second ;
return os;
os << pr.first << ":" << pr.second ;
return os;
}
template<class T>
string& operator << (string& str, const T& obj) {
stringstream ss;
ss << obj; // call ostream& operator << (ostream& os,
return str = ss.str();
stringstream ss;
ss << obj; // call ostream& operator << (ostream& os,
return str = ss.str();
}
template<class T1, class T2>
ostream& operator << (ostream& os, const map<T1, T2>& mp) {
if(mp.empty()) {
os << "{}";
return os;
}
os << '{';
typename map<T1, T2>::const_iterator it = mp.begin();
os << *it;
it++;
while(it != mp.end()) {
os << ", " << *it;
it++;
}
os << '}';
if(mp.empty()) {
os<<"{}";
return os;
}
os<<'{';
typename map<T1, T2>::const_iterator it = mp.begin();
os<<*it;
it++;
while(it != mp.end()) {
os<<", "<<*it;
it++;
}
os<<'}';
return os;
}
template<class T1, class T2>
ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
if(mp.empty()) {
return os << "{}";
}
os << '{';
typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
os << *it;
it++;
while(it != mp.end()) {
os << ", " << *it++;
}
return os << '}';
if(mp.empty()) {
return os << "{}";
}
os<<'{';
typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
os<<*it;
it++;
while(it != mp.end()) {
os<<", "<<*it++;
}
return os<<'}';
}
template<class T>
ostream& operator << (ostream& os, const set<T>& st) {
if(st.empty()) {
os << "{}";
return os;
}
os << '{';
typename set<T>::const_iterator it = st.begin();
os << *it;
it++;
while(it != st.end()) {
os << ", " << *it;
it++;
}
os << '}';
if(st.empty()) {
os << "{}";
return os;
}
os<<'{';
typename set<T>::const_iterator it = st.begin();
os<<*it;
it++;
while(it != st.end()) {
os<<", "<<*it;
it++;
}
os<<'}';
return os;
}
template<class KeyType, class ContainType>
bool IsIn(const ContainType& contain, const KeyType& key) {
return contain.end() != contain.find(key);
return contain.end() != contain.find(key);
}
template<class T>
basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
}
template<class T>
ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
ostreambuf_iterator<T> itr(ofs);
copy(s.begin(), s.end(), itr);
return ofs;
ostreambuf_iterator<T> itr (ofs);
copy(s.begin(), s.end(), itr);
return ofs;
}
} // namespace std

View File

@ -1,27 +1,14 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
/************************************
* file enc : ascii
* author : wuyanyi09@gmail.com
************************************/
#ifndef LIMONP_STR_FUNCTS_H
#define LIMONP_STR_FUNCTS_H
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include <memory.h>
#include <sys/types.h>
#include <fstream>
#include <iostream>
#include <string>
@ -29,14 +16,9 @@
#include <algorithm>
#include <cctype>
#include <map>
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include <memory.h>
#include <functional>
#include <locale>
#include <sstream>
#include <sys/types.h>
#include <iterator>
#include <algorithm>
#include "StdExtension.hpp"
@ -44,339 +26,356 @@
namespace limonp {
using namespace std;
inline string StringFormat(const char* fmt, ...) {
int size = 256;
std::string str;
va_list ap;
while(1) {
str.resize(size);
va_start(ap, fmt);
int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
va_end(ap);
if(n > -1 && n < size) {
str.resize(n);
return str;
}
if(n > -1)
size = n + 1;
else
size *= 2;
int size = 256;
std::string str;
va_list ap;
while (1) {
str.resize(size);
va_start(ap, fmt);
int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
va_end(ap);
if (n > -1 && n < size) {
str.resize(n);
return str;
}
return str;
if (n > -1)
size = n + 1;
else
size *= 2;
}
return str;
}
template<class T>
void Join(T begin, T end, string& res, const string& connector) {
if(begin == end) {
return;
}
stringstream ss;
ss << *begin;
begin++;
while(begin != end) {
ss << connector << *begin;
begin ++;
}
res = ss.str();
if(begin == end) {
return;
}
stringstream ss;
ss<<*begin;
begin++;
while(begin != end) {
ss << connector << *begin;
begin ++;
}
res = ss.str();
}
template<class T>
string Join(T begin, T end, const string& connector) {
string res;
Join(begin, end, res, connector);
return res;
string res;
Join(begin ,end, res, connector);
return res;
}
inline string& Upper(string& str) {
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
return str;
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
return str;
}
inline string& Lower(string& str) {
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
return str;
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
return str;
}
inline bool IsSpace(unsigned c) {
// when passing large int as the argument of isspace, it core dump, so here need a type cast.
return c > 0xff ? false : std::isspace(c & 0xff) != 0;
// when passing large int as the argument of isspace, it core dump, so here need a type cast.
return c > 0xff ? false : std::isspace(c & 0xff);
}
inline std::string& LTrim(std::string &s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
return s;
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
return s;
}
inline std::string& RTrim(std::string &s) {
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
return s;
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
return s;
}
inline std::string& Trim(std::string &s) {
return LTrim(RTrim(s));
return LTrim(RTrim(s));
}
inline std::string& LTrim(std::string & s, char x) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
return s;
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
return s;
}
inline std::string& RTrim(std::string & s, char x) {
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
return s;
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
return s;
}
inline std::string& Trim(std::string &s, char x) {
return LTrim(RTrim(s, x), x);
return LTrim(RTrim(s, x), x);
}
inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
res.clear();
size_t Start = 0;
size_t end = 0;
string sub;
while(Start < src.size()) {
end = src.find_first_of(pattern, Start);
if(string::npos == end || res.size() >= maxsplit) {
sub = src.substr(Start);
res.push_back(sub);
return;
}
sub = src.substr(Start, end - Start);
res.push_back(sub);
Start = end + 1;
res.clear();
size_t Start = 0;
size_t end = 0;
string sub;
while(Start < src.size()) {
end = src.find_first_of(pattern, Start);
if(string::npos == end || res.size() >= maxsplit) {
sub = src.substr(Start);
res.push_back(sub);
return;
}
return;
sub = src.substr(Start, end - Start);
res.push_back(sub);
Start = end + 1;
}
return;
}
inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
vector<string> res;
Split(src, res, pattern, maxsplit);
return res;
vector<string> res;
Split(src, res, pattern, maxsplit);
return res;
}
inline bool StartsWith(const string& str, const string& prefix) {
if(prefix.length() > str.length()) {
return false;
}
return 0 == str.compare(0, prefix.length(), prefix);
if(prefix.length() > str.length()) {
return false;
}
return 0 == str.compare(0, prefix.length(), prefix);
}
inline bool EndsWith(const string& str, const string& suffix) {
if(suffix.length() > str.length()) {
return false;
}
return 0 == str.compare(str.length() - suffix.length(), suffix.length(), suffix);
if(suffix.length() > str.length()) {
return false;
}
return 0 == str.compare(str.length() - suffix.length(), suffix.length(), suffix);
}
inline bool IsInStr(const string& str, char ch) {
return str.find(ch) != string::npos;
return str.find(ch) != string::npos;
}
inline uint16_t TwocharToUint16(char high, char low) {
return (((uint16_t(high) & 0x00ff) << 8) | (uint16_t(low) & 0x00ff));
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
}
template <class Uint16Container>
bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
if(!str) {
return false;
if(!str) {
return false;
}
char ch1, ch2;
uint16_t tmp;
vec.clear();
for(size_t i = 0; i < len;) {
if(!(str[i] & 0x80)) { // 0xxxxxxx
vec.push_back(str[i]);
i++;
} else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 2;
} else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 3;
} else {
return false;
}
char ch1, ch2;
uint16_t tmp;
vec.clear();
for(size_t i = 0; i < len;) {
if(!(str[i] & 0x80)) { // 0xxxxxxx
vec.push_back(str[i]);
i++;
} else if((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i + 1] & 0x3f) | ((str[i] & 0x03) << 6);
tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 2;
} else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
ch1 = ((uint8_t)str[i] << 4) | ((str[i + 1] >> 2) & 0x0f);
ch2 = (((uint8_t)str[i + 1] << 6) & 0xc0) | (str[i + 2] & 0x3f);
tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
vec.push_back(tmp);
i += 3;
} else {
return false;
}
}
return true;
}
return true;
}
template <class Uint16Container>
bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
return Utf8ToUnicode(str.c_str(), str.size(), vec);
return Utf8ToUnicode(str.c_str(), str.size(), vec);
}
template <class Uint32Container>
bool Utf8ToUnicode32(const char * str, size_t size, Uint32Container& vec) {
uint32_t tmp;
vec.clear();
for(size_t i = 0; i < size;) {
if(!(str[i] & 0x80)) { // 0xxxxxxx
// 7bit, total 7bit
tmp = (uint8_t)(str[i]) & 0x7f;
i++;
} else if ((uint8_t)str[i] <= 0xdf && i + 1 < size) { // 110xxxxxx
// 5bit, total 5bit
tmp = (uint8_t)(str[i]) & 0x1f;
// 6bit, total 11bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+1]) & 0x3f;
i += 2;
} else if((uint8_t)str[i] <= 0xef && i + 2 < size) { // 1110xxxxxx
// 4bit, total 4bit
tmp = (uint8_t)(str[i]) & 0x0f;
// 6bit, total 10bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+1]) & 0x3f;
// 6bit, total 16bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+2]) & 0x3f;
i += 3;
} else if((uint8_t)str[i] <= 0xf7 && i + 3 < size) { // 11110xxxx
// 3bit, total 3bit
tmp = (uint8_t)(str[i]) & 0x07;
// 6bit, total 9bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+1]) & 0x3f;
// 6bit, total 15bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+2]) & 0x3f;
// 6bit, total 21bit
tmp <<= 6;
tmp |= (uint8_t)(str[i+3]) & 0x3f;
i += 4;
} else {
return false;
}
vec.push_back(tmp);
}
return true;
}
template <class Uint32Container>
bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
uint32_t tmp;
vec.clear();
for(size_t i = 0; i < str.size();) {
if(!(str[i] & 0x80)) { // 0xxxxxxx
// 7bit, total 7bit
tmp = (uint8_t)(str[i]) & 0x7f;
i++;
} else if((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) { // 110xxxxxx
// 5bit, total 5bit
tmp = (uint8_t)(str[i]) & 0x1f;
return Utf8ToUnicode32(str.data(), str.size(), vec);
}
// 6bit, total 11bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
i += 2;
} else if((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxxxx
// 4bit, total 4bit
tmp = (uint8_t)(str[i]) & 0x0f;
// 6bit, total 10bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
// 6bit, total 16bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 2]) & 0x3f;
i += 3;
} else if((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxxx
// 3bit, total 3bit
tmp = (uint8_t)(str[i]) & 0x07;
// 6bit, total 9bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
// 6bit, total 15bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 2]) & 0x3f;
// 6bit, total 21bit
tmp <<= 6;
tmp |= (uint8_t)(str[i + 3]) & 0x3f;
i += 4;
} else {
return false;
}
vec.push_back(tmp);
inline int UnicodeToUtf8Bytes(uint32_t ui){
if(ui <= 0x7f) {
return 1;
} else if(ui <= 0x7ff) {
return 2;
} else if(ui <= 0xffff) {
return 3;
} else {
return 4;
}
return true;
}
template <class Uint32ContainerConIter>
void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
res.clear();
uint32_t ui;
while(begin != end) {
ui = *begin;
if(ui <= 0x7f) {
res += char(ui);
} else if(ui <= 0x7ff) {
res += char(((ui >> 6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
} else if(ui <= 0xffff) {
res += char(((ui >> 12) & 0x0f) | 0xe0);
res += char(((ui >> 6) & 0x3f) | 0x80);
res += char((ui & 0x3f) | 0x80);
} else {
res += char(((ui >> 18) & 0x03) | 0xf0);
res += char(((ui >> 12) & 0x3f) | 0x80);
res += char(((ui >> 6) & 0x3f) | 0x80);
res += char((ui & 0x3f) | 0x80);
}
begin ++;
res.clear();
uint32_t ui;
while(begin != end) {
ui = *begin;
if(ui <= 0x7f) {
res += char(ui);
} else if(ui <= 0x7ff) {
res += char(((ui >> 6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
} else if(ui <= 0xffff) {
res += char(((ui >> 12) & 0x0f) | 0xe0);
res += char(((ui >> 6) & 0x3f) | 0x80);
res += char((ui & 0x3f) | 0x80);
} else {
res += char(((ui >> 18) & 0x03) | 0xf0);
res += char(((ui >> 12) & 0x3f) | 0x80);
res += char(((ui >> 6) & 0x3f) | 0x80);
res += char((ui & 0x3f) | 0x80);
}
begin ++;
}
}
template <class Uint16ContainerConIter>
void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
res.clear();
uint16_t ui;
while(begin != end) {
ui = *begin;
if(ui <= 0x7f) {
res += char(ui);
} else if(ui <= 0x7ff) {
res += char(((ui >> 6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
} else {
res += char(((ui >> 12) & 0x0f) | 0xe0);
res += char(((ui >> 6) & 0x3f) | 0x80);
res += char((ui & 0x3f) | 0x80);
}
begin ++;
res.clear();
uint16_t ui;
while(begin != end) {
ui = *begin;
if(ui <= 0x7f) {
res += char(ui);
} else if(ui <= 0x7ff) {
res += char(((ui>>6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
} else {
res += char(((ui >> 12) & 0x0f )| 0xe0);
res += char(((ui>>6) & 0x3f )| 0x80 );
res += char((ui & 0x3f) | 0x80);
}
begin ++;
}
}
template <class Uint16Container>
bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
vec.clear();
if(!str) {
return true;
}
size_t i = 0;
while(i < len) {
if(0 == (str[i] & 0x80)) {
vec.push_back(uint16_t(str[i]));
i++;
} else {
if(i + 1 < len) { //&& (str[i+1] & 0x80))
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff) << 8) | (uint16_t(str[i + 1]) & 0x00ff));
vec.push_back(tmp);
i += 2;
} else {
return false;
}
}
}
vec.clear();
if(!str) {
return true;
}
size_t i = 0;
while(i < len) {
if(0 == (str[i] & 0x80)) {
vec.push_back(uint16_t(str[i]));
i++;
} else {
if(i + 1 < len) { //&& (str[i+1] & 0x80))
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
vec.push_back(tmp);
i += 2;
} else {
return false;
}
}
}
return true;
}
template <class Uint16Container>
bool GBKTrans(const string& str, Uint16Container& vec) {
return GBKTrans(str.c_str(), str.size(), vec);
return GBKTrans(str.c_str(), str.size(), vec);
}
template <class Uint16ContainerConIter>
void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
res.clear();
//pair<char, char> pa;
char first, second;
while(begin != end) {
//pa = uint16ToChar2(*begin);
first = ((*begin) >> 8) & 0x00ff;
second = (*begin) & 0x00ff;
if(first & 0x80) {
res += first;
res += second;
} else {
res += second;
}
begin++;
res.clear();
//pair<char, char> pa;
char first, second;
while(begin != end) {
//pa = uint16ToChar2(*begin);
first = ((*begin)>>8) & 0x00ff;
second = (*begin) & 0x00ff;
if(first & 0x80) {
res += first;
res += second;
} else {
res += second;
}
begin++;
}
}
/*
* format example: "%Y-%m-%d %H:%M:%S"
*/
inline void GetTime(const string& format, string& timeStr) {
time_t timeNow;
time(&timeNow);
timeStr.resize(64);
size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
timeStr.resize(len);
}
// inline void GetTime(const string& format, string& timeStr) {
// time_t timeNow;
// time(&timeNow);
// timeStr.resize(64);
// size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
// timeStr.resize(len);
// }
inline string PathJoin(const string& path1, const string& path2) {
if(EndsWith(path1, "/")) {
return path1 + path2;
}
return path1 + "/" + path2;
if(EndsWith(path1, "/")) {
return path1 + path2;
}
return path1 + "/" + path2;
}
}

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_THREAD_HPP
#define LIMONP_THREAD_HPP
@ -25,36 +7,36 @@
namespace limonp {
class IThread: NonCopyable {
public:
IThread(): isStarted(false), isJoined(false) {
public:
IThread(): isStarted(false), isJoined(false) {
}
virtual ~IThread() {
if(isStarted && !isJoined) {
XCHECK(!pthread_detach(thread_));
}
virtual ~IThread() {
if(isStarted && !isJoined) {
XCHECK(!pthread_detach(thread_));
}
};
};
virtual void Run() = 0;
void Start() {
XCHECK(!isStarted);
XCHECK(!pthread_create(&thread_, NULL, Worker, this));
isStarted = true;
}
void Join() {
XCHECK(!isJoined);
XCHECK(!pthread_join(thread_, NULL));
isJoined = true;
}
private:
static void * Worker(void * data) {
IThread * ptr = (IThread*) data;
ptr->Run();
return NULL;
}
virtual void Run() = 0;
void Start() {
XCHECK(!isStarted);
XCHECK(!pthread_create(&thread_, NULL, Worker, this));
isStarted = true;
}
void Join() {
XCHECK(!isJoined);
XCHECK(!pthread_join(thread_, NULL));
isJoined = true;
}
private:
static void * Worker(void * data) {
IThread * ptr = (IThread* ) data;
ptr->Run();
return NULL;
}
pthread_t thread_;
bool isStarted;
bool isJoined;
pthread_t thread_;
bool isStarted;
bool isJoined;
}; // class IThread
} // namespace limonp

View File

@ -1,21 +1,3 @@
/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
*/
#ifndef LIMONP_THREAD_POOL_HPP
#define LIMONP_THREAD_POOL_HPP
@ -30,73 +12,73 @@ using namespace std;
//class ThreadPool;
class ThreadPool: NonCopyable {
public:
class Worker: public IThread {
public:
Worker(ThreadPool* pool): ptThreadPool_(pool) {
assert(ptThreadPool_);
}
virtual ~Worker() {
}
virtual void Run() {
while(true) {
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
if(closure == NULL) {
break;
}
try {
closure->Run();
} catch(std::exception& e) {
XLOG(ERROR) << e.what();
} catch(...) {
XLOG(ERROR) << " unknown exception.";
}
delete closure;
}
}
private:
ThreadPool * ptThreadPool_;
}; // class Worker
ThreadPool(size_t thread_num)
: threads_(thread_num),
queue_(thread_num) {
assert(thread_num);
for(size_t i = 0; i < threads_.size(); i ++) {
threads_[i] = new Worker(this);
}
public:
class Worker: public IThread {
public:
Worker(ThreadPool* pool): ptThreadPool_(pool) {
assert(ptThreadPool_);
}
~ThreadPool() {
Stop();
virtual ~Worker() {
}
void Start() {
for(size_t i = 0; i < threads_.size(); i++) {
threads_[i]->Start();
virtual void Run() {
while (true) {
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
if (closure == NULL) {
break;
}
}
void Stop() {
for(size_t i = 0; i < threads_.size(); i ++) {
queue_.Push(NULL);
try {
closure->Run();
} catch(std::exception& e) {
XLOG(ERROR) << e.what();
} catch(...) {
XLOG(ERROR) << " unknown exception.";
}
for(size_t i = 0; i < threads_.size(); i ++) {
threads_[i]->Join();
delete threads_[i];
}
threads_.clear();
delete closure;
}
}
private:
ThreadPool * ptThreadPool_;
}; // class Worker
void Add(ClosureInterface* task) {
assert(task);
queue_.Push(task);
ThreadPool(size_t thread_num)
: threads_(thread_num),
queue_(thread_num) {
assert(thread_num);
for(size_t i = 0; i < threads_.size(); i ++) {
threads_[i] = new Worker(this);
}
}
~ThreadPool() {
Stop();
}
private:
friend class Worker;
void Start() {
for(size_t i = 0; i < threads_.size(); i++) {
threads_[i]->Start();
}
}
void Stop() {
for(size_t i = 0; i < threads_.size(); i ++) {
queue_.Push(NULL);
}
for(size_t i = 0; i < threads_.size(); i ++) {
threads_[i]->Join();
delete threads_[i];
}
threads_.clear();
}
vector<IThread*> threads_;
BoundedBlockingQueue<ClosureInterface*> queue_;
void Add(ClosureInterface* task) {
assert(task);
queue_.Push(task);
}
private:
friend class Worker;
vector<IThread*> threads_;
BoundedBlockingQueue<ClosureInterface*> queue_;
}; // class ThreadPool
} // namespace limonp

View File

@ -20,6 +20,7 @@
*
*/
#include "file-utils.h"
#include <QXmlStreamReader>
using namespace Zeeker;
size_t FileUtils::_max_index_count = 0;
@ -488,6 +489,22 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
fileR.open(QIODevice::ReadOnly); //读取方式打开
QXmlStreamReader reader(&fileR);
while (!reader.atEnd()){
if(reader.readNextStartElement() and reader.name().toString() == "t"){
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
break;
}
}
}
fileR.close();
file.close();
return;
/* //原加载DOM文档方式
QDomDocument doc;
doc.setContent(fileR.readAll());
fileR.close();
@ -512,6 +529,7 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
}
file.close();
return;
*/
}
void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
@ -529,6 +547,31 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
}
if(fileList.isEmpty())
return;
for(int i = 0; i < fileList.size(); ++i){
QString name = prefix + QString::number(i + 1) + ".xml";
if(!file.setCurrentFile(name)) {
continue;
}
QuaZipFile fileR(&file);
fileR.open(QIODevice::ReadOnly);
QXmlStreamReader reader(&fileR);
while (!reader.atEnd()){
if(reader.readNextStartElement() and reader.name().toString() == "t"){
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
break;
}
}
}
fileR.close();
}
file.close();
return;
/*
QDomElement sptree;
QDomElement sp;
QDomElement txbody;
@ -596,6 +639,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
}
file.close();
return;
*/
}
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
@ -610,8 +654,24 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
return;
QuaZipFile fileR(&file);
fileR.open(QIODevice::ReadOnly); //读取方式打开
fileR.open(QIODevice::ReadOnly);
QXmlStreamReader reader(&fileR);
while (!reader.atEnd()){
if(reader.readNextStartElement() and reader.name().toString() == "t"){
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
break;
}
}
}
fileR.close();
file.close();
return;
/*
QDomDocument doc;
doc.setContent(fileR.readAll());
fileR.close();
@ -641,6 +701,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
}
file.close();
return;
*/
}
void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
@ -650,7 +711,7 @@ void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
const QRectF qf;
int pageNum = doc->numPages();
for(int i = 0; i < pageNum; ++i) {
textcontent.append(doc->page(i)->text(qf).replace("\n", ""));
textcontent.append(doc->page(i)->text(qf).replace("\n", "").replace("\r", " "));
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
break;
}
@ -679,7 +740,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) {
stream.setCodec(codec);
uchardet_delete(chardet);
textcontent = stream.readAll().replace("\n", "");
textcontent = stream.readAll().replace("\n", "").replace("\r", " ");
file.close();
encodedString.clear();

View File

@ -110,17 +110,21 @@ void ConstructDocumentForContent::run() {
return;
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000).toStdString());
Document doc;
doc.setData(content);
doc.setUniqueTerm(uniqueterm);
doc.addTerm(upTerm);
doc.addValue(m_path);
for(int i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
//'\xEF\xBC\x8C' is "" "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
//修改函数返回类型修改入参为std::string引用--jxx20210519
std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
for(size_t i = 0; i < term.size(); ++i) {
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
}
Zeeker::_mutex_doc_list_content.lock();

View File

@ -37,6 +37,17 @@ void Document::addPosting(std::string term, QVector<size_t> offset, int weight)
}
}
void Document::addPosting(std::string term, std::vector<size_t> offset, int weight) {
if(term == "")
return;
if(term.length() > 240)
term = QString::fromStdString(term).left(30).toStdString();
for(size_t i : offset) {
m_document.add_posting(term, i, weight);
}
}
void Document::addPosting(std::string term, unsigned int offset, int weight) {
if(term == "")
return;

View File

@ -41,6 +41,7 @@ public:
}
void setData(QString &data);
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
void addPosting(std::string term, std::vector<size_t> offset, int weight = 1);
void addPosting(std::string term, unsigned int offset, int weight = 1);
void addTerm(QString term);
void addValue(QString value);

View File

@ -31,8 +31,9 @@ void FileReader::getTextContent(QString path, QString &textContent) {
QFileInfo file(path);
QString strsfx = file.suffix();
if(name == "application/zip") {
if(strsfx.endsWith("docx"))
if(strsfx.endsWith("docx")){
FileUtils::getDocxTextContent(path, textContent);
}
if(strsfx.endsWith("pptx"))
FileUtils::getPptxTextContent(path, textContent);
if(strsfx.endsWith("xlsx"))

View File

@ -46,7 +46,54 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
// qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
this->q_content_index->enqueue(fileInfo.absoluteFilePath());
//this->q_content_index->enqueue(fileInfo.absoluteFilePath());
if(fileInfo.fileName().split(".").last() == "docx"){
QuaZip file(fileInfo.absoluteFilePath());
if(!file.open(QuaZip::mdUnzip))
return;
if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive))
return;
QuaZipFile fileR(&file);
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//docx解压缩后的xml文件为实际需要解析文件大小
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
qDebug() << "文件大小:" << fileR.usize();
file.close();
}else if(fileInfo.fileName().split(".").last() == "pptx"){
QuaZip file(fileInfo.absoluteFilePath());
if(!file.open(QuaZip::mdUnzip))
return;
QString prefix("ppt/slides/slide");
qint64 fileSize(0);
qint64 fileIndex(0);
for(QString i : file.getFileNameList()) {
if(i.startsWith(prefix)){
QString name = prefix + QString::number(fileIndex + 1) + ".xml";
fileIndex++;
if(!file.setCurrentFile(name)) {
continue;
}
QuaZipFile fileR(&file);
fileSize += fileR.usize();
}
}
file.close();
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
qDebug() << "文件大小:" << fileSize;
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//pptx解压缩后的xml文件为实际需要解析文件大小
}else if(fileInfo.fileName().split(".").last() == "xlsx"){
QuaZip file(fileInfo.absoluteFilePath());
if(!file.open(QuaZip::mdUnzip))
return;
if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive))
return;
QuaZipFile fileR(&file);
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//xlsx解压缩后的xml文件为实际解析文件大小
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
qDebug() << "文件大小:" << fileR.usize();
file.close();
}else{
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
}
}
}
@ -90,8 +137,9 @@ void FirstIndex::run() {
this->q_index = new QQueue<QVector<QString>>();
//this->q_content_index = new QQueue<QString>();
NEW_QUEUE(this->q_content_index);
//NEW_QUEUE(this->q_content_index);
// this->mlm = new MessageListManager();
this->q_content_index = new QQueue<QPair<QString,qint64>>();
int fifo_fd;
char buffer[2];
@ -168,9 +216,14 @@ void FirstIndex::run() {
qDebug() << "q_content_index:" << q_content_index->size();
while(!this->q_content_index->empty()) {
// for (size_t i = 0; (i < this->u_send_length) && (!this->q_content_index->empty()); ++i){
for(size_t i = 0; (i < 30) && (!this->q_content_index->empty()); ++i) {
tmp->enqueue(this->q_content_index->dequeue());
qint64 fileSize = 0;
//修改一次处理的数据量从30个文件改为文件总大小为50M以下50M为暂定值--jxx20210519
for(size_t i = 0;/* (i < 30) && */(fileSize < 50*1024*1024) && (!this->q_content_index->empty()); ++i) {
QPair<QString,qint64> tempPair = this->q_content_index->dequeue();
fileSize += tempPair.second;
tmp->enqueue(tempPair.first);
}
// qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";
this->p_indexGenerator->creatAllIndex(tmp);
tmp->clear();
}

View File

@ -62,7 +62,9 @@ private:
//test
QQueue<QVector<QString>>* q_index;
QQueue<QString>* q_content_index;
// QQueue<QString>* q_content_index;
//修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519
QQueue<QPair<QString,qint64>>* q_content_index;
const QMap<QString, bool> targetFileTypeMap = {
std::map<QString, bool>::value_type("doc", true),

View File

@ -27,7 +27,7 @@ QMutex SearchManager::m_mutex1;
QMutex SearchManager::m_mutex2;
QMutex SearchManager::m_mutex3;
SearchManager::SearchManager(QObject *parent) : QObject(parent) {
m_pool.setMaxThreadCount(2);
m_pool.setMaxThreadCount(3);
m_pool.setExpiryTimeout(1000);
}
@ -280,29 +280,15 @@ int FileContentSearch::keywordSearchContent() {
words.append(sKeyWord.at(i).word).append(" ");
}
Xapian::Query query = qp.parse_query(words);
// Xapian::Query query = qp.parse_query(keyword.toStdString());
// QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(keyword);
// //Creat a query
// std::string words;
// for(int i=0;i<sKeyWord.size();i++)
// {
// words.append(sKeyWord.at(i).word).append(" ");
// }
// Xapian::Query query = qp.parse_query(words);
// std::vector<Xapian::Query> v;
// for(int i=0;i<sKeyWord.size();i++)
// {
// v.push_back(Xapian::Query(sKeyWord.at(i).word));
// qDebug()<<QString::fromStdString(sKeyWord.at(i).word);
// }
// Xapian::Query queryPhrase =Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
std::vector<Xapian::Query> v;
for(int i=0; i<sKeyWord.size(); i++) {
v.push_back(Xapian::Query(sKeyWord.at(i).word));
qDebug() << QString::fromStdString(sKeyWord.at(i).word);
}
Xapian::Query query = Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
qDebug() << "keywordSearchContent:" << QString::fromStdString(query.get_description());
enquire.set_query(query);

View File

@ -67,7 +67,7 @@ unix {
INSTALLS += target
header.path = /usr/include/ukui-search
header.files += *.h index/*.h appsearch/*.h settingsearch/*.h
header.files += *.h index/*.h appsearch/*.h settingsearch/*.h plugininterface/*.h
INSTALLS += header
}

View File

@ -4963,7 +4963,7 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS,
if(bUsesUnicode) {
ushort* usAucData = (ushort*)ptaucBytes;
content.append(QString::fromUtf16(usAucData).replace("\r", ""));
content.append(QString::fromUtf16(usAucData).replace("\n", "").replace("\r", " "));
usAucData = (ushort*)xfree((void*)usAucData);
ptaucBytes = NULL;
if(content.length() >= 682666) //20480000/3
@ -5066,7 +5066,7 @@ int KBinaryParser:: readSSTRecord(readDataParam &rdParam, ppsInfoType PPS_info,
} else {
ushort* usData = (ushort*)chData;
content.append(QString::fromUtf16(usData).replace("\r", ""));
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
usData = (ushort*)xfree((void*)usData);
chData = NULL;
if(content.length() >= 682666) //20480000/3
@ -5131,7 +5131,7 @@ ULONG KBinaryParser::readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* au
return -1;
ushort* usData = (ushort*)chData;
content.append(QString::fromUtf16(usData).replace("\r", ""));
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
usData = (ushort*)xfree((void*)usData);
chData = NULL;

View File

@ -32,6 +32,7 @@
#include <QStyleOption>
#include <QApplication>
#include <QPainter>
#include <QPainterPath>
namespace Zeeker {
class CreateIndexAskDialog : public QDialog {