commit
1117d75025
|
@ -30,12 +30,12 @@ ChineseSegmentation::ChineseSegmentation() {
|
|||
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
|
||||
|
||||
m_jieba = new cppjieba::Jieba(DICT_PATH,
|
||||
HMM_PATH,
|
||||
USER_DICT_PATH,
|
||||
IDF_PATH,
|
||||
STOP_WORD_PATH);
|
||||
STOP_WORD_PATH,
|
||||
"");
|
||||
}
|
||||
|
||||
ChineseSegmentation::~ChineseSegmentation() {
|
||||
|
@ -72,6 +72,15 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
|
|||
|
||||
}
|
||||
|
||||
std::vector<cppjieba::KeywordExtractor::Word> ChineseSegmentation::callSegementStd(const std::string &str) {
|
||||
|
||||
const size_t topk = -1;
|
||||
std::vector<cppjieba::KeywordExtractor::Word> keywordres;
|
||||
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
||||
|
||||
return keywordres;
|
||||
}
|
||||
|
||||
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
|
||||
for(auto i : keywordres) {
|
||||
SKeyWord temp;
|
||||
|
|
|
@ -48,6 +48,9 @@ public:
|
|||
static ChineseSegmentation *getInstance();
|
||||
~ChineseSegmentation();
|
||||
QVector<SKeyWord> callSegement(std::string s);
|
||||
// Added callSegementStd: returns std::vector<cppjieba::KeywordExtractor::Word> and simplifies the internal processing flow --jxx20210517
|
||||
// Changed the parameter to be passed by reference and removed the QString <-> std::string conversion code --jxx20210519
|
||||
std::vector<cppjieba::KeywordExtractor::Word> callSegementStd(const std::string& str);
|
||||
void convert(std::vector<cppjieba::KeywordExtractor::Word>& keywordres, QVector<SKeyWord>& kw);
|
||||
private:
|
||||
static QMutex m_mutex;
|
||||
|
|
|
@ -0,0 +1,286 @@
|
|||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <QDebug>
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
#include "limonp/Md5.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "darts.h"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using std::pair;
|
||||
|
||||
/// One dictionary entry as loaded from the text dictionaries, before it is
/// packed into the double-array cache.
struct DatElement {
    std::string word;   ///< UTF-8 key stored in the trie
    std::string tag;    ///< part-of-speech tag
    double weight = 0;  ///< word weight (frequency, later replaced by its log-probability)

    /// Orders by `word` ascending; entries with the same word are ordered by
    /// `weight` descending, so the heaviest duplicate sorts first.
    bool operator < (const DatElement & b) const {
        // Single three-way comparison instead of an equality test followed by '<'.
        const int order = word.compare(b.word);

        if (0 == order) {
            return weight > b.weight;
        }

        return order < 0;
    }
};
|
||||
|
||||
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
||||
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
||||
}
|
||||
|
||||
/// Fixed-size (16 byte) dictionary record as stored in the cache file and
/// read back directly from the memory mapping.
struct DatMemElem {
    double weight = 0.0;  ///< word weight
    char tag[8] = {};     ///< part-of-speech tag, NUL-padded, at most 7 characters when set via SetTag

    /// Stores `str` as the tag, truncated to sizeof(tag) - 1 bytes so that the
    /// buffer always ends with at least one NUL.
    void SetTag(const std::string & str) {
        memset(&tag[0], 0, sizeof(tag));
        strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
    }

    /// Returns the tag up to the first NUL.
    /// The search is bounded by the buffer size: records come straight from a
    /// cache file on disk, so a corrupt file must not make this read past `tag`.
    std::string GetTag() const {
        const char * const tag_end = std::find(&tag[0], &tag[0] + sizeof(tag), '\0');
        return std::string(&tag[0], tag_end);
    }
};
|
||||
|
||||
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
|
||||
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
|
||||
}
|
||||
|
||||
// One node of the word graph that DatTrie::Find builds, one node per rune position.
struct DatDag {
|
||||
// Outgoing edges as (index of the rune just past the word, dictionary record or nullptr).
// DatTrie::Find always stores the one-rune edge first, at nexts[0]; its record stays
// nullptr when that single rune is not in the dictionary.
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
|
||||
// NOTE(review): max_weight and max_next are not written anywhere in this header;
// presumably scratch fields for the segmenters' best-path search - confirm against the callers.
double max_weight;
|
||||
int max_next;
|
||||
};
|
||||
|
||||
typedef Darts::DoubleArray JiebaDAT;
|
||||
|
||||
|
||||
// Header written at offset 0 of the cache file. The file layout is:
//   CacheFileHeader | elements_num * DatMemElem | dat_size * (double-array unit size) bytes
// DatTrie::InitAttachDat checks the file length against exactly this sum.
struct CacheFileHeader {
|
||||
// MD5 of the source dictionaries as 32 hex characters, stored without a NUL terminator;
// compared with memcmp against the MD5 the caller expects.
char md5_hex[32] = {};
|
||||
// Value of DatTrie's min_weight_ at build time, restored on attach.
double min_weight = 0;
|
||||
// Number of DatMemElem records that follow the header.
uint32_t elements_num = 0;
|
||||
// Size of the double array in units (the value passed to set_array), not in bytes.
uint32_t dat_size = 0;
|
||||
};
|
||||
|
||||
static_assert(sizeof(DatMemElem) == 16, "DatMemElem length invalid");
|
||||
static_assert((sizeof(CacheFileHeader) % sizeof(DatMemElem)) == 0, "DatMemElem CacheFileHeader length equal");
|
||||
|
||||
|
||||
class DatTrie {
|
||||
public:
|
||||
DatTrie() {}
|
||||
~DatTrie() {
|
||||
::munmap(mmap_addr_, mmap_length_);
|
||||
mmap_addr_ = nullptr;
|
||||
mmap_length_ = 0;
|
||||
|
||||
::close(mmap_fd_);
|
||||
mmap_fd_ = -1;
|
||||
}
|
||||
|
||||
const DatMemElem * Find(const string & key) const {
|
||||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result);
|
||||
|
||||
if ((0 == find_result.length) || (find_result.value < 0) || (find_result.value >= elements_num_)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return &elements_ptr_[ find_result.value ];
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||
|
||||
res.clear();
|
||||
res.resize(end - begin);
|
||||
|
||||
string text_str;
|
||||
EncodeRunesToString(begin, end, text_str);
|
||||
|
||||
static const size_t max_num = 128;
|
||||
JiebaDAT::result_pair_type result_pairs[max_num] = {};
|
||||
|
||||
for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
|
||||
|
||||
std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
|
||||
|
||||
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
|
||||
|
||||
for (std::size_t idx = 0; idx < num_results; ++idx) {
|
||||
auto & match = result_pairs[idx];
|
||||
|
||||
if ((match.value < 0) || (match.value >= elements_num_)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
|
||||
|
||||
if (char_num > max_word_len) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto pValue = &elements_ptr_[match.value];
|
||||
|
||||
if (1 == char_num) {
|
||||
res[i].nexts[0].second = pValue;
|
||||
continue;
|
||||
}
|
||||
|
||||
res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
|
||||
}
|
||||
|
||||
begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
|
||||
}
|
||||
}
|
||||
|
||||
double GetMinWeight() const {
|
||||
return min_weight_;
|
||||
}
|
||||
|
||||
void SetMinWeight(double d) {
|
||||
min_weight_ = d ;
|
||||
}
|
||||
|
||||
bool InitBuildDat(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
||||
BuildDatCache(elements, dat_cache_file, md5);
|
||||
return InitAttachDat(dat_cache_file, md5);
|
||||
}
|
||||
|
||||
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
|
||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (mmap_fd_ < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
|
||||
assert(seek_off >= 0);
|
||||
mmap_length_ = seek_off;
|
||||
|
||||
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
|
||||
assert(MAP_FAILED != mmap_addr_);
|
||||
|
||||
assert(mmap_length_ >= sizeof(CacheFileHeader));
|
||||
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
|
||||
elements_num_ = header.elements_num;
|
||||
min_weight_ = header.min_weight;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(DatMemElem) + header.dat_size * dat_.unit_size());
|
||||
elements_ptr_ = (const DatMemElem *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
||||
std::sort(elements.begin(), elements.end());
|
||||
|
||||
vector<const char*> keys_ptr_vec;
|
||||
vector<int> values_vec;
|
||||
vector<DatMemElem> mem_elem_vec;
|
||||
|
||||
keys_ptr_vec.reserve(elements.size());
|
||||
values_vec.reserve(elements.size());
|
||||
mem_elem_vec.reserve(elements.size());
|
||||
|
||||
CacheFileHeader header;
|
||||
header.min_weight = min_weight_;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
|
||||
|
||||
for (size_t i = 0; i < elements.size(); ++i) {
|
||||
keys_ptr_vec.push_back(elements[i].word.data());
|
||||
values_vec.push_back(i);
|
||||
mem_elem_vec.push_back(DatMemElem());
|
||||
auto & mem_elem = mem_elem_vec.back();
|
||||
mem_elem.weight = elements[i].weight;
|
||||
mem_elem.SetTag(elements[i].tag);
|
||||
}
|
||||
|
||||
auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
|
||||
assert(0 == ret);
|
||||
header.elements_num = mem_elem_vec.size();
|
||||
header.dat_size = dat_.size();
|
||||
|
||||
{
|
||||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
::umask(S_IWGRP | S_IWOTH);
|
||||
//const int fd =::mkstemp(&tmp_filepath[0]);
|
||||
//原mkstemp用法有误,已修复--jxx20210519
|
||||
const int fd =::mkstemp((char *)tmp_filepath.data());
|
||||
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
|
||||
assert(fd >= 0);
|
||||
::fchmod(fd, 0644);
|
||||
|
||||
auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
|
||||
write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(mem_elem_vec[0]) * mem_elem_vec.size());
|
||||
write_bytes += ::write(fd, dat_.array(), dat_.total_size());
|
||||
|
||||
assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(mem_elem_vec[0]) + dat_.total_size());
|
||||
::close(fd);
|
||||
|
||||
const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
|
||||
assert(0 == rename_ret);
|
||||
}
|
||||
}
|
||||
|
||||
DatTrie(const DatTrie &);
|
||||
DatTrie &operator=(const DatTrie &);
|
||||
|
||||
private:
|
||||
JiebaDAT dat_;
|
||||
const DatMemElem * elements_ptr_ = nullptr;
|
||||
size_t elements_num_ = 0;
|
||||
double min_weight_ = 0;
|
||||
|
||||
int mmap_fd_ = -1;
|
||||
size_t mmap_length_ = 0;
|
||||
char * mmap_addr_ = nullptr;
|
||||
};
|
||||
|
||||
|
||||
inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum) {
|
||||
limonp::MD5 md5;
|
||||
|
||||
const auto files = limonp::Split(files_list, "|;");
|
||||
file_size_sum = 0;
|
||||
|
||||
for (auto const & local_path : files) {
|
||||
const int fd = ::open(local_path.c_str(), O_RDONLY);
|
||||
if( fd < 0){
|
||||
continue;
|
||||
}
|
||||
auto const len = ::lseek(fd, 0, SEEK_END);
|
||||
if (len > 0) {
|
||||
void * addr = ::mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
|
||||
assert(MAP_FAILED != addr);
|
||||
|
||||
md5.Update((unsigned char *) addr, len);
|
||||
file_size_sum += len;
|
||||
|
||||
::munmap(addr, len);
|
||||
}
|
||||
::close(fd);
|
||||
}
|
||||
|
||||
md5.Final();
|
||||
return string(md5.digestChars);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_DICT_TRIE_HPP
|
||||
#define CPPJIEBA_DICT_TRIE_HPP
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
@ -31,8 +12,8 @@
|
|||
#include "limonp/StringUtil.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
#include "DatTrie.hpp"
|
||||
#include <QDebug>
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
|
@ -50,58 +31,22 @@ public:
|
|||
WordWeightMax,
|
||||
}; // enum UserWordWeightOption
|
||||
|
||||
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||
Init(dict_path, user_dict_paths, user_word_weight_opt);
|
||||
DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
|
||||
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||
Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
|
||||
}
|
||||
|
||||
~DictTrie() {
|
||||
delete trie_;
|
||||
}
|
||||
~DictTrie() {}
|
||||
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
DictUnit node_info;
|
||||
if(!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
|
||||
DictUnit node_info;
|
||||
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
|
||||
if(!MakeNodeInfo(node_info, word, weight, tag)) {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
return trie_->Find(begin, end);
|
||||
const DatMemElem* Find(const string & word) const {
|
||||
return dat_.Find(word);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct Dag>&res,
|
||||
vector<struct DatDag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
trie_->Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
bool Find(const string& word) {
|
||||
const DictUnit *tmp = NULL;
|
||||
RuneStrArray runes;
|
||||
if(!DecodeRunesInString(word, runes)) {
|
||||
XLOG(ERROR) << "Decode failed.";
|
||||
}
|
||||
tmp = Find(runes.begin(), runes.end());
|
||||
if(tmp == NULL) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
dat_.Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
||||
|
@ -109,182 +54,176 @@ public:
|
|||
}
|
||||
|
||||
double GetMinWeight() const {
|
||||
return min_weight_;
|
||||
return dat_.GetMinWeight();
|
||||
}
|
||||
|
||||
void InserUserDictNode(const string& line) {
|
||||
size_t GetTotalDictSize() const {
|
||||
return total_dict_size_;
|
||||
}
|
||||
|
||||
void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
|
||||
vector<string> buf;
|
||||
DictUnit node_info;
|
||||
DatElement node_info;
|
||||
Split(line, buf, " ");
|
||||
if(buf.size() == 1) {
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
UNKNOWN_TAG);
|
||||
} else if(buf.size() == 2) {
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
buf[1]);
|
||||
} else if(buf.size() == 3) {
|
||||
int freq = atoi(buf[1].c_str());
|
||||
assert(freq_sum_ > 0.0);
|
||||
double weight = log(1.0 * freq / freq_sum_);
|
||||
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
||||
|
||||
if (buf.size() == 0) {
|
||||
return;
|
||||
}
|
||||
static_node_infos_.push_back(node_info);
|
||||
if(node_info.word.size() == 1) {
|
||||
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
||||
|
||||
node_info.word = buf[0];
|
||||
node_info.weight = user_word_default_weight_;
|
||||
node_info.tag = UNKNOWN_TAG;
|
||||
|
||||
if (buf.size() == 2) {
|
||||
node_info.tag = buf[1];
|
||||
} else if (buf.size() == 3) {
|
||||
if (freq_sum_ > 0.0) {
|
||||
const int freq = atoi(buf[1].c_str());
|
||||
node_info.weight = log(1.0 * freq / freq_sum_);
|
||||
node_info.tag = buf[2];
|
||||
}
|
||||
}
|
||||
|
||||
if (saveNodeInfo) {
|
||||
static_node_infos_.push_back(node_info);
|
||||
}
|
||||
|
||||
if (Utf8CharNum(node_info.word) == 1) {
|
||||
RuneArray word;
|
||||
|
||||
if (DecodeRunesInString(node_info.word, word)) {
|
||||
user_dict_single_chinese_word_.insert(word[0]);
|
||||
} else {
|
||||
XLOG(ERROR) << "Decode " << node_info.word << " failed.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const vector<string>& buf) {
|
||||
for(size_t i = 0; i < buf.size(); i++) {
|
||||
InserUserDictNode(buf[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const set<string>& buf) {
|
||||
std::set<string>::const_iterator iter;
|
||||
for(iter = buf.begin(); iter != buf.end(); iter++) {
|
||||
InserUserDictNode(*iter);
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const string& filePaths) {
|
||||
void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
|
||||
vector<string> files = limonp::Split(filePaths, "|;");
|
||||
size_t lineno = 0;
|
||||
for(size_t i = 0; i < files.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < files.size(); i++) {
|
||||
ifstream ifs(files[i].c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
||||
string line;
|
||||
|
||||
for(; getline(ifs, line); lineno++) {
|
||||
if(line.size() == 0) {
|
||||
for (; getline(ifs, line);) {
|
||||
if (line.size() == 0) {
|
||||
continue;
|
||||
}
|
||||
InserUserDictNode(line);
|
||||
|
||||
InserUserDictNode(line, saveNodeInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
||||
LoadDict(dict_path);
|
||||
void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
|
||||
UserWordWeightOption user_word_weight_opt) {
|
||||
const auto dict_list = dict_path + "|" + user_dict_paths;
|
||||
size_t file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
// If no location is specified for the dictionary cache file, store it under /tmp by default --jxx20200519
|
||||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
}
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########path:" << path;
|
||||
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
||||
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
||||
total_dict_size_ = file_size_sum;
|
||||
return;
|
||||
}
|
||||
|
||||
LoadDefaultDict(dict_path);
|
||||
freq_sum_ = CalcFreqSum(static_node_infos_);
|
||||
CalculateWeight(static_node_infos_, freq_sum_);
|
||||
SetStaticWordWeights(user_word_weight_opt);
|
||||
double min_weight = 0;
|
||||
SetStaticWordWeights(user_word_weight_opt, min_weight);
|
||||
dat_.SetMinWeight(min_weight);
|
||||
|
||||
if(user_dict_paths.size()) {
|
||||
LoadUserDict(user_dict_paths);
|
||||
}
|
||||
Shrink(static_node_infos_);
|
||||
CreateTrie(static_node_infos_);
|
||||
LoadUserDict(user_dict_paths);
|
||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
total_dict_size_ = file_size_sum;
|
||||
vector<DatElement>().swap(static_node_infos_);
|
||||
}
|
||||
|
||||
void CreateTrie(const vector<DictUnit>& dictUnits) {
|
||||
assert(dictUnits.size());
|
||||
vector<Unicode> words;
|
||||
vector<const DictUnit*> valuePointers;
|
||||
for(size_t i = 0 ; i < dictUnits.size(); i ++) {
|
||||
words.push_back(dictUnits[i].word);
|
||||
valuePointers.push_back(&dictUnits[i]);
|
||||
}
|
||||
|
||||
trie_ = new Trie(words, valuePointers);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
bool MakeNodeInfo(DictUnit& node_info,
|
||||
const string& word,
|
||||
double weight,
|
||||
const string& tag) {
|
||||
if(!DecodeRunesInString(word, node_info.word)) {
|
||||
XLOG(ERROR) << "Decode " << word << " failed.";
|
||||
return false;
|
||||
}
|
||||
node_info.weight = weight;
|
||||
node_info.tag = tag;
|
||||
return true;
|
||||
}
|
||||
|
||||
void LoadDict(const string& filePath) {
|
||||
void LoadDefaultDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
DictUnit node_info;
|
||||
for(size_t lineno = 0; getline(ifs, line); lineno++) {
|
||||
for (; getline(ifs, line);) {
|
||||
Split(line, buf, " ");
|
||||
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
atof(buf[1].c_str()),
|
||||
buf[2]);
|
||||
DatElement node_info;
|
||||
node_info.word = buf[0];
|
||||
node_info.weight = atof(buf[1].c_str());
|
||||
node_info.tag = buf[2];
|
||||
static_node_infos_.push_back(node_info);
|
||||
}
|
||||
}
|
||||
|
||||
static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
|
||||
static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
|
||||
return lhs.weight < rhs.weight;
|
||||
}
|
||||
|
||||
void SetStaticWordWeights(UserWordWeightOption option) {
|
||||
void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
|
||||
XCHECK(!static_node_infos_.empty());
|
||||
vector<DictUnit> x = static_node_infos_;
|
||||
vector<DatElement> x = static_node_infos_;
|
||||
sort(x.begin(), x.end(), WeightCompare);
|
||||
min_weight_ = x[0].weight;
|
||||
max_weight_ = x[x.size() - 1].weight;
|
||||
median_weight_ = x[x.size() / 2].weight;
|
||||
switch(option) {
|
||||
case WordWeightMin:
|
||||
user_word_default_weight_ = min_weight_;
|
||||
break;
|
||||
case WordWeightMedian:
|
||||
user_word_default_weight_ = median_weight_;
|
||||
break;
|
||||
default:
|
||||
user_word_default_weight_ = max_weight_;
|
||||
break;
|
||||
if(x.empty()){
|
||||
return;
|
||||
}
|
||||
min_weight = x[0].weight;
|
||||
const double max_weight_ = x[x.size() - 1].weight;
|
||||
const double median_weight_ = x[x.size() / 2].weight;
|
||||
|
||||
switch (option) {
|
||||
case WordWeightMin:
|
||||
user_word_default_weight_ = min_weight;
|
||||
break;
|
||||
|
||||
case WordWeightMedian:
|
||||
user_word_default_weight_ = median_weight_;
|
||||
break;
|
||||
|
||||
default:
|
||||
user_word_default_weight_ = max_weight_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
|
||||
double CalcFreqSum(const vector<DatElement>& node_infos) const {
|
||||
double sum = 0.0;
|
||||
for(size_t i = 0; i < node_infos.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < node_infos.size(); i++) {
|
||||
sum += node_infos[i].weight;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
|
||||
assert(sum > 0.0);
|
||||
for(size_t i = 0; i < node_infos.size(); i++) {
|
||||
DictUnit& node_info = node_infos[i];
|
||||
void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
|
||||
for (size_t i = 0; i < node_infos.size(); i++) {
|
||||
DatElement& node_info = node_infos[i];
|
||||
assert(node_info.weight > 0.0);
|
||||
node_info.weight = log(double(node_info.weight) / sum);
|
||||
}
|
||||
}
|
||||
|
||||
void Shrink(vector<DictUnit>& units) const {
|
||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||
}
|
||||
|
||||
vector<DictUnit> static_node_infos_;
|
||||
deque<DictUnit> active_node_infos_; // must not be vector
|
||||
Trie * trie_;
|
||||
private:
|
||||
vector<DatElement> static_node_infos_;
|
||||
size_t total_dict_size_ = 0;
|
||||
DatTrie dat_;
|
||||
|
||||
double freq_sum_;
|
||||
double min_weight_;
|
||||
double max_weight_;
|
||||
double median_weight_;
|
||||
double user_word_default_weight_;
|
||||
unordered_set<Rune> user_dict_single_chinese_word_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_FULLSEGMENT_H
|
||||
#define CPPJIEBA_FULLSEGMENT_H
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
@ -30,82 +11,45 @@
|
|||
namespace cppjieba {
|
||||
class FullSegment: public SegmentBase {
|
||||
public:
|
||||
FullSegment(const string& dictPath) {
|
||||
dictTrie_ = new DictTrie(dictPath);
|
||||
isNeedDestroy_ = true;
|
||||
}
|
||||
FullSegment(const DictTrie* dictTrie)
|
||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||
: dictTrie_(dictTrie) {
|
||||
assert(dictTrie_);
|
||||
}
|
||||
~FullSegment() {
|
||||
if(isNeedDestroy_) {
|
||||
delete dictTrie_;
|
||||
}
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while(pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& res) const {
|
||||
// result of searching in trie tree
|
||||
LocalVector<pair<size_t, const DictUnit*> > tRes;
|
||||
~FullSegment() { }
|
||||
|
||||
// max index of res's words
|
||||
size_t maxIdx = 0;
|
||||
|
||||
// always equals to (uItr - begin)
|
||||
size_t uIdx = 0;
|
||||
|
||||
// tmp variables
|
||||
size_t wordLen = 0;
|
||||
virtual void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& res, bool, size_t) const override {
|
||||
assert(dictTrie_);
|
||||
vector<struct Dag> dags;
|
||||
vector<struct DatDag> dags;
|
||||
dictTrie_->Find(begin, end, dags);
|
||||
for(size_t i = 0; i < dags.size(); i++) {
|
||||
for(size_t j = 0; j < dags[i].nexts.size(); j++) {
|
||||
size_t nextoffset = dags[i].nexts[j].first;
|
||||
size_t max_word_end_pos = 0;
|
||||
|
||||
for (size_t i = 0; i < dags.size(); i++) {
|
||||
for (const auto & kv : dags[i].nexts) {
|
||||
const size_t nextoffset = kv.first - 1;
|
||||
assert(nextoffset < dags.size());
|
||||
const DictUnit* du = dags[i].nexts[j].second;
|
||||
if(du == NULL) {
|
||||
if(dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
|
||||
WordRange wr(begin + i, begin + nextoffset);
|
||||
res.push_back(wr);
|
||||
}
|
||||
} else {
|
||||
wordLen = du->word.size();
|
||||
if(wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
||||
WordRange wr(begin + i, begin + nextoffset);
|
||||
res.push_back(wr);
|
||||
}
|
||||
const auto wordLen = nextoffset - i + 1;
|
||||
const bool is_not_covered_single_word = ((dags[i].nexts.size() == 1) && (max_word_end_pos <= i));
|
||||
const bool is_oov = (nullptr == kv.second); //Out-of-Vocabulary
|
||||
|
||||
if ((is_not_covered_single_word) || ((not is_oov) && (wordLen >= 2))) {
|
||||
WordRange wr(begin + i, begin + nextoffset);
|
||||
res.push_back(wr);
|
||||
}
|
||||
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
||||
|
||||
max_word_end_pos = max(max_word_end_pos, nextoffset + 1);
|
||||
}
|
||||
uIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t) const override {
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
const DictTrie* dictTrie_;
|
||||
bool isNeedDestroy_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,26 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_HMMMODEL_H
|
||||
#define CPPJIEBA_HMMMODEL_H
|
||||
#pragma once
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -59,16 +39,18 @@ struct HMMModel {
|
|||
XCHECK(GetLine(ifile, line));
|
||||
Split(line, tmp, " ");
|
||||
XCHECK(tmp.size() == STATUS_SUM);
|
||||
for(size_t j = 0; j < tmp.size(); j++) {
|
||||
|
||||
for (size_t j = 0; j < tmp.size(); j++) {
|
||||
startProb[j] = atof(tmp[j].c_str());
|
||||
}
|
||||
|
||||
//Load transProb
|
||||
for(size_t i = 0; i < STATUS_SUM; i++) {
|
||||
for (size_t i = 0; i < STATUS_SUM; i++) {
|
||||
XCHECK(GetLine(ifile, line));
|
||||
Split(line, tmp, " ");
|
||||
XCHECK(tmp.size() == STATUS_SUM);
|
||||
for(size_t j = 0; j < STATUS_SUM; j++) {
|
||||
|
||||
for (size_t j = 0; j < tmp.size(); j++) {
|
||||
transProb[i][j] = atof(tmp[j].c_str());
|
||||
}
|
||||
}
|
||||
|
@ -92,43 +74,55 @@ struct HMMModel {
|
|||
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
|
||||
double defVal)const {
|
||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||
if(cit == ptMp->end()) {
|
||||
|
||||
if (cit == ptMp->end()) {
|
||||
return defVal;
|
||||
}
|
||||
|
||||
return cit->second;
|
||||
}
|
||||
bool GetLine(ifstream& ifile, string& line) {
|
||||
while(getline(ifile, line)) {
|
||||
while (getline(ifile, line)) {
|
||||
Trim(line);
|
||||
if(line.empty()) {
|
||||
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
if(StartsWith(line, "#")) {
|
||||
|
||||
if (StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
||||
if(line.empty()) {
|
||||
if (line.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<string> tmp, tmp2;
|
||||
Unicode unicode;
|
||||
RuneArray unicode;
|
||||
Split(line, tmp, ",");
|
||||
for(size_t i = 0; i < tmp.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < tmp.size(); i++) {
|
||||
Split(tmp[i], tmp2, ":");
|
||||
if(2 != tmp2.size()) {
|
||||
|
||||
if (2 != tmp2.size()) {
|
||||
XLOG(ERROR) << "emitProb illegal.";
|
||||
return false;
|
||||
}
|
||||
if(!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
|
||||
|
||||
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
|
||||
XLOG(ERROR) << "TransCode failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
mp[unicode[0]] = atof(tmp2[1].c_str());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -144,4 +138,3 @@ struct HMMModel {
|
|||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIBEA_HMMSEGMENT_H
|
||||
#define CPPJIBEA_HMMSEGMENT_H
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
@ -29,58 +10,40 @@
|
|||
namespace cppjieba {
|
||||
class HMMSegment: public SegmentBase {
|
||||
public:
|
||||
HMMSegment(const string& filePath)
|
||||
: model_(new HMMModel(filePath)), isNeedDestroy_(true) {
|
||||
}
|
||||
HMMSegment(const HMMModel* model)
|
||||
: model_(model), isNeedDestroy_(false) {
|
||||
}
|
||||
~HMMSegment() {
|
||||
if(isNeedDestroy_) {
|
||||
delete model_;
|
||||
}
|
||||
: model_(model) {
|
||||
}
|
||||
~HMMSegment() { }
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while(pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
||||
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
|
||||
size_t) const override {
|
||||
RuneStrArray::const_iterator left = begin;
|
||||
RuneStrArray::const_iterator right = begin;
|
||||
while(right != end) {
|
||||
if(right->rune < 0x80) {
|
||||
if(left != right) {
|
||||
|
||||
while (right != end) {
|
||||
if (right->rune < 0x80) {
|
||||
if (left != right) {
|
||||
InternalCut(left, right, res);
|
||||
}
|
||||
|
||||
left = right;
|
||||
|
||||
do {
|
||||
right = SequentialLetterRule(left, end);
|
||||
if(right != left) {
|
||||
|
||||
if (right != left) {
|
||||
break;
|
||||
}
|
||||
|
||||
right = NumbersRule(left, end);
|
||||
if(right != left) {
|
||||
|
||||
if (right != left) {
|
||||
break;
|
||||
}
|
||||
|
||||
right ++;
|
||||
} while(false);
|
||||
} while (false);
|
||||
|
||||
WordRange wr(left, right - 1);
|
||||
res.push_back(wr);
|
||||
left = right;
|
||||
|
@ -88,45 +51,61 @@ public:
|
|||
right++;
|
||||
}
|
||||
}
|
||||
if(left != right) {
|
||||
|
||||
if (left != right) {
|
||||
InternalCut(left, right, res);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t) const override {
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
// sequential letters rule
|
||||
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end) const {
|
||||
Rune x = begin->rune;
|
||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||
begin ++;
|
||||
} else {
|
||||
return begin;
|
||||
}
|
||||
while(begin != end) {
|
||||
|
||||
while (begin != end) {
|
||||
x = begin->rune;
|
||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
||||
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
||||
begin ++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return begin;
|
||||
}
|
||||
//
|
||||
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
Rune x = begin->rune;
|
||||
if('0' <= x && x <= '9') {
|
||||
|
||||
if ('0' <= x && x <= '9') {
|
||||
begin ++;
|
||||
} else {
|
||||
return begin;
|
||||
}
|
||||
while(begin != end) {
|
||||
|
||||
while (begin != end) {
|
||||
x = begin->rune;
|
||||
if(('0' <= x && x <= '9') || x == '.') {
|
||||
|
||||
if (('0' <= x && x <= '9') || x == '.') {
|
||||
begin++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return begin;
|
||||
}
|
||||
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
||||
|
@ -135,8 +114,9 @@ private:
|
|||
|
||||
RuneStrArray::const_iterator left = begin;
|
||||
RuneStrArray::const_iterator right;
|
||||
for(size_t i = 0; i < status.size(); i++) {
|
||||
if(status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
||||
|
||||
for (size_t i = 0; i < status.size(); i++) {
|
||||
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
||||
right = begin + i + 1;
|
||||
WordRange wr(left, right - 1);
|
||||
res.push_back(wr);
|
||||
|
@ -159,23 +139,25 @@ private:
|
|||
vector<double> weight(XYSize);
|
||||
|
||||
//start
|
||||
for(size_t y = 0; y < Y; y++) {
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
|
||||
path[0 + y * X] = -1;
|
||||
}
|
||||
|
||||
double emitProb;
|
||||
|
||||
for(size_t x = 1; x < X; x++) {
|
||||
for(size_t y = 0; y < Y; y++) {
|
||||
for (size_t x = 1; x < X; x++) {
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
now = x + y * X;
|
||||
weight[now] = MIN_DOUBLE;
|
||||
path[now] = HMMModel::E; // warning
|
||||
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
|
||||
for(size_t preY = 0; preY < Y; preY++) {
|
||||
|
||||
for (size_t preY = 0; preY < Y; preY++) {
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
||||
if(tmp > weight[now]) {
|
||||
|
||||
if (tmp > weight[now]) {
|
||||
weight[now] = tmp;
|
||||
path[now] = preY;
|
||||
}
|
||||
|
@ -186,23 +168,23 @@ private:
|
|||
endE = weight[X - 1 + HMMModel::E * X];
|
||||
endS = weight[X - 1 + HMMModel::S * X];
|
||||
stat = 0;
|
||||
if(endE >= endS) {
|
||||
|
||||
if (endE >= endS) {
|
||||
stat = HMMModel::E;
|
||||
} else {
|
||||
stat = HMMModel::S;
|
||||
}
|
||||
|
||||
status.resize(X);
|
||||
for(int x = X - 1 ; x >= 0; x--) {
|
||||
|
||||
for (int x = X - 1 ; x >= 0; x--) {
|
||||
status[x] = stat;
|
||||
stat = path[x + stat * X];
|
||||
}
|
||||
}
|
||||
|
||||
const HMMModel* model_;
|
||||
bool isNeedDestroy_;
|
||||
}; // class HMMSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,24 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEAB_JIEBA_H
|
||||
#define CPPJIEAB_JIEBA_H
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include "QuerySegment.hpp"
|
||||
#include "KeywordExtractor.hpp"
|
||||
|
||||
|
@ -29,56 +11,48 @@ public:
|
|||
Jieba(const string& dict_path,
|
||||
const string& model_path,
|
||||
const string& user_dict_path,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath)
|
||||
: dict_trie_(dict_path, user_dict_path),
|
||||
const string& idfPath = "",
|
||||
const string& stopWordPath = "",
|
||||
const string& dat_cache_path = "")
|
||||
: dict_trie_(dict_path, user_dict_path, dat_cache_path),
|
||||
model_(model_path),
|
||||
mp_seg_(&dict_trie_),
|
||||
hmm_seg_(&model_),
|
||||
mix_seg_(&dict_trie_, &model_),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_),
|
||||
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
|
||||
|
||||
}
|
||||
~Jieba() {
|
||||
}
|
||||
|
||||
struct LocWord {
|
||||
string word;
|
||||
size_t begin;
|
||||
size_t end;
|
||||
}; // struct LocWord
|
||||
extractor(&dict_trie_, &model_, idfPath, stopWordPath){ }
|
||||
~Jieba() { }
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
mix_seg_.CutToStr(sentence, words, hmm);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
mix_seg_.CutToWord(sentence, words, hmm);
|
||||
}
|
||||
void CutAll(const string& sentence, vector<string>& words) const {
|
||||
full_seg_.Cut(sentence, words);
|
||||
full_seg_.CutToStr(sentence, words);
|
||||
}
|
||||
void CutAll(const string& sentence, vector<Word>& words) const {
|
||||
full_seg_.Cut(sentence, words);
|
||||
full_seg_.CutToWord(sentence, words);
|
||||
}
|
||||
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
query_seg_.Cut(sentence, words, hmm);
|
||||
query_seg_.CutToStr(sentence, words, hmm);
|
||||
}
|
||||
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
query_seg_.Cut(sentence, words, hmm);
|
||||
query_seg_.CutToWord(sentence, words, hmm);
|
||||
}
|
||||
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||
hmm_seg_.Cut(sentence, words);
|
||||
hmm_seg_.CutToStr(sentence, words);
|
||||
}
|
||||
void CutHMM(const string& sentence, vector<Word>& words) const {
|
||||
hmm_seg_.Cut(sentence, words);
|
||||
hmm_seg_.CutToWord(sentence, words);
|
||||
}
|
||||
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
mp_seg_.CutToStr(sentence, words, false, max_word_len);
|
||||
}
|
||||
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
mp_seg_.CutToWord(sentence, words, false, max_word_len);
|
||||
}
|
||||
|
||||
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
||||
|
@ -87,16 +61,8 @@ public:
|
|||
string LookupTag(const string &str) const {
|
||||
return mix_seg_.LookupTag(str);
|
||||
}
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word, tag);
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word, freq, tag);
|
||||
}
|
||||
|
||||
bool Find(const string& word) {
|
||||
return dict_trie_.Find(word);
|
||||
return nullptr != dict_trie_.Find(word);
|
||||
}
|
||||
|
||||
void ResetSeparators(const string& s) {
|
||||
|
@ -116,18 +82,6 @@ public:
|
|||
return &model_;
|
||||
}
|
||||
|
||||
void LoadUserDict(const vector<string>& buf) {
|
||||
dict_trie_.LoadUserDict(buf);
|
||||
}
|
||||
|
||||
void LoadUserDict(const set<string>& buf) {
|
||||
dict_trie_.LoadUserDict(buf);
|
||||
}
|
||||
|
||||
void LoadUserDict(const string& path) {
|
||||
dict_trie_.LoadUserDict(path);
|
||||
}
|
||||
|
||||
private:
|
||||
DictTrie dict_trie_;
|
||||
HMMModel model_;
|
||||
|
@ -145,4 +99,3 @@ public:
|
|||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEAB_JIEBA_H
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <set>
|
||||
|
@ -37,15 +18,6 @@ public:
|
|||
double weight;
|
||||
}; // struct Word
|
||||
|
||||
KeywordExtractor(const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath,
|
||||
const string& userDict = "")
|
||||
: segment_(dictPath, hmmFilePath, userDict) {
|
||||
LoadIdfDict(idfPath);
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
KeywordExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& idfPath,
|
||||
|
@ -60,7 +32,8 @@ public:
|
|||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for(size_t i = 0; i < topWords.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(topWords[i].word);
|
||||
}
|
||||
}
|
||||
|
@ -68,43 +41,52 @@ public:
|
|||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for(size_t i = 0; i < topWords.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
|
||||
vector<string> words;
|
||||
segment_.Cut(sentence, words);
|
||||
segment_.CutToStr(sentence, words);//将字符串string分解为words放入vector
|
||||
|
||||
map<string, Word> wordmap;
|
||||
map<string, Word> wordmap;//插入字符串与Word的map,相同string统计词频叠加权重
|
||||
size_t offset = 0;
|
||||
for(size_t i = 0; i < words.size(); ++i) {
|
||||
|
||||
for (size_t i = 0; i < words.size(); ++i) {
|
||||
size_t t = offset;
|
||||
offset += words[i].size();
|
||||
if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
|
||||
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
wordmap[words[i]].offsets.push_back(t);
|
||||
wordmap[words[i]].weight += 1.0;
|
||||
}
|
||||
if(offset != sentence.size()) {
|
||||
|
||||
if (offset != sentence.size()) {
|
||||
XLOG(ERROR) << "words illegal";
|
||||
return;
|
||||
}
|
||||
|
||||
keywords.clear();
|
||||
keywords.reserve(wordmap.size());
|
||||
for(map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
||||
if(cit != idfMap_.end()) {
|
||||
|
||||
for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找
|
||||
|
||||
if (cit != idfMap_.end()) {
|
||||
itr->second.weight *= cit->second;
|
||||
} else {
|
||||
itr->second.weight *= idfAverage_;
|
||||
}
|
||||
|
||||
itr->second.word = itr->first;
|
||||
keywords.push_back(itr->second);
|
||||
}
|
||||
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||
keywords.resize(topN);
|
||||
|
@ -112,23 +94,31 @@ public:
|
|||
private:
|
||||
void LoadIdfDict(const string& idfPath) {
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(not ifs.is_open()){
|
||||
return ;
|
||||
}
|
||||
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
double idf = 0.0;
|
||||
double idfSum = 0.0;
|
||||
size_t lineno = 0;
|
||||
for(; getline(ifs, line); lineno++) {
|
||||
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
buf.clear();
|
||||
if(line.empty()) {
|
||||
|
||||
if (line.empty()) {
|
||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
|
||||
Split(line, buf, " ");
|
||||
if(buf.size() != 2) {
|
||||
|
||||
if (buf.size() != 2) {
|
||||
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
|
||||
idf = atof(buf[1].c_str());
|
||||
idfMap_[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
@ -141,11 +131,16 @@ private:
|
|||
}
|
||||
void LoadStopWordDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(not ifs.is_open()){
|
||||
return ;
|
||||
}
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||
string line ;
|
||||
while(getline(ifs, line)) {
|
||||
|
||||
while (getline(ifs, line)) {
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
|
@ -161,11 +156,11 @@ private:
|
|||
}; // class KeywordExtractor
|
||||
|
||||
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
|
||||
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
|
||||
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
|
||||
"}";
|
||||
}
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_MPSEGMENT_H
|
||||
#define CPPJIEBA_MPSEGMENT_H
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
@ -31,63 +12,32 @@ namespace cppjieba {
|
|||
|
||||
class MPSegment: public SegmentTagged {
|
||||
public:
|
||||
MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
||||
}
|
||||
MPSegment(const DictTrie* dictTrie)
|
||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||
: dictTrie_(dictTrie) {
|
||||
assert(dictTrie_);
|
||||
}
|
||||
~MPSegment() {
|
||||
if(isNeedDestroy_) {
|
||||
delete dictTrie_;
|
||||
}
|
||||
~MPSegment() { }
|
||||
|
||||
virtual void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words,
|
||||
bool, size_t max_word_len) const override {
|
||||
vector<DatDag> dags;
|
||||
dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
|
||||
CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx
|
||||
CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, MAX_WORD_LENGTH);
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t) const override {
|
||||
|
||||
}
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, max_word_len);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while(pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, max_word_len);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
vector<Dag> dags;
|
||||
dictTrie_->Find(begin,
|
||||
end,
|
||||
dags,
|
||||
max_word_len);
|
||||
CalcDP(dags);
|
||||
CutByDag(begin, end, dags, words);
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const {
|
||||
const DictTrie* GetDictTrie() const override {
|
||||
return dictTrie_;
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const override {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
|
@ -95,61 +45,50 @@ public:
|
|||
return dictTrie_->IsUserDictSingleChineseWord(value);
|
||||
}
|
||||
private:
|
||||
void CalcDP(vector<Dag>& dags) const {
|
||||
size_t nextPos;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
void CalcDP(vector<DatDag>& dags) const {
|
||||
for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||
rit->max_next = -1;
|
||||
rit->max_weight = MIN_DOUBLE;
|
||||
|
||||
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||
rit->pInfo = NULL;
|
||||
rit->weight = MIN_DOUBLE;
|
||||
assert(!rit->nexts.empty());
|
||||
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
||||
nextPos = it->first;
|
||||
p = it->second;
|
||||
val = 0.0;
|
||||
if(nextPos + 1 < dags.size()) {
|
||||
val += dags[nextPos + 1].weight;
|
||||
for (const auto & it : rit->nexts) {
|
||||
const auto nextPos = it.first;
|
||||
double val = dictTrie_->GetMinWeight();
|
||||
|
||||
if (nullptr != it.second) {
|
||||
val = it.second->weight;
|
||||
}
|
||||
|
||||
if(p) {
|
||||
val += p->weight;
|
||||
} else {
|
||||
val += dictTrie_->GetMinWeight();
|
||||
if (nextPos < dags.size()) {
|
||||
val += dags[nextPos].max_weight;
|
||||
}
|
||||
if(val > rit->weight) {
|
||||
rit->pInfo = p;
|
||||
rit->weight = val;
|
||||
|
||||
if ((nextPos <= dags.size()) && (val > rit->max_weight)) {
|
||||
rit->max_weight = val;
|
||||
rit->max_next = nextPos;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CutByDag(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
const vector<Dag>& dags,
|
||||
RuneStrArray::const_iterator,
|
||||
const vector<DatDag>& dags,
|
||||
vector<WordRange>& words) const {
|
||||
size_t i = 0;
|
||||
while(i < dags.size()) {
|
||||
const DictUnit* p = dags[i].pInfo;
|
||||
if(p) {
|
||||
assert(p->word.size() >= 1);
|
||||
WordRange wr(begin + i, begin + i + p->word.size() - 1);
|
||||
words.push_back(wr);
|
||||
i += p->word.size();
|
||||
} else { //single chinese word
|
||||
WordRange wr(begin + i, begin + i);
|
||||
words.push_back(wr);
|
||||
i++;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < dags.size();) {
|
||||
const auto next = dags[i].max_next;
|
||||
assert(next > i);
|
||||
assert(next <= dags.size());
|
||||
WordRange wr(begin + i, begin + next - 1);
|
||||
words.push_back(wr);
|
||||
i = next;
|
||||
}
|
||||
}
|
||||
|
||||
const DictTrie* dictTrie_;
|
||||
bool isNeedDestroy_;
|
||||
PosTagger tagger_;
|
||||
|
||||
}; // class MPSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_MIXSEGMENT_H
|
||||
#define CPPJIEBA_MIXSEGMENT_H
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include "MPSegment.hpp"
|
||||
|
@ -28,70 +9,49 @@
|
|||
namespace cppjieba {
|
||||
class MixSegment: public SegmentTagged {
|
||||
public:
|
||||
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
||||
const string& userDict = "")
|
||||
: mpSeg_(mpSegDict, userDict),
|
||||
hmmSeg_(hmmSegDict) {
|
||||
}
|
||||
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: mpSeg_(dictTrie), hmmSeg_(model) {
|
||||
}
|
||||
~MixSegment() {
|
||||
}
|
||||
~MixSegment() {}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while(pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, hmm);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
||||
if(!hmm) {
|
||||
mpSeg_.Cut(begin, end, res);
|
||||
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
|
||||
size_t) const override {
|
||||
if (!hmm) {
|
||||
mpSeg_.CutRuneArray(begin, end, res);
|
||||
return;
|
||||
}
|
||||
|
||||
vector<WordRange> words;
|
||||
assert(end >= begin);
|
||||
words.reserve(end - begin);
|
||||
mpSeg_.Cut(begin, end, words);
|
||||
mpSeg_.CutRuneArray(begin, end, words);
|
||||
|
||||
vector<WordRange> hmmRes;
|
||||
hmmRes.reserve(end - begin);
|
||||
for(size_t i = 0; i < words.size(); i++) {
|
||||
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
//if mp Get a word, it's ok, put it into result
|
||||
if(words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
||||
if (words[i].left != words[i].right || (words[i].left == words[i].right &&
|
||||
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
||||
res.push_back(words[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||
size_t j = i;
|
||||
while(j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
|
||||
while (j < words.size() && words[j].left == words[j].right &&
|
||||
!mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
j++;
|
||||
}
|
||||
|
||||
// Cut the sequence with hmm
|
||||
assert(j - 1 >= i);
|
||||
// TODO
|
||||
hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
|
||||
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
|
||||
|
||||
//put hmm result to result
|
||||
for(size_t k = 0; k < hmmRes.size(); k++) {
|
||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||
res.push_back(hmmRes[k]);
|
||||
}
|
||||
|
||||
|
@ -103,11 +63,61 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const {
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t) const override {
|
||||
//目前hmm默认开启,后期如有需要关闭再修改--jxx20210519
|
||||
// if (!hmm) {
|
||||
// mpSeg_.CutRuneArray(begin, end, res);
|
||||
// return;
|
||||
// }
|
||||
|
||||
vector<WordRange> words;
|
||||
assert(end >= begin);
|
||||
words.reserve(end - begin);
|
||||
mpSeg_.CutRuneArray(begin, end, words);
|
||||
|
||||
vector<WordRange> hmmRes;
|
||||
hmmRes.reserve(end - begin);
|
||||
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
//if mp Get a word, it's ok, put it into result
|
||||
if (words[i].left != words[i].right || (words[i].left == words[i].right &&
|
||||
mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
||||
res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||
size_t j = i;
|
||||
|
||||
while (j < words.size() && words[j].left == words[j].right &&
|
||||
!mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
j++;
|
||||
}
|
||||
|
||||
// Cut the sequence with hmm
|
||||
assert(j - 1 >= i);
|
||||
// TODO
|
||||
hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
|
||||
|
||||
//put hmm result to result
|
||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||
res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
|
||||
}
|
||||
|
||||
//clear tmp vars
|
||||
hmmRes.clear();
|
||||
|
||||
//let i jump over this piece
|
||||
i = j - 1;
|
||||
}
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const override {
|
||||
return mpSeg_.GetDictTrie();
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const override {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
|
@ -124,4 +134,3 @@ private:
|
|||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,27 +1,8 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||
#define CPPJIEBA_POS_TAGGING_H
|
||||
#pragma once
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
|
@ -39,28 +20,31 @@ public:
|
|||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
||||
vector<string> CutRes;
|
||||
segment.Cut(src, CutRes);
|
||||
segment.CutToStr(src, CutRes);
|
||||
|
||||
for(vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
|
||||
}
|
||||
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
string LookupTag(const string &str, const SegmentTagged& segment) const {
|
||||
const DictUnit *tmp = NULL;
|
||||
RuneStrArray runes;
|
||||
const DictTrie * dict = segment.GetDictTrie();
|
||||
assert(dict != NULL);
|
||||
if(!DecodeRunesInString(str, runes)) {
|
||||
XLOG(ERROR) << "Decode failed.";
|
||||
return POS_X;
|
||||
}
|
||||
tmp = dict->Find(runes.begin(), runes.end());
|
||||
if(tmp == NULL || tmp->tag.empty()) {
|
||||
const auto tmp = dict->Find(str);
|
||||
|
||||
if (tmp == NULL || tmp->GetTag().empty()) {
|
||||
RuneStrArray runes;
|
||||
|
||||
if (!DecodeRunesInString(str, runes)) {
|
||||
XLOG(ERROR) << "Decode failed.";
|
||||
return POS_X;
|
||||
}
|
||||
|
||||
return SpecialRule(runes);
|
||||
} else {
|
||||
return tmp->tag;
|
||||
return tmp->GetTag();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,22 +52,27 @@ private:
|
|||
const char* SpecialRule(const RuneStrArray& unicode) const {
|
||||
size_t m = 0;
|
||||
size_t eng = 0;
|
||||
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
||||
if(unicode[i].rune < 0x80) {
|
||||
|
||||
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
||||
if (unicode[i].rune < 0x80) {
|
||||
eng ++;
|
||||
if('0' <= unicode[i].rune && unicode[i].rune <= '9') {
|
||||
|
||||
if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
|
||||
m++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ascii char is not found
|
||||
if(eng == 0) {
|
||||
if (eng == 0) {
|
||||
return POS_X;
|
||||
}
|
||||
|
||||
// all the ascii is number char
|
||||
if(m == eng) {
|
||||
if (m == eng) {
|
||||
return POS_M;
|
||||
}
|
||||
|
||||
// the ascii chars contain english letter
|
||||
return POS_ENG;
|
||||
}
|
||||
|
@ -92,4 +81,3 @@ private:
|
|||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,43 +1,20 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_PRE_FILTER_H
|
||||
#define CPPJIEBA_PRE_FILTER_H
|
||||
#pragma once
|
||||
|
||||
#include "Trie.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
#include <unordered_set>
|
||||
#include "Unicode.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class PreFilter {
|
||||
public:
|
||||
//TODO use WordRange instead of Range
|
||||
struct Range {
|
||||
RuneStrArray::const_iterator begin;
|
||||
RuneStrArray::const_iterator end;
|
||||
}; // struct Range
|
||||
|
||||
PreFilter(const unordered_set<Rune>& symbols,
|
||||
PreFilter(const std::unordered_set<Rune>& symbols,
|
||||
const string& sentence)
|
||||
: symbols_(symbols) {
|
||||
if(!DecodeRunesInString(sentence, sentence_)) {
|
||||
XLOG(ERROR) << "decode failed. ";
|
||||
if (!DecodeRunesInString(sentence, sentence_)) {
|
||||
XLOG(ERROR) << "decode failed. "<<sentence;
|
||||
}
|
||||
|
||||
cursor_ = sentence_.begin();
|
||||
}
|
||||
~PreFilter() {
|
||||
|
@ -45,28 +22,31 @@ public:
|
|||
bool HasNext() const {
|
||||
return cursor_ != sentence_.end();
|
||||
}
|
||||
Range Next() {
|
||||
Range range;
|
||||
range.begin = cursor_;
|
||||
while(cursor_ != sentence_.end()) {
|
||||
if(IsIn(symbols_, cursor_->rune)) {
|
||||
if(range.begin == cursor_) {
|
||||
WordRange Next() {
|
||||
WordRange range(cursor_, cursor_);
|
||||
|
||||
while (cursor_ != sentence_.end()) {
|
||||
//if (IsIn(symbols_, cursor_->rune)) {
|
||||
if (cursor_->rune == 0x20) {
|
||||
if (range.left == cursor_) {
|
||||
cursor_ ++;
|
||||
}
|
||||
range.end = cursor_;
|
||||
|
||||
range.right = cursor_;
|
||||
return range;
|
||||
}
|
||||
|
||||
cursor_ ++;
|
||||
}
|
||||
range.end = sentence_.end();
|
||||
|
||||
range.right = sentence_.end();
|
||||
return range;
|
||||
}
|
||||
private:
|
||||
RuneStrArray::const_iterator cursor_;
|
||||
RuneStrArray sentence_;
|
||||
const unordered_set<Rune>& symbols_;
|
||||
const std::unordered_set<Rune>& symbols_;
|
||||
}; // class PreFilter
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_PRE_FILTER_H
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
||||
#define CPPJIEBA_QUERYSEGMENT_H
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
@ -28,74 +9,65 @@
|
|||
#include "FullSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class QuerySegment: public SegmentBase {
|
||||
public:
|
||||
QuerySegment(const string& dict, const string& model, const string& userDict = "")
|
||||
: mixSeg_(dict, model, userDict),
|
||||
trie_(mixSeg_.GetDictTrie()) {
|
||||
}
|
||||
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: mixSeg_(dictTrie, model), trie_(dictTrie) {
|
||||
}
|
||||
~QuerySegment() {
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while(pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, hmm);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
||||
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
|
||||
size_t) const override {
|
||||
//use mix Cut first
|
||||
vector<WordRange> mixRes;
|
||||
mixSeg_.Cut(begin, end, mixRes, hmm);
|
||||
mixSeg_.CutRuneArray(begin, end, mixRes, hmm);
|
||||
|
||||
vector<WordRange> fullRes;
|
||||
for(vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
if(mixResItr->Length() > 2) {
|
||||
for(size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
||||
if(trie_->Find(wr.left, wr.right + 1) != NULL) {
|
||||
|
||||
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
if (mixResItr->Length() > 2) {
|
||||
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
||||
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
|
||||
if (trie_->Find(text) != NULL) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
||||
res.push_back(wr);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(mixResItr->Length() > 3) {
|
||||
for(size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
if(trie_->Find(wr.left, wr.right + 1) != NULL) {
|
||||
|
||||
if (mixResItr->Length() > 3) {
|
||||
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
||||
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 3);
|
||||
|
||||
if (trie_->Find(text) != NULL) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
res.push_back(wr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res.push_back(*mixResItr);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t) const override {
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
bool IsAllAscii(const Unicode& s) const {
|
||||
for(size_t i = 0; i < s.size(); i++) {
|
||||
if(s[i] >= 0x80) {
|
||||
bool IsAllAscii(const RuneArray& s) const {
|
||||
for (size_t i = 0; i < s.size(); i++) {
|
||||
if (s[i] >= 0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
MixSegment mixSeg_;
|
||||
|
@ -104,4 +76,3 @@ private:
|
|||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_SEGMENTBASE_H
|
||||
#define CPPJIEBA_SEGMENTBASE_H
|
||||
#pragma once
|
||||
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "PreFilter.hpp"
|
||||
|
@ -35,24 +16,69 @@ public:
|
|||
SegmentBase() {
|
||||
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
||||
}
|
||||
virtual ~SegmentBase() {
|
||||
virtual ~SegmentBase() { }
|
||||
|
||||
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
|
||||
size_t max_word_len) const = 0;
|
||||
//添加基于sentence的cut方法,减少中间变量的存储与格式转换--jxx20210517
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
|
||||
size_t max_word_len) const = 0;
|
||||
//重写CutToStr函数,简化获取vector<string>& words的流程,降低内存占用--jxx20210517
|
||||
void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
/*
|
||||
vector<Word> tmp;
|
||||
CutToWord(sentence, tmp, hmm, max_word_len);
|
||||
GetStringsFromWords(tmp, words);
|
||||
*/
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
words.clear();
|
||||
words.reserve(sentence.size() / 2);//todo 参考源码,参数待定
|
||||
while (pre_filter.HasNext()) {
|
||||
auto range = pre_filter.Next();
|
||||
CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
||||
void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
|
||||
while (pre_filter.HasNext()) {
|
||||
auto range = pre_filter.Next();
|
||||
Cut(range.left, range.right, wrs, hmm, max_word_len);
|
||||
}
|
||||
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
wrs.clear();
|
||||
vector<WordRange>().swap(wrs);
|
||||
}
|
||||
|
||||
void CutRuneArray(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res,
|
||||
bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
Cut(begin, end, res, hmm, max_word_len);
|
||||
}
|
||||
|
||||
bool ResetSeparators(const string& s) {
|
||||
symbols_.clear();
|
||||
RuneStrArray runes;
|
||||
if(!DecodeRunesInString(s, runes)) {
|
||||
|
||||
if (!DecodeRunesInString(s, runes)) {
|
||||
XLOG(ERROR) << "decode " << s << " failed";
|
||||
return false;
|
||||
}
|
||||
for(size_t i = 0; i < runes.size(); i++) {
|
||||
if(!symbols_.insert(runes[i].rune).second) {
|
||||
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
if (!symbols_.insert(runes[i].rune).second) {
|
||||
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
protected:
|
||||
|
@ -61,4 +87,3 @@ protected:
|
|||
|
||||
} // cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
||||
#define CPPJIEBA_SEGMENTTAGGED_H
|
||||
#pragma once
|
||||
|
||||
#include "SegmentBase.hpp"
|
||||
|
||||
|
@ -38,4 +19,3 @@ public:
|
|||
|
||||
} // cppjieba
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,212 +1,205 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
||||
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
||||
|
||||
#include <cmath>
|
||||
#include "Jieba.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
using namespace std;
|
||||
|
||||
class TextRankExtractor {
|
||||
public:
|
||||
typedef struct _Word {
|
||||
string word;
|
||||
vector<size_t> offsets;
|
||||
double weight;
|
||||
} Word; // struct Word
|
||||
private:
|
||||
typedef std::map<string, Word> WordMap;
|
||||
|
||||
class WordGraph {
|
||||
private:
|
||||
typedef double Score;
|
||||
typedef string Node;
|
||||
typedef std::set<Node> NodeSet;
|
||||
|
||||
typedef std::map<Node, double> Edges;
|
||||
typedef std::map<Node, Edges> Graph;
|
||||
//typedef std::unordered_map<Node,double> Edges;
|
||||
//typedef std::unordered_map<Node,Edges> Graph;
|
||||
|
||||
double d;
|
||||
Graph graph;
|
||||
NodeSet nodeSet;
|
||||
public:
|
||||
WordGraph(): d(0.85) {};
|
||||
WordGraph(double in_d): d(in_d) {};
|
||||
|
||||
void addEdge(Node start, Node end, double weight) {
|
||||
Edges temp;
|
||||
Edges::iterator gotEdges;
|
||||
nodeSet.insert(start);
|
||||
nodeSet.insert(end);
|
||||
graph[start][end] += weight;
|
||||
graph[end][start] += weight;
|
||||
}
|
||||
|
||||
void rank(WordMap &ws, size_t rankTime = 10) {
|
||||
WordMap outSum;
|
||||
Score wsdef, min_rank, max_rank;
|
||||
|
||||
if(graph.size() == 0)
|
||||
return;
|
||||
|
||||
wsdef = 1.0 / graph.size();
|
||||
|
||||
for(Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
|
||||
// edges->first start节点;edge->first end节点;edge->second 权重
|
||||
ws[edges->first].word = edges->first;
|
||||
ws[edges->first].weight = wsdef;
|
||||
outSum[edges->first].weight = 0;
|
||||
for(Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
|
||||
outSum[edges->first].weight += edge->second;
|
||||
}
|
||||
}
|
||||
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
|
||||
for(size_t i = 0; i < rankTime; i++) {
|
||||
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
|
||||
double s = 0;
|
||||
for(Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
|
||||
// edge->first end节点;edge->second 权重
|
||||
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
|
||||
ws[*node].weight = (1 - d) + d * s;
|
||||
}
|
||||
}
|
||||
|
||||
min_rank = max_rank = ws.begin()->second.weight;
|
||||
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
|
||||
if(i->second.weight < min_rank) {
|
||||
min_rank = i->second.weight;
|
||||
}
|
||||
if(i->second.weight > max_rank) {
|
||||
max_rank = i->second.weight;
|
||||
}
|
||||
}
|
||||
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
|
||||
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
TextRankExtractor(const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& stopWordPath,
|
||||
const string& userDict = "")
|
||||
: segment_(dictPath, hmmFilePath, userDict) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
TextRankExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
~TextRankExtractor() {
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for(size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(topWords[i].word);
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for(size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
|
||||
vector<string> words;
|
||||
segment_.Cut(sentence, words);
|
||||
|
||||
TextRankExtractor::WordGraph graph;
|
||||
WordMap wordmap;
|
||||
size_t offset = 0;
|
||||
|
||||
for(size_t i = 0; i < words.size(); i++) {
|
||||
size_t t = offset;
|
||||
offset += words[i].size();
|
||||
if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
for(size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
|
||||
if(IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
|
||||
skip++;
|
||||
continue;
|
||||
}
|
||||
graph.addEdge(words[i], words[j], 1);
|
||||
}
|
||||
wordmap[words[i]].offsets.push_back(t);
|
||||
}
|
||||
if(offset != sentence.size()) {
|
||||
XLOG(ERROR) << "words illegal";
|
||||
return;
|
||||
}
|
||||
|
||||
graph.rank(wordmap, rankTime);
|
||||
|
||||
keywords.clear();
|
||||
keywords.reserve(wordmap.size());
|
||||
for(WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
keywords.push_back(itr->second);
|
||||
}
|
||||
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||
keywords.resize(topN);
|
||||
}
|
||||
private:
|
||||
void LoadStopWordDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||
string line ;
|
||||
while(getline(ifs, line)) {
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
static bool Compare(const Word &x, const Word &y) {
|
||||
return x.weight > y.weight;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
unordered_set<string> stopWords_;
|
||||
}; // class TextRankExtractor
|
||||
|
||||
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
|
||||
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
|
||||
}
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#include <cmath>
|
||||
#include "Jieba.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
using namespace std;
|
||||
|
||||
class TextRankExtractor {
|
||||
public:
|
||||
typedef struct _Word {
|
||||
string word;
|
||||
vector<size_t> offsets;
|
||||
double weight;
|
||||
} Word; // struct Word
|
||||
private:
|
||||
typedef std::map<string, Word> WordMap;
|
||||
|
||||
class WordGraph {
|
||||
private:
|
||||
typedef double Score;
|
||||
typedef string Node;
|
||||
typedef std::set<Node> NodeSet;
|
||||
|
||||
typedef std::map<Node, double> Edges;
|
||||
typedef std::map<Node, Edges> Graph;
|
||||
//typedef std::unordered_map<Node,double> Edges;
|
||||
//typedef std::unordered_map<Node,Edges> Graph;
|
||||
|
||||
double d;
|
||||
Graph graph;
|
||||
NodeSet nodeSet;
|
||||
public:
|
||||
WordGraph(): d(0.85) {};
|
||||
WordGraph(double in_d): d(in_d) {};
|
||||
|
||||
void addEdge(Node start, Node end, double weight) {
|
||||
Edges temp;
|
||||
Edges::iterator gotEdges;
|
||||
nodeSet.insert(start);
|
||||
nodeSet.insert(end);
|
||||
graph[start][end] += weight;
|
||||
graph[end][start] += weight;
|
||||
}
|
||||
|
||||
void rank(WordMap &ws, size_t rankTime = 10) {
|
||||
WordMap outSum;
|
||||
Score wsdef, min_rank, max_rank;
|
||||
|
||||
if (graph.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
wsdef = 1.0 / graph.size();
|
||||
|
||||
for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
|
||||
// edges->first start节点;edge->first end节点;edge->second 权重
|
||||
ws[edges->first].word = edges->first;
|
||||
ws[edges->first].weight = wsdef;
|
||||
outSum[edges->first].weight = 0;
|
||||
|
||||
for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
|
||||
outSum[edges->first].weight += edge->second;
|
||||
}
|
||||
}
|
||||
|
||||
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
|
||||
for (size_t i = 0; i < rankTime; i++) {
|
||||
for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
|
||||
double s = 0;
|
||||
|
||||
for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
|
||||
// edge->first end节点;edge->second 权重
|
||||
{
|
||||
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
|
||||
}
|
||||
|
||||
ws[*node].weight = (1 - d) + d * s;
|
||||
}
|
||||
}
|
||||
|
||||
min_rank = max_rank = ws.begin()->second.weight;
|
||||
|
||||
for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
|
||||
if (i->second.weight < min_rank) {
|
||||
min_rank = i->second.weight;
|
||||
}
|
||||
|
||||
if (i->second.weight > max_rank) {
|
||||
max_rank = i->second.weight;
|
||||
}
|
||||
}
|
||||
|
||||
for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
|
||||
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
TextRankExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
~TextRankExtractor() {
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(topWords[i].word);
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
|
||||
vector<string> words;
|
||||
segment_.CutToStr(sentence, words);
|
||||
|
||||
TextRankExtractor::WordGraph graph;
|
||||
WordMap wordmap;
|
||||
size_t offset = 0;
|
||||
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
size_t t = offset;
|
||||
offset += words[i].size();
|
||||
|
||||
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
|
||||
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
|
||||
skip++;
|
||||
continue;
|
||||
}
|
||||
|
||||
graph.addEdge(words[i], words[j], 1);
|
||||
}
|
||||
|
||||
wordmap[words[i]].offsets.push_back(t);
|
||||
}
|
||||
|
||||
if (offset != sentence.size()) {
|
||||
XLOG(ERROR) << "words illegal";
|
||||
return;
|
||||
}
|
||||
|
||||
graph.rank(wordmap, rankTime);
|
||||
|
||||
keywords.clear();
|
||||
keywords.reserve(wordmap.size());
|
||||
|
||||
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
keywords.push_back(itr->second);
|
||||
}
|
||||
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||
keywords.resize(topN);
|
||||
}
|
||||
private:
|
||||
void LoadStopWordDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||
string line ;
|
||||
|
||||
while (getline(ifs, line)) {
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
static bool Compare(const Word &x, const Word &y) {
|
||||
return x.weight > y.weight;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
unordered_set<string> stopWords_;
|
||||
}; // class TextRankExtractor
|
||||
|
||||
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
|
||||
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
|
||||
"}";
|
||||
}
|
||||
} // namespace cppjieba
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,192 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_TRIE_HPP
|
||||
#define CPPJIEBA_TRIE_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include "limonp/StdExtension.hpp"
|
||||
#include "Unicode.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace std;
|
||||
|
||||
const size_t MAX_WORD_LENGTH = 512;
|
||||
|
||||
struct DictUnit {
|
||||
Unicode word;
|
||||
double weight;
|
||||
string tag;
|
||||
}; // struct DictUnit
|
||||
|
||||
// for debugging
|
||||
// inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
||||
// string s;
|
||||
// s << unit.word;
|
||||
// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
||||
// }
|
||||
|
||||
struct Dag {
|
||||
RuneStr runestr;
|
||||
// [offset, nexts.first]
|
||||
limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
size_t nextPos; // TODO
|
||||
Dag(): runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
|
||||
}
|
||||
}; // struct Dag
|
||||
|
||||
typedef Rune TrieKey;
|
||||
|
||||
class TrieNode {
|
||||
public :
|
||||
TrieNode(): next(NULL), ptValue(NULL) {
|
||||
}
|
||||
public:
|
||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||
NextMap *next;
|
||||
const DictUnit *ptValue;
|
||||
};
|
||||
|
||||
class Trie {
|
||||
public:
|
||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
|
||||
: root_(new TrieNode) {
|
||||
CreateTrie(keys, valuePointers);
|
||||
}
|
||||
~Trie() {
|
||||
DeleteNode(root_);
|
||||
}
|
||||
|
||||
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
if(begin == end) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const TrieNode* ptNode = root_;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for(RuneStrArray::const_iterator it = begin; it != end; it++) {
|
||||
if(NULL == ptNode->next) {
|
||||
return NULL;
|
||||
}
|
||||
citer = ptNode->next->find(it->rune);
|
||||
if(ptNode->next->end() == citer) {
|
||||
return NULL;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
}
|
||||
return ptNode->ptValue;
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct Dag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
assert(root_ != NULL);
|
||||
res.resize(end - begin);
|
||||
|
||||
const TrieNode *ptNode = NULL;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for(size_t i = 0; i < size_t(end - begin); i++) {
|
||||
res[i].runestr = *(begin + i);
|
||||
|
||||
if(root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
|
||||
ptNode = citer->second;
|
||||
} else {
|
||||
ptNode = NULL;
|
||||
}
|
||||
if(ptNode != NULL) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
} else {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
|
||||
}
|
||||
|
||||
for(size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
|
||||
if(ptNode == NULL || ptNode->next == NULL) {
|
||||
break;
|
||||
}
|
||||
citer = ptNode->next->find((begin + j)->rune);
|
||||
if(ptNode->next->end() == citer) {
|
||||
break;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if(NULL != ptNode->ptValue) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
if(key.begin() == key.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
TrieNode::NextMap::const_iterator kmIter;
|
||||
TrieNode *ptNode = root_;
|
||||
for(Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
||||
if(NULL == ptNode->next) {
|
||||
ptNode->next = new TrieNode::NextMap;
|
||||
}
|
||||
kmIter = ptNode->next->find(*citer);
|
||||
if(ptNode->next->end() == kmIter) {
|
||||
TrieNode *nextNode = new TrieNode;
|
||||
|
||||
ptNode->next->insert(make_pair(*citer, nextNode));
|
||||
ptNode = nextNode;
|
||||
} else {
|
||||
ptNode = kmIter->second;
|
||||
}
|
||||
}
|
||||
assert(ptNode != NULL);
|
||||
ptNode->ptValue = ptValue;
|
||||
}
|
||||
|
||||
private:
|
||||
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
||||
if(valuePointers.empty() || keys.empty()) {
|
||||
return;
|
||||
}
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
for(size_t i = 0; i < keys.size(); i++) {
|
||||
InsertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void DeleteNode(TrieNode* node) {
|
||||
if(NULL == node) {
|
||||
return;
|
||||
}
|
||||
if(NULL != node->next) {
|
||||
for(TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
|
||||
DeleteNode(it->second);
|
||||
}
|
||||
delete node->next;
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
|
||||
TrieNode* root_;
|
||||
}; // class Trie
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_TRIE_HPP
|
|
@ -1,23 +1,4 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef CPPJIEBA_UNICODE_H
|
||||
#define CPPJIEBA_UNICODE_H
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -25,6 +6,7 @@
|
|||
#include <vector>
|
||||
#include <ostream>
|
||||
#include "limonp/LocalVector.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -50,28 +32,28 @@ inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
|||
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
||||
}
|
||||
|
||||
struct RuneStr {
|
||||
struct RuneInfo {
|
||||
Rune rune;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
uint32_t unicode_offset;
|
||||
uint32_t unicode_length;
|
||||
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
|
||||
uint32_t unicode_offset = 0;
|
||||
uint32_t unicode_length = 0;
|
||||
RuneInfo(): rune(0), offset(0), len(0) {
|
||||
}
|
||||
RuneStr(Rune r, uint32_t o, uint32_t l)
|
||||
: rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
|
||||
RuneInfo(Rune r, uint32_t o, uint32_t l)
|
||||
: rune(r), offset(o), len(l) {
|
||||
}
|
||||
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
RuneInfo(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
||||
}
|
||||
}; // struct RuneStr
|
||||
}; // struct RuneInfo
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
||||
inline std::ostream& operator << (std::ostream& os, const RuneInfo& r) {
|
||||
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
||||
}
|
||||
|
||||
typedef limonp::LocalVector<Rune> Unicode;
|
||||
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
||||
typedef limonp::LocalVector<Rune> RuneArray;
|
||||
typedef limonp::LocalVector<struct RuneInfo> RuneStrArray;
|
||||
|
||||
// [left, right]
|
||||
struct WordRange {
|
||||
|
@ -83,127 +65,169 @@ struct WordRange {
|
|||
size_t Length() const {
|
||||
return right - left + 1;
|
||||
}
|
||||
|
||||
bool IsAllAscii() const {
|
||||
for(RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
||||
if(iter->rune >= 0x80) {
|
||||
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
||||
if (iter->rune >= 0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}; // struct WordRange
|
||||
|
||||
/// Result of decoding a single rune: its value and the number of bytes it
/// occupied. DecodeRuneInString reports failure as {0, 0}.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;

  RuneStrLite() : RuneStrLite(0, 0) {
  }
  RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {
  }
}; // struct RuneStrLite
|
||||
|
||||
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
|
||||
RuneStrLite rp(0, 0);
|
||||
if(str == NULL || len == 0) {
|
||||
return rp;
|
||||
}
|
||||
if(!(str[0] & 0x80)) { // 0xxxxxxx
|
||||
// 7bit, total 7bit
|
||||
rp.rune = (uint8_t)(str[0]) & 0x7f;
|
||||
rp.len = 1;
|
||||
} else if((uint8_t)str[0] <= 0xdf && 1 < len) {
|
||||
// 110xxxxxx
|
||||
// 5bit, total 5bit
|
||||
rp.rune = (uint8_t)(str[0]) & 0x1f;
|
||||
|
||||
// 6bit, total 11bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[1]) & 0x3f;
|
||||
rp.len = 2;
|
||||
} else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx
|
||||
// 4bit, total 4bit
|
||||
rp.rune = (uint8_t)(str[0]) & 0x0f;
|
||||
|
||||
// 6bit, total 10bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[1]) & 0x3f;
|
||||
|
||||
// 6bit, total 16bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[2]) & 0x3f;
|
||||
|
||||
rp.len = 3;
|
||||
} else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx
|
||||
// 3bit, total 3bit
|
||||
rp.rune = (uint8_t)(str[0]) & 0x07;
|
||||
|
||||
// 6bit, total 9bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[1]) & 0x3f;
|
||||
|
||||
// 6bit, total 15bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[2]) & 0x3f;
|
||||
|
||||
// 6bit, total 21bit
|
||||
rp.rune <<= 6;
|
||||
rp.rune |= (uint8_t)(str[3]) & 0x3f;
|
||||
|
||||
rp.len = 4;
|
||||
} else {
|
||||
rp.rune = 0;
|
||||
rp.len = 0;
|
||||
}
|
||||
return rp;
|
||||
inline bool DecodeRunesInString(const string& s, RuneArray& arr) {
|
||||
arr.clear();
|
||||
return limonp::Utf8ToUnicode32(s, arr);
|
||||
}
|
||||
|
||||
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
||||
runes.clear();
|
||||
runes.reserve(len / 2);
|
||||
for(uint32_t i = 0, j = 0; i < len;) {
|
||||
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
||||
if(rp.len == 0) {
|
||||
runes.clear();
|
||||
return false;
|
||||
}
|
||||
RuneStr x(rp.rune, i, rp.len, j, 1);
|
||||
runes.push_back(x);
|
||||
i += rp.len;
|
||||
++j;
|
||||
}
|
||||
return true;
|
||||
inline RuneArray DecodeRunesInString(const string& s) {
|
||||
RuneArray result;
|
||||
DecodeRunesInString(s, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
//重写DecodeRunesInString函数,将实现放入函数中降低内存占用加快处理流程--jxx20210518
|
||||
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
||||
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
||||
}
|
||||
/*
|
||||
RuneArray arr;
|
||||
|
||||
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
||||
unicode.clear();
|
||||
RuneStrArray runes;
|
||||
if(!DecodeRunesInString(s, len, runes)) {
|
||||
if (not DecodeRunesInString(s, arr)) {
|
||||
return false;
|
||||
}
|
||||
unicode.reserve(runes.size());
|
||||
for(size_t i = 0; i < runes.size(); i++) {
|
||||
unicode.push_back(runes[i].rune);
|
||||
|
||||
runes.clear();
|
||||
|
||||
uint32_t offset = 0;
|
||||
|
||||
for (uint32_t i = 0; i < arr.size(); ++i) {
|
||||
const uint32_t len = limonp::UnicodeToUtf8Bytes(arr[i]);
|
||||
RuneInfo x(arr[i], offset, len, i, 1);
|
||||
runes.push_back(x);
|
||||
offset += len;
|
||||
}
|
||||
*/
|
||||
|
||||
uint32_t tmp;
|
||||
uint32_t offset = 0;
|
||||
runes.clear();
|
||||
for(size_t i = 0; i < s.size();) {
|
||||
if(!(s.data()[i] & 0x80)) { // 0xxxxxxx
|
||||
// 7bit, total 7bit
|
||||
tmp = (uint8_t)(s.data()[i]) & 0x7f;
|
||||
i++;
|
||||
} else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx
|
||||
// 5bit, total 5bit
|
||||
tmp = (uint8_t)(s.data()[i]) & 0x1f;
|
||||
|
||||
// 6bit, total 11bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
|
||||
i += 2;
|
||||
} else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx
|
||||
// 4bit, total 4bit
|
||||
tmp = (uint8_t)(s.data()[i]) & 0x0f;
|
||||
|
||||
// 6bit, total 10bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
|
||||
|
||||
// 6bit, total 16bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
|
||||
|
||||
i += 3;
|
||||
} else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx
|
||||
// 3bit, total 3bit
|
||||
tmp = (uint8_t)(s.data()[i]) & 0x07;
|
||||
|
||||
// 6bit, total 9bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
|
||||
|
||||
// 6bit, total 15bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
|
||||
|
||||
// 6bit, total 21bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(s.data()[i+3]) & 0x3f;
|
||||
|
||||
i += 4;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
uint32_t len = limonp::UnicodeToUtf8Bytes(tmp);
|
||||
RuneInfo x(tmp, offset, len, i, 1);
|
||||
runes.push_back(x);
|
||||
offset += len;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
class RunePtrWrapper {
|
||||
public:
|
||||
const RuneInfo * m_ptr = nullptr;
|
||||
|
||||
public:
|
||||
explicit RunePtrWrapper(const RuneInfo * p) : m_ptr(p) {}
|
||||
|
||||
uint32_t operator *() {
|
||||
return m_ptr->rune;
|
||||
}
|
||||
|
||||
RunePtrWrapper operator ++(int) {
|
||||
m_ptr ++;
|
||||
return RunePtrWrapper(m_ptr);
|
||||
}
|
||||
|
||||
bool operator !=(const RunePtrWrapper & b) const {
|
||||
return this->m_ptr != b.m_ptr;
|
||||
}
|
||||
};
|
||||
|
||||
/// Encodes the runes in [begin, end) with limonp::Unicode32ToUtf8 and returns
/// the resulting string.
inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) {
  RunePtrWrapper first(begin);
  RunePtrWrapper last(end);
  string encoded;
  limonp::Unicode32ToUtf8(first, last, encoded);
  return encoded;
}
|
||||
|
||||
/// Same as the returning overload, but hands the caller's `str` to
/// limonp::Unicode32ToUtf8 as the output buffer.
inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
  RunePtrWrapper first(begin);
  RunePtrWrapper last(end);
  limonp::Unicode32ToUtf8(first, last, str);
}
|
||||
|
||||
/// Container stand-in that only counts: it offers the clear()/push_back()
/// calls a decoder needs but throws the values away, so code points can be
/// counted without storing them (see Utf8CharNum).
class Unicode32Counter {
 public:
  size_t length = 0;  ///< number of push_back calls since the last clear()

  void clear() {
    length = 0;
  }

  void push_back(uint32_t) {
    length += 1;
  }
};
|
||||
|
||||
inline size_t Utf8CharNum(const char * str, size_t length) {
|
||||
Unicode32Counter c;
|
||||
|
||||
if (limonp::Utf8ToUnicode32(str, length, c)) {
|
||||
return c.length;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline size_t Utf8CharNum(const string & str) {
|
||||
return Utf8CharNum(str.data(), str.size());
|
||||
}
|
||||
|
||||
inline bool IsSingleWord(const string& str) {
|
||||
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
||||
return rp.len == str.size();
|
||||
}
|
||||
|
||||
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
||||
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
||||
}
|
||||
|
||||
inline Unicode DecodeRunesInString(const string& s) {
|
||||
Unicode result;
|
||||
DecodeRunesInString(s, result);
|
||||
return result;
|
||||
return Utf8CharNum(str) == 1;
|
||||
}
|
||||
|
||||
|
||||
|
@ -218,28 +242,31 @@ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left,
|
|||
/// Returns the bytes of `s` covered by the runes left..right (both inclusive):
/// from left->offset through the last byte of `right`.
/// `right` must not start before `left` (checked with assert).
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  const uint32_t len = right->offset - left->offset + right->len;
  // Slice directly; wrapping the substring in a temporary Word only to read
  // its `word` member back out added a copy and an unused unicode_length.
  return s.substr(left->offset, len);
}
|
||||
|
||||
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
||||
for(size_t i = 0; i < wrs.size(); i++) {
|
||||
for (size_t i = 0; i < wrs.size(); i++) {
|
||||
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
||||
}
|
||||
}
|
||||
|
||||
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
|
||||
vector<Word> result;
|
||||
GetWordsFromWordRanges(s, wrs, result);
|
||||
return result;
|
||||
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<string>& words) {
|
||||
for (size_t i = 0; i < wrs.size(); i++) {
|
||||
words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right));
|
||||
}
|
||||
}
|
||||
|
||||
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
||||
strs.resize(words.size());
|
||||
for(size_t i = 0; i < words.size(); ++i) {
|
||||
|
||||
for (size_t i = 0; i < words.size(); ++i) {
|
||||
strs[i] = words[i].word;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t MAX_WORD_LENGTH = 512;
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_UNICODE_H
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
|
@ -33,54 +15,54 @@ namespace limonp {
|
|||
using namespace std;
|
||||
|
||||
class ArgvContext {
|
||||
public :
|
||||
ArgvContext(int argc, const char* const * argv) {
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if(StartsWith(argv[i], "-")) {
|
||||
if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
|
||||
mpss_[argv[i]] = argv[i + 1];
|
||||
i++;
|
||||
} else {
|
||||
sset_.insert(argv[i]);
|
||||
}
|
||||
} else {
|
||||
args_.push_back(argv[i]);
|
||||
}
|
||||
public :
|
||||
ArgvContext(int argc, const char* const * argv) {
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if(StartsWith(argv[i], "-")) {
|
||||
if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
|
||||
mpss_[argv[i]] = argv[i+1];
|
||||
i++;
|
||||
} else {
|
||||
sset_.insert(argv[i]);
|
||||
}
|
||||
} else {
|
||||
args_.push_back(argv[i]);
|
||||
}
|
||||
}
|
||||
~ArgvContext() {
|
||||
}
|
||||
}
|
||||
~ArgvContext() {
|
||||
}
|
||||
|
||||
friend ostream& operator << (ostream& os, const ArgvContext& args);
|
||||
string operator [](size_t i) const {
|
||||
if(i < args_.size()) {
|
||||
return args_[i];
|
||||
}
|
||||
return "";
|
||||
friend ostream& operator << (ostream& os, const ArgvContext& args);
|
||||
string operator [](size_t i) const {
|
||||
if(i < args_.size()) {
|
||||
return args_[i];
|
||||
}
|
||||
string operator [](const string& key) const {
|
||||
map<string, string>::const_iterator it = mpss_.find(key);
|
||||
if(it != mpss_.end()) {
|
||||
return it->second;
|
||||
}
|
||||
return "";
|
||||
return "";
|
||||
}
|
||||
string operator [](const string& key) const {
|
||||
map<string, string>::const_iterator it = mpss_.find(key);
|
||||
if(it != mpss_.end()) {
|
||||
return it->second;
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
bool HasKey(const string& key) const {
|
||||
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
bool HasKey(const string& key) const {
|
||||
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<string> args_;
|
||||
map<string, string> mpss_;
|
||||
set<string> sset_;
|
||||
private:
|
||||
vector<string> args_;
|
||||
map<string, string> mpss_;
|
||||
set<string> sset_;
|
||||
}; // class ArgvContext
|
||||
|
||||
/// Streams the positional arguments, then the key/value pairs, then the
/// switches, using the container stream operators already in scope.
inline ostream& operator << (ostream& os, const ArgvContext& args) {
  os << args.args_;
  os << args.mpss_;
  os << args.sset_;
  return os;
}
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_BLOCKINGQUEUE_HPP
|
||||
#define LIMONP_BLOCKINGQUEUE_HPP
|
||||
|
||||
|
@ -25,41 +7,41 @@
|
|||
namespace limonp {
|
||||
template<class T>
|
||||
class BlockingQueue: NonCopyable {
|
||||
public:
|
||||
BlockingQueue()
|
||||
: mutex_(), notEmpty_(mutex_), queue_() {
|
||||
}
|
||||
public:
|
||||
BlockingQueue()
|
||||
: mutex_(), notEmpty_(mutex_), queue_() {
|
||||
}
|
||||
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
queue_.push(x);
|
||||
notEmpty_.Notify(); // Wait morphing saves us
|
||||
}
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
queue_.push(x);
|
||||
notEmpty_.Notify(); // Wait morphing saves us
|
||||
}
|
||||
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
// always use a while-loop, due to spurious wakeup
|
||||
while(queue_.empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.empty());
|
||||
T front(queue_.front());
|
||||
queue_.pop();
|
||||
return front;
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
// always use a while-loop, due to spurious wakeup
|
||||
while (queue_.empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.empty());
|
||||
T front(queue_.front());
|
||||
queue_.pop();
|
||||
return front;
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
bool Empty() const {
|
||||
return Size() == 0;
|
||||
}
|
||||
size_t Size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
bool Empty() const {
|
||||
return Size() == 0;
|
||||
}
|
||||
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
std::queue<T> queue_;
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
std::queue<T> queue_;
|
||||
}; // class BlockingQueue
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
|
||||
#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
|
||||
|
||||
|
@ -25,59 +7,59 @@ namespace limonp {
|
|||
|
||||
template<typename T>
|
||||
class BoundedBlockingQueue : NonCopyable {
|
||||
public:
|
||||
explicit BoundedBlockingQueue(size_t maxSize)
|
||||
: mutex_(),
|
||||
notEmpty_(mutex_),
|
||||
notFull_(mutex_),
|
||||
queue_(maxSize) {
|
||||
}
|
||||
public:
|
||||
explicit BoundedBlockingQueue(size_t maxSize)
|
||||
: mutex_(),
|
||||
notEmpty_(mutex_),
|
||||
notFull_(mutex_),
|
||||
queue_(maxSize) {
|
||||
}
|
||||
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while(queue_.Full()) {
|
||||
notFull_.Wait();
|
||||
}
|
||||
assert(!queue_.Full());
|
||||
queue_.Push(x);
|
||||
notEmpty_.Notify();
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while (queue_.Full()) {
|
||||
notFull_.Wait();
|
||||
}
|
||||
assert(!queue_.Full());
|
||||
queue_.Push(x);
|
||||
notEmpty_.Notify();
|
||||
}
|
||||
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while(queue_.Empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.Empty());
|
||||
T res = queue_.Pop();
|
||||
notFull_.Notify();
|
||||
return res;
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while (queue_.Empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.Empty());
|
||||
T res = queue_.Pop();
|
||||
notFull_.Notify();
|
||||
return res;
|
||||
}
|
||||
|
||||
bool Empty() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Empty();
|
||||
}
|
||||
bool Empty() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Empty();
|
||||
}
|
||||
|
||||
bool Full() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Full();
|
||||
}
|
||||
bool Full() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Full();
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
size_t size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
|
||||
size_t capacity() const {
|
||||
return queue_.capacity();
|
||||
}
|
||||
size_t capacity() const {
|
||||
return queue_.capacity();
|
||||
}
|
||||
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
Condition notFull_;
|
||||
BoundedQueue<T> queue_;
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
Condition notFull_;
|
||||
BoundedQueue<T> queue_;
|
||||
}; // class BoundedBlockingQueue
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_BOUNDED_QUEUE_HPP
|
||||
#define LIMONP_BOUNDED_QUEUE_HPP
|
||||
|
||||
|
@ -27,55 +9,55 @@ namespace limonp {
|
|||
using namespace std;
|
||||
/// Fixed-capacity FIFO over a circular buffer. Not synchronized. Push() on a
/// full queue and Pop() on an empty one are caught only by assert.
template<class T>
class BoundedQueue {
 public:
  /// Allocates `capacity` default-constructed slots; capacity must be non-zero.
  explicit BoundedQueue(size_t capacity)
    : head_(0), tail_(0), size_(0), capacity_(capacity), circular_buffer_(capacity) {
    assert(capacity_);
  }
  ~BoundedQueue() {
  }

  /// Resets to empty; the stored elements stay in the buffer until overwritten.
  void Clear() {
    head_ = tail_ = size_ = 0;
  }
  bool Empty() const {
    return size_ == 0;
  }
  bool Full() const {
    return size_ == capacity_;
  }
  size_t Size() const {
    return size_;
  }
  size_t Capacity() const {
    return capacity_;
  }

  /// Stores a copy of `t` at the tail.
  void Push(const T& t) {
    assert(!Full());
    circular_buffer_[tail_] = t;
    tail_ = (tail_ + 1) % capacity_;
    ++size_;
  }

  /// Returns a copy of the head element and advances past it.
  T Pop() {
    assert(!Empty());
    const size_t slot = head_;
    head_ = (head_ + 1) % capacity_;
    --size_;
    return circular_buffer_[slot];
  }

 private:
  size_t head_;  // index of the oldest element
  size_t tail_;  // index where the next element is written
  size_t size_;  // number of live elements
  const size_t capacity_;
  std::vector<T> circular_buffer_;

}; // class BoundedQueue
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,222 +1,204 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_CLOSURE_HPP
|
||||
#define LIMONP_CLOSURE_HPP
|
||||
|
||||
namespace limonp {
|
||||
|
||||
/// Abstract unit of work: subclasses bind a callable (and its arguments) and
/// execute it in Run(). The virtual destructor allows deletion through a
/// ClosureInterface pointer.
class ClosureInterface {
 public:
  virtual void Run() = 0;
  virtual ~ClosureInterface() {}
};
|
||||
|
||||
template <class Funct>
|
||||
class Closure0: public ClosureInterface {
|
||||
public:
|
||||
Closure0(Funct fun) {
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~Closure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)();
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
};
|
||||
public:
|
||||
Closure0(Funct fun) {
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~Closure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)();
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1>
|
||||
class Closure1: public ClosureInterface {
|
||||
public:
|
||||
Closure1(Funct fun, Arg1 arg1) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~Closure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
public:
|
||||
Closure1(Funct fun, Arg1 arg1) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~Closure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1, class Arg2>
|
||||
class Closure2: public ClosureInterface {
|
||||
public:
|
||||
Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~Closure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
public:
|
||||
Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~Closure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1, class Arg2, class Arg3>
|
||||
class Closure3: public ClosureInterface {
|
||||
public:
|
||||
Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~Closure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
public:
|
||||
Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~Closure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct>
|
||||
template <class Obj, class Funct>
|
||||
class ObjClosure0: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure0(Obj* p, Funct fun) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~ObjClosure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)();
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
};
|
||||
public:
|
||||
ObjClosure0(Obj* p, Funct fun) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~ObjClosure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)();
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct, class Arg1>
|
||||
template <class Obj, class Funct, class Arg1>
|
||||
class ObjClosure1: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~ObjClosure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
public:
|
||||
ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~ObjClosure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct, class Arg1, class Arg2>
|
||||
template <class Obj, class Funct, class Arg1, class Arg2>
|
||||
class ObjClosure2: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~ObjClosure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
|
||||
public:
|
||||
ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~ObjClosure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
|
||||
class ObjClosure3: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~ObjClosure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
public:
|
||||
ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~ObjClosure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
|
||||
template<class R>
|
||||
ClosureInterface* NewClosure(R(*fun)()) {
|
||||
return new Closure0<R(*)()>(fun);
|
||||
ClosureInterface* NewClosure(R (*fun)()) {
|
||||
return new Closure0<R (*)()>(fun);
|
||||
}
|
||||
|
||||
template<class R, class Arg1>
|
||||
ClosureInterface* NewClosure(R(*fun)(Arg1), Arg1 arg1) {
|
||||
return new Closure1<R(*)(Arg1), Arg1>(fun, arg1);
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
|
||||
return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
|
||||
}
|
||||
|
||||
template<class R, class Arg1, class Arg2>
|
||||
ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new Closure2<R(*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
|
||||
}
|
||||
|
||||
template<class R, class Arg1, class Arg2, class Arg3>
|
||||
ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new Closure3<R(*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
|
||||
}
|
||||
|
||||
template<class R, class Obj>
|
||||
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)()) {
|
||||
return new ObjClosure0<Obj, R(Obj::*)()>(obj, fun);
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
|
||||
return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1>
|
||||
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1), Arg1 arg1) {
|
||||
return new ObjClosure1<Obj, R(Obj::*)(Arg1), Arg1>(obj, fun, arg1);
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
|
||||
return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1, class Arg2>
|
||||
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new ObjClosure2<Obj, R(Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1, class Arg2, class Arg3>
|
||||
ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new ObjClosure3<Obj, R(Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
|
||||
}
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_COLOR_PRINT_HPP
|
||||
#define LIMONP_COLOR_PRINT_HPP
|
||||
|
||||
|
@ -27,21 +9,21 @@ namespace limonp {
|
|||
using std::string;
|
||||
|
||||
// Foreground colour codes passed to ColorPrintln, which prints the numeric
// value inside an "\033[0;<n>m" escape sequence. Values run consecutively
// from BLACK = 30 to PURPLE = 35.
enum Color {
  BLACK = 30,
  RED,
  GREEN,
  YELLOW,
  BLUE,
  PURPLE
}; // enum Color
|
||||
|
||||
// Prints a printf-style formatted message to stdout in the given colour,
// followed by a newline.
//   color: value emitted inside the "\033[0;<n>m" escape sequence.
//   fmt:   printf-style format string; the variadic arguments must match it.
static void ColorPrintln(enum Color color, const char * fmt, ...) {
  va_list ap;
  printf("\033[0;%dm", color);
  va_start(ap, fmt);
  vprintf(fmt, ap);
  va_end(ap);
  printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly
}
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_CONDITION_HPP
|
||||
#define LIMONP_CONDITION_HPP
|
||||
|
||||
|
@ -24,31 +6,31 @@
|
|||
namespace limonp {
|
||||
|
||||
class Condition : NonCopyable {
|
||||
public:
|
||||
explicit Condition(MutexLock& mutex)
|
||||
: mutex_(mutex) {
|
||||
XCHECK(!pthread_cond_init(&pcond_, NULL));
|
||||
}
|
||||
public:
|
||||
explicit Condition(MutexLock& mutex)
|
||||
: mutex_(mutex) {
|
||||
XCHECK(!pthread_cond_init(&pcond_, NULL));
|
||||
}
|
||||
|
||||
~Condition() {
|
||||
XCHECK(!pthread_cond_destroy(&pcond_));
|
||||
}
|
||||
~Condition() {
|
||||
XCHECK(!pthread_cond_destroy(&pcond_));
|
||||
}
|
||||
|
||||
void Wait() {
|
||||
XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
|
||||
}
|
||||
void Wait() {
|
||||
XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
|
||||
}
|
||||
|
||||
void Notify() {
|
||||
XCHECK(!pthread_cond_signal(&pcond_));
|
||||
}
|
||||
void Notify() {
|
||||
XCHECK(!pthread_cond_signal(&pcond_));
|
||||
}
|
||||
|
||||
void NotifyAll() {
|
||||
XCHECK(!pthread_cond_broadcast(&pcond_));
|
||||
}
|
||||
void NotifyAll() {
|
||||
XCHECK(!pthread_cond_broadcast(&pcond_));
|
||||
}
|
||||
|
||||
private:
|
||||
MutexLock& mutex_;
|
||||
pthread_cond_t pcond_;
|
||||
private:
|
||||
MutexLock& mutex_;
|
||||
pthread_cond_t pcond_;
|
||||
}; // class Condition
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
/************************************
|
||||
* file enc : utf8
|
||||
* author : wuyanyi09@gmail.com
|
||||
|
@ -34,86 +16,86 @@ namespace limonp {
|
|||
using namespace std;
|
||||
|
||||
class Config {
|
||||
public:
|
||||
explicit Config(const string& filePath) {
|
||||
LoadFile(filePath);
|
||||
}
|
||||
public:
|
||||
explicit Config(const string& filePath) {
|
||||
LoadFile(filePath);
|
||||
}
|
||||
|
||||
operator bool () {
|
||||
return !map_.empty();
|
||||
}
|
||||
operator bool () {
|
||||
return !map_.empty();
|
||||
}
|
||||
|
||||
string Get(const string& key, const string& defaultvalue) const {
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second;
|
||||
}
|
||||
return defaultvalue;
|
||||
string Get(const string& key, const string& defaultvalue) const {
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second;
|
||||
}
|
||||
int Get(const string& key, int defaultvalue) const {
|
||||
string str = Get(key, "");
|
||||
if("" == str) {
|
||||
return defaultvalue;
|
||||
}
|
||||
return atoi(str.c_str());
|
||||
return defaultvalue;
|
||||
}
|
||||
int Get(const string& key, int defaultvalue) const {
|
||||
string str = Get(key, "");
|
||||
if("" == str) {
|
||||
return defaultvalue;
|
||||
}
|
||||
const char* operator [](const char* key) const {
|
||||
if(NULL == key) {
|
||||
return NULL;
|
||||
}
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second.c_str();
|
||||
}
|
||||
return NULL;
|
||||
return atoi(str.c_str());
|
||||
}
|
||||
const char* operator [] (const char* key) const {
|
||||
if(NULL == key) {
|
||||
return NULL;
|
||||
}
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second.c_str();
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string GetConfigInfo() const {
|
||||
string res;
|
||||
res << *this;
|
||||
return res;
|
||||
string GetConfigInfo() const {
|
||||
string res;
|
||||
res << *this;
|
||||
return res;
|
||||
}
|
||||
|
||||
private:
|
||||
void LoadFile(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
string line;
|
||||
vector<string> vecBuf;
|
||||
size_t lineno = 0;
|
||||
while(getline(ifs, line)) {
|
||||
lineno ++;
|
||||
Trim(line);
|
||||
if(line.empty() || StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
vecBuf.clear();
|
||||
Split(line, vecBuf, "=");
|
||||
if(2 != vecBuf.size()) {
|
||||
fprintf(stderr, "line[%s] illegal.\n", line.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
string& key = vecBuf[0];
|
||||
string& value = vecBuf[1];
|
||||
Trim(key);
|
||||
Trim(value);
|
||||
if(!map_.insert(make_pair(key, value)).second) {
|
||||
fprintf(stderr, "key[%s] already exits.\n", key.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
ifs.close();
|
||||
}
|
||||
|
||||
private:
|
||||
void LoadFile(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
string line;
|
||||
vector<string> vecBuf;
|
||||
size_t lineno = 0;
|
||||
while(getline(ifs, line)) {
|
||||
lineno ++;
|
||||
Trim(line);
|
||||
if(line.empty() || StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
vecBuf.clear();
|
||||
Split(line, vecBuf, "=");
|
||||
if(2 != vecBuf.size()) {
|
||||
fprintf(stderr, "line[%s] illegal.\n", line.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
string& key = vecBuf[0];
|
||||
string& value = vecBuf[1];
|
||||
Trim(key);
|
||||
Trim(value);
|
||||
if(!map_.insert(make_pair(key, value)).second) {
|
||||
fprintf(stderr, "key[%s] already exits.\n", key.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
ifs.close();
|
||||
}
|
||||
friend ostream& operator << (ostream& os, const Config& config);
|
||||
|
||||
friend ostream& operator << (ostream& os, const Config& config);
|
||||
|
||||
map<string, string> map_;
|
||||
map<string, string> map_;
|
||||
}; // class Config
|
||||
|
||||
inline ostream& operator << (ostream& os, const Config& config) {
|
||||
return os << config.map_;
|
||||
return os << config.map_;
|
||||
}
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_FILELOCK_HPP
|
||||
#define LIMONP_FILELOCK_HPP
|
||||
|
||||
|
@ -33,58 +15,58 @@ namespace limonp {
|
|||
using std::string;
|
||||
|
||||
// Whole-file lock built on fcntl(F_SETLK). Failures do not throw: they
// clear Ok() and record strerror(errno) for Error().
class FileLock {
 public:
  FileLock() : fd_(-1), ok_(true) {
  }
  // Closes the descriptor if one is still open. 0 is a valid descriptor, so
  // the check is against the -1 sentinel rather than `> 0`.
  ~FileLock() {
    if(fd_ >= 0) {
      Close();
    }
  }
  // Opens (creating with mode 0644 if needed) the lock file `fname` for
  // read/write. Must not be called while a descriptor is already held.
  void Open(const std::string& fname) {
    assert(fd_ == -1);
    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if(fd_ < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // Closes the descriptor and resets it to -1, so the destructor does not
  // close it a second time and Open() may be called again.
  void Close() {
    if(fd_ >= 0) {
      ::close(fd_);
      fd_ = -1;
    }
  }
  // Requests a write lock on the whole file via F_SETLK.
  void Lock() {
    if(LockOrUnlock(fd_, true) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // Releases the lock on the whole file.
  void UnLock() {
    if(LockOrUnlock(fd_, false) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // False once any Open/Lock/UnLock call has failed; never reset to true.
  bool Ok() const {
    return ok_;
  }
  // Message recorded by the most recent failure (empty if none).
  std::string Error() const {
    return err_;
  }
 private:
  // Issues fcntl(F_SETLK) with F_WRLCK (lock) or F_UNLCK (unlock) covering
  // the entire file; returns fcntl's result.
  static int LockOrUnlock(int fd, bool lock) {
    errno = 0;
    struct flock f;
    memset(&f, 0, sizeof(f));
    f.l_type = (lock ? F_WRLCK : F_UNLCK);
    f.l_whence = SEEK_SET;
    f.l_start = 0;
    f.l_len = 0;        // Lock/unlock entire file
    return fcntl(fd, F_SETLK, &f);
  }

  int fd_;
  bool ok_;
  std::string err_;
}; // class FileLock
|
||||
|
||||
}// namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_FORCE_PUBLIC_H
|
||||
#define LIMONP_FORCE_PUBLIC_H
|
||||
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_LOCAL_VECTOR_HPP
|
||||
#define LIMONP_LOCAL_VECTOR_HPP
|
||||
|
||||
|
@ -33,123 +15,126 @@ using namespace std;
|
|||
// Number of elements held in LocalVector's in-object buffer before it
// switches to malloc'd storage.
const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;

// Small-buffer vector: the first LOCAL_VECTOR_BUFFER_SIZE elements live in
// an in-object array; beyond that, storage comes from malloc.
// Elements are moved and copied with memcpy and released with free(), so no
// constructors or destructors run for heap-stored elements.
// NOTE(review): this looks safe only for trivially copyable T - confirm
// against the element types used by callers.
template <class T>
class LocalVector {
 public:
  typedef const T* const_iterator ;
  typedef T value_type;
  typedef size_t size_type;
 private:
  T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
  T * ptr_;          // buffer_ while small, malloc'd block once grown
  size_t size_;
  size_t capacity_;
 public:
  LocalVector() {
    init_();
  }
  LocalVector(const LocalVector<T>& vec) {
    init_();
    *this = vec;
  }
  // Copies the range [begin, end) element by element.
  LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
    init_();
    while(begin != end) {
      push_back(*begin++);
    }
  }
  // Creates a vector holding `size` copies of `t`.
  LocalVector(size_t size, const T& t) { // TODO: make it faster
    init_();
    while(size--) {
      push_back(t);
    }
  }
  ~LocalVector() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
  }
 public:
  // Replaces the contents with a copy of `vec`, taking over its size and
  // capacity. Self-assignment is a no-op (clear() would otherwise discard
  // the data before it is copied).
  LocalVector<T>& operator = (const LocalVector<T>& vec) {
    if(this == &vec){
      return *this;
    }
    clear();
    size_ = vec.size();
    capacity_ = vec.capacity();
    if(vec.buffer_ == vec.ptr_) {
      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
      ptr_ = buffer_;
    } else {
      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
      assert(ptr_);
      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
    }
    return *this;
  }
 private:
  // Resets to the empty, in-object-buffer state without freeing anything.
  void init_() {
    ptr_ = buffer_;
    size_ = 0;
    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
  }
 public:
  // Unchecked element access.
  T& operator [] (size_t i) {
    return ptr_[i];
  }
  const T& operator [] (size_t i) const {
    return ptr_[i];
  }
  // Appends `t`, doubling the capacity when the storage is full.
  void push_back(const T& t) {
    if(size_ == capacity_) {
      assert(capacity_);
      reserve(capacity_ * 2);
    }
    ptr_[size_ ++ ] = t;
  }
  // Grows the storage to hold at least `size` elements; never shrinks.
  void reserve(size_t size) {
    if(size <= capacity_) {
      return;
    }
    T * next = (T*)malloc(sizeof(T) * size);
    assert(next);
    T * old = ptr_;
    ptr_ = next;
    // Only the first size_ slots hold live elements.
    memcpy(ptr_, old, sizeof(T) * size_);
    capacity_ = size;
    if(old != buffer_) {
      free(old);
    }
  }
  bool empty() const {
    return 0 == size();
  }
  size_t size() const {
    return size_;
  }
  size_t capacity() const {
    return capacity_;
  }
  const_iterator begin() const {
    return ptr_;
  }
  const_iterator end() const {
    return ptr_ + size_;
  }
  // Frees any heap storage and returns to the empty in-object state.
  void clear() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
    init_();
  }
};
|
||||
|
||||
template <class T>
|
||||
ostream & operator << (ostream& os, const LocalVector<T>& vec) {
|
||||
if(vec.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os << "[\"" << vec[0];
|
||||
for(size_t i = 1; i < vec.size(); i++) {
|
||||
os << "\", \"" << vec[i];
|
||||
}
|
||||
os << "\"]";
|
||||
return os;
|
||||
if(vec.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"[\""<<vec[0];
|
||||
for(size_t i = 1; i < vec.size(); i++) {
|
||||
os<<"\", \""<<vec[i];
|
||||
}
|
||||
os<<"\"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_LOGGING_HPP
|
||||
#define LIMONP_LOGGING_HPP
|
||||
|
||||
|
@ -38,55 +20,56 @@
|
|||
namespace limonp {

// Log severities; the numeric value doubles as the index into
// LOG_LEVEL_ARRAY.
enum {
  LL_DEBUG = 0,
  LL_INFO = 1,
  LL_WARNING = 2,
  LL_ERROR = 3,
  LL_FATAL = 4,
}; // enum

static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};

// One log record. The constructor writes the
// "<timestamp> <file>:<line> <LEVEL> " prefix into an internal stream,
// callers append the message through Stream(), and the destructor writes
// the whole record to std::cerr. A record at LL_FATAL then calls abort().
// When LOGGING_LEVEL is defined, records below that level are neither
// formatted nor written.
class Logger {
 public:
  // level:    one of the LL_* values.
  // filename: source file name written into the prefix.
  // lineno:   source line number written into the prefix.
  Logger(size_t level, const char* filename, int lineno)
   : level_(level) {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    // level_ indexes LOG_LEVEL_ARRAY below, so it must be strictly less
    // than the element count.
    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
    char buf[32];
    time_t now;
    time(&now);
    struct tm result;
    localtime_r(&now, &result);
    strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &result);
    stream_ << buf
      << " " << filename
      << ":" << lineno
      << " " << LOG_LEVEL_ARRAY[level_]
      << " ";
  }
  ~Logger() {
#ifdef LOGGING_LEVEL
    if (level_ < LOGGING_LEVEL) {
      return;
    }
#endif
    std::cerr << stream_.str() << std::endl;
    if (level_ == LL_FATAL) {
      abort();
    }
  }

  // Stream that receives the message text for this record.
  std::ostream& Stream() {
    return stream_;
  }

 private:
  std::ostringstream stream_;
  size_t level_;
}; // class Logger

} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_MUTEX_LOCK_HPP
|
||||
#define LIMONP_MUTEX_LOCK_HPP
|
||||
|
||||
|
@ -26,40 +8,40 @@
|
|||
namespace limonp {
|
||||
|
||||
class MutexLock: NonCopyable {
|
||||
public:
|
||||
MutexLock() {
|
||||
XCHECK(!pthread_mutex_init(&mutex_, NULL));
|
||||
}
|
||||
~MutexLock() {
|
||||
XCHECK(!pthread_mutex_destroy(&mutex_));
|
||||
}
|
||||
pthread_mutex_t* GetPthreadMutex() {
|
||||
return &mutex_;
|
||||
}
|
||||
public:
|
||||
MutexLock() {
|
||||
XCHECK(!pthread_mutex_init(&mutex_, NULL));
|
||||
}
|
||||
~MutexLock() {
|
||||
XCHECK(!pthread_mutex_destroy(&mutex_));
|
||||
}
|
||||
pthread_mutex_t* GetPthreadMutex() {
|
||||
return &mutex_;
|
||||
}
|
||||
|
||||
private:
|
||||
void Lock() {
|
||||
XCHECK(!pthread_mutex_lock(&mutex_));
|
||||
}
|
||||
void Unlock() {
|
||||
XCHECK(!pthread_mutex_unlock(&mutex_));
|
||||
}
|
||||
friend class MutexLockGuard;
|
||||
private:
|
||||
void Lock() {
|
||||
XCHECK(!pthread_mutex_lock(&mutex_));
|
||||
}
|
||||
void Unlock() {
|
||||
XCHECK(!pthread_mutex_unlock(&mutex_));
|
||||
}
|
||||
friend class MutexLockGuard;
|
||||
|
||||
pthread_mutex_t mutex_;
|
||||
pthread_mutex_t mutex_;
|
||||
}; // class MutexLock
|
||||
|
||||
class MutexLockGuard: NonCopyable {
|
||||
public:
|
||||
explicit MutexLockGuard(MutexLock & mutex)
|
||||
: mutex_(mutex) {
|
||||
mutex_.Lock();
|
||||
}
|
||||
~MutexLockGuard() {
|
||||
mutex_.Unlock();
|
||||
}
|
||||
private:
|
||||
MutexLock & mutex_;
|
||||
public:
|
||||
explicit MutexLockGuard(MutexLock & mutex)
|
||||
: mutex_(mutex) {
|
||||
mutex_.Lock();
|
||||
}
|
||||
~MutexLockGuard() {
|
||||
mutex_.Unlock();
|
||||
}
|
||||
private:
|
||||
MutexLock & mutex_;
|
||||
}; // class MutexLockGuard
|
||||
|
||||
#define MutexLockGuard(x) XCHECK(false);
|
||||
|
|
|
@ -1,35 +1,19 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
/************************************
|
||||
************************************/
|
||||
#ifndef LIMONP_NONCOPYABLE_H
|
||||
#define LIMONP_NONCOPYABLE_H
|
||||
|
||||
namespace limonp {
|
||||
|
||||
class NonCopyable {
|
||||
protected:
|
||||
NonCopyable() {
|
||||
}
|
||||
~NonCopyable() {
|
||||
}
|
||||
private:
|
||||
NonCopyable(const NonCopyable&);
|
||||
const NonCopyable& operator=(const NonCopyable&);
|
||||
protected:
|
||||
NonCopyable() {
|
||||
}
|
||||
~NonCopyable() {
|
||||
}
|
||||
private:
|
||||
NonCopyable(const NonCopyable& );
|
||||
const NonCopyable& operator=(const NonCopyable& );
|
||||
}; // class NonCopyable
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_STD_EXTEMSION_HPP
|
||||
#define LIMONP_STD_EXTEMSION_HPP
|
||||
|
||||
|
@ -51,123 +33,123 @@ namespace std {
|
|||
|
||||
template<typename T>
|
||||
ostream& operator << (ostream& os, const vector<T>& v) {
|
||||
if(v.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os << "[" << v[0];
|
||||
for(size_t i = 1; i < v.size(); i++) {
|
||||
os << ", " << v[i];
|
||||
}
|
||||
os << "]";
|
||||
return os;
|
||||
if(v.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"["<<v[0];
|
||||
for(size_t i = 1; i < v.size(); i++) {
|
||||
os<<", "<<v[i];
|
||||
}
|
||||
os<<"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline ostream& operator << (ostream& os, const vector<string>& v) {
|
||||
if(v.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os << "[\"" << v[0];
|
||||
for(size_t i = 1; i < v.size(); i++) {
|
||||
os << "\", \"" << v[i];
|
||||
}
|
||||
os << "\"]";
|
||||
return os;
|
||||
if(v.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"[\""<<v[0];
|
||||
for(size_t i = 1; i < v.size(); i++) {
|
||||
os<<"\", \""<<v[i];
|
||||
}
|
||||
os<<"\"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ostream& operator << (ostream& os, const deque<T>& dq) {
|
||||
if(dq.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os << "[\"" << dq[0];
|
||||
for(size_t i = 1; i < dq.size(); i++) {
|
||||
os << "\", \"" << dq[i];
|
||||
}
|
||||
os << "\"]";
|
||||
return os;
|
||||
if(dq.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"[\""<<dq[0];
|
||||
for(size_t i = 1; i < dq.size(); i++) {
|
||||
os<<"\", \""<<dq[i];
|
||||
}
|
||||
os<<"\"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
template<class T1, class T2>
|
||||
ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
|
||||
os << pr.first << ":" << pr.second ;
|
||||
return os;
|
||||
os << pr.first << ":" << pr.second ;
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
template<class T>
|
||||
string& operator << (string& str, const T& obj) {
|
||||
stringstream ss;
|
||||
ss << obj; // call ostream& operator << (ostream& os,
|
||||
return str = ss.str();
|
||||
stringstream ss;
|
||||
ss << obj; // call ostream& operator << (ostream& os,
|
||||
return str = ss.str();
|
||||
}
|
||||
|
||||
template<class T1, class T2>
|
||||
ostream& operator << (ostream& os, const map<T1, T2>& mp) {
|
||||
if(mp.empty()) {
|
||||
os << "{}";
|
||||
return os;
|
||||
}
|
||||
os << '{';
|
||||
typename map<T1, T2>::const_iterator it = mp.begin();
|
||||
os << *it;
|
||||
it++;
|
||||
while(it != mp.end()) {
|
||||
os << ", " << *it;
|
||||
it++;
|
||||
}
|
||||
os << '}';
|
||||
if(mp.empty()) {
|
||||
os<<"{}";
|
||||
return os;
|
||||
}
|
||||
os<<'{';
|
||||
typename map<T1, T2>::const_iterator it = mp.begin();
|
||||
os<<*it;
|
||||
it++;
|
||||
while(it != mp.end()) {
|
||||
os<<", "<<*it;
|
||||
it++;
|
||||
}
|
||||
os<<'}';
|
||||
return os;
|
||||
}
|
||||
template<class T1, class T2>
|
||||
ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
|
||||
if(mp.empty()) {
|
||||
return os << "{}";
|
||||
}
|
||||
os << '{';
|
||||
typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
|
||||
os << *it;
|
||||
it++;
|
||||
while(it != mp.end()) {
|
||||
os << ", " << *it++;
|
||||
}
|
||||
return os << '}';
|
||||
if(mp.empty()) {
|
||||
return os << "{}";
|
||||
}
|
||||
os<<'{';
|
||||
typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
|
||||
os<<*it;
|
||||
it++;
|
||||
while(it != mp.end()) {
|
||||
os<<", "<<*it++;
|
||||
}
|
||||
return os<<'}';
|
||||
}
|
||||
|
||||
template<class T>
|
||||
ostream& operator << (ostream& os, const set<T>& st) {
|
||||
if(st.empty()) {
|
||||
os << "{}";
|
||||
return os;
|
||||
}
|
||||
os << '{';
|
||||
typename set<T>::const_iterator it = st.begin();
|
||||
os << *it;
|
||||
it++;
|
||||
while(it != st.end()) {
|
||||
os << ", " << *it;
|
||||
it++;
|
||||
}
|
||||
os << '}';
|
||||
if(st.empty()) {
|
||||
os << "{}";
|
||||
return os;
|
||||
}
|
||||
os<<'{';
|
||||
typename set<T>::const_iterator it = st.begin();
|
||||
os<<*it;
|
||||
it++;
|
||||
while(it != st.end()) {
|
||||
os<<", "<<*it;
|
||||
it++;
|
||||
}
|
||||
os<<'}';
|
||||
return os;
|
||||
}
|
||||
|
||||
template<class KeyType, class ContainType>
|
||||
bool IsIn(const ContainType& contain, const KeyType& key) {
|
||||
return contain.end() != contain.find(key);
|
||||
return contain.end() != contain.find(key);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
|
||||
return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
|
||||
return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
|
||||
}
|
||||
|
||||
template<class T>
|
||||
ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
|
||||
ostreambuf_iterator<T> itr(ofs);
|
||||
copy(s.begin(), s.end(), itr);
|
||||
return ofs;
|
||||
ostreambuf_iterator<T> itr (ofs);
|
||||
copy(s.begin(), s.end(), itr);
|
||||
return ofs;
|
||||
}
|
||||
|
||||
} // namespace std
|
||||
|
|
|
@ -1,27 +1,14 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef LIMONP_STR_FUNCTS_H
|
||||
#define LIMONP_STR_FUNCTS_H
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <memory.h>
|
||||
#include <sys/types.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
@ -29,14 +16,9 @@
|
|||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <map>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <memory.h>
|
||||
#include <functional>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <sys/types.h>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include "StdExtension.hpp"
|
||||
|
@ -44,339 +26,356 @@
|
|||
namespace limonp {
|
||||
using namespace std;
|
||||
inline string StringFormat(const char* fmt, ...) {
|
||||
int size = 256;
|
||||
std::string str;
|
||||
va_list ap;
|
||||
while(1) {
|
||||
str.resize(size);
|
||||
va_start(ap, fmt);
|
||||
int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
|
||||
va_end(ap);
|
||||
if(n > -1 && n < size) {
|
||||
str.resize(n);
|
||||
return str;
|
||||
}
|
||||
if(n > -1)
|
||||
size = n + 1;
|
||||
else
|
||||
size *= 2;
|
||||
int size = 256;
|
||||
std::string str;
|
||||
va_list ap;
|
||||
while (1) {
|
||||
str.resize(size);
|
||||
va_start(ap, fmt);
|
||||
int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
|
||||
va_end(ap);
|
||||
if (n > -1 && n < size) {
|
||||
str.resize(n);
|
||||
return str;
|
||||
}
|
||||
return str;
|
||||
if (n > -1)
|
||||
size = n + 1;
|
||||
else
|
||||
size *= 2;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void Join(T begin, T end, string& res, const string& connector) {
|
||||
if(begin == end) {
|
||||
return;
|
||||
}
|
||||
stringstream ss;
|
||||
ss << *begin;
|
||||
begin++;
|
||||
while(begin != end) {
|
||||
ss << connector << *begin;
|
||||
begin ++;
|
||||
}
|
||||
res = ss.str();
|
||||
if(begin == end) {
|
||||
return;
|
||||
}
|
||||
stringstream ss;
|
||||
ss<<*begin;
|
||||
begin++;
|
||||
while(begin != end) {
|
||||
ss << connector << *begin;
|
||||
begin ++;
|
||||
}
|
||||
res = ss.str();
|
||||
}
|
||||
|
||||
template<class T>
|
||||
string Join(T begin, T end, const string& connector) {
|
||||
string res;
|
||||
Join(begin, end, res, connector);
|
||||
return res;
|
||||
string res;
|
||||
Join(begin ,end, res, connector);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline string& Upper(string& str) {
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
|
||||
return str;
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
|
||||
return str;
|
||||
}
|
||||
|
||||
inline string& Lower(string& str) {
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
|
||||
return str;
|
||||
transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
|
||||
return str;
|
||||
}
|
||||
|
||||
inline bool IsSpace(unsigned c) {
|
||||
// when passing large int as the argument of isspace, it core dump, so here need a type cast.
|
||||
return c > 0xff ? false : std::isspace(c & 0xff) != 0;
|
||||
// when passing large int as the argument of isspace, it core dump, so here need a type cast.
|
||||
return c > 0xff ? false : std::isspace(c & 0xff);
|
||||
}
|
||||
|
||||
inline std::string& LTrim(std::string &s) {
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
|
||||
return s;
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
|
||||
return s;
|
||||
}
|
||||
|
||||
inline std::string& RTrim(std::string &s) {
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
|
||||
return s;
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
|
||||
return s;
|
||||
}
|
||||
|
||||
inline std::string& Trim(std::string &s) {
|
||||
return LTrim(RTrim(s));
|
||||
return LTrim(RTrim(s));
|
||||
}
|
||||
|
||||
inline std::string& LTrim(std::string & s, char x) {
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
|
||||
return s;
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
|
||||
return s;
|
||||
}
|
||||
|
||||
inline std::string& RTrim(std::string & s, char x) {
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
|
||||
return s;
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
|
||||
return s;
|
||||
}
|
||||
|
||||
inline std::string& Trim(std::string &s, char x) {
|
||||
return LTrim(RTrim(s, x), x);
|
||||
return LTrim(RTrim(s, x), x);
|
||||
}
|
||||
|
||||
inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
|
||||
res.clear();
|
||||
size_t Start = 0;
|
||||
size_t end = 0;
|
||||
string sub;
|
||||
while(Start < src.size()) {
|
||||
end = src.find_first_of(pattern, Start);
|
||||
if(string::npos == end || res.size() >= maxsplit) {
|
||||
sub = src.substr(Start);
|
||||
res.push_back(sub);
|
||||
return;
|
||||
}
|
||||
sub = src.substr(Start, end - Start);
|
||||
res.push_back(sub);
|
||||
Start = end + 1;
|
||||
res.clear();
|
||||
size_t Start = 0;
|
||||
size_t end = 0;
|
||||
string sub;
|
||||
while(Start < src.size()) {
|
||||
end = src.find_first_of(pattern, Start);
|
||||
if(string::npos == end || res.size() >= maxsplit) {
|
||||
sub = src.substr(Start);
|
||||
res.push_back(sub);
|
||||
return;
|
||||
}
|
||||
return;
|
||||
sub = src.substr(Start, end - Start);
|
||||
res.push_back(sub);
|
||||
Start = end + 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
|
||||
vector<string> res;
|
||||
Split(src, res, pattern, maxsplit);
|
||||
return res;
|
||||
vector<string> res;
|
||||
Split(src, res, pattern, maxsplit);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline bool StartsWith(const string& str, const string& prefix) {
|
||||
if(prefix.length() > str.length()) {
|
||||
return false;
|
||||
}
|
||||
return 0 == str.compare(0, prefix.length(), prefix);
|
||||
if(prefix.length() > str.length()) {
|
||||
return false;
|
||||
}
|
||||
return 0 == str.compare(0, prefix.length(), prefix);
|
||||
}
|
||||
|
||||
inline bool EndsWith(const string& str, const string& suffix) {
|
||||
if(suffix.length() > str.length()) {
|
||||
return false;
|
||||
}
|
||||
return 0 == str.compare(str.length() - suffix.length(), suffix.length(), suffix);
|
||||
if(suffix.length() > str.length()) {
|
||||
return false;
|
||||
}
|
||||
return 0 == str.compare(str.length() - suffix.length(), suffix.length(), suffix);
|
||||
}
|
||||
|
||||
inline bool IsInStr(const string& str, char ch) {
|
||||
return str.find(ch) != string::npos;
|
||||
return str.find(ch) != string::npos;
|
||||
}
|
||||
|
||||
inline uint16_t TwocharToUint16(char high, char low) {
|
||||
return (((uint16_t(high) & 0x00ff) << 8) | (uint16_t(low) & 0x00ff));
|
||||
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
|
||||
}
|
||||
|
||||
template <class Uint16Container>
|
||||
bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
|
||||
if(!str) {
|
||||
return false;
|
||||
if(!str) {
|
||||
return false;
|
||||
}
|
||||
char ch1, ch2;
|
||||
uint16_t tmp;
|
||||
vec.clear();
|
||||
for(size_t i = 0; i < len;) {
|
||||
if(!(str[i] & 0x80)) { // 0xxxxxxx
|
||||
vec.push_back(str[i]);
|
||||
i++;
|
||||
} else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
|
||||
ch1 = (str[i] >> 2) & 0x07;
|
||||
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
|
||||
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
} else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
|
||||
ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
|
||||
ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
|
||||
tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 3;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
char ch1, ch2;
|
||||
uint16_t tmp;
|
||||
vec.clear();
|
||||
for(size_t i = 0; i < len;) {
|
||||
if(!(str[i] & 0x80)) { // 0xxxxxxx
|
||||
vec.push_back(str[i]);
|
||||
i++;
|
||||
} else if((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
|
||||
ch1 = (str[i] >> 2) & 0x07;
|
||||
ch2 = (str[i + 1] & 0x3f) | ((str[i] & 0x03) << 6);
|
||||
tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
} else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
|
||||
ch1 = ((uint8_t)str[i] << 4) | ((str[i + 1] >> 2) & 0x0f);
|
||||
ch2 = (((uint8_t)str[i + 1] << 6) & 0xc0) | (str[i + 2] & 0x3f);
|
||||
tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 3;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Uint16Container>
|
||||
bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
|
||||
return Utf8ToUnicode(str.c_str(), str.size(), vec);
|
||||
return Utf8ToUnicode(str.c_str(), str.size(), vec);
|
||||
}
|
||||
|
||||
template <class Uint32Container>
|
||||
bool Utf8ToUnicode32(const char * str, size_t size, Uint32Container& vec) {
|
||||
uint32_t tmp;
|
||||
vec.clear();
|
||||
for(size_t i = 0; i < size;) {
|
||||
if(!(str[i] & 0x80)) { // 0xxxxxxx
|
||||
// 7bit, total 7bit
|
||||
tmp = (uint8_t)(str[i]) & 0x7f;
|
||||
i++;
|
||||
} else if ((uint8_t)str[i] <= 0xdf && i + 1 < size) { // 110xxxxxx
|
||||
// 5bit, total 5bit
|
||||
tmp = (uint8_t)(str[i]) & 0x1f;
|
||||
|
||||
// 6bit, total 11bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+1]) & 0x3f;
|
||||
i += 2;
|
||||
} else if((uint8_t)str[i] <= 0xef && i + 2 < size) { // 1110xxxxxx
|
||||
// 4bit, total 4bit
|
||||
tmp = (uint8_t)(str[i]) & 0x0f;
|
||||
|
||||
// 6bit, total 10bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+1]) & 0x3f;
|
||||
|
||||
// 6bit, total 16bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+2]) & 0x3f;
|
||||
|
||||
i += 3;
|
||||
} else if((uint8_t)str[i] <= 0xf7 && i + 3 < size) { // 11110xxxx
|
||||
// 3bit, total 3bit
|
||||
tmp = (uint8_t)(str[i]) & 0x07;
|
||||
|
||||
// 6bit, total 9bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+1]) & 0x3f;
|
||||
|
||||
// 6bit, total 15bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+2]) & 0x3f;
|
||||
|
||||
// 6bit, total 21bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i+3]) & 0x3f;
|
||||
|
||||
i += 4;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
vec.push_back(tmp);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Uint32Container>
|
||||
bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
|
||||
uint32_t tmp;
|
||||
vec.clear();
|
||||
for(size_t i = 0; i < str.size();) {
|
||||
if(!(str[i] & 0x80)) { // 0xxxxxxx
|
||||
// 7bit, total 7bit
|
||||
tmp = (uint8_t)(str[i]) & 0x7f;
|
||||
i++;
|
||||
} else if((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) { // 110xxxxxx
|
||||
// 5bit, total 5bit
|
||||
tmp = (uint8_t)(str[i]) & 0x1f;
|
||||
return Utf8ToUnicode32(str.data(), str.size(), vec);
|
||||
}
|
||||
|
||||
// 6bit, total 11bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
|
||||
i += 2;
|
||||
} else if((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxxxx
|
||||
// 4bit, total 4bit
|
||||
tmp = (uint8_t)(str[i]) & 0x0f;
|
||||
|
||||
// 6bit, total 10bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
|
||||
|
||||
// 6bit, total 16bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 2]) & 0x3f;
|
||||
|
||||
i += 3;
|
||||
} else if((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxxx
|
||||
// 3bit, total 3bit
|
||||
tmp = (uint8_t)(str[i]) & 0x07;
|
||||
|
||||
// 6bit, total 9bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 1]) & 0x3f;
|
||||
|
||||
// 6bit, total 15bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 2]) & 0x3f;
|
||||
|
||||
// 6bit, total 21bit
|
||||
tmp <<= 6;
|
||||
tmp |= (uint8_t)(str[i + 3]) & 0x3f;
|
||||
|
||||
i += 4;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
vec.push_back(tmp);
|
||||
inline int UnicodeToUtf8Bytes(uint32_t ui){
|
||||
if(ui <= 0x7f) {
|
||||
return 1;
|
||||
} else if(ui <= 0x7ff) {
|
||||
return 2;
|
||||
} else if(ui <= 0xffff) {
|
||||
return 3;
|
||||
} else {
|
||||
return 4;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Uint32ContainerConIter>
|
||||
void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
|
||||
res.clear();
|
||||
uint32_t ui;
|
||||
while(begin != end) {
|
||||
ui = *begin;
|
||||
if(ui <= 0x7f) {
|
||||
res += char(ui);
|
||||
} else if(ui <= 0x7ff) {
|
||||
res += char(((ui >> 6) & 0x1f) | 0xc0);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else if(ui <= 0xffff) {
|
||||
res += char(((ui >> 12) & 0x0f) | 0xe0);
|
||||
res += char(((ui >> 6) & 0x3f) | 0x80);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else {
|
||||
res += char(((ui >> 18) & 0x03) | 0xf0);
|
||||
res += char(((ui >> 12) & 0x3f) | 0x80);
|
||||
res += char(((ui >> 6) & 0x3f) | 0x80);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
}
|
||||
begin ++;
|
||||
res.clear();
|
||||
uint32_t ui;
|
||||
while(begin != end) {
|
||||
ui = *begin;
|
||||
if(ui <= 0x7f) {
|
||||
res += char(ui);
|
||||
} else if(ui <= 0x7ff) {
|
||||
res += char(((ui >> 6) & 0x1f) | 0xc0);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else if(ui <= 0xffff) {
|
||||
res += char(((ui >> 12) & 0x0f) | 0xe0);
|
||||
res += char(((ui >> 6) & 0x3f) | 0x80);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else {
|
||||
res += char(((ui >> 18) & 0x03) | 0xf0);
|
||||
res += char(((ui >> 12) & 0x3f) | 0x80);
|
||||
res += char(((ui >> 6) & 0x3f) | 0x80);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
}
|
||||
begin ++;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Uint16ContainerConIter>
|
||||
void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
|
||||
res.clear();
|
||||
uint16_t ui;
|
||||
while(begin != end) {
|
||||
ui = *begin;
|
||||
if(ui <= 0x7f) {
|
||||
res += char(ui);
|
||||
} else if(ui <= 0x7ff) {
|
||||
res += char(((ui >> 6) & 0x1f) | 0xc0);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else {
|
||||
res += char(((ui >> 12) & 0x0f) | 0xe0);
|
||||
res += char(((ui >> 6) & 0x3f) | 0x80);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
}
|
||||
begin ++;
|
||||
res.clear();
|
||||
uint16_t ui;
|
||||
while(begin != end) {
|
||||
ui = *begin;
|
||||
if(ui <= 0x7f) {
|
||||
res += char(ui);
|
||||
} else if(ui <= 0x7ff) {
|
||||
res += char(((ui>>6) & 0x1f) | 0xc0);
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
} else {
|
||||
res += char(((ui >> 12) & 0x0f )| 0xe0);
|
||||
res += char(((ui>>6) & 0x3f )| 0x80 );
|
||||
res += char((ui & 0x3f) | 0x80);
|
||||
}
|
||||
begin ++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Uint16Container>
|
||||
bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
|
||||
vec.clear();
|
||||
if(!str) {
|
||||
return true;
|
||||
}
|
||||
size_t i = 0;
|
||||
while(i < len) {
|
||||
if(0 == (str[i] & 0x80)) {
|
||||
vec.push_back(uint16_t(str[i]));
|
||||
i++;
|
||||
} else {
|
||||
if(i + 1 < len) { //&& (str[i+1] & 0x80))
|
||||
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff) << 8) | (uint16_t(str[i + 1]) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
vec.clear();
|
||||
if(!str) {
|
||||
return true;
|
||||
}
|
||||
size_t i = 0;
|
||||
while(i < len) {
|
||||
if(0 == (str[i] & 0x80)) {
|
||||
vec.push_back(uint16_t(str[i]));
|
||||
i++;
|
||||
} else {
|
||||
if(i + 1 < len) { //&& (str[i+1] & 0x80))
|
||||
uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
|
||||
vec.push_back(tmp);
|
||||
i += 2;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Uint16Container>
|
||||
bool GBKTrans(const string& str, Uint16Container& vec) {
|
||||
return GBKTrans(str.c_str(), str.size(), vec);
|
||||
return GBKTrans(str.c_str(), str.size(), vec);
|
||||
}
|
||||
|
||||
template <class Uint16ContainerConIter>
|
||||
void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
|
||||
res.clear();
|
||||
//pair<char, char> pa;
|
||||
char first, second;
|
||||
while(begin != end) {
|
||||
//pa = uint16ToChar2(*begin);
|
||||
first = ((*begin) >> 8) & 0x00ff;
|
||||
second = (*begin) & 0x00ff;
|
||||
if(first & 0x80) {
|
||||
res += first;
|
||||
res += second;
|
||||
} else {
|
||||
res += second;
|
||||
}
|
||||
begin++;
|
||||
res.clear();
|
||||
//pair<char, char> pa;
|
||||
char first, second;
|
||||
while(begin != end) {
|
||||
//pa = uint16ToChar2(*begin);
|
||||
first = ((*begin)>>8) & 0x00ff;
|
||||
second = (*begin) & 0x00ff;
|
||||
if(first & 0x80) {
|
||||
res += first;
|
||||
res += second;
|
||||
} else {
|
||||
res += second;
|
||||
}
|
||||
begin++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* format example: "%Y-%m-%d %H:%M:%S"
|
||||
*/
|
||||
inline void GetTime(const string& format, string& timeStr) {
|
||||
time_t timeNow;
|
||||
time(&timeNow);
|
||||
timeStr.resize(64);
|
||||
size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
|
||||
timeStr.resize(len);
|
||||
}
|
||||
// inline void GetTime(const string& format, string& timeStr) {
|
||||
// time_t timeNow;
|
||||
// time(&timeNow);
|
||||
// timeStr.resize(64);
|
||||
// size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
|
||||
// timeStr.resize(len);
|
||||
// }
|
||||
|
||||
inline string PathJoin(const string& path1, const string& path2) {
|
||||
if(EndsWith(path1, "/")) {
|
||||
return path1 + path2;
|
||||
}
|
||||
return path1 + "/" + path2;
|
||||
if(EndsWith(path1, "/")) {
|
||||
return path1 + path2;
|
||||
}
|
||||
return path1 + "/" + path2;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_THREAD_HPP
|
||||
#define LIMONP_THREAD_HPP
|
||||
|
||||
|
@ -25,36 +7,36 @@
|
|||
namespace limonp {
|
||||
|
||||
class IThread: NonCopyable {
|
||||
public:
|
||||
IThread(): isStarted(false), isJoined(false) {
|
||||
public:
|
||||
IThread(): isStarted(false), isJoined(false) {
|
||||
}
|
||||
virtual ~IThread() {
|
||||
if(isStarted && !isJoined) {
|
||||
XCHECK(!pthread_detach(thread_));
|
||||
}
|
||||
virtual ~IThread() {
|
||||
if(isStarted && !isJoined) {
|
||||
XCHECK(!pthread_detach(thread_));
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
virtual void Run() = 0;
|
||||
void Start() {
|
||||
XCHECK(!isStarted);
|
||||
XCHECK(!pthread_create(&thread_, NULL, Worker, this));
|
||||
isStarted = true;
|
||||
}
|
||||
void Join() {
|
||||
XCHECK(!isJoined);
|
||||
XCHECK(!pthread_join(thread_, NULL));
|
||||
isJoined = true;
|
||||
}
|
||||
private:
|
||||
static void * Worker(void * data) {
|
||||
IThread * ptr = (IThread*) data;
|
||||
ptr->Run();
|
||||
return NULL;
|
||||
}
|
||||
virtual void Run() = 0;
|
||||
void Start() {
|
||||
XCHECK(!isStarted);
|
||||
XCHECK(!pthread_create(&thread_, NULL, Worker, this));
|
||||
isStarted = true;
|
||||
}
|
||||
void Join() {
|
||||
XCHECK(!isJoined);
|
||||
XCHECK(!pthread_join(thread_, NULL));
|
||||
isJoined = true;
|
||||
}
|
||||
private:
|
||||
static void * Worker(void * data) {
|
||||
IThread * ptr = (IThread* ) data;
|
||||
ptr->Run();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
pthread_t thread_;
|
||||
bool isStarted;
|
||||
bool isJoined;
|
||||
pthread_t thread_;
|
||||
bool isStarted;
|
||||
bool isJoined;
|
||||
}; // class IThread
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -1,21 +1,3 @@
|
|||
/*
|
||||
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*
|
||||
*/
|
||||
#ifndef LIMONP_THREAD_POOL_HPP
|
||||
#define LIMONP_THREAD_POOL_HPP
|
||||
|
||||
|
@ -30,73 +12,73 @@ using namespace std;
|
|||
|
||||
//class ThreadPool;
|
||||
class ThreadPool: NonCopyable {
|
||||
public:
|
||||
class Worker: public IThread {
|
||||
public:
|
||||
Worker(ThreadPool* pool): ptThreadPool_(pool) {
|
||||
assert(ptThreadPool_);
|
||||
}
|
||||
virtual ~Worker() {
|
||||
}
|
||||
|
||||
virtual void Run() {
|
||||
while(true) {
|
||||
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
|
||||
if(closure == NULL) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
closure->Run();
|
||||
} catch(std::exception& e) {
|
||||
XLOG(ERROR) << e.what();
|
||||
} catch(...) {
|
||||
XLOG(ERROR) << " unknown exception.";
|
||||
}
|
||||
delete closure;
|
||||
}
|
||||
}
|
||||
private:
|
||||
ThreadPool * ptThreadPool_;
|
||||
}; // class Worker
|
||||
|
||||
ThreadPool(size_t thread_num)
|
||||
: threads_(thread_num),
|
||||
queue_(thread_num) {
|
||||
assert(thread_num);
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i] = new Worker(this);
|
||||
}
|
||||
public:
|
||||
class Worker: public IThread {
|
||||
public:
|
||||
Worker(ThreadPool* pool): ptThreadPool_(pool) {
|
||||
assert(ptThreadPool_);
|
||||
}
|
||||
~ThreadPool() {
|
||||
Stop();
|
||||
virtual ~Worker() {
|
||||
}
|
||||
|
||||
void Start() {
|
||||
for(size_t i = 0; i < threads_.size(); i++) {
|
||||
threads_[i]->Start();
|
||||
virtual void Run() {
|
||||
while (true) {
|
||||
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
|
||||
if (closure == NULL) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
void Stop() {
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
queue_.Push(NULL);
|
||||
try {
|
||||
closure->Run();
|
||||
} catch(std::exception& e) {
|
||||
XLOG(ERROR) << e.what();
|
||||
} catch(...) {
|
||||
XLOG(ERROR) << " unknown exception.";
|
||||
}
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i]->Join();
|
||||
delete threads_[i];
|
||||
}
|
||||
threads_.clear();
|
||||
delete closure;
|
||||
}
|
||||
}
|
||||
private:
|
||||
ThreadPool * ptThreadPool_;
|
||||
}; // class Worker
|
||||
|
||||
void Add(ClosureInterface* task) {
|
||||
assert(task);
|
||||
queue_.Push(task);
|
||||
ThreadPool(size_t thread_num)
|
||||
: threads_(thread_num),
|
||||
queue_(thread_num) {
|
||||
assert(thread_num);
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i] = new Worker(this);
|
||||
}
|
||||
}
|
||||
~ThreadPool() {
|
||||
Stop();
|
||||
}
|
||||
|
||||
private:
|
||||
friend class Worker;
|
||||
void Start() {
|
||||
for(size_t i = 0; i < threads_.size(); i++) {
|
||||
threads_[i]->Start();
|
||||
}
|
||||
}
|
||||
void Stop() {
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
queue_.Push(NULL);
|
||||
}
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i]->Join();
|
||||
delete threads_[i];
|
||||
}
|
||||
threads_.clear();
|
||||
}
|
||||
|
||||
vector<IThread*> threads_;
|
||||
BoundedBlockingQueue<ClosureInterface*> queue_;
|
||||
void Add(ClosureInterface* task) {
|
||||
assert(task);
|
||||
queue_.Push(task);
|
||||
}
|
||||
|
||||
private:
|
||||
friend class Worker;
|
||||
|
||||
vector<IThread*> threads_;
|
||||
BoundedBlockingQueue<ClosureInterface*> queue_;
|
||||
}; // class ThreadPool
|
||||
|
||||
} // namespace limonp
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
*
|
||||
*/
|
||||
#include "file-utils.h"
|
||||
#include <QXmlStreamReader>
|
||||
|
||||
using namespace Zeeker;
|
||||
size_t FileUtils::_max_index_count = 0;
|
||||
|
@ -488,6 +489,22 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
|
|||
|
||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
||||
|
||||
QXmlStreamReader reader(&fileR);
|
||||
|
||||
while (!reader.atEnd()){
|
||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fileR.close();
|
||||
file.close();
|
||||
return;
|
||||
|
||||
/* //原加载DOM文档方式;
|
||||
QDomDocument doc;
|
||||
doc.setContent(fileR.readAll());
|
||||
fileR.close();
|
||||
|
@ -512,6 +529,7 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
|
|||
}
|
||||
file.close();
|
||||
return;
|
||||
*/
|
||||
}
|
||||
|
||||
void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
||||
|
@ -529,6 +547,31 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
|||
}
|
||||
if(fileList.isEmpty())
|
||||
return;
|
||||
|
||||
for(int i = 0; i < fileList.size(); ++i){
|
||||
QString name = prefix + QString::number(i + 1) + ".xml";
|
||||
if(!file.setCurrentFile(name)) {
|
||||
continue;
|
||||
}
|
||||
QuaZipFile fileR(&file);
|
||||
fileR.open(QIODevice::ReadOnly);
|
||||
|
||||
QXmlStreamReader reader(&fileR);
|
||||
|
||||
while (!reader.atEnd()){
|
||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fileR.close();
|
||||
}
|
||||
file.close();
|
||||
return;
|
||||
|
||||
/*
|
||||
QDomElement sptree;
|
||||
QDomElement sp;
|
||||
QDomElement txbody;
|
||||
|
@ -596,6 +639,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
|
|||
}
|
||||
file.close();
|
||||
return;
|
||||
*/
|
||||
}
|
||||
|
||||
void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
||||
|
@ -610,8 +654,24 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
|||
return;
|
||||
QuaZipFile fileR(&file);
|
||||
|
||||
fileR.open(QIODevice::ReadOnly); //读取方式打开
|
||||
fileR.open(QIODevice::ReadOnly);
|
||||
|
||||
QXmlStreamReader reader(&fileR);
|
||||
|
||||
while (!reader.atEnd()){
|
||||
if(reader.readNextStartElement() and reader.name().toString() == "t"){
|
||||
textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " "));
|
||||
if(textcontent.length() >= MAX_CONTENT_LENGTH/3){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fileR.close();
|
||||
file.close();
|
||||
return;
|
||||
|
||||
/*
|
||||
QDomDocument doc;
|
||||
doc.setContent(fileR.readAll());
|
||||
fileR.close();
|
||||
|
@ -641,6 +701,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
|
|||
}
|
||||
file.close();
|
||||
return;
|
||||
*/
|
||||
}
|
||||
|
||||
void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
|
||||
|
@ -650,7 +711,7 @@ void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
|
|||
const QRectF qf;
|
||||
int pageNum = doc->numPages();
|
||||
for(int i = 0; i < pageNum; ++i) {
|
||||
textcontent.append(doc->page(i)->text(qf).replace("\n", ""));
|
||||
textcontent.append(doc->page(i)->text(qf).replace("\n", "").replace("\r", " "));
|
||||
if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
|
||||
break;
|
||||
}
|
||||
|
@ -679,7 +740,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) {
|
|||
stream.setCodec(codec);
|
||||
uchardet_delete(chardet);
|
||||
|
||||
textcontent = stream.readAll().replace("\n", "");
|
||||
textcontent = stream.readAll().replace("\n", "").replace("\r", " ");
|
||||
|
||||
file.close();
|
||||
encodedString.clear();
|
||||
|
|
|
@ -110,17 +110,21 @@ void ConstructDocumentForContent::run() {
|
|||
return;
|
||||
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
|
||||
QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
|
||||
|
||||
QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000).toStdString());
|
||||
|
||||
Document doc;
|
||||
doc.setData(content);
|
||||
doc.setUniqueTerm(uniqueterm);
|
||||
doc.addTerm(upTerm);
|
||||
doc.addValue(m_path);
|
||||
for(int i = 0; i < term.size(); ++i) {
|
||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||
|
||||
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||
|
||||
// QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
|
||||
//修改函数返回类型,修改入参为std::string引用--jxx20210519
|
||||
std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
||||
|
||||
for(size_t i = 0; i < term.size(); ++i) {
|
||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||
}
|
||||
|
||||
Zeeker::_mutex_doc_list_content.lock();
|
||||
|
|
|
@ -37,6 +37,17 @@ void Document::addPosting(std::string term, QVector<size_t> offset, int weight)
|
|||
}
|
||||
}
|
||||
|
||||
void Document::addPosting(std::string term, std::vector<size_t> offset, int weight) {
|
||||
if(term == "")
|
||||
return;
|
||||
if(term.length() > 240)
|
||||
term = QString::fromStdString(term).left(30).toStdString();
|
||||
|
||||
for(size_t i : offset) {
|
||||
m_document.add_posting(term, i, weight);
|
||||
}
|
||||
}
|
||||
|
||||
void Document::addPosting(std::string term, unsigned int offset, int weight) {
|
||||
if(term == "")
|
||||
return;
|
||||
|
|
|
@ -41,6 +41,7 @@ public:
|
|||
}
|
||||
void setData(QString &data);
|
||||
void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
|
||||
void addPosting(std::string term, std::vector<size_t> offset, int weight = 1);
|
||||
void addPosting(std::string term, unsigned int offset, int weight = 1);
|
||||
void addTerm(QString term);
|
||||
void addValue(QString value);
|
||||
|
|
|
@ -31,8 +31,9 @@ void FileReader::getTextContent(QString path, QString &textContent) {
|
|||
QFileInfo file(path);
|
||||
QString strsfx = file.suffix();
|
||||
if(name == "application/zip") {
|
||||
if(strsfx.endsWith("docx"))
|
||||
if(strsfx.endsWith("docx")){
|
||||
FileUtils::getDocxTextContent(path, textContent);
|
||||
}
|
||||
if(strsfx.endsWith("pptx"))
|
||||
FileUtils::getPptxTextContent(path, textContent);
|
||||
if(strsfx.endsWith("xlsx"))
|
||||
|
|
|
@ -46,7 +46,54 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
|
|||
// qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
|
||||
this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
|
||||
if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
|
||||
this->q_content_index->enqueue(fileInfo.absoluteFilePath());
|
||||
//this->q_content_index->enqueue(fileInfo.absoluteFilePath());
|
||||
if(fileInfo.fileName().split(".").last() == "docx"){
|
||||
QuaZip file(fileInfo.absoluteFilePath());
|
||||
if(!file.open(QuaZip::mdUnzip))
|
||||
return;
|
||||
if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive))
|
||||
return;
|
||||
QuaZipFile fileR(&file);
|
||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//docx解压缩后的xml文件为实际需要解析文件大小
|
||||
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||
qDebug() << "文件大小:" << fileR.usize();
|
||||
file.close();
|
||||
}else if(fileInfo.fileName().split(".").last() == "pptx"){
|
||||
QuaZip file(fileInfo.absoluteFilePath());
|
||||
if(!file.open(QuaZip::mdUnzip))
|
||||
return;
|
||||
QString prefix("ppt/slides/slide");
|
||||
qint64 fileSize(0);
|
||||
qint64 fileIndex(0);
|
||||
for(QString i : file.getFileNameList()) {
|
||||
if(i.startsWith(prefix)){
|
||||
QString name = prefix + QString::number(fileIndex + 1) + ".xml";
|
||||
fileIndex++;
|
||||
if(!file.setCurrentFile(name)) {
|
||||
continue;
|
||||
}
|
||||
QuaZipFile fileR(&file);
|
||||
fileSize += fileR.usize();
|
||||
}
|
||||
}
|
||||
file.close();
|
||||
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||
qDebug() << "文件大小:" << fileSize;
|
||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//pptx解压缩后的xml文件为实际需要解析文件大小
|
||||
}else if(fileInfo.fileName().split(".").last() == "xlsx"){
|
||||
QuaZip file(fileInfo.absoluteFilePath());
|
||||
if(!file.open(QuaZip::mdUnzip))
|
||||
return;
|
||||
if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive))
|
||||
return;
|
||||
QuaZipFile fileR(&file);
|
||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//xlsx解压缩后的xml文件为实际解析文件大小
|
||||
qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
|
||||
qDebug() << "文件大小:" << fileR.usize();
|
||||
file.close();
|
||||
}else{
|
||||
this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,8 +137,9 @@ void FirstIndex::run() {
|
|||
|
||||
this->q_index = new QQueue<QVector<QString>>();
|
||||
//this->q_content_index = new QQueue<QString>();
|
||||
NEW_QUEUE(this->q_content_index);
|
||||
//NEW_QUEUE(this->q_content_index);
|
||||
// this->mlm = new MessageListManager();
|
||||
this->q_content_index = new QQueue<QPair<QString,qint64>>();
|
||||
|
||||
int fifo_fd;
|
||||
char buffer[2];
|
||||
|
@ -168,9 +216,14 @@ void FirstIndex::run() {
|
|||
qDebug() << "q_content_index:" << q_content_index->size();
|
||||
while(!this->q_content_index->empty()) {
|
||||
// for (size_t i = 0; (i < this->u_send_length) && (!this->q_content_index->empty()); ++i){
|
||||
for(size_t i = 0; (i < 30) && (!this->q_content_index->empty()); ++i) {
|
||||
tmp->enqueue(this->q_content_index->dequeue());
|
||||
qint64 fileSize = 0;
|
||||
//修改一次处理的数据量,从30个文件改为文件总大小为50M以下,50M为暂定值--jxx20210519
|
||||
for(size_t i = 0;/* (i < 30) && */(fileSize < 50*1024*1024) && (!this->q_content_index->empty()); ++i) {
|
||||
QPair<QString,qint64> tempPair = this->q_content_index->dequeue();
|
||||
fileSize += tempPair.second;
|
||||
tmp->enqueue(tempPair.first);
|
||||
}
|
||||
// qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";
|
||||
this->p_indexGenerator->creatAllIndex(tmp);
|
||||
tmp->clear();
|
||||
}
|
||||
|
|
|
@ -62,7 +62,9 @@ private:
|
|||
|
||||
//test
|
||||
QQueue<QVector<QString>>* q_index;
|
||||
QQueue<QString>* q_content_index;
|
||||
// QQueue<QString>* q_content_index;
|
||||
//修改QQueue存储数据为QPair<QString,qint64>,增加存储文件大小数据便于处理时统计--jxx20210519
|
||||
QQueue<QPair<QString,qint64>>* q_content_index;
|
||||
|
||||
const QMap<QString, bool> targetFileTypeMap = {
|
||||
std::map<QString, bool>::value_type("doc", true),
|
||||
|
|
|
@ -27,7 +27,7 @@ QMutex SearchManager::m_mutex1;
|
|||
QMutex SearchManager::m_mutex2;
|
||||
QMutex SearchManager::m_mutex3;
|
||||
SearchManager::SearchManager(QObject *parent) : QObject(parent) {
|
||||
m_pool.setMaxThreadCount(2);
|
||||
m_pool.setMaxThreadCount(3);
|
||||
m_pool.setExpiryTimeout(1000);
|
||||
}
|
||||
|
||||
|
@ -280,29 +280,15 @@ int FileContentSearch::keywordSearchContent() {
|
|||
words.append(sKeyWord.at(i).word).append(" ");
|
||||
}
|
||||
|
||||
Xapian::Query query = qp.parse_query(words);
|
||||
// Xapian::Query query = qp.parse_query(keyword.toStdString());
|
||||
|
||||
|
||||
|
||||
// QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(keyword);
|
||||
// //Creat a query
|
||||
// std::string words;
|
||||
// for(int i=0;i<sKeyWord.size();i++)
|
||||
// {
|
||||
// words.append(sKeyWord.at(i).word).append(" ");
|
||||
// }
|
||||
|
||||
|
||||
// Xapian::Query query = qp.parse_query(words);
|
||||
|
||||
// std::vector<Xapian::Query> v;
|
||||
// for(int i=0;i<sKeyWord.size();i++)
|
||||
// {
|
||||
// v.push_back(Xapian::Query(sKeyWord.at(i).word));
|
||||
// qDebug()<<QString::fromStdString(sKeyWord.at(i).word);
|
||||
// }
|
||||
// Xapian::Query queryPhrase =Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
|
||||
std::vector<Xapian::Query> v;
|
||||
for(int i=0; i<sKeyWord.size(); i++) {
|
||||
v.push_back(Xapian::Query(sKeyWord.at(i).word));
|
||||
qDebug() << QString::fromStdString(sKeyWord.at(i).word);
|
||||
}
|
||||
Xapian::Query query = Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
|
||||
|
||||
qDebug() << "keywordSearchContent:" << QString::fromStdString(query.get_description());
|
||||
|
||||
enquire.set_query(query);
|
||||
|
|
|
@ -67,7 +67,7 @@ unix {
|
|||
INSTALLS += target
|
||||
|
||||
header.path = /usr/include/ukui-search
|
||||
header.files += *.h index/*.h appsearch/*.h settingsearch/*.h
|
||||
header.files += *.h index/*.h appsearch/*.h settingsearch/*.h plugininterface/*.h
|
||||
INSTALLS += header
|
||||
}
|
||||
|
||||
|
|
|
@ -4963,7 +4963,7 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS,
|
|||
|
||||
if(bUsesUnicode) {
|
||||
ushort* usAucData = (ushort*)ptaucBytes;
|
||||
content.append(QString::fromUtf16(usAucData).replace("\r", ""));
|
||||
content.append(QString::fromUtf16(usAucData).replace("\n", "").replace("\r", " "));
|
||||
usAucData = (ushort*)xfree((void*)usAucData);
|
||||
ptaucBytes = NULL;
|
||||
if(content.length() >= 682666) //20480000/3
|
||||
|
@ -5066,7 +5066,7 @@ int KBinaryParser:: readSSTRecord(readDataParam &rdParam, ppsInfoType PPS_info,
|
|||
} else {
|
||||
ushort* usData = (ushort*)chData;
|
||||
|
||||
content.append(QString::fromUtf16(usData).replace("\r", ""));
|
||||
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
|
||||
usData = (ushort*)xfree((void*)usData);
|
||||
chData = NULL;
|
||||
if(content.length() >= 682666) //20480000/3
|
||||
|
@ -5131,7 +5131,7 @@ ULONG KBinaryParser::readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* au
|
|||
return -1;
|
||||
ushort* usData = (ushort*)chData;
|
||||
|
||||
content.append(QString::fromUtf16(usData).replace("\r", ""));
|
||||
content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
|
||||
|
||||
usData = (ushort*)xfree((void*)usData);
|
||||
chData = NULL;
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#include <QStyleOption>
|
||||
#include <QApplication>
|
||||
#include <QPainter>
|
||||
#include <QPainterPath>
|
||||
|
||||
namespace Zeeker {
|
||||
class CreateIndexAskDialog : public QDialog {
|
||||
|
|
Loading…
Reference in New Issue