forked from openkylin/ukui-search
Merge pull request #36 from iaom/1230-dev
Add file content index and search function (lib).
This commit is contained in:
commit
5a3e5dd370
|
@ -1,5 +1,45 @@
|
|||
#include "chinese-segmentation.h"
|
||||
#include <QFileInfo>
|
||||
|
||||
// The constructor has nothing to initialise; it is declared private in the
// header, so it only exists to satisfy that declaration.
ChineseSegmentation::ChineseSegmentation() = default;
|
||||
|
||||
/**
 * Extracts keywords from @p str with cppjieba's KeywordExtractor.
 *
 * @param str  Text to analyse. A null pointer yields an empty result.
 * @return     One SKeyWord (word, offsets, weight) per keyword, in the order
 *             returned by KeywordExtractor::Extract.
 */
QVector<SKeyWord> ChineseSegmentation::callSegement(QString *str)
{
    QVector<SKeyWord> vecNeeds;
    if (str == nullptr) {
        return vecNeeds;
    }

    const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
    const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
    const char * const USER_DICT_PATH ="/usr/share/ukui-search/res/dict/user.dict.utf8";
    const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
    const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";

    // Constructing Jieba reads and parses every dictionary file above, so the
    // instance is built once on first use and reused by later calls instead of
    // being rebuilt per call.
    // NOTE(review): the instance is now shared by all callers; Extract is a
    // const member, but confirm there is no hidden mutable state before
    // calling this from several threads at once.
    static cppjieba::Jieba jieba(DICT_PATH,
                                 HMM_PATH,
                                 USER_DICT_PATH,
                                 IDF_PATH,
                                 STOP_WORD_PATH);

    const std::string s = str->toStdString();

    // Largest size_t value: do not cap the number of keywords returned.
    const size_t topk = static_cast<size_t>(-1);
    std::vector<cppjieba::KeywordExtractor::Word> keywordres;
    jieba.extractor.Extract(s, keywordres, topk);

    convert(keywordres, vecNeeds);
    return vecNeeds;
}
|
||||
|
||||
/**
 * Appends a Qt-friendly copy of every cppjieba keyword to @p kw.
 *
 * @param keywordres  Keywords produced by cppjieba; not modified.
 * @param kw          Output vector; existing entries are kept and the
 *                    converted keywords are appended in order.
 */
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw)
{
    kw.reserve(kw.size() + static_cast<int>(keywordres.size()));
    // Iterate by const reference: copying each Word would duplicate its string
    // and offsets vector only to copy them again into SKeyWord.
    for (const auto &i : keywordres) {
        SKeyWord temp;
        temp.word = i.word;
        temp.offsets = QVector<size_t>::fromStdVector(i.offsets);
        temp.weight = i.weight;
        kw.append(temp);
    }
}
|
||||
|
|
|
@ -2,10 +2,27 @@
|
|||
#define CHINESESEGMENTATION_H
|
||||
|
||||
#include "libchinese-segmentation_global.h"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
//#include "Logging.hpp"
|
||||
//#include "LocalVector.hpp"
|
||||
//#include "cppjieba/QuerySegment.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
#include <QVector>
|
||||
#include <QString>
|
||||
#include <QDebug>
|
||||
|
||||
struct SKeyWord{
|
||||
std::string word;
|
||||
QVector<size_t> offsets;
|
||||
double weight;
|
||||
};
|
||||
|
||||
class CHINESESEGMENTATION_EXPORT ChineseSegmentation
|
||||
{
|
||||
public:
|
||||
static QVector<SKeyWord> callSegement(QString *str);
|
||||
static void convert(std::vector<cppjieba::KeywordExtractor::Word>& keywordres,QVector<SKeyWord>& kw);
|
||||
private:
|
||||
ChineseSegmentation();
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,277 @@
|
|||
#ifndef CPPJIEBA_DICT_TRIE_HPP
|
||||
#define CPPJIEBA_DICT_TRIE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
|
||||
// Very small score used as a floor for log-probability style weights, e.g. the
// default emission probability passed to GetEmitProb in HMMSegment::Viterbi.
const double MIN_DOUBLE = -3.14e+100;
|
||||
// Positive counterpart of MIN_DOUBLE; no use is visible in this part of the source.
const double MAX_DOUBLE = 3.14e+100;
|
||||
// Number of space-separated columns DictTrie::LoadDict requires on every line.
const size_t DICT_COLUMN_NUM = 3;
|
||||
// Default tag for user words that are inserted without an explicit tag.
const char* const UNKNOWN_TAG = "";
|
||||
|
||||
class DictTrie {
|
||||
public:
|
||||
enum UserWordWeightOption {
|
||||
WordWeightMin,
|
||||
WordWeightMedian,
|
||||
WordWeightMax,
|
||||
}; // enum UserWordWeightOption
|
||||
|
||||
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||
Init(dict_path, user_dict_paths, user_word_weight_opt);
|
||||
}
|
||||
|
||||
~DictTrie() {
|
||||
delete trie_;
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
DictUnit node_info;
|
||||
if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
||||
DictUnit node_info;
|
||||
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
|
||||
if (!MakeNodeInfo(node_info, word, weight , tag)) {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
return trie_->Find(begin, end);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct Dag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
trie_->Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
bool Find(const string& word)
|
||||
{
|
||||
const DictUnit *tmp = NULL;
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(word, runes))
|
||||
{
|
||||
XLOG(ERROR) << "Decode failed.";
|
||||
}
|
||||
tmp = Find(runes.begin(), runes.end());
|
||||
if (tmp == NULL)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
||||
return IsIn(user_dict_single_chinese_word_, word);
|
||||
}
|
||||
|
||||
double GetMinWeight() const {
|
||||
return min_weight_;
|
||||
}
|
||||
|
||||
void InserUserDictNode(const string& line) {
|
||||
vector<string> buf;
|
||||
DictUnit node_info;
|
||||
Split(line, buf, " ");
|
||||
if(buf.size() == 1){
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
UNKNOWN_TAG);
|
||||
} else if (buf.size() == 2) {
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
buf[1]);
|
||||
} else if (buf.size() == 3) {
|
||||
int freq = atoi(buf[1].c_str());
|
||||
assert(freq_sum_ > 0.0);
|
||||
double weight = log(1.0 * freq / freq_sum_);
|
||||
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
||||
}
|
||||
static_node_infos_.push_back(node_info);
|
||||
if (node_info.word.size() == 1) {
|
||||
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const vector<string>& buf) {
|
||||
for (size_t i = 0; i < buf.size(); i++) {
|
||||
InserUserDictNode(buf[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const set<string>& buf) {
|
||||
std::set<string>::const_iterator iter;
|
||||
for (iter = buf.begin(); iter != buf.end(); iter++){
|
||||
InserUserDictNode(*iter);
|
||||
}
|
||||
}
|
||||
|
||||
void LoadUserDict(const string& filePaths) {
|
||||
vector<string> files = limonp::Split(filePaths, "|;");
|
||||
size_t lineno = 0;
|
||||
for (size_t i = 0; i < files.size(); i++) {
|
||||
ifstream ifs(files[i].c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
||||
string line;
|
||||
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
if (line.size() == 0) {
|
||||
continue;
|
||||
}
|
||||
InserUserDictNode(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
||||
LoadDict(dict_path);
|
||||
freq_sum_ = CalcFreqSum(static_node_infos_);
|
||||
CalculateWeight(static_node_infos_, freq_sum_);
|
||||
SetStaticWordWeights(user_word_weight_opt);
|
||||
|
||||
if (user_dict_paths.size()) {
|
||||
LoadUserDict(user_dict_paths);
|
||||
}
|
||||
Shrink(static_node_infos_);
|
||||
CreateTrie(static_node_infos_);
|
||||
}
|
||||
|
||||
void CreateTrie(const vector<DictUnit>& dictUnits) {
|
||||
assert(dictUnits.size());
|
||||
vector<Unicode> words;
|
||||
vector<const DictUnit*> valuePointers;
|
||||
for (size_t i = 0 ; i < dictUnits.size(); i ++) {
|
||||
words.push_back(dictUnits[i].word);
|
||||
valuePointers.push_back(&dictUnits[i]);
|
||||
}
|
||||
|
||||
trie_ = new Trie(words, valuePointers);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
bool MakeNodeInfo(DictUnit& node_info,
|
||||
const string& word,
|
||||
double weight,
|
||||
const string& tag) {
|
||||
if (!DecodeRunesInString(word, node_info.word)) {
|
||||
XLOG(ERROR) << "Decode " << word << " failed.";
|
||||
return false;
|
||||
}
|
||||
node_info.weight = weight;
|
||||
node_info.tag = tag;
|
||||
return true;
|
||||
}
|
||||
|
||||
void LoadDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
DictUnit node_info;
|
||||
for (size_t lineno = 0; getline(ifs, line); lineno++) {
|
||||
Split(line, buf, " ");
|
||||
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
atof(buf[1].c_str()),
|
||||
buf[2]);
|
||||
static_node_infos_.push_back(node_info);
|
||||
}
|
||||
}
|
||||
|
||||
static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
|
||||
return lhs.weight < rhs.weight;
|
||||
}
|
||||
|
||||
void SetStaticWordWeights(UserWordWeightOption option) {
|
||||
XCHECK(!static_node_infos_.empty());
|
||||
vector<DictUnit> x = static_node_infos_;
|
||||
sort(x.begin(), x.end(), WeightCompare);
|
||||
min_weight_ = x[0].weight;
|
||||
max_weight_ = x[x.size() - 1].weight;
|
||||
median_weight_ = x[x.size() / 2].weight;
|
||||
switch (option) {
|
||||
case WordWeightMin:
|
||||
user_word_default_weight_ = min_weight_;
|
||||
break;
|
||||
case WordWeightMedian:
|
||||
user_word_default_weight_ = median_weight_;
|
||||
break;
|
||||
default:
|
||||
user_word_default_weight_ = max_weight_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < node_infos.size(); i++) {
|
||||
sum += node_infos[i].weight;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
|
||||
assert(sum > 0.0);
|
||||
for (size_t i = 0; i < node_infos.size(); i++) {
|
||||
DictUnit& node_info = node_infos[i];
|
||||
assert(node_info.weight > 0.0);
|
||||
node_info.weight = log(double(node_info.weight)/sum);
|
||||
}
|
||||
}
|
||||
|
||||
void Shrink(vector<DictUnit>& units) const {
|
||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||
}
|
||||
|
||||
vector<DictUnit> static_node_infos_;
|
||||
deque<DictUnit> active_node_infos_; // must not be vector
|
||||
Trie * trie_;
|
||||
|
||||
double freq_sum_;
|
||||
double min_weight_;
|
||||
double max_weight_;
|
||||
double median_weight_;
|
||||
double user_word_default_weight_;
|
||||
unordered_set<Rune> user_dict_single_chinese_word_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,93 @@
|
|||
#ifndef CPPJIEBA_FULLSEGMENT_H
|
||||
#define CPPJIEBA_FULLSEGMENT_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "Unicode.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class FullSegment: public SegmentBase {
|
||||
public:
|
||||
FullSegment(const string& dictPath) {
|
||||
dictTrie_ = new DictTrie(dictPath);
|
||||
isNeedDestroy_ = true;
|
||||
}
|
||||
FullSegment(const DictTrie* dictTrie)
|
||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||
assert(dictTrie_);
|
||||
}
|
||||
~FullSegment() {
|
||||
if (isNeedDestroy_) {
|
||||
delete dictTrie_;
|
||||
}
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size()/2);
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& res) const {
|
||||
// result of searching in trie tree
|
||||
LocalVector<pair<size_t, const DictUnit*> > tRes;
|
||||
|
||||
// max index of res's words
|
||||
size_t maxIdx = 0;
|
||||
|
||||
// always equals to (uItr - begin)
|
||||
size_t uIdx = 0;
|
||||
|
||||
// tmp variables
|
||||
size_t wordLen = 0;
|
||||
assert(dictTrie_);
|
||||
vector<struct Dag> dags;
|
||||
dictTrie_->Find(begin, end, dags);
|
||||
for (size_t i = 0; i < dags.size(); i++) {
|
||||
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
|
||||
size_t nextoffset = dags[i].nexts[j].first;
|
||||
assert(nextoffset < dags.size());
|
||||
const DictUnit* du = dags[i].nexts[j].second;
|
||||
if (du == NULL) {
|
||||
if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
|
||||
WordRange wr(begin + i, begin + nextoffset);
|
||||
res.push_back(wr);
|
||||
}
|
||||
} else {
|
||||
wordLen = du->word.size();
|
||||
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
||||
WordRange wr(begin + i, begin + nextoffset);
|
||||
res.push_back(wr);
|
||||
}
|
||||
}
|
||||
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
||||
}
|
||||
uIdx++;
|
||||
}
|
||||
}
|
||||
private:
|
||||
const DictTrie* dictTrie_;
|
||||
bool isNeedDestroy_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,129 @@
|
|||
#ifndef CPPJIEBA_HMMMODEL_H
|
||||
#define CPPJIEBA_HMMMODEL_H
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
typedef unordered_map<Rune, double> EmitProbMap;
|
||||
|
||||
struct HMMModel {
|
||||
/*
|
||||
* STATUS:
|
||||
* 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
|
||||
* */
|
||||
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
||||
|
||||
HMMModel(const string& modelPath) {
|
||||
memset(startProb, 0, sizeof(startProb));
|
||||
memset(transProb, 0, sizeof(transProb));
|
||||
statMap[0] = 'B';
|
||||
statMap[1] = 'E';
|
||||
statMap[2] = 'M';
|
||||
statMap[3] = 'S';
|
||||
emitProbVec.push_back(&emitProbB);
|
||||
emitProbVec.push_back(&emitProbE);
|
||||
emitProbVec.push_back(&emitProbM);
|
||||
emitProbVec.push_back(&emitProbS);
|
||||
LoadModel(modelPath);
|
||||
}
|
||||
~HMMModel() {
|
||||
}
|
||||
void LoadModel(const string& filePath) {
|
||||
ifstream ifile(filePath.c_str());
|
||||
XCHECK(ifile.is_open()) << "open " << filePath << " failed";
|
||||
string line;
|
||||
vector<string> tmp;
|
||||
vector<string> tmp2;
|
||||
//Load startProb
|
||||
XCHECK(GetLine(ifile, line));
|
||||
Split(line, tmp, " ");
|
||||
XCHECK(tmp.size() == STATUS_SUM);
|
||||
for (size_t j = 0; j< tmp.size(); j++) {
|
||||
startProb[j] = atof(tmp[j].c_str());
|
||||
}
|
||||
|
||||
//Load transProb
|
||||
for (size_t i = 0; i < STATUS_SUM; i++) {
|
||||
XCHECK(GetLine(ifile, line));
|
||||
Split(line, tmp, " ");
|
||||
XCHECK(tmp.size() == STATUS_SUM);
|
||||
for (size_t j =0; j < STATUS_SUM; j++) {
|
||||
transProb[i][j] = atof(tmp[j].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
//Load emitProbB
|
||||
XCHECK(GetLine(ifile, line));
|
||||
XCHECK(LoadEmitProb(line, emitProbB));
|
||||
|
||||
//Load emitProbE
|
||||
XCHECK(GetLine(ifile, line));
|
||||
XCHECK(LoadEmitProb(line, emitProbE));
|
||||
|
||||
//Load emitProbM
|
||||
XCHECK(GetLine(ifile, line));
|
||||
XCHECK(LoadEmitProb(line, emitProbM));
|
||||
|
||||
//Load emitProbS
|
||||
XCHECK(GetLine(ifile, line));
|
||||
XCHECK(LoadEmitProb(line, emitProbS));
|
||||
}
|
||||
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
|
||||
double defVal)const {
|
||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||
if (cit == ptMp->end()) {
|
||||
return defVal;
|
||||
}
|
||||
return cit->second;
|
||||
}
|
||||
bool GetLine(ifstream& ifile, string& line) {
|
||||
while (getline(ifile, line)) {
|
||||
Trim(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
||||
if (line.empty()) {
|
||||
return false;
|
||||
}
|
||||
vector<string> tmp, tmp2;
|
||||
Unicode unicode;
|
||||
Split(line, tmp, ",");
|
||||
for (size_t i = 0; i < tmp.size(); i++) {
|
||||
Split(tmp[i], tmp2, ":");
|
||||
if (2 != tmp2.size()) {
|
||||
XLOG(ERROR) << "emitProb illegal.";
|
||||
return false;
|
||||
}
|
||||
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
|
||||
XLOG(ERROR) << "TransCode failed.";
|
||||
return false;
|
||||
}
|
||||
mp[unicode[0]] = atof(tmp2[1].c_str());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
char statMap[STATUS_SUM];
|
||||
double startProb[STATUS_SUM];
|
||||
double transProb[STATUS_SUM][STATUS_SUM];
|
||||
EmitProbMap emitProbB;
|
||||
EmitProbMap emitProbE;
|
||||
EmitProbMap emitProbM;
|
||||
EmitProbMap emitProbS;
|
||||
vector<EmitProbMap* > emitProbVec;
|
||||
}; // struct HMMModel
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,190 @@
|
|||
#ifndef CPPJIBEA_HMMSEGMENT_H
|
||||
#define CPPJIBEA_HMMSEGMENT_H
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <memory.h>
|
||||
#include <cassert>
|
||||
#include "HMMModel.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class HMMSegment: public SegmentBase {
|
||||
public:
|
||||
HMMSegment(const string& filePath)
|
||||
: model_(new HMMModel(filePath)), isNeedDestroy_(true) {
|
||||
}
|
||||
HMMSegment(const HMMModel* model)
|
||||
: model_(model), isNeedDestroy_(false) {
|
||||
}
|
||||
~HMMSegment() {
|
||||
if (isNeedDestroy_) {
|
||||
delete model_;
|
||||
}
|
||||
}
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size()/2);
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
||||
RuneStrArray::const_iterator left = begin;
|
||||
RuneStrArray::const_iterator right = begin;
|
||||
while (right != end) {
|
||||
if (right->rune < 0x80) {
|
||||
if (left != right) {
|
||||
InternalCut(left, right, res);
|
||||
}
|
||||
left = right;
|
||||
do {
|
||||
right = SequentialLetterRule(left, end);
|
||||
if (right != left) {
|
||||
break;
|
||||
}
|
||||
right = NumbersRule(left, end);
|
||||
if (right != left) {
|
||||
break;
|
||||
}
|
||||
right ++;
|
||||
} while (false);
|
||||
WordRange wr(left, right - 1);
|
||||
res.push_back(wr);
|
||||
left = right;
|
||||
} else {
|
||||
right++;
|
||||
}
|
||||
}
|
||||
if (left != right) {
|
||||
InternalCut(left, right, res);
|
||||
}
|
||||
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
Rune x = begin->rune;
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||
begin ++;
|
||||
} else {
|
||||
return begin;
|
||||
}
|
||||
while (begin != end) {
|
||||
x = begin->rune;
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
||||
begin ++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return begin;
|
||||
}
|
||||
//
|
||||
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
Rune x = begin->rune;
|
||||
if ('0' <= x && x <= '9') {
|
||||
begin ++;
|
||||
} else {
|
||||
return begin;
|
||||
}
|
||||
while (begin != end) {
|
||||
x = begin->rune;
|
||||
if ( ('0' <= x && x <= '9') || x == '.') {
|
||||
begin++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return begin;
|
||||
}
|
||||
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
||||
vector<size_t> status;
|
||||
Viterbi(begin, end, status);
|
||||
|
||||
RuneStrArray::const_iterator left = begin;
|
||||
RuneStrArray::const_iterator right;
|
||||
for (size_t i = 0; i < status.size(); i++) {
|
||||
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
||||
right = begin + i + 1;
|
||||
WordRange wr(left, right - 1);
|
||||
res.push_back(wr);
|
||||
left = right;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Viterbi(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<size_t>& status) const {
|
||||
size_t Y = HMMModel::STATUS_SUM;
|
||||
size_t X = end - begin;
|
||||
|
||||
size_t XYSize = X * Y;
|
||||
size_t now, old, stat;
|
||||
double tmp, endE, endS;
|
||||
|
||||
vector<int> path(XYSize);
|
||||
vector<double> weight(XYSize);
|
||||
|
||||
//start
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
|
||||
path[0 + y * X] = -1;
|
||||
}
|
||||
|
||||
double emitProb;
|
||||
|
||||
for (size_t x = 1; x < X; x++) {
|
||||
for (size_t y = 0; y < Y; y++) {
|
||||
now = x + y*X;
|
||||
weight[now] = MIN_DOUBLE;
|
||||
path[now] = HMMModel::E; // warning
|
||||
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
|
||||
for (size_t preY = 0; preY < Y; preY++) {
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
||||
if (tmp > weight[now]) {
|
||||
weight[now] = tmp;
|
||||
path[now] = preY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
endE = weight[X-1+HMMModel::E*X];
|
||||
endS = weight[X-1+HMMModel::S*X];
|
||||
stat = 0;
|
||||
if (endE >= endS) {
|
||||
stat = HMMModel::E;
|
||||
} else {
|
||||
stat = HMMModel::S;
|
||||
}
|
||||
|
||||
status.resize(X);
|
||||
for (int x = X -1 ; x >= 0; x--) {
|
||||
status[x] = stat;
|
||||
stat = path[x + stat*X];
|
||||
}
|
||||
}
|
||||
|
||||
const HMMModel* model_;
|
||||
bool isNeedDestroy_;
|
||||
}; // class HMMSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,131 @@
|
|||
#ifndef CPPJIEAB_JIEBA_H
|
||||
#define CPPJIEAB_JIEBA_H
|
||||
|
||||
#include "QuerySegment.hpp"
|
||||
#include "KeywordExtractor.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class Jieba {
|
||||
public:
|
||||
Jieba(const string& dict_path,
|
||||
const string& model_path,
|
||||
const string& user_dict_path,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath)
|
||||
: dict_trie_(dict_path, user_dict_path),
|
||||
model_(model_path),
|
||||
mp_seg_(&dict_trie_),
|
||||
hmm_seg_(&model_),
|
||||
mix_seg_(&dict_trie_, &model_),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_),
|
||||
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
|
||||
|
||||
}
|
||||
~Jieba() {
|
||||
}
|
||||
|
||||
struct LocWord {
|
||||
string word;
|
||||
size_t begin;
|
||||
size_t end;
|
||||
}; // struct LocWord
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
mix_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void CutAll(const string& sentence, vector<string>& words) const {
|
||||
full_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutAll(const string& sentence, vector<Word>& words) const {
|
||||
full_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
query_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
query_seg_.Cut(sentence, words, hmm);
|
||||
}
|
||||
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||
hmm_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutHMM(const string& sentence, vector<Word>& words) const {
|
||||
hmm_seg_.Cut(sentence, words);
|
||||
}
|
||||
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
}
|
||||
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
|
||||
mp_seg_.Cut(sentence, words, max_word_len);
|
||||
}
|
||||
|
||||
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
||||
mix_seg_.Tag(sentence, words);
|
||||
}
|
||||
string LookupTag(const string &str) const {
|
||||
return mix_seg_.LookupTag(str);
|
||||
}
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word, tag);
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word,freq, tag);
|
||||
}
|
||||
|
||||
bool Find(const string& word)
|
||||
{
|
||||
return dict_trie_.Find(word);
|
||||
}
|
||||
|
||||
void ResetSeparators(const string& s) {
|
||||
//TODO
|
||||
mp_seg_.ResetSeparators(s);
|
||||
hmm_seg_.ResetSeparators(s);
|
||||
mix_seg_.ResetSeparators(s);
|
||||
full_seg_.ResetSeparators(s);
|
||||
query_seg_.ResetSeparators(s);
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return &dict_trie_;
|
||||
}
|
||||
|
||||
const HMMModel* GetHMMModel() const {
|
||||
return &model_;
|
||||
}
|
||||
|
||||
void LoadUserDict(const vector<string>& buf) {
|
||||
dict_trie_.LoadUserDict(buf);
|
||||
}
|
||||
|
||||
void LoadUserDict(const set<string>& buf) {
|
||||
dict_trie_.LoadUserDict(buf);
|
||||
}
|
||||
|
||||
void LoadUserDict(const string& path) {
|
||||
dict_trie_.LoadUserDict(path);
|
||||
}
|
||||
|
||||
private:
|
||||
DictTrie dict_trie_;
|
||||
HMMModel model_;
|
||||
|
||||
// They share the same dict trie and model
|
||||
MPSegment mp_seg_;
|
||||
HMMSegment hmm_seg_;
|
||||
MixSegment mix_seg_;
|
||||
FullSegment full_seg_;
|
||||
QuerySegment query_seg_;
|
||||
|
||||
public:
|
||||
KeywordExtractor extractor;
|
||||
}; // class Jieba
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEAB_JIEBA_H
|
|
@ -0,0 +1,153 @@
|
|||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||
|
||||
#include <cmath>
|
||||
#include <set>
|
||||
#include "MixSegment.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
using namespace std;
|
||||
|
||||
/*utf8*/
|
||||
class KeywordExtractor {
|
||||
public:
|
||||
struct Word {
|
||||
string word;
|
||||
vector<size_t> offsets;
|
||||
double weight;
|
||||
}; // struct Word
|
||||
|
||||
KeywordExtractor(const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath,
|
||||
const string& userDict = "")
|
||||
: segment_(dictPath, hmmFilePath, userDict) {
|
||||
LoadIdfDict(idfPath);
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
KeywordExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model) {
|
||||
LoadIdfDict(idfPath);
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
~KeywordExtractor() {
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(topWords[i].word);
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
|
||||
vector<string> words;
|
||||
segment_.Cut(sentence, words);
|
||||
|
||||
map<string, Word> wordmap;
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < words.size(); ++i) {
|
||||
size_t t = offset;
|
||||
offset += words[i].size();
|
||||
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
wordmap[words[i]].offsets.push_back(t);
|
||||
wordmap[words[i]].weight += 1.0;
|
||||
}
|
||||
if (offset != sentence.size()) {
|
||||
XLOG(ERROR) << "words illegal";
|
||||
return;
|
||||
}
|
||||
|
||||
keywords.clear();
|
||||
keywords.reserve(wordmap.size());
|
||||
for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
||||
if (cit != idfMap_.end()) {
|
||||
itr->second.weight *= cit->second;
|
||||
} else {
|
||||
itr->second.weight *= idfAverage_;
|
||||
}
|
||||
itr->second.word = itr->first;
|
||||
keywords.push_back(itr->second);
|
||||
}
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||
keywords.resize(topN);
|
||||
}
|
||||
private:
|
||||
void LoadIdfDict(const string& idfPath) {
|
||||
ifstream ifs(idfPath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
double idf = 0.0;
|
||||
double idfSum = 0.0;
|
||||
size_t lineno = 0;
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
buf.clear();
|
||||
if (line.empty()) {
|
||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
Split(line, buf, " ");
|
||||
if (buf.size() != 2) {
|
||||
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
idf = atof(buf[1].c_str());
|
||||
idfMap_[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
||||
}
|
||||
|
||||
assert(lineno);
|
||||
idfAverage_ = idfSum / lineno;
|
||||
assert(idfAverage_ > 0.0);
|
||||
}
|
||||
void LoadStopWordDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||
string line ;
|
||||
while (getline(ifs, line)) {
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
static bool Compare(const Word& lhs, const Word& rhs) {
|
||||
return lhs.weight > rhs.weight;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
unordered_map<string, double> idfMap_;
|
||||
double idfAverage_;
|
||||
|
||||
unordered_set<string> stopWords_;
|
||||
}; // class KeywordExtractor
|
||||
|
||||
// Writes `word` as a JSON-like object with the keys "word", "offset" and
// "weight"; the offsets are streamed through the vector's own operator<<.
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
  os << "{\"word\": \"" << word.word;
  os << "\", \"offset\": " << word.offsets;
  os << ", \"weight\": " << word.weight << "}";
  return os;
}
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
#ifndef CPPJIEBA_MPSEGMENT_H
|
||||
#define CPPJIEBA_MPSEGMENT_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class MPSegment: public SegmentTagged {
|
||||
public:
|
||||
MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
||||
}
|
||||
MPSegment(const DictTrie* dictTrie)
|
||||
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
||||
assert(dictTrie_);
|
||||
}
|
||||
~MPSegment() {
|
||||
if (isNeedDestroy_) {
|
||||
delete dictTrie_;
|
||||
}
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, MAX_WORD_LENGTH);
|
||||
}
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, max_word_len);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence,
|
||||
vector<Word>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size()/2);
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, max_word_len);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
vector<Dag> dags;
|
||||
dictTrie_->Find(begin,
|
||||
end,
|
||||
dags,
|
||||
max_word_len);
|
||||
CalcDP(dags);
|
||||
CutByDag(begin, end, dags, words);
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return dictTrie_;
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
||||
return dictTrie_->IsUserDictSingleChineseWord(value);
|
||||
}
|
||||
private:
|
||||
void CalcDP(vector<Dag>& dags) const {
|
||||
size_t nextPos;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
|
||||
for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||
rit->pInfo = NULL;
|
||||
rit->weight = MIN_DOUBLE;
|
||||
assert(!rit->nexts.empty());
|
||||
for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
||||
nextPos = it->first;
|
||||
p = it->second;
|
||||
val = 0.0;
|
||||
if (nextPos + 1 < dags.size()) {
|
||||
val += dags[nextPos + 1].weight;
|
||||
}
|
||||
|
||||
if (p) {
|
||||
val += p->weight;
|
||||
} else {
|
||||
val += dictTrie_->GetMinWeight();
|
||||
}
|
||||
if (val > rit->weight) {
|
||||
rit->pInfo = p;
|
||||
rit->weight = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void CutByDag(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
const vector<Dag>& dags,
|
||||
vector<WordRange>& words) const {
|
||||
size_t i = 0;
|
||||
while (i < dags.size()) {
|
||||
const DictUnit* p = dags[i].pInfo;
|
||||
if (p) {
|
||||
assert(p->word.size() >= 1);
|
||||
WordRange wr(begin + i, begin + i + p->word.size() - 1);
|
||||
words.push_back(wr);
|
||||
i += p->word.size();
|
||||
} else { //single chinese word
|
||||
WordRange wr(begin + i, begin + i);
|
||||
words.push_back(wr);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const DictTrie* dictTrie_;
|
||||
bool isNeedDestroy_;
|
||||
PosTagger tagger_;
|
||||
|
||||
}; // class MPSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,109 @@
|
|||
#ifndef CPPJIEBA_MIXSEGMENT_H
|
||||
#define CPPJIEBA_MIXSEGMENT_H
|
||||
|
||||
#include <cassert>
|
||||
#include "MPSegment.hpp"
|
||||
#include "HMMSegment.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class MixSegment: public SegmentTagged {
|
||||
public:
|
||||
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
||||
const string& userDict = "")
|
||||
: mpSeg_(mpSegDict, userDict),
|
||||
hmmSeg_(hmmSegDict) {
|
||||
}
|
||||
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: mpSeg_(dictTrie), hmmSeg_(model) {
|
||||
}
|
||||
~MixSegment() {
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size() / 2);
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, hmm);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
||||
if (!hmm) {
|
||||
mpSeg_.Cut(begin, end, res);
|
||||
return;
|
||||
}
|
||||
vector<WordRange> words;
|
||||
assert(end >= begin);
|
||||
words.reserve(end - begin);
|
||||
mpSeg_.Cut(begin, end, words);
|
||||
|
||||
vector<WordRange> hmmRes;
|
||||
hmmRes.reserve(end - begin);
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
//if mp Get a word, it's ok, put it into result
|
||||
if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
||||
res.push_back(words[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp Get a single one and it is not in userdict, collect it in sequence
|
||||
size_t j = i;
|
||||
while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
||||
j++;
|
||||
}
|
||||
|
||||
// Cut the sequence with hmm
|
||||
assert(j - 1 >= i);
|
||||
// TODO
|
||||
hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
|
||||
//put hmm result to result
|
||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||
res.push_back(hmmRes[k]);
|
||||
}
|
||||
|
||||
//clear tmp vars
|
||||
hmmRes.clear();
|
||||
|
||||
//let i jump over this piece
|
||||
i = j - 1;
|
||||
}
|
||||
}
|
||||
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return mpSeg_.GetDictTrie();
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
string LookupTag(const string &str) const {
|
||||
return tagger_.LookupTag(str, *this);
|
||||
}
|
||||
|
||||
private:
|
||||
MPSegment mpSeg_;
|
||||
HMMSegment hmmSeg_;
|
||||
PosTagger tagger_;
|
||||
|
||||
}; // class MixSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,77 @@
|
|||
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||
#define CPPJIEBA_POS_TAGGING_H
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
|
||||
static const char* const POS_M = "m";
|
||||
static const char* const POS_ENG = "eng";
|
||||
static const char* const POS_X = "x";
|
||||
|
||||
class PosTagger {
|
||||
public:
|
||||
PosTagger() {
|
||||
}
|
||||
~PosTagger() {
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
||||
vector<string> CutRes;
|
||||
segment.Cut(src, CutRes);
|
||||
|
||||
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
|
||||
}
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
string LookupTag(const string &str, const SegmentTagged& segment) const {
|
||||
const DictUnit *tmp = NULL;
|
||||
RuneStrArray runes;
|
||||
const DictTrie * dict = segment.GetDictTrie();
|
||||
assert(dict != NULL);
|
||||
if (!DecodeRunesInString(str, runes)) {
|
||||
XLOG(ERROR) << "Decode failed.";
|
||||
return POS_X;
|
||||
}
|
||||
tmp = dict->Find(runes.begin(), runes.end());
|
||||
if (tmp == NULL || tmp->tag.empty()) {
|
||||
return SpecialRule(runes);
|
||||
} else {
|
||||
return tmp->tag;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const char* SpecialRule(const RuneStrArray& unicode) const {
|
||||
size_t m = 0;
|
||||
size_t eng = 0;
|
||||
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
||||
if (unicode[i].rune < 0x80) {
|
||||
eng ++;
|
||||
if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
|
||||
m++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// ascii char is not found
|
||||
if (eng == 0) {
|
||||
return POS_X;
|
||||
}
|
||||
// all the ascii is number char
|
||||
if (m == eng) {
|
||||
return POS_M;
|
||||
}
|
||||
// the ascii chars contain english letter
|
||||
return POS_ENG;
|
||||
}
|
||||
|
||||
}; // class PosTagger
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,54 @@
|
|||
#ifndef CPPJIEBA_PRE_FILTER_H
|
||||
#define CPPJIEBA_PRE_FILTER_H
|
||||
|
||||
#include "Trie.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class PreFilter {
|
||||
public:
|
||||
//TODO use WordRange instead of Range
|
||||
struct Range {
|
||||
RuneStrArray::const_iterator begin;
|
||||
RuneStrArray::const_iterator end;
|
||||
}; // struct Range
|
||||
|
||||
PreFilter(const unordered_set<Rune>& symbols,
|
||||
const string& sentence)
|
||||
: symbols_(symbols) {
|
||||
if (!DecodeRunesInString(sentence, sentence_)) {
|
||||
XLOG(ERROR) << "decode failed. ";
|
||||
}
|
||||
cursor_ = sentence_.begin();
|
||||
}
|
||||
~PreFilter() {
|
||||
}
|
||||
bool HasNext() const {
|
||||
return cursor_ != sentence_.end();
|
||||
}
|
||||
Range Next() {
|
||||
Range range;
|
||||
range.begin = cursor_;
|
||||
while (cursor_ != sentence_.end()) {
|
||||
if (IsIn(symbols_, cursor_->rune)) {
|
||||
if (range.begin == cursor_) {
|
||||
cursor_ ++;
|
||||
}
|
||||
range.end = cursor_;
|
||||
return range;
|
||||
}
|
||||
cursor_ ++;
|
||||
}
|
||||
range.end = sentence_.end();
|
||||
return range;
|
||||
}
|
||||
private:
|
||||
RuneStrArray::const_iterator cursor_;
|
||||
RuneStrArray sentence_;
|
||||
const unordered_set<Rune>& symbols_;
|
||||
}; // class PreFilter
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_PRE_FILTER_H
|
|
@ -0,0 +1,89 @@
|
|||
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
||||
#define CPPJIEBA_QUERYSEGMENT_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "FullSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
#include "Unicode.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class QuerySegment: public SegmentBase {
|
||||
public:
|
||||
QuerySegment(const string& dict, const string& model, const string& userDict = "")
|
||||
: mixSeg_(dict, model, userDict),
|
||||
trie_(mixSeg_.GetDictTrie()) {
|
||||
}
|
||||
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: mixSeg_(dictTrie, model), trie_(dictTrie) {
|
||||
}
|
||||
~QuerySegment() {
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
}
|
||||
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
||||
PreFilter pre_filter(symbols_, sentence);
|
||||
PreFilter::Range range;
|
||||
vector<WordRange> wrs;
|
||||
wrs.reserve(sentence.size()/2);
|
||||
while (pre_filter.HasNext()) {
|
||||
range = pre_filter.Next();
|
||||
Cut(range.begin, range.end, wrs, hmm);
|
||||
}
|
||||
words.clear();
|
||||
words.reserve(wrs.size());
|
||||
GetWordsFromWordRanges(sentence, wrs, words);
|
||||
}
|
||||
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
||||
//use mix Cut first
|
||||
vector<WordRange> mixRes;
|
||||
mixSeg_.Cut(begin, end, mixRes, hmm);
|
||||
|
||||
vector<WordRange> fullRes;
|
||||
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
if (mixResItr->Length() > 2) {
|
||||
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
||||
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
||||
res.push_back(wr);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mixResItr->Length() > 3) {
|
||||
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
||||
res.push_back(wr);
|
||||
}
|
||||
}
|
||||
}
|
||||
res.push_back(*mixResItr);
|
||||
}
|
||||
}
|
||||
private:
|
||||
bool IsAllAscii(const Unicode& s) const {
|
||||
for(size_t i = 0; i < s.size(); i++) {
|
||||
if (s[i] >= 0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
MixSegment mixSeg_;
|
||||
const DictTrie* trie_;
|
||||
}; // QuerySegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,46 @@
|
|||
#ifndef CPPJIEBA_SEGMENTBASE_H
|
||||
#define CPPJIEBA_SEGMENTBASE_H
|
||||
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "PreFilter.hpp"
|
||||
#include <cassert>
|
||||
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
|
||||
|
||||
using namespace limonp;
|
||||
|
||||
class SegmentBase {
|
||||
public:
|
||||
SegmentBase() {
|
||||
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
||||
}
|
||||
virtual ~SegmentBase() {
|
||||
}
|
||||
|
||||
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
||||
|
||||
bool ResetSeparators(const string& s) {
|
||||
symbols_.clear();
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(s, runes)) {
|
||||
XLOG(ERROR) << "decode " << s << " failed";
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
if (!symbols_.insert(runes[i].rune).second) {
|
||||
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
protected:
|
||||
unordered_set<Rune> symbols_;
|
||||
}; // class SegmentBase
|
||||
|
||||
} // cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,23 @@
|
|||
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
||||
#define CPPJIEBA_SEGMENTTAGGED_H
|
||||
|
||||
#include "SegmentBase.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class SegmentTagged : public SegmentBase{
|
||||
public:
|
||||
SegmentTagged() {
|
||||
}
|
||||
virtual ~SegmentTagged() {
|
||||
}
|
||||
|
||||
virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
|
||||
|
||||
virtual const DictTrie* GetDictTrie() const = 0;
|
||||
|
||||
}; // class SegmentTagged
|
||||
|
||||
} // cppjieba
|
||||
|
||||
#endif
|
|
@ -0,0 +1,190 @@
|
|||
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
||||
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
||||
|
||||
#include <cmath>
|
||||
#include "Jieba.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
using namespace std;
|
||||
|
||||
class TextRankExtractor {
|
||||
public:
|
||||
typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
|
||||
private:
|
||||
typedef std::map<string,Word> WordMap;
|
||||
|
||||
class WordGraph{
|
||||
private:
|
||||
typedef double Score;
|
||||
typedef string Node;
|
||||
typedef std::set<Node> NodeSet;
|
||||
|
||||
typedef std::map<Node,double> Edges;
|
||||
typedef std::map<Node,Edges> Graph;
|
||||
//typedef std::unordered_map<Node,double> Edges;
|
||||
//typedef std::unordered_map<Node,Edges> Graph;
|
||||
|
||||
double d;
|
||||
Graph graph;
|
||||
NodeSet nodeSet;
|
||||
public:
|
||||
WordGraph(): d(0.85) {};
|
||||
WordGraph(double in_d): d(in_d) {};
|
||||
|
||||
void addEdge(Node start,Node end,double weight){
|
||||
Edges temp;
|
||||
Edges::iterator gotEdges;
|
||||
nodeSet.insert(start);
|
||||
nodeSet.insert(end);
|
||||
graph[start][end]+=weight;
|
||||
graph[end][start]+=weight;
|
||||
}
|
||||
|
||||
void rank(WordMap &ws,size_t rankTime=10){
|
||||
WordMap outSum;
|
||||
Score wsdef, min_rank, max_rank;
|
||||
|
||||
if( graph.size() == 0)
|
||||
return;
|
||||
|
||||
wsdef = 1.0 / graph.size();
|
||||
|
||||
for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
|
||||
// edges->first start节点;edge->first end节点;edge->second 权重
|
||||
ws[edges->first].word=edges->first;
|
||||
ws[edges->first].weight=wsdef;
|
||||
outSum[edges->first].weight=0;
|
||||
for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
|
||||
outSum[edges->first].weight+=edge->second;
|
||||
}
|
||||
}
|
||||
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
|
||||
for( size_t i=0; i<rankTime; i++ ){
|
||||
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
|
||||
double s = 0;
|
||||
for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
|
||||
// edge->first end节点;edge->second 权重
|
||||
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
|
||||
ws[*node].weight = (1 - d) + d * s;
|
||||
}
|
||||
}
|
||||
|
||||
min_rank=max_rank=ws.begin()->second.weight;
|
||||
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
||||
if( i->second.weight < min_rank ){
|
||||
min_rank = i->second.weight;
|
||||
}
|
||||
if( i->second.weight > max_rank ){
|
||||
max_rank = i->second.weight;
|
||||
}
|
||||
}
|
||||
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
||||
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
TextRankExtractor(const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& stopWordPath,
|
||||
const string& userDict = "")
|
||||
: segment_(dictPath, hmmFilePath, userDict) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
TextRankExtractor(const DictTrie* dictTrie,
|
||||
const HMMModel* model,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
~TextRankExtractor() {
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(topWords[i].word);
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<Word> topWords;
|
||||
Extract(sentence, topWords, topN);
|
||||
for (size_t i = 0; i < topWords.size(); i++) {
|
||||
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||
}
|
||||
}
|
||||
|
||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
|
||||
vector<string> words;
|
||||
segment_.Cut(sentence, words);
|
||||
|
||||
TextRankExtractor::WordGraph graph;
|
||||
WordMap wordmap;
|
||||
size_t offset = 0;
|
||||
|
||||
for(size_t i=0; i < words.size(); i++){
|
||||
size_t t = offset;
|
||||
offset += words[i].size();
|
||||
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
|
||||
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
|
||||
skip++;
|
||||
continue;
|
||||
}
|
||||
graph.addEdge(words[i],words[j],1);
|
||||
}
|
||||
wordmap[words[i]].offsets.push_back(t);
|
||||
}
|
||||
if (offset != sentence.size()) {
|
||||
XLOG(ERROR) << "words illegal";
|
||||
return;
|
||||
}
|
||||
|
||||
graph.rank(wordmap,rankTime);
|
||||
|
||||
keywords.clear();
|
||||
keywords.reserve(wordmap.size());
|
||||
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
||||
keywords.push_back(itr->second);
|
||||
}
|
||||
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
||||
keywords.resize(topN);
|
||||
}
|
||||
private:
|
||||
void LoadStopWordDict(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
||||
string line ;
|
||||
while (getline(ifs, line)) {
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
static bool Compare(const Word &x,const Word &y){
|
||||
return x.weight > y.weight;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
unordered_set<string> stopWords_;
|
||||
}; // class TextRankExtractor
|
||||
|
||||
// Writes a TextRank keyword as a JSON-like object containing its word,
// offsets and weight, and returns the stream.
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
  os << "{\"word\": \"" << word.word;
  os << "\", \"offset\": " << word.offsets;
  os << ", \"weight\": " << word.weight << "}";
  return os;
}
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
#ifndef CPPJIEBA_TRIE_HPP
|
||||
#define CPPJIEBA_TRIE_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include "limonp/StdExtension.hpp"
|
||||
#include "Unicode.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace std;
|
||||
|
||||
const size_t MAX_WORD_LENGTH = 512;
|
||||
|
||||
struct DictUnit {
|
||||
Unicode word;
|
||||
double weight;
|
||||
string tag;
|
||||
}; // struct DictUnit
|
||||
|
||||
// for debugging
|
||||
// inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
||||
// string s;
|
||||
// s << unit.word;
|
||||
// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
||||
// }
|
||||
|
||||
struct Dag {
|
||||
RuneStr runestr;
|
||||
// [offset, nexts.first]
|
||||
limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
size_t nextPos; // TODO
|
||||
Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
|
||||
}
|
||||
}; // struct Dag
|
||||
|
||||
typedef Rune TrieKey;
|
||||
|
||||
class TrieNode {
|
||||
public :
|
||||
TrieNode(): next(NULL), ptValue(NULL) {
|
||||
}
|
||||
public:
|
||||
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
||||
NextMap *next;
|
||||
const DictUnit *ptValue;
|
||||
};
|
||||
|
||||
class Trie {
|
||||
public:
|
||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
|
||||
: root_(new TrieNode) {
|
||||
CreateTrie(keys, valuePointers);
|
||||
}
|
||||
~Trie() {
|
||||
DeleteNode(root_);
|
||||
}
|
||||
|
||||
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
||||
if (begin == end) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const TrieNode* ptNode = root_;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for (RuneStrArray::const_iterator it = begin; it != end; it++) {
|
||||
if (NULL == ptNode->next) {
|
||||
return NULL;
|
||||
}
|
||||
citer = ptNode->next->find(it->rune);
|
||||
if (ptNode->next->end() == citer) {
|
||||
return NULL;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
}
|
||||
return ptNode->ptValue;
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct Dag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
assert(root_ != NULL);
|
||||
res.resize(end - begin);
|
||||
|
||||
const TrieNode *ptNode = NULL;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||
res[i].runestr = *(begin + i);
|
||||
|
||||
if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
|
||||
ptNode = citer->second;
|
||||
} else {
|
||||
ptNode = NULL;
|
||||
}
|
||||
if (ptNode != NULL) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||
} else {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
|
||||
}
|
||||
|
||||
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
|
||||
if (ptNode == NULL || ptNode->next == NULL) {
|
||||
break;
|
||||
}
|
||||
citer = ptNode->next->find((begin + j)->rune);
|
||||
if (ptNode->next->end() == citer) {
|
||||
break;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if (NULL != ptNode->ptValue) {
|
||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
if (key.begin() == key.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
TrieNode::NextMap::const_iterator kmIter;
|
||||
TrieNode *ptNode = root_;
|
||||
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
||||
if (NULL == ptNode->next) {
|
||||
ptNode->next = new TrieNode::NextMap;
|
||||
}
|
||||
kmIter = ptNode->next->find(*citer);
|
||||
if (ptNode->next->end() == kmIter) {
|
||||
TrieNode *nextNode = new TrieNode;
|
||||
|
||||
ptNode->next->insert(make_pair(*citer, nextNode));
|
||||
ptNode = nextNode;
|
||||
} else {
|
||||
ptNode = kmIter->second;
|
||||
}
|
||||
}
|
||||
assert(ptNode != NULL);
|
||||
ptNode->ptValue = ptValue;
|
||||
}
|
||||
|
||||
private:
|
||||
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
||||
if (valuePointers.empty() || keys.empty()) {
|
||||
return;
|
||||
}
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
for (size_t i = 0; i < keys.size(); i++) {
|
||||
InsertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void DeleteNode(TrieNode* node) {
|
||||
if (NULL == node) {
|
||||
return;
|
||||
}
|
||||
if (NULL != node->next) {
|
||||
for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
|
||||
DeleteNode(it->second);
|
||||
}
|
||||
delete node->next;
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
|
||||
TrieNode* root_;
|
||||
}; // class Trie
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_TRIE_HPP
|
|
@ -0,0 +1,227 @@
|
|||
#ifndef CPPJIEBA_UNICODE_H
|
||||
#define CPPJIEBA_UNICODE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <ostream>
|
||||
#include "limonp/LocalVector.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
typedef uint32_t Rune;
|
||||
|
||||
// A segmented word together with its position in the source string.
struct Word {
  std::string word;         // text of the word
  uint32_t offset;          // offset of the word in the source string
  uint32_t unicode_offset;  // offset counted in runes
  uint32_t unicode_length;  // length counted in runes

  // Builds a word with only a byte offset. The rune-based fields are set to
  // zero so they never hold indeterminate values.
  Word(const std::string& w, uint32_t o)
    : word(w), offset(o), unicode_offset(0), unicode_length(0) {
  }
  // Builds a word with both byte and rune positions.
  Word(const std::string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
    : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct Word
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
||||
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
||||
}
|
||||
|
||||
struct RuneStr {
|
||||
Rune rune;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
uint32_t unicode_offset;
|
||||
uint32_t unicode_length;
|
||||
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
|
||||
}
|
||||
RuneStr(Rune r, uint32_t o, uint32_t l)
|
||||
: rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
|
||||
}
|
||||
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
||||
}
|
||||
}; // struct RuneStr
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
||||
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
||||
}
|
||||
|
||||
typedef limonp::LocalVector<Rune> Unicode;
|
||||
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
||||
|
||||
// [left, right]
|
||||
struct WordRange {
|
||||
RuneStrArray::const_iterator left;
|
||||
RuneStrArray::const_iterator right;
|
||||
WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
|
||||
: left(l), right(r) {
|
||||
}
|
||||
size_t Length() const {
|
||||
return right - left + 1;
|
||||
}
|
||||
bool IsAllAscii() const {
|
||||
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
||||
if (iter->rune >= 0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}; // struct WordRange
|
||||
|
||||
// Result of decoding one rune: the code point and the number of bytes it
// occupied.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;

  // Both fields zero.
  RuneStrLite()
    : rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l)
    : rune(r), len(l) {
  }
}; // struct RuneStrLite
|
||||
|
||||
// Decodes the first UTF-8 sequence of the `len` bytes at `str`.
//
// Returns the code point and its byte length. Returns (0, 0) when `str` is
// NULL, `len` is 0, the lead byte is above 0xf7, or fewer bytes are available
// than the lead byte calls for. Only the lead byte selects the sequence
// length; the following bytes are masked with 0x3f without further checks.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  if (str == NULL || len == 0) {
    return RuneStrLite(0, 0);
  }

  const uint8_t lead = (uint8_t)(str[0]);
  if (!(lead & 0x80)) { // 0xxxxxxx: 7 payload bits
    return RuneStrLite(lead & 0x7f, 1);
  }

  uint32_t width;
  uint32_t rune;
  if (lead <= 0xdf) {         // two bytes: 5 payload bits in the lead byte
    width = 2;
    rune = lead & 0x1f;
  } else if (lead <= 0xef) {  // three bytes: 4 payload bits in the lead byte
    width = 3;
    rune = lead & 0x0f;
  } else if (lead <= 0xf7) {  // four bytes: 3 payload bits in the lead byte
    width = 4;
    rune = lead & 0x07;
  } else {
    return RuneStrLite(0, 0);
  }
  if (len < width) {
    return RuneStrLite(0, 0);
  }

  // Every following byte contributes its low 6 bits.
  for (uint32_t k = 1; k < width; ++k) {
    rune <<= 6;
    rune |= (uint8_t)(str[k]) & 0x3f;
  }
  return RuneStrLite(rune, width);
}
|
||||
|
||||
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
||||
runes.clear();
|
||||
runes.reserve(len / 2);
|
||||
for (uint32_t i = 0, j = 0; i < len;) {
|
||||
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
||||
if (rp.len == 0) {
|
||||
runes.clear();
|
||||
return false;
|
||||
}
|
||||
RuneStr x(rp.rune, i, rp.len, j, 1);
|
||||
runes.push_back(x);
|
||||
i += rp.len;
|
||||
++j;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
||||
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
||||
}
|
||||
|
||||
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
||||
unicode.clear();
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(s, len, runes)) {
|
||||
return false;
|
||||
}
|
||||
unicode.reserve(runes.size());
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
unicode.push_back(runes[i].rune);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool IsSingleWord(const string& str) {
|
||||
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
||||
return rp.len == str.size();
|
||||
}
|
||||
|
||||
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
||||
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
||||
}
|
||||
|
||||
// Returns the code points of `s`. The success flag of the underlying decoder
// is dropped: when decoding fails the returned sequence is simply empty.
inline Unicode DecodeRunesInString(const string& s) {
  Unicode codePoints;
  (void)DecodeRunesInString(s.c_str(), s.size(), codePoints);
  return codePoints;
}
|
||||
|
||||
|
||||
// [left, right]
|
||||
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
||||
assert(right->offset >= left->offset);
|
||||
uint32_t len = right->offset - left->offset + right->len;
|
||||
uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
|
||||
return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
|
||||
}
|
||||
|
||||
// Returns the bytes of `s` covered by the closed rune range [left, right].
// `right` must not precede `left`.
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  // From the first byte of `left` through the last byte of `right`.
  const uint32_t byteCount = right->offset - left->offset + right->len;
  return s.substr(left->offset, byteCount);
}
|
||||
|
||||
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
||||
for (size_t i = 0; i < wrs.size(); i++) {
|
||||
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the Words described by the ranges in `wrs`, in order.
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
  vector<Word> collected;
  GetWordsFromWordRanges(s, wrs, collected);
  return collected;
}
|
||||
|
||||
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
||||
strs.resize(words.size());
|
||||
for (size_t i = 0; i < words.size(); ++i) {
|
||||
strs[i] = words[i].word;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif // CPPJIEBA_UNICODE_H
|
|
@ -0,0 +1,21 @@
|
|||
INCLUDEPATH += $$PWD
|
||||
|
||||
HEADERS += \
|
||||
$$PWD/DictTrie.hpp \
|
||||
$$PWD/FullSegment.hpp \
|
||||
$$PWD/HMMModel.hpp \
|
||||
$$PWD/HMMSegment.hpp \
|
||||
$$PWD/Jieba.hpp \
|
||||
$$PWD/KeywordExtractor.hpp \
|
||||
$$PWD/MPSegment.hpp \
|
||||
$$PWD/MixSegment.hpp \
|
||||
$$PWD/PosTagger.hpp \
|
||||
$$PWD/PreFilter.hpp \
|
||||
$$PWD/QuerySegment.hpp \
|
||||
$$PWD/SegmentBase.hpp \
|
||||
$$PWD/SegmentTagged.hpp \
|
||||
$$PWD/TextRankExtractor.hpp \
|
||||
$$PWD/Trie.hpp \
|
||||
$$PWD/Unicode.hpp
|
||||
|
||||
include(limonp/limonp.pri)
|
|
@ -0,0 +1,70 @@
|
|||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
|
||||
#ifndef LIMONP_ARGV_FUNCTS_H
|
||||
#define LIMONP_ARGV_FUNCTS_H
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include "StringUtil.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
using namespace std;
|
||||
|
||||
class ArgvContext {
|
||||
public :
|
||||
ArgvContext(int argc, const char* const * argv) {
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if(StartsWith(argv[i], "-")) {
|
||||
if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
|
||||
mpss_[argv[i]] = argv[i+1];
|
||||
i++;
|
||||
} else {
|
||||
sset_.insert(argv[i]);
|
||||
}
|
||||
} else {
|
||||
args_.push_back(argv[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
~ArgvContext() {
|
||||
}
|
||||
|
||||
friend ostream& operator << (ostream& os, const ArgvContext& args);
|
||||
string operator [](size_t i) const {
|
||||
if(i < args_.size()) {
|
||||
return args_[i];
|
||||
}
|
||||
return "";
|
||||
}
|
||||
string operator [](const string& key) const {
|
||||
map<string, string>::const_iterator it = mpss_.find(key);
|
||||
if(it != mpss_.end()) {
|
||||
return it->second;
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
bool HasKey(const string& key) const {
|
||||
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<string> args_;
|
||||
map<string, string> mpss_;
|
||||
set<string> sset_;
|
||||
}; // class ArgvContext
|
||||
|
||||
// Streams the three buckets of an ArgvContext in a fixed order: positional
// arguments, key/value options, flags.
inline ostream& operator << (ostream& os, const ArgvContext& args) {
  ostream& afterArgs = os << args.args_;
  ostream& afterOptions = afterArgs << args.mpss_;
  return afterOptions << args.sset_;
}
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif
|
|
@ -0,0 +1,49 @@
|
|||
#ifndef LIMONP_BLOCKINGQUEUE_HPP
|
||||
#define LIMONP_BLOCKINGQUEUE_HPP
|
||||
|
||||
#include <queue>
|
||||
#include "Condition.hpp"
|
||||
|
||||
namespace limonp {
|
||||
template<class T>
|
||||
class BlockingQueue: NonCopyable {
|
||||
public:
|
||||
BlockingQueue()
|
||||
: mutex_(), notEmpty_(mutex_), queue_() {
|
||||
}
|
||||
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
queue_.push(x);
|
||||
notEmpty_.Notify(); // Wait morphing saves us
|
||||
}
|
||||
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
// always use a while-loop, due to spurious wakeup
|
||||
while (queue_.empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.empty());
|
||||
T front(queue_.front());
|
||||
queue_.pop();
|
||||
return front;
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
bool Empty() const {
|
||||
return Size() == 0;
|
||||
}
|
||||
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
std::queue<T> queue_;
|
||||
}; // class BlockingQueue
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_BLOCKINGQUEUE_HPP
|
|
@ -0,0 +1,67 @@
|
|||
#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
|
||||
#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
|
||||
|
||||
#include "BoundedQueue.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
template<typename T>
|
||||
class BoundedBlockingQueue : NonCopyable {
|
||||
public:
|
||||
explicit BoundedBlockingQueue(size_t maxSize)
|
||||
: mutex_(),
|
||||
notEmpty_(mutex_),
|
||||
notFull_(mutex_),
|
||||
queue_(maxSize) {
|
||||
}
|
||||
|
||||
void Push(const T& x) {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while (queue_.Full()) {
|
||||
notFull_.Wait();
|
||||
}
|
||||
assert(!queue_.Full());
|
||||
queue_.Push(x);
|
||||
notEmpty_.Notify();
|
||||
}
|
||||
|
||||
T Pop() {
|
||||
MutexLockGuard lock(mutex_);
|
||||
while (queue_.Empty()) {
|
||||
notEmpty_.Wait();
|
||||
}
|
||||
assert(!queue_.Empty());
|
||||
T res = queue_.Pop();
|
||||
notFull_.Notify();
|
||||
return res;
|
||||
}
|
||||
|
||||
bool Empty() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Empty();
|
||||
}
|
||||
|
||||
bool Full() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.Full();
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
MutexLockGuard lock(mutex_);
|
||||
return queue_.size();
|
||||
}
|
||||
|
||||
size_t capacity() const {
|
||||
return queue_.capacity();
|
||||
}
|
||||
|
||||
private:
|
||||
mutable MutexLock mutex_;
|
||||
Condition notEmpty_;
|
||||
Condition notFull_;
|
||||
BoundedQueue<T> queue_;
|
||||
}; // class BoundedBlockingQueue
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
|
|
@ -0,0 +1,65 @@
|
|||
#ifndef LIMONP_BOUNDED_QUEUE_HPP
|
||||
#define LIMONP_BOUNDED_QUEUE_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <cassert>
|
||||
|
||||
namespace limonp {
|
||||
using namespace std;
|
||||
// Fixed-capacity FIFO queue stored in a circular buffer. Not synchronized.
// Push() on a full queue and Pop() on an empty queue are rejected by assert.
template<class T>
class BoundedQueue {
 public:
  // Allocates storage for `capacity` elements; `capacity` must be non-zero.
  explicit BoundedQueue(size_t capacity)
    : head_(0),
      tail_(0),
      size_(0),
      capacity_(capacity),
      circular_buffer_(capacity) {
    assert(capacity_);
  }
  ~BoundedQueue() {
  }

  // Forgets all queued elements; the stored objects are not reset.
  void Clear() {
    head_ = tail_ = size_ = 0;
  }
  bool Empty() const {
    return size_ == 0;
  }
  bool Full() const {
    return size_ == capacity_;
  }
  size_t Size() const {
    return size_;
  }
  size_t Capacity() const {
    return capacity_;
  }

  // Stores a copy of `t` at the tail. The queue must not be full.
  void Push(const T& t) {
    assert(!Full());
    circular_buffer_[tail_] = t;
    tail_ = (tail_ + 1) % capacity_;
    ++size_;
  }

  // Removes the head element and returns a copy of it. The queue must not be
  // empty.
  T Pop() {
    assert(!Empty());
    const size_t slot = head_;
    head_ = (head_ + 1) % capacity_;
    --size_;
    return circular_buffer_[slot];
  }

 private:
  size_t head_;            // index of the oldest element
  size_t tail_;            // index the next Push() writes to
  size_t size_;            // number of queued elements
  const size_t capacity_;  // fixed size of the circular buffer
  std::vector<T> circular_buffer_;

}; // class BoundedQueue
|
||||
} // namespace limonp
|
||||
|
||||
#endif
|
|
@ -0,0 +1,206 @@
|
|||
#ifndef LIMONP_CLOSURE_HPP
|
||||
#define LIMONP_CLOSURE_HPP
|
||||
|
||||
namespace limonp {
|
||||
|
||||
// Abstract "deferred call": concrete closures capture a callable together
// with its arguments and perform the call in Run().
class ClosureInterface {
 public:
  // Virtual so that closures can be destroyed through a base pointer.
  virtual ~ClosureInterface() {}

  // Performs the captured call.
  virtual void Run() = 0;
};
|
||||
|
||||
template <class Funct>
|
||||
class Closure0: public ClosureInterface {
|
||||
public:
|
||||
Closure0(Funct fun) {
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~Closure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)();
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1>
|
||||
class Closure1: public ClosureInterface {
|
||||
public:
|
||||
Closure1(Funct fun, Arg1 arg1) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~Closure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1, class Arg2>
|
||||
class Closure2: public ClosureInterface {
|
||||
public:
|
||||
Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~Closure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
|
||||
template <class Funct, class Arg1, class Arg2, class Arg3>
|
||||
class Closure3: public ClosureInterface {
|
||||
public:
|
||||
Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~Closure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct>
|
||||
class ObjClosure0: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure0(Obj* p, Funct fun) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
}
|
||||
virtual ~ObjClosure0() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)();
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct, class Arg1>
|
||||
class ObjClosure1: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
}
|
||||
virtual ~ObjClosure1() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
};
|
||||
|
||||
template <class Obj, class Funct, class Arg1, class Arg2>
|
||||
class ObjClosure2: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
}
|
||||
virtual ~ObjClosure2() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
};
|
||||
template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
|
||||
class ObjClosure3: public ClosureInterface {
|
||||
public:
|
||||
ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
p_ = p;
|
||||
fun_ = fun;
|
||||
arg1_ = arg1;
|
||||
arg2_ = arg2;
|
||||
arg3_ = arg3;
|
||||
}
|
||||
virtual ~ObjClosure3() {
|
||||
}
|
||||
virtual void Run() {
|
||||
(p_->*fun_)(arg1_, arg2_, arg3_);
|
||||
}
|
||||
private:
|
||||
Obj* p_;
|
||||
Funct fun_;
|
||||
Arg1 arg1_;
|
||||
Arg2 arg2_;
|
||||
Arg3 arg3_;
|
||||
};
|
||||
|
||||
template<class R>
|
||||
ClosureInterface* NewClosure(R (*fun)()) {
|
||||
return new Closure0<R (*)()>(fun);
|
||||
}
|
||||
|
||||
template<class R, class Arg1>
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
|
||||
return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
|
||||
}
|
||||
|
||||
template<class R, class Arg1, class Arg2>
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
|
||||
}
|
||||
|
||||
template<class R, class Arg1, class Arg2, class Arg3>
|
||||
ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
|
||||
}
|
||||
|
||||
template<class R, class Obj>
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
|
||||
return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1>
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
|
||||
return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1, class Arg2>
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
|
||||
return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
|
||||
}
|
||||
|
||||
template<class R, class Obj, class Arg1, class Arg2, class Arg3>
|
||||
ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
|
||||
return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
|
||||
}
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_CLOSURE_HPP
|
|
@ -0,0 +1,31 @@
|
|||
#ifndef LIMONP_COLOR_PRINT_HPP
|
||||
#define LIMONP_COLOR_PRINT_HPP
|
||||
|
||||
#include <string>
|
||||
#include <stdarg.h>
|
||||
|
||||
namespace limonp {
|
||||
|
||||
using std::string;
|
||||
|
||||
// Foreground colors accepted by ColorPrintln(). The numeric value is inserted
// verbatim into the "\033[0;<value>m" escape sequence.
enum Color {
  BLACK  = 30,
  RED    = 31,
  GREEN  = 32,
  YELLOW = 33,
  BLUE   = 34,
  PURPLE = 35
}; // enum Color
|
||||
|
||||
// Prints a printf-style formatted line to stdout in the given color.
// The output is: color escape sequence, formatted text, reset sequence,
// newline.
static void ColorPrintln(enum Color color, const char * fmt, ...) {
  // Switch the terminal to the requested foreground color.
  printf("\033[0;%dm", color);

  va_list args;
  va_start(args, fmt);
  vprintf(fmt, args);
  va_end(args);

  // if not \n , in some situation , the next lines will be set the same color unexpectedly
  printf("\033[0m\n");
}
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_COLOR_PRINT_HPP
|
|
@ -0,0 +1,38 @@
|
|||
#ifndef LIMONP_CONDITION_HPP
|
||||
#define LIMONP_CONDITION_HPP
|
||||
|
||||
#include "MutexLock.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
class Condition : NonCopyable {
|
||||
public:
|
||||
explicit Condition(MutexLock& mutex)
|
||||
: mutex_(mutex) {
|
||||
XCHECK(!pthread_cond_init(&pcond_, NULL));
|
||||
}
|
||||
|
||||
~Condition() {
|
||||
XCHECK(!pthread_cond_destroy(&pcond_));
|
||||
}
|
||||
|
||||
void Wait() {
|
||||
XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
|
||||
}
|
||||
|
||||
void Notify() {
|
||||
XCHECK(!pthread_cond_signal(&pcond_));
|
||||
}
|
||||
|
||||
void NotifyAll() {
|
||||
XCHECK(!pthread_cond_broadcast(&pcond_));
|
||||
}
|
||||
|
||||
private:
|
||||
MutexLock& mutex_;
|
||||
pthread_cond_t pcond_;
|
||||
}; // class Condition
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_CONDITION_HPP
|
|
@ -0,0 +1,103 @@
|
|||
/************************************
|
||||
* file enc : utf8
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef LIMONP_CONFIG_H
|
||||
#define LIMONP_CONFIG_H
|
||||
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <assert.h>
|
||||
#include "StringUtil.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Config {
|
||||
public:
|
||||
explicit Config(const string& filePath) {
|
||||
LoadFile(filePath);
|
||||
}
|
||||
|
||||
operator bool () {
|
||||
return !map_.empty();
|
||||
}
|
||||
|
||||
string Get(const string& key, const string& defaultvalue) const {
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second;
|
||||
}
|
||||
return defaultvalue;
|
||||
}
|
||||
int Get(const string& key, int defaultvalue) const {
|
||||
string str = Get(key, "");
|
||||
if("" == str) {
|
||||
return defaultvalue;
|
||||
}
|
||||
return atoi(str.c_str());
|
||||
}
|
||||
const char* operator [] (const char* key) const {
|
||||
if(NULL == key) {
|
||||
return NULL;
|
||||
}
|
||||
map<string, string>::const_iterator it = map_.find(key);
|
||||
if(map_.end() != it) {
|
||||
return it->second.c_str();
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string GetConfigInfo() const {
|
||||
string res;
|
||||
res << *this;
|
||||
return res;
|
||||
}
|
||||
|
||||
private:
|
||||
void LoadFile(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
string line;
|
||||
vector<string> vecBuf;
|
||||
size_t lineno = 0;
|
||||
while(getline(ifs, line)) {
|
||||
lineno ++;
|
||||
Trim(line);
|
||||
if(line.empty() || StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
vecBuf.clear();
|
||||
Split(line, vecBuf, "=");
|
||||
if(2 != vecBuf.size()) {
|
||||
fprintf(stderr, "line[%s] illegal.\n", line.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
string& key = vecBuf[0];
|
||||
string& value = vecBuf[1];
|
||||
Trim(key);
|
||||
Trim(value);
|
||||
if(!map_.insert(make_pair(key, value)).second) {
|
||||
fprintf(stderr, "key[%s] already exits.\n", key.c_str());
|
||||
assert(false);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
ifs.close();
|
||||
}
|
||||
|
||||
friend ostream& operator << (ostream& os, const Config& config);
|
||||
|
||||
map<string, string> map_;
|
||||
}; // class Config
|
||||
|
||||
// Streams all entries of `config` by streaming its underlying map.
inline ostream& operator << (ostream& os, const Config& config) {
  ostream& out = os << config.map_;
  return out;
}
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_CONFIG_H
|
|
@ -0,0 +1,74 @@
|
|||
#ifndef LIMONP_FILELOCK_HPP
|
||||
#define LIMONP_FILELOCK_HPP
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
namespace limonp {
|
||||
|
||||
using std::string;
|
||||
|
||||
// Advisory whole-file lock built on fcntl(F_SETLK).
//
// Usage: Open() a lock file, then Lock() / UnLock(). Failures do not throw;
// they clear Ok() and store the strerror() text retrievable through Error().
// Lock() uses the non-blocking F_SETLK request, so it fails instead of
// waiting when the lock cannot be acquired.
//
// The descriptor is closed by Close() or by the destructor. Copying a
// FileLock copies the descriptor number, so both copies would close the same
// descriptor; avoid copying an opened lock.
class FileLock {
 public:
  FileLock() : fd_(-1), ok_(true) {
  }

  ~FileLock() {
    Close();
  }

  // Opens (creating it with mode 0644 if needed) the file used as lock.
  // Must not be called while another file is still open.
  void Open(const std::string& fname) {
    assert(fd_ == -1);
    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if(fd_ < 0) {
      fd_ = -1;
      ok_ = false;
      err_ = strerror(errno);
    }
  }

  // Closes the descriptor if one is open. Safe to call repeatedly; resetting
  // fd_ keeps the destructor from closing a descriptor number a second time
  // and allows Open() to be called again.
  void Close() {
    if(fd_ >= 0) {   // 0 is a valid descriptor as well
      ::close(fd_);
      fd_ = -1;
    }
  }

  // Tries to take an exclusive (write) lock on the whole file.
  void Lock() {
    if(LockOrUnlock(fd_, true) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }

  // Releases the lock on the whole file.
  void UnLock() {
    if(LockOrUnlock(fd_, false) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }

  // False once any operation has failed; the flag is never reset.
  bool Ok() const {
    return ok_;
  }

  // Message of the most recent failure, empty if none occurred.
  std::string Error() const {
    return err_;
  }

 private:
  // Issues the fcntl(F_SETLK) request; returns fcntl's result (-1 on error).
  static int LockOrUnlock(int fd, bool lock) {
    errno = 0;
    struct flock f;
    memset(&f, 0, sizeof(f));
    f.l_type = (lock ? F_WRLCK : F_UNLCK);
    f.l_whence = SEEK_SET;
    f.l_start = 0;
    f.l_len = 0; // Lock/unlock entire file
    return fcntl(fd, F_SETLK, &f);
  }

  int fd_;           // open descriptor, or -1 when closed
  bool ok_;          // false after the first failure
  std::string err_;  // strerror() text of the last failure
}; // class FileLock
|
||||
|
||||
}// namespace limonp
|
||||
|
||||
#endif // LIMONP_FILELOCK_HPP
|
|
@ -0,0 +1,7 @@
|
|||
#ifndef LIMONP_FORCE_PUBLIC_H
|
||||
#define LIMONP_FORCE_PUBLIC_H
|
||||
|
||||
#define private public
|
||||
#define protected public
|
||||
|
||||
#endif // LIMONP_FORCE_PUBLIC_H
|
|
@ -0,0 +1,139 @@
|
|||
#ifndef LIMONP_LOCAL_VECTOR_HPP
|
||||
#define LIMONP_LOCAL_VECTOR_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
namespace limonp {
|
||||
using namespace std;
|
||||
/*
|
||||
* LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
|
||||
* LocalVector<T> is simple and not well-tested.
|
||||
*/
|
||||
// Number of elements a LocalVector keeps in its inline buffer before it
// switches to heap storage.
const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;

// Small-buffer vector: up to LOCAL_VECTOR_BUFFER_SIZE elements live inside
// the object, larger contents are moved to a malloc'ed block.
//
// Elements are copied with memcpy and heap storage is never constructed or
// destructed, so T must be a type for which a raw byte copy is valid.
template <class T>
class LocalVector {
 public:
  typedef const T* const_iterator ;
  typedef T value_type;
  typedef size_t size_type;
 private:
  T buffer_[LOCAL_VECTOR_BUFFER_SIZE];  // inline storage
  T * ptr_;                             // buffer_ or the heap block in use
  size_t size_;                         // number of stored elements
  size_t capacity_;                     // number of elements ptr_ can hold
 public:
  LocalVector() {
    init_();
  }
  LocalVector(const LocalVector<T>& vec) {
    init_();
    *this = vec;
  }
  // Copies the half-open range [begin, end).
  LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
    init_();
    while(begin != end) {
      push_back(*begin++);
    }
  }
  // Creates a vector holding `size` copies of `t`.
  LocalVector(size_t size, const T& t) { // TODO: make it faster
    init_();
    while(size--) {
      push_back(t);
    }
  }
  ~LocalVector() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
  }
 public:
  // Replaces the content with a copy of `vec`.
  LocalVector<T>& operator = (const LocalVector<T>& vec) {
    // Self-assignment must be a no-op: clear() below would otherwise discard
    // the very elements that are about to be copied.
    if(this == &vec) {
      return *this;
    }
    clear();
    size_ = vec.size();
    capacity_ = vec.capacity();
    if(vec.buffer_ == vec.ptr_) {
      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
      ptr_ = buffer_;
    } else {
      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
      assert(ptr_);
      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
    }
    return *this;
  }
 private:
  // Resets to the empty state backed by the inline buffer. Does not free.
  void init_() {
    ptr_ = buffer_;
    size_ = 0;
    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
  }
 public:
  // Unchecked element access.
  T& operator [] (size_t i) {
    return ptr_[i];
  }
  const T& operator [] (size_t i) const {
    return ptr_[i];
  }
  // Appends `t`, doubling the capacity when the storage is full.
  void push_back(const T& t) {
    if(size_ == capacity_) {
      assert(capacity_);
      reserve(capacity_ * 2);
    }
    ptr_[size_ ++ ] = t;
  }
  // Grows the storage to hold at least `size` elements; never shrinks.
  void reserve(size_t size) {
    if(size <= capacity_) {
      return;
    }
    T * next = (T*)malloc(sizeof(T) * size);
    assert(next);
    T * old = ptr_;
    ptr_ = next;
    // Only the live elements need to move to the new block.
    memcpy(ptr_, old, sizeof(T) * size_);
    capacity_ = size;
    if(old != buffer_) {
      free(old);
    }
  }
  bool empty() const {
    return 0 == size();
  }
  size_t size() const {
    return size_;
  }
  size_t capacity() const {
    return capacity_;
  }
  const_iterator begin() const {
    return ptr_;
  }
  const_iterator end() const {
    return ptr_ + size_;
  }
  // Drops all elements and releases heap storage, if any.
  void clear() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
    init_();
  }
};
|
||||
|
||||
template <class T>
|
||||
ostream & operator << (ostream& os, const LocalVector<T>& vec) {
|
||||
if(vec.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"[\""<<vec[0];
|
||||
for(size_t i = 1; i < vec.size(); i++) {
|
||||
os<<"\", \""<<vec[i];
|
||||
}
|
||||
os<<"\"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,76 @@
|
|||
#ifndef LIMONP_LOGGING_HPP
|
||||
#define LIMONP_LOGGING_HPP
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
|
||||
#ifdef XLOG
|
||||
#error "XLOG has been defined already"
|
||||
#endif // XLOG
|
||||
#ifdef XCHECK
|
||||
#error "XCHECK has been defined already"
|
||||
#endif // XCHECK
|
||||
|
||||
#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
|
||||
// XCHECK(exp): emits a FATAL log record when `exp` evaluates to false; extra
// context can be streamed after the macro call.
// Written as `if(exp) {} else ...` so that an `else` following the macro in
// caller code cannot pair with the `if` hidden inside the macro.
#define XCHECK(exp) if(exp) {} else XLOG(FATAL) << "exp: ["#exp << "] false. "
|
||||
|
||||
namespace limonp {
|
||||
|
||||
// Log severities, ordered from least to most severe. The values index
// LOG_LEVEL_ARRAY and are compared against LOGGING_LEVEL when that macro is
// defined.
enum {
  LL_DEBUG   = 0,
  LL_INFO    = 1,
  LL_WARNING = 2,
  LL_ERROR   = 3,
  LL_FATAL   = 4
}; // enum

// Printable name for each severity, indexed by the LL_* value.
static const char * LOG_LEVEL_ARRAY[] = {
  "DEBUG",
  "INFO",
  "WARN",
  "ERROR",
  "FATAL"
};

// strftime() pattern for the timestamp that starts every log record.
static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";
|
||||
|
||||
class Logger {
|
||||
public:
|
||||
Logger(size_t level, const char* filename, int lineno)
|
||||
: level_(level) {
|
||||
#ifdef LOGGING_LEVEL
|
||||
if (level_ < LOGGING_LEVEL) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
|
||||
char buf[32];
|
||||
time_t now;
|
||||
time(&now);
|
||||
strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now));
|
||||
stream_ << buf
|
||||
<< " " << filename
|
||||
<< ":" << lineno
|
||||
<< " " << LOG_LEVEL_ARRAY[level_]
|
||||
<< " ";
|
||||
}
|
||||
~Logger() {
|
||||
#ifdef LOGGING_LEVEL
|
||||
if (level_ < LOGGING_LEVEL) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
std::cerr << stream_.str() << std::endl;
|
||||
if (level_ == LL_FATAL) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& Stream() {
|
||||
return stream_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::ostringstream stream_;
|
||||
size_t level_;
|
||||
}; // class Logger
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_LOGGING_HPP
|
|
@ -0,0 +1,411 @@
|
|||
#ifndef __MD5_H__
|
||||
#define __MD5_H__
|
||||
|
||||
// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
|
||||
// rights reserved.
|
||||
|
||||
// License to copy and use this software is granted provided that it
|
||||
// is identified as the "RSA Data Security, Inc. MD5 Message-Digest
|
||||
// Algorithm" in all material mentioning or referencing this software
|
||||
// or this function.
|
||||
//
|
||||
// License is also granted to make and use derivative works provided
|
||||
// that such works are identified as "derived from the RSA Data
|
||||
// Security, Inc. MD5 Message-Digest Algorithm" in all material
|
||||
// mentioning or referencing the derived work.
|
||||
//
|
||||
// RSA Data Security, Inc. makes no representations concerning either
|
||||
// the merchantability of this software or the suitability of this
|
||||
// software for any particular purpose. It is provided "as is"
|
||||
// without express or implied warranty of any kind.
|
||||
//
|
||||
// These notices must be retained in any copies of any part of this
|
||||
// documentation and/or software.
|
||||
|
||||
|
||||
|
||||
// The original md5 implementation avoids external libraries.
|
||||
// This version has dependency on stdio.h for file input and
|
||||
// string.h for memcpy.
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
namespace limonp {
|
||||
|
||||
//#pragma region MD5 defines
|
||||
// Constants for MD5Transform routine.
|
||||
#define S11 7
|
||||
#define S12 12
|
||||
#define S13 17
|
||||
#define S14 22
|
||||
#define S21 5
|
||||
#define S22 9
|
||||
#define S23 14
|
||||
#define S24 20
|
||||
#define S31 4
|
||||
#define S32 11
|
||||
#define S33 16
|
||||
#define S34 23
|
||||
#define S41 6
|
||||
#define S42 10
|
||||
#define S43 15
|
||||
#define S44 21
|
||||
|
||||
|
||||
// F, G, H and I are basic MD5 functions.
|
||||
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
|
||||
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
|
||||
#define H(x, y, z) ((x) ^ (y) ^ (z))
|
||||
#define I(x, y, z) ((y) ^ ((x) | (~z)))
|
||||
|
||||
// ROTATE_LEFT rotates x left n bits.
|
||||
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
|
||||
|
||||
// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
|
||||
// Rotation is separate from addition to prevent recomputation.
|
||||
#define FF(a, b, c, d, x, s, ac) { \
|
||||
(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
|
||||
(a) = ROTATE_LEFT ((a), (s)); \
|
||||
(a) += (b); \
|
||||
}
|
||||
#define GG(a, b, c, d, x, s, ac) { \
|
||||
(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
|
||||
(a) = ROTATE_LEFT ((a), (s)); \
|
||||
(a) += (b); \
|
||||
}
|
||||
#define HH(a, b, c, d, x, s, ac) { \
|
||||
(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
|
||||
(a) = ROTATE_LEFT ((a), (s)); \
|
||||
(a) += (b); \
|
||||
}
|
||||
#define II(a, b, c, d, x, s, ac) { \
|
||||
(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
|
||||
(a) = ROTATE_LEFT ((a), (s)); \
|
||||
(a) += (b); \
|
||||
}
|
||||
//#pragma endregion
|
||||
|
||||
|
||||
typedef unsigned char BYTE ;
|
||||
|
||||
// POINTER defines a generic pointer type
|
||||
typedef unsigned char *POINTER;
|
||||
|
||||
// UINT2 defines a two byte word
|
||||
typedef unsigned short int UINT2;
|
||||
|
||||
// UINT4 defines a four byte word
|
||||
typedef unsigned int UINT4;
|
||||
|
||||
static unsigned char PADDING[64] = {
|
||||
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
// convenient object that wraps
|
||||
// the C-functions for use in C++ only
|
||||
class MD5 {
|
||||
private:
|
||||
struct __context_t {
|
||||
UINT4 state[4]; /* state (ABCD) */
|
||||
UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */
|
||||
unsigned char buffer[64]; /* input buffer */
|
||||
} context ;
|
||||
|
||||
//#pragma region static helper functions
|
||||
// The core of the MD5 algorithm is here.
|
||||
// MD5 basic transformation. Transforms state based on block.
|
||||
static void MD5Transform( UINT4 state[4], unsigned char block[64] ) {
|
||||
UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
|
||||
|
||||
Decode (x, block, 64);
|
||||
|
||||
/* Round 1 */
|
||||
FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
|
||||
FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
|
||||
FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
|
||||
FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
|
||||
FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
|
||||
FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
|
||||
FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
|
||||
FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
|
||||
FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
|
||||
FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
|
||||
FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
|
||||
FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
|
||||
FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
|
||||
FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
|
||||
FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
|
||||
FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
|
||||
|
||||
/* Round 2 */
|
||||
GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
|
||||
GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
|
||||
GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
|
||||
GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
|
||||
GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
|
||||
GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
|
||||
GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
|
||||
GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
|
||||
GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
|
||||
GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
|
||||
GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
|
||||
GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
|
||||
GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
|
||||
GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
|
||||
GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
|
||||
GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
|
||||
|
||||
/* Round 3 */
|
||||
HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
|
||||
HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
|
||||
HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
|
||||
HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
|
||||
HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
|
||||
HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
|
||||
HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
|
||||
HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
|
||||
HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
|
||||
HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
|
||||
HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
|
||||
HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
|
||||
HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
|
||||
HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
|
||||
HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
|
||||
HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
|
||||
|
||||
/* Round 4 */
|
||||
II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
|
||||
II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
|
||||
II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
|
||||
II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
|
||||
II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
|
||||
II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
|
||||
II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
|
||||
II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
|
||||
II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
|
||||
II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
|
||||
II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
|
||||
II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
|
||||
II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
|
||||
II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
|
||||
II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
|
||||
II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
|
||||
|
||||
state[0] += a;
|
||||
state[1] += b;
|
||||
state[2] += c;
|
||||
state[3] += d;
|
||||
|
||||
// Zeroize sensitive information.
|
||||
memset((POINTER)x, 0, sizeof (x));
|
||||
}
|
||||
|
||||
// Encodes input (UINT4) into output (unsigned char). Assumes len is
|
||||
// a multiple of 4.
|
||||
static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0, j = 0; j < len; i++, j += 4) {
|
||||
output[j] = (unsigned char)(input[i] & 0xff);
|
||||
output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
|
||||
output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
|
||||
output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
|
||||
}
|
||||
}
|
||||
|
||||
// Decodes input (unsigned char) into output (UINT4). Assumes len is
|
||||
// a multiple of 4.
|
||||
static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0, j = 0; j < len; i++, j += 4)
|
||||
output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
|
||||
(((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
|
||||
}
|
||||
//#pragma endregion
|
||||
|
||||
|
||||
public:
|
||||
// MAIN FUNCTIONS
|
||||
MD5() {
|
||||
Init() ;
|
||||
}
|
||||
|
||||
// MD5 initialization. Begins an MD5 operation, writing a new context.
|
||||
void Init() {
|
||||
context.count[0] = context.count[1] = 0;
|
||||
|
||||
// Load magic initialization constants.
|
||||
context.state[0] = 0x67452301;
|
||||
context.state[1] = 0xefcdab89;
|
||||
context.state[2] = 0x98badcfe;
|
||||
context.state[3] = 0x10325476;
|
||||
}
|
||||
|
||||
// MD5 block update operation. Continues an MD5 message-digest
|
||||
// operation, processing another message block, and updating the
|
||||
// context.
|
||||
void Update(
|
||||
unsigned char *input, // input block
|
||||
unsigned int inputLen ) { // length of input block
|
||||
unsigned int i, index, partLen;
|
||||
|
||||
// Compute number of bytes mod 64
|
||||
index = (unsigned int)((context.count[0] >> 3) & 0x3F);
|
||||
|
||||
// Update number of bits
|
||||
if ((context.count[0] += ((UINT4)inputLen << 3))
|
||||
< ((UINT4)inputLen << 3))
|
||||
context.count[1]++;
|
||||
context.count[1] += ((UINT4)inputLen >> 29);
|
||||
|
||||
partLen = 64 - index;
|
||||
|
||||
// Transform as many times as possible.
|
||||
if (inputLen >= partLen) {
|
||||
memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
|
||||
MD5Transform (context.state, context.buffer);
|
||||
|
||||
for (i = partLen; i + 63 < inputLen; i += 64)
|
||||
MD5Transform (context.state, &input[i]);
|
||||
|
||||
index = 0;
|
||||
} else
|
||||
i = 0;
|
||||
|
||||
/* Buffer remaining input */
|
||||
memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i);
|
||||
}
|
||||
|
||||
// MD5 finalization. Ends an MD5 message-digest operation, writing the
|
||||
// the message digest and zeroizing the context.
|
||||
// Writes to digestRaw
|
||||
void Final() {
|
||||
unsigned char bits[8];
|
||||
unsigned int index, padLen;
|
||||
|
||||
// Save number of bits
|
||||
Encode( bits, context.count, 8 );
|
||||
|
||||
// Pad out to 56 mod 64.
|
||||
index = (unsigned int)((context.count[0] >> 3) & 0x3f);
|
||||
padLen = (index < 56) ? (56 - index) : (120 - index);
|
||||
Update( PADDING, padLen );
|
||||
|
||||
// Append length (before padding)
|
||||
Update( bits, 8 );
|
||||
|
||||
// Store state in digest
|
||||
Encode( digestRaw, context.state, 16);
|
||||
|
||||
// Zeroize sensitive information.
|
||||
memset((POINTER)&context, 0, sizeof (context));
|
||||
|
||||
writeToString() ;
|
||||
}
|
||||
|
||||
/// Buffer must be 32+1 (nul) = 33 chars long at least
|
||||
void writeToString() {
|
||||
int pos ;
|
||||
|
||||
for( pos = 0 ; pos < 16 ; pos++ )
|
||||
sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ;
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
// an MD5 digest is a 16-byte number (32 hex digits)
|
||||
BYTE digestRaw[ 16 ] ;
|
||||
|
||||
// This version of the digest is actually
|
||||
// a "printf'd" version of the digest.
|
||||
char digestChars[ 33 ] ;
|
||||
|
||||
/// Load a file from disk and digest it
|
||||
// Digests a file and returns the result.
|
||||
const char* digestFile( const char *filename ) {
|
||||
if (NULL == filename || strcmp(filename, "") == 0)
|
||||
return NULL;
|
||||
|
||||
Init() ;
|
||||
|
||||
FILE *file;
|
||||
|
||||
unsigned char buffer[1024] ;
|
||||
|
||||
if((file = fopen (filename, "rb")) == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
int len;
|
||||
while( (len = fread( buffer, 1, 1024, file )) )
|
||||
Update( buffer, len ) ;
|
||||
Final();
|
||||
|
||||
fclose( file );
|
||||
|
||||
return digestChars ;
|
||||
}
|
||||
|
||||
/// Digests a byte-array already in memory
|
||||
const char* digestMemory( BYTE *memchunk, int len ) {
|
||||
if (NULL == memchunk)
|
||||
return NULL;
|
||||
|
||||
Init() ;
|
||||
Update( memchunk, len ) ;
|
||||
Final() ;
|
||||
|
||||
return digestChars ;
|
||||
}
|
||||
|
||||
// Digests a string and prints the result.
|
||||
const char* digestString(const char *string ) {
|
||||
if (string == NULL)
|
||||
return NULL;
|
||||
|
||||
Init() ;
|
||||
Update( (unsigned char*)string, strlen(string) ) ;
|
||||
Final() ;
|
||||
|
||||
return digestChars ;
|
||||
}
|
||||
};
|
||||
|
||||
inline bool md5String(const char* str, std::string& res) {
|
||||
if (NULL == str) {
|
||||
res = "";
|
||||
return false;
|
||||
}
|
||||
|
||||
MD5 md5;
|
||||
const char *pRes = md5.digestString(str);
|
||||
if (NULL == pRes) {
|
||||
res = "";
|
||||
return false;
|
||||
}
|
||||
|
||||
res = pRes;
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool md5File(const char* filepath, std::string& res) {
|
||||
if (NULL == filepath || strcmp(filepath, "") == 0) {
|
||||
res = "";
|
||||
return false;
|
||||
}
|
||||
|
||||
MD5 md5;
|
||||
const char *pRes = md5.digestFile(filepath);
|
||||
|
||||
if (NULL == pRes) {
|
||||
res = "";
|
||||
return false;
|
||||
}
|
||||
|
||||
res = pRes;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,51 @@
|
|||
#ifndef LIMONP_MUTEX_LOCK_HPP
|
||||
#define LIMONP_MUTEX_LOCK_HPP
|
||||
|
||||
#include <pthread.h>
|
||||
#include "NonCopyable.hpp"
|
||||
#include "Logging.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
class MutexLock: NonCopyable {
|
||||
public:
|
||||
MutexLock() {
|
||||
XCHECK(!pthread_mutex_init(&mutex_, NULL));
|
||||
}
|
||||
~MutexLock() {
|
||||
XCHECK(!pthread_mutex_destroy(&mutex_));
|
||||
}
|
||||
pthread_mutex_t* GetPthreadMutex() {
|
||||
return &mutex_;
|
||||
}
|
||||
|
||||
private:
|
||||
void Lock() {
|
||||
XCHECK(!pthread_mutex_lock(&mutex_));
|
||||
}
|
||||
void Unlock() {
|
||||
XCHECK(!pthread_mutex_unlock(&mutex_));
|
||||
}
|
||||
friend class MutexLockGuard;
|
||||
|
||||
pthread_mutex_t mutex_;
|
||||
}; // class MutexLock
|
||||
|
||||
class MutexLockGuard: NonCopyable {
|
||||
public:
|
||||
explicit MutexLockGuard(MutexLock & mutex)
|
||||
: mutex_(mutex) {
|
||||
mutex_.Lock();
|
||||
}
|
||||
~MutexLockGuard() {
|
||||
mutex_.Unlock();
|
||||
}
|
||||
private:
|
||||
MutexLock & mutex_;
|
||||
}; // class MutexLockGuard
|
||||
|
||||
#define MutexLockGuard(x) XCHECK(false);
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_MUTEX_LOCK_HPP
|
|
@ -0,0 +1,21 @@
|
|||
/************************************
|
||||
************************************/
|
||||
#ifndef LIMONP_NONCOPYABLE_H
|
||||
#define LIMONP_NONCOPYABLE_H
|
||||
|
||||
namespace limonp {
|
||||
|
||||
// Base class that disables copying for every class deriving from it: the
// copy constructor and copy assignment are private and never defined.
// Construction and destruction are protected, so the class is usable only
// as a base.
class NonCopyable {
 private:
  NonCopyable(const NonCopyable& );
  const NonCopyable& operator=(const NonCopyable& );

 protected:
  NonCopyable() {}
  ~NonCopyable() {}
}; // class NonCopyable
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_NONCOPYABLE_H
|
|
@ -0,0 +1,157 @@
|
|||
#ifndef LIMONP_STD_EXTEMSION_HPP
|
||||
#define LIMONP_STD_EXTEMSION_HPP
|
||||
|
||||
#include <map>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#elif(__cplusplus >= 201103L)
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#elif defined _MSC_VER
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#else
|
||||
#include <tr1/unordered_map>
|
||||
#include <tr1/unordered_set>
|
||||
namespace std {
|
||||
using std::tr1::unordered_map;
|
||||
using std::tr1::unordered_set;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
namespace std {
|
||||
|
||||
// Streams a vector as "[e0, e1, ...]"; an empty vector prints "[]".
template<typename T>
ostream& operator << (ostream& os, const vector<T>& v) {
  if(v.empty()) {
    return os << "[]";
  }
  typename vector<T>::const_iterator it = v.begin();
  os << "[" << *it;
  for(++it; it != v.end(); ++it) {
    os << ", " << *it;
  }
  return os << "]";
}
|
||||
|
||||
template<>
|
||||
inline ostream& operator << (ostream& os, const vector<string>& v) {
|
||||
if(v.empty()) {
|
||||
return os << "[]";
|
||||
}
|
||||
os<<"[\""<<v[0];
|
||||
for(size_t i = 1; i < v.size(); i++) {
|
||||
os<<"\", \""<<v[i];
|
||||
}
|
||||
os<<"\"]";
|
||||
return os;
|
||||
}
|
||||
|
||||
// Streams a deque as ["e0", "e1", ...]. Note that every element is wrapped
// in double quotes regardless of T. An empty deque prints "[]".
template<typename T>
ostream& operator << (ostream& os, const deque<T>& dq) {
  if(dq.empty()) {
    return os << "[]";
  }
  typename deque<T>::const_iterator it = dq.begin();
  os << "[\"" << *it;
  for(++it; it != dq.end(); ++it) {
    os << "\", \"" << *it;
  }
  return os << "\"]";
}
|
||||
|
||||
|
||||
// Streams a pair as "first:second".
template<class T1, class T2>
ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
  return os << pr.first << ":" << pr.second;
}
|
||||
|
||||
|
||||
// Replaces (does not append to) the contents of `str` with the text obtained
// by streaming `obj` into a stringstream. Returns `str`.
template<class T>
string& operator << (string& str, const T& obj) {
  stringstream buffer;
  buffer << obj; // uses whichever ostream << overload exists for T
  str = buffer.str();
  return str;
}
|
||||
|
||||
// Streams a map as "{k1:v1, k2:v2, ...}" in iteration order; an empty map
// prints "{}". Each entry is written through the pair overload of <<.
template<class T1, class T2>
ostream& operator << (ostream& os, const map<T1, T2>& mp) {
  if(mp.empty()) {
    return os << "{}";
  }
  os << '{';
  bool first = true;
  for(typename map<T1, T2>::const_iterator it = mp.begin(); it != mp.end(); ++it) {
    if(!first) {
      os << ", ";
    }
    os << *it;
    first = false;
  }
  return os << '}';
}
|
||||
// Streams an unordered_map as "{k1:v1, k2:v2, ...}" in the container's
// iteration order; an empty map prints "{}". Each entry is written through
// the pair overload of <<.
template<class T1, class T2>
ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
  if(mp.empty()) {
    return os << "{}";
  }
  typedef typename std::unordered_map<T1, T2>::const_iterator Iter;
  Iter it = mp.begin();
  os << '{' << *it;
  for(++it; it != mp.end(); ++it) {
    os << ", " << *it;
  }
  os << '}';
  return os;
}
|
||||
|
||||
// Streams a set as "{e0, e1, ...}" in iteration order; an empty set prints
// "{}".
template<class T>
ostream& operator << (ostream& os, const set<T>& st) {
  if(st.empty()) {
    return os << "{}";
  }
  os << '{';
  bool first = true;
  for(typename set<T>::const_iterator it = st.begin(); it != st.end(); ++it) {
    if(!first) {
      os << ", ";
    }
    os << *it;
    first = false;
  }
  return os << '}';
}
|
||||
|
||||
// Returns true when `contain.find(key)` locates an element, i.e. when `key`
// is present in a container offering find()/end().
template<class KeyType, class ContainType>
bool IsIn(const ContainType& contain, const KeyType& key) {
  return contain.find(key) != contain.end();
}
|
||||
|
||||
// Replaces the contents of `s` with everything that can still be read from
// `ifs`, character by character through its stream buffer. Returns `s`.
template<class T>
basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
  istreambuf_iterator<T> first(ifs);
  istreambuf_iterator<T> last;
  return s.assign(first, last);
}
|
||||
|
||||
// Writes every character of `s` to `ofs` through an ostreambuf_iterator
// (unformatted output) and returns `ofs`.
template<class T>
ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
  copy(s.begin(), s.end(), ostreambuf_iterator<T>(ofs));
  return ofs;
}
|
||||
|
||||
} // namespace std
|
||||
|
||||
#endif
|
|
@ -0,0 +1,365 @@
|
|||
/************************************
|
||||
* file enc : ascii
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef LIMONP_STR_FUNCTS_H
|
||||
#define LIMONP_STR_FUNCTS_H
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <map>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <memory.h>
|
||||
#include <functional>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <sys/types.h>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include "StdExtension.hpp"
|
||||
|
||||
namespace limonp {
|
||||
using namespace std;
|
||||
// printf-style formatting into a std::string.
//
// Starts with a 256-byte buffer and retries with a larger one until the
// output fits. When vsnprintf reports the required length the buffer is
// resized to exactly that; when it returns a negative value the buffer is
// doubled, up to kMaxBufferSize. If formatting still fails at that size an
// empty string is returned instead of growing without bound.
inline string StringFormat(const char* fmt, ...) {
  const int kMaxBufferSize = 1 << 30;
  int size = 256;
  std::string str;
  va_list ap;
  while (1) {
    str.resize(size);
    va_start(ap, fmt);
    // Write into the string's own storage via &str[0]; c_str() is read-only.
    int n = vsnprintf(&str[0], size, fmt, ap);
    va_end(ap);
    if (n > -1 && n < size) {
      str.resize(n);
      return str;
    }
    if (n > -1) {
      size = n + 1;       // exact size needed, including the terminator
    } else if (size < kMaxBufferSize) {
      size *= 2;          // length unknown: try again with twice the room
    } else {
      str.clear();        // persistent failure: give up
      return str;
    }
  }
}
|
||||
|
||||
// Streams the elements of [begin, end) into `res`, separated by `connector`.
// For an empty range `res` is left untouched.
template<class T>
void Join(T begin, T end, string& res, const string& connector) {
  if(begin == end) {
    return;
  }
  stringstream joined;
  for(bool first = true; begin != end; ++begin, first = false) {
    if(!first) {
      joined << connector;
    }
    joined << *begin;
  }
  res = joined.str();
}

// Returning variant: yields the joined text, or "" for an empty range.
template<class T>
string Join(T begin, T end, const string& connector) {
  string joined;
  Join(begin, end, joined, connector);
  return joined;
}
|
||||
|
||||
// Upper-cases `str` in place with toupper() and returns it.
// Each char is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed, e.g. UTF-8 data) to toupper() is
// undefined behaviour.
inline string& Upper(string& str) {
  for(string::iterator it = str.begin(); it != str.end(); ++it) {
    *it = static_cast<char>(toupper(static_cast<unsigned char>(*it)));
  }
  return str;
}
|
||||
|
||||
// Lower-cases `str` in place with tolower() and returns it.
// Each char is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed, e.g. UTF-8 data) to tolower() is
// undefined behaviour.
inline string& Lower(string& str) {
  for(string::iterator it = str.begin(); it != str.end(); ++it) {
    *it = static_cast<char>(tolower(static_cast<unsigned char>(*it)));
  }
  return str;
}
|
||||
|
||||
// True when `c` is at most 0xff and std::isspace() classifies it as
// whitespace. Larger values are rejected up front because isspace() must not
// be called with them; a negative char converted to unsigned ends up in that
// range and is therefore never treated as a space.
inline bool IsSpace(unsigned c) {
  return c > 0xff ? false : std::isspace(c & 0xff) != 0;
}

// Removes leading whitespace (per IsSpace) from `s` in place; returns `s`.
// Written as a plain loop rather than std::ptr_fun/std::not1, which are no
// longer available in current C++ standards.
inline std::string& LTrim(std::string &s) {
  std::string::size_type first = 0;
  while(first < s.size() && IsSpace(s[first])) {
    ++first;
  }
  s.erase(0, first);
  return s;
}

// Removes trailing whitespace (per IsSpace) from `s` in place; returns `s`.
inline std::string& RTrim(std::string &s) {
  std::string::size_type last = s.size();
  while(last > 0 && IsSpace(s[last - 1])) {
    --last;
  }
  s.erase(last);
  return s;
}

// Removes whitespace from both ends of `s` in place; returns `s`.
inline std::string& Trim(std::string &s) {
  return LTrim(RTrim(s));
}
|
||||
|
||||
// Removes every leading occurrence of `x` from `s` in place; returns `s`.
// Uses std::string's own search instead of std::bind2nd/std::not1, which
// are no longer available in current C++ standards.
inline std::string& LTrim(std::string & s, char x) {
  // npos (string is empty or consists only of x) erases everything.
  s.erase(0, s.find_first_not_of(x));
  return s;
}

// Removes every trailing occurrence of `x` from `s` in place; returns `s`.
inline std::string& RTrim(std::string & s, char x) {
  const std::string::size_type last = s.find_last_not_of(x);
  s.erase(last == std::string::npos ? 0 : last + 1);
  return s;
}

// Removes `x` from both ends of `s` in place; returns `s`.
inline std::string& Trim(std::string &s, char x) {
  return LTrim(RTrim(s, x), x);
}
|
||||
|
||||
// Splits `src` at every character contained in `pattern` (each character is
// a separator on its own) and stores the pieces in `res`, which is cleared
// first. Once `maxsplit` pieces have been produced, the rest of `src` is
// stored unsplit as the final piece. Empty pieces between adjacent
// separators are kept; a separator at the very end of `src` does not add a
// trailing empty piece.
inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
  res.clear();
  for(size_t cursor = 0; cursor < src.size(); ) {
    const size_t hit = src.find_first_of(pattern, cursor);
    const bool lastPiece = (hit == string::npos) || (res.size() >= maxsplit);
    if(lastPiece) {
      res.push_back(src.substr(cursor));
      break;
    }
    res.push_back(src.substr(cursor, hit - cursor));
    cursor = hit + 1;
  }
}

// Returning variant of Split.
inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
  vector<string> pieces;
  Split(src, pieces, pattern, maxsplit);
  return pieces;
}
|
||||
|
||||
// True when `str` begins with `prefix` (an empty prefix always matches).
inline bool StartsWith(const string& str, const string& prefix) {
  const size_t n = prefix.length();
  return n <= str.length() && str.compare(0, n, prefix) == 0;
}
|
||||
|
||||
// True when `str` ends with `suffix` (an empty suffix always matches).
inline bool EndsWith(const string& str, const string& suffix) {
  const size_t n = suffix.length();
  return n <= str.length() && str.compare(str.length() - n, n, suffix) == 0;
}
|
||||
|
||||
// True when the character `ch` occurs anywhere in `str`.
inline bool IsInStr(const string& str, char ch) {
  return string::npos != str.find_first_of(ch);
}
|
||||
|
||||
// Packs two bytes into one 16-bit value: `high` becomes the upper 8 bits and
// `low` the lower 8 bits. Only the low 8 bits of each argument are used, so
// negative chars do not sign-extend into the result.
inline uint16_t TwocharToUint16(char high, char low) {
  const uint16_t hi = static_cast<uint8_t>(high);
  const uint16_t lo = static_cast<uint8_t>(low);
  return static_cast<uint16_t>((hi << 8) | lo);
}
|
||||
|
||||
// Decodes `len` bytes of UTF-8 at `str` into 16-bit code units appended to
// `vec` (cleared first). Only 1-, 2- and 3-byte sequences are handled.
//
// Returns false when `str` is NULL (vec is then left untouched), when a
// sequence is cut off by the end of the input, or when a lead byte above
// 0xef is met; units decoded before the failure stay in `vec`.
// Lead bytes are classified by upper bound only and continuation bytes are
// not validated.
template <class Uint16Container>
bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
  if(!str) {
    return false;
  }
  vec.clear();
  size_t i = 0;
  while(i < len) {
    const uint8_t b0 = (uint8_t)str[i];
    if(b0 < 0x80) { // 0xxxxxxx
      vec.push_back(str[i]);
      i += 1;
    } else if(b0 <= 0xdf && i + 1 < len) { // 110xxxxx 10xxxxxx
      const uint8_t b1 = (uint8_t)str[i+1];
      const uint16_t unit = (uint16_t)(((b0 & 0x1f) << 6) | (b1 & 0x3f));
      vec.push_back(unit);
      i += 2;
    } else if(b0 <= 0xef && i + 2 < len) { // 1110xxxx 10xxxxxx 10xxxxxx
      const uint8_t b1 = (uint8_t)str[i+1];
      const uint8_t b2 = (uint8_t)str[i+2];
      const uint16_t unit = (uint16_t)(((b0 & 0x0f) << 12) | ((b1 & 0x3f) << 6) | (b2 & 0x3f));
      vec.push_back(unit);
      i += 3;
    } else {
      return false;
    }
  }
  return true;
}

// Convenience overload decoding a whole std::string.
template <class Uint16Container>
bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
  return Utf8ToUnicode(str.c_str(), str.size(), vec);
}
|
||||
|
||||
// Decodes the UTF-8 text in `str` into 32-bit code points appended to `vec`
// (cleared first). Sequences of 1 to 4 bytes are handled.
//
// Returns false when a sequence is cut off by the end of the string or a
// lead byte above 0xf7 is met; code points decoded before the failure stay
// in `vec`. Lead bytes are classified by upper bound only and continuation
// bytes are not validated.
template <class Uint32Container>
bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
  vec.clear();
  size_t i = 0;
  while(i < str.size()) {
    const uint8_t lead = (uint8_t)str[i];
    size_t extra;   // number of continuation bytes that follow the lead byte
    uint32_t tmp;   // payload bits accumulated so far
    if(lead < 0x80) {          // 0xxxxxxx: 7 payload bits
      extra = 0;
      tmp = lead & 0x7f;
    } else if(lead <= 0xdf) {  // 110xxxxx: 5 payload bits
      extra = 1;
      tmp = lead & 0x1f;
    } else if(lead <= 0xef) {  // 1110xxxx: 4 payload bits
      extra = 2;
      tmp = lead & 0x0f;
    } else if(lead <= 0xf7) {  // 11110xxx: 3 payload bits
      extra = 3;
      tmp = lead & 0x07;
    } else {
      return false;
    }
    if(i + extra >= str.size()) {
      return false;            // sequence runs past the end of the input
    }
    for(size_t k = 1; k <= extra; ++k) {
      tmp <<= 6;               // each continuation byte adds 6 bits
      tmp |= (uint8_t)(str[i+k]) & 0x3f;
    }
    i += extra + 1;
    vec.push_back(tmp);
  }
  return true;
}
|
||||
|
||||
// Encodes the 32-bit code points in [begin, end) as UTF-8 into `res`
// (cleared first), using 1 to 4 bytes per code point.
//
// The lead byte of a 4-byte sequence carries 3 payload bits (mask 0x07). The
// previous 0x03 mask dropped bit 20, so code points from U+100000 upwards
// were encoded as a different character. Values above 0x1fffff do not fit in
// 4 bytes; their upper bits are discarded.
template <class Uint32ContainerConIter>
void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
  res.clear();
  uint32_t ui;
  while(begin != end) {
    ui = *begin;
    if(ui <= 0x7f) {
      res += char(ui);
    } else if(ui <= 0x7ff) {
      res += char(((ui >> 6) & 0x1f) | 0xc0);
      res += char((ui & 0x3f) | 0x80);
    } else if(ui <= 0xffff) {
      res += char(((ui >> 12) & 0x0f) | 0xe0);
      res += char(((ui >> 6) & 0x3f) | 0x80);
      res += char((ui & 0x3f) | 0x80);
    } else {
      res += char(((ui >> 18) & 0x07) | 0xf0);
      res += char(((ui >> 12) & 0x3f) | 0x80);
      res += char(((ui >> 6) & 0x3f) | 0x80);
      res += char((ui & 0x3f) | 0x80);
    }
    ++begin;
  }
}
|
||||
|
||||
// Encodes the 16-bit units in [begin, end) as UTF-8 into `res` (cleared
// first): 1 byte up to 0x7f, 2 bytes up to 0x7ff, 3 bytes otherwise. Every
// unit is encoded on its own.
template <class Uint16ContainerConIter>
void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
  res.clear();
  for(; begin != end; ++begin) {
    const uint16_t unit = *begin;
    if(unit < 0x80) {
      res += char(unit);
      continue;
    }
    if(unit < 0x800) {
      res += char(((unit >> 6) & 0x1f) | 0xc0);
    } else {
      res += char(((unit >> 12) & 0x0f) | 0xe0);
      res += char(((unit >> 6) & 0x3f) | 0x80);
    }
    // Both multi-byte forms end with the same low-6-bit continuation byte.
    res += char((unit & 0x3f) | 0x80);
  }
}
|
||||
|
||||
|
||||
// Converts `len` bytes of double-byte encoded text at `str` into 16-bit
// units appended to `vec` (cleared first). A byte with the high bit clear
// becomes one unit; any other byte is combined with the byte after it as
// (first << 8) | second.
//
// Returns true for a NULL `str` (vec stays empty) and false when the input
// ends right after the first byte of a pair; units produced before that
// point stay in `vec`.
template <class Uint16Container>
bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
  vec.clear();
  if(!str) {
    return true;
  }
  for(size_t i = 0; i < len; ) {
    if((str[i] & 0x80) == 0) {
      vec.push_back(uint16_t(str[i]));
      i += 1;
      continue;
    }
    if(i + 1 >= len) {
      return false; // lead byte without its partner
    }
    const uint16_t hi = uint16_t(str[i]) & 0x00ff;
    const uint16_t lo = uint16_t(str[i+1]) & 0x00ff;
    vec.push_back(uint16_t((hi << 8) | lo));
    i += 2;
  }
  return true;
}

// Convenience overload converting a whole std::string.
template <class Uint16Container>
bool GBKTrans(const string& str, Uint16Container& vec) {
  return GBKTrans(str.c_str(), str.size(), vec);
}
|
||||
|
||||
// Converts the 16-bit units in [begin, end) back into bytes in `res`
// (cleared first). A unit whose upper byte has the high bit set is written
// as two bytes (upper, then lower); any other unit is written as its lower
// byte only.
template <class Uint16ContainerConIter>
void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
  res.clear();
  for(; begin != end; ++begin) {
    const char upper = ((*begin) >> 8) & 0x00ff;
    const char lower = (*begin) & 0x00ff;
    if(upper & 0x80) {
      res += upper;
    }
    res += lower;
  }
}
|
||||
|
||||
/*
 * Formats the current local time with strftime() and stores it in timeStr.
 * format example: "%Y-%m-%d %H:%M:%S"
 *
 * The result is limited to 63 characters. timeStr is set to the empty string
 * when the formatted text does not fit, when `format` produces no output, or
 * when the local time cannot be determined.
 */
inline void GetTime(const string& format, string& timeStr) {
  char buffer[64];
  time_t timeNow;
  time(&timeNow);
  // localtime() may return NULL; strftime() must not be handed that.
  const struct tm* local = localtime(&timeNow);
  size_t len = 0;
  if(local != NULL) {
    // Format into a local buffer: the memory behind c_str() is read-only.
    len = strftime(buffer, sizeof(buffer), format.c_str(), local);
  }
  timeStr.assign(buffer, len);
}
|
||||
|
||||
inline string PathJoin(const string& path1, const string& path2) {
|
||||
if(EndsWith(path1, "/")) {
|
||||
return path1 + path2;
|
||||
}
|
||||
return path1 + "/" + path2;
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,44 @@
|
|||
#ifndef LIMONP_THREAD_HPP
|
||||
#define LIMONP_THREAD_HPP
|
||||
|
||||
#include "Logging.hpp"
|
||||
#include "NonCopyable.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
class IThread: NonCopyable {
|
||||
public:
|
||||
IThread(): isStarted(false), isJoined(false) {
|
||||
}
|
||||
virtual ~IThread() {
|
||||
if(isStarted && !isJoined) {
|
||||
XCHECK(!pthread_detach(thread_));
|
||||
}
|
||||
};
|
||||
|
||||
virtual void Run() = 0;
|
||||
void Start() {
|
||||
XCHECK(!isStarted);
|
||||
XCHECK(!pthread_create(&thread_, NULL, Worker, this));
|
||||
isStarted = true;
|
||||
}
|
||||
void Join() {
|
||||
XCHECK(!isJoined);
|
||||
XCHECK(!pthread_join(thread_, NULL));
|
||||
isJoined = true;
|
||||
}
|
||||
private:
|
||||
static void * Worker(void * data) {
|
||||
IThread * ptr = (IThread* ) data;
|
||||
ptr->Run();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
pthread_t thread_;
|
||||
bool isStarted;
|
||||
bool isJoined;
|
||||
}; // class IThread
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_THREAD_HPP
|
|
@ -0,0 +1,86 @@
|
|||
#ifndef LIMONP_THREAD_POOL_HPP
|
||||
#define LIMONP_THREAD_POOL_HPP
|
||||
|
||||
#include "Thread.hpp"
|
||||
#include "BlockingQueue.hpp"
|
||||
#include "BoundedBlockingQueue.hpp"
|
||||
#include "Closure.hpp"
|
||||
|
||||
namespace limonp {
|
||||
|
||||
using namespace std;
|
||||
|
||||
//class ThreadPool;
|
||||
class ThreadPool: NonCopyable {
|
||||
public:
|
||||
class Worker: public IThread {
|
||||
public:
|
||||
Worker(ThreadPool* pool): ptThreadPool_(pool) {
|
||||
assert(ptThreadPool_);
|
||||
}
|
||||
virtual ~Worker() {
|
||||
}
|
||||
|
||||
virtual void Run() {
|
||||
while (true) {
|
||||
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
|
||||
if (closure == NULL) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
closure->Run();
|
||||
} catch(std::exception& e) {
|
||||
XLOG(ERROR) << e.what();
|
||||
} catch(...) {
|
||||
XLOG(ERROR) << " unknown exception.";
|
||||
}
|
||||
delete closure;
|
||||
}
|
||||
}
|
||||
private:
|
||||
ThreadPool * ptThreadPool_;
|
||||
}; // class Worker
|
||||
|
||||
ThreadPool(size_t thread_num)
|
||||
: threads_(thread_num),
|
||||
queue_(thread_num) {
|
||||
assert(thread_num);
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i] = new Worker(this);
|
||||
}
|
||||
}
|
||||
~ThreadPool() {
|
||||
Stop();
|
||||
}
|
||||
|
||||
void Start() {
|
||||
for(size_t i = 0; i < threads_.size(); i++) {
|
||||
threads_[i]->Start();
|
||||
}
|
||||
}
|
||||
void Stop() {
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
queue_.Push(NULL);
|
||||
}
|
||||
for(size_t i = 0; i < threads_.size(); i ++) {
|
||||
threads_[i]->Join();
|
||||
delete threads_[i];
|
||||
}
|
||||
threads_.clear();
|
||||
}
|
||||
|
||||
void Add(ClosureInterface* task) {
|
||||
assert(task);
|
||||
queue_.Push(task);
|
||||
}
|
||||
|
||||
private:
|
||||
friend class Worker;
|
||||
|
||||
vector<IThread*> threads_;
|
||||
BoundedBlockingQueue<ClosureInterface*> queue_;
|
||||
}; // class ThreadPool
|
||||
|
||||
} // namespace limonp
|
||||
|
||||
#endif // LIMONP_THREAD_POOL_HPP
|
|
@ -0,0 +1,22 @@
|
|||
INCLUDEPATH += $$PWD
|
||||
|
||||
HEADERS += \
|
||||
$$PWD/ArgvContext.hpp \
|
||||
$$PWD/BlockingQueue.hpp \
|
||||
$$PWD/BoundedBlockingQueue.hpp \
|
||||
$$PWD/BoundedQueue.hpp \
|
||||
$$PWD/Closure.hpp \
|
||||
$$PWD/Colors.hpp \
|
||||
$$PWD/Condition.hpp \
|
||||
$$PWD/Config.hpp \
|
||||
$$PWD/FileLock.hpp \
|
||||
$$PWD/ForcePublic.hpp \
|
||||
$$PWD/LocalVector.hpp \
|
||||
$$PWD/Logging.hpp \
|
||||
$$PWD/Md5.hpp \
|
||||
$$PWD/MutexLock.hpp \
|
||||
$$PWD/NonCopyable.hpp \
|
||||
$$PWD/StdExtension.hpp \
|
||||
$$PWD/StringUtil.hpp \
|
||||
$$PWD/Thread.hpp \
|
||||
$$PWD/ThreadPool.hpp
|
|
@ -0,0 +1,31 @@
|
|||
# CppJieba字典
|
||||
|
||||
文件后缀名代表的是词典的编码方式。
|
||||
比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。
|
||||
|
||||
|
||||
## 分词
|
||||
|
||||
### jieba.dict.utf8/gbk
|
||||
|
||||
作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
|
||||
|
||||
### hmm_model.utf8/gbk
|
||||
|
||||
作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
|
||||
|
||||
__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
|
||||
|
||||
|
||||
## 关键词抽取
|
||||
|
||||
### idf.utf8
|
||||
|
||||
IDF(Inverse Document Frequency)
|
||||
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。
|
||||
|
||||
### stop_words.utf8
|
||||
|
||||
停用词词典
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,259 @@
|
|||
#初始状态的概率
|
||||
#格式
|
||||
#状态:概率
|
||||
B,a:-4.7623052146
|
||||
B,ad:-6.68006603678
|
||||
B,ag:-3.14e+100
|
||||
B,an:-8.69708322302
|
||||
B,b:-5.01837436211
|
||||
B,bg:-3.14e+100
|
||||
B,c:-3.42388018495
|
||||
B,d:-3.97504752976
|
||||
B,df:-8.88897423083
|
||||
B,dg:-3.14e+100
|
||||
B,e:-8.56355183039
|
||||
B,en:-3.14e+100
|
||||
B,f:-5.49163041848
|
||||
B,g:-3.14e+100
|
||||
B,h:-13.53336513
|
||||
B,i:-6.11578472756
|
||||
B,in:-3.14e+100
|
||||
B,j:-5.05761912847
|
||||
B,jn:-3.14e+100
|
||||
B,k:-3.14e+100
|
||||
B,l:-4.90588358466
|
||||
B,ln:-3.14e+100
|
||||
B,m:-3.6524299819
|
||||
B,mg:-3.14e+100
|
||||
B,mq:-6.7869530014
|
||||
B,n:-1.69662577975
|
||||
B,ng:-3.14e+100
|
||||
B,nr:-2.23104959138
|
||||
B,nrfg:-5.87372217541
|
||||
B,nrt:-4.98564273352
|
||||
B,ns:-2.8228438315
|
||||
B,nt:-4.84609166818
|
||||
B,nz:-3.94698846058
|
||||
B,o:-8.43349870215
|
||||
B,p:-4.20098413209
|
||||
B,q:-6.99812385896
|
||||
B,qe:-3.14e+100
|
||||
B,qg:-3.14e+100
|
||||
B,r:-3.40981877908
|
||||
B,rg:-3.14e+100
|
||||
B,rr:-12.4347528413
|
||||
B,rz:-7.94611647157
|
||||
B,s:-5.52267359084
|
||||
B,t:-3.36474790945
|
||||
B,tg:-3.14e+100
|
||||
B,u:-9.1639172775
|
||||
B,ud:-3.14e+100
|
||||
B,ug:-3.14e+100
|
||||
B,uj:-3.14e+100
|
||||
B,ul:-3.14e+100
|
||||
B,uv:-3.14e+100
|
||||
B,uz:-3.14e+100
|
||||
B,v:-2.67405848743
|
||||
B,vd:-9.04472876024
|
||||
B,vg:-3.14e+100
|
||||
B,vi:-12.4347528413
|
||||
B,vn:-4.33156108902
|
||||
B,vq:-12.1470707689
|
||||
B,w:-3.14e+100
|
||||
B,x:-3.14e+100
|
||||
B,y:-9.84448567586
|
||||
B,yg:-3.14e+100
|
||||
B,z:-7.04568111149
|
||||
B,zg:-3.14e+100
|
||||
E,a:-3.14e+100
|
||||
E,ad:-3.14e+100
|
||||
E,ag:-3.14e+100
|
||||
E,an:-3.14e+100
|
||||
E,b:-3.14e+100
|
||||
E,bg:-3.14e+100
|
||||
E,c:-3.14e+100
|
||||
E,d:-3.14e+100
|
||||
E,df:-3.14e+100
|
||||
E,dg:-3.14e+100
|
||||
E,e:-3.14e+100
|
||||
E,en:-3.14e+100
|
||||
E,f:-3.14e+100
|
||||
E,g:-3.14e+100
|
||||
E,h:-3.14e+100
|
||||
E,i:-3.14e+100
|
||||
E,in:-3.14e+100
|
||||
E,j:-3.14e+100
|
||||
E,jn:-3.14e+100
|
||||
E,k:-3.14e+100
|
||||
E,l:-3.14e+100
|
||||
E,ln:-3.14e+100
|
||||
E,m:-3.14e+100
|
||||
E,mg:-3.14e+100
|
||||
E,mq:-3.14e+100
|
||||
E,n:-3.14e+100
|
||||
E,ng:-3.14e+100
|
||||
E,nr:-3.14e+100
|
||||
E,nrfg:-3.14e+100
|
||||
E,nrt:-3.14e+100
|
||||
E,ns:-3.14e+100
|
||||
E,nt:-3.14e+100
|
||||
E,nz:-3.14e+100
|
||||
E,o:-3.14e+100
|
||||
E,p:-3.14e+100
|
||||
E,q:-3.14e+100
|
||||
E,qe:-3.14e+100
|
||||
E,qg:-3.14e+100
|
||||
E,r:-3.14e+100
|
||||
E,rg:-3.14e+100
|
||||
E,rr:-3.14e+100
|
||||
E,rz:-3.14e+100
|
||||
E,s:-3.14e+100
|
||||
E,t:-3.14e+100
|
||||
E,tg:-3.14e+100
|
||||
E,u:-3.14e+100
|
||||
E,ud:-3.14e+100
|
||||
E,ug:-3.14e+100
|
||||
E,uj:-3.14e+100
|
||||
E,ul:-3.14e+100
|
||||
E,uv:-3.14e+100
|
||||
E,uz:-3.14e+100
|
||||
E,v:-3.14e+100
|
||||
E,vd:-3.14e+100
|
||||
E,vg:-3.14e+100
|
||||
E,vi:-3.14e+100
|
||||
E,vn:-3.14e+100
|
||||
E,vq:-3.14e+100
|
||||
E,w:-3.14e+100
|
||||
E,x:-3.14e+100
|
||||
E,y:-3.14e+100
|
||||
E,yg:-3.14e+100
|
||||
E,z:-3.14e+100
|
||||
E,zg:-3.14e+100
|
||||
M,a:-3.14e+100
|
||||
M,ad:-3.14e+100
|
||||
M,ag:-3.14e+100
|
||||
M,an:-3.14e+100
|
||||
M,b:-3.14e+100
|
||||
M,bg:-3.14e+100
|
||||
M,c:-3.14e+100
|
||||
M,d:-3.14e+100
|
||||
M,df:-3.14e+100
|
||||
M,dg:-3.14e+100
|
||||
M,e:-3.14e+100
|
||||
M,en:-3.14e+100
|
||||
M,f:-3.14e+100
|
||||
M,g:-3.14e+100
|
||||
M,h:-3.14e+100
|
||||
M,i:-3.14e+100
|
||||
M,in:-3.14e+100
|
||||
M,j:-3.14e+100
|
||||
M,jn:-3.14e+100
|
||||
M,k:-3.14e+100
|
||||
M,l:-3.14e+100
|
||||
M,ln:-3.14e+100
|
||||
M,m:-3.14e+100
|
||||
M,mg:-3.14e+100
|
||||
M,mq:-3.14e+100
|
||||
M,n:-3.14e+100
|
||||
M,ng:-3.14e+100
|
||||
M,nr:-3.14e+100
|
||||
M,nrfg:-3.14e+100
|
||||
M,nrt:-3.14e+100
|
||||
M,ns:-3.14e+100
|
||||
M,nt:-3.14e+100
|
||||
M,nz:-3.14e+100
|
||||
M,o:-3.14e+100
|
||||
M,p:-3.14e+100
|
||||
M,q:-3.14e+100
|
||||
M,qe:-3.14e+100
|
||||
M,qg:-3.14e+100
|
||||
M,r:-3.14e+100
|
||||
M,rg:-3.14e+100
|
||||
M,rr:-3.14e+100
|
||||
M,rz:-3.14e+100
|
||||
M,s:-3.14e+100
|
||||
M,t:-3.14e+100
|
||||
M,tg:-3.14e+100
|
||||
M,u:-3.14e+100
|
||||
M,ud:-3.14e+100
|
||||
M,ug:-3.14e+100
|
||||
M,uj:-3.14e+100
|
||||
M,ul:-3.14e+100
|
||||
M,uv:-3.14e+100
|
||||
M,uz:-3.14e+100
|
||||
M,v:-3.14e+100
|
||||
M,vd:-3.14e+100
|
||||
M,vg:-3.14e+100
|
||||
M,vi:-3.14e+100
|
||||
M,vn:-3.14e+100
|
||||
M,vq:-3.14e+100
|
||||
M,w:-3.14e+100
|
||||
M,x:-3.14e+100
|
||||
M,y:-3.14e+100
|
||||
M,yg:-3.14e+100
|
||||
M,z:-3.14e+100
|
||||
M,zg:-3.14e+100
|
||||
S,a:-3.90253968313
|
||||
S,ad:-11.0484584802
|
||||
S,ag:-6.95411391796
|
||||
S,an:-12.8402179494
|
||||
S,b:-6.47288876397
|
||||
S,bg:-3.14e+100
|
||||
S,c:-4.78696679586
|
||||
S,d:-3.90391976418
|
||||
S,df:-3.14e+100
|
||||
S,dg:-8.9483976513
|
||||
S,e:-5.94251300628
|
||||
S,en:-3.14e+100
|
||||
S,f:-5.19482024998
|
||||
S,g:-6.50782681533
|
||||
S,h:-8.65056320738
|
||||
S,i:-3.14e+100
|
||||
S,in:-3.14e+100
|
||||
S,j:-4.91199211964
|
||||
S,jn:-3.14e+100
|
||||
S,k:-6.94032059583
|
||||
S,l:-3.14e+100
|
||||
S,ln:-3.14e+100
|
||||
S,m:-3.26920065212
|
||||
S,mg:-10.8253149289
|
||||
S,mq:-3.14e+100
|
||||
S,n:-3.85514838976
|
||||
S,ng:-4.9134348611
|
||||
S,nr:-4.48366310396
|
||||
S,nrfg:-3.14e+100
|
||||
S,nrt:-3.14e+100
|
||||
S,ns:-3.14e+100
|
||||
S,nt:-12.1470707689
|
||||
S,nz:-3.14e+100
|
||||
S,o:-8.46446092775
|
||||
S,p:-2.98684018136
|
||||
S,q:-4.88865861826
|
||||
S,qe:-3.14e+100
|
||||
S,qg:-3.14e+100
|
||||
S,r:-2.76353367841
|
||||
S,rg:-10.2752685919
|
||||
S,rr:-3.14e+100
|
||||
S,rz:-3.14e+100
|
||||
S,s:-3.14e+100
|
||||
S,t:-3.14e+100
|
||||
S,tg:-6.27284253188
|
||||
S,u:-6.94032059583
|
||||
S,ud:-7.72823016105
|
||||
S,ug:-7.53940370266
|
||||
S,uj:-6.85251045118
|
||||
S,ul:-8.41537131755
|
||||
S,uv:-8.15808672229
|
||||
S,uz:-9.29925862537
|
||||
S,v:-3.05329230341
|
||||
S,vd:-3.14e+100
|
||||
S,vg:-5.94301818437
|
||||
S,vi:-3.14e+100
|
||||
S,vn:-11.4539235883
|
||||
S,vq:-3.14e+100
|
||||
S,w:-3.14e+100
|
||||
S,x:-8.42741965607
|
||||
S,y:-6.19707946995
|
||||
S,yg:-13.53336513
|
||||
S,z:-3.14e+100
|
||||
S,zg:-3.14e+100
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,4 @@
|
|||
云计算
|
||||
韩玉鉴赏
|
||||
蓝翔 nz
|
||||
区块链 10 nz
|
|
@ -1,5 +1,6 @@
|
|||
QT -= gui
|
||||
|
||||
#VERSION = 0.0.1
|
||||
TARGET = chinese-segmentation
|
||||
TEMPLATE = lib
|
||||
DEFINES += LIBCHINESESEGMENTATION_LIBRARY
|
||||
|
@ -16,7 +17,7 @@ DEFINES += QT_DEPRECATED_WARNINGS
|
|||
# In order to do so, uncomment the following line.
|
||||
# You can also select to disable deprecated APIs only up to a certain version of Qt.
|
||||
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
|
||||
#include(jieba/jieba.pri)
|
||||
include(cppjieba/cppjieba.pri)
|
||||
|
||||
SOURCES += \
|
||||
chinese-segmentation.cpp \
|
||||
|
@ -25,6 +26,13 @@ HEADERS += \
|
|||
chinese-segmentation.h \
|
||||
libchinese-segmentation_global.h
|
||||
|
||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||
# Dictionary files installed into dict_files.path.
# Each statement ends on its own line: a trailing backslash on the first
# assignment would continue it onto the next line and turn the following
# "dict_files.files += ..." statement into literal entries of the file list.
dict_files.files = $$PWD/dict/*.utf8
dict_files.files += $$PWD/dict/pos_dict/*.utf8
|
||||
|
||||
INSTALLS += \
|
||||
dict_files \
|
||||
|
||||
# Default rules for deployment.
|
||||
unix {
|
||||
target.path = /usr/lib
|
||||
|
@ -33,3 +41,16 @@ unix {
|
|||
|
||||
#DISTFILES += \
|
||||
# jiaba/jieba.pri
|
||||
|
||||
DISTFILES += \
|
||||
dict/README.md \
|
||||
dict/hmm_model.utf8 \
|
||||
dict/idf.utf8 \
|
||||
dict/jieba.dict.utf8 \
|
||||
dict/pos_dict/char_state_tab.utf8 \
|
||||
dict/pos_dict/prob_emit.utf8 \
|
||||
dict/pos_dict/prob_start.utf8 \
|
||||
dict/pos_dict/prob_trans.utf8 \
|
||||
dict/stop_words.utf8 \
|
||||
dict/user.dict.utf8
|
||||
|
||||
|
|
|
@ -1 +1,7 @@
|
|||
INCLUDEPATH += $$PWD
|
||||
|
||||
HEADERS += \
|
||||
$$PWD/app-match.h
|
||||
|
||||
SOURCES += \
|
||||
$$PWD/app-match.cpp
|
||||
|
|
|
@ -398,11 +398,11 @@ QString *FileUtils::getDocxTextContent(QString &path)
|
|||
QFileInfo info = QFileInfo(path);
|
||||
if(!info.exists()||info.isDir())
|
||||
return nullptr;
|
||||
QuaZip file("path");
|
||||
if(file.open(QuaZip::mdUnzip))
|
||||
QuaZip file(path);
|
||||
if(!file.open(QuaZip::mdUnzip))
|
||||
return nullptr;
|
||||
|
||||
if(file.setCurrentFile("word/document.xml",QuaZip::csSensitive))
|
||||
if(!file.setCurrentFile("word/document.xml",QuaZip::csSensitive))
|
||||
return nullptr;
|
||||
QuaZipFile fileR(&file);
|
||||
|
||||
|
@ -424,7 +424,6 @@ QString *FileUtils::getDocxTextContent(QString &path)
|
|||
}
|
||||
first = first.nextSiblingElement();
|
||||
}
|
||||
qDebug()<<"size!!!"<<allText->size();
|
||||
return allText;
|
||||
}
|
||||
|
||||
|
|
|
@ -27,8 +27,8 @@ public:
|
|||
|
||||
//parse text,docx.....
|
||||
static QString getMimetype(QString &path, bool getsuffix = false);
|
||||
static QString * getDocxTextContent(QString &path);
|
||||
static QString * getTxtContent(QString &path);
|
||||
static QString *getDocxTextContent(QString &path);
|
||||
static QString *getTxtContent(QString &path);
|
||||
|
||||
private:
|
||||
FileUtils();
|
||||
|
|
|
@ -23,11 +23,11 @@ void Document::setData(QString data)
|
|||
m_document->set_data(data.toStdString());
|
||||
}
|
||||
|
||||
void Document::addterm(QString term)
|
||||
void Document::addterm(std::string term, int weight)
|
||||
{
|
||||
if(term.isEmpty())
|
||||
if(term == "")
|
||||
return;
|
||||
m_document->add_term(term.toStdString());
|
||||
m_document->add_term(term,weight);
|
||||
}
|
||||
|
||||
void Document::addValue(QString value)
|
||||
|
|
|
@ -11,7 +11,7 @@ public:
|
|||
Document();
|
||||
~Document();
|
||||
void setData(QString data);
|
||||
void addterm(QString term);
|
||||
void addterm(std::string term,int weight =1);
|
||||
void addValue(QString value);
|
||||
void setUniqueTerm(QString term);
|
||||
std::string getUniqueTerm();
|
||||
|
|
|
@ -10,7 +10,7 @@ QString *FileReader::getTextContent(QString path)
|
|||
{
|
||||
//获取所有文件内容
|
||||
//先分类
|
||||
QString type =FileUtils::getMimetype(path);
|
||||
QString type =FileUtils::getMimetype(path,true);
|
||||
if(type == "application/zip")
|
||||
return FileUtils::getDocxTextContent(path);
|
||||
else if(type == "text/plain")
|
||||
|
|
|
@ -73,6 +73,43 @@ void FileSearcher::onKeywordSearch(QString keyword, int begin, int num)
|
|||
|
||||
}
|
||||
|
||||
/**
 * Slot: runs a phrase query for `keyword` against the content index at
 * CONTENT_INDEX_PATH and emits contentResult() with the matches.
 *
 * @param keyword Text to search for; parsed by Xapian::QueryParser with
 *                FLAG_PHRASE and OP_PHRASE as the default operator.
 * @param begin   Zero-based rank of the first match to return.
 * @param num     Maximum number of matches to return.
 *
 * If Xapian throws, the error description is logged and the function returns
 * without emitting contentResult().
 */
void FileSearcher::onKeywordSearchContent(QString keyword, int begin, int num)
{
    QStringList searchResult;
    try
    {
        qDebug()<<"--content search start--";

        Xapian::Database db(CONTENT_INDEX_PATH);
        Xapian::Enquire enquire(db);
        Xapian::QueryParser qp;
        qp.set_default_op(Xapian::Query::OP_PHRASE);
        qp.set_database(db);

        // Create the query
        Xapian::Query queryPhrase = qp.parse_query(keyword.toStdString(),Xapian::QueryParser::FLAG_PHRASE);

        qDebug()<<QString::fromStdString(queryPhrase.get_description());

        enquire.set_query(queryPhrase);
        // get_mset() takes (first, maxitems): the second argument is a count,
        // not an end offset. Passing begin+num would make every page after
        // the first too large and overlap with the following page.
        Xapian::MSet result = enquire.get_mset(begin, num);
        qDebug()<< "find results count=" <<static_cast<int>(result.get_matches_estimated());
        searchResult = getResult(result);

        qDebug()<< "--content search finish--";
    }
    catch(const Xapian::Error &e)
    {
        qDebug() <<QString::fromStdString(e.get_description());
        qDebug()<< "--content search finish--";
        return;
    }
    Q_EMIT this->contentResult(searchResult);
    qDebug()<<searchResult;
    return;
}
|
||||
|
||||
QStringList FileSearcher::getResult(Xapian::MSet &result)
|
||||
{
|
||||
//QStringList *pathTobeDelete = new QStringList;
|
||||
|
@ -84,7 +121,6 @@ QStringList FileSearcher::getResult(Xapian::MSet &result)
|
|||
for (auto it = result.begin(); it != result.end(); ++it)
|
||||
{
|
||||
Xapian::Document doc = it.get_document();
|
||||
qDebug()<<"value!!!!"<<QString::fromStdString(doc.get_value(1));
|
||||
std::string data = doc.get_data();
|
||||
Xapian::weight docScoreWeight = it.get_weight();
|
||||
Xapian::percent docScorePercent = it.get_percent();
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include <QStandardPaths>
|
||||
#include <QVector>
|
||||
#define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/index_data").toStdString()
|
||||
#define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/content_index_data").toStdString()
|
||||
|
||||
|
||||
class FileSearcher : public QObject
|
||||
|
@ -15,10 +16,12 @@ public:
|
|||
explicit FileSearcher(QObject *parent = nullptr);
|
||||
|
||||
public Q_SLOTS:
|
||||
void onKeywordSearch(QString keyword, int begin, int num);
|
||||
void onKeywordSearch(QString keyword, int begin = 0, int num = 20);
|
||||
void onKeywordSearchContent(QString keyword, int begin = 0, int num = 20);
|
||||
|
||||
Q_SIGNALS:
|
||||
void result(QVector<QStringList> resultV);
|
||||
void contentResult(QStringList resultL);
|
||||
private:
|
||||
QStringList getResult(Xapian::MSet &result);
|
||||
};
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
#include <QDebug>
|
||||
#include "filetypefilter.h"
|
||||
#include "index-generator.h"
|
||||
|
||||
// Constructs the filter for `path`: the base Traverse_BFS is initialised with
// the path, an empty result list is allocated, and Traverse() is invoked
// right away from the constructor.
FileTypeFilter::FileTypeFilter(const QString& path)
    : Traverse_BFS(path),
      result(new QList<QString>())
{
    Traverse();
}
|
||||
|
||||
|
@ -27,7 +28,16 @@ void FileTypeFilter::DoSomething(const QFileInfo& fileInfo){
|
|||
|
||||
}
|
||||
|
||||
QVector<QString>* FileTypeFilter::getTargetFileAbsolutePath(){
|
||||
QList<QString>* FileTypeFilter::getTargetFileAbsolutePath(){
|
||||
return this->result;
|
||||
}
|
||||
|
||||
|
||||
void FileTypeFilter::Test(){
|
||||
IndexGenerator* ig = IndexGenerator::getInstance();
|
||||
// this->result = new QList<QString>();
|
||||
// this->result->append(QString("/home/zpf/桌面/DOCX 文档.docx"));
|
||||
|
||||
ig->creatAllIndex(this->result);
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
#include <QObject>
|
||||
#include <QMimeDatabase>
|
||||
#include <QMimeType>
|
||||
#include <QVector>
|
||||
#include <QList>
|
||||
#include "traverse_bfs.h"
|
||||
|
||||
class FileTypeFilter : public QObject, public Traverse_BFS
|
||||
|
@ -14,18 +14,18 @@ public:
|
|||
explicit FileTypeFilter(const QString&);
|
||||
~FileTypeFilter();
|
||||
virtual void DoSomething(const QFileInfo&) final;
|
||||
QVector<QString>* getTargetFileAbsolutePath();
|
||||
|
||||
QList<QString>* getTargetFileAbsolutePath();
|
||||
void Test();
|
||||
Q_SIGNALS:
|
||||
private:
|
||||
const QVector<QString> targetFileTypeVec ={ QString(".doc"),
|
||||
QString(".docx"),
|
||||
QString(".ppt"),
|
||||
const QVector<QString> targetFileTypeVec ={ /*QString(".doc"),*/
|
||||
QString(".docx")/*,*/
|
||||
/*QString(".ppt"),
|
||||
QString(".pptx"),
|
||||
QString(".xls"),
|
||||
QString(".xlsx"),
|
||||
QString(".txt")};
|
||||
QVector<QString>* result;
|
||||
QString(".txt")*/};
|
||||
QList<QString>* result;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -2,9 +2,11 @@
|
|||
#include <QStandardPaths>
|
||||
#include <QFileInfo>
|
||||
#include <QDebug>
|
||||
#include "chinese-segmentation.h"
|
||||
#include "file-utils.h"
|
||||
#include "index-generator.h"
|
||||
#include "chinesecharacterstopinyin.h"
|
||||
|
||||
#include <QtConcurrent>
|
||||
#include <QFuture>
|
||||
|
||||
|
@ -28,6 +30,7 @@ bool IndexGenerator::setIndexdataPath()
|
|||
return true;
|
||||
}
|
||||
|
||||
//文件名索引
|
||||
bool IndexGenerator::creatAllIndex(QList<QVector<QString> > *messageList)
|
||||
{
|
||||
HandlePathList(messageList);
|
||||
|
@ -62,10 +65,32 @@ bool IndexGenerator::creatAllIndex(QList<QVector<QString> > *messageList)
|
|||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool IndexGenerator::creatAllIndex(QVector<QString> *messageList)
|
||||
//文件内容索引
|
||||
bool IndexGenerator::creatAllIndex(QList<QString> *messageList)
|
||||
{
|
||||
HandlePathList(messageList);
|
||||
try
|
||||
{
|
||||
int count =0;
|
||||
for(int i = 0;i < m_doc_list_content->size(); i++)
|
||||
{
|
||||
insertIntoContentDatabase(m_doc_list_content->at(i));
|
||||
|
||||
if(++count == 9999)
|
||||
{
|
||||
count = 0;
|
||||
m_database_content->commit();
|
||||
}
|
||||
}
|
||||
m_database_content->commit();
|
||||
}
|
||||
catch(const Xapian::Error &e)
|
||||
{
|
||||
qDebug()<<"creat content Index fail!"<<QString::fromStdString(e.get_description());
|
||||
return false;
|
||||
}
|
||||
m_doc_list_content->clear();
|
||||
Q_EMIT this->transactionFinished();
|
||||
return true;
|
||||
|
||||
}
|
||||
|
@ -97,6 +122,14 @@ void IndexGenerator::insertIntoDatabase(Document doc)
|
|||
return;
|
||||
}
|
||||
|
||||
// Writes `doc` into the content database via replace_document(), using the
// document's unique term as the key. The returned document id is not needed
// and is discarded.
void IndexGenerator::insertIntoContentDatabase(Document doc)
{
    m_database_content->replace_document(doc.getUniqueTerm(), doc.getXapianDocument());
}
|
||||
|
||||
void IndexGenerator::HandlePathList(QList<QVector<QString>> *messageList)
|
||||
{
|
||||
qDebug()<<"Begin HandlePathList!";
|
||||
|
@ -114,7 +147,7 @@ void IndexGenerator::HandlePathList(QList<QVector<QString>> *messageList)
|
|||
return;
|
||||
}
|
||||
|
||||
void IndexGenerator::HandlePathList(QVector<QString> *messageList)
|
||||
void IndexGenerator::HandlePathList(QList<QString> *messageList)
|
||||
{
|
||||
qDebug()<<"Begin HandlePathList for content index!";
|
||||
qDebug()<<messageList->size();
|
||||
|
@ -148,10 +181,10 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
|
|||
|
||||
//多音字版
|
||||
//现加入首字母
|
||||
QStringList pinyin_text_list = FileUtils::findMultiToneWords(QString(list.at(0)).replace(".",""));
|
||||
for (QString& i : pinyin_text_list){
|
||||
i.replace("", " ");
|
||||
}
|
||||
// QStringList pinyin_text_list = FileUtils::findMultiToneWords(QString(list.at(0)).replace(".",""));
|
||||
// for (QString& i : pinyin_text_list){
|
||||
// i.replace("", " ");
|
||||
// }
|
||||
|
||||
QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(sourcePath));
|
||||
// QString uniqueterm1 = QString::fromStdString(QCryptographicHash::hash(sourcePath.toUtf8(),QCryptographicHash::Md5).toStdString());
|
||||
|
@ -169,10 +202,9 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
|
|||
doc.setUniqueTerm(uniqueterm);
|
||||
doc.addValue(list.at(2));
|
||||
if(list.at(2) == QString("1"))
|
||||
qDebug()<<"value!!!"<<list.at(2);
|
||||
QStringList temp;
|
||||
temp.append(index_text);
|
||||
temp.append(pinyin_text_list);
|
||||
// temp.append(pinyin_text_list);
|
||||
doc.setIndexText(temp);
|
||||
// doc.setIndexText(QStringList()<<index_text<<pinyin_text);
|
||||
// doc.setIndexText(QStringList()<<index_text);
|
||||
|
@ -182,15 +214,19 @@ Document IndexGenerator::GenerateDocument(const QVector<QString> &list)
|
|||
|
||||
/**
 * Builds the Document used for the content (full-text) index of one file.
 *
 * The document's data is the file path and its unique term comes from
 * FileUtils::makeDocUterm(path). The file text is segmented by
 * ChineseSegmentation::callSegement() and each keyword is added as a term
 * with its weight truncated to int.
 *
 * @param path Path of the file to index.
 * @return The populated Document. If no text could be extracted, the
 *         document carries only its data and unique term, with no terms.
 */
Document IndexGenerator::GenerateContentDocument(const QString &path)
{
    Document doc;
    doc.setData(path);
    doc.setUniqueTerm(QString::fromStdString(FileUtils::makeDocUterm(path)));

    // The reader returns nullptr for files it cannot open or parse;
    // callSegement() dereferences its argument, so bail out before calling it.
    QString *content = FileReader::getTextContent(path);
    if (content == nullptr)
        return doc;

    // NOTE(review): `content` is a raw pointer handed back by the reader and is
    // never released here -- confirm who owns it; if the caller does, it leaks.
    const QVector<SKeyWord> keywords = ChineseSegmentation::callSegement(content);
    for (const SKeyWord &kw : keywords)
    {
        doc.addterm(kw.word, static_cast<int>(kw.weight));
    }
    return doc;
}
|
||||
|
||||
bool IndexGenerator::isIndexdataExist()
|
||||
|
|
|
@ -23,7 +23,7 @@ Q_SIGNALS:
|
|||
void searchFinish();
|
||||
public Q_SLOTS:
|
||||
bool creatAllIndex(QList<QVector<QString>> *messageList);
|
||||
bool creatAllIndex(QVector<QString> *messageList);
|
||||
bool creatAllIndex(QList<QString> *messageList);
|
||||
bool deleteAllIndex(QStringList *pathlist);
|
||||
|
||||
private:
|
||||
|
@ -31,11 +31,12 @@ private:
|
|||
//For file name index
|
||||
void HandlePathList(QList<QVector<QString>> *messageList);
|
||||
//For file content index
|
||||
void HandlePathList(QVector<QString> *messageList);
|
||||
void HandlePathList(QList<QString> *messageList);
|
||||
static Document GenerateDocument(const QVector<QString> &list);
|
||||
static Document GenerateContentDocument(const QString &list);
|
||||
//add one data in database
|
||||
void insertIntoDatabase(Document doc);
|
||||
void insertIntoContentDatabase(Document doc);
|
||||
~IndexGenerator();
|
||||
|
||||
QMap<QString,QStringList> *m_index_map;
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
#include "libsearch.h"
|
||||
|
||||
Libsearch::Libsearch()
|
||||
GlobalSearch::GlobalSearch()
|
||||
{
|
||||
}
|
||||
|
||||
/**
 * File search entry point of the library facade.
 *
 * Not implemented yet: the arguments are ignored and an empty list is
 * returned. The explicit return matters -- flowing off the end of a
 * function that returns a value is undefined behaviour.
 *
 * @param keyword Search text (currently unused).
 * @param begin   Offset of the first result (currently unused).
 * @param num     Number of results requested (currently unused).
 * @return An empty QStringList.
 */
QStringList GlobalSearch::fileSearch(QString keyword, int begin, int num)
{
    static_cast<void>(keyword);
    static_cast<void>(begin);
    static_cast<void>(num);
    return QStringList();
}
|
||||
|
|
|
@ -7,4 +7,17 @@
|
|||
#include "settingsearch/setting-match.h"
|
||||
#include "index/inotify.h"
|
||||
#include "file-utils.h"
|
||||
|
||||
#include "index/filetypefilter.h"
|
||||
|
||||
class LIBSEARCH_EXPORT GlobalSearch
|
||||
{
|
||||
public:
|
||||
|
||||
static QStringList fileSearch(QString keyword, int begin = 0, int num = -1);
|
||||
|
||||
private:
|
||||
GlobalSearch();
|
||||
};
|
||||
|
||||
#endif // LIBSEARCH_H
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
QT += core gui concurrent xml
|
||||
QT += core concurrent xml
|
||||
|
||||
VERSION = 0.0.1
|
||||
|
||||
|
@ -24,14 +24,16 @@ DEFINES += QT_DEPRECATED_WARNINGS
|
|||
|
||||
include(index/index.pri)
|
||||
include(appsearch/appsearch.pri)
|
||||
include(settingsearch/settingsearch.pri)
|
||||
|
||||
LIBS += -L../libchinese-segmentation/ -lchinese-segmentation
|
||||
LIBS += -lxapian -lgsettings-qt -lquazip5
|
||||
|
||||
SOURCES += \
|
||||
file-utils.cpp \
|
||||
globalsettings.cpp \
|
||||
gobject-template.cpp
|
||||
gobject-template.cpp \
|
||||
libsearch.cpp
|
||||
|
||||
HEADERS += \
|
||||
file-utils.h \
|
||||
|
@ -41,7 +43,7 @@ HEADERS += \
|
|||
libsearch.h
|
||||
|
||||
RESOURCES += \
|
||||
resource1.qrc
|
||||
resource1.qrc \
|
||||
|
||||
|
||||
|
||||
|
@ -51,9 +53,7 @@ unix {
|
|||
}
|
||||
!isEmpty(target.path): INSTALLS += target
|
||||
|
||||
win32:CONFIG(release, debug|release): LIBS += -L$$OUT_PWD/../libchinese-segmentation/release/ -lchinese-segmentation
|
||||
else:win32:CONFIG(debug, debug|release): LIBS += -L$$OUT_PWD/../libchinese-segmentation/debug/ -lchinese-segmentation
|
||||
else:unix: LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
|
||||
|
||||
|
||||
INCLUDEPATH += $$PWD/../libchinese-segmentation
|
||||
DEPENDPATH += $$PWD/../libchinese-segmentation
|
||||
|
|
17
src/main.cpp
17
src/main.cpp
|
@ -31,6 +31,7 @@
|
|||
//#include "inotify-manager.h"
|
||||
#include "libsearch.h"
|
||||
|
||||
|
||||
void centerToScreen(QWidget* widget) {
|
||||
if (!widget)
|
||||
return;
|
||||
|
@ -56,6 +57,22 @@ int main(int argc, char *argv[])
|
|||
// qDebug() << t2;
|
||||
/*-------------InotyifyRefact Test End-----------------*/
|
||||
|
||||
/*-------------InotyifyRefact Test Start---------------*/
|
||||
// QTime t1 = QTime::currentTime();
|
||||
// FileTypeFilter* ftf = new FileTypeFilter("/home");
|
||||
// ftf->Test();
|
||||
// QTime t2 = QTime::currentTime();
|
||||
// delete ftf;
|
||||
// ftf = nullptr;
|
||||
// qDebug() << t1;
|
||||
// qDebug() << t2;
|
||||
/*-------------InotyifyRefact Test End-----------------*/
|
||||
|
||||
/*-------------文本搜索 Test start-----------------*/
|
||||
// FileSearcher *search = new FileSearcher();
|
||||
// search->onKeywordSearchContent("麒麟");
|
||||
/*-------------文本搜索 Test End-----------------*/
|
||||
|
||||
qRegisterMetaType<QVector<QStringList>>("QVector<QStringList>");
|
||||
|
||||
QApplication::setAttribute(Qt::AA_EnableHighDpiScaling);
|
||||
|
|
|
@ -47,13 +47,13 @@ MainWindow::MainWindow(QWidget *parent) :
|
|||
{
|
||||
// FileUtils::findMultiToneWords("仇仇仇仇仇仇仇仇仇仇仇翟康宁test");
|
||||
/*-------------Inotify Test Start---------------*/
|
||||
QTime t1 = QTime::currentTime();
|
||||
InotifyManagerRefact* im = new InotifyManagerRefact("/home");
|
||||
im->Traverse();
|
||||
QTime t2 = QTime::currentTime();
|
||||
qDebug() << t1;
|
||||
qDebug() << t2;
|
||||
im->start();
|
||||
// QTime t1 = QTime::currentTime();
|
||||
// InotifyManagerRefact* im = new InotifyManagerRefact("/home");
|
||||
// im->Traverse();
|
||||
// QTime t2 = QTime::currentTime();
|
||||
// qDebug() << t1;
|
||||
// qDebug() << t2;
|
||||
// im->start();
|
||||
/*-------------Inotify Test End-----------------*/
|
||||
|
||||
this->setWindowFlags(Qt::CustomizeWindowHint | Qt::FramelessWindowHint | Qt::X11BypassWindowManagerHint);
|
||||
|
|
|
@ -57,7 +57,7 @@ qm_files.files = res/translations/*.qm\
|
|||
INSTALLS += \
|
||||
qm_files \
|
||||
|
||||
unix:!macx: LIBS += -L$$OUT_PWD/../libsearch/ -lukui-search
|
||||
unix:!macx: LIBS += -L$$OUT_PWD/../libsearch -lukui-search -L../libchinese-segmentation/ -lchinese-segmentation
|
||||
|
||||
INCLUDEPATH += $$PWD/../libsearch
|
||||
DEPENDPATH += $$PWD/../libsearch
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
TEMPLATE = subdirs
|
||||
SUBDIRS += \
|
||||
libchinese-segmentation \
|
||||
libsearch \
|
||||
src
|
||||
SUBDIRS += $$PWD/libchinese-segmentation \
|
||||
$$PWD/libsearch \
|
||||
$$PWD/src
|
||||
# The following define makes your compiler emit warnings if you use
|
||||
# any Qt feature that has been marked deprecated (the exact warnings
|
||||
# depend on your compiler). Please consult the documentation of the
|
||||
|
@ -14,6 +13,8 @@ DEFINES += QT_DEPRECATED_WARNINGS
|
|||
# You can also select to disable deprecated APIs only up to a certain version of Qt.
|
||||
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
|
||||
|
||||
src.depends = libsearch
|
||||
libsearch.depends = libchinese-segmentation
|
||||
src.depends = libsearch
|
||||
|
||||
CONFIG += ordered
|
||||
|
||||
|
|
Loading…
Reference in New Issue