// Source: ukui-search/libchinese-segmentation/cppjieba/KeywordExtractor.hpp (C++, 100 lines, 2.8 KiB)

#pragma once
#include <cmath>
#include "MixSegment.hpp"
#include "IdfTrie.hpp"
namespace cppjieba {
using namespace limonp;
using namespace std;
/*utf8*/
class KeywordExtractor {
2021-04-26 15:06:47 +08:00
public:
KeywordExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& idfPath,
const string& dat_cache_path,
2021-04-26 15:06:47 +08:00
const string& stopWordPath)
: segment_(dictTrie, model, stopWordPath),
idf_trie_(idfPath,dat_cache_path){
}
2021-04-26 15:06:47 +08:00
~KeywordExtractor() {
}
2021-04-26 15:06:47 +08:00
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<KeyWord> topWords;
2021-04-26 15:06:47 +08:00
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
2021-04-26 15:06:47 +08:00
keywords.push_back(topWords[i].word);
}
}
2021-04-26 15:06:47 +08:00
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<KeyWord> topWords;
2021-04-26 15:06:47 +08:00
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
2021-04-26 15:06:47 +08:00
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
2021-04-26 15:06:47 +08:00
unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map相同string统计词频叠加权重
PreFilter pre_filter(symbols_, sentence);
RuneStrArray::const_iterator null_p;
WordRange range(null_p, null_p);
bool isNull(false);
while (pre_filter.Next(range, isNull)) {
if (isNull) {
2021-04-26 15:06:47 +08:00
continue;
}
segment_.CutToStr(sentence, range, wordmap);
2021-04-26 15:06:47 +08:00
}
keywords.clear();
keywords.reserve(wordmap.size());
for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
double idf = idf_trie_.Find(itr->first);
if (-1 != idf) {//IDF词典查找
itr->second.weight *= idf;
2021-04-26 15:06:47 +08:00
} else {
itr->second.weight *= idf_trie_.idfAverage_;
2021-04-26 15:06:47 +08:00
}
2021-04-26 15:06:47 +08:00
itr->second.word = itr->first;
keywords.push_back(itr->second);
}
2021-04-26 15:06:47 +08:00
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
}
private:
static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
2021-04-26 15:06:47 +08:00
return lhs.weight > rhs.weight;
}
2021-04-26 15:06:47 +08:00
MixSegment segment_;
IdfTrie idf_trie_;
unordered_set<Rune> symbols_;
}; // class KeywordExtractor
/**
 * Writes a KeyWord to `os` as a JSON-like object with the fields
 * "word", "offset" and "weight", and returns the stream for chaining.
 */
inline ostream& operator << (ostream& os, const KeyWord& word) {
    os << "{\"word\": \"" << word.word;
    os << "\", \"offset\": " << word.offsets;
    os << ", \"weight\": " << word.weight;
    os << "}";
    return os;
}
} // namespace cppjieba