2021-05-22 09:18:35 +08:00
|
|
|
|
#pragma once
|
2020-12-31 21:14:13 +08:00
|
|
|
|
|
|
|
|
|
#include <cmath>
|
|
|
|
|
#include "MixSegment.hpp"
|
2021-06-23 15:50:19 +08:00
|
|
|
|
#include "IdfTrie.hpp"
|
2020-12-31 21:14:13 +08:00
|
|
|
|
|
|
|
|
|
namespace cppjieba {
|
|
|
|
|
|
|
|
|
|
using namespace limonp;
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
|
|
/*utf8*/
|
|
|
|
|
class KeywordExtractor {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
public:
|
|
|
|
|
|
|
|
|
|
KeywordExtractor(const DictTrie* dictTrie,
|
|
|
|
|
const HMMModel* model,
|
|
|
|
|
const string& idfPath,
|
2021-06-23 15:50:19 +08:00
|
|
|
|
const string& dat_cache_path,
|
2021-04-26 15:06:47 +08:00
|
|
|
|
const string& stopWordPath)
|
2021-06-23 15:50:19 +08:00
|
|
|
|
: segment_(dictTrie, model, stopWordPath),
|
|
|
|
|
idf_trie_(idfPath,dat_cache_path){
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}
|
2021-04-26 15:06:47 +08:00
|
|
|
|
~KeywordExtractor() {
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}
|
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
|
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
2021-06-07 15:37:06 +08:00
|
|
|
|
vector<KeyWord> topWords;
|
2021-04-26 15:06:47 +08:00
|
|
|
|
Extract(sentence, topWords, topN);
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < topWords.size(); i++) {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
keywords.push_back(topWords[i].word);
|
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}
|
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
|
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
2021-06-07 15:37:06 +08:00
|
|
|
|
vector<KeyWord> topWords;
|
2021-04-26 15:06:47 +08:00
|
|
|
|
Extract(sentence, topWords, topN);
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < topWords.size(); i++) {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}
|
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map,相同string统计词频叠加权重
|
|
|
|
|
PreFilter pre_filter(symbols_, sentence);
|
|
|
|
|
RuneStrArray::const_iterator null_p;
|
|
|
|
|
WordRange range(null_p, null_p);
|
|
|
|
|
bool isNull(false);
|
|
|
|
|
while (pre_filter.Next(range, isNull)) {
|
|
|
|
|
if (isNull) {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2021-06-07 15:37:06 +08:00
|
|
|
|
segment_.CutToStr(sentence, range, wordmap);
|
2021-04-26 15:06:47 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
keywords.clear();
|
|
|
|
|
keywords.reserve(wordmap.size());
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
2021-06-23 15:50:19 +08:00
|
|
|
|
double idf = idf_trie_.Find(itr->first);
|
|
|
|
|
if (-1 != idf) {//IDF词典查找
|
|
|
|
|
itr->second.weight *= idf;
|
2021-04-26 15:06:47 +08:00
|
|
|
|
} else {
|
2021-06-23 15:50:19 +08:00
|
|
|
|
itr->second.weight *= idf_trie_.idfAverage_;
|
2021-04-26 15:06:47 +08:00
|
|
|
|
}
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
|
itr->second.word = itr->first;
|
|
|
|
|
keywords.push_back(itr->second);
|
|
|
|
|
}
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
|
topN = min(topN, keywords.size());
|
|
|
|
|
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
|
|
|
|
keywords.resize(topN);
|
|
|
|
|
}
|
|
|
|
|
private:
|
2020-12-31 21:14:13 +08:00
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
|
2021-04-26 15:06:47 +08:00
|
|
|
|
return lhs.weight > rhs.weight;
|
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
|
MixSegment segment_;
|
2021-06-23 15:50:19 +08:00
|
|
|
|
IdfTrie idf_trie_;
|
2020-12-31 21:14:13 +08:00
|
|
|
|
|
2022-03-02 09:27:40 +08:00
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
unordered_set<Rune> symbols_;
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}; // class KeywordExtractor
|
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
|
inline ostream& operator << (ostream& os, const KeyWord& word) {
|
2021-05-22 09:18:35 +08:00
|
|
|
|
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
|
|
|
|
|
"}";
|
2020-12-31 21:14:13 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace cppjieba
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|