/*
 * The MIT License (MIT)
 *
 * Copyright (C) 2013 Yanyi Wu
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <cmath>
#include <cassert>  // assert() is used in LoadStopWordDict below
#include "Jieba.hpp"

namespace cppjieba {
using namespace limonp;
using namespace std;

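// Keyword extractor based on the TextRank algorithm: the input sentence is cut
// into words, an undirected co-occurrence graph is built over the words that
// are neither single characters nor stop words, and the words are ranked with
// a PageRank-style iteration over that graph.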
class TextRankExtractor {
public:
  typedef struct _Word {
    string word;
    vector<size_t> offsets;
    double weight;
  } Word; // struct Word
private:
  typedef std::map<string, Word> WordMap;
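
  // WordGraph holds the undirected, weighted word co-occurrence graph and runs
  // the iterative TextRank update over its nodes.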
  class WordGraph {
  private:
    typedef double Score;
    typedef string Node;
    typedef std::set<Node> NodeSet;

    typedef std::map<Node, double> Edges;
    typedef std::map<Node, Edges> Graph;
    //typedef std::unordered_map<Node,double> Edges;
    //typedef std::unordered_map<Node,Edges> Graph;

    double d;
    Graph graph;
    NodeSet nodeSet;
  public:
    WordGraph() : d(0.85) {}
    WordGraph(double in_d) : d(in_d) {}

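    // addEdge links two words with an undirected edge; repeated co-occurrences
    // accumulate into the edge weight.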
    void addEdge(Node start, Node end, double weight) {
      nodeSet.insert(start);
      nodeSet.insert(end);
      graph[start][end] += weight;
      graph[end][start] += weight;
    }

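    // rank runs `rankTime` iterations of the TextRank update with damping
    // factor d (0.85 by default):
    //   WS(Vi) = (1 - d) + d * sum over neighbours Vj of
    //            ( w(Vj, Vi) / sum over Vk adjacent to Vj of w(Vj, Vk) ) * WS(Vj)
    // and then rescales the resulting scores by the observed min/max.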
    void rank(WordMap &ws, size_t rankTime = 10) {
      WordMap outSum;
      Score wsdef, min_rank, max_rank;

      if (graph.size() == 0) {
        return;
      }

      wsdef = 1.0 / graph.size();

      for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
        // edges->first is the start node; edge->first is the end node; edge->second is the edge weight
        ws[edges->first].word = edges->first;
        ws[edges->first].weight = wsdef;
        outSum[edges->first].weight = 0;

        for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
          outSum[edges->first].weight += edge->second;
        }
      }

      //sort(nodeSet.begin(),nodeSet.end()); is sorting needed here?
      for (size_t i = 0; i < rankTime; i++) {
        for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
          double s = 0;

          // edge->first is the end node; edge->second is the edge weight
          for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++) {
            s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
          }

          ws[*node].weight = (1 - d) + d * s;
        }
      }

      min_rank = max_rank = ws.begin()->second.weight;

      for (WordMap::iterator i = ws.begin(); i != ws.end(); i++) {
        if (i->second.weight < min_rank) {
          min_rank = i->second.weight;
        }

        if (i->second.weight > max_rank) {
          max_rank = i->second.weight;
        }
      }

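      // Min/max normalization: the top-ranked word ends up with weight 1.0,
      // while the min_rank / 10.0 offset keeps the lowest score above zero.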
      for (WordMap::iterator i = ws.begin(); i != ws.end(); i++) {
        ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
      }
    }
  };

public:
  TextRankExtractor(const DictTrie* dictTrie,
                    const HMMModel* model,
                    const string& stopWordPath)
    : segment_(dictTrie, model) {
    LoadStopWordDict(stopWordPath);
  }
  TextRankExtractor(const Jieba& jieba, const string& stopWordPath)
    : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
    LoadStopWordDict(stopWordPath);
  }
  ~TextRankExtractor() {
  }

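  // Convenience overload: collects only the top-N keyword strings.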
  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
    vector<Word> topWords;
    Extract(sentence, topWords, topN);

    for (size_t i = 0; i < topWords.size(); i++) {
      keywords.push_back(topWords[i].word);
    }
  }

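  // Convenience overload: collects (word, weight) pairs for the top-N keywords.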
  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
    vector<Word> topWords;
    Extract(sentence, topWords, topN);

    for (size_t i = 0; i < topWords.size(); i++) {
      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
    }
  }

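  // Core extraction: cut the sentence into words, drop single-character words
  // and stop words, connect each remaining word to the kept words that follow
  // it within a window of `span`, run `rankTime` TextRank iterations, and keep
  // the topN highest-weighted words.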
  void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
    vector<string> words;
    segment_.CutToStr(sentence, words);

    TextRankExtractor::WordGraph graph;
    WordMap wordmap;
    size_t offset = 0;

    for (size_t i = 0; i < words.size(); i++) {
      size_t t = offset;
      offset += words[i].size();

      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
        continue;
      }

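      // Window loop: `skip` widens the window for every filtered word, so the
      // window is measured in kept words rather than raw token positions.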
      for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
        if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
          skip++;
          continue;
        }

        graph.addEdge(words[i], words[j], 1);
      }

      wordmap[words[i]].offsets.push_back(t);
    }

    if (offset != sentence.size()) {
      XLOG(ERROR) << "words illegal";
      return;
    }

    graph.rank(wordmap, rankTime);

    keywords.clear();
    keywords.reserve(wordmap.size());

    for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
      keywords.push_back(itr->second);
    }

    topN = min(topN, keywords.size());
    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
    keywords.resize(topN);
  }
private:
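  // Loads one stop word per line from filePath into stopWords_.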
  void LoadStopWordDict(const string& filePath) {
    ifstream ifs(filePath.c_str());
    XCHECK(ifs.is_open()) << "open " << filePath << " failed";
    string line;

    while (getline(ifs, line)) {
      stopWords_.insert(line);
    }

    assert(stopWords_.size());
  }

  static bool Compare(const Word &x, const Word &y) {
    return x.weight > y.weight;
  }

  MixSegment segment_;
  unordered_set<string> stopWords_;
}; // class TextRankExtractor
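
// Example usage (illustrative sketch, not part of this header; the dictionary,
// HMM model, and stop-word file names below are placeholders for the data
// files shipped with cppjieba — adjust the paths and the Jieba constructor
// arguments to match your installation):
//
//   cppjieba::Jieba jieba("jieba.dict.utf8", "hmm_model.utf8",
//                         "user.dict.utf8", "idf.utf8", "stop_words.utf8");
//   cppjieba::TextRankExtractor extractor(jieba, "stop_words.utf8");
//   std::vector<std::pair<std::string, double> > keywords;
//   extractor.Extract("...", keywords, 5);
//   // keywords now holds up to 5 (word, weight) pairs, sorted by weight.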

inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
  return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets
            << ", \"weight\": " << word.weight << "}";
}
} // namespace cppjieba