/* * The MIT License (MIT) * * Copyright (C) 2013 Yanyi Wu * Copyright (C) 2023, KylinSoft Co., Ltd. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * */ #include #include "Jieba.hpp" namespace cppjieba { using namespace limonp; using namespace std; class TextRankExtractor { public: typedef struct _Word { string word; vector offsets; double weight; } Word; // struct Word private: typedef std::map WordMap; class WordGraph { private: typedef double Score; typedef string Node; typedef std::set NodeSet; typedef std::map Edges; typedef std::map Graph; //typedef std::unordered_map Edges; //typedef std::unordered_map Graph; double d; Graph graph; NodeSet nodeSet; public: WordGraph(): d(0.85) {}; WordGraph(double in_d): d(in_d) {}; void addEdge(Node start, Node end, double weight) { Edges temp; Edges::iterator gotEdges; nodeSet.insert(start); nodeSet.insert(end); graph[start][end] += weight; graph[end][start] += weight; } void rank(WordMap &ws, size_t rankTime = 10) { WordMap outSum; Score wsdef, min_rank, max_rank; if (graph.size() == 0) { return; } wsdef = 1.0 / graph.size(); for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) { // edges->first start节点；edge->first end节点；edge->second 权重 ws[edges->first].word = edges->first; ws[edges->first].weight = wsdef; outSum[edges->first].weight = 0; for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) { outSum[edges->first].weight += edge->second; } } //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? for (size_t i = 0; i < rankTime; i++) { for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) { double s = 0; for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++) // edge->first end节点；edge->second 权重 { s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; } ws[*node].weight = (1 - d) + d * s; } } min_rank = max_rank = ws.begin()->second.weight; for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) { if (i->second.weight < min_rank) { min_rank = i->second.weight; } if (i->second.weight > max_rank) { max_rank = i->second.weight; } } for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) { ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); } } }; public: TextRankExtractor(const DictTrie* dictTrie, const HMMModel* model, const string& stopWordPath) : segment_(dictTrie, model) { LoadStopWordDict(stopWordPath); } TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { LoadStopWordDict(stopWordPath); } ~TextRankExtractor() { } void Extract(const string& sentence, vector& keywords, size_t topN) const { vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { keywords.push_back(topWords[i].word); } } void Extract(const string& sentence, vector >& keywords, size_t topN) const { vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { keywords.push_back(pair(topWords[i].word, topWords[i].weight)); } } void Extract(const string& sentence, vector& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const { vector words; segment_.CutToStr(sentence, words); TextRankExtractor::WordGraph graph; WordMap wordmap; size_t offset = 0; for (size_t i = 0; i < words.size(); i++) { size_t t = offset; offset += words[i].size(); if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) { if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) { skip++; continue; } graph.addEdge(words[i], words[j], 1); } wordmap[words[i]].offsets.push_back(t); } if (offset != sentence.size()) { XLOG(ERROR) << "words illegal"; return; } graph.rank(wordmap, rankTime); keywords.clear(); keywords.reserve(wordmap.size()); for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { keywords.push_back(itr->second); } topN = min(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); keywords.resize(topN); } private: void LoadStopWordDict(const string& filePath) { ifstream ifs(filePath.c_str()); XCHECK(ifs.is_open()) << "open " << filePath << " failed"; string line ; while (getline(ifs, line)) { stopWords_.insert(line); } assert(stopWords_.size()); } static bool Compare(const Word &x, const Word &y) { return x.weight > y.weight; } MixSegment segment_; unordered_set stopWords_; }; // class TextRankExtractor inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; } } // namespace cppjieba