ukui-search/libchinese-segmentation/cppjieba/KeywordExtractor.hpp

119 lines
3.6 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
#pragma once
#include <cmath>
#include "MixSegment.hpp"
//#include "IdfTrie.hpp"
#include "idf-trie/idf-trie.h"
namespace cppjieba {
using namespace limonp;
using namespace std;
/*utf8*/
class KeywordExtractor {
public:
KeywordExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& idfPath,
const string& dat_cache_path,
const string& stopWordPath)
: segment_(dictTrie, model, stopWordPath),
idf_trie_(idfPath, dat_cache_path){
}
~KeywordExtractor() {
}
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<KeyWord> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].word);
}
}
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<KeyWord> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map相同string统计词频叠加权重
PreFilter pre_filter(symbols_, sentence);
RuneStrArray::const_iterator null_p;
WordRange range(null_p, null_p);
bool isNull(false);
while (pre_filter.Next(range, isNull)) {
if (isNull) {
continue;
}
segment_.CutToStr(sentence, range, wordmap);
}
keywords.clear();
keywords.reserve(wordmap.size());
for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
double idf = idf_trie_.Find(itr->first);
if (-1 != idf) {//IDF词典查找
itr->second.weight *= idf;
} else {
itr->second.weight *= idf_trie_.GetIdfAverage();
}
itr->second.word = itr->first;
keywords.push_back(itr->second);
}
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
}
private:
static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
return lhs.weight > rhs.weight;
}
MixSegment segment_;
IdfTrie idf_trie_;
unordered_set<Rune> symbols_;
}; // class KeywordExtractor
inline ostream& operator << (ostream& os, const KeyWord& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
"}";
}
} // namespace cppjieba